robotstxtparser.php (9340B)
<?php
/**
 * Class for parsing robots.txt files
 *
 * @author Eugene Yurkevich (bopodaa@gmail.com)
 *
 *
 * Some useful links and materials:
 * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
 * @link https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml
 */

namespace d_robots_txt_parser;

/**
 * Character-by-character state machine that turns the text of a robots.txt
 * file into an array of rules grouped by (lower-cased) user-agent.
 *
 * Grammar accepted per line: `<directive> [spaces] : <value> [# comment]`.
 * Lines that do not start with a known directive are ignored.
 */
class RobotsTxtParser
{
    // default encoding of the incoming content
    const DEFAULT_ENCODING = 'UTF-8';

    // encoding the content is converted to and parsed in
    const INTERNAL_ENCODING = 'UTF-8';

    // states
    const STATE_ZERO_POINT = 'zero-point';
    const STATE_READ_DIRECTIVE = 'read-directive';
    const STATE_SKIP_SPACE = 'skip-space';
    const STATE_SKIP_LINE = 'skip-line';
    const STATE_READ_VALUE = 'read-value';

    // directives
    const DIRECTIVE_ALLOW = 'allow';
    const DIRECTIVE_DISALLOW = 'disallow';
    const DIRECTIVE_HOST = 'host';
    const DIRECTIVE_SITEMAP = 'sitemap';
    const DIRECTIVE_USERAGENT = 'user-agent';
    const DIRECTIVE_CRAWL_DELAY = 'crawl-delay';
    const DIRECTIVE_CLEAN_PARAM = 'clean-param';

    /**
     * Default user-agent
     * First off, links should be checked by specific user-agent rules. If a specific user-agent
     * isn't specified, then the default user-agent is used.
     */
    const USER_AGENT_ALL = '*';

    // current state
    private $state = '';

    // robots.txt file content (converted to INTERNAL_ENCODING, with a trailing "\n" appended)
    private $content = '';

    // rules set: user-agent => directive => value(s)
    private $rules = array();

    // internally used variables
    private $current_word = '';
    private $current_char = '';
    private $char_index = 0;
    private $current_directive = '';
    private $userAgent = self::USER_AGENT_ALL;

    /**
     * Convert the content to the internal encoding and parse it immediately.
     *
     * The process-wide mbstring internal encoding is intentionally NOT touched:
     * every mb_* call below passes its encoding explicitly, so the parser reads
     * the converted text with the encoding it actually has.
     *
     * @param string $content - file content
     * @param string $encoding - encoding of $content; an empty value triggers auto-detection
     */
    public function __construct($content, $encoding = self::DEFAULT_ENCODING)
    {
        $content = (string)$content;

        // detect the encoding when it isn't given; fall back to the default if detection fails
        if (empty($encoding)) {
            $detected = mb_detect_encoding($content);
            $encoding = ($detected !== false) ? $detected : self::DEFAULT_ENCODING;
        }

        // iconv() returns false on failure - keep the raw content rather than dropping the whole file
        $converted = iconv($encoding, self::INTERNAL_ENCODING . '//IGNORE', $content);
        if ($converted === false) {
            $converted = $content;
        }

        // the trailing new line guarantees that the last directive value gets terminated
        $this->content = $converted . "\n";

        // set default state
        $this->state = self::STATE_ZERO_POINT;

        // parse rules - default state
        $this->prepareRules();
    }

    /**
     * Get rules by specific bot (user-agent)
     * Use $userAgent = NULL to get all rules for all user-agents grouped by user-agent. User-agents will return in lower case.
     * Use $userAgent = '*' to get common rules.
     * Use $userAgent = 'YandexBot' to get rules for user-agent 'YandexBot'.
     *
     * @param string $userAgent
     * @return array rules of the user-agent, or an empty array when it has none
     */
    public function getRules($userAgent = null)
    {
        if (is_null($userAgent)) {
            // return all rules
            return $this->rules;
        }

        $userAgent = mb_strtolower($userAgent, self::INTERNAL_ENCODING);

        return isset($this->rules[$userAgent]) ? $this->rules[$userAgent] : array();
    }

    /**
     * Get sitemaps links.
     * Sitemap always relates to all user-agents and is returned in rules with user-agent "*"
     *
     * @return array all sitemap urls
     */
    public function getSitemaps()
    {
        $rules = $this->getRules(self::USER_AGENT_ALL);
        if (!empty($rules[self::DIRECTIVE_SITEMAP])) {
            return $rules[self::DIRECTIVE_SITEMAP];
        }

        return array();
    }

    /**
     * Get the content being parsed (converted, with the trailing new line appended).
     *
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Parse rules
     *
     * Runs the state machine over the content and then removes duplicated
     * values from every list-type directive. Calling it again after the
     * content has been consumed only repeats the (idempotent) de-duplication.
     *
     * @return void
     */
    public function prepareRules()
    {
        $contentLength = mb_strlen($this->content, self::INTERNAL_ENCODING);

        // "<=" is deliberate: a value is flushed on the step AFTER its terminating
        // new line has been read, so one extra step past the last char is required
        while ($this->char_index <= $contentLength) {
            $this->step();
        }

        foreach ($this->rules as $userAgent => $directive) {
            foreach ($directive as $directiveName => $directiveValue) {
                if (is_array($directiveValue)) {
                    $this->rules[$userAgent][$directiveName] = array_values(array_unique($directiveValue));
                }
            }
        }
    }

    /**
     * Comment signal (#)
     *
     * @return bool
     */
    private function sharp()
    {
        return ($this->current_char === '#');
    }

    /**
     * Key : value pair separator signal
     *
     * @return bool
     */
    private function lineSeparator()
    {
        return ($this->current_char === ':');
    }

    /**
     * Move to new line signal (LF or CR)
     *
     * @return bool
     */
    private function newLine()
    {
        return ($this->current_char === "\n" || $this->current_char === "\r");
    }

    /**
     * "Space" signal (blank or tab).
     * Note: "\s" is not an escape sequence in PHP strings, so it can't be used here.
     *
     * @return bool
     */
    private function space()
    {
        return ($this->current_char === ' ' || $this->current_char === "\t");
    }

    /**
     * Directive signal: is the directive being processed the given one?
     *
     * @param string $directive - one of the DIRECTIVE_* constants
     * @return bool
     */
    private function isDirective($directive)
    {
        return ($this->current_directive === $directive);
    }

    /**
     * Check if the word collected so far is a known directive name (case-insensitive)
     *
     * @return bool
     */
    private function currentWordIsDirective()
    {
        return in_array(strtolower($this->current_word), array(
            self::DIRECTIVE_ALLOW,
            self::DIRECTIVE_DISALLOW,
            self::DIRECTIVE_HOST,
            self::DIRECTIVE_USERAGENT,
            self::DIRECTIVE_SITEMAP,
            self::DIRECTIVE_CRAWL_DELAY,
            self::DIRECTIVE_CLEAN_PARAM,
        ), true);
    }

    /**
     * Change state
     *
     * @param string $stateTo - state that should be set
     * @return void
     */
    private function switchState($stateTo = self::STATE_SKIP_LINE)
    {
        $this->state = $stateTo;
    }

    /**
     * Forget the word and the directive collected so far
     *
     * @return void
     */
    private function resetDirective()
    {
        $this->current_word = '';
        $this->current_directive = '';
    }

    /**
     * Process state ZERO_POINT: collect chars until they form a known directive name.
     * Anything else (unknown directive, comment line) keeps accumulating and is
     * dropped at the next new line.
     *
     * @return RobotsTxtParser
     */
    private function zeroPoint()
    {
        if ($this->currentWordIsDirective()) {
            $this->switchState(self::STATE_READ_DIRECTIVE);
            return $this;
        }

        // the previously read char ended a line - start collecting a new word
        if ($this->newLine()) {
            $this->current_word = '';
        }
        $this->increment();

        return $this;
    }

    /**
     * Read directive: a directive name has been recognised, now the ":" is expected,
     * optionally preceded by spaces. Anything else means the line is not a directive.
     *
     * @return RobotsTxtParser
     */
    private function readDirective()
    {
        $this->current_directive = strtolower(trim($this->current_word));

        $this->increment();

        if ($this->lineSeparator()) {
            $this->current_word = '';
            $this->switchState(self::STATE_READ_VALUE);
        } elseif ($this->newLine()) {
            // directive name without ":" on its line - never run into the following line
            $this->resetDirective();
            $this->switchState(self::STATE_ZERO_POINT);
        } elseif ($this->space()) {
            $this->switchState(self::STATE_SKIP_SPACE);
        } else {
            // comment or garbage right after the directive name - ignore the rest of the line
            $this->resetDirective();
            $this->switchState(self::STATE_SKIP_LINE);
        }

        return $this;
    }

    /**
     * Skip space: consume blanks between the directive name and the ":" separator
     *
     * @return RobotsTxtParser
     */
    private function skipSpace()
    {
        $this->readChar();

        if ($this->space()) {
            // keep skipping
            return $this;
        }

        if ($this->lineSeparator()) {
            $this->current_word = '';
            $this->switchState(self::STATE_READ_VALUE);
        } elseif ($this->newLine()) {
            $this->resetDirective();
            $this->switchState(self::STATE_ZERO_POINT);
        } else {
            // something other than ":" follows the directive name - not a valid line
            $this->resetDirective();
            $this->switchState(self::STATE_SKIP_LINE);
        }

        return $this;
    }

    /**
     * Skip line: consume chars up to and including the end of the current line,
     * so that comment text is never interpreted as directives
     *
     * @return RobotsTxtParser
     */
    private function skipLine()
    {
        $this->readChar();

        if ($this->newLine()) {
            $this->current_word = '';
            $this->switchState(self::STATE_ZERO_POINT);
        }

        return $this;
    }

    /**
     * Read value: collect the directive value until the end of line or a comment
     *
     * @return RobotsTxtParser
     */
    private function readValue()
    {
        if ($this->newLine()) {
            $this->assignValueToDirective();
        } elseif ($this->sharp()) {
            // drop the "#" that has just been appended to the value
            $this->current_word = mb_substr($this->current_word, 0, -1, self::INTERNAL_ENCODING);
            $this->assignValueToDirective();
            // the rest of the line is a comment
            $this->switchState(self::STATE_SKIP_LINE);
        } else {
            $this->increment();
        }

        return $this;
    }

    /**
     * Store the collected value under the current directive, then reset the
     * collected word/directive and return to the ZERO_POINT state.
     *
     * - user-agent: selects the (lower-cased) group the following rules belong to
     * - crawl-delay: single float value per user-agent
     * - sitemap: list, always stored under the "*" user-agent
     * - clean-param: list per user-agent
     * - host: single value stored under "*", only the first one is kept
     * - anything else (allow / disallow): list per user-agent, empty values are ignored
     *
     * @return void
     */
    private function assignValueToDirective()
    {
        if ($this->isDirective(self::DIRECTIVE_USERAGENT)) {
            $this->userAgent = mb_strtolower(trim($this->current_word), self::INTERNAL_ENCODING);
            if (!isset($this->rules[$this->userAgent])) {
                $this->rules[$this->userAgent] = array();
            }
        } elseif ($this->isDirective(self::DIRECTIVE_CRAWL_DELAY)) {
            $this->rules[$this->userAgent][$this->current_directive] = (float)$this->current_word;
        } elseif ($this->isDirective(self::DIRECTIVE_SITEMAP)) {
            $this->rules[self::USER_AGENT_ALL][$this->current_directive][] = $this->current_word;
        } elseif ($this->isDirective(self::DIRECTIVE_CLEAN_PARAM)) {
            $this->rules[$this->userAgent][$this->current_directive][] = trim($this->current_word);
        } elseif ($this->isDirective(self::DIRECTIVE_HOST)) {
            // save only first host directive value, assign to '*'
            if (empty($this->rules[self::USER_AGENT_ALL][$this->current_directive])) {
                $this->rules[self::USER_AGENT_ALL][$this->current_directive] = $this->current_word;
            }
        } else {
            if (!empty($this->current_word)) {
                $this->rules[$this->userAgent][$this->current_directive][] = $this->current_word;
            }
        }

        $this->resetDirective();
        $this->switchState(self::STATE_ZERO_POINT);
    }

    /**
     * Machine step: run the handler of the current state once
     *
     * @return void
     */
    private function step()
    {
        switch ($this->state) {
            case self::STATE_ZERO_POINT:
                $this->zeroPoint();
                break;

            case self::STATE_READ_DIRECTIVE:
                $this->readDirective();
                break;

            case self::STATE_SKIP_SPACE:
                $this->skipSpace();
                break;

            case self::STATE_SKIP_LINE:
                $this->skipLine();
                break;

            case self::STATE_READ_VALUE:
                $this->readValue();
                break;
        }
    }

    /**
     * Read the char at the current position and advance, without collecting it.
     * Past the end of the content the current char becomes an empty string.
     *
     * @return void
     */
    private function readChar()
    {
        $this->current_char = mb_substr($this->content, $this->char_index, 1, self::INTERNAL_ENCODING);
        $this->char_index++;
    }

    /**
     * Move to the following step: read the next char and append it to the current word.
     *
     * The word is trimmed after every char, except for clean-param and user-agent
     * whose values may legitimately contain inner spaces.
     *
     * @return void
     */
    private function increment()
    {
        $this->readChar();
        $this->current_word .= $this->current_char;
        if (!$this->isDirective(self::DIRECTIVE_CLEAN_PARAM) && !$this->isDirective(self::DIRECTIVE_USERAGENT)) {
            $this->current_word = trim($this->current_word);
        }
    }
}