shop.balmet.com

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README

robotstxtparser.php (9340B)


      1 <?php
      2 /**
      3  * Class for parsing robots.txt files
      4  *
      5  * @author Eugene Yurkevich (bopodaa@gmail.com)
      6  *
      7  *
      8  * Some useful links and materials:
      9  * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
     10  * @link https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml
     11  */
     12  
     13 namespace d_robots_txt_parser;
     14 
     15 class RobotsTxtParser
     16 {
     17 	// default encoding
     18 	const DEFAULT_ENCODING = 'UTF-8';
     19 
     20 	// states
     21 	const STATE_ZERO_POINT = 'zero-point';
     22 	const STATE_READ_DIRECTIVE = 'read-directive';
     23 	const STATE_SKIP_SPACE = 'skip-space';
     24 	const STATE_SKIP_LINE = 'skip-line';
     25 	const STATE_READ_VALUE = 'read-value';
     26 
     27 	// directives
     28 	const DIRECTIVE_ALLOW = 'allow';
     29 	const DIRECTIVE_DISALLOW = 'disallow';
     30 	const DIRECTIVE_HOST = 'host';
     31 	const DIRECTIVE_SITEMAP = 'sitemap';
     32 	const DIRECTIVE_USERAGENT = 'user-agent';
     33 	const DIRECTIVE_CRAWL_DELAY = 'crawl-delay';
     34 	const DIRECTIVE_CLEAN_PARAM = 'clean-param';
     35 
     36 	/**
     37 	 * Default user-agent
     38 	 * First off, links should be checked by specific user-agent rules. If specific user-agent isn't specified than default user-agent used.
     39 	 */
     40 	const USER_AGENT_ALL = '*';
     41 
     42 	// current state
     43 	private $state = '';
     44 
     45 	// robots.txt file content
     46 	private $content = '';
     47 
     48 	// rules set
     49 	private $rules = array();
     50 
     51 	// internally used variables
     52 	private $current_word = '';
     53 	private $current_char = '';
     54 	private $char_index = 0;
     55 	private $current_directive = '';
     56 	private $userAgent = self::USER_AGENT_ALL;
     57 
     58 	/**
     59 	 * @param  string $content - file content
     60 	 * @param  string $encoding - encoding
     61 	 * @return RobotsTxtParser
     62 	 */
     63 	public function __construct($content, $encoding = self::DEFAULT_ENCODING)
     64 	{
     65 		// convert encoding
     66 		$encoding = !empty($encoding) ? $encoding : mb_detect_encoding($content);
     67 		mb_internal_encoding($encoding);
     68 
     69 		// set content
     70 		$this->content = iconv($encoding, 'UTF-8//IGNORE', $content);
     71 		$this->content .= "\n";
     72 
     73 		// set default state
     74 		$this->state = self::STATE_ZERO_POINT;
     75 
     76 		// parse rules - default state
     77 		$this->prepareRules();
     78 	}
     79 
     80 	/**
     81 	 * Get rules by specific bot (user-agent)
     82 	 * Use $userAgent = NULL to get all rules for all user-agents grouped by user-agent. User-agents will return in lower case.
     83 	 * Use $userAgent = '*' to get common rules.
     84 	 * Use $userAgent = 'YandexBot' to get rules for user-agent 'YandexBot'.
     85 	 *
     86 	 * @param string $userAgent
     87 	 * @return array
     88 	 */
     89 	public function getRules($userAgent = NULL)
     90 	{
     91 		if (is_null($userAgent)) {
     92 			//return all rules
     93 			return $this->rules;
     94 		}
     95 		else {
     96 			$userAgent = mb_strtolower($userAgent);
     97 			if (isset($this->rules[$userAgent])) {
     98 				return $this->rules[$userAgent];
     99 			}
    100 			else {
    101 				return array();
    102 			}
    103 		}
    104 	}
    105 
    106 	/**
    107 	 * Get sitemaps links.
    108 	 * Sitemap always relates to all user-agents and return in rules with user-agent "*"
    109 	 *
    110 	 * @return array  all sitemap urls
    111 	 */
    112 	public function getSitemaps()
    113 	{
    114 		$rules = $this->getRules(self::USER_AGENT_ALL);
    115 		if (!empty($rules[self::DIRECTIVE_SITEMAP])) {
    116 			return $rules[self::DIRECTIVE_SITEMAP];
    117 		}
    118 
    119 		return array();
    120 	}
    121 
    122 	public function getContent()
    123 	{
    124 		return $this->content;
    125 	}
    126 
    127 	/**
    128 	 * Comment signal (#)
    129 	 */
    130 	private function sharp()
    131 	{
    132 		return ($this->current_char == '#');
    133 	}
    134 
    135 	/**
    136 	 * Allow directive signal
    137 	 */
    138 	private function directiveAllow()
    139 	{
    140 		return ($this->current_directive == self::DIRECTIVE_ALLOW);
    141 	}
    142 
    143 	/**
    144 	 * Disallow directive signal
    145 	 */
    146 	private function directiveDisallow()
    147 	{
    148 		return ($this->current_directive == self::DIRECTIVE_DISALLOW);
    149 	}
    150 
    151 	/**
    152 	 * Host directive signal
    153 	 */
    154 	private function directiveHost()
    155 	{
    156 		return ($this->current_directive == self::DIRECTIVE_HOST);
    157 	}
    158 
    159 	/**
    160 	 * Sitemap directive signal
    161 	 */
    162 	private function directiveSitemap()
    163 	{
    164 		return ($this->current_directive == self::DIRECTIVE_SITEMAP);
    165 	}
    166 
    167 	/**
    168 	 * Clean-param directive signal
    169 	 */
    170 	private function directiveCleanParam()
    171 	{
    172 		return ($this->current_directive == self::DIRECTIVE_CLEAN_PARAM);
    173 	}
    174 
    175 	/**
    176 	 * User-agent directive signal
    177 	 */
    178 	private function directiveUserAgent()
    179 	{
    180 		return ($this->current_directive == self::DIRECTIVE_USERAGENT);
    181 	}
    182 
    183 	/**
    184 	 * Crawl-Delay directive signal
    185 	 */
    186 	private function directiveCrawlDelay()
    187 	{
    188 		return ($this->current_directive == self::DIRECTIVE_CRAWL_DELAY);
    189 	}
    190 
    191 	/**
    192 	 * Key : value pair separator signal
    193 	 */
    194 	private function lineSeparator()
    195 	{
    196 		return ($this->current_char == ':');
    197 	}
    198 
    199 	/**
    200 	 * Move to new line signal
    201 	 */
    202 	private function newLine()
    203 	{
    204 		$asciiCode = ord($this->current_char);
    205 
    206 		return ($this->current_char == "\n"
    207 			|| $asciiCode == 13
    208 			|| $asciiCode == 10
    209 			|| $this->current_word == "\r\n"
    210 			|| $this->current_word == "\n\r"
    211 		);
    212 	}
    213 
    214 	/**
    215 	 * "Space" signal
    216 	 */
    217 	private function space()
    218 	{
    219 		return ($this->current_char == "\s");
    220 	}
    221 
    222 	/**
    223 	 * Change state
    224 	 *
    225 	 * @param string $stateTo - state that should be set
    226 	 * @return void
    227 	 */
    228 	private function switchState($stateTo = self::STATE_SKIP_LINE)
    229 	{
    230 		$this->state = $stateTo;
    231 	}
    232 
    233 	/**
    234 	 * Parse rules
    235 	 *
    236 	 * @return void
    237 	 */
    238 	public function prepareRules()
    239 	{
    240 		$contentLength = mb_strlen($this->content);
    241 		while ($this->char_index <= $contentLength) {
    242 			$this->step();
    243 		}
    244 
    245 		foreach ($this->rules as $userAgent => $directive) {
    246 			foreach ($directive as $directiveName => $directiveValue) {
    247 				if (is_array($directiveValue)) {
    248 					$this->rules[$userAgent][$directiveName] = array_values(array_unique($directiveValue));
    249 				}
    250 			}
    251 		}
    252 	}
    253 
    254 	/**
    255 	 * Check if we should switch
    256 	 * @return bool
    257 	 */
    258 	private function shouldSwitchToZeroPoint()
    259 	{
    260 		return in_array(strtolower($this->current_word), array(
    261 			self::DIRECTIVE_ALLOW,
    262 			self::DIRECTIVE_DISALLOW,
    263 			self::DIRECTIVE_HOST,
    264 			self::DIRECTIVE_USERAGENT,
    265 			self::DIRECTIVE_SITEMAP,
    266 			self::DIRECTIVE_CRAWL_DELAY,
    267 			self::DIRECTIVE_CLEAN_PARAM,
    268 		), true);
    269 	}
    270 
    271 	/**
    272 	 * Process state ZERO_POINT
    273 	 * @return RobotsTxtParser
    274 	 */
    275 	private function zeroPoint()
    276 	{
    277 		if ($this->shouldSwitchToZeroPoint()) {
    278 			$this->switchState(self::STATE_READ_DIRECTIVE);
    279 		} // unknown directive - skip it
    280 		elseif ($this->newLine()) {
    281 			$this->current_word = "";
    282 			$this->increment();
    283 		}
    284 		else {
    285 			$this->increment();
    286 		}
    287 		return $this;
    288 	}
    289 
    290 	/**
    291 	 * Read directive
    292 	 * @return RobotsTxtParser
    293 	 */
    294 	private function readDirective()
    295 	{
    296 		$this->current_directive = strtolower(trim($this->current_word));
    297 
    298 		$this->increment();
    299 
    300 		if ($this->lineSeparator()) {
    301 			$this->current_word = "";
    302 			$this->switchState(self::STATE_READ_VALUE);
    303 		}
    304 		else {
    305 			if ($this->space()) {
    306 				$this->switchState(self::STATE_SKIP_SPACE);
    307 			}
    308 			if ($this->sharp()) {
    309 				$this->switchState(self::STATE_SKIP_LINE);
    310 			}
    311 		}
    312 		return $this;
    313 	}
    314 
    315 	/**
    316 	 * Skip space
    317 	 * @return RobotsTxtParser
    318 	 */
    319 	private function skipSpace()
    320 	{
    321 		$this->char_index++;
    322 		$this->current_word = mb_substr($this->current_word, -1);
    323 		return $this;
    324 	}
    325 
    326 	/**
    327 	 * Skip line
    328 	 * @return RobotsTxtParser
    329 	 */
    330 	private function skipLine()
    331 	{
    332 		$this->char_index++;
    333 		$this->switchState(self::STATE_ZERO_POINT);
    334 		return $this;
    335 	}
    336 
    337 	/**
    338 	 * Read value
    339 	 * @return RobotsTxtParser
    340 	 */
    341 	private function readValue()
    342 	{
    343 		if ($this->newLine()) {
    344 			$this->assignValueToDirective();
    345 		}
    346 		elseif ($this->sharp()) {
    347 			$this->current_word = mb_substr($this->current_word, 0, -1);
    348 			$this->assignValueToDirective();
    349 		}
    350 		else {
    351 			$this->increment();
    352 		}
    353 		return $this;
    354 	}
    355 
    356 	private function assignValueToDirective()
    357 	{
    358 		if ($this->directiveUserAgent()) {
    359 			$this->userAgent = mb_strtolower(trim($this->current_word));
    360 			if (!isset($this->rules[$this->userAgent])) {
    361 				$this->rules[$this->userAgent] = array();
    362 			}
    363 		}
    364 		elseif ($this->directiveCrawlDelay()) {
    365 			$this->rules[$this->userAgent][$this->current_directive] = (double)$this->current_word;
    366 		}
    367 		elseif ($this->directiveSitemap()) {
    368 			$this->rules[self::USER_AGENT_ALL][$this->current_directive][] = $this->current_word;
    369 		}
    370 		elseif ($this->directiveCleanParam()) {
    371 			$this->rules[$this->userAgent][$this->current_directive][] = trim($this->current_word);
    372 		}
    373 		elseif ($this->directiveHost()) {
    374 			if (empty($this->rules['*'][$this->current_directive])) { // save only first host directive value, assign to '*'
    375 				$this->rules['*'][$this->current_directive] = $this->current_word;
    376 			}
    377 		}
    378 		else {
    379 			if (!empty($this->current_word)) {
    380 				$this->rules[$this->userAgent][$this->current_directive][] = $this->current_word;
    381 			}
    382 		}
    383 		$this->current_word = '';
    384 		$this->current_directive = '';
    385 		$this->switchState(self::STATE_ZERO_POINT);
    386 	}
    387 
    388 	/**
    389 	 * Machine step
    390 	 *
    391 	 * @return void
    392 	 */
    393 	private function step()
    394 	{
    395 		switch ($this->state) {
    396 			case self::STATE_ZERO_POINT:
    397 				$this->zeroPoint();
    398 				break;
    399 
    400 			case self::STATE_READ_DIRECTIVE:
    401 				$this->readDirective();
    402 				break;
    403 
    404 			case self::STATE_SKIP_SPACE:
    405 				$this->skipSpace();
    406 				break;
    407 
    408 			case self::STATE_SKIP_LINE:
    409 				$this->skipLine();
    410 				break;
    411 
    412 			case self::STATE_READ_VALUE:
    413 				$this->readValue();
    414 				break;
    415 		}
    416 	}
    417 
    418 	/**
    419 	 * Move to the following step
    420 	 *
    421 	 * @return void
    422 	 */
    423 	private function increment()
    424 	{
    425 		$this->current_char = mb_substr($this->content, $this->char_index, 1);
    426 		$this->current_word .= $this->current_char;
    427 		if (!$this->directiveCleanParam() && !$this->directiveUserAgent()) {
    428 			$this->current_word = trim($this->current_word);
    429 		}
    430 		$this->char_index++;
    431 	}
    432 }