
robotstxtvalidator.php


<?php
/**
 * Class RobotsTxtValidator
 *
 * Checks whether a URL is allowed to be crawled by a specific user-agent
 * according to robots.txt rules.
 */
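
/*
 * Usage sketch (values illustrative): $rules is the per-user-agent array
 * produced by RobotsTxtParser, e.g.:
 *
 *   $rules = array(
 *       '*' => array(
 *           'disallow' => array('/admin'),
 *           'allow'    => array('/admin/public'),
 *       ),
 *   );
 *   $validator = new RobotsTxtValidator($rules);
 *   $validator->isUrlAllow('http://example.com/admin');        // false
 *   $validator->isUrlAllow('http://example.com/admin/public'); // true
 */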

namespace d_robots_txt_parser;

use Exception;

class RobotsTxtValidator
{
	/**
	 * @var array  Directives ordered by rule length, cached per user-agent
	 */
	private $orderedDirectivesCache;

	/**
	 * @var array  All rules from RobotsTxtParser
	 */
	private $rules;

	/**
	 * RobotsTxtValidator constructor
	 *
	 * @param array $rules  Array of all rules from class RobotsTxtParser
	 */
	public function __construct(array $rules)
	{
		$this->rules = $rules;
	}

	/**
	 * Returns true if the URL is allowed to be crawled under the robots.txt rules, otherwise false
	 *
	 * @param string $url
	 * @param string $userAgent
	 * @return bool
	 */
	public function isUrlAllow($url, $userAgent = '*')
	{
		$relativeUrl = $this->getRelativeUrl($url);

		$orderedDirectives = $this->getOrderedDirectivesByUserAgent($userAgent);

		// If there are no allow directives, a single matching disallow rule already
		// decides the result, so we can return early instead of scanning every rule.
		$hasAllowDirectives = false;
		foreach ($orderedDirectives as $directiveRow) {
			if ($directiveRow['directive'] === 'allow') {
				$hasAllowDirectives = true;
				break;
			}
		}

		// Directives are ordered by ascending rule length, so the longest
		// (most specific) matching rule determines the final result.
		$isAllow = true;
		foreach ($orderedDirectives as $directiveRow) {
			if (!in_array($directiveRow['directive'], array('allow', 'disallow'))) {
				continue;
			}

			if (preg_match($directiveRow['rule_regexp'], $relativeUrl)) {
				if ($directiveRow['directive'] === 'allow') {
					$isAllow = true;
				}
				else {
					if (!$hasAllowDirectives) {
						return false;
					}

					$isAllow = false;
				}
			}
		}

		return $isAllow;
	}
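
	/*
	 * A sketch of the precedence implemented above (rule values illustrative):
	 * with disallow '/shop' and allow '/shop/public', directives are checked
	 * in ascending rule length, so the longest matching rule wins:
	 *
	 *   $validator->isUrlAllow('/shop/cart');   // false (only '/shop' matches)
	 *   $validator->isUrlAllow('/shop/public'); // true  ('/shop/public' is longer)
	 */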

	/**
	 * Returns true if the URL is disallowed to be crawled under the robots.txt rules, otherwise false
	 *
	 * @param string $url
	 * @param string $userAgent
	 * @return bool
	 */
	public function isUrlDisallow($url, $userAgent = '*')
	{
		return !$this->isUrlAllow($url, $userAgent);
	}

	/**
	 * Get the allow and disallow directives for a specific user-agent, ordered by rule length.
	 * If you have already stored robots.txt rules in a database, you can fetch the ordered rules with a query like:
	 * mysql> SELECT directive, value FROM robots_txt WHERE site_id = ?d AND directive IN ('allow','disallow') AND user_agent = ? ORDER BY CHAR_LENGTH(value) ASC;
	 *
	 * @param string $userAgent
	 * @return array
	 */
	private function getOrderedDirectivesByUserAgent($userAgent)
	{
		if (!isset($this->orderedDirectivesCache[$userAgent])) {
			if (!empty($this->rules[$userAgent])) {
				// cache the ordered directives for this user-agent
				$this->orderedDirectivesCache[$userAgent] = $this->orderDirectives($this->rules[$userAgent]);
			}
			else {
				$this->orderedDirectivesCache[$userAgent] = array();
			}
		}

		return $this->orderedDirectivesCache[$userAgent];
	}

	/**
	 * Order directives by rule character length, shortest first
	 *
	 * @param array $rules
	 * @return array $directives
	 */
	private function orderDirectives(array $rules)
	{
		$directives = array();

		$allowRules = !empty($rules['allow']) ? $rules['allow'] : array();
		$disallowRules = !empty($rules['disallow']) ? $rules['disallow'] : array();

		foreach ($allowRules as $rule) {
			$directives[] = array(
				'directive' => 'allow',
				'rule' => $rule,
				'rule_regexp' => $this->prepareRegexpRule($rule),
			);
		}

		foreach ($disallowRules as $rule) {
			$directives[] = array(
				'directive' => 'disallow',
				'rule' => $rule,
				'rule_regexp' => $this->prepareRegexpRule($rule),
			);
		}

		// usort comparators must return an integer, not a boolean,
		// so compare the lengths by subtraction.
		usort($directives, function ($row1, $row2) {
			return mb_strlen($row1['rule']) - mb_strlen($row2['rule']);
		});

		return $directives;
	}
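
	/*
	 * For the illustrative input array('allow' => array('/shop/public'),
	 * 'disallow' => array('/shop')), orderDirectives() returns, shortest rule first:
	 *
	 *   array(
	 *       array('directive' => 'disallow', 'rule' => '/shop',        'rule_regexp' => '/^\/shop/'),
	 *       array('directive' => 'allow',    'rule' => '/shop/public', 'rule_regexp' => '/^\/shop\/public/'),
	 *   )
	 */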

	/**
	 * Always returns the relative URL (without the domain), starting with "/", e.g.:
	 *
	 * http://example.com/test       -> /test
	 * https://example.com/test/path -> /test/path
	 * /test/any/path                -> /test/any/path
	 * http://example.com            -> /
	 * /                             -> /
	 * /some/path                    -> /some/path
	 *
	 * @param string $url
	 * @return string
	 * @throws Exception
	 */
	private function getRelativeUrl($url)
	{
		if (!$url) {
			throw new Exception('Url should not be empty');
		}

		if (!preg_match('!^https?://!i', $url)) {
			if ($url[0] !== '/') {
				throw new Exception('Url should start with "/" or include a protocol and domain, got ' . $url);
			}

			return $url;
		}

		// parse_url() returns null when the URL has no path (e.g. http://example.com),
		// so fall back to "/" as documented above.
		return parse_url($url, PHP_URL_PATH) ?: '/';
	}

	/**
	 * Convert a robots.txt rule to a PHP regexp
	 *
	 * @param string $ruleValue
	 * @return string
	 */
	private static function prepareRegexpRule($ruleValue)
	{
		// Escape all regexp metacharacters first, then restore the '*'
		// wildcard, which matches any sequence of characters in robots.txt.
		$ruleValue = preg_quote($ruleValue, '/');
		$ruleValue = str_replace('\*', '.*', $ruleValue);

		// A '$' at the end of the rule means "end of URL"; preg_quote()
		// escaped it, so restore the anchor.
		if (substr($ruleValue, -2) === '\$') {
			$ruleValue = substr($ruleValue, 0, -2) . '$';
		}

		return '/^' . $ruleValue . '/';
	}
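
	/*
	 * Example conversions (illustrative rules, matching the implementation above):
	 *
	 *   /admin     ->  /^\/admin/
	 *   /img*.png$ ->  /^\/img.*\.png$/
	 */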
}