robotstxtvalidator.php
<?php
/**
 * Class RobotsTxtValidator
 *
 * Checks whether a URL is allowed to be crawled by a specific user-agent,
 * according to robots.txt rules.
 */

namespace d_robots_txt_parser;

use Exception;

class RobotsTxtValidator
{
    /**
     * @var array Directives ordered by rule length, cached per user-agent,
     *            used to determine whether a URL is allowed or disallowed
     */
    private $orderedDirectivesCache = array();

    /**
     * @var array All rules from RobotsTxtParser
     */
    private $rules;

    /**
     * RobotsTxtValidator constructor
     *
     * @param array $rules Array of all rules from class RobotsTxtParser
     */
    public function __construct(array $rules)
    {
        $this->rules = $rules;
    }

    /**
     * Returns true if the url is allowed to be crawled according to the robots.txt rules, false otherwise
     *
     * @param string $url
     * @param string $userAgent
     * @return bool
     */
    public function isUrlAllow($url, $userAgent = '*')
    {
        $relativeUrl = $this->getRelativeUrl($url);

        $orderedDirectives = $this->getOrderedDirectivesByUserAgent($userAgent);

        // If there are no allow directives at all, the url can be declared
        // disallowed on the first matching disallow rule - nothing later could
        // override it, so we return early for speed.
        $hasAllowDirectives = false;
        foreach ($orderedDirectives as $directiveRow) {
            if ($directiveRow['directive'] == 'allow') {
                $hasAllowDirectives = true;
                break;
            }
        }

        // Directives are ordered from the shortest rule to the longest, so the
        // last matching directive - the most specific one - wins.
        $isAllow = true;
        foreach ($orderedDirectives as $directiveRow) {
            if (!in_array($directiveRow['directive'], array('allow', 'disallow'))) {
                continue;
            }

            if (preg_match($directiveRow['rule_regexp'], $relativeUrl)) {
                if ($directiveRow['directive'] == 'allow') {
                    $isAllow = true;
                }
                else {
                    if (!$hasAllowDirectives) {
                        return false;
                    }

                    $isAllow = false;
                }
            }
        }

        return $isAllow;
    }

    /**
     * Returns true if the url is disallowed to be crawled according to the robots.txt rules, false otherwise
     *
     * @param string $url
     * @param string $userAgent
     * @return bool
     */
    public function isUrlDisallow($url, $userAgent = '*')
    {
        return !$this->isUrlAllow($url, $userAgent);
    }
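    /*
     * Illustrative sketch (not part of the class API): how the longest-match
     * precedence above plays out. The rule values here are assumptions, but
     * any robots.txt with these directives behaves the same way:
     *
     *   User-agent: *
     *   Disallow: /admin
     *   Allow: /admin/public
     *
     * For "/admin/public/index.html" both rules match; "/admin/public" is
     * longer, is checked last, and wins, so isUrlAllow() returns true.
     * For "/admin/secret" only "/admin" matches, so isUrlAllow() returns false.
     */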
    /**
     * Get the allow and disallow directives for a specific user-agent, ordered by rule length
     * If you have already stored robots.txt rules in a database, you can use a query like this to fetch ordered rules:
     * mysql> SELECT directive, value FROM robots_txt WHERE site_id = ?d AND directive IN ('allow', 'disallow') AND user_agent = ? ORDER BY CHAR_LENGTH(value) ASC;
     *
     * @param string $userAgent
     * @return array
     */
    private function getOrderedDirectivesByUserAgent($userAgent)
    {
        if (!isset($this->orderedDirectivesCache[$userAgent])) {
            if (!empty($this->rules[$userAgent])) {
                // put the ordered directives into the per-instance cache
                $this->orderedDirectivesCache[$userAgent] = $this->orderDirectives($this->rules[$userAgent]);
            }
            else {
                $this->orderedDirectivesCache[$userAgent] = array();
            }
        }

        return $this->orderedDirectivesCache[$userAgent];
    }

    /**
     * Order directives by rule length in characters, shortest first
     *
     * @param array $rules
     * @return array $directives
     */
    private function orderDirectives(array $rules)
    {
        $directives = array();

        $allowRules = !empty($rules['allow']) ? $rules['allow'] : array();
        $disallowRules = !empty($rules['disallow']) ? $rules['disallow'] : array();

        foreach ($allowRules as $rule) {
            $directives[] = array(
                'directive' => 'allow',
                'rule' => $rule,
                'rule_regexp' => $this->prepareRegexpRule($rule),
            );
        }

        foreach ($disallowRules as $rule) {
            $directives[] = array(
                'directive' => 'disallow',
                'rule' => $rule,
                'rule_regexp' => $this->prepareRegexpRule($rule),
            );
        }

        usort($directives, function ($row1, $row2) {
            // usort expects an integer, not a boolean: negative, zero or
            // positive depending on the relative order of the two rules.
            return mb_strlen($row1['rule']) - mb_strlen($row2['rule']);
        });

        return $directives;
    }

    /**
     * Always returns a relative url (without scheme and domain) starting with "/", e.g.:
     *
     * http://example.com/test -> /test
     * https://example.com/test/path -> /test/path
     * /test/any/path -> /test/any/path
     * http://example.com -> /
     * / -> /
     * /some/path -> /some/path
     *
     * @param string $url
     * @return string
     * @throws Exception
     */
    private function getRelativeUrl($url)
    {
        if (!$url) {
            throw new Exception('Url should not be empty');
        }

        if (!preg_match('!^https?://!i', $url)) {
            if ($url[0] !== '/') {
                throw new Exception('Url should start with "/" or include a protocol and domain, got ' . $url);
            }

            return $url;
        }

        // parse_url() returns null when the url has no path component
        // (e.g. "http://example.com"), which maps to the root path "/".
        return parse_url($url, PHP_URL_PATH) ?: '/';
    }

    /**
     * Convert a robots.txt rule to a PHP regexp
     *
     * @param string $ruleValue
     * @return string
     */
    private static function prepareRegexpRule($ruleValue)
    {
        // Escape the regexp metacharacters that appear literally in robots.txt
        // rules ("$", "?" and "."), then expand the robots.txt wildcard "*" to
        // ".*". The order matters: "." must be escaped before "*" introduces
        // new dots via ".*".
        $replacements = array(
            '/\$/' => '\$',
            '/\?/' => '\?',
            '/\./' => '\.',
            '/\*/' => '.*',
        );

        $ruleValue = preg_replace(array_keys($replacements), array_values($replacements), $ruleValue);

        // Anchor the rule at the start of the path and escape slashes for the
        // "/" regexp delimiter, collapsing any double-escaped slashes.
        $regexp = '/^' . str_replace('/', '\/', $ruleValue) . '/';
        return str_replace('\\\\/', '\/', $regexp);
    }
}
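/*
 * Usage sketch (illustrative; assumes the $rules array has the shape produced
 * by RobotsTxtParser - user-agent keys mapping to lists of 'allow' and
 * 'disallow' rule values):
 *
 *   $rules = array(
 *       '*' => array(
 *           'disallow' => array('/admin'),
 *           'allow'    => array('/admin/public'),
 *       ),
 *   );
 *
 *   $validator = new RobotsTxtValidator($rules);
 *   var_dump($validator->isUrlAllow('http://example.com/admin/public/page')); // bool(true)
 *   var_dump($validator->isUrlDisallow('/admin/secret'));                     // bool(true)
 */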