d_simple_html_dom.php (74361B)
1 <?php 2 /** 3 * Website: http://sourceforge.net/projects/simplehtmldom/ 4 * Additional projects: http://sourceforge.net/projects/debugobject/ 5 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) 6 * Contributions by: 7 * Yousuke Kumakura (Attribute filters) 8 * Vadim Voituk (Negative indexes supports of "find" method) 9 * Antcs (Constructor with automatically load contents either text or file/url) 10 * 11 * all affected sections have comments starting with "PaperG" 12 * 13 * Paperg - Added case insensitive testing of the value of the selector. 14 * 15 * Paperg - Added tag_start for the starting index of tags - NOTE: This works 16 * but not accurately. This tag_start gets counted AFTER \r\n have been crushed 17 * out, and after the remove_noice calls so it will not reflect the REAL 18 * position of the tag in the source, it will almost always be smaller by some 19 * amount. We use this to determine how far into the file the tag in question 20 * is. This "percentage" will never be accurate as the $dom->size is the "real" 21 * number of bytes the dom was created from. But for most purposes, it's a 22 * really good estimation. 23 * 24 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags 25 * closed is great for malformed html, but it CAN lead to parsing errors. 26 * 27 * Allow the user to tell us how much they trust the html. 28 * 29 * Paperg add the text and plaintext to the selectors for the find syntax. 30 * plaintext implies text in the innertext of a node. text implies that the 31 * tag is a text node. This allows for us to find tags based on the text they 32 * contain. 33 * 34 * Create find_ancestor_tag to see if a tag is - at any level - inside of 35 * another specific tag. 36 * 37 * Paperg: added parse_charset so that we know about the character set of 38 * the source document. NOTE: If the user's system has a routine called 39 * get_last_retrieve_url_contents_content_type availalbe, we will assume it's 40 * returning the content-type header from the last transfer or curl_exec, and 41 * we will parse that and use it in preference to any other method of charset 42 * detection. 43 * 44 * Found infinite loop in the case of broken html in restore_noise. Rewrote to 45 * protect from that. 46 * 47 * PaperG (John Schlick) Added get_display_size for "IMG" tags. 48 * 49 * Licensed under The MIT License 50 * Redistributions of files must retain the above copyright notice. 51 * 52 * @author S.C. Chen <me578022@gmail.com> 53 * @author John Schlick 54 * @author Rus Carroll 55 * @version Rev. 1.8.1 (247) 56 * @package PlaceLocalInclude 57 * @subpackage d_simple_html_dom 58 */ 59 60 /** 61 * All of the Defines for the classes below. 62 * @author S.C. Chen <me578022@gmail.com> 63 */ 64 define('HDOM_TYPE_ELEMENT', 1); 65 define('HDOM_TYPE_COMMENT', 2); 66 define('HDOM_TYPE_TEXT', 3); 67 define('HDOM_TYPE_ENDTAG', 4); 68 define('HDOM_TYPE_ROOT', 5); 69 define('HDOM_TYPE_UNKNOWN', 6); 70 define('HDOM_QUOTE_DOUBLE', 0); 71 define('HDOM_QUOTE_SINGLE', 1); 72 define('HDOM_QUOTE_NO', 3); 73 define('HDOM_INFO_BEGIN', 0); 74 define('HDOM_INFO_END', 1); 75 define('HDOM_INFO_QUOTE', 2); 76 define('HDOM_INFO_SPACE', 3); 77 define('HDOM_INFO_TEXT', 4); 78 define('HDOM_INFO_INNER', 5); 79 define('HDOM_INFO_OUTER', 6); 80 define('HDOM_INFO_ENDSPACE', 7); 81 82 /** The default target charset */ 83 defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8'); 84 85 /** The default <br> text used instead of <br> tags when returning text */ 86 defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n"); 87 88 /** The default <span> text used instead of <span> tags when returning text */ 89 defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' '); 90 91 /** The maximum file size the parser should load */ 92 defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000); 93 94 /** Contents between curly braces "{" and "}" are interpreted as text */ 95 define('HDOM_SMARTY_AS_TEXT', 1); 96 97 // helper functions 98 // ----------------------------------------------------------------------------- 99 // get html dom from file 100 // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1. 101 function file_get_html( 102 $url, 103 $use_include_path = false, 104 $context = null, 105 $offset = 0, 106 $maxLen = -1, 107 $lowercase = true, 108 $forceTagsClosed = true, 109 $target_charset = DEFAULT_TARGET_CHARSET, 110 $stripRN = true, 111 $defaultBRText = DEFAULT_BR_TEXT, 112 $defaultSpanText = DEFAULT_SPAN_TEXT) 113 { 114 // Ensure maximum length is greater than zero 115 if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; } 116 117 // We DO force the tags to be terminated. 118 $dom = new d_simple_html_dom( 119 null, 120 $lowercase, 121 $forceTagsClosed, 122 $target_charset, 123 $stripRN, 124 $defaultBRText, 125 $defaultSpanText); 126 127 /** 128 * For sourceforge users: uncomment the next line and comment the 129 * retrieve_url_contents line 2 lines down if it is not already done. 130 */ 131 $contents = file_get_contents( 132 $url, 133 $use_include_path, 134 $context, 135 $offset, 136 $maxLen); 137 138 // Paperg - use our own mechanism for getting the contents as we want to 139 // control the timeout. 140 // $contents = retrieve_url_contents($url); 141 if (empty($contents) || strlen($contents) > $maxLen) { return false; } 142 143 // The second parameter can force the selectors to all be lowercase. 144 $dom->load($contents, $lowercase, $stripRN); 145 return $dom; 146 } 147 148 // get html dom from string 149 function str_get_html( 150 $str, 151 $lowercase = true, 152 $forceTagsClosed = true, 153 $target_charset = DEFAULT_TARGET_CHARSET, 154 $stripRN = true, 155 $defaultBRText = DEFAULT_BR_TEXT, 156 $defaultSpanText = DEFAULT_SPAN_TEXT) 157 { 158 $dom = new d_simple_html_dom( 159 null, 160 $lowercase, 161 $forceTagsClosed, 162 $target_charset, 163 $stripRN, 164 $defaultBRText, 165 $defaultSpanText); 166 167 if (empty($str) || strlen($str) > MAX_FILE_SIZE) { 168 $dom->clear(); 169 return false; 170 } 171 172 $dom->load($str, $lowercase, $stripRN); 173 return $dom; 174 } 175 176 // dump html dom tree 177 function dump_html_tree($node, $show_attr = true, $deep = 0) 178 { 179 $node->dump($node); 180 } 181 182 /** 183 * simple html dom node 184 * PaperG - added ability for "find" routine to lowercase the value of the 185 * selector. 186 * 187 * PaperG - added $tag_start to track the start position of the tag in the total 188 * byte index 189 * 190 * @package PlaceLocalInclude 191 */ 192 class d_simple_html_dom_node 193 { 194 /** 195 * Node type 196 * 197 * Default is {@see HDOM_TYPE_TEXT} 198 * 199 * @var int 200 */ 201 public $nodetype = HDOM_TYPE_TEXT; 202 203 /** 204 * Tag name 205 * 206 * Default is 'text' 207 * 208 * @var string 209 */ 210 public $tag = 'text'; 211 212 /** 213 * List of attributes 214 * 215 * @var array 216 */ 217 public $attr = array(); 218 219 /** 220 * List of child node objects 221 * 222 * @var array 223 */ 224 public $children = array(); 225 public $nodes = array(); 226 227 /** 228 * The parent node object 229 * 230 * @var object|null 231 */ 232 public $parent = null; 233 234 // The "info" array - see HDOM_INFO_... for what each element contains. 235 public $_ = array(); 236 237 /** 238 * Start position of the tag in the document 239 * 240 * @var int 241 */ 242 public $tag_start = 0; 243 244 /** 245 * The DOM object 246 * 247 * @var object|null 248 */ 249 private $dom = null; 250 251 /** 252 * Construct new node object 253 * 254 * Adds itself to the list of DOM Nodes {@see d_simple_html_dom::$nodes} 255 */ 256 function __construct($dom) 257 { 258 $this->dom = $dom; 259 $dom->nodes[] = $this; 260 } 261 262 function __destruct() 263 { 264 $this->clear(); 265 } 266 267 function __toString() 268 { 269 return $this->outertext(); 270 } 271 272 // clean up memory due to php5 circular references memory leak... 273 function clear() 274 { 275 $this->dom = null; 276 $this->nodes = null; 277 $this->parent = null; 278 $this->children = null; 279 } 280 281 // dump node's tree 282 function dump($show_attr = true, $deep = 0) 283 { 284 $lead = str_repeat(' ', $deep); 285 286 echo $lead . $this->tag; 287 288 if ($show_attr && count($this->attr) > 0) { 289 echo '('; 290 foreach ($this->attr as $k => $v) { 291 echo "[$k]=>\"" . $this->$k . '", '; 292 } 293 echo ')'; 294 } 295 296 echo "\n"; 297 298 if ($this->nodes) { 299 foreach ($this->nodes as $c) { 300 $c->dump($show_attr, $deep + 1); 301 } 302 } 303 } 304 305 306 // Debugging function to dump a single dom node with a bunch of information about it. 307 function dump_node($echo = true) 308 { 309 $string = $this->tag; 310 311 if (count($this->attr) > 0) { 312 $string .= '('; 313 foreach ($this->attr as $k => $v) { 314 $string .= "[$k]=>\"" . $this->$k . '", '; 315 } 316 $string .= ')'; 317 } 318 319 if (count($this->_) > 0) { 320 $string .= ' $_ ('; 321 foreach ($this->_ as $k => $v) { 322 if (is_array($v)) { 323 $string .= "[$k]=>("; 324 foreach ($v as $k2 => $v2) { 325 $string .= "[$k2]=>\"" . $v2 . '", '; 326 } 327 $string .= ')'; 328 } else { 329 $string .= "[$k]=>\"" . $v . '", '; 330 } 331 } 332 $string .= ')'; 333 } 334 335 if (isset($this->text)) { 336 $string .= ' text: (' . $this->text . ')'; 337 } 338 339 $string .= " HDOM_INNER_INFO: '"; 340 341 if (isset($node->_[HDOM_INFO_INNER])) { 342 $string .= $node->_[HDOM_INFO_INNER] . "'"; 343 } else { 344 $string .= ' NULL '; 345 } 346 347 $string .= ' children: ' . count($this->children); 348 $string .= ' nodes: ' . count($this->nodes); 349 $string .= ' tag_start: ' . $this->tag_start; 350 $string .= "\n"; 351 352 if ($echo) { 353 echo $string; 354 return; 355 } else { 356 return $string; 357 } 358 } 359 360 /** 361 * Return or set parent node 362 * 363 * @param object|null $parent (optional) The parent node, `null` to return 364 * the current parent node. 365 * @return object|null The parent node 366 */ 367 function parent($parent = null) 368 { 369 // I am SURE that this doesn't work properly. 370 // It fails to unset the current node from it's current parents nodes or 371 // children list first. 372 if ($parent !== null) { 373 $this->parent = $parent; 374 $this->parent->nodes[] = $this; 375 $this->parent->children[] = $this; 376 } 377 378 return $this->parent; 379 } 380 381 /** 382 * @return bool True if the node has at least one child node 383 */ 384 function has_child() 385 { 386 return !empty($this->children); 387 } 388 389 /** 390 * Get child node at specified index 391 * 392 * @param int $idx The index of the child node to return, `-1` to return all 393 * child nodes. 394 * @return object|array|null The child node at the specified index, all child 395 * nodes or null if the index is invalid. 396 */ 397 function children($idx = -1) 398 { 399 if ($idx === -1) { 400 return $this->children; 401 } 402 403 if (isset($this->children[$idx])) { 404 return $this->children[$idx]; 405 } 406 407 return null; 408 } 409 410 /** 411 * Get first child node 412 * 413 * @return object|null The first child node or null if the current node has 414 * no child nodes. 415 * 416 * @todo Use `empty()` instead of `count()` to improve performance on large 417 * arrays. 418 */ 419 function first_child() 420 { 421 if (count($this->children) > 0) { 422 return $this->children[0]; 423 } 424 return null; 425 } 426 427 /** 428 * Get last child node 429 * 430 * @return object|null The last child node or null if the current node has 431 * no child nodes. 432 * 433 * @todo Use `end()` to slightly improve performance on large arrays. 434 */ 435 function last_child() 436 { 437 if (($count = count($this->children)) > 0) { 438 return $this->children[$count - 1]; 439 } 440 return null; 441 } 442 443 /** 444 * Get next sibling node 445 * 446 * @return object|null The sibling node or null if the current node has no 447 * sibling nodes. 448 */ 449 function next_sibling() 450 { 451 if ($this->parent === null) { 452 return null; 453 } 454 455 $idx = 0; 456 $count = count($this->parent->children); 457 458 while ($idx < $count && $this !== $this->parent->children[$idx]) { 459 ++$idx; 460 } 461 462 if (++$idx >= $count) { 463 return null; 464 } 465 466 return $this->parent->children[$idx]; 467 } 468 469 /** 470 * Get previous sibling node 471 * 472 * @return object|null The sibling node or null if the current node has no 473 * sibling nodes. 474 */ 475 function prev_sibling() 476 { 477 if ($this->parent === null) { return null; } 478 479 $idx = 0; 480 $count = count($this->parent->children); 481 482 while ($idx < $count && $this !== $this->parent->children[$idx]) { 483 ++$idx; 484 } 485 486 if (--$idx < 0) { return null; } 487 488 return $this->parent->children[$idx]; 489 } 490 491 /** 492 * Traverse ancestors to the first matching tag. 493 * 494 * @param string $tag Tag to find 495 * @return object|null First matching node in the DOM tree or null if no 496 * match was found. 497 * 498 * @todo Null is returned implicitly by calling ->parent on the root node. 499 * This behaviour could change at any time, rendering this function invalid. 500 */ 501 function find_ancestor_tag($tag) 502 { 503 global $debug_object; 504 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 505 506 // Start by including ourselves in the comparison. 507 $returnDom = $this; 508 509 while (!is_null($returnDom)) { 510 if (is_object($debug_object)) { 511 $debug_object->debug_log(2, 'Current tag is: ' . $returnDom->tag); 512 } 513 514 if ($returnDom->tag == $tag) { 515 break; 516 } 517 518 $returnDom = $returnDom->parent; 519 } 520 521 return $returnDom; 522 } 523 524 /** 525 * Get node's inner text (everything inside the opening and closing tags) 526 * 527 * @return string 528 */ 529 function innertext() 530 { 531 if (isset($this->_[HDOM_INFO_INNER])) { 532 return $this->_[HDOM_INFO_INNER]; 533 } 534 535 if (isset($this->_[HDOM_INFO_TEXT])) { 536 return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 537 } 538 539 $ret = ''; 540 541 foreach ($this->nodes as $n) { 542 $ret .= $n->outertext(); 543 } 544 545 return $ret; 546 } 547 548 /** 549 * Get node's outer text (everything including the opening and closing tags) 550 * 551 * @return string 552 */ 553 function outertext() 554 { 555 global $debug_object; 556 557 if (is_object($debug_object)) { 558 $text = ''; 559 560 if ($this->tag === 'text') { 561 if (!empty($this->text)) { 562 $text = ' with text: ' . $this->text; 563 } 564 } 565 566 $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text); 567 } 568 569 if ($this->tag === 'root') return $this->innertext(); 570 571 // trigger callback 572 if ($this->dom && $this->dom->callback !== null) { 573 call_user_func_array($this->dom->callback, array($this)); 574 } 575 576 if (isset($this->_[HDOM_INFO_OUTER])) { 577 return $this->_[HDOM_INFO_OUTER]; 578 } 579 580 if (isset($this->_[HDOM_INFO_TEXT])) { 581 return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 582 } 583 584 // render begin tag 585 if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) { 586 $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); 587 } else { 588 $ret = ''; 589 } 590 591 // render inner text 592 if (isset($this->_[HDOM_INFO_INNER])) { 593 // If it's a br tag... don't return the HDOM_INNER_INFO that we 594 // may or may not have added. 595 if ($this->tag !== 'br') { 596 $ret .= $this->_[HDOM_INFO_INNER]; 597 } 598 } else { 599 if ($this->nodes) { 600 foreach ($this->nodes as $n) { 601 $ret .= $this->convert_text($n->outertext()); 602 } 603 } 604 } 605 606 // render end tag 607 if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) { 608 $ret .= '</' . $this->tag . '>'; 609 } 610 611 return $ret; 612 } 613 614 /** 615 * Get node's plain text (everything excluding all tags) 616 * 617 * @return string 618 */ 619 function text() 620 { 621 if (isset($this->_[HDOM_INFO_INNER])) { 622 return $this->_[HDOM_INFO_INNER]; 623 } 624 625 switch ($this->nodetype) { 626 case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 627 case HDOM_TYPE_COMMENT: return ''; 628 case HDOM_TYPE_UNKNOWN: return ''; 629 } 630 631 if (strcasecmp($this->tag, 'script') === 0) { return ''; } 632 if (strcasecmp($this->tag, 'style') === 0) { return ''; } 633 634 $ret = ''; 635 636 // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed 637 // for some span tags, and some p tags) $this->nodes is set to NULL. 638 // NOTE: This indicates that there is a problem where it's set to NULL 639 // without a clear happening. 640 // WHY is this happening? 641 if (!is_null($this->nodes)) { 642 foreach ($this->nodes as $n) { 643 // Start paragraph after a blank line 644 if ($n->tag === 'p') { 645 $ret .= "\n\n"; 646 } 647 648 $ret .= $this->convert_text($n->text()); 649 650 // If this node is a span... add a space at the end of it so 651 // multiple spans don't run into each other. This is plaintext 652 // after all. 653 if ($n->tag === 'span') { 654 $ret .= $this->dom->default_span_text; 655 } 656 } 657 } 658 return trim($ret); 659 } 660 661 /** 662 * Get node's xml text (inner text as a CDATA section) 663 * 664 * @return string 665 */ 666 function xmltext() 667 { 668 $ret = $this->innertext(); 669 $ret = str_ireplace('<![CDATA[', '', $ret); 670 $ret = str_replace(']]>', '', $ret); 671 return $ret; 672 } 673 674 // build node's text with tag 675 function makeup() 676 { 677 // text, comment, unknown 678 if (isset($this->_[HDOM_INFO_TEXT])) { 679 return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 680 } 681 682 $ret = '<' . $this->tag; 683 $i = -1; 684 685 foreach ($this->attr as $key => $val) { 686 ++$i; 687 688 // skip removed attribute 689 if ($val === null || $val === false) { continue; } 690 691 $ret .= $this->_[HDOM_INFO_SPACE][$i][0]; 692 693 //no value attr: nowrap, checked selected... 694 if ($val === true) { 695 $ret .= $key; 696 } else { 697 switch ($this->_[HDOM_INFO_QUOTE][$i]) 698 { 699 case HDOM_QUOTE_DOUBLE: $quote = '"'; break; 700 case HDOM_QUOTE_SINGLE: $quote = '\''; break; 701 default: $quote = ''; 702 } 703 704 $ret .= $key 705 . $this->_[HDOM_INFO_SPACE][$i][1] 706 . '=' 707 . $this->_[HDOM_INFO_SPACE][$i][2] 708 . $quote 709 . $val 710 . $quote; 711 } 712 } 713 714 $ret = $this->dom->restore_noise($ret); 715 return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; 716 } 717 718 /** 719 * Find elements by CSS selector 720 * 721 * @param string $selector The CSS selector 722 * @param int|null $idx Index of element to return form the list of matching 723 * elements (default: `null` = disabled). 724 * @param bool $lowercase Matches tag names case insensitive (lowercase) if 725 * enabled (default: `false`) 726 * @return array|object|null A list of elements matching the specified CSS 727 * selector or a single element if $idx is specified or null if no element 728 * was found. 729 */ 730 function find($selector, $idx = null, $lowercase = false) 731 { 732 $selectors = $this->parse_selector($selector); 733 if (($count = count($selectors)) === 0) { return array(); } 734 $found_keys = array(); 735 736 // find each selector 737 for ($c = 0; $c < $count; ++$c) { 738 // The change on the below line was documented on the sourceforge 739 // code tracker id 2788009 740 // used to be: if (($levle=count($selectors[0]))===0) return array(); 741 if (($levle = count($selectors[$c])) === 0) { return array(); } 742 if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); } 743 744 $head = array($this->_[HDOM_INFO_BEGIN] => 1); 745 $cmd = ' '; // Combinator 746 747 // handle descendant selectors, no recursive! 748 for ($l = 0; $l < $levle; ++$l) { 749 $ret = array(); 750 751 foreach ($head as $k => $v) { 752 $n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k]; 753 //PaperG - Pass this optional parameter on to the seek function. 754 $n->seek($selectors[$c][$l], $ret, $cmd, $lowercase); 755 } 756 757 $head = $ret; 758 $cmd = $selectors[$c][$l][4]; // Next Combinator 759 } 760 761 foreach ($head as $k => $v) { 762 if (!isset($found_keys[$k])) { 763 $found_keys[$k] = 1; 764 } 765 } 766 } 767 768 // sort keys 769 ksort($found_keys); 770 771 $found = array(); 772 foreach ($found_keys as $k => $v) { 773 $found[] = $this->dom->nodes[$k]; 774 } 775 776 // return nth-element or array 777 if (is_null($idx)) { return $found; } 778 elseif ($idx < 0) { $idx = count($found) + $idx; } 779 return (isset($found[$idx])) ? $found[$idx] : null; 780 } 781 782 /** 783 * Seek DOM elements by selector 784 * 785 * **Note** 786 * The selector element must be compatible to a selector from 787 * {@see d_simple_html_dom_node::parse_selector()} 788 * 789 * @param array $selector A selector element 790 * @param array $ret An array of matches 791 * @param bool $lowercase Matches tag names case insensitive (lowercase) if 792 * enabled (default: `false`) 793 * @return void 794 */ 795 protected function seek($selector, &$ret, $parent_cmd, $lowercase = false) 796 { 797 global $debug_object; 798 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 799 800 list($tag, $id, $class, $attributes, $cmb) = $selector; 801 $nodes = array(); 802 803 if ($parent_cmd === ' ') { // Descendant Combinator 804 // Find parent closing tag if the current element doesn't have a closing 805 // tag (i.e. void element) 806 $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; 807 if ($end == 0) { 808 $parent = $this->parent; 809 while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) { 810 $end -= 1; 811 $parent = $parent->parent; 812 } 813 $end += $parent->_[HDOM_INFO_END]; 814 } 815 816 // Get list of target nodes 817 $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1; 818 $nodes_count = $end - $nodes_start; 819 $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true); 820 } elseif ($parent_cmd === '>') { // Child Combinator 821 $nodes = $this->children; 822 } elseif ($parent_cmd === '+' 823 && $this->parent 824 && in_array($this, $this->parent->children)) { // Next-Sibling Combinator 825 $index = array_search($this, $this->parent->children, true) + 1; 826 $nodes[] = $this->parent->children[$index]; 827 } elseif ($parent_cmd === '~' 828 && $this->parent 829 && in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator 830 $index = array_search($this, $this->parent->children, true); 831 $nodes = array_slice($this->parent->children, $index); 832 } 833 834 // Go throgh each element starting at this element until the end tag 835 // Note: If this element is a void tag, any previous void element is 836 // skipped. 837 foreach($nodes as $node) { 838 $pass = true; 839 840 // Skip root nodes 841 if(!$node->parent) { 842 $pass = false; 843 } 844 845 // Skip if node isn't a child node (i.e. text nodes) 846 if($pass && !in_array($node, $node->parent->children, true)) { 847 $pass = false; 848 } 849 850 // Skip if tag doesn't match 851 if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') { 852 $pass = false; 853 } 854 855 // Skip if ID doesn't exist 856 if ($pass && $id !== '' && !isset($node->attr['id'])) { 857 $pass = false; 858 } 859 860 // Check if ID matches 861 if ($pass && $id !== '' && isset($node->attr['id'])) { 862 // Note: Only consider the first ID (as browsers do) 863 $node_id = explode(' ', trim($node->attr['id']))[0]; 864 865 if($id !== $node_id) { $pass = false; } 866 } 867 868 // Check if all class(es) exist 869 if ($pass && $class !== '' && is_array($class) && !empty($class)) { 870 if (isset($node->attr['class'])) { 871 $node_classes = explode(' ', $node->attr['class']); 872 873 if ($lowercase) { 874 $node_classes = array_map('strtolower', $node_classes); 875 } 876 877 foreach($class as $c) { 878 if(!in_array($c, $node_classes)) { 879 $pass = false; 880 break; 881 } 882 } 883 } else { 884 $pass = false; 885 } 886 } 887 888 // Check attributes 889 if ($pass 890 && $attributes !== '' 891 && is_array($attributes) 892 && !empty($attributes)) { 893 foreach($attributes as $a) { 894 list ( 895 $att_name, 896 $att_expr, 897 $att_val, 898 $att_inv, 899 $att_case_sensitivity 900 ) = $a; 901 902 // Handle indexing attributes (i.e. "[2]") 903 /** 904 * Note: This is not supported by the CSS Standard but adds 905 * the ability to select items compatible to XPath (i.e. 906 * the 3rd element within it's parent). 907 * 908 * Note: This doesn't conflict with the CSS Standard which 909 * doesn't work on numeric attributes anyway. 910 */ 911 if (is_numeric($att_name) 912 && $att_expr === '' 913 && $att_val === '') { 914 $count = 0; 915 916 // Find index of current element in parent 917 foreach ($node->parent->children as $c) { 918 if ($c->tag === $node->tag) ++$count; 919 if ($c === $node) break; 920 } 921 922 // If this is the correct node, continue with next 923 // attribute 924 if ($count === (int)$att_name) continue; 925 } 926 927 // Check attribute availability 928 if ($att_inv) { // Attribute should NOT be set 929 if (isset($node->attr[$att_name])) { 930 $pass = false; 931 break; 932 } 933 } else { // Attribute should be set 934 // todo: "plaintext" is not a valid CSS selector! 935 if ($att_name !== 'plaintext' 936 && !isset($node->attr[$att_name])) { 937 $pass = false; 938 break; 939 } 940 } 941 942 // Continue with next attribute if expression isn't defined 943 if ($att_expr === '') continue; 944 945 // If they have told us that this is a "plaintext" 946 // search then we want the plaintext of the node - right? 947 // todo "plaintext" is not a valid CSS selector! 948 if ($att_name === 'plaintext') { 949 $nodeKeyValue = $node->text(); 950 } else { 951 $nodeKeyValue = $node->attr[$att_name]; 952 } 953 954 if (is_object($debug_object)) { 955 $debug_object->debug_log(2, 956 'testing node: ' 957 . $node->tag 958 . ' for attribute: ' 959 . $att_name 960 . $att_expr 961 . $att_val 962 . ' where nodes value is: ' 963 . $nodeKeyValue 964 ); 965 } 966 967 // If lowercase is set, do a case insensitive test of 968 // the value of the selector. 969 if ($lowercase) { 970 $check = $this->match( 971 $att_expr, 972 strtolower($att_val), 973 strtolower($nodeKeyValue), 974 $att_case_sensitivity 975 ); 976 } else { 977 $check = $this->match( 978 $att_expr, 979 $att_val, 980 $nodeKeyValue, 981 $att_case_sensitivity 982 ); 983 } 984 985 if (is_object($debug_object)) { 986 $debug_object->debug_log(2, 987 'after match: ' 988 . ($check ? 'true' : 'false') 989 ); 990 } 991 992 if (!$check) { 993 $pass = false; 994 break; 995 } 996 } 997 } 998 999 // Found a match. Add to list and clear node 1000 if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1; 1001 unset($node); 1002 } 1003 // It's passed by reference so this is actually what this function returns. 1004 if (is_object($debug_object)) { 1005 $debug_object->debug_log(1, 'EXIT - ret: ', $ret); 1006 } 1007 } 1008 1009 /** 1010 * Match value and pattern for a given CSS expression 1011 * 1012 * **Supported Expressions** 1013 * 1014 * | Expression | Description 1015 * | ---------- | ----------- 1016 * | `=` | $value and $pattern must be equal 1017 * | `!=` | $value and $pattern must not be equal 1018 * | `^=` | $value must start with $pattern 1019 * | `$=` | $value must end with $pattern 1020 * | `*=` | $value must contain $pattern 1021 * 1022 * @param string $exp The expression. 1023 * @param string $pattern The pattern 1024 * @param string $value The value 1025 * @value bool True if $value matches $pattern 1026 */ 1027 protected function match($exp, $pattern, $value, $case_sensitivity) 1028 { 1029 global $debug_object; 1030 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 1031 1032 if ($case_sensitivity === 'i') { 1033 $pattern = strtolower($pattern); 1034 $value = strtolower($value); 1035 } 1036 1037 switch ($exp) { 1038 case '=': 1039 return ($value === $pattern); 1040 case '!=': 1041 return ($value !== $pattern); 1042 case '^=': 1043 return preg_match('/^' . preg_quote($pattern, '/') . '/', $value); 1044 case '$=': 1045 return preg_match('/' . preg_quote($pattern, '/') . '$/', $value); 1046 case '*=': 1047 return preg_match('/' . preg_quote($pattern, '/') . '/', $value); 1048 case '|=': 1049 /** 1050 * [att|=val] 1051 * 1052 * Represents an element with the att attribute, its value 1053 * either being exactly "val" or beginning with "val" 1054 * immediately followed by "-" (U+002D). 1055 */ 1056 return strpos($value, $pattern) === 0; 1057 case '~=': 1058 /** 1059 * [att~=val] 1060 * 1061 * Represents an element with the att attribute whose value is a 1062 * whitespace-separated list of words, one of which is exactly 1063 * "val". If "val" contains whitespace, it will never represent 1064 * anything (since the words are separated by spaces). Also if 1065 * "val" is the empty string, it will never represent anything. 1066 */ 1067 return in_array($pattern, explode(' ', trim($value)), true); 1068 } 1069 return false; 1070 } 1071 1072 /** 1073 * Parse CSS selector 1074 * 1075 * @param string $selector_string CSS selector string 1076 * @return array List of CSS selectors. The format depends on the type of 1077 * selector: 1078 * 1079 * ```php 1080 * 1081 * array( // list of selectors (each separated by a comma), i.e. 'img, p, div' 1082 * array( // list of combinator selectors, i.e. 'img > p > div' 1083 * array( // selector element 1084 * [0], // (string) The element tag 1085 * [1], // (string) The element id 1086 * [2], // (array<string>) The element classes 1087 * [3], // (array<array<string>>) The list of attributes, each 1088 * // with four elements: name, expression, value, inverted 1089 * [4] // (string) The selector combinator (' ' | '>' | '+' | '~') 1090 * ) 1091 * ) 1092 * ) 1093 * ``` 1094 * 1095 * @link https://www.w3.org/TR/selectors/#compound Compound selector 1096 */ 1097 protected function parse_selector($selector_string) 1098 { 1099 global $debug_object; 1100 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 1101 1102 /** 1103 * Pattern of CSS selectors, modified from mootools (https://mootools.net/) 1104 * 1105 * Paperg: Add the colon to the attribute, so that it properly finds 1106 * <tag attr:ibute="something" > like google does. 1107 * 1108 * Note: if you try to look at this attribute, you MUST use getAttribute 1109 * since $dom->x:y will fail the php syntax check. 1110 * 1111 * Notice the \[ starting the attribute? and the @? following? This 1112 * implies that an attribute can begin with an @ sign that is not 1113 * captured. This implies that an html attribute specifier may start 1114 * with an @ sign that is NOT captured by the expression. Farther study 1115 * is required to determine of this should be documented or removed. 1116 * 1117 * Matches selectors in this order: 1118 * 1119 * [0] - full match 1120 * 1121 * [1] - tag name 1122 * ([\w:\*-]*) 1123 * Matches the tag name consisting of zero or more words, colons, 1124 * asterisks and hyphens. 1125 * 1126 * [2] - id name 1127 * (?:\#([\w-]+)) 1128 * Optionally matches a id name, consisting of an "#" followed by 1129 * the id name (one or more words and hyphens). 1130 * 1131 * [3] - class names (including dots) 1132 * (?:\.([\w\.-]+))? 1133 * Optionally matches a list of classs, consisting of an "." 1134 * followed by the class name (one or more words and hyphens) 1135 * where multiple classes can be chained (i.e. ".foo.bar.baz") 1136 * 1137 * [4] - attributes 1138 * ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)? 1139 * Optionally matches the attributes list 1140 * 1141 * [5] - separator 1142 * ([\/, >+~]+) 1143 * Matches the selector list separator 1144 */ 1145 // phpcs:ignore Generic.Files.LineLength 1146 $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is"; 1147 1148 preg_match_all( 1149 $pattern, 1150 trim($selector_string) . ' ', // Add final ' ' as pseudo separator 1151 $matches, 1152 PREG_SET_ORDER 1153 ); 1154 1155 if (is_object($debug_object)) { 1156 $debug_object->debug_log(2, 'Matches Array: ', $matches); 1157 } 1158 1159 $selectors = array(); 1160 $result = array(); 1161 1162 foreach ($matches as $m) { 1163 $m[0] = trim($m[0]); 1164 1165 // Skip NoOps 1166 if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; } 1167 1168 // Convert to lowercase 1169 if ($this->dom->lowercase) { 1170 $m[1] = strtolower($m[1]); 1171 } 1172 1173 // Extract classes 1174 if ($m[3] !== '') { $m[3] = explode('.', $m[3]); } 1175 1176 /* Extract attributes (pattern based on the pattern above!) 1177 1178 * [0] - full match 1179 * [1] - attribute name 1180 * [2] - attribute expression 1181 * [3] - attribute value 1182 * [4] - case sensitivity 1183 * 1184 * Note: Attributes can be negated with a "!" prefix to their name 1185 */ 1186 if($m[4] !== '') { 1187 preg_match_all( 1188 "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is", 1189 trim($m[4]), 1190 $attributes, 1191 PREG_SET_ORDER 1192 ); 1193 1194 // Replace element by array 1195 $m[4] = array(); 1196 1197 foreach($attributes as $att) { 1198 // Skip empty matches 1199 if(trim($att[0]) === '') { continue; } 1200 1201 $inverted = (isset($att[1][0]) && $att[1][0] === '!'); 1202 $m[4][] = array( 1203 $inverted ? substr($att[1], 1) : $att[1], // Name 1204 (isset($att[2])) ? $att[2] : '', // Expression 1205 (isset($att[3])) ? $att[3] : '', // Value 1206 $inverted, // Inverted Flag 1207 (isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity 1208 ); 1209 } 1210 } 1211 1212 // Sanitize Separator 1213 if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator 1214 $m[5] = ' '; 1215 } else { // Other Separator 1216 $m[5] = trim($m[5]); 1217 } 1218 1219 // Clear Separator if it's a Selector List 1220 if ($is_list = ($m[5] === ',')) { $m[5] = ''; } 1221 1222 // Remove full match before adding to results 1223 array_shift($m); 1224 $result[] = $m; 1225 1226 if ($is_list) { // Selector List 1227 $selectors[] = $result; 1228 $result = array(); 1229 } 1230 } 1231 1232 if (count($result) > 0) { $selectors[] = $result; } 1233 return $selectors; 1234 } 1235 1236 function __get($name) 1237 { 1238 if (isset($this->attr[$name])) { 1239 return $this->convert_text($this->attr[$name]); 1240 } 1241 switch ($name) { 1242 case 'outertext': return $this->outertext(); 1243 case 'innertext': return $this->innertext(); 1244 case 'plaintext': return $this->text(); 1245 case 'xmltext': return $this->xmltext(); 1246 default: return array_key_exists($name, $this->attr); 1247 } 1248 } 1249 1250 function __set($name, $value) 1251 { 1252 global $debug_object; 1253 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 1254 1255 switch ($name) { 1256 case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; 1257 case 'innertext': 1258 if (isset($this->_[HDOM_INFO_TEXT])) { 1259 return $this->_[HDOM_INFO_TEXT] = $value; 1260 } 1261 return $this->_[HDOM_INFO_INNER] = $value; 1262 } 1263 1264 if (!isset($this->attr[$name])) { 1265 $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); 1266 $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 1267 } 1268 1269 $this->attr[$name] = $value; 1270 } 1271 1272 function __isset($name) 1273 { 1274 switch ($name) { 1275 case 'outertext': return true; 1276 case 'innertext': return true; 1277 case 'plaintext': return true; 1278 } 1279 //no value attr: nowrap, checked selected... 1280 return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); 1281 } 1282 1283 function __unset($name) 1284 { 1285 if (isset($this->attr[$name])) { unset($this->attr[$name]); } 1286 } 1287 1288 // PaperG - Function to convert the text from one character set to another 1289 // if the two sets are not the same. 1290 function convert_text($text) 1291 { 1292 global $debug_object; 1293 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 1294 1295 $converted_text = $text; 1296 1297 $sourceCharset = ''; 1298 $targetCharset = ''; 1299 1300 if ($this->dom) { 1301 $sourceCharset = strtoupper($this->dom->_charset); 1302 $targetCharset = strtoupper($this->dom->_target_charset); 1303 } 1304 1305 if (is_object($debug_object)) { 1306 $debug_object->debug_log(3, 1307 'source charset: ' 1308 . $sourceCharset 1309 . ' target charaset: ' 1310 . $targetCharset 1311 ); 1312 } 1313 1314 if (!empty($sourceCharset) 1315 && !empty($targetCharset) 1316 && (strcasecmp($sourceCharset, $targetCharset) != 0)) { 1317 // Check if the reported encoding could have been incorrect and the text is actually already UTF-8 1318 if ((strcasecmp($targetCharset, 'UTF-8') == 0) 1319 && ($this->is_utf8($text))) { 1320 $converted_text = $text; 1321 } else { 1322 $converted_text = iconv($sourceCharset, $targetCharset, $text); 1323 } 1324 } 1325 1326 // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output. 1327 if ($targetCharset === 'UTF-8') { 1328 if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") { 1329 $converted_text = substr($converted_text, 3); 1330 } 1331 1332 if (substr($converted_text, -3) === "\xef\xbb\xbf") { 1333 $converted_text = substr($converted_text, 0, -3); 1334 } 1335 } 1336 1337 return $converted_text; 1338 } 1339 1340 /** 1341 * Returns true if $string is valid UTF-8 and false otherwise. 1342 * 1343 * @param mixed $str String to be tested 1344 * @return boolean 1345 */ 1346 static function is_utf8($str) 1347 { 1348 $c = 0; $b = 0; 1349 $bits = 0; 1350 $len = strlen($str); 1351 for($i = 0; $i < $len; $i++) { 1352 $c = ord($str[$i]); 1353 if($c > 128) { 1354 if(($c >= 254)) { return false; } 1355 elseif($c >= 252) { $bits = 6; } 1356 elseif($c >= 248) { $bits = 5; } 1357 elseif($c >= 240) { $bits = 4; } 1358 elseif($c >= 224) { $bits = 3; } 1359 elseif($c >= 192) { $bits = 2; } 1360 else { return false; } 1361 if(($i + $bits) > $len) { return false; } 1362 while($bits > 1) { 1363 $i++; 1364 $b = ord($str[$i]); 1365 if($b < 128 || $b > 191) { return false; } 1366 $bits--; 1367 } 1368 } 1369 } 1370 return true; 1371 } 1372 1373 /** 1374 * Function to try a few tricks to determine the displayed size of an img on 1375 * the page. NOTE: This will ONLY work on an IMG tag. Returns FALSE on all 1376 * other tag types. 1377 * 1378 * @author John Schlick 1379 * @version April 19 2012 1380 * @return array an array containing the 'height' and 'width' of the image 1381 * on the page or -1 if we can't figure it out. 1382 */ 1383 function get_display_size() 1384 { 1385 global $debug_object; 1386 1387 $width = -1; 1388 $height = -1; 1389 1390 if ($this->tag !== 'img') { 1391 return false; 1392 } 1393 1394 // See if there is aheight or width attribute in the tag itself. 1395 if (isset($this->attr['width'])) { 1396 $width = $this->attr['width']; 1397 } 1398 1399 if (isset($this->attr['height'])) { 1400 $height = $this->attr['height']; 1401 } 1402 1403 // Now look for an inline style. 1404 if (isset($this->attr['style'])) { 1405 // Thanks to user gnarf from stackoverflow for this regular expression. 1406 $attributes = array(); 1407 1408 preg_match_all( 1409 '/([\w-]+)\s*:\s*([^;]+)\s*;?/', 1410 $this->attr['style'], 1411 $matches, 1412 PREG_SET_ORDER 1413 ); 1414 1415 foreach ($matches as $match) { 1416 $attributes[$match[1]] = $match[2]; 1417 } 1418 1419 // If there is a width in the style attributes: 1420 if (isset($attributes['width']) && $width == -1) { 1421 // check that the last two characters are px (pixels) 1422 if (strtolower(substr($attributes['width'], -2)) === 'px') { 1423 $proposed_width = substr($attributes['width'], 0, -2); 1424 // Now make sure that it's an integer and not something stupid. 1425 if (filter_var($proposed_width, FILTER_VALIDATE_INT)) { 1426 $width = $proposed_width; 1427 } 1428 } 1429 } 1430 1431 // If there is a width in the style attributes: 1432 if (isset($attributes['height']) && $height == -1) { 1433 // check that the last two characters are px (pixels) 1434 if (strtolower(substr($attributes['height'], -2)) == 'px') { 1435 $proposed_height = substr($attributes['height'], 0, -2); 1436 // Now make sure that it's an integer and not something stupid. 1437 if (filter_var($proposed_height, FILTER_VALIDATE_INT)) { 1438 $height = $proposed_height; 1439 } 1440 } 1441 } 1442 1443 } 1444 1445 // Future enhancement: 1446 // Look in the tag to see if there is a class or id specified that has 1447 // a height or width attribute to it. 1448 1449 // Far future enhancement 1450 // Look at all the parent tags of this image to see if they specify a 1451 // class or id that has an img selector that specifies a height or width 1452 // Note that in this case, the class or id will have the img subselector 1453 // for it to apply to the image. 1454 1455 // ridiculously far future development 1456 // If the class or id is specified in a SEPARATE css file thats not on 1457 // the page, go get it and do what we were just doing for the ones on 1458 // the page. 1459 1460 $result = array( 1461 'height' => $height, 1462 'width' => $width 1463 ); 1464 1465 return $result; 1466 } 1467 1468 // camel naming conventions 1469 function getAllAttributes() 1470 { 1471 return $this->attr; 1472 } 1473 1474 function getAttribute($name) 1475 { 1476 return $this->__get($name); 1477 } 1478 1479 function setAttribute($name, $value) 1480 { 1481 $this->__set($name, $value); 1482 } 1483 1484 function hasAttribute($name) 1485 { 1486 return $this->__isset($name); 1487 } 1488 1489 function removeAttribute($name) 1490 { 1491 $this->__set($name, null); 1492 } 1493 1494 function getElementById($id) 1495 { 1496 return $this->find("#$id", 0); 1497 } 1498 1499 function getElementsById($id, $idx = null) 1500 { 1501 return $this->find("#$id", $idx); 1502 } 1503 1504 function getElementByTagName($name) 1505 { 1506 return $this->find($name, 0); 1507 } 1508 1509 function getElementsByTagName($name, $idx = null) 1510 { 1511 return $this->find($name, $idx); 1512 } 1513 1514 function parentNode() 1515 { 1516 return $this->parent(); 1517 } 1518 1519 function childNodes($idx = -1) 1520 { 1521 return $this->children($idx); 1522 } 1523 1524 function firstChild() 1525 { 1526 return $this->first_child(); 1527 } 1528 1529 function lastChild() 1530 { 1531 return $this->last_child(); 1532 } 1533 1534 function nextSibling() 1535 { 1536 return $this->next_sibling(); 1537 } 1538 1539 function previousSibling() 1540 { 1541 return $this->prev_sibling(); 1542 } 1543 1544 function hasChildNodes() 1545 { 1546 return $this->has_child(); 1547 } 1548 1549 function nodeName() 1550 { 1551 return $this->tag; 1552 } 1553 1554 function appendChild($node) 1555 { 1556 $node->parent($this); 1557 return $node; 1558 } 1559 1560 } 1561 1562 /** 1563 * simple html dom parser 1564 * 1565 * Paperg - in the find routine: allow us to specify that we want case 1566 * insensitive testing of the value of the selector. 1567 * 1568 * Paperg - change $size from protected to public so we can easily access it 1569 * 1570 * Paperg - added ForceTagsClosed in the constructor which tells us whether we 1571 * trust the html or not. Default is to NOT trust it. 1572 * 1573 * @package PlaceLocalInclude 1574 */ 1575 class d_simple_html_dom 1576 { 1577 /** 1578 * The root node of the document 1579 * 1580 * @var object 1581 */ 1582 public $root = null; 1583 1584 /** 1585 * List of nodes in the current DOM 1586 * 1587 * @var array 1588 */ 1589 public $nodes = array(); 1590 1591 /** 1592 * Callback function to run for each element in the DOM. 1593 * 1594 * @var callable|null 1595 */ 1596 public $callback = null; 1597 1598 /** 1599 * Indicates how tags and attributes are matched 1600 * 1601 * @var bool When set to **true** tags and attributes will be converted to 1602 * lowercase before matching. 1603 */ 1604 public $lowercase = false; 1605 1606 /** 1607 * Original document size 1608 * 1609 * Holds the original document size. 1610 * 1611 * @var int 1612 */ 1613 public $original_size; 1614 1615 /** 1616 * Current document size 1617 * 1618 * Holds the current document size. The document size is determined by the 1619 * string length of ({@see d_simple_html_dom::$doc}). 1620 * 1621 * _Note_: Using this variable is more efficient than calling `strlen($doc)` 1622 * 1623 * @var int 1624 * */ 1625 public $size; 1626 1627 /** 1628 * Current position in the document 1629 * 1630 * @var int 1631 */ 1632 protected $pos; 1633 1634 /** 1635 * The document 1636 * 1637 * @var string 1638 */ 1639 protected $doc; 1640 1641 /** 1642 * Current character 1643 * 1644 * Holds the current character at position {@see d_simple_html_dom::$pos} in 1645 * the document {@see d_simple_html_dom::$doc} 1646 * 1647 * _Note_: Using this variable is more efficient than calling 1648 * `substr($doc, $pos, 1)` 1649 * 1650 * @var string 1651 */ 1652 protected $char; 1653 1654 protected $cursor; 1655 1656 /** 1657 * Parent node of the next node detected by the parser 1658 * 1659 * @var object 1660 */ 1661 protected $parent; 1662 protected $noise = array(); 1663 1664 /** 1665 * Tokens considered blank in HTML 1666 * 1667 * @var string 1668 */ 1669 protected $token_blank = " \t\r\n"; 1670 1671 /** 1672 * Tokens to identify the equal sign for attributes, stopping either at the 1673 * closing tag ("/" i.e. "<html />") or the end of an opening tag (">" i.e. 1674 * "<html>") 1675 * 1676 * @var string 1677 */ 1678 protected $token_equal = ' =/>'; 1679 1680 /** 1681 * Tokens to identify the end of a tag name. A tag name either ends on the 1682 * ending slash ("/" i.e. "<html/>") or whitespace ("\s\r\n\t") 1683 * 1684 * @var string 1685 */ 1686 protected $token_slash = " />\r\n\t"; 1687 1688 /** 1689 * Tokens to identify the end of an attribute 1690 * 1691 * @var string 1692 */ 1693 protected $token_attr = ' >'; 1694 1695 // Note that this is referenced by a child node, and so it needs to be 1696 // public for that node to see this information. 1697 public $_charset = ''; 1698 public $_target_charset = ''; 1699 1700 /** 1701 * Innertext for <br> elements 1702 * 1703 * @var string 1704 */ 1705 protected $default_br_text = ''; 1706 1707 /** 1708 * Suffix for <span> elements 1709 * 1710 * @var string 1711 */ 1712 public $default_span_text = ''; 1713 1714 /** 1715 * Defines a list of self-closing tags (Void elements) according to the HTML 1716 * Specification 1717 * 1718 * _Remarks_: 1719 * - Use `isset()` instead of `in_array()` on array elements to boost 1720 * performance about 30% 1721 * - Sort elements by name for better readability! 1722 * 1723 * @link https://www.w3.org/TR/html HTML Specification 1724 * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements 1725 */ 1726 protected $self_closing_tags = array( 1727 'area' => 1, 1728 'base' => 1, 1729 'br' => 1, 1730 'col' => 1, 1731 'embed' => 1, 1732 'hr' => 1, 1733 'img' => 1, 1734 'input' => 1, 1735 'link' => 1, 1736 'meta' => 1, 1737 'param' => 1, 1738 'source' => 1, 1739 'track' => 1, 1740 'wbr' => 1 1741 ); 1742 1743 /** 1744 * Defines a list of tags which - if closed - close all optional closing 1745 * elements within if they haven't been closed yet. (So, an element where 1746 * neither opening nor closing tag is omissible consistently closes every 1747 * optional closing element within) 1748 * 1749 * _Remarks_: 1750 * - Use `isset()` instead of `in_array()` on array elements to boost 1751 * performance about 30% 1752 * - Sort elements by name for better readability! 1753 */ 1754 protected $block_tags = array( 1755 'body' => 1, 1756 'div' => 1, 1757 'form' => 1, 1758 'root' => 1, 1759 'span' => 1, 1760 'table' => 1 1761 ); 1762 1763 /** 1764 * Defines elements whose end tag is omissible. 1765 * 1766 * * key = Name of an element whose end tag is omissible. 1767 * * value = Names of elements whose end tag is omissible, that are closed 1768 * by the current element. 1769 * 1770 * _Remarks_: 1771 * - Use `isset()` instead of `in_array()` on array elements to boost 1772 * performance about 30% 1773 * - Sort elements by name for better readability! 1774 * 1775 * **Example** 1776 * 1777 * An `li` element’s end tag may be omitted if the `li` element is immediately 1778 * followed by another `li` element. To do that, add following element to the 1779 * array: 1780 * 1781 * ```php 1782 * 'li' => array('li'), 1783 * ``` 1784 * 1785 * With this, the following two examples are considered equal. Note that the 1786 * second example is missing the closing tags on `li` elements. 1787 * 1788 * ```html 1789 * <ul><li>First Item</li><li>Second Item</li></ul> 1790 * ``` 1791 * 1792 * <ul><li>First Item</li><li>Second Item</li></ul> 1793 * 1794 * ```html 1795 * <ul><li>First Item<li>Second Item</ul> 1796 * ``` 1797 * 1798 * <ul><li>First Item<li>Second Item</ul> 1799 * 1800 * @var array A two-dimensional array where the key is the name of an 1801 * element whose end tag is omissible and the value is an array of elements 1802 * whose end tag is omissible, that are closed by the current element. 1803 * 1804 * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags 1805 * 1806 * @todo The implementation of optional closing tags doesn't work in all cases 1807 * because it only consideres elements who close other optional closing 1808 * tags, not taking into account that some (non-blocking) tags should close 1809 * these optional closing tags. For example, the end tag for "p" is omissible 1810 * and can be closed by an "address" element, whose end tag is NOT omissible. 1811 * Currently a "p" element without closing tag stops at the next "p" element 1812 * or blocking tag, even if it contains other elements. 1813 * 1814 * @todo Known sourceforge issue #2977341 1815 * B tags that are not closed cause us to return everything to the end of 1816 * the document. 1817 */ 1818 protected $optional_closing_tags = array( 1819 // Not optional, see 1820 // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element 1821 'b' => array('b' => 1), 1822 'dd' => array('dd' => 1, 'dt' => 1), 1823 // Not optional, see 1824 // https://www.w3.org/TR/html/grouping-content.html#the-dl-element 1825 'dl' => array('dd' => 1, 'dt' => 1), 1826 'dt' => array('dd' => 1, 'dt' => 1), 1827 'li' => array('li' => 1), 1828 'optgroup' => array('optgroup' => 1, 'option' => 1), 1829 'option' => array('optgroup' => 1, 'option' => 1), 1830 'p' => array('p' => 1), 1831 'rp' => array('rp' => 1, 'rt' => 1), 1832 'rt' => array('rp' => 1, 'rt' => 1), 1833 'td' => array('td' => 1, 'th' => 1), 1834 'th' => array('td' => 1, 'th' => 1), 1835 'tr' => array('td' => 1, 'th' => 1, 'tr' => 1), 1836 ); 1837 1838 function __construct( 1839 $str = null, 1840 $lowercase = true, 1841 $forceTagsClosed = true, 1842 $target_charset = DEFAULT_TARGET_CHARSET, 1843 $stripRN = true, 1844 $defaultBRText = DEFAULT_BR_TEXT, 1845 $defaultSpanText = DEFAULT_SPAN_TEXT, 1846 $options = 0) 1847 { 1848 if ($str) { 1849 if (preg_match('/^http:\/\//i', $str) || is_file($str)) { 1850 $this->load_file($str); 1851 } else { 1852 $this->load( 1853 $str, 1854 $lowercase, 1855 $stripRN, 1856 $defaultBRText, 1857 $defaultSpanText, 1858 $options 1859 ); 1860 } 1861 } 1862 // Forcing tags to be closed implies that we don't trust the html, but 1863 // it can lead to parsing errors if we SHOULD trust the html. 1864 if (!$forceTagsClosed) { 1865 $this->optional_closing_array = array(); 1866 } 1867 1868 $this->_target_charset = $target_charset; 1869 } 1870 1871 function __destruct() 1872 { 1873 $this->clear(); 1874 } 1875 1876 // load html from string 1877 function load( 1878 $str, 1879 $lowercase = true, 1880 $stripRN = true, 1881 $defaultBRText = DEFAULT_BR_TEXT, 1882 $defaultSpanText = DEFAULT_SPAN_TEXT, 1883 $options = 0) 1884 { 1885 global $debug_object; 1886 1887 // prepare 1888 $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText); 1889 1890 // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1891 // Script tags removal now preceeds style tag removal. 1892 // strip out <script> tags 1893 $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"); 1894 $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"); 1895 1896 // strip out the \r \n's if we are told to. 1897 if ($stripRN) { 1898 $this->doc = str_replace("\r", ' ', $this->doc); 1899 $this->doc = str_replace("\n", ' ', $this->doc); 1900 1901 // set the length of content since we have changed it. 1902 $this->size = strlen($this->doc); 1903 } 1904 1905 // strip out cdata 1906 $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); 1907 // strip out comments 1908 $this->remove_noise("'<!--(.*?)-->'is"); 1909 // strip out <style> tags 1910 $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"); 1911 $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"); 1912 // strip out preformatted tags 1913 $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"); 1914 // strip out server side scripts 1915 $this->remove_noise("'(<\?)(.*?)(\?>)'s", true); 1916 1917 if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts 1918 $this->remove_noise("'(\{\w)(.*?)(\})'s", true); 1919 } 1920 1921 // parsing 1922 $this->parse(); 1923 // end 1924 $this->root->_[HDOM_INFO_END] = $this->cursor; 1925 $this->parse_charset(); 1926 1927 // make load function chainable 1928 return $this; 1929 } 1930 1931 // load html from file 1932 function load_file() 1933 { 1934 $args = func_get_args(); 1935 1936 if(($doc = call_user_func_array('file_get_contents', $args)) !== false) { 1937 $this->load($doc, true); 1938 } else { 1939 return false; 1940 } 1941 } 1942 1943 /** 1944 * Set the callback function 1945 * 1946 * @param callable $function_name Callback function to run for each element 1947 * in the DOM. 1948 * @return void 1949 */ 1950 function set_callback($function_name) 1951 { 1952 $this->callback = $function_name; 1953 } 1954 1955 /** 1956 * Remove callback function 1957 * 1958 * @return void 1959 */ 1960 function remove_callback() 1961 { 1962 $this->callback = null; 1963 } 1964 1965 // save dom as string 1966 function save($filepath = '') 1967 { 1968 $ret = $this->root->innertext(); 1969 if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); } 1970 return $ret; 1971 } 1972 1973 // find dom node by css selector 1974 // Paperg - allow us to specify that we want case insensitive testing of the value of the selector. 1975 function find($selector, $idx = null, $lowercase = false) 1976 { 1977 return $this->root->find($selector, $idx, $lowercase); 1978 } 1979 1980 // clean up memory due to php5 circular references memory leak... 1981 function clear() 1982 { 1983 foreach ($this->nodes as $n) { 1984 $n->clear(); $n = null; 1985 } 1986 1987 // This add next line is documented in the sourceforge repository. 1988 // 2977248 as a fix for ongoing memory leaks that occur even with the 1989 // use of clear. 1990 if (isset($this->children)) { 1991 foreach ($this->children as $n) { 1992 $n->clear(); $n = null; 1993 } 1994 } 1995 1996 if (isset($this->parent)) { 1997 $this->parent->clear(); 1998 unset($this->parent); 1999 } 2000 2001 if (isset($this->root)) { 2002 $this->root->clear(); 2003 unset($this->root); 2004 } 2005 2006 unset($this->doc); 2007 unset($this->noise); 2008 } 2009 2010 function dump($show_attr = true) 2011 { 2012 $this->root->dump($show_attr); 2013 } 2014 2015 // prepare HTML data and init everything 2016 protected function prepare( 2017 $str, $lowercase = true, 2018 $defaultBRText = DEFAULT_BR_TEXT, 2019 $defaultSpanText = DEFAULT_SPAN_TEXT) 2020 { 2021 $this->clear(); 2022 2023 $this->doc = trim($str); 2024 $this->size = strlen($this->doc); 2025 $this->original_size = $this->size; // original size of the html 2026 $this->pos = 0; 2027 $this->cursor = 1; 2028 $this->noise = array(); 2029 $this->nodes = array(); 2030 $this->lowercase = $lowercase; 2031 $this->default_br_text = $defaultBRText; 2032 $this->default_span_text = $defaultSpanText; 2033 $this->root = new d_simple_html_dom_node($this); 2034 $this->root->tag = 'root'; 2035 $this->root->_[HDOM_INFO_BEGIN] = -1; 2036 $this->root->nodetype = HDOM_TYPE_ROOT; 2037 $this->parent = $this->root; 2038 if ($this->size > 0) { $this->char = $this->doc[0]; } 2039 } 2040 2041 /** 2042 * Parse HTML content 2043 * 2044 * @return bool True on success 2045 */ 2046 protected function parse() 2047 { 2048 while (true) { 2049 // Read next tag if there is no text between current position and the 2050 // next opening tag. 2051 if (($s = $this->copy_until_char('<')) === '') { 2052 if($this->read_tag()) { 2053 continue; 2054 } else { 2055 return true; 2056 } 2057 } 2058 2059 // Add a text node for text between tags 2060 $node = new d_simple_html_dom_node($this); 2061 ++$this->cursor; 2062 $node->_[HDOM_INFO_TEXT] = $s; 2063 $this->link_nodes($node, false); 2064 } 2065 } 2066 2067 // PAPERG - dkchou - added this to try to identify the character set of the 2068 // page we have just parsed so we know better how to spit it out later. 2069 // NOTE: IF you provide a routine called 2070 // get_last_retrieve_url_contents_content_type which returns the 2071 // CURLINFO_CONTENT_TYPE from the last curl_exec 2072 // (or the content_type header from the last transfer), we will parse THAT, 2073 // and if a charset is specified, we will use it over any other mechanism. 2074 protected function parse_charset() 2075 { 2076 global $debug_object; 2077 2078 $charset = null; 2079 2080 if (function_exists('get_last_retrieve_url_contents_content_type')) { 2081 $contentTypeHeader = get_last_retrieve_url_contents_content_type(); 2082 $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); 2083 if ($success) { 2084 $charset = $matches[1]; 2085 if (is_object($debug_object)) { 2086 $debug_object->debug_log(2, 2087 'header content-type found charset of: ' 2088 . $charset 2089 ); 2090 } 2091 } 2092 } 2093 2094 if (empty($charset)) { 2095 $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true); 2096 2097 if (!empty($el)) { 2098 $fullvalue = $el->content; 2099 if (is_object($debug_object)) { 2100 $debug_object->debug_log(2, 2101 'meta content-type tag found' 2102 . $fullvalue 2103 ); 2104 } 2105 2106 if (!empty($fullvalue)) { 2107 $success = preg_match( 2108 '/charset=(.+)/i', 2109 $fullvalue, 2110 $matches 2111 ); 2112 2113 if ($success) { 2114 $charset = $matches[1]; 2115 } else { 2116 // If there is a meta tag, and they don't specify the 2117 // character set, research says that it's typically 2118 // ISO-8859-1 2119 if (is_object($debug_object)) { 2120 $debug_object->debug_log(2, 2121 'meta content-type tag couldn\'t be parsed. using iso-8859 default.' 2122 ); 2123 } 2124 2125 $charset = 'ISO-8859-1'; 2126 } 2127 } 2128 } 2129 } 2130 2131 // If we couldn't find a charset above, then lets try to detect one 2132 // based on the text we got... 2133 if (empty($charset)) { 2134 // Use this in case mb_detect_charset isn't installed/loaded on 2135 // this machine. 2136 $charset = false; 2137 if (function_exists('mb_detect_encoding')) { 2138 // Have php try to detect the encoding from the text given to us. 2139 $charset = mb_detect_encoding( 2140 $this->doc . 'ascii', 2141 $encoding_list = array( 'UTF-8', 'CP1252' ) 2142 ); 2143 2144 if (is_object($debug_object)) { 2145 $debug_object->debug_log(2, 'mb_detect found: ' . $charset); 2146 } 2147 } 2148 2149 // and if this doesn't work... then we need to just wrongheadedly 2150 // assume it's UTF-8 so that we can move on - cause this will 2151 // usually give us most of what we need... 2152 if ($charset === false) { 2153 if (is_object($debug_object)) { 2154 $debug_object->debug_log( 2155 2, 2156 'since mb_detect failed - using default of utf-8' 2157 ); 2158 } 2159 2160 $charset = 'UTF-8'; 2161 } 2162 } 2163 2164 // Since CP1252 is a superset, if we get one of it's subsets, we want 2165 // it instead. 2166 if ((strtolower($charset) == strtolower('ISO-8859-1')) 2167 || (strtolower($charset) == strtolower('Latin1')) 2168 || (strtolower($charset) == strtolower('Latin-1'))) { 2169 2170 if (is_object($debug_object)) { 2171 $debug_object->debug_log( 2172 2, 2173 'replacing ' . $charset . ' with CP1252 as its a superset' 2174 ); 2175 } 2176 2177 $charset = 'CP1252'; 2178 } 2179 2180 if (is_object($debug_object)) { 2181 $debug_object->debug_log(1, 'EXIT - ' . $charset); 2182 } 2183 2184 return $this->_charset = $charset; 2185 } 2186 2187 /** 2188 * Parse tag from current document position. 2189 * 2190 * @return bool True if a tag was found, false otherwise 2191 */ 2192 protected function read_tag() 2193 { 2194 // Set end position if no further tags found 2195 if ($this->char !== '<') { 2196 $this->root->_[HDOM_INFO_END] = $this->cursor; 2197 return false; 2198 } 2199 2200 $begin_tag_pos = $this->pos; 2201 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2202 2203 // end tag 2204 if ($this->char === '/') { 2205 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2206 2207 // Skip whitespace in end tags (i.e. in "</ html>") 2208 $this->skip($this->token_blank); 2209 $tag = $this->copy_until_char('>'); 2210 2211 // Skip attributes in end tags 2212 if (($pos = strpos($tag, ' ')) !== false) { 2213 $tag = substr($tag, 0, $pos); 2214 } 2215 2216 $parent_lower = strtolower($this->parent->tag); 2217 $tag_lower = strtolower($tag); 2218 2219 // The end tag is supposed to close the parent tag. Handle situations 2220 // when it doesn't 2221 if ($parent_lower !== $tag_lower) { 2222 // Parent tag does not have to be closed necessarily (optional closing tag) 2223 // Current tag is a block tag, so it may close an ancestor 2224 if (isset($this->optional_closing_tags[$parent_lower]) 2225 && isset($this->block_tags[$tag_lower])) { 2226 2227 $this->parent->_[HDOM_INFO_END] = 0; 2228 $org_parent = $this->parent; 2229 2230 // Traverse ancestors to find a matching opening tag 2231 // Stop at root node 2232 while (($this->parent->parent) 2233 && strtolower($this->parent->tag) !== $tag_lower 2234 ){ 2235 $this->parent = $this->parent->parent; 2236 } 2237 2238 // If we don't have a match add current tag as text node 2239 if (strtolower($this->parent->tag) !== $tag_lower) { 2240 $this->parent = $org_parent; // restore origonal parent 2241 2242 if ($this->parent->parent) { 2243 $this->parent = $this->parent->parent; 2244 } 2245 2246 $this->parent->_[HDOM_INFO_END] = $this->cursor; 2247 return $this->as_text_node($tag); 2248 } 2249 } elseif (($this->parent->parent) 2250 && isset($this->block_tags[$tag_lower]) 2251 ) { 2252 // Grandparent exists and current tag is a block tag, so our 2253 // parent doesn't have an end tag 2254 $this->parent->_[HDOM_INFO_END] = 0; // No end tag 2255 $org_parent = $this->parent; 2256 2257 // Traverse ancestors to find a matching opening tag 2258 // Stop at root node 2259 while (($this->parent->parent) 2260 && strtolower($this->parent->tag) !== $tag_lower 2261 ) { 2262 $this->parent = $this->parent->parent; 2263 } 2264 2265 // If we don't have a match add current tag as text node 2266 if (strtolower($this->parent->tag) !== $tag_lower) { 2267 $this->parent = $org_parent; // restore origonal parent 2268 $this->parent->_[HDOM_INFO_END] = $this->cursor; 2269 return $this->as_text_node($tag); 2270 } 2271 } elseif (($this->parent->parent) 2272 && strtolower($this->parent->parent->tag) === $tag_lower 2273 ) { // Grandparent exists and current tag closes it 2274 $this->parent->_[HDOM_INFO_END] = 0; 2275 $this->parent = $this->parent->parent; 2276 } else { // Random tag, add as text node 2277 return $this->as_text_node($tag); 2278 } 2279 } 2280 2281 // Set end position of parent tag to current cursor position 2282 $this->parent->_[HDOM_INFO_END] = $this->cursor; 2283 2284 if ($this->parent->parent) { 2285 $this->parent = $this->parent->parent; 2286 } 2287 2288 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2289 return true; 2290 } 2291 2292 // start tag 2293 $node = new d_simple_html_dom_node($this); 2294 $node->_[HDOM_INFO_BEGIN] = $this->cursor; 2295 ++$this->cursor; 2296 $tag = $this->copy_until($this->token_slash); // Get tag name 2297 $node->tag_start = $begin_tag_pos; 2298 2299 // doctype, cdata & comments... 2300 // <!DOCTYPE html> 2301 // <![CDATA[ ... ]]> 2302 // <!-- Comment --> 2303 if (isset($tag[0]) && $tag[0] === '!') { 2304 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); 2305 2306 if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--") 2307 $node->nodetype = HDOM_TYPE_COMMENT; 2308 $node->tag = 'comment'; 2309 } else { // Could be doctype or CDATA but we don't care 2310 $node->nodetype = HDOM_TYPE_UNKNOWN; 2311 $node->tag = 'unknown'; 2312 } 2313 2314 if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 2315 2316 $this->link_nodes($node, true); 2317 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2318 return true; 2319 } 2320 2321 // The start tag cannot contain another start tag, if so add as text 2322 // i.e. "<<html>" 2323 if ($pos = strpos($tag, '<') !== false) { 2324 $tag = '<' . substr($tag, 0, -1); 2325 $node->_[HDOM_INFO_TEXT] = $tag; 2326 $this->link_nodes($node, false); 2327 $this->char = $this->doc[--$this->pos]; // prev 2328 return true; 2329 } 2330 2331 // Handle invalid tag names (i.e. "<html#doc>") 2332 if (!preg_match('/^\w[\w:-]*$/', $tag)) { 2333 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); 2334 2335 // Next char is the beginning of a new tag, don't touch it. 2336 if ($this->char === '<') { 2337 $this->link_nodes($node, false); 2338 return true; 2339 } 2340 2341 // Next char closes current tag, add and be done with it. 2342 if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 2343 $this->link_nodes($node, false); 2344 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2345 return true; 2346 } 2347 2348 // begin tag, add new node 2349 $node->nodetype = HDOM_TYPE_ELEMENT; 2350 $tag_lower = strtolower($tag); 2351 $node->tag = ($this->lowercase) ? $tag_lower : $tag; 2352 2353 // handle optional closing tags 2354 if (isset($this->optional_closing_tags[$tag_lower])) { 2355 // Traverse ancestors to close all optional closing tags 2356 while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { 2357 $this->parent->_[HDOM_INFO_END] = 0; 2358 $this->parent = $this->parent->parent; 2359 } 2360 $node->parent = $this->parent; 2361 } 2362 2363 $guard = 0; // prevent infinity loop 2364 2365 // [0] Space between tag and first attribute 2366 $space = array($this->copy_skip($this->token_blank), '', ''); 2367 2368 // attributes 2369 do { 2370 // Everything until the first equal sign should be the attribute name 2371 $name = $this->copy_until($this->token_equal); 2372 2373 if ($name === '' && $this->char !== null && $space[0] === '') { 2374 break; 2375 } 2376 2377 if ($guard === $this->pos) { // Escape infinite loop 2378 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2379 continue; 2380 } 2381 2382 $guard = $this->pos; 2383 2384 // handle endless '<' 2385 // Out of bounds before the tag ended 2386 if ($this->pos >= $this->size - 1 && $this->char !== '>') { 2387 $node->nodetype = HDOM_TYPE_TEXT; 2388 $node->_[HDOM_INFO_END] = 0; 2389 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name; 2390 $node->tag = 'text'; 2391 $this->link_nodes($node, false); 2392 return true; 2393 } 2394 2395 // handle mismatch '<' 2396 // Attributes cannot start after opening tag 2397 if ($this->doc[$this->pos - 1] == '<') { 2398 $node->nodetype = HDOM_TYPE_TEXT; 2399 $node->tag = 'text'; 2400 $node->attr = array(); 2401 $node->_[HDOM_INFO_END] = 0; 2402 $node->_[HDOM_INFO_TEXT] = substr( 2403 $this->doc, 2404 $begin_tag_pos, 2405 $this->pos - $begin_tag_pos - 1 2406 ); 2407 $this->pos -= 2; 2408 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2409 $this->link_nodes($node, false); 2410 return true; 2411 } 2412 2413 if ($name !== '/' && $name !== '') { // this is a attribute name 2414 // [1] Whitespace after attribute name 2415 $space[1] = $this->copy_skip($this->token_blank); 2416 2417 $name = $this->restore_noise($name); // might be a noisy name 2418 2419 if ($this->lowercase) { $name = strtolower($name); } 2420 2421 if ($this->char === '=') { // attribute with value 2422 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2423 $this->parse_attr($node, $name, $space); // get attribute value 2424 } else { 2425 //no value attr: nowrap, checked selected... 2426 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 2427 $node->attr[$name] = true; 2428 if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev 2429 } 2430 2431 $node->_[HDOM_INFO_SPACE][] = $space; 2432 2433 // prepare for next attribute 2434 $space = array( 2435 $this->copy_skip($this->token_blank), 2436 '', 2437 '' 2438 ); 2439 } else { // no more attributes 2440 break; 2441 } 2442 } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended 2443 2444 $this->link_nodes($node, true); 2445 $node->_[HDOM_INFO_ENDSPACE] = $space[0]; 2446 2447 // handle empty tags (i.e. "<div/>") 2448 if ($this->copy_until_char('>') === '/') { 2449 $node->_[HDOM_INFO_ENDSPACE] .= '/'; 2450 $node->_[HDOM_INFO_END] = 0; 2451 } else { 2452 // reset parent 2453 if (!isset($this->self_closing_tags[strtolower($node->tag)])) { 2454 $this->parent = $node; 2455 } 2456 } 2457 2458 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2459 2460 // If it's a BR tag, we need to set it's text to the default text. 2461 // This way when we see it in plaintext, we can generate formatting that the user wants. 2462 // since a br tag never has sub nodes, this works well. 2463 if ($node->tag === 'br') { 2464 $node->_[HDOM_INFO_INNER] = $this->default_br_text; 2465 } 2466 2467 return true; 2468 } 2469 2470 /** 2471 * Parse attribute from current document position 2472 * 2473 * @param object $node Node for the attributes 2474 * @param string $name Name of the current attribute 2475 * @param array $space Array for spacing information 2476 * @return void 2477 */ 2478 protected function parse_attr($node, $name, &$space) 2479 { 2480 // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037 2481 // If the attribute is already defined inside a tag, only pay attention 2482 // to the first one as opposed to the last one. 2483 // https://stackoverflow.com/a/26341866 2484 if (isset($node->attr[$name])) { 2485 return; 2486 } 2487 2488 // [2] Whitespace between "=" and the value 2489 $space[2] = $this->copy_skip($this->token_blank); 2490 2491 switch ($this->char) { 2492 case '"': // value is anything between double quotes 2493 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 2494 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2495 $node->attr[$name] = $this->restore_noise($this->copy_until_char('"')); 2496 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2497 break; 2498 case '\'': // value is anything between single quotes 2499 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE; 2500 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2501 $node->attr[$name] = $this->restore_noise($this->copy_until_char('\'')); 2502 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2503 break; 2504 default: // value is anything until the first space or end tag 2505 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 2506 $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr)); 2507 } 2508 // PaperG: Attributes should not have \r or \n in them, that counts as 2509 // html whitespace. 2510 $node->attr[$name] = str_replace("\r", '', $node->attr[$name]); 2511 $node->attr[$name] = str_replace("\n", '', $node->attr[$name]); 2512 // PaperG: If this is a "class" selector, lets get rid of the preceeding 2513 // and trailing space since some people leave it in the multi class case. 2514 if ($name === 'class') { 2515 $node->attr[$name] = trim($node->attr[$name]); 2516 } 2517 } 2518 2519 /** 2520 * Link node to parent node 2521 * 2522 * @param object $node Node to link to parent 2523 * @param bool $is_child True if the node is a child of parent 2524 * @return void 2525 */ 2526 // link node's parent 2527 protected function link_nodes(&$node, $is_child) 2528 { 2529 $node->parent = $this->parent; 2530 $this->parent->nodes[] = $node; 2531 if ($is_child) { 2532 $this->parent->children[] = $node; 2533 } 2534 } 2535 2536 /** 2537 * Add tag as text node to current node 2538 * 2539 * @param string $tag Tag name 2540 * @return bool True on success 2541 */ 2542 protected function as_text_node($tag) 2543 { 2544 $node = new d_simple_html_dom_node($this); 2545 ++$this->cursor; 2546 $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>'; 2547 $this->link_nodes($node, false); 2548 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2549 return true; 2550 } 2551 2552 /** 2553 * Seek from the current document position to the first occurrence of a 2554 * character not defined by the provided string. Update the current document 2555 * position to the new position. 2556 * 2557 * @param string $chars A string containing every allowed character. 2558 * @return void 2559 */ 2560 protected function skip($chars) 2561 { 2562 $this->pos += strspn($this->doc, $chars, $this->pos); 2563 $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2564 } 2565 2566 /** 2567 * Copy substring from the current document position to the first occurrence 2568 * of a character not defined by the provided string. 2569 * 2570 * @param string $chars A string containing every allowed character. 2571 * @return string Substring from the current document position to the first 2572 * occurrence of a character not defined by the provided string. 2573 */ 2574 protected function copy_skip($chars) 2575 { 2576 $pos = $this->pos; 2577 $len = strspn($this->doc, $chars, $pos); 2578 $this->pos += $len; 2579 $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2580 if ($len === 0) { return ''; } 2581 return substr($this->doc, $pos, $len); 2582 } 2583 2584 /** 2585 * Copy substring from the current document position to the first occurrence 2586 * of any of the provided characters. 2587 * 2588 * @param string $chars A string containing every character to stop at. 2589 * @return string Substring from the current document position to the first 2590 * occurrence of any of the provided characters. 2591 */ 2592 protected function copy_until($chars) 2593 { 2594 $pos = $this->pos; 2595 $len = strcspn($this->doc, $chars, $pos); 2596 $this->pos += $len; 2597 $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2598 return substr($this->doc, $pos, $len); 2599 } 2600 2601 /** 2602 * Copy substring from the current document position to the first occurrence 2603 * of the provided string. 2604 * 2605 * @param string $char The string to stop at. 2606 * @return string Substring from the current document position to the first 2607 * occurrence of the provided string. 2608 */ 2609 protected function copy_until_char($char) 2610 { 2611 if ($this->char === null) { return ''; } 2612 2613 if (($pos = strpos($this->doc, $char, $this->pos)) === false) { 2614 $ret = substr($this->doc, $this->pos, $this->size - $this->pos); 2615 $this->char = null; 2616 $this->pos = $this->size; 2617 return $ret; 2618 } 2619 2620 if ($pos === $this->pos) { return ''; } 2621 2622 $pos_old = $this->pos; 2623 $this->char = $this->doc[$pos]; 2624 $this->pos = $pos; 2625 return substr($this->doc, $pos_old, $pos - $pos_old); 2626 } 2627 2628 /** 2629 * Remove noise from HTML content 2630 * 2631 * Noise is stored to {@see d_simple_html_dom::$noise} 2632 * 2633 * @param string $pattern The regex pattern used for finding noise 2634 * @param bool $remove_tag True to remove the entire match. Default is false 2635 * to only remove the captured data. 2636 */ 2637 protected function remove_noise($pattern, $remove_tag = false) 2638 { 2639 global $debug_object; 2640 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2641 2642 $count = preg_match_all( 2643 $pattern, 2644 $this->doc, 2645 $matches, 2646 PREG_SET_ORDER | PREG_OFFSET_CAPTURE 2647 ); 2648 2649 for ($i = $count - 1; $i > -1; --$i) { 2650 $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000); 2651 2652 if (is_object($debug_object)) { 2653 $debug_object->debug_log(2, 'key is: ' . $key); 2654 } 2655 2656 $idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch 2657 $this->noise[$key] = $matches[$i][$idx][0]; 2658 $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 2659 } 2660 2661 // reset the length of content 2662 $this->size = strlen($this->doc); 2663 2664 if ($this->size > 0) { 2665 $this->char = $this->doc[0]; 2666 } 2667 } 2668 2669 /** 2670 * Restore noise to HTML content 2671 * 2672 * Noise is restored from {@see d_simple_html_dom::$noise} 2673 * 2674 * @param string $text A subset of HTML containing noise 2675 * @return string The same content with noise restored 2676 */ 2677 function restore_noise($text) 2678 { 2679 global $debug_object; 2680 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2681 2682 while (($pos = strpos($text, '___noise___')) !== false) { 2683 // Sometimes there is a broken piece of markup, and we don't GET the 2684 // pos+11 etc... token which indicates a problem outside of us... 2685 2686 // todo: "___noise___1000" (or any number with four or more digits) 2687 // in the DOM causes an infinite loop which could be utilized by 2688 // malicious software 2689 if (strlen($text) > $pos + 15) { 2690 $key = '___noise___' 2691 . $text[$pos + 11] 2692 . $text[$pos + 12] 2693 . $text[$pos + 13] 2694 . $text[$pos + 14] 2695 . $text[$pos + 15]; 2696 2697 if (is_object($debug_object)) { 2698 $debug_object->debug_log(2, 'located key of: ' . $key); 2699 } 2700 2701 if (isset($this->noise[$key])) { 2702 $text = substr($text, 0, $pos) 2703 . $this->noise[$key] 2704 . substr($text, $pos + 16); 2705 } else { 2706 // do this to prevent an infinite loop. 2707 $text = substr($text, 0, $pos) 2708 . 'UNDEFINED NOISE FOR KEY: ' 2709 . $key 2710 . substr($text, $pos + 16); 2711 } 2712 } else { 2713 // There is no valid key being given back to us... We must get 2714 // rid of the ___noise___ or we will have a problem. 2715 $text = substr($text, 0, $pos) 2716 . 'NO NUMERIC NOISE KEY' 2717 . substr($text, $pos + 11); 2718 } 2719 } 2720 return $text; 2721 } 2722 2723 // Sometimes we NEED one of the noise elements. 2724 function search_noise($text) 2725 { 2726 global $debug_object; 2727 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2728 2729 foreach($this->noise as $noiseElement) { 2730 if (strpos($noiseElement, $text) !== false) { 2731 return $noiseElement; 2732 } 2733 } 2734 } 2735 2736 function __toString() 2737 { 2738 return $this->root->innertext(); 2739 } 2740 2741 function __get($name) 2742 { 2743 switch ($name) { 2744 case 'outertext': 2745 return $this->root->innertext(); 2746 case 'innertext': 2747 return $this->root->innertext(); 2748 case 'plaintext': 2749 return $this->root->text(); 2750 case 'charset': 2751 return $this->_charset; 2752 case 'target_charset': 2753 return $this->_target_charset; 2754 } 2755 } 2756 2757 // camel naming conventions 2758 function childNodes($idx = -1) 2759 { 2760 return $this->root->childNodes($idx); 2761 } 2762 2763 function firstChild() 2764 { 2765 return $this->root->first_child(); 2766 } 2767 2768 function lastChild() 2769 { 2770 return $this->root->last_child(); 2771 } 2772 2773 function createElement($name, $value = null) 2774 { 2775 return @str_get_html("<$name>$value</$name>")->first_child(); 2776 } 2777 2778 function createTextNode($value) 2779 { 2780 return @end(str_get_html($value)->nodes); 2781 } 2782 2783 function getElementById($id) 2784 { 2785 return $this->find("#$id", 0); 2786 } 2787 2788 function getElementsById($id, $idx = null) 2789 { 2790 return $this->find("#$id", $idx); 2791 } 2792 2793 function getElementByTagName($name) 2794 { 2795 return $this->find($name, 0); 2796 } 2797 2798 function getElementsByTagName($name, $idx = -1) 2799 { 2800 return $this->find($name, $idx); 2801 } 2802 2803 function loadFile() 2804 { 2805 $args = func_get_args(); 2806 $this->load_file($args); 2807 } 2808 }