shop.balmet.com

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README

d_simple_html_dom.php (74361B)


      1 <?php
      2 /**
      3  * Website: http://sourceforge.net/projects/simplehtmldom/
      4  * Additional projects: http://sourceforge.net/projects/debugobject/
      5  * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
      6  * Contributions by:
      7  *	 Yousuke Kumakura (Attribute filters)
      8  *	 Vadim Voituk (Negative indexes supports of "find" method)
      9  *	 Antcs (Constructor with automatically load contents either text or file/url)
     10  *
     11  * all affected sections have comments starting with "PaperG"
     12  *
     13  * Paperg - Added case insensitive testing of the value of the selector.
     14  *
     15  * Paperg - Added tag_start for the starting index of tags - NOTE: This works
     16  * but not accurately. This tag_start gets counted AFTER \r\n have been crushed
     17  * out, and after the remove_noise calls so it will not reflect the REAL
     18  * position of the tag in the source, it will almost always be smaller by some
     19  * amount. We use this to determine how far into the file the tag in question
     20  * is. This "percentage" will never be accurate as the $dom->size is the "real"
     21  * number of bytes the dom was created from. But for most purposes, it's a
     22  * really good estimation.
     23  *
     24  * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags
     25  * closed is great for malformed html, but it CAN lead to parsing errors.
     26  *
     27  * Allow the user to tell us how much they trust the html.
     28  *
     29  * Paperg add the text and plaintext to the selectors for the find syntax.
     30  * plaintext implies text in the innertext of a node.  text implies that the
     31  * tag is a text node. This allows for us to find tags based on the text they
     32  * contain.
     33  *
     34  * Create find_ancestor_tag to see if a tag is - at any level - inside of
     35  * another specific tag.
     36  *
     37  * Paperg: added parse_charset so that we know about the character set of
     38  * the source document. NOTE: If the user's system has a routine called
     39  * get_last_retrieve_url_contents_content_type available, we will assume it's
     40  * returning the content-type header from the last transfer or curl_exec, and
     41  * we will parse that and use it in preference to any other method of charset
     42  * detection.
     43  *
     44  * Found infinite loop in the case of broken html in restore_noise. Rewrote to
     45  * protect from that.
     46  *
     47  * PaperG (John Schlick) Added get_display_size for "IMG" tags.
     48  *
     49  * Licensed under The MIT License
     50  * Redistributions of files must retain the above copyright notice.
     51  *
     52  * @author S.C. Chen <me578022@gmail.com>
     53  * @author John Schlick
     54  * @author Rus Carroll
     55  * @version Rev. 1.8.1 (247)
     56  * @package PlaceLocalInclude
     57  * @subpackage d_simple_html_dom
     58  */
     59 
     60 /**
     61  * All of the Defines for the classes below.
     62  * @author S.C. Chen <me578022@gmail.com>
     63  */
// Node types, stored in d_simple_html_dom_node::$nodetype.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);
// Attribute quoting styles; makeup() maps these to '"', '\'' or no quote.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);
// Keys into the per-node info array d_simple_html_dom_node::$_.
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);
     81 
     82 /** The default target charset */
     83 defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
     84 
     85 /** The default <br> text used instead of <br> tags when returning text */
     86 defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
     87 
     88 /** The default <span> text used instead of <span> tags when returning text */
     89 defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
     90 
     91 /** The maximum file size the parser should load */
     92 defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
     93 
     94 /** Contents between curly braces "{" and "}" are interpreted as text */
     95 define('HDOM_SMARTY_AS_TEXT', 1);
     96 
     97 // helper functions
     98 // -----------------------------------------------------------------------------
     99 // get html dom from file
    100 // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
    101 function file_get_html(
    102 	$url,
    103 	$use_include_path = false,
    104 	$context = null,
    105 	$offset = 0,
    106 	$maxLen = -1,
    107 	$lowercase = true,
    108 	$forceTagsClosed = true,
    109 	$target_charset = DEFAULT_TARGET_CHARSET,
    110 	$stripRN = true,
    111 	$defaultBRText = DEFAULT_BR_TEXT,
    112 	$defaultSpanText = DEFAULT_SPAN_TEXT)
    113 {
    114 	// Ensure maximum length is greater than zero
    115 	if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }
    116 
    117 	// We DO force the tags to be terminated.
    118 	$dom = new d_simple_html_dom(
    119 		null,
    120 		$lowercase,
    121 		$forceTagsClosed,
    122 		$target_charset,
    123 		$stripRN,
    124 		$defaultBRText,
    125 		$defaultSpanText);
    126 
    127 	/**
    128 	 * For sourceforge users: uncomment the next line and comment the
    129 	 * retrieve_url_contents line 2 lines down if it is not already done.
    130 	 */
    131 	$contents = file_get_contents(
    132 		$url,
    133 		$use_include_path,
    134 		$context,
    135 		$offset,
    136 		$maxLen);
    137 
    138 	// Paperg - use our own mechanism for getting the contents as we want to
    139 	// control the timeout.
    140 	// $contents = retrieve_url_contents($url);
    141 	if (empty($contents) || strlen($contents) > $maxLen) { return false; }
    142 
    143 	// The second parameter can force the selectors to all be lowercase.
    144 	$dom->load($contents, $lowercase, $stripRN);
    145 	return $dom;
    146 }
    147 
    148 // get html dom from string
    149 function str_get_html(
    150 	$str,
    151 	$lowercase = true,
    152 	$forceTagsClosed = true,
    153 	$target_charset = DEFAULT_TARGET_CHARSET,
    154 	$stripRN = true,
    155 	$defaultBRText = DEFAULT_BR_TEXT,
    156 	$defaultSpanText = DEFAULT_SPAN_TEXT)
    157 {
    158 	$dom = new d_simple_html_dom(
    159 		null,
    160 		$lowercase,
    161 		$forceTagsClosed,
    162 		$target_charset,
    163 		$stripRN,
    164 		$defaultBRText,
    165 		$defaultSpanText);
    166 
    167 	if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
    168 		$dom->clear();
    169 		return false;
    170 	}
    171 
    172 	$dom->load($str, $lowercase, $stripRN);
    173 	return $dom;
    174 }
    175 
    176 // dump html dom tree
    177 function dump_html_tree($node, $show_attr = true, $deep = 0)
    178 {
    179 	$node->dump($node);
    180 }
    181 
    182 /**
    183  * simple html dom node
    184  * PaperG - added ability for "find" routine to lowercase the value of the
    185  * selector.
    186  *
    187  * PaperG - added $tag_start to track the start position of the tag in the total
    188  * byte index
    189  *
    190  * @package PlaceLocalInclude
    191  */
    192 class d_simple_html_dom_node
    193 {
    194 	/**
    195 	 * Node type
    196 	 *
    197 	 * Default is {@see HDOM_TYPE_TEXT}
    198 	 *
    199 	 * @var int
    200 	 */
    201 	public $nodetype = HDOM_TYPE_TEXT;
    202 
    203 	/**
    204 	 * Tag name
    205 	 *
    206 	 * Default is 'text'
    207 	 *
    208 	 * @var string
    209 	 */
    210 	public $tag = 'text';
    211 
    212 	/**
    213 	 * List of attributes
    214 	 *
    215 	 * @var array
    216 	 */
    217 	public $attr = array();
    218 
    219 	/**
    220 	 * List of child node objects
    221 	 *
    222 	 * @var array
    223 	 */
    224 	public $children = array();
    225 	public $nodes = array();
    226 
    227 	/**
    228 	 * The parent node object
    229 	 *
    230 	 * @var object|null
    231 	 */
    232 	public $parent = null;
    233 
    234 	// The "info" array - see HDOM_INFO_... for what each element contains.
    235 	public $_ = array();
    236 
    237 	/**
    238 	 * Start position of the tag in the document
    239 	 *
    240 	 * @var int
    241 	 */
    242 	public $tag_start = 0;
    243 
    244 	/**
    245 	 * The DOM object
    246 	 *
    247 	 * @var object|null
    248 	 */
    249 	private $dom = null;
    250 
    251 	/**
    252 	 * Construct new node object
    253 	 *
    254 	 * Adds itself to the list of DOM Nodes {@see d_simple_html_dom::$nodes}
    255 	 */
    256 	function __construct($dom)
    257 	{
    258 		$this->dom = $dom;
    259 		$dom->nodes[] = $this;
    260 	}
    261 
	/**
	 * Release this node's references (see clear()) when it is destroyed.
	 */
	function __destruct()
	{
		$this->clear();
	}
    266 
	/**
	 * String conversion yields the node's outer text.
	 *
	 * @return string
	 */
	function __toString()
	{
		return $this->outertext();
	}
    271 
    272 	// clean up memory due to php5 circular references memory leak...
    273 	function clear()
    274 	{
    275 		$this->dom = null;
    276 		$this->nodes = null;
    277 		$this->parent = null;
    278 		$this->children = null;
    279 	}
    280 
    281 	// dump node's tree
    282 	function dump($show_attr = true, $deep = 0)
    283 	{
    284 		$lead = str_repeat('	', $deep);
    285 
    286 		echo $lead . $this->tag;
    287 
    288 		if ($show_attr && count($this->attr) > 0) {
    289 			echo '(';
    290 			foreach ($this->attr as $k => $v) {
    291 				echo "[$k]=>\"" . $this->$k . '", ';
    292 			}
    293 			echo ')';
    294 		}
    295 
    296 		echo "\n";
    297 
    298 		if ($this->nodes) {
    299 			foreach ($this->nodes as $c) {
    300 				$c->dump($show_attr, $deep + 1);
    301 			}
    302 		}
    303 	}
    304 
    305 
    306 	// Debugging function to dump a single dom node with a bunch of information about it.
    307 	function dump_node($echo = true)
    308 	{
    309 		$string = $this->tag;
    310 
    311 		if (count($this->attr) > 0) {
    312 			$string .= '(';
    313 			foreach ($this->attr as $k => $v) {
    314 				$string .= "[$k]=>\"" . $this->$k . '", ';
    315 			}
    316 			$string .= ')';
    317 		}
    318 
    319 		if (count($this->_) > 0) {
    320 			$string .= ' $_ (';
    321 			foreach ($this->_ as $k => $v) {
    322 				if (is_array($v)) {
    323 					$string .= "[$k]=>(";
    324 					foreach ($v as $k2 => $v2) {
    325 						$string .= "[$k2]=>\"" . $v2 . '", ';
    326 					}
    327 					$string .= ')';
    328 				} else {
    329 					$string .= "[$k]=>\"" . $v . '", ';
    330 				}
    331 			}
    332 			$string .= ')';
    333 		}
    334 
    335 		if (isset($this->text)) {
    336 			$string .= ' text: (' . $this->text . ')';
    337 		}
    338 
    339 		$string .= " HDOM_INNER_INFO: '";
    340 
    341 		if (isset($node->_[HDOM_INFO_INNER])) {
    342 			$string .= $node->_[HDOM_INFO_INNER] . "'";
    343 		} else {
    344 			$string .= ' NULL ';
    345 		}
    346 
    347 		$string .= ' children: ' . count($this->children);
    348 		$string .= ' nodes: ' . count($this->nodes);
    349 		$string .= ' tag_start: ' . $this->tag_start;
    350 		$string .= "\n";
    351 
    352 		if ($echo) {
    353 			echo $string;
    354 			return;
    355 		} else {
    356 			return $string;
    357 		}
    358 	}
    359 
    360 	/**
    361 	 * Return or set parent node
    362 	 *
    363 	 * @param object|null $parent (optional) The parent node, `null` to return
    364 	 * the current parent node.
    365 	 * @return object|null The parent node
    366 	 */
    367 	function parent($parent = null)
    368 	{
    369 		// I am SURE that this doesn't work properly.
    370 		// It fails to unset the current node from it's current parents nodes or
    371 		// children list first.
    372 		if ($parent !== null) {
    373 			$this->parent = $parent;
    374 			$this->parent->nodes[] = $this;
    375 			$this->parent->children[] = $this;
    376 		}
    377 
    378 		return $this->parent;
    379 	}
    380 
    381 	/**
    382 	 * @return bool True if the node has at least one child node
    383 	 */
    384 	function has_child()
    385 	{
    386 		return !empty($this->children);
    387 	}
    388 
    389 	/**
    390 	 * Get child node at specified index
    391 	 *
    392 	 * @param int $idx The index of the child node to return, `-1` to return all
    393 	 * child nodes.
    394 	 * @return object|array|null The child node at the specified index, all child
    395 	 * nodes or null if the index is invalid.
    396 	 */
    397 	function children($idx = -1)
    398 	{
    399 		if ($idx === -1) {
    400 			return $this->children;
    401 		}
    402 
    403 		if (isset($this->children[$idx])) {
    404 			return $this->children[$idx];
    405 		}
    406 
    407 		return null;
    408 	}
    409 
    410 	/**
    411 	 * Get first child node
    412 	 *
    413 	 * @return object|null The first child node or null if the current node has
    414 	 * no child nodes.
    415 	 *
    416 	 * @todo Use `empty()` instead of `count()` to improve performance on large
    417 	 * arrays.
    418 	 */
    419 	function first_child()
    420 	{
    421 		if (count($this->children) > 0) {
    422 			return $this->children[0];
    423 		}
    424 		return null;
    425 	}
    426 
    427 	/**
    428 	 * Get last child node
    429 	 *
    430 	 * @return object|null The last child node or null if the current node has
    431 	 * no child nodes.
    432 	 *
    433 	 * @todo Use `end()` to slightly improve performance on large arrays.
    434 	 */
    435 	function last_child()
    436 	{
    437 		if (($count = count($this->children)) > 0) {
    438 			return $this->children[$count - 1];
    439 		}
    440 		return null;
    441 	}
    442 
    443 	/**
    444 	 * Get next sibling node
    445 	 *
    446 	 * @return object|null The sibling node or null if the current node has no
    447 	 * sibling nodes.
    448 	 */
    449 	function next_sibling()
    450 	{
    451 		if ($this->parent === null) {
    452 			return null;
    453 		}
    454 
    455 		$idx = 0;
    456 		$count = count($this->parent->children);
    457 
    458 		while ($idx < $count && $this !== $this->parent->children[$idx]) {
    459 			++$idx;
    460 		}
    461 
    462 		if (++$idx >= $count) {
    463 			return null;
    464 		}
    465 
    466 		return $this->parent->children[$idx];
    467 	}
    468 
    469 	/**
    470 	 * Get previous sibling node
    471 	 *
    472 	 * @return object|null The sibling node or null if the current node has no
    473 	 * sibling nodes.
    474 	 */
    475 	function prev_sibling()
    476 	{
    477 		if ($this->parent === null) { return null; }
    478 
    479 		$idx = 0;
    480 		$count = count($this->parent->children);
    481 
    482 		while ($idx < $count && $this !== $this->parent->children[$idx]) {
    483 			++$idx;
    484 		}
    485 
    486 		if (--$idx < 0) { return null; }
    487 
    488 		return $this->parent->children[$idx];
    489 	}
    490 
    491 	/**
    492 	 * Traverse ancestors to the first matching tag.
    493 	 *
    494 	 * @param string $tag Tag to find
    495 	 * @return object|null First matching node in the DOM tree or null if no
    496 	 * match was found.
    497 	 *
    498 	 * @todo Null is returned implicitly by calling ->parent on the root node.
    499 	 * This behaviour could change at any time, rendering this function invalid.
    500 	 */
    501 	function find_ancestor_tag($tag)
    502 	{
    503 		global $debug_object;
    504 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
    505 
    506 		// Start by including ourselves in the comparison.
    507 		$returnDom = $this;
    508 
    509 		while (!is_null($returnDom)) {
    510 			if (is_object($debug_object)) {
    511 				$debug_object->debug_log(2, 'Current tag is: ' . $returnDom->tag);
    512 			}
    513 
    514 			if ($returnDom->tag == $tag) {
    515 				break;
    516 			}
    517 
    518 			$returnDom = $returnDom->parent;
    519 		}
    520 
    521 		return $returnDom;
    522 	}
    523 
    524 	/**
    525 	 * Get node's inner text (everything inside the opening and closing tags)
    526 	 *
    527 	 * @return string
    528 	 */
    529 	function innertext()
    530 	{
    531 		if (isset($this->_[HDOM_INFO_INNER])) {
    532 			return $this->_[HDOM_INFO_INNER];
    533 		}
    534 
    535 		if (isset($this->_[HDOM_INFO_TEXT])) {
    536 			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    537 		}
    538 
    539 		$ret = '';
    540 
    541 		foreach ($this->nodes as $n) {
    542 			$ret .= $n->outertext();
    543 		}
    544 
    545 		return $ret;
    546 	}
    547 
    548 	/**
    549 	 * Get node's outer text (everything including the opening and closing tags)
    550 	 *
    551 	 * @return string
    552 	 */
    553 	function outertext()
    554 	{
    555 		global $debug_object;
    556 
    557 		if (is_object($debug_object)) {
    558 			$text = '';
    559 
    560 			if ($this->tag === 'text') {
    561 				if (!empty($this->text)) {
    562 					$text = ' with text: ' . $this->text;
    563 				}
    564 			}
    565 
    566 			$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
    567 		}
    568 
    569 		if ($this->tag === 'root') return $this->innertext();
    570 
    571 		// trigger callback
    572 		if ($this->dom && $this->dom->callback !== null) {
    573 			call_user_func_array($this->dom->callback, array($this));
    574 		}
    575 
    576 		if (isset($this->_[HDOM_INFO_OUTER])) {
    577 			return $this->_[HDOM_INFO_OUTER];
    578 		}
    579 
    580 		if (isset($this->_[HDOM_INFO_TEXT])) {
    581 			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    582 		}
    583 
    584 		// render begin tag
    585 		if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
    586 			$ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    587 		} else {
    588 			$ret = '';
    589 		}
    590 
    591 		// render inner text
    592 		if (isset($this->_[HDOM_INFO_INNER])) {
    593 			// If it's a br tag...  don't return the HDOM_INNER_INFO that we
    594 			// may or may not have added.
    595 			if ($this->tag !== 'br') {
    596 				$ret .= $this->_[HDOM_INFO_INNER];
    597 			}
    598 		} else {
    599 			if ($this->nodes) {
    600 				foreach ($this->nodes as $n) {
    601 					$ret .= $this->convert_text($n->outertext());
    602 				}
    603 			}
    604 		}
    605 
    606 		// render end tag
    607 		if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
    608 			$ret .= '</' . $this->tag . '>';
    609 		}
    610 
    611 		return $ret;
    612 	}
    613 
    614 	/**
    615 	 * Get node's plain text (everything excluding all tags)
    616 	 *
    617 	 * @return string
    618 	 */
    619 	function text()
    620 	{
    621 		if (isset($this->_[HDOM_INFO_INNER])) {
    622 			return $this->_[HDOM_INFO_INNER];
    623 		}
    624 
    625 		switch ($this->nodetype) {
    626 			case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    627 			case HDOM_TYPE_COMMENT: return '';
    628 			case HDOM_TYPE_UNKNOWN: return '';
    629 		}
    630 
    631 		if (strcasecmp($this->tag, 'script') === 0) { return ''; }
    632 		if (strcasecmp($this->tag, 'style') === 0) { return ''; }
    633 
    634 		$ret = '';
    635 
    636 		// In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed
    637 		// for some span tags, and some p tags) $this->nodes is set to NULL.
    638 		// NOTE: This indicates that there is a problem where it's set to NULL
    639 		// without a clear happening.
    640 		// WHY is this happening?
    641 		if (!is_null($this->nodes)) {
    642 			foreach ($this->nodes as $n) {
    643 				// Start paragraph after a blank line
    644 				if ($n->tag === 'p') {
    645 					$ret .= "\n\n";
    646 				}
    647 
    648 				$ret .= $this->convert_text($n->text());
    649 
    650 				// If this node is a span... add a space at the end of it so
    651 				// multiple spans don't run into each other.  This is plaintext
    652 				// after all.
    653 				if ($n->tag === 'span') {
    654 					$ret .= $this->dom->default_span_text;
    655 				}
    656 			}
    657 		}
    658 		return trim($ret);
    659 	}
    660 
    661 	/**
    662 	 * Get node's xml text (inner text as a CDATA section)
    663 	 *
    664 	 * @return string
    665 	 */
    666 	function xmltext()
    667 	{
    668 		$ret = $this->innertext();
    669 		$ret = str_ireplace('<![CDATA[', '', $ret);
    670 		$ret = str_replace(']]>', '', $ret);
    671 		return $ret;
    672 	}
    673 
    674 	// build node's text with tag
    675 	function makeup()
    676 	{
    677 		// text, comment, unknown
    678 		if (isset($this->_[HDOM_INFO_TEXT])) {
    679 			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    680 		}
    681 
    682 		$ret = '<' . $this->tag;
    683 		$i = -1;
    684 
    685 		foreach ($this->attr as $key => $val) {
    686 			++$i;
    687 
    688 			// skip removed attribute
    689 			if ($val === null || $val === false) { continue; }
    690 
    691 			$ret .= $this->_[HDOM_INFO_SPACE][$i][0];
    692 
    693 			//no value attr: nowrap, checked selected...
    694 			if ($val === true) {
    695 				$ret .= $key;
    696 			} else {
    697 				switch ($this->_[HDOM_INFO_QUOTE][$i])
    698 				{
    699 					case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
    700 					case HDOM_QUOTE_SINGLE: $quote = '\''; break;
    701 					default: $quote = '';
    702 				}
    703 
    704 				$ret .= $key
    705 				. $this->_[HDOM_INFO_SPACE][$i][1]
    706 				. '='
    707 				. $this->_[HDOM_INFO_SPACE][$i][2]
    708 				. $quote
    709 				. $val
    710 				. $quote;
    711 			}
    712 		}
    713 
    714 		$ret = $this->dom->restore_noise($ret);
    715 		return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
    716 	}
    717 
    718 	/**
    719 	 * Find elements by CSS selector
    720 	 *
    721 	 * @param string $selector The CSS selector
    722 	 * @param int|null $idx Index of element to return from the list of matching
    723 	 * elements (default: `null` = disabled).
    724 	 * @param bool $lowercase Matches tag names case insensitive (lowercase) if
    725 	 * enabled (default: `false`)
    726 	 * @return array|object|null A list of elements matching the specified CSS
    727 	 * selector or a single element if $idx is specified or null if no element
    728 	 * was found.
    729 	 */
    730 	function find($selector, $idx = null, $lowercase = false)
    731 	{
    732 		$selectors = $this->parse_selector($selector);
    733 		if (($count = count($selectors)) === 0) { return array(); }
    734 		$found_keys = array();
    735 
    736 		// find each selector
    737 		for ($c = 0; $c < $count; ++$c) {
    738 			// The change on the below line was documented on the sourceforge
    739 			// code tracker id 2788009
    740 			// used to be: if (($levle=count($selectors[0]))===0) return array();
    741 			if (($levle = count($selectors[$c])) === 0) { return array(); }
    742 			if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }
    743 
    744 			$head = array($this->_[HDOM_INFO_BEGIN] => 1);
    745 			$cmd = ' '; // Combinator
    746 
    747 			// handle descendant selectors, no recursive!
    748 			for ($l = 0; $l < $levle; ++$l) {
    749 				$ret = array();
    750 
    751 				foreach ($head as $k => $v) {
    752 					$n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
    753 					//PaperG - Pass this optional parameter on to the seek function.
    754 					$n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
    755 				}
    756 
    757 				$head = $ret;
    758 				$cmd = $selectors[$c][$l][4]; // Next Combinator
    759 			}
    760 
    761 			foreach ($head as $k => $v) {
    762 				if (!isset($found_keys[$k])) {
    763 					$found_keys[$k] = 1;
    764 				}
    765 			}
    766 		}
    767 
    768 		// sort keys
    769 		ksort($found_keys);
    770 
    771 		$found = array();
    772 		foreach ($found_keys as $k => $v) {
    773 			$found[] = $this->dom->nodes[$k];
    774 		}
    775 
    776 		// return nth-element or array
    777 		if (is_null($idx)) { return $found; }
    778 		elseif ($idx < 0) { $idx = count($found) + $idx; }
    779 		return (isset($found[$idx])) ? $found[$idx] : null;
    780 	}
    781 
    782 	/**
    783 	 * Seek DOM elements by selector
    784 	 *
    785 	 * **Note**
    786 	 * The selector element must be compatible to a selector from
    787 	 * {@see d_simple_html_dom_node::parse_selector()}
    788 	 *
    789 	 * @param array $selector A selector element
    790 	 * @param array $ret An array of matches
    791 	 * @param bool $lowercase Matches tag names case insensitive (lowercase) if
    792 	 * enabled (default: `false`)
    793 	 * @return void
    794 	 */
    795 	protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
    796 	{
    797 		global $debug_object;
    798 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
    799 
    800 		list($tag, $id, $class, $attributes, $cmb) = $selector;
    801 		$nodes = array();
    802 
    803 		if ($parent_cmd === ' ') { // Descendant Combinator
    804 			// Find parent closing tag if the current element doesn't have a closing
    805 			// tag (i.e. void element)
    806 			$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    807 			if ($end == 0) {
    808 				$parent = $this->parent;
    809 				while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) {
    810 					$end -= 1;
    811 					$parent = $parent->parent;
    812 				}
    813 				$end += $parent->_[HDOM_INFO_END];
    814 			}
    815 
    816 			// Get list of target nodes
    817 			$nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
    818 			$nodes_count = $end - $nodes_start;
    819 			$nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
    820 		} elseif ($parent_cmd === '>') { // Child Combinator
    821 			$nodes = $this->children;
    822 		} elseif ($parent_cmd === '+'
    823 			&& $this->parent
    824 			&& in_array($this, $this->parent->children)) { // Next-Sibling Combinator
    825 				$index = array_search($this, $this->parent->children, true) + 1;
    826 				$nodes[] = $this->parent->children[$index];
    827 		} elseif ($parent_cmd === '~'
    828 			&& $this->parent
    829 			&& in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator
    830 				$index = array_search($this, $this->parent->children, true);
    831 				$nodes = array_slice($this->parent->children, $index);
    832 		}
    833 
    834 		// Go throgh each element starting at this element until the end tag
    835 		// Note: If this element is a void tag, any previous void element is
    836 		// skipped.
    837 		foreach($nodes as $node) {
    838 			$pass = true;
    839 
    840 			// Skip root nodes
    841 			if(!$node->parent) {
    842 				$pass = false;
    843 			}
    844 
    845 			// Skip if node isn't a child node (i.e. text nodes)
    846 			if($pass && !in_array($node, $node->parent->children, true)) {
    847 				$pass = false;
    848 			}
    849 
    850 			// Skip if tag doesn't match
    851 			if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
    852 				$pass = false;
    853 			}
    854 
    855 			// Skip if ID doesn't exist
    856 			if ($pass && $id !== '' && !isset($node->attr['id'])) {
    857 				$pass = false;
    858 			}
    859 
    860 			// Check if ID matches
    861 			if ($pass && $id !== '' && isset($node->attr['id'])) {
    862 				// Note: Only consider the first ID (as browsers do)
    863 				$node_id = explode(' ', trim($node->attr['id']))[0];
    864 
    865 				if($id !== $node_id) { $pass = false; }
    866 			}
    867 
    868 			// Check if all class(es) exist
    869 			if ($pass && $class !== '' && is_array($class) && !empty($class)) {
    870 				if (isset($node->attr['class'])) {
    871 					$node_classes = explode(' ', $node->attr['class']);
    872 
    873 					if ($lowercase) {
    874 						$node_classes = array_map('strtolower', $node_classes);
    875 					}
    876 
    877 					foreach($class as $c) {
    878 						if(!in_array($c, $node_classes)) {
    879 							$pass = false;
    880 							break;
    881 						}
    882 					}
    883 				} else {
    884 					$pass = false;
    885 				}
    886 			}
    887 
    888 			// Check attributes
    889 			if ($pass
    890 				&& $attributes !== ''
    891 				&& is_array($attributes)
    892 				&& !empty($attributes)) {
    893 					foreach($attributes as $a) {
    894 						list (
    895 							$att_name,
    896 							$att_expr,
    897 							$att_val,
    898 							$att_inv,
    899 							$att_case_sensitivity
    900 						) = $a;
    901 
    902 						// Handle indexing attributes (i.e. "[2]")
    903 						/**
    904 						 * Note: This is not supported by the CSS Standard but adds
    905 						 * the ability to select items compatible to XPath (i.e.
    906 						 * the 3rd element within it's parent).
    907 						 *
    908 						 * Note: This doesn't conflict with the CSS Standard which
    909 						 * doesn't work on numeric attributes anyway.
    910 						 */
    911 						if (is_numeric($att_name)
    912 							&& $att_expr === ''
    913 							&& $att_val === '') {
    914 								$count = 0;
    915 
    916 								// Find index of current element in parent
    917 								foreach ($node->parent->children as $c) {
    918 									if ($c->tag === $node->tag) ++$count;
    919 									if ($c === $node) break;
    920 								}
    921 
    922 								// If this is the correct node, continue with next
    923 								// attribute
    924 								if ($count === (int)$att_name) continue;
    925 						}
    926 
    927 						// Check attribute availability
    928 						if ($att_inv) { // Attribute should NOT be set
    929 							if (isset($node->attr[$att_name])) {
    930 								$pass = false;
    931 								break;
    932 							}
    933 						} else { // Attribute should be set
    934 							// todo: "plaintext" is not a valid CSS selector!
    935 							if ($att_name !== 'plaintext'
    936 								&& !isset($node->attr[$att_name])) {
    937 									$pass = false;
    938 									break;
    939 							}
    940 						}
    941 
    942 						// Continue with next attribute if expression isn't defined
    943 						if ($att_expr === '') continue;
    944 
    945 						// If they have told us that this is a "plaintext"
    946 						// search then we want the plaintext of the node - right?
    947 						// todo "plaintext" is not a valid CSS selector!
    948 						if ($att_name === 'plaintext') {
    949 							$nodeKeyValue = $node->text();
    950 						} else {
    951 							$nodeKeyValue = $node->attr[$att_name];
    952 						}
    953 
    954 						if (is_object($debug_object)) {
    955 							$debug_object->debug_log(2,
    956 								'testing node: '
    957 								. $node->tag
    958 								. ' for attribute: '
    959 								. $att_name
    960 								. $att_expr
    961 								. $att_val
    962 								. ' where nodes value is: '
    963 								. $nodeKeyValue
    964 							);
    965 						}
    966 
    967 						// If lowercase is set, do a case insensitive test of
    968 						// the value of the selector.
    969 						if ($lowercase) {
    970 							$check = $this->match(
    971 								$att_expr,
    972 								strtolower($att_val),
    973 								strtolower($nodeKeyValue),
    974 								$att_case_sensitivity
    975 							);
    976 						} else {
    977 							$check = $this->match(
    978 								$att_expr,
    979 								$att_val,
    980 								$nodeKeyValue,
    981 								$att_case_sensitivity
    982 							);
    983 						}
    984 
    985 						if (is_object($debug_object)) {
    986 							$debug_object->debug_log(2,
    987 								'after match: '
    988 								. ($check ? 'true' : 'false')
    989 							);
    990 						}
    991 
    992 						if (!$check) {
    993 							$pass = false;
    994 							break;
    995 						}
    996 					}
    997 			}
    998 
    999 			// Found a match. Add to list and clear node
   1000 			if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
   1001 			unset($node);
   1002 		}
   1003 		// It's passed by reference so this is actually what this function returns.
   1004 		if (is_object($debug_object)) {
   1005 			$debug_object->debug_log(1, 'EXIT - ret: ', $ret);
   1006 		}
   1007 	}
   1008 
   1009 	/**
   1010 	 * Match value and pattern for a given CSS expression
   1011 	 *
   1012 	 * **Supported Expressions**
   1013 	 *
   1014 	 * | Expression | Description
   1015 	 * | ---------- | -----------
   1016 	 * | `=`        | $value and $pattern must be equal
   1017 	 * | `!=`       | $value and $pattern must not be equal
   1018 	 * | `^=`       | $value must start with $pattern
   1019 	 * | `$=`       | $value must end with $pattern
   1020 	 * | `*=`       | $value must contain $pattern
   1021 	 *
   1022 	 * @param string $exp The expression.
   1023 	 * @param string $pattern The pattern
   1024 	 * @param string $value The value
   1025 	 * @value bool True if $value matches $pattern
   1026 	 */
   1027 	protected function match($exp, $pattern, $value, $case_sensitivity)
   1028 	{
   1029 		global $debug_object;
   1030 		if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
   1031 
   1032 		if ($case_sensitivity === 'i') {
   1033 			$pattern = strtolower($pattern);
   1034 			$value = strtolower($value);
   1035 		}
   1036 
   1037 		switch ($exp) {
   1038 			case '=':
   1039 				return ($value === $pattern);
   1040 			case '!=':
   1041 				return ($value !== $pattern);
   1042 			case '^=':
   1043 				return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
   1044 			case '$=':
   1045 				return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
   1046 			case '*=':
   1047 				return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
   1048 			case '|=':
   1049 				/**
   1050 				 * [att|=val]
   1051 				 *
   1052 				 * Represents an element with the att attribute, its value
   1053 				 * either being exactly "val" or beginning with "val"
   1054 				 * immediately followed by "-" (U+002D).
   1055 				 */
   1056 				return strpos($value, $pattern) === 0;
   1057 			case '~=':
   1058 				/**
   1059 				 * [att~=val]
   1060 				 *
   1061 				 * Represents an element with the att attribute whose value is a
   1062 				 * whitespace-separated list of words, one of which is exactly
   1063 				 * "val". If "val" contains whitespace, it will never represent
   1064 				 * anything (since the words are separated by spaces). Also if
   1065 				 * "val" is the empty string, it will never represent anything.
   1066 				 */
   1067 				return in_array($pattern, explode(' ', trim($value)), true);
   1068 		}
   1069 		return false;
   1070 	}
   1071 
   1072 	/**
   1073 	 * Parse CSS selector
   1074 	 *
   1075 	 * @param string $selector_string CSS selector string
   1076 	 * @return array List of CSS selectors. The format depends on the type of
   1077 	 * selector:
   1078 	 *
   1079 	 * ```php
   1080 	 *
   1081 	 * array( // list of selectors (each separated by a comma), i.e. 'img, p, div'
   1082 	 *   array( // list of combinator selectors, i.e. 'img > p > div'
   1083 	 *     array( // selector element
   1084 	 *       [0], // (string) The element tag
   1085 	 *       [1], // (string) The element id
   1086 	 *       [2], // (array<string>) The element classes
   1087 	 *       [3], // (array<array<string>>) The list of attributes, each
   1088 	 *            // with four elements: name, expression, value, inverted
   1089 	 *       [4]  // (string) The selector combinator (' ' | '>' | '+' | '~')
   1090 	 *     )
   1091 	 *   )
   1092 	 * )
   1093 	 * ```
   1094 	 *
   1095 	 * @link https://www.w3.org/TR/selectors/#compound Compound selector
   1096 	 */
   1097 	protected function parse_selector($selector_string)
   1098 	{
   1099 		global $debug_object;
   1100 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
   1101 
   1102 		/**
   1103 		 * Pattern of CSS selectors, modified from mootools (https://mootools.net/)
   1104 		 *
   1105 		 * Paperg: Add the colon to the attribute, so that it properly finds
   1106 		 * <tag attr:ibute="something" > like google does.
   1107 		 *
   1108 		 * Note: if you try to look at this attribute, you MUST use getAttribute
   1109 		 * since $dom->x:y will fail the php syntax check.
   1110 		 *
   1111 		 * Notice the \[ starting the attribute? and the @? following? This
   1112 		 * implies that an attribute can begin with an @ sign that is not
   1113 		 * captured. This implies that an html attribute specifier may start
   1114 		 * with an @ sign that is NOT captured by the expression. Farther study
   1115 		 * is required to determine of this should be documented or removed.
   1116 		 *
   1117 		 * Matches selectors in this order:
   1118 		 *
   1119 		 * [0] - full match
   1120 		 *
   1121 		 * [1] - tag name
   1122 		 *     ([\w:\*-]*)
   1123 		 *     Matches the tag name consisting of zero or more words, colons,
   1124 		 *     asterisks and hyphens.
   1125 		 *
   1126 		 * [2] - id name
   1127 		 *     (?:\#([\w-]+))
   1128 		 *     Optionally matches a id name, consisting of an "#" followed by
   1129 		 *     the id name (one or more words and hyphens).
   1130 		 *
   1131 		 * [3] - class names (including dots)
   1132 		 *     (?:\.([\w\.-]+))?
   1133 		 *     Optionally matches a list of classs, consisting of an "."
   1134 		 *     followed by the class name (one or more words and hyphens)
   1135 		 *     where multiple classes can be chained (i.e. ".foo.bar.baz")
   1136 		 *
   1137 		 * [4] - attributes
   1138 		 *     ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?
   1139 		 *     Optionally matches the attributes list
   1140 		 *
   1141 		 * [5] - separator
   1142 		 *     ([\/, >+~]+)
   1143 		 *     Matches the selector list separator
   1144 		 */
   1145 		// phpcs:ignore Generic.Files.LineLength
   1146 		$pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";
   1147 
   1148 		preg_match_all(
   1149 			$pattern,
   1150 			trim($selector_string) . ' ', // Add final ' ' as pseudo separator
   1151 			$matches,
   1152 			PREG_SET_ORDER
   1153 		);
   1154 
   1155 		if (is_object($debug_object)) {
   1156 			$debug_object->debug_log(2, 'Matches Array: ', $matches);
   1157 		}
   1158 
   1159 		$selectors = array();
   1160 		$result = array();
   1161 
   1162 		foreach ($matches as $m) {
   1163 			$m[0] = trim($m[0]);
   1164 
   1165 			// Skip NoOps
   1166 			if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; }
   1167 
   1168 			// Convert to lowercase
   1169 			if ($this->dom->lowercase) {
   1170 				$m[1] = strtolower($m[1]);
   1171 			}
   1172 
   1173 			// Extract classes
   1174 			if ($m[3] !== '') { $m[3] = explode('.', $m[3]); }
   1175 
   1176 			/* Extract attributes (pattern based on the pattern above!)
   1177 
   1178 			 * [0] - full match
   1179 			 * [1] - attribute name
   1180 			 * [2] - attribute expression
   1181 			 * [3] - attribute value
   1182 			 * [4] - case sensitivity
   1183 			 *
   1184 			 * Note: Attributes can be negated with a "!" prefix to their name
   1185 			 */
   1186 			if($m[4] !== '') {
   1187 				preg_match_all(
   1188 					"/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is",
   1189 					trim($m[4]),
   1190 					$attributes,
   1191 					PREG_SET_ORDER
   1192 				);
   1193 
   1194 				// Replace element by array
   1195 				$m[4] = array();
   1196 
   1197 				foreach($attributes as $att) {
   1198 					// Skip empty matches
   1199 					if(trim($att[0]) === '') { continue; }
   1200 
   1201 					$inverted = (isset($att[1][0]) && $att[1][0] === '!');
   1202 					$m[4][] = array(
   1203 						$inverted ? substr($att[1], 1) : $att[1], // Name
   1204 						(isset($att[2])) ? $att[2] : '', // Expression
   1205 						(isset($att[3])) ? $att[3] : '', // Value
   1206 						$inverted, // Inverted Flag
   1207 						(isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
   1208 					);
   1209 				}
   1210 			}
   1211 
   1212 			// Sanitize Separator
   1213 			if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator
   1214 				$m[5] = ' ';
   1215 			} else { // Other Separator
   1216 				$m[5] = trim($m[5]);
   1217 			}
   1218 
   1219 			// Clear Separator if it's a Selector List
   1220 			if ($is_list = ($m[5] === ',')) { $m[5] = ''; }
   1221 
   1222 			// Remove full match before adding to results
   1223 			array_shift($m);
   1224 			$result[] = $m;
   1225 
   1226 			if ($is_list) { // Selector List
   1227 				$selectors[] = $result;
   1228 				$result = array();
   1229 			}
   1230 		}
   1231 
   1232 		if (count($result) > 0) { $selectors[] = $result; }
   1233 		return $selectors;
   1234 	}
   1235 
   1236 	function __get($name)
   1237 	{
   1238 		if (isset($this->attr[$name])) {
   1239 			return $this->convert_text($this->attr[$name]);
   1240 		}
   1241 		switch ($name) {
   1242 			case 'outertext': return $this->outertext();
   1243 			case 'innertext': return $this->innertext();
   1244 			case 'plaintext': return $this->text();
   1245 			case 'xmltext': return $this->xmltext();
   1246 			default: return array_key_exists($name, $this->attr);
   1247 		}
   1248 	}
   1249 
   1250 	function __set($name, $value)
   1251 	{
   1252 		global $debug_object;
   1253 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
   1254 
   1255 		switch ($name) {
   1256 			case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
   1257 			case 'innertext':
   1258 				if (isset($this->_[HDOM_INFO_TEXT])) {
   1259 					return $this->_[HDOM_INFO_TEXT] = $value;
   1260 				}
   1261 				return $this->_[HDOM_INFO_INNER] = $value;
   1262 		}
   1263 
   1264 		if (!isset($this->attr[$name])) {
   1265 			$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
   1266 			$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
   1267 		}
   1268 
   1269 		$this->attr[$name] = $value;
   1270 	}
   1271 
   1272 	function __isset($name)
   1273 	{
   1274 		switch ($name) {
   1275 			case 'outertext': return true;
   1276 			case 'innertext': return true;
   1277 			case 'plaintext': return true;
   1278 		}
   1279 		//no value attr: nowrap, checked selected...
   1280 		return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
   1281 	}
   1282 
   1283 	function __unset($name)
   1284 	{
   1285 		if (isset($this->attr[$name])) { unset($this->attr[$name]); }
   1286 	}
   1287 
   1288 	// PaperG - Function to convert the text from one character set to another
   1289 	// if the two sets are not the same.
   1290 	function convert_text($text)
   1291 	{
   1292 		global $debug_object;
   1293 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
   1294 
   1295 		$converted_text = $text;
   1296 
   1297 		$sourceCharset = '';
   1298 		$targetCharset = '';
   1299 
   1300 		if ($this->dom) {
   1301 			$sourceCharset = strtoupper($this->dom->_charset);
   1302 			$targetCharset = strtoupper($this->dom->_target_charset);
   1303 		}
   1304 
   1305 		if (is_object($debug_object)) {
   1306 			$debug_object->debug_log(3,
   1307 				'source charset: '
   1308 				. $sourceCharset
   1309 				. ' target charaset: '
   1310 				. $targetCharset
   1311 			);
   1312 		}
   1313 
   1314 		if (!empty($sourceCharset)
   1315 			&& !empty($targetCharset)
   1316 			&& (strcasecmp($sourceCharset, $targetCharset) != 0)) {
   1317 			// Check if the reported encoding could have been incorrect and the text is actually already UTF-8
   1318 			if ((strcasecmp($targetCharset, 'UTF-8') == 0)
   1319 				&& ($this->is_utf8($text))) {
   1320 				$converted_text = $text;
   1321 			} else {
   1322 				$converted_text = iconv($sourceCharset, $targetCharset, $text);
   1323 			}
   1324 		}
   1325 
   1326 		// Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
   1327 		if ($targetCharset === 'UTF-8') {
   1328 			if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
   1329 				$converted_text = substr($converted_text, 3);
   1330 			}
   1331 
   1332 			if (substr($converted_text, -3) === "\xef\xbb\xbf") {
   1333 				$converted_text = substr($converted_text, 0, -3);
   1334 			}
   1335 		}
   1336 
   1337 		return $converted_text;
   1338 	}
   1339 
   1340 	/**
   1341 	* Returns true if $string is valid UTF-8 and false otherwise.
   1342 	*
   1343 	* @param mixed $str String to be tested
   1344 	* @return boolean
   1345 	*/
   1346 	static function is_utf8($str)
   1347 	{
   1348 		$c = 0; $b = 0;
   1349 		$bits = 0;
   1350 		$len = strlen($str);
   1351 		for($i = 0; $i < $len; $i++) {
   1352 			$c = ord($str[$i]);
   1353 			if($c > 128) {
   1354 				if(($c >= 254)) { return false; }
   1355 				elseif($c >= 252) { $bits = 6; }
   1356 				elseif($c >= 248) { $bits = 5; }
   1357 				elseif($c >= 240) { $bits = 4; }
   1358 				elseif($c >= 224) { $bits = 3; }
   1359 				elseif($c >= 192) { $bits = 2; }
   1360 				else { return false; }
   1361 				if(($i + $bits) > $len) { return false; }
   1362 				while($bits > 1) {
   1363 					$i++;
   1364 					$b = ord($str[$i]);
   1365 					if($b < 128 || $b > 191) { return false; }
   1366 					$bits--;
   1367 				}
   1368 			}
   1369 		}
   1370 		return true;
   1371 	}
   1372 
   1373 	/**
   1374 	 * Function to try a few tricks to determine the displayed size of an img on
   1375 	 * the page. NOTE: This will ONLY work on an IMG tag. Returns FALSE on all
   1376 	 * other tag types.
   1377 	 *
   1378 	 * @author John Schlick
   1379 	 * @version April 19 2012
   1380 	 * @return array an array containing the 'height' and 'width' of the image
   1381 	 * on the page or -1 if we can't figure it out.
   1382 	 */
   1383 	function get_display_size()
   1384 	{
   1385 		global $debug_object;
   1386 
   1387 		$width = -1;
   1388 		$height = -1;
   1389 
   1390 		if ($this->tag !== 'img') {
   1391 			return false;
   1392 		}
   1393 
   1394 		// See if there is aheight or width attribute in the tag itself.
   1395 		if (isset($this->attr['width'])) {
   1396 			$width = $this->attr['width'];
   1397 		}
   1398 
   1399 		if (isset($this->attr['height'])) {
   1400 			$height = $this->attr['height'];
   1401 		}
   1402 
   1403 		// Now look for an inline style.
   1404 		if (isset($this->attr['style'])) {
   1405 			// Thanks to user gnarf from stackoverflow for this regular expression.
   1406 			$attributes = array();
   1407 
   1408 			preg_match_all(
   1409 				'/([\w-]+)\s*:\s*([^;]+)\s*;?/',
   1410 				$this->attr['style'],
   1411 				$matches,
   1412 				PREG_SET_ORDER
   1413 			);
   1414 
   1415 			foreach ($matches as $match) {
   1416 				$attributes[$match[1]] = $match[2];
   1417 			}
   1418 
   1419 			// If there is a width in the style attributes:
   1420 			if (isset($attributes['width']) && $width == -1) {
   1421 				// check that the last two characters are px (pixels)
   1422 				if (strtolower(substr($attributes['width'], -2)) === 'px') {
   1423 					$proposed_width = substr($attributes['width'], 0, -2);
   1424 					// Now make sure that it's an integer and not something stupid.
   1425 					if (filter_var($proposed_width, FILTER_VALIDATE_INT)) {
   1426 						$width = $proposed_width;
   1427 					}
   1428 				}
   1429 			}
   1430 
   1431 			// If there is a width in the style attributes:
   1432 			if (isset($attributes['height']) && $height == -1) {
   1433 				// check that the last two characters are px (pixels)
   1434 				if (strtolower(substr($attributes['height'], -2)) == 'px') {
   1435 					$proposed_height = substr($attributes['height'], 0, -2);
   1436 					// Now make sure that it's an integer and not something stupid.
   1437 					if (filter_var($proposed_height, FILTER_VALIDATE_INT)) {
   1438 						$height = $proposed_height;
   1439 					}
   1440 				}
   1441 			}
   1442 
   1443 		}
   1444 
   1445 		// Future enhancement:
   1446 		// Look in the tag to see if there is a class or id specified that has
   1447 		// a height or width attribute to it.
   1448 
   1449 		// Far future enhancement
   1450 		// Look at all the parent tags of this image to see if they specify a
   1451 		// class or id that has an img selector that specifies a height or width
   1452 		// Note that in this case, the class or id will have the img subselector
   1453 		// for it to apply to the image.
   1454 
   1455 		// ridiculously far future development
   1456 		// If the class or id is specified in a SEPARATE css file thats not on
   1457 		// the page, go get it and do what we were just doing for the ones on
   1458 		// the page.
   1459 
   1460 		$result = array(
   1461 			'height' => $height,
   1462 			'width' => $width
   1463 		);
   1464 
   1465 		return $result;
   1466 	}
   1467 
   1468 	// camel naming conventions
   1469 	function getAllAttributes()
   1470 	{
   1471 		return $this->attr;
   1472 	}
   1473 
   1474 	function getAttribute($name)
   1475 	{
   1476 		return $this->__get($name);
   1477 	}
   1478 
   1479 	function setAttribute($name, $value)
   1480 	{
   1481 		$this->__set($name, $value);
   1482 	}
   1483 
   1484 	function hasAttribute($name)
   1485 	{
   1486 		return $this->__isset($name);
   1487 	}
   1488 
   1489 	function removeAttribute($name)
   1490 	{
   1491 		$this->__set($name, null);
   1492 	}
   1493 
   1494 	function getElementById($id)
   1495 	{
   1496 		return $this->find("#$id", 0);
   1497 	}
   1498 
   1499 	function getElementsById($id, $idx = null)
   1500 	{
   1501 		return $this->find("#$id", $idx);
   1502 	}
   1503 
   1504 	function getElementByTagName($name)
   1505 	{
   1506 		return $this->find($name, 0);
   1507 	}
   1508 
   1509 	function getElementsByTagName($name, $idx = null)
   1510 	{
   1511 		return $this->find($name, $idx);
   1512 	}
   1513 
   1514 	function parentNode()
   1515 	{
   1516 		return $this->parent();
   1517 	}
   1518 
   1519 	function childNodes($idx = -1)
   1520 	{
   1521 		return $this->children($idx);
   1522 	}
   1523 
   1524 	function firstChild()
   1525 	{
   1526 		return $this->first_child();
   1527 	}
   1528 
   1529 	function lastChild()
   1530 	{
   1531 		return $this->last_child();
   1532 	}
   1533 
   1534 	function nextSibling()
   1535 	{
   1536 		return $this->next_sibling();
   1537 	}
   1538 
   1539 	function previousSibling()
   1540 	{
   1541 		return $this->prev_sibling();
   1542 	}
   1543 
   1544 	function hasChildNodes()
   1545 	{
   1546 		return $this->has_child();
   1547 	}
   1548 
   1549 	function nodeName()
   1550 	{
   1551 		return $this->tag;
   1552 	}
   1553 
   1554 	function appendChild($node)
   1555 	{
   1556 		$node->parent($this);
   1557 		return $node;
   1558 	}
   1559 
   1560 }
   1561 
   1562 /**
   1563  * simple html dom parser
   1564  *
   1565  * Paperg - in the find routine: allow us to specify that we want case
   1566  * insensitive testing of the value of the selector.
   1567  *
   1568  * Paperg - change $size from protected to public so we can easily access it
   1569  *
   1570  * Paperg - added ForceTagsClosed in the constructor which tells us whether we
   1571  * trust the html or not.  Default is to NOT trust it.
   1572  *
   1573  * @package PlaceLocalInclude
   1574  */
   1575 class d_simple_html_dom
   1576 {
   1577 	/**
   1578 	 * The root node of the document
   1579 	 *
   1580 	 * @var object
   1581 	 */
   1582 	public $root = null;
   1583 
   1584 	/**
   1585 	 * List of nodes in the current DOM
   1586 	 *
   1587 	 * @var array
   1588 	 */
   1589 	public $nodes = array();
   1590 
   1591 	/**
   1592 	 * Callback function to run for each element in the DOM.
   1593 	 *
   1594 	 * @var callable|null
   1595 	 */
   1596 	public $callback = null;
   1597 
   1598 	/**
   1599 	 * Indicates how tags and attributes are matched
   1600 	 *
   1601 	 * @var bool When set to **true** tags and attributes will be converted to
   1602 	 * lowercase before matching.
   1603 	 */
   1604 	public $lowercase = false;
   1605 
   1606 	/**
   1607 	 * Original document size
   1608 	 *
   1609 	 * Holds the original document size.
   1610 	 *
   1611 	 * @var int
   1612 	 */
   1613 	public $original_size;
   1614 
   1615 	/**
   1616 	 * Current document size
   1617 	 *
   1618 	 * Holds the current document size. The document size is determined by the
   1619 	 * string length of ({@see d_simple_html_dom::$doc}).
   1620 	 *
   1621 	 * _Note_: Using this variable is more efficient than calling `strlen($doc)`
   1622 	 *
   1623 	 * @var int
   1624 	 * */
   1625 	public $size;
   1626 
   1627 	/**
   1628 	 * Current position in the document
   1629 	 *
   1630 	 * @var int
   1631 	 */
   1632 	protected $pos;
   1633 
   1634 	/**
   1635 	 * The document
   1636 	 *
   1637 	 * @var string
   1638 	 */
   1639 	protected $doc;
   1640 
   1641 	/**
   1642 	 * Current character
   1643 	 *
   1644 	 * Holds the current character at position {@see d_simple_html_dom::$pos} in
   1645 	 * the document {@see d_simple_html_dom::$doc}
   1646 	 *
   1647 	 * _Note_: Using this variable is more efficient than calling
   1648 	 * `substr($doc, $pos, 1)`
   1649 	 *
   1650 	 * @var string
   1651 	 */
   1652 	protected $char;
   1653 
   1654 	protected $cursor;
   1655 
   1656 	/**
   1657 	 * Parent node of the next node detected by the parser
   1658 	 *
   1659 	 * @var object
   1660 	 */
   1661 	protected $parent;
   1662 	protected $noise = array();
   1663 
   1664 	/**
   1665 	 * Tokens considered blank in HTML
   1666 	 *
   1667 	 * @var string
   1668 	 */
   1669 	protected $token_blank = " \t\r\n";
   1670 
   1671 	/**
   1672 	 * Tokens to identify the equal sign for attributes, stopping either at the
   1673 	 * closing tag ("/" i.e. "<html />") or the end of an opening tag (">" i.e.
   1674 	 * "<html>")
   1675 	 *
   1676 	 * @var string
   1677 	 */
   1678 	protected $token_equal = ' =/>';
   1679 
   1680 	/**
   1681 	 * Tokens to identify the end of a tag name. A tag name either ends on the
   1682 	 * ending slash ("/" i.e. "<html/>") or whitespace ("\s\r\n\t")
   1683 	 *
   1684 	 * @var string
   1685 	 */
   1686 	protected $token_slash = " />\r\n\t";
   1687 
   1688 	/**
   1689 	 * Tokens to identify the end of an attribute
   1690 	 *
   1691 	 * @var string
   1692 	 */
   1693 	protected $token_attr = ' >';
   1694 
   1695 	// Note that this is referenced by a child node, and so it needs to be
   1696 	// public for that node to see this information.
   1697 	public $_charset = '';
   1698 	public $_target_charset = '';
   1699 
   1700 	/**
   1701 	 * Innertext for <br> elements
   1702 	 *
   1703 	 * @var string
   1704 	 */
   1705 	protected $default_br_text = '';
   1706 
   1707 	/**
   1708 	 * Suffix for <span> elements
   1709 	 *
   1710 	 * @var string
   1711 	 */
   1712 	public $default_span_text = '';
   1713 
   1714 	/**
   1715 	 * Defines a list of self-closing tags (Void elements) according to the HTML
   1716 	 * Specification
   1717 	 *
   1718 	 * _Remarks_:
   1719 	 * - Use `isset()` instead of `in_array()` on array elements to boost
   1720 	 * performance about 30%
   1721 	 * - Sort elements by name for better readability!
   1722 	 *
   1723 	 * @link https://www.w3.org/TR/html HTML Specification
   1724 	 * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
   1725 	 */
   1726 	protected $self_closing_tags = array(
   1727 		'area' => 1,
   1728 		'base' => 1,
   1729 		'br' => 1,
   1730 		'col' => 1,
   1731 		'embed' => 1,
   1732 		'hr' => 1,
   1733 		'img' => 1,
   1734 		'input' => 1,
   1735 		'link' => 1,
   1736 		'meta' => 1,
   1737 		'param' => 1,
   1738 		'source' => 1,
   1739 		'track' => 1,
   1740 		'wbr' => 1
   1741 	);
   1742 
   1743 	/**
   1744 	 * Defines a list of tags which - if closed - close all optional closing
   1745 	 * elements within if they haven't been closed yet. (So, an element where
   1746 	 * neither opening nor closing tag is omissible consistently closes every
   1747 	 * optional closing element within)
   1748 	 *
   1749 	 * _Remarks_:
   1750 	 * - Use `isset()` instead of `in_array()` on array elements to boost
   1751 	 * performance about 30%
   1752 	 * - Sort elements by name for better readability!
   1753 	 */
   1754 	protected $block_tags = array(
   1755 		'body' => 1,
   1756 		'div' => 1,
   1757 		'form' => 1,
   1758 		'root' => 1,
   1759 		'span' => 1,
   1760 		'table' => 1
   1761 	);
   1762 
   1763 	/**
   1764 	 * Defines elements whose end tag is omissible.
   1765 	 *
   1766 	 * * key = Name of an element whose end tag is omissible.
   1767 	 * * value = Names of elements whose end tag is omissible, that are closed
   1768 	 * by the current element.
   1769 	 *
   1770 	 * _Remarks_:
   1771 	 * - Use `isset()` instead of `in_array()` on array elements to boost
   1772 	 * performance about 30%
   1773 	 * - Sort elements by name for better readability!
   1774 	 *
   1775 	 * **Example**
   1776 	 *
   1777 	 * An `li` element’s end tag may be omitted if the `li` element is immediately
   1778 	 * followed by another `li` element. To do that, add following element to the
   1779 	 * array:
   1780 	 *
   1781 	 * ```php
   1782 	 * 'li' => array('li'),
   1783 	 * ```
   1784 	 *
   1785 	 * With this, the following two examples are considered equal. Note that the
   1786 	 * second example is missing the closing tags on `li` elements.
   1787 	 *
   1788 	 * ```html
   1789 	 * <ul><li>First Item</li><li>Second Item</li></ul>
   1790 	 * ```
   1791 	 *
   1792 	 * <ul><li>First Item</li><li>Second Item</li></ul>
   1793 	 *
   1794 	 * ```html
   1795 	 * <ul><li>First Item<li>Second Item</ul>
   1796 	 * ```
   1797 	 *
   1798 	 * <ul><li>First Item<li>Second Item</ul>
   1799 	 *
   1800 	 * @var array A two-dimensional array where the key is the name of an
   1801 	 * element whose end tag is omissible and the value is an array of elements
   1802 	 * whose end tag is omissible, that are closed by the current element.
   1803 	 *
   1804 	 * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
   1805 	 *
   1806 	 * @todo The implementation of optional closing tags doesn't work in all cases
	 * because it only considers elements that close other optional closing
   1808 	 * tags, not taking into account that some (non-blocking) tags should close
   1809 	 * these optional closing tags. For example, the end tag for "p" is omissible
   1810 	 * and can be closed by an "address" element, whose end tag is NOT omissible.
   1811 	 * Currently a "p" element without closing tag stops at the next "p" element
   1812 	 * or blocking tag, even if it contains other elements.
   1813 	 *
   1814 	 * @todo Known sourceforge issue #2977341
   1815 	 * B tags that are not closed cause us to return everything to the end of
   1816 	 * the document.
   1817 	 */
   1818 	protected $optional_closing_tags = array(
   1819 		// Not optional, see
   1820 		// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
   1821 		'b' => array('b' => 1),
   1822 		'dd' => array('dd' => 1, 'dt' => 1),
   1823 		// Not optional, see
   1824 		// https://www.w3.org/TR/html/grouping-content.html#the-dl-element
   1825 		'dl' => array('dd' => 1, 'dt' => 1),
   1826 		'dt' => array('dd' => 1, 'dt' => 1),
   1827 		'li' => array('li' => 1),
   1828 		'optgroup' => array('optgroup' => 1, 'option' => 1),
   1829 		'option' => array('optgroup' => 1, 'option' => 1),
   1830 		'p' => array('p' => 1),
   1831 		'rp' => array('rp' => 1, 'rt' => 1),
   1832 		'rt' => array('rp' => 1, 'rt' => 1),
   1833 		'td' => array('td' => 1, 'th' => 1),
   1834 		'th' => array('td' => 1, 'th' => 1),
   1835 		'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
   1836 	);
   1837 
   1838 	function __construct(
   1839 		$str = null,
   1840 		$lowercase = true,
   1841 		$forceTagsClosed = true,
   1842 		$target_charset = DEFAULT_TARGET_CHARSET,
   1843 		$stripRN = true,
   1844 		$defaultBRText = DEFAULT_BR_TEXT,
   1845 		$defaultSpanText = DEFAULT_SPAN_TEXT,
   1846 		$options = 0)
   1847 	{
   1848 		if ($str) {
   1849 			if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
   1850 				$this->load_file($str);
   1851 			} else {
   1852 				$this->load(
   1853 					$str,
   1854 					$lowercase,
   1855 					$stripRN,
   1856 					$defaultBRText,
   1857 					$defaultSpanText,
   1858 					$options
   1859 				);
   1860 			}
   1861 		}
   1862 		// Forcing tags to be closed implies that we don't trust the html, but
   1863 		// it can lead to parsing errors if we SHOULD trust the html.
   1864 		if (!$forceTagsClosed) {
   1865 			$this->optional_closing_array = array();
   1866 		}
   1867 
   1868 		$this->_target_charset = $target_charset;
   1869 	}
   1870 
   1871 	function __destruct()
   1872 	{
   1873 		$this->clear();
   1874 	}
   1875 
   1876 	// load html from string
   1877 	function load(
   1878 		$str,
   1879 		$lowercase = true,
   1880 		$stripRN = true,
   1881 		$defaultBRText = DEFAULT_BR_TEXT,
   1882 		$defaultSpanText = DEFAULT_SPAN_TEXT,
   1883 		$options = 0)
   1884 	{
   1885 		global $debug_object;
   1886 
   1887 		// prepare
   1888 		$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
   1889 
   1890 		// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
   1891 		// Script tags removal now preceeds style tag removal.
   1892 		// strip out <script> tags
   1893 		$this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
   1894 		$this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
   1895 
   1896 		// strip out the \r \n's if we are told to.
   1897 		if ($stripRN) {
   1898 			$this->doc = str_replace("\r", ' ', $this->doc);
   1899 			$this->doc = str_replace("\n", ' ', $this->doc);
   1900 
   1901 			// set the length of content since we have changed it.
   1902 			$this->size = strlen($this->doc);
   1903 		}
   1904 
   1905 		// strip out cdata
   1906 		$this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
   1907 		// strip out comments
   1908 		$this->remove_noise("'<!--(.*?)-->'is");
   1909 		// strip out <style> tags
   1910 		$this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
   1911 		$this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
   1912 		// strip out preformatted tags
   1913 		$this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
   1914 		// strip out server side scripts
   1915 		$this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
   1916 
   1917 		if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts
   1918 			$this->remove_noise("'(\{\w)(.*?)(\})'s", true);
   1919 		}
   1920 
   1921 		// parsing
   1922 		$this->parse();
   1923 		// end
   1924 		$this->root->_[HDOM_INFO_END] = $this->cursor;
   1925 		$this->parse_charset();
   1926 
   1927 		// make load function chainable
   1928 		return $this;
   1929 	}
   1930 
   1931 	// load html from file
   1932 	function load_file()
   1933 	{
   1934 		$args = func_get_args();
   1935 
   1936 		if(($doc = call_user_func_array('file_get_contents', $args)) !== false) {
   1937 			$this->load($doc, true);
   1938 		} else {
   1939 			return false;
   1940 		}
   1941 	}
   1942 
   1943 	/**
   1944 	 * Set the callback function
   1945 	 *
   1946 	 * @param callable $function_name Callback function to run for each element
   1947 	 * in the DOM.
   1948 	 * @return void
   1949 	 */
   1950 	function set_callback($function_name)
   1951 	{
   1952 		$this->callback = $function_name;
   1953 	}
   1954 
   1955 	/**
   1956 	 * Remove callback function
   1957 	 *
   1958 	 * @return void
   1959 	 */
   1960 	function remove_callback()
   1961 	{
   1962 		$this->callback = null;
   1963 	}
   1964 
   1965 	// save dom as string
   1966 	function save($filepath = '')
   1967 	{
   1968 		$ret = $this->root->innertext();
   1969 		if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); }
   1970 		return $ret;
   1971 	}
   1972 
   1973 	// find dom node by css selector
   1974 	// Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
   1975 	function find($selector, $idx = null, $lowercase = false)
   1976 	{
   1977 		return $this->root->find($selector, $idx, $lowercase);
   1978 	}
   1979 
   1980 	// clean up memory due to php5 circular references memory leak...
   1981 	function clear()
   1982 	{
   1983 		foreach ($this->nodes as $n) {
   1984 			$n->clear(); $n = null;
   1985 		}
   1986 
   1987 		// This add next line is documented in the sourceforge repository.
   1988 		// 2977248 as a fix for ongoing memory leaks that occur even with the
   1989 		// use of clear.
   1990 		if (isset($this->children)) {
   1991 			foreach ($this->children as $n) {
   1992 				$n->clear(); $n = null;
   1993 			}
   1994 		}
   1995 
   1996 		if (isset($this->parent)) {
   1997 			$this->parent->clear();
   1998 			unset($this->parent);
   1999 		}
   2000 
   2001 		if (isset($this->root)) {
   2002 			$this->root->clear();
   2003 			unset($this->root);
   2004 		}
   2005 
   2006 		unset($this->doc);
   2007 		unset($this->noise);
   2008 	}
   2009 
   2010 	function dump($show_attr = true)
   2011 	{
   2012 		$this->root->dump($show_attr);
   2013 	}
   2014 
   2015 	// prepare HTML data and init everything
   2016 	protected function prepare(
   2017 		$str, $lowercase = true,
   2018 		$defaultBRText = DEFAULT_BR_TEXT,
   2019 		$defaultSpanText = DEFAULT_SPAN_TEXT)
   2020 	{
   2021 		$this->clear();
   2022 
   2023 		$this->doc = trim($str);
   2024 		$this->size = strlen($this->doc);
   2025 		$this->original_size = $this->size; // original size of the html
   2026 		$this->pos = 0;
   2027 		$this->cursor = 1;
   2028 		$this->noise = array();
   2029 		$this->nodes = array();
   2030 		$this->lowercase = $lowercase;
   2031 		$this->default_br_text = $defaultBRText;
   2032 		$this->default_span_text = $defaultSpanText;
   2033 		$this->root = new d_simple_html_dom_node($this);
   2034 		$this->root->tag = 'root';
   2035 		$this->root->_[HDOM_INFO_BEGIN] = -1;
   2036 		$this->root->nodetype = HDOM_TYPE_ROOT;
   2037 		$this->parent = $this->root;
   2038 		if ($this->size > 0) { $this->char = $this->doc[0]; }
   2039 	}
   2040 
   2041 	/**
   2042 	 * Parse HTML content
   2043 	 *
   2044 	 * @return bool True on success
   2045 	 */
   2046 	protected function parse()
   2047 	{
   2048 		while (true) {
   2049 			// Read next tag if there is no text between current position and the
   2050 			// next opening tag.
   2051 			if (($s = $this->copy_until_char('<')) === '') {
   2052 				if($this->read_tag()) {
   2053 					continue;
   2054 				} else {
   2055 					return true;
   2056 				}
   2057 			}
   2058 
   2059 			// Add a text node for text between tags
   2060 			$node = new d_simple_html_dom_node($this);
   2061 			++$this->cursor;
   2062 			$node->_[HDOM_INFO_TEXT] = $s;
   2063 			$this->link_nodes($node, false);
   2064 		}
   2065 	}
   2066 
   2067 	// PAPERG - dkchou - added this to try to identify the character set of the
   2068 	// page we have just parsed so we know better how to spit it out later.
   2069 	// NOTE:  IF you provide a routine called
   2070 	// get_last_retrieve_url_contents_content_type which returns the
   2071 	// CURLINFO_CONTENT_TYPE from the last curl_exec
   2072 	// (or the content_type header from the last transfer), we will parse THAT,
   2073 	// and if a charset is specified, we will use it over any other mechanism.
   2074 	protected function parse_charset()
   2075 	{
   2076 		global $debug_object;
   2077 
   2078 		$charset = null;
   2079 
   2080 		if (function_exists('get_last_retrieve_url_contents_content_type')) {
   2081 			$contentTypeHeader = get_last_retrieve_url_contents_content_type();
   2082 			$success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
   2083 			if ($success) {
   2084 				$charset = $matches[1];
   2085 				if (is_object($debug_object)) {
   2086 					$debug_object->debug_log(2,
   2087 						'header content-type found charset of: '
   2088 						. $charset
   2089 					);
   2090 				}
   2091 			}
   2092 		}
   2093 
   2094 		if (empty($charset)) {
   2095 			$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
   2096 
   2097 			if (!empty($el)) {
   2098 				$fullvalue = $el->content;
   2099 				if (is_object($debug_object)) {
   2100 					$debug_object->debug_log(2,
   2101 						'meta content-type tag found'
   2102 						. $fullvalue
   2103 					);
   2104 				}
   2105 
   2106 				if (!empty($fullvalue)) {
   2107 					$success = preg_match(
   2108 						'/charset=(.+)/i',
   2109 						$fullvalue,
   2110 						$matches
   2111 					);
   2112 
   2113 					if ($success) {
   2114 						$charset = $matches[1];
   2115 					} else {
   2116 						// If there is a meta tag, and they don't specify the
   2117 						// character set, research says that it's typically
   2118 						// ISO-8859-1
   2119 						if (is_object($debug_object)) {
   2120 							$debug_object->debug_log(2,
   2121 								'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
   2122 							);
   2123 						}
   2124 
   2125 						$charset = 'ISO-8859-1';
   2126 					}
   2127 				}
   2128 			}
   2129 		}
   2130 
   2131 		// If we couldn't find a charset above, then lets try to detect one
   2132 		// based on the text we got...
   2133 		if (empty($charset)) {
   2134 			// Use this in case mb_detect_charset isn't installed/loaded on
   2135 			// this machine.
   2136 			$charset = false;
   2137 			if (function_exists('mb_detect_encoding')) {
   2138 				// Have php try to detect the encoding from the text given to us.
   2139 				$charset = mb_detect_encoding(
   2140 					$this->doc . 'ascii',
   2141 					$encoding_list = array( 'UTF-8', 'CP1252' )
   2142 				);
   2143 
   2144 				if (is_object($debug_object)) {
   2145 					$debug_object->debug_log(2, 'mb_detect found: ' . $charset);
   2146 				}
   2147 			}
   2148 
   2149 			// and if this doesn't work...  then we need to just wrongheadedly
   2150 			// assume it's UTF-8 so that we can move on - cause this will
   2151 			// usually give us most of what we need...
   2152 			if ($charset === false) {
   2153 				if (is_object($debug_object)) {
   2154 					$debug_object->debug_log(
   2155 						2,
   2156 						'since mb_detect failed - using default of utf-8'
   2157 					);
   2158 				}
   2159 
   2160 				$charset = 'UTF-8';
   2161 			}
   2162 		}
   2163 
   2164 		// Since CP1252 is a superset, if we get one of it's subsets, we want
   2165 		// it instead.
   2166 		if ((strtolower($charset) == strtolower('ISO-8859-1'))
   2167 			|| (strtolower($charset) == strtolower('Latin1'))
   2168 			|| (strtolower($charset) == strtolower('Latin-1'))) {
   2169 
   2170 			if (is_object($debug_object)) {
   2171 				$debug_object->debug_log(
   2172 					2,
   2173 					'replacing ' . $charset . ' with CP1252 as its a superset'
   2174 				);
   2175 			}
   2176 
   2177 			$charset = 'CP1252';
   2178 		}
   2179 
   2180 		if (is_object($debug_object)) {
   2181 			$debug_object->debug_log(1, 'EXIT - ' . $charset);
   2182 		}
   2183 
   2184 		return $this->_charset = $charset;
   2185 	}
   2186 
   2187 	/**
   2188 	 * Parse tag from current document position.
   2189 	 *
   2190 	 * @return bool True if a tag was found, false otherwise
   2191 	 */
   2192 	protected function read_tag()
   2193 	{
   2194 		// Set end position if no further tags found
   2195 		if ($this->char !== '<') {
   2196 			$this->root->_[HDOM_INFO_END] = $this->cursor;
   2197 			return false;
   2198 		}
   2199 
   2200 		$begin_tag_pos = $this->pos;
   2201 		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2202 
   2203 		// end tag
   2204 		if ($this->char === '/') {
   2205 			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2206 
   2207 			// Skip whitespace in end tags (i.e. in "</   html>")
   2208 			$this->skip($this->token_blank);
   2209 			$tag = $this->copy_until_char('>');
   2210 
   2211 			// Skip attributes in end tags
   2212 			if (($pos = strpos($tag, ' ')) !== false) {
   2213 				$tag = substr($tag, 0, $pos);
   2214 			}
   2215 
   2216 			$parent_lower = strtolower($this->parent->tag);
   2217 			$tag_lower = strtolower($tag);
   2218 
   2219 			// The end tag is supposed to close the parent tag. Handle situations
   2220 			// when it doesn't
   2221 			if ($parent_lower !== $tag_lower) {
   2222 				// Parent tag does not have to be closed necessarily (optional closing tag)
   2223 				// Current tag is a block tag, so it may close an ancestor
   2224 				if (isset($this->optional_closing_tags[$parent_lower])
   2225 					&& isset($this->block_tags[$tag_lower])) {
   2226 
   2227 					$this->parent->_[HDOM_INFO_END] = 0;
   2228 					$org_parent = $this->parent;
   2229 
   2230 					// Traverse ancestors to find a matching opening tag
   2231 					// Stop at root node
   2232 					while (($this->parent->parent)
   2233 						&& strtolower($this->parent->tag) !== $tag_lower
   2234 					){
   2235 						$this->parent = $this->parent->parent;
   2236 					}
   2237 
   2238 					// If we don't have a match add current tag as text node
   2239 					if (strtolower($this->parent->tag) !== $tag_lower) {
   2240 						$this->parent = $org_parent; // restore origonal parent
   2241 
   2242 						if ($this->parent->parent) {
   2243 							$this->parent = $this->parent->parent;
   2244 						}
   2245 
   2246 						$this->parent->_[HDOM_INFO_END] = $this->cursor;
   2247 						return $this->as_text_node($tag);
   2248 					}
   2249 				} elseif (($this->parent->parent)
   2250 					&& isset($this->block_tags[$tag_lower])
   2251 				) {
   2252 					// Grandparent exists and current tag is a block tag, so our
   2253 					// parent doesn't have an end tag
   2254 					$this->parent->_[HDOM_INFO_END] = 0; // No end tag
   2255 					$org_parent = $this->parent;
   2256 
   2257 					// Traverse ancestors to find a matching opening tag
   2258 					// Stop at root node
   2259 					while (($this->parent->parent)
   2260 						&& strtolower($this->parent->tag) !== $tag_lower
   2261 					) {
   2262 						$this->parent = $this->parent->parent;
   2263 					}
   2264 
   2265 					// If we don't have a match add current tag as text node
   2266 					if (strtolower($this->parent->tag) !== $tag_lower) {
   2267 						$this->parent = $org_parent; // restore origonal parent
   2268 						$this->parent->_[HDOM_INFO_END] = $this->cursor;
   2269 						return $this->as_text_node($tag);
   2270 					}
   2271 				} elseif (($this->parent->parent)
   2272 					&& strtolower($this->parent->parent->tag) === $tag_lower
   2273 				) { // Grandparent exists and current tag closes it
   2274 					$this->parent->_[HDOM_INFO_END] = 0;
   2275 					$this->parent = $this->parent->parent;
   2276 				} else { // Random tag, add as text node
   2277 					return $this->as_text_node($tag);
   2278 				}
   2279 			}
   2280 
   2281 			// Set end position of parent tag to current cursor position
   2282 			$this->parent->_[HDOM_INFO_END] = $this->cursor;
   2283 
   2284 			if ($this->parent->parent) {
   2285 				$this->parent = $this->parent->parent;
   2286 			}
   2287 
   2288 			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2289 			return true;
   2290 		}
   2291 
   2292 		// start tag
   2293 		$node = new d_simple_html_dom_node($this);
   2294 		$node->_[HDOM_INFO_BEGIN] = $this->cursor;
   2295 		++$this->cursor;
   2296 		$tag = $this->copy_until($this->token_slash); // Get tag name
   2297 		$node->tag_start = $begin_tag_pos;
   2298 
   2299 		// doctype, cdata & comments...
   2300 		// <!DOCTYPE html>
   2301 		// <![CDATA[ ... ]]>
   2302 		// <!-- Comment -->
   2303 		if (isset($tag[0]) && $tag[0] === '!') {
   2304 			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
   2305 
   2306 			if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
   2307 				$node->nodetype = HDOM_TYPE_COMMENT;
   2308 				$node->tag = 'comment';
   2309 			} else { // Could be doctype or CDATA but we don't care
   2310 				$node->nodetype = HDOM_TYPE_UNKNOWN;
   2311 				$node->tag = 'unknown';
   2312 			}
   2313 
   2314 			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
   2315 
   2316 			$this->link_nodes($node, true);
   2317 			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2318 			return true;
   2319 		}
   2320 
   2321 		// The start tag cannot contain another start tag, if so add as text
   2322 		// i.e. "<<html>"
   2323 		if ($pos = strpos($tag, '<') !== false) {
   2324 			$tag = '<' . substr($tag, 0, -1);
   2325 			$node->_[HDOM_INFO_TEXT] = $tag;
   2326 			$this->link_nodes($node, false);
   2327 			$this->char = $this->doc[--$this->pos]; // prev
   2328 			return true;
   2329 		}
   2330 
   2331 		// Handle invalid tag names (i.e. "<html#doc>")
   2332 		if (!preg_match('/^\w[\w:-]*$/', $tag)) {
   2333 			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
   2334 
   2335 			// Next char is the beginning of a new tag, don't touch it.
   2336 			if ($this->char === '<') {
   2337 				$this->link_nodes($node, false);
   2338 				return true;
   2339 			}
   2340 
   2341 			// Next char closes current tag, add and be done with it.
   2342 			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
   2343 			$this->link_nodes($node, false);
   2344 			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2345 			return true;
   2346 		}
   2347 
   2348 		// begin tag, add new node
   2349 		$node->nodetype = HDOM_TYPE_ELEMENT;
   2350 		$tag_lower = strtolower($tag);
   2351 		$node->tag = ($this->lowercase) ? $tag_lower : $tag;
   2352 
   2353 		// handle optional closing tags
   2354 		if (isset($this->optional_closing_tags[$tag_lower])) {
   2355 			// Traverse ancestors to close all optional closing tags
   2356 			while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
   2357 				$this->parent->_[HDOM_INFO_END] = 0;
   2358 				$this->parent = $this->parent->parent;
   2359 			}
   2360 			$node->parent = $this->parent;
   2361 		}
   2362 
   2363 		$guard = 0; // prevent infinity loop
   2364 
   2365 		// [0] Space between tag and first attribute
   2366 		$space = array($this->copy_skip($this->token_blank), '', '');
   2367 
   2368 		// attributes
   2369 		do {
   2370 			// Everything until the first equal sign should be the attribute name
   2371 			$name = $this->copy_until($this->token_equal);
   2372 
   2373 			if ($name === '' && $this->char !== null && $space[0] === '') {
   2374 				break;
   2375 			}
   2376 
   2377 			if ($guard === $this->pos) { // Escape infinite loop
   2378 				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2379 				continue;
   2380 			}
   2381 
   2382 			$guard = $this->pos;
   2383 
   2384 			// handle endless '<'
   2385 			// Out of bounds before the tag ended
   2386 			if ($this->pos >= $this->size - 1 && $this->char !== '>') {
   2387 				$node->nodetype = HDOM_TYPE_TEXT;
   2388 				$node->_[HDOM_INFO_END] = 0;
   2389 				$node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
   2390 				$node->tag = 'text';
   2391 				$this->link_nodes($node, false);
   2392 				return true;
   2393 			}
   2394 
   2395 			// handle mismatch '<'
   2396 			// Attributes cannot start after opening tag
   2397 			if ($this->doc[$this->pos - 1] == '<') {
   2398 				$node->nodetype = HDOM_TYPE_TEXT;
   2399 				$node->tag = 'text';
   2400 				$node->attr = array();
   2401 				$node->_[HDOM_INFO_END] = 0;
   2402 				$node->_[HDOM_INFO_TEXT] = substr(
   2403 					$this->doc,
   2404 					$begin_tag_pos,
   2405 					$this->pos - $begin_tag_pos - 1
   2406 				);
   2407 				$this->pos -= 2;
   2408 				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2409 				$this->link_nodes($node, false);
   2410 				return true;
   2411 			}
   2412 
   2413 			if ($name !== '/' && $name !== '') { // this is a attribute name
   2414 				// [1] Whitespace after attribute name
   2415 				$space[1] = $this->copy_skip($this->token_blank);
   2416 
   2417 				$name = $this->restore_noise($name); // might be a noisy name
   2418 
   2419 				if ($this->lowercase) { $name = strtolower($name); }
   2420 
   2421 				if ($this->char === '=') { // attribute with value
   2422 					$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2423 					$this->parse_attr($node, $name, $space); // get attribute value
   2424 				} else {
   2425 					//no value attr: nowrap, checked selected...
   2426 					$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
   2427 					$node->attr[$name] = true;
   2428 					if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
   2429 				}
   2430 
   2431 				$node->_[HDOM_INFO_SPACE][] = $space;
   2432 
   2433 				// prepare for next attribute
   2434 				$space = array(
   2435 					$this->copy_skip($this->token_blank),
   2436 					'',
   2437 					''
   2438 				);
   2439 			} else { // no more attributes
   2440 				break;
   2441 			}
   2442 		} while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
   2443 
   2444 		$this->link_nodes($node, true);
   2445 		$node->_[HDOM_INFO_ENDSPACE] = $space[0];
   2446 
   2447 		// handle empty tags (i.e. "<div/>")
   2448 		if ($this->copy_until_char('>') === '/') {
   2449 			$node->_[HDOM_INFO_ENDSPACE] .= '/';
   2450 			$node->_[HDOM_INFO_END] = 0;
   2451 		} else {
   2452 			// reset parent
   2453 			if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
   2454 				$this->parent = $node;
   2455 			}
   2456 		}
   2457 
   2458 		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2459 
   2460 		// If it's a BR tag, we need to set it's text to the default text.
   2461 		// This way when we see it in plaintext, we can generate formatting that the user wants.
   2462 		// since a br tag never has sub nodes, this works well.
   2463 		if ($node->tag === 'br') {
   2464 			$node->_[HDOM_INFO_INNER] = $this->default_br_text;
   2465 		}
   2466 
   2467 		return true;
   2468 	}
   2469 
   2470 	/**
   2471 	 * Parse attribute from current document position
   2472 	 *
   2473 	 * @param object $node Node for the attributes
   2474 	 * @param string $name Name of the current attribute
   2475 	 * @param array $space Array for spacing information
   2476 	 * @return void
   2477 	 */
   2478 	protected function parse_attr($node, $name, &$space)
   2479 	{
   2480 		// Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
   2481 		// If the attribute is already defined inside a tag, only pay attention
   2482 		// to the first one as opposed to the last one.
   2483 		// https://stackoverflow.com/a/26341866
   2484 		if (isset($node->attr[$name])) {
   2485 			return;
   2486 		}
   2487 
   2488 		// [2] Whitespace between "=" and the value
   2489 		$space[2] = $this->copy_skip($this->token_blank);
   2490 
   2491 		switch ($this->char) {
   2492 			case '"': // value is anything between double quotes
   2493 				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
   2494 				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2495 				$node->attr[$name] = $this->restore_noise($this->copy_until_char('"'));
   2496 				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2497 				break;
   2498 			case '\'': // value is anything between single quotes
   2499 				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
   2500 				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2501 				$node->attr[$name] = $this->restore_noise($this->copy_until_char('\''));
   2502 				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2503 				break;
   2504 			default: // value is anything until the first space or end tag
   2505 				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
   2506 				$node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
   2507 		}
   2508 		// PaperG: Attributes should not have \r or \n in them, that counts as
   2509 		// html whitespace.
   2510 		$node->attr[$name] = str_replace("\r", '', $node->attr[$name]);
   2511 		$node->attr[$name] = str_replace("\n", '', $node->attr[$name]);
   2512 		// PaperG: If this is a "class" selector, lets get rid of the preceeding
   2513 		// and trailing space since some people leave it in the multi class case.
   2514 		if ($name === 'class') {
   2515 			$node->attr[$name] = trim($node->attr[$name]);
   2516 		}
   2517 	}
   2518 
   2519 	/**
   2520 	 * Link node to parent node
   2521 	 *
   2522 	 * @param object $node Node to link to parent
   2523 	 * @param bool $is_child True if the node is a child of parent
   2524 	 * @return void
   2525 	 */
   2526 	// link node's parent
   2527 	protected function link_nodes(&$node, $is_child)
   2528 	{
   2529 		$node->parent = $this->parent;
   2530 		$this->parent->nodes[] = $node;
   2531 		if ($is_child) {
   2532 			$this->parent->children[] = $node;
   2533 		}
   2534 	}
   2535 
   2536 	/**
   2537 	 * Add tag as text node to current node
   2538 	 *
   2539 	 * @param string $tag Tag name
   2540 	 * @return bool True on success
   2541 	 */
   2542 	protected function as_text_node($tag)
   2543 	{
   2544 		$node = new d_simple_html_dom_node($this);
   2545 		++$this->cursor;
   2546 		$node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
   2547 		$this->link_nodes($node, false);
   2548 		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2549 		return true;
   2550 	}
   2551 
   2552 	/**
   2553 	 * Seek from the current document position to the first occurrence of a
   2554 	 * character not defined by the provided string. Update the current document
   2555 	 * position to the new position.
   2556 	 *
   2557 	 * @param string $chars A string containing every allowed character.
   2558 	 * @return void
   2559 	 */
   2560 	protected function skip($chars)
   2561 	{
   2562 		$this->pos += strspn($this->doc, $chars, $this->pos);
   2563 		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2564 	}
   2565 
   2566 	/**
   2567 	 * Copy substring from the current document position to the first occurrence
   2568 	 * of a character not defined by the provided string.
   2569 	 *
   2570 	 * @param string $chars A string containing every allowed character.
   2571 	 * @return string Substring from the current document position to the first
   2572 	 * occurrence of a character not defined by the provided string.
   2573 	 */
   2574 	protected function copy_skip($chars)
   2575 	{
   2576 		$pos = $this->pos;
   2577 		$len = strspn($this->doc, $chars, $pos);
   2578 		$this->pos += $len;
   2579 		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2580 		if ($len === 0) { return ''; }
   2581 		return substr($this->doc, $pos, $len);
   2582 	}
   2583 
   2584 	/**
   2585 	 * Copy substring from the current document position to the first occurrence
   2586 	 * of any of the provided characters.
   2587 	 *
   2588 	 * @param string $chars A string containing every character to stop at.
   2589 	 * @return string Substring from the current document position to the first
   2590 	 * occurrence of any of the provided characters.
   2591 	 */
   2592 	protected function copy_until($chars)
   2593 	{
   2594 		$pos = $this->pos;
   2595 		$len = strcspn($this->doc, $chars, $pos);
   2596 		$this->pos += $len;
   2597 		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
   2598 		return substr($this->doc, $pos, $len);
   2599 	}
   2600 
   2601 	/**
   2602 	 * Copy substring from the current document position to the first occurrence
   2603 	 * of the provided string.
   2604 	 *
   2605 	 * @param string $char The string to stop at.
   2606 	 * @return string Substring from the current document position to the first
   2607 	 * occurrence of the provided string.
   2608 	 */
   2609 	protected function copy_until_char($char)
   2610 	{
   2611 		if ($this->char === null) { return ''; }
   2612 
   2613 		if (($pos = strpos($this->doc, $char, $this->pos)) === false) {
   2614 			$ret = substr($this->doc, $this->pos, $this->size - $this->pos);
   2615 			$this->char = null;
   2616 			$this->pos = $this->size;
   2617 			return $ret;
   2618 		}
   2619 
   2620 		if ($pos === $this->pos) { return ''; }
   2621 
   2622 		$pos_old = $this->pos;
   2623 		$this->char = $this->doc[$pos];
   2624 		$this->pos = $pos;
   2625 		return substr($this->doc, $pos_old, $pos - $pos_old);
   2626 	}
   2627 
   2628 	/**
   2629 	 * Remove noise from HTML content
   2630 	 *
   2631 	 * Noise is stored to {@see d_simple_html_dom::$noise}
   2632 	 *
   2633 	 * @param string $pattern The regex pattern used for finding noise
   2634 	 * @param bool $remove_tag True to remove the entire match. Default is false
   2635 	 * to only remove the captured data.
   2636 	 */
   2637 	protected function remove_noise($pattern, $remove_tag = false)
   2638 	{
   2639 		global $debug_object;
   2640 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
   2641 
   2642 		$count = preg_match_all(
   2643 			$pattern,
   2644 			$this->doc,
   2645 			$matches,
   2646 			PREG_SET_ORDER | PREG_OFFSET_CAPTURE
   2647 		);
   2648 
   2649 		for ($i = $count - 1; $i > -1; --$i) {
   2650 			$key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);
   2651 
   2652 			if (is_object($debug_object)) {
   2653 				$debug_object->debug_log(2, 'key is: ' . $key);
   2654 			}
   2655 
   2656 			$idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch
   2657 			$this->noise[$key] = $matches[$i][$idx][0];
   2658 			$this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
   2659 		}
   2660 
   2661 		// reset the length of content
   2662 		$this->size = strlen($this->doc);
   2663 
   2664 		if ($this->size > 0) {
   2665 			$this->char = $this->doc[0];
   2666 		}
   2667 	}
   2668 
   2669 	/**
   2670 	 * Restore noise to HTML content
   2671 	 *
   2672 	 * Noise is restored from {@see d_simple_html_dom::$noise}
   2673 	 *
   2674 	 * @param string $text A subset of HTML containing noise
   2675 	 * @return string The same content with noise restored
   2676 	 */
   2677 	function restore_noise($text)
   2678 	{
   2679 		global $debug_object;
   2680 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
   2681 
   2682 		while (($pos = strpos($text, '___noise___')) !== false) {
   2683 			// Sometimes there is a broken piece of markup, and we don't GET the
   2684 			// pos+11 etc... token which indicates a problem outside of us...
   2685 
   2686 			// todo: "___noise___1000" (or any number with four or more digits)
   2687 			// in the DOM causes an infinite loop which could be utilized by
   2688 			// malicious software
   2689 			if (strlen($text) > $pos + 15) {
   2690 				$key = '___noise___'
   2691 				. $text[$pos + 11]
   2692 				. $text[$pos + 12]
   2693 				. $text[$pos + 13]
   2694 				. $text[$pos + 14]
   2695 				. $text[$pos + 15];
   2696 
   2697 				if (is_object($debug_object)) {
   2698 					$debug_object->debug_log(2, 'located key of: ' . $key);
   2699 				}
   2700 
   2701 				if (isset($this->noise[$key])) {
   2702 					$text = substr($text, 0, $pos)
   2703 					. $this->noise[$key]
   2704 					. substr($text, $pos + 16);
   2705 				} else {
   2706 					// do this to prevent an infinite loop.
   2707 					$text = substr($text, 0, $pos)
   2708 					. 'UNDEFINED NOISE FOR KEY: '
   2709 					. $key
   2710 					. substr($text, $pos + 16);
   2711 				}
   2712 			} else {
   2713 				// There is no valid key being given back to us... We must get
   2714 				// rid of the ___noise___ or we will have a problem.
   2715 				$text = substr($text, 0, $pos)
   2716 				. 'NO NUMERIC NOISE KEY'
   2717 				. substr($text, $pos + 11);
   2718 			}
   2719 		}
   2720 		return $text;
   2721 	}
   2722 
   2723 	// Sometimes we NEED one of the noise elements.
   2724 	function search_noise($text)
   2725 	{
   2726 		global $debug_object;
   2727 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
   2728 
   2729 		foreach($this->noise as $noiseElement) {
   2730 			if (strpos($noiseElement, $text) !== false) {
   2731 				return $noiseElement;
   2732 			}
   2733 		}
   2734 	}
   2735 
   2736 	function __toString()
   2737 	{
   2738 		return $this->root->innertext();
   2739 	}
   2740 
   2741 	function __get($name)
   2742 	{
   2743 		switch ($name) {
   2744 			case 'outertext':
   2745 				return $this->root->innertext();
   2746 			case 'innertext':
   2747 				return $this->root->innertext();
   2748 			case 'plaintext':
   2749 				return $this->root->text();
   2750 			case 'charset':
   2751 				return $this->_charset;
   2752 			case 'target_charset':
   2753 				return $this->_target_charset;
   2754 		}
   2755 	}
   2756 
   2757 	// camel naming conventions
   2758 	function childNodes($idx = -1)
   2759 	{
   2760 		return $this->root->childNodes($idx);
   2761 	}
   2762 
   2763 	function firstChild()
   2764 	{
   2765 		return $this->root->first_child();
   2766 	}
   2767 
   2768 	function lastChild()
   2769 	{
   2770 		return $this->root->last_child();
   2771 	}
   2772 
   2773 	function createElement($name, $value = null)
   2774 	{
   2775 		return @str_get_html("<$name>$value</$name>")->first_child();
   2776 	}
   2777 
   2778 	function createTextNode($value)
   2779 	{
   2780 		return @end(str_get_html($value)->nodes);
   2781 	}
   2782 
   2783 	function getElementById($id)
   2784 	{
   2785 		return $this->find("#$id", 0);
   2786 	}
   2787 
   2788 	function getElementsById($id, $idx = null)
   2789 	{
   2790 		return $this->find("#$id", $idx);
   2791 	}
   2792 
   2793 	function getElementByTagName($name)
   2794 	{
   2795 		return $this->find($name, 0);
   2796 	}
   2797 
   2798 	function getElementsByTagName($name, $idx = -1)
   2799 	{
   2800 		return $this->find($name, $idx);
   2801 	}
   2802 
   2803 	function loadFile()
   2804 	{
   2805 		$args = func_get_args();
   2806 		$this->load_file($args);
   2807 	}
   2808 }