angelovcom.net

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

Sanitize.php (16310B)


      1 <?php
      2 /**
      3  * SimplePie
      4  *
      5  * A PHP-Based RSS and Atom Feed Framework.
      6  * Takes the hard work out of managing a complete RSS/Atom solution.
      7  *
      8  * Copyright (c) 2004-2016, Ryan Parman, Sam Sneddon, Ryan McCue, and contributors
      9  * All rights reserved.
     10  *
     11  * Redistribution and use in source and binary forms, with or without modification, are
     12  * permitted provided that the following conditions are met:
     13  *
     14  * 	* Redistributions of source code must retain the above copyright notice, this list of
     15  * 	  conditions and the following disclaimer.
     16  *
     17  * 	* Redistributions in binary form must reproduce the above copyright notice, this list
     18  * 	  of conditions and the following disclaimer in the documentation and/or other materials
     19  * 	  provided with the distribution.
     20  *
     21  * 	* Neither the name of the SimplePie Team nor the names of its contributors may be used
     22  * 	  to endorse or promote products derived from this software without specific prior
     23  * 	  written permission.
     24  *
     25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
     26  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
     27  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
     28  * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     29  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
     32  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     33  * POSSIBILITY OF SUCH DAMAGE.
     34  *
     35  * @package SimplePie
     36  * @copyright 2004-2016 Ryan Parman, Sam Sneddon, Ryan McCue
     37  * @author Ryan Parman
     38  * @author Sam Sneddon
     39  * @author Ryan McCue
     40  * @link http://simplepie.org/ SimplePie
     41  * @license http://www.opensource.org/licenses/bsd-license.php BSD License
     42  */
     43 
     44 /**
     45  * Used for data cleanup and post-processing
     46  *
     47  *
     48  * This class can be overloaded with {@see SimplePie::set_sanitize_class()}
     49  *
     50  * @package SimplePie
     51  * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
     52  */
     53 class SimplePie_Sanitize
     54 {
     55 	// Private vars
     56 	var $base;
     57 
     58 	// Options
     59 	var $remove_div = true;
     60 	var $image_handler = '';
     61 	var $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');
     62 	var $encode_instead_of_strip = false;
     63 	var $strip_attributes = array('bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
     64 	var $add_attributes = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none'));
     65 	var $strip_comments = false;
     66 	var $output_encoding = 'UTF-8';
     67 	var $enable_cache = true;
     68 	var $cache_location = './cache';
     69 	var $cache_name_function = 'md5';
     70 	var $timeout = 10;
     71 	var $useragent = '';
     72 	var $force_fsockopen = false;
     73 	var $replace_url_attributes = null;
     74 
     75 	public function __construct()
     76 	{
     77 		// Set defaults
     78 		$this->set_url_replacements(null);
     79 	}
     80 
     81 	public function remove_div($enable = true)
     82 	{
     83 		$this->remove_div = (bool) $enable;
     84 	}
     85 
     86 	public function set_image_handler($page = false)
     87 	{
     88 		if ($page)
     89 		{
     90 			$this->image_handler = (string) $page;
     91 		}
     92 		else
     93 		{
     94 			$this->image_handler = false;
     95 		}
     96 	}
     97 
     98 	public function set_registry(SimplePie_Registry $registry)
     99 	{
    100 		$this->registry = $registry;
    101 	}
    102 
    103 	public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie_Cache')
    104 	{
    105 		if (isset($enable_cache))
    106 		{
    107 			$this->enable_cache = (bool) $enable_cache;
    108 		}
    109 
    110 		if ($cache_location)
    111 		{
    112 			$this->cache_location = (string) $cache_location;
    113 		}
    114 
    115 		if ($cache_name_function)
    116 		{
    117 			$this->cache_name_function = (string) $cache_name_function;
    118 		}
    119 	}
    120 
    121 	public function pass_file_data($file_class = 'SimplePie_File', $timeout = 10, $useragent = '', $force_fsockopen = false)
    122 	{
    123 		if ($timeout)
    124 		{
    125 			$this->timeout = (string) $timeout;
    126 		}
    127 
    128 		if ($useragent)
    129 		{
    130 			$this->useragent = (string) $useragent;
    131 		}
    132 
    133 		if ($force_fsockopen)
    134 		{
    135 			$this->force_fsockopen = (string) $force_fsockopen;
    136 		}
    137 	}
    138 
    139 	public function strip_htmltags($tags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'))
    140 	{
    141 		if ($tags)
    142 		{
    143 			if (is_array($tags))
    144 			{
    145 				$this->strip_htmltags = $tags;
    146 			}
    147 			else
    148 			{
    149 				$this->strip_htmltags = explode(',', $tags);
    150 			}
    151 		}
    152 		else
    153 		{
    154 			$this->strip_htmltags = false;
    155 		}
    156 	}
    157 
    158 	public function encode_instead_of_strip($encode = false)
    159 	{
    160 		$this->encode_instead_of_strip = (bool) $encode;
    161 	}
    162 
    163 	public function strip_attributes($attribs = array('bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
    164 	{
    165 		if ($attribs)
    166 		{
    167 			if (is_array($attribs))
    168 			{
    169 				$this->strip_attributes = $attribs;
    170 			}
    171 			else
    172 			{
    173 				$this->strip_attributes = explode(',', $attribs);
    174 			}
    175 		}
    176 		else
    177 		{
    178 			$this->strip_attributes = false;
    179 		}
    180 	}
    181 
    182 	public function add_attributes($attribs = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none')))
    183 	{
    184 		if ($attribs)
    185 		{
    186 			if (is_array($attribs))
    187 			{
    188 				$this->add_attributes = $attribs;
    189 			}
    190 			else
    191 			{
    192 				$this->add_attributes = explode(',', $attribs);
    193 			}
    194 		}
    195 		else
    196 		{
    197 			$this->add_attributes = false;
    198 		}
    199 	}
    200 
    201 	public function strip_comments($strip = false)
    202 	{
    203 		$this->strip_comments = (bool) $strip;
    204 	}
    205 
    206 	public function set_output_encoding($encoding = 'UTF-8')
    207 	{
    208 		$this->output_encoding = (string) $encoding;
    209 	}
    210 
    211 	/**
    212 	 * Set element/attribute key/value pairs of HTML attributes
    213 	 * containing URLs that need to be resolved relative to the feed
    214 	 *
    215 	 * Defaults to |a|@href, |area|@href, |blockquote|@cite, |del|@cite,
    216 	 * |form|@action, |img|@longdesc, |img|@src, |input|@src, |ins|@cite,
    217 	 * |q|@cite
    218 	 *
    219 	 * @since 1.0
    220 	 * @param array|null $element_attribute Element/attribute key/value pairs, null for default
    221 	 */
    222 	public function set_url_replacements($element_attribute = null)
    223 	{
    224 		if ($element_attribute === null)
    225 		{
    226 			$element_attribute = array(
    227 				'a' => 'href',
    228 				'area' => 'href',
    229 				'blockquote' => 'cite',
    230 				'del' => 'cite',
    231 				'form' => 'action',
    232 				'img' => array(
    233 					'longdesc',
    234 					'src'
    235 				),
    236 				'input' => 'src',
    237 				'ins' => 'cite',
    238 				'q' => 'cite'
    239 			);
    240 		}
    241 		$this->replace_url_attributes = (array) $element_attribute;
    242 	}
    243 
    244 	public function sanitize($data, $type, $base = '')
    245 	{
    246 		$data = trim($data);
    247 		if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI)
    248 		{
    249 			if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML)
    250 			{
    251 				if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data))
    252 				{
    253 					$type |= SIMPLEPIE_CONSTRUCT_HTML;
    254 				}
    255 				else
    256 				{
    257 					$type |= SIMPLEPIE_CONSTRUCT_TEXT;
    258 				}
    259 			}
    260 
    261 			if ($type & SIMPLEPIE_CONSTRUCT_BASE64)
    262 			{
    263 				$data = base64_decode($data);
    264 			}
    265 
    266 			if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML))
    267 			{
    268 
    269 				if (!class_exists('DOMDocument'))
    270 				{
    271 					throw new SimplePie_Exception('DOMDocument not found, unable to use sanitizer');
    272 				}
    273 				$document = new DOMDocument();
    274 				$document->encoding = 'UTF-8';
    275 
    276 				$data = $this->preprocess($data, $type);
    277 
    278 				set_error_handler(array('SimplePie_Misc', 'silence_errors'));
    279 				$document->loadHTML($data);
    280 				restore_error_handler();
    281 
    282 				$xpath = new DOMXPath($document);
    283 
    284 				// Strip comments
    285 				if ($this->strip_comments)
    286 				{
    287 					$comments = $xpath->query('//comment()');
    288 
    289 					foreach ($comments as $comment)
    290 					{
    291 						$comment->parentNode->removeChild($comment);
    292 					}
    293 				}
    294 
    295 				// Strip out HTML tags and attributes that might cause various security problems.
    296 				// Based on recommendations by Mark Pilgrim at:
    297 				// http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
    298 				if ($this->strip_htmltags)
    299 				{
    300 					foreach ($this->strip_htmltags as $tag)
    301 					{
    302 						$this->strip_tag($tag, $document, $xpath, $type);
    303 					}
    304 				}
    305 
    306 				if ($this->strip_attributes)
    307 				{
    308 					foreach ($this->strip_attributes as $attrib)
    309 					{
    310 						$this->strip_attr($attrib, $xpath);
    311 					}
    312 				}
    313 
    314 				if ($this->add_attributes)
    315 				{
    316 					foreach ($this->add_attributes as $tag => $valuePairs)
    317 					{
    318 						$this->add_attr($tag, $valuePairs, $document);
    319 					}
    320 				}
    321 
    322 				// Replace relative URLs
    323 				$this->base = $base;
    324 				foreach ($this->replace_url_attributes as $element => $attributes)
    325 				{
    326 					$this->replace_urls($document, $element, $attributes);
    327 				}
    328 
    329 				// If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
    330 				if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache)
    331 				{
    332 					$images = $document->getElementsByTagName('img');
    333 					foreach ($images as $img)
    334 					{
    335 						if ($img->hasAttribute('src'))
    336 						{
    337 							$image_url = call_user_func($this->cache_name_function, $img->getAttribute('src'));
    338 							$cache = $this->registry->call('Cache', 'get_handler', array($this->cache_location, $image_url, 'spi'));
    339 
    340 							if ($cache->load())
    341 							{
    342 								$img->setAttribute('src', $this->image_handler . $image_url);
    343 							}
    344 							else
    345 							{
    346 								$file = $this->registry->create('File', array($img->getAttribute('src'), $this->timeout, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent, $this->force_fsockopen));
    347 								$headers = $file->headers;
    348 
    349 								if ($file->success && ($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($file->status_code === 200 || $file->status_code > 206 && $file->status_code < 300)))
    350 								{
    351 									if ($cache->save(array('headers' => $file->headers, 'body' => $file->body)))
    352 									{
    353 										$img->setAttribute('src', $this->image_handler . $image_url);
    354 									}
    355 									else
    356 									{
    357 										trigger_error("$this->cache_location is not writable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
    358 									}
    359 								}
    360 							}
    361 						}
    362 					}
    363 				}
    364 
    365 				// Get content node
    366 				$div = $document->getElementsByTagName('body')->item(0)->firstChild;
    367 				// Finally, convert to a HTML string
    368 				$data = trim($document->saveHTML($div));
    369 
    370 				if ($this->remove_div)
    371 				{
    372 					$data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
    373 					$data = preg_replace('/<\/div>$/', '', $data);
    374 				}
    375 				else
    376 				{
    377 					$data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
    378 				}
    379 			}
    380 
    381 			if ($type & SIMPLEPIE_CONSTRUCT_IRI)
    382 			{
    383 				$absolute = $this->registry->call('Misc', 'absolutize_url', array($data, $base));
    384 				if ($absolute !== false)
    385 				{
    386 					$data = $absolute;
    387 				}
    388 			}
    389 
    390 			if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI))
    391 			{
    392 				$data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
    393 			}
    394 
    395 			if ($this->output_encoding !== 'UTF-8')
    396 			{
    397 				$data = $this->registry->call('Misc', 'change_encoding', array($data, 'UTF-8', $this->output_encoding));
    398 			}
    399 		}
    400 		return $data;
    401 	}
    402 
    403 	protected function preprocess($html, $type)
    404 	{
    405 		$ret = '';
    406 		$html = preg_replace('%</?(?:html|body)[^>]*?'.'>%is', '', $html);
    407 		if ($type & ~SIMPLEPIE_CONSTRUCT_XHTML)
    408 		{
    409 			// Atom XHTML constructs are wrapped with a div by default
    410 			// Note: No protection if $html contains a stray </div>!
    411 			$html = '<div>' . $html . '</div>';
    412 			$ret .= '<!DOCTYPE html>';
    413 			$content_type = 'text/html';
    414 		}
    415 		else
    416 		{
    417 			$ret .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
    418 			$content_type = 'application/xhtml+xml';
    419 		}
    420 
    421 		$ret .= '<html><head>';
    422 		$ret .= '<meta http-equiv="Content-Type" content="' . $content_type . '; charset=utf-8" />';
    423 		$ret .= '</head><body>' . $html . '</body></html>';
    424 		return $ret;
    425 	}
    426 
    427 	public function replace_urls($document, $tag, $attributes)
    428 	{
    429 		if (!is_array($attributes))
    430 		{
    431 			$attributes = array($attributes);
    432 		}
    433 
    434 		if (!is_array($this->strip_htmltags) || !in_array($tag, $this->strip_htmltags))
    435 		{
    436 			$elements = $document->getElementsByTagName($tag);
    437 			foreach ($elements as $element)
    438 			{
    439 				foreach ($attributes as $attribute)
    440 				{
    441 					if ($element->hasAttribute($attribute))
    442 					{
    443 						$value = $this->registry->call('Misc', 'absolutize_url', array($element->getAttribute($attribute), $this->base));
    444 						if ($value !== false)
    445 						{
    446 							$element->setAttribute($attribute, $value);
    447 						}
    448 					}
    449 				}
    450 			}
    451 		}
    452 	}
    453 
    454 	public function do_strip_htmltags($match)
    455 	{
    456 		if ($this->encode_instead_of_strip)
    457 		{
    458 			if (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
    459 			{
    460 				$match[1] = htmlspecialchars($match[1], ENT_COMPAT, 'UTF-8');
    461 				$match[2] = htmlspecialchars($match[2], ENT_COMPAT, 'UTF-8');
    462 				return "&lt;$match[1]$match[2]&gt;$match[3]&lt;/$match[1]&gt;";
    463 			}
    464 			else
    465 			{
    466 				return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
    467 			}
    468 		}
    469 		elseif (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
    470 		{
    471 			return $match[4];
    472 		}
    473 		else
    474 		{
    475 			return '';
    476 		}
    477 	}
    478 
    479 	protected function strip_tag($tag, $document, $xpath, $type)
    480 	{
    481 		$elements = $xpath->query('body//' . $tag);
    482 		if ($this->encode_instead_of_strip)
    483 		{
    484 			foreach ($elements as $element)
    485 			{
    486 				$fragment = $document->createDocumentFragment();
    487 
    488 				// For elements which aren't script or style, include the tag itself
    489 				if (!in_array($tag, array('script', 'style')))
    490 				{
    491 					$text = '<' . $tag;
    492 					if ($element->hasAttributes())
    493 					{
    494 						$attrs = array();
    495 						foreach ($element->attributes as $name => $attr)
    496 						{
    497 							$value = $attr->value;
    498 
    499 							// In XHTML, empty values should never exist, so we repeat the value
    500 							if (empty($value) && ($type & SIMPLEPIE_CONSTRUCT_XHTML))
    501 							{
    502 								$value = $name;
    503 							}
    504 							// For HTML, empty is fine
    505 							elseif (empty($value) && ($type & SIMPLEPIE_CONSTRUCT_HTML))
    506 							{
    507 								$attrs[] = $name;
    508 								continue;
    509 							}
    510 
    511 							// Standard attribute text
    512 							$attrs[] = $name . '="' . $attr->value . '"';
    513 						}
    514 						$text .= ' ' . implode(' ', $attrs);
    515 					}
    516 					$text .= '>';
    517 					$fragment->appendChild(new DOMText($text));
    518 				}
    519 
    520 				$number = $element->childNodes->length;
    521 				for ($i = $number; $i > 0; $i--)
    522 				{
    523 					$child = $element->childNodes->item(0);
    524 					$fragment->appendChild($child);
    525 				}
    526 
    527 				if (!in_array($tag, array('script', 'style')))
    528 				{
    529 					$fragment->appendChild(new DOMText('</' . $tag . '>'));
    530 				}
    531 
    532 				$element->parentNode->replaceChild($fragment, $element);
    533 			}
    534 
    535 			return;
    536 		}
    537 		elseif (in_array($tag, array('script', 'style')))
    538 		{
    539 			foreach ($elements as $element)
    540 			{
    541 				$element->parentNode->removeChild($element);
    542 			}
    543 
    544 			return;
    545 		}
    546 		else
    547 		{
    548 			foreach ($elements as $element)
    549 			{
    550 				$fragment = $document->createDocumentFragment();
    551 				$number = $element->childNodes->length;
    552 				for ($i = $number; $i > 0; $i--)
    553 				{
    554 					$child = $element->childNodes->item(0);
    555 					$fragment->appendChild($child);
    556 				}
    557 
    558 				$element->parentNode->replaceChild($fragment, $element);
    559 			}
    560 		}
    561 	}
    562 
    563 	protected function strip_attr($attrib, $xpath)
    564 	{
    565 		$elements = $xpath->query('//*[@' . $attrib . ']');
    566 
    567 		foreach ($elements as $element)
    568 		{
    569 			$element->removeAttribute($attrib);
    570 		}
    571 	}
    572 
    573 	protected function add_attr($tag, $valuePairs, $document)
    574 	{
    575 		$elements = $document->getElementsByTagName($tag);
    576 		foreach ($elements as $element)
    577 		{
    578 			foreach ($valuePairs as $attrib => $value)
    579 			{
    580 				$element->setAttribute($attrib, $value);
    581 			}
    582 		}
    583 	}
    584 }