2 /* Washtml, an HTML sanitizer.
4 * Copyright (c) 2007 Frederic Motte <fmotte@ubixis.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 /* Please send me your comments about this code if you have some, thanks, Fred. */
32 * Washtml takes untrusted HTML and returns a safe HTML string.
36 * $washer = new washtml($config);
37 * $washer->wash($html);
38 * It returns a sanitized version of the $html parameter, without the html and head tags.
39 * $html is a string containing the html code to wash.
40 * $config is an array containing options:
41 * $config['allow_remote'] is a boolean to allow links to remote images.
42 * $config['blocked_src'] string with image-src to be used for blocked remote images
43 * $config['show_washed'] is a boolean to include washed out attributes as x-washed
44 * $config['cid_map'] is an array where cid urls index urls to replace them.
45 * $config['charset'] is a string containing the charset of the HTML document if it is not defined in it.
46 * $washer->extlinks is a public boolean that is set to true if remote images were removed (e.g. to show a "display remote images" link).
50 * Only tags and attributes in the static lists $html_elements and $html_attributes
51 * are kept, inline styles are also filtered: all style identifiers matching
52 * /[a-z\-]/i are allowed. Values matching colors, sizes, /[a-z\-]/i and safe
53 * urls if allowed and cid urls if mapped are kept.
55 * BUGS: It MUST be safe !
57 * - urlencode URLs instead of htmlspecials
58 * - Check if a 3-byte UTF-8 first char can eat '">'
59 * - Update PCRE: CVE-2007-1659 - CVE-2007-1660 - CVE-2007-1661 - CVE-2007-1662
60 * CVE-2007-4766 - CVE-2007-4767 - CVE-2007-4768
61 * http://lists.debian.org/debian-security-announce/debian-security-announce-2007/msg00177.html
65 * - relative links, can be implemented by prefixing an absolute path, ask me
70 * - Don't alter data on a GET: '<img src="http://yourhost/mail?action=delete&uid=3267" />'
74 * - added $block_elements
75 * - changed $ignore_elements behaviour
76 * - added RFC2397 support
78 * - invalid HTML comments removal before parsing
/* Allowed HTML elements (default) */
public static $html_elements = array('a', 'abbr', 'acronym', 'address', 'area', 'b',
    'basefont', 'bdo', 'big', 'blockquote', 'br', 'caption', 'center',
    'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
    'dt', 'em', 'fieldset', 'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
    'ins', 'label', 'legend', 'li', 'map', 'menu', 'nobr', 'ol', 'p', 'pre', 'q',
    's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
    'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'wbr', 'img',
    // form elements
    'button', 'input', 'textarea', 'select', 'option', 'optgroup'
);

/* Ignore these HTML tags and their content */
public static $ignore_elements = array('script', 'applet', 'embed', 'object', 'style');

/* Allowed HTML attributes ('href', 'style', 'src' and 'background' are handled separately in wash_attribs()) */
public static $html_attribs = array('name', 'class', 'title', 'alt', 'width', 'height',
    'align', 'nowrap', 'col', 'row', 'id', 'rowspan', 'colspan', 'cellspacing',
    'cellpadding', 'valign', 'bgcolor', 'color', 'border', 'bordercolorlight',
    'bordercolordark', 'face', 'marginwidth', 'marginheight', 'axis',
    'abbr', 'char', 'charoff', 'clear', 'compact', 'coords', 'vspace', 'hspace',
    'cellborder', 'size', 'lang', 'dir',
    // attributes of form elements
    'type', 'rows', 'cols', 'disabled', 'readonly', 'checked', 'multiple', 'value'
);

/* Block elements which could be empty but cannot be returned in short form (<tag />) */
public static $block_elements = array('div', 'p', 'pre', 'blockquote', 'a', 'font', 'center',
    'table', 'ul', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'dl', 'strong', 'i', 'b', 'u');

/* State for linked objects in HTML: set to true when a remote reference was blocked */
public $extlinks = false;

/* Current settings */
private $config = array();

/* Registered callback functions for tags, keyed by tag name */
private $handlers = array();

/* Allowed HTML elements (names stored as array keys for isset() lookups) */
private $_html_elements = array();

/* HTML tags that are dropped together with their content (names stored as array keys) */
private $_ignore_elements = array();

/* Block elements which could be empty but cannot be returned in short form (<tag />) */
private $_block_elements = array();

/* Allowed HTML attributes (names stored as array keys for isset() lookups) */
private $_html_attribs = array();
/**
 * Constructor.
 *
 * The keys 'html_elements', 'html_attribs', 'ignore_elements' and
 * 'block_elements' of $p hold extra names that are merged with the static
 * default lists. All remaining keys are stored as configuration, with
 * defaults for 'show_washed' (true), 'allow_remote' (false) and
 * 'cid_map' (empty array).
 *
 * @param array $p Options
 */
public function __construct($p = array())
{
    $options = (array) $p;

    // Missing list keys fall back to an empty list, so a plain
    // "new washtml()" does not read undefined array indexes.
    $extra = $options + array(
        'html_elements'   => array(),
        'html_attribs'    => array(),
        'ignore_elements' => array(),
        'block_elements'  => array(),
    );

    // Names become array keys so later membership checks are isset() lookups.
    $this->_html_elements   = array_flip((array) $extra['html_elements']) + array_flip(self::$html_elements);
    $this->_html_attribs    = array_flip((array) $extra['html_attribs']) + array_flip(self::$html_attribs);
    $this->_ignore_elements = array_flip((array) $extra['ignore_elements']) + array_flip(self::$ignore_elements);
    $this->_block_elements  = array_flip((array) $extra['block_elements']) + array_flip(self::$block_elements);

    unset(
        $options['html_elements'],
        $options['html_attribs'],
        $options['ignore_elements'],
        $options['block_elements']
    );

    // Caller-supplied settings win over the defaults.
    $this->config = $options + array('show_washed' => true, 'allow_remote' => false, 'cid_map' => array());
}
/**
 * Register a callback function for a certain tag.
 *
 * When dumpHtml() meets an element with this tag name it calls the callback
 * with (tag name, washed attribute string, washed content, this washer)
 * and appends the returned string to the output.
 *
 * @param string   $tagName  Tag name; matched case-insensitively
 * @param callable $callback Handler for the tag
 */
public function add_callback($tagName, $callback)
{
    // dumpHtml() looks handlers up by the lower-cased tag name, so the key
    // is normalised here; otherwise e.g. 'IMG' would never match.
    $this->handlers[strtolower($tagName)] = $callback;
}
/**
 * Check CSS style.
 *
 * Splits the inline style on ';' and keeps only declarations whose property
 * name matches [a-z-]+. Each value is consumed token by token; url() tokens
 * are kept only if mapped through cid_map, remote and allowed, or data: URLs.
 *
 * @param string $style Raw content of a style attribute
 *
 * @return string Washed style declarations, or '' if nothing survived
 */
private function wash_style($style)
{
    $cid_map  = isset($this->config['cid_map']) ? (array) $this->config['cid_map'] : array();
    $base_url = isset($this->config['base_url']) ? $this->config['base_url'] : '';
    $s = '';

    foreach (explode(';', $style) as $declaration) {
        if (!preg_match('/^\s*([a-z\-]+)\s*:\s*(.*)\s*$/i', $declaration, $match)) {
            continue;
        }

        $cssid = $match[1];
        $str   = $match[2];
        $value = '';

        // Consume one whitelisted token per iteration. The original test was
        // sizeof($str) > 0, which is always true for a string on old PHP and a
        // TypeError on PHP 8; an explicit empty-string check is what was meant.
        while ((string) $str !== '' &&
            preg_match('/^(url\(\s*[\'"]?([^\'"\)]*)[\'"]?\s*\)'./*1,2*/
                '|rgb\(\s*[0-9]+\s*,\s*[0-9]+\s*,\s*[0-9]+\s*\)'.
                '|-?[0-9.]+\s*(em|ex|px|cm|mm|in|pt|pc|deg|rad|grad|ms|s|hz|khz|%)?'.
                '|#[0-9a-f]{3,6}|[a-z0-9", -]+'.
                ')\s*/i', $str, $token)) {
            if (!empty($token[2])) {
                // url(...) token: $token[2] is the URL argument
                $ref = $token[2];
                $src = null;
                if (!empty($cid_map[$ref])) {
                    $src = $cid_map[$ref];
                }
                else if (!empty($cid_map[$base_url . $ref])) {
                    $src = $cid_map[$base_url . $ref];
                }

                if ($src !== null) {
                    $value .= ' url(' . htmlspecialchars($src, ENT_QUOTES) . ')';
                }
                else if (preg_match('!^(https?:)?//[a-z0-9/._+-]+$!i', $ref, $url)) {
                    if (!empty($this->config['allow_remote'])) {
                        $value .= ' url(' . htmlspecialchars($url[0], ENT_QUOTES) . ')';
                    }
                    else {
                        // remember that a remote reference was dropped
                        $this->extlinks = true;
                    }
                }
                else if (preg_match('/^data:.+/i', $ref)) { // RFC2397
                    $value .= ' url(' . htmlspecialchars($ref, ENT_QUOTES) . ')';
                }
            }
            else if ($token[0] !== 'url' && $token[0] !== 'rgb') { //whitelist ?
                $value .= ' ' . $token[0];
            }

            $str = substr($str, strlen($token[0]));
        }

        // Skip declarations whose value was washed away completely
        if ($value !== '') {
            $s .= ($s ? ' ' : '') . $cssid . ':' . $value . ';';
        }
    }

    return $s;
}
/**
 * Take a node and return allowed attributes and check values.
 *
 * Whitelisted attributes and href values starting with http:, https:, ftp:,
 * mailto: or # are kept (escaped). style is filtered through wash_style().
 * background and img src are kept only when mapped through cid_map, remote
 * and allowed, or data: URLs. Attributes that match none of these branches
 * are collected and, if 'show_washed' is enabled, listed in an x-washed attribute.
 *
 * @param DOMNode $node Element whose attributes are checked
 *
 * @return string Attribute string, each attribute prefixed by a space
 */
private function wash_attribs($node)
{
    $cid_map  = isset($this->config['cid_map']) ? (array) $this->config['cid_map'] : array();
    $base_url = isset($this->config['base_url']) ? $this->config['base_url'] : '';
    $t        = '';
    $washed   = '';

    foreach ($node->attributes as $key => $plop) {
        $key   = strtolower($key);
        $value = $node->getAttribute($key);

        if (isset($this->_html_attribs[$key]) ||
            ($key == 'href' && preg_match('/^(http:|https:|ftp:|mailto:|#).+/i', $value))) {
            $t .= ' ' . $key . '="' . htmlspecialchars($value, ENT_QUOTES) . '"';
        }
        else if ($key == 'style' && ($style = $this->wash_style($value))) {
            // the washed style may contain double quotes; pick the other quote character
            $quot = strpos($style, '"') !== false ? "'" : '"';
            $t .= ' style=' . $quot . $style . $quot;
        }
        else if ($key == 'background' || ($key == 'src' && strtolower($node->tagName) == 'img')) { //check tagName anyway
            $src = null;
            if (!empty($cid_map[$value])) {
                $src = $cid_map[$value];
            }
            else if (!empty($cid_map[$base_url . $value])) {
                $src = $cid_map[$base_url . $value];
            }

            if ($src !== null) {
                $t .= ' ' . $key . '="' . htmlspecialchars($src, ENT_QUOTES) . '"';
            }
            else if (preg_match('/^(http|https|ftp):.+/i', $value)) {
                if (!empty($this->config['allow_remote'])) {
                    $t .= ' ' . $key . '="' . htmlspecialchars($value, ENT_QUOTES) . '"';
                }
                else {
                    // remote reference blocked: flag it and optionally substitute a placeholder
                    $this->extlinks = true;
                    if (!empty($this->config['blocked_src'])) {
                        $t .= ' ' . $key . '="' . htmlspecialchars($this->config['blocked_src'], ENT_QUOTES) . '"';
                    }
                }
            }
            else if (preg_match('/^data:.+/i', $value)) { // RFC2397
                $t .= ' ' . $key . '="' . htmlspecialchars($value, ENT_QUOTES) . '"';
            }
        }
        else {
            // The name comes from untrusted markup and ends up inside an
            // attribute value, so it is escaped like any other output.
            $washed .= ($washed !== '' ? ' ' : '') . htmlspecialchars($key, ENT_QUOTES);
        }
    }

    if ($washed !== '' && !empty($this->config['show_washed'])) {
        $t .= ' x-washed="' . $washed . '"';
    }

    return $t;
}
/**
 * The main loop that recurses on a node tree.
 * It outputs only allowed tags with allowed attributes
 * and allowed inline styles.
 *
 * @param DOMNode $node Node whose children are serialised
 *
 * @return string Washed markup of all child nodes ('' if there are none)
 */
private function dumpHtml($node)
{
    if (!$node->hasChildNodes()) {
        return '';
    }

    $dump = '';

    foreach ($node->childNodes as $child) {
        switch ($child->nodeType) {
        case XML_ELEMENT_NODE: //Check element
            $tagName  = strtolower($child->tagName);
            $callback = isset($this->handlers[$tagName]) ? $this->handlers[$tagName] : null;

            if ($callback) {
                // a registered handler takes precedence over the element lists
                $dump .= call_user_func($callback, $tagName, $this->wash_attribs($child), $this->dumpHtml($child), $this);
            }
            else if (isset($this->_html_elements[$tagName])) {
                $content = $this->dumpHtml($child);
                $dump .= '<' . $tagName . $this->wash_attribs($child) .
                    // create closing tag for block elements, but also for elements
                    // with content or with some attributes (eg. style, class) (#1486812)
                    ($content != '' || $child->hasAttributes() || isset($this->_block_elements[$tagName]) ? ">$content</$tagName>" : ' />');
            }
            else if (isset($this->_ignore_elements[$tagName])) {
                // element and its content are dropped
                $dump .= '<!-- ' . htmlspecialchars($tagName, ENT_QUOTES) . ' not allowed -->';
            }
            else {
                $dump .= '<!-- ' . htmlspecialchars($tagName, ENT_QUOTES) . ' ignored -->';
                $dump .= $this->dumpHtml($child); // ignore tags not its content
            }
            break;

        case XML_CDATA_SECTION_NODE:
            // NOTE(review): CDATA content is emitted unescaped. Left unchanged
            // because tag callbacks may rely on receiving the raw text; confirm
            // that no allowed element can carry attacker-controlled CDATA.
            $dump .= $child->nodeValue;
            break;

        case XML_TEXT_NODE:
            $dump .= htmlspecialchars($child->nodeValue);
            break;

        case XML_HTML_DOCUMENT_NODE:
            $dump .= $this->dumpHtml($child);
            break;

        case XML_DOCUMENT_TYPE_NODE:
            break;

        default:
            // The original had "$dump . ..." here, a no-op expression;
            // appending the marker is what was intended.
            $dump .= '<!-- node type ' . $child->nodeType . ' -->';
        }
    }

    return $dump;
}
/**
 * Main function, give it untrusted HTML and it returns the washed markup.
 * Whether remote images are allowed and how "cid:" urls are mapped is
 * taken from the configuration passed to the constructor.
 *
 * Resets $this->extlinks and stores the document's <base href> (or '')
 * in the configuration as 'base_url' before washing.
 *
 * @param string $html Untrusted HTML
 *
 * @return string Washed HTML ('' for empty input)
 */
public function wash($html)
{
    $html = (string) $html;

    // Charset seems to be ignored (probably if defined in the HTML document)
    $charset = isset($this->config['charset']) ? (string) $this->config['charset'] : '';
    $node = new DOMDocument('1.0', $charset);
    $this->extlinks = false;

    // Find base URL for images
    if (preg_match('/<base\s+href=[\'"]*([^\'"]+)/is', $html, $matches)) {
        $this->config['base_url'] = $matches[1];
    }
    else {
        $this->config['base_url'] = '';
    }

    // Remove invalid HTML comments (#1487759)
    // Don't remove valid conditional comments
    $html = preg_replace('/<!--[^->[\n]*>/', '', $html);

    // Nothing to parse. DOMDocument::loadHTML() rejects an empty string
    // (ValueError on PHP 8, which "@" would not suppress).
    if ($html === null || $html === '') {
        return '';
    }

    // Collect parser warnings for malformed markup internally instead of
    // silencing every error with "@"; the previous libxml setting is restored.
    $previous = libxml_use_internal_errors(true);
    $node->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $this->dumpHtml($node);
}
/**
 * Getter for config parameters
 *
 * @param string $prop Name of the configuration key
 *
 * @return mixed Stored value, or null if the key is not set
 */
public function get_config($prop)
{
    // isset() avoids the undefined-index warning for unknown keys
    return isset($this->config[$prop]) ? $this->config[$prop] : null;
}