From 706c61bfe8a1a9f1a201f05defb929e8553e9690 Mon Sep 17 00:00:00 2001
From: Frank Ronny Larsen
Date: Fri, 12 Jul 2013 14:49:37 +0200
Subject: [PATCH] Replaced PHP strip_tags with D7 filter_xss. Good thing we cache this..

---
 h5p.classes.php | 328 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 324 insertions(+), 4 deletions(-)

diff --git a/h5p.classes.php b/h5p.classes.php
index 525368c..0665038 100644
--- a/h5p.classes.php
+++ b/h5p.classes.php
@@ -1208,11 +1208,8 @@ class H5PContentValidator {
       if (in_array('ul', $tags) || in_array('ol', $tags) && ! in_array('li', $tags)) {
         $tags[] = 'li';
       }
-      // Convert array of tagNames to string of bracketed tags
-      $allowedtags = implode('', array_map(array($this, 'bracketTags'), $tags));
-
       // Strip invalid HTML tags.
-      $text = strip_tags($text, $allowedtags);
+      $text = $this->filter_xss($text, $tags);
     }
     else {
       // Filter text to plain text.
@@ -1440,5 +1437,328 @@ class H5PContentValidator {
       $this->h5pF->setErrorMessage($this->h5pF->t('Library used in content is not a valid library according to semantics'));
     }
   }
+
+
+  // XSS filters copied from drupal 7 common.inc. Some modifications done to
+  // replace Drupal one-liner functions with corresponding flat PHP.
+
+  /**
+   * Filters HTML to prevent cross-site-scripting (XSS) vulnerabilities.
+   *
+   * Based on kses by Ulf Harnhammar, see http://sourceforge.net/projects/kses.
+   * For examples of various XSS attacks, see: http://ha.ckers.org/xss.html.
+   *
+   * This code does four things:
+   * - Removes characters and constructs that can trick browsers.
+   * - Makes sure all HTML entities are well-formed.
+   * - Makes sure all HTML tags and attributes are well-formed.
+   * - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g.
+   *   javascript:).
+   *
+   * @param $string
+   *   The string with raw HTML in it. It will be stripped of everything that can
+   *   cause an XSS attack.
+   * @param $allowed_tags
+   *   An array of allowed tags.
+   *
+   * @return
+   *   An XSS safe version of $string, or an empty string if $string is not
+   *   valid UTF-8.
+   *
+   * @ingroup sanitization
+   */
+  private function filter_xss($string, $allowed_tags = array('a', 'em', 'strong', 'cite', 'blockquote', 'code', 'ul', 'ol', 'li', 'dl', 'dt', 'dd')) {
+    if (strlen($string) == 0) {
+      return $string;
+    }
+    // Only operate on valid UTF-8 strings. This is necessary to prevent cross
+    // site scripting issues on Internet Explorer 6. (Line copied from
+    // drupal_validate_utf8)
+    if (preg_match('/^./us', $string) != 1) {
+      return '';
+    }
+
+    // Store the text format.
+    $this->_filter_xss_split($allowed_tags, TRUE);
+    // Remove NULL characters (ignored by some browsers).
+    $string = str_replace(chr(0), '', $string);
+    // Remove Netscape 4 JS entities.
+    $string = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);
+
+    // Defuse all HTML entities.
+    $string = str_replace('&', '&amp;', $string);
+    // Change back only well-formed entities in our whitelist:
+    // Decimal numeric entities.
+    $string = preg_replace('/&amp;#([0-9]+;)/', '&#\1', $string);
+    // Hexadecimal numeric entities.
+    $string = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $string);
+    // Named entities.
+    $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\1', $string);
+    return preg_replace_callback('%
+      (
+      <(?=[^a-zA-Z!/])  # a lone <
+      |                 # or
+      <!--.*?-->        # a comment
+      |                 # or
+      <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
+      |                 # or
+      >                 # just a >
+      )%x', array($this, '_filter_xss_split'), $string);
+  }
+
+  /**
+   * Processes an HTML tag.
+   *
+   * @param $m
+   *   An array with various meaning depending on the value of $store.
+   *   If $store is TRUE then the array contains the allowed tags.
+   *   If $store is FALSE then the array has one element, the HTML tag to process.
+   * @param $store
+   *   Whether to store $m.
+   *
+   * @return
+   *   If the element isn't allowed, an empty string. Otherwise, the cleaned up
+   *   version of the HTML element.
+   */
+  private function _filter_xss_split($m, $store = FALSE) {
+    static $allowed_html;
+
+    if ($store) {
+      $allowed_html = array_flip($m);
+      return;
+    }
+
+    $string = $m[1];
+
+    if (substr($string, 0, 1) != '<') {
+      // We matched a lone ">" character.
+      return '&gt;';
+    }
+    elseif (strlen($string) == 1) {
+      // We matched a lone "<" character.
+      return '&lt;';
+    }
+
+    if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?|(<!--.*?-->)$%', $string, $matches)) {
+      // Seriously malformed.
+      return '';
+    }
+
+    $slash = trim($matches[1]);
+    $elem = &$matches[2];
+    $attrlist = &$matches[3];
+    $comment = &$matches[4];
+
+    if ($comment) {
+      $elem = '!--';
+    }
+
+    if (!isset($allowed_html[strtolower($elem)])) {
+      // Disallowed HTML element.
+      return '';
+    }
+
+    if ($comment) {
+      return $comment;
+    }
+
+    if ($slash != '') {
+      return "</$elem>";
+    }
+
+    // Is there a closing XHTML slash at the end of the attributes?
+    $attrlist = preg_replace('%(\s?)/\s*$%', '\1', $attrlist, -1, $count);
+    $xhtml_slash = $count ? ' /' : '';
+
+    // Clean up attributes.
+    $attr2 = implode(' ', $this->_filter_xss_attributes($attrlist));
+    $attr2 = preg_replace('/[<>]/', '', $attr2);
+    $attr2 = strlen($attr2) ? ' ' . $attr2 : '';
+
+    return "<$elem$attr2$xhtml_slash>";
+  }
+
+  /**
+   * Processes a string of HTML attributes.
+   *
+   * @return
+   *   Cleaned up version of the HTML attributes.
+   */
+  private function _filter_xss_attributes($attr) {
+    $attrarr = array();
+    $mode = 0;
+    $attrname = '';
+
+    while (strlen($attr) != 0) {
+      // Was the last operation successful?
+      $working = 0;
+
+      switch ($mode) {
+        case 0:
+          // Attribute name, href for instance.
+          if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {
+            $attrname = strtolower($match[1]);
+            $skip = ($attrname == 'style' || substr($attrname, 0, 2) == 'on');
+            $working = $mode = 1;
+            $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
+          }
+          break;
+
+        case 1:
+          // Equals sign or valueless ("selected").
+          if (preg_match('/^\s*=\s*/', $attr)) {
+            $working = 1; $mode = 2;
+            $attr = preg_replace('/^\s*=\s*/', '', $attr);
+            break;
+          }
+
+          if (preg_match('/^\s+/', $attr)) {
+            $working = 1; $mode = 0;
+            if (!$skip) {
+              $attrarr[] = $attrname;
+            }
+            $attr = preg_replace('/^\s+/', '', $attr);
+          }
+          break;
+
+        case 2:
+          // Attribute value, a URL after href= for instance.
+          if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) {
+            $thisval = $this->filter_xss_bad_protocol($match[1]);
+
+            if (!$skip) {
+              $attrarr[] = "$attrname=\"$thisval\"";
+            }
+            $working = 1;
+            $mode = 0;
+            $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
+            break;
+          }
+
+          if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) {
+            $thisval = $this->filter_xss_bad_protocol($match[1]);
+
+            if (!$skip) {
+              $attrarr[] = "$attrname='$thisval'";
+            }
+            $working = 1; $mode = 0;
+            $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
+            break;
+          }
+
+          if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) {
+            $thisval = $this->filter_xss_bad_protocol($match[1]);
+
+            if (!$skip) {
+              $attrarr[] = "$attrname=\"$thisval\"";
+            }
+            $working = 1; $mode = 0;
+            $attr = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr);
+          }
+          break;
+      }
+
+      if ($working == 0) {
+        // Not well formed; remove and try again.
+        $attr = preg_replace('/
+          ^
+          (
+          "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
+          |               # or
+          \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
+          |               # or
+          \S              # - a non-whitespace character
+          )*              # any number of the above three
+          \s*             # any number of whitespaces
+          /x', '', $attr);
+        $mode = 0;
+      }
+    }
+
+    // The attribute list ends with a valueless attribute like "selected".
+    if ($mode == 1 && !$skip) {
+      $attrarr[] = $attrname;
+    }
+    return $attrarr;
+  }
+
+  /**
+   * Processes an HTML attribute value and strips dangerous protocols from URLs.
+   *
+   * @param $string
+   *   The string with the attribute value.
+   * @param $decode
+   *   (deprecated) Whether to decode entities in the $string. Set to FALSE if the
+   *   $string is in plain text, TRUE otherwise. Defaults to TRUE. This parameter
+   *   is deprecated and will be removed in Drupal 8. To process a plain-text URI,
+   *   call _strip_dangerous_protocols() or check_url() instead.
+   *
+   * @return
+   *   Cleaned up and HTML-escaped version of $string.
+   */
+  private function filter_xss_bad_protocol($string, $decode = TRUE) {
+    // Get the plain text representation of the attribute value (i.e. its meaning).
+    // @todo Remove the $decode parameter in Drupal 8, and always assume an HTML
+    //   string that needs decoding.
+    if ($decode) {
+      $string = html_entity_decode($string, ENT_QUOTES, 'UTF-8');
+    }
+    // check_plain() is a Drupal-only helper; escape with the equivalent plain
+    // PHP call so this class does not depend on Drupal being loaded.
+    return htmlspecialchars($this->_strip_dangerous_protocols($string), ENT_QUOTES, 'UTF-8');
+  }
+
+  /**
+   * Strips dangerous protocols (e.g. 'javascript:') from a URI.
+   *
+   * This function must be called for all URIs within user-entered input prior
+   * to being output to an HTML attribute value. It is often called as part of
+   * check_url() or filter_xss(), but those functions return an HTML-encoded
+   * string, so this function can be called independently when the output needs to
+   * be a plain-text string for passing to t(), l(), drupal_attributes(), or
+   * another function that will call check_plain() separately.
+   *
+   * @param $uri
+   *   A plain-text URI that might contain dangerous protocols.
+   *
+   * @return
+   *   A plain-text URI stripped of dangerous protocols. As with all plain-text
+   *   strings, this return value must not be output to an HTML page without
+   *   check_plain() being called on it. However, it can be passed to functions
+   *   expecting plain-text strings.
+   *
+   * @see check_url()
+   */
+  private function _strip_dangerous_protocols($uri) {
+    static $allowed_protocols;
+
+    if (!isset($allowed_protocols)) {
+      $allowed_protocols = array_flip(array('ftp', 'http', 'https', 'mailto'));
+    }
+
+    // Iteratively remove any invalid protocol found.
+    do {
+      $before = $uri;
+      $colonpos = strpos($uri, ':');
+      if ($colonpos > 0) {
+        // We found a colon, possibly a protocol. Verify.
+        $protocol = substr($uri, 0, $colonpos);
+        // If a colon is preceded by a slash, question mark or hash, it cannot
+        // possibly be part of the URL scheme. This must be a relative URL, which
+        // inherits the (safe) protocol of the base document.
+        if (preg_match('![/?#]!', $protocol)) {
+          break;
+        }
+        // Check if this is a disallowed protocol. Per RFC2616, section 3.2.3
+        // (URI Comparison) scheme comparison must be case-insensitive.
+        if (!isset($allowed_protocols[strtolower($protocol)])) {
+          $uri = substr($uri, $colonpos + 1);
+        }
+      }
+    } while ($before != $uri);
+
+    return $uri;
+  }
+
 }
 ?>
\ No newline at end of file