Replaced PHP strip_tags with D7 filter_xss. Good thing we cache this..

2013-07-12 14:49:37 +02:00 · 2013-07-12 14:49:37 +02:00 · 706c61bfe8
parent 7af599ae0d
commit 706c61bfe8
1 changed files with 322 additions and 4 deletions
--- a/h5p.classes.php
+++ b/h5p.classes.php
@ -1208,11 +1208,8 @@ class H5PContentValidator {
      if (in_array('ul', $tags) || in_array('ol', $tags) && ! in_array('li', $tags)) {
        $tags[] = 'li';
      }
-      // Convert array of tagNames to string of bracketed tags
-      $allowedtags = implode('', array_map(array($this, 'bracketTags'), $tags));
-
      // Strip invalid HTML tags.
-      $text = strip_tags($text, $allowedtags);
+      $text = $this->filter_xss($text, $tags);
    }
    else {
      // Filter text to plain text.
@ -1440,5 +1437,326 @@ class H5PContentValidator {
      $this->h5pF->setErrorMessage($this->h5pF->t('Library used in content is not a valid library according to semantics'));
    }
  }
+
+
+  // XSS filters copied from drupal 7 common.inc. Some modifications done to
+  // replace Drupal one-liner functions with corresponding flat PHP.
+
+  /**
+   * Filters HTML to prevent cross-site-scripting (XSS) vulnerabilities.
+   *
+   * Based on kses by Ulf Harnhammar, see http://sourceforge.net/projects/kses.
+   * For examples of various XSS attacks, see: http://ha.ckers.org/xss.html.
+   *
+   * This code does four things:
+   * - Removes characters and constructs that can trick browsers.
+   * - Makes sure all HTML entities are well-formed.
+   * - Makes sure all HTML tags and attributes are well-formed.
+   * - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g.
+   *   javascript:).
+   *
+   * @param $string
+   *   The string with raw HTML in it. It will be stripped of everything that can
+   *   cause an XSS attack.
+   * @param $allowed_tags
+   *   An array of allowed tags.
+   *
+   * @return
+   *   An XSS safe version of $string, or an empty string if $string is not
+   *   valid UTF-8.
+   *
+   * @ingroup sanitization
+   */
+  private function filter_xss($string, $allowed_tags = array('a', 'em', 'strong', 'cite', 'blockquote', 'code', 'ul', 'ol', 'li', 'dl', 'dt', 'dd')) {
+    // Nothing to filter; hand the empty input straight back.
+    if (strlen($string) == 0) {
+      return $string;
+    }
+    // Only operate on valid UTF-8 strings. This is necessary to prevent cross
+    // site scripting issues on Internet Explorer 6. (Line copied from
+    // drupal_validate_utf8). With the /u modifier preg_match() does not return
+    // 1 for invalid UTF-8, so such input is rejected with an empty string.
+    if (preg_match('/^./us', $string) != 1) {
+      return '';
+    }
+
+    // Hand the allowed tag list to _filter_xss_split(). Called with
+    // $store = TRUE it only records the tags for the callback invocations
+    // triggered by preg_replace_callback() below.
+    $this->_filter_xss_split($allowed_tags, TRUE);
+    // Remove NULL characters (ignored by some browsers).
+    $string = str_replace(chr(0), '', $string);
+    // Remove Netscape 4 JS entities.
+    $string = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);
+
+    // Defuse all HTML entities. Every "&" is escaped first; the three
+    // replacements that follow depend on this step having run before them.
+    $string = str_replace('&', '&amp;', $string);
+    // Change back only well-formed entities in our whitelist:
+    // Decimal numeric entities.
+    $string = preg_replace('/&amp;#([0-9]+;)/', '&#\1', $string);
+    // Hexadecimal numeric entities (leading zeros are dropped, and the "x" is
+    // normalised to lower case).
+    $string = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $string);
+    // Named entities.
+    $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\1', $string);
+    // Pass every "<", ">", comment and tag-like token to _filter_xss_split(),
+    // which returns the replacement text for that token.
+    return preg_replace_callback('%
+      (
+      <(?=[^a-zA-Z!/])  # a lone <
+      |                 # or
+      <!--.*?-->        # a comment
+      |                 # or
+      <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
+      |                 # or
+      >                 # just a >
+      )%x', array($this, '_filter_xss_split'), $string);
+  }
+
+  /**
+   * Processes an HTML tag.
+   *
+   * @param $m
+   *   An array with various meaning depending on the value of $store.
+   *   If $store is TRUE then the array contains the allowed tags.
+   *   If $store is FALSE then the array has one element, the HTML tag to process.
+   * @param $store
+   *   Whether to store $m.
+   *
+   * @return
+   *   If the element isn't allowed, an empty string. Otherwise, the cleaned up
+   *   version of the HTML element.
+   */
+  private function _filter_xss_split($m, $store = FALSE) {
+    // Allowed tag names as array keys. Function-static, so the value recorded
+    // by the $store call is still there when the callback invocations run.
+    static $allowed_html;
+
+    if ($store) {
+      // array_flip() turns the tag list into keys so membership can be
+      // tested with isset() below.
+      $allowed_html = array_flip($m);
+      return;
+    }
+
+    // Capture group 1 of the pattern in filter_xss(): the whole matched token.
+    $string = $m[1];
+
+    if (substr($string, 0, 1) != '<') {
+      // We matched a lone ">" character.
+      return '&gt;';
+    }
+    elseif (strlen($string) == 1) {
+      // We matched a lone "<" character.
+      return '&lt;';
+    }
+
+    // Split the token into: optional closing slash (1), element name (2) and
+    // attribute list (3) - or recognise it as an HTML comment (4).
+    if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?|(<!--.*?-->)$%', $string, $matches)) {
+      // Seriously malformed.
+      return '';
+    }
+
+    $slash = trim($matches[1]);
+    $elem = &$matches[2];
+    $attrlist = &$matches[3];
+    // NOTE(review): group 4 is presumably absent from $matches when the tag
+    // alternative matched; binding by reference then yields NULL instead of
+    // an undefined-index notice - confirm before changing to a plain copy.
+    $comment = &$matches[4];
+
+    if ($comment) {
+      // Comments are looked up in the allowed list under the name "!--".
+      $elem = '!--';
+    }
+
+    if (!isset($allowed_html[strtolower($elem)])) {
+      // Disallowed HTML element.
+      return '';
+    }
+
+    if ($comment) {
+      // Allowed comments are returned unmodified.
+      return $comment;
+    }
+
+    if ($slash != '') {
+      // Closing tag: rebuilt from the element name only, so anything that
+      // followed the name in the input is dropped.
+      return "</$elem>";
+    }
+
+    // Is there a closing XHTML slash at the end of the attributes?
+    $attrlist = preg_replace('%(\s?)/\s*$%', '\1', $attrlist, -1, $count);
+    $xhtml_slash = $count ? ' /' : '';
+
+    // Clean up attributes.
+    $attr2 = implode(' ', $this->_filter_xss_attributes($attrlist));
+    // Remove any angle brackets left in the cleaned attribute string.
+    $attr2 = preg_replace('/[<>]/', '', $attr2);
+    // Separate the attributes from the element name only when there are any.
+    $attr2 = strlen($attr2) ? ' ' . $attr2 : '';
+
+    return "<$elem$attr2$xhtml_slash>";
+  }
+
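+  /**
+   * Processes a string of HTML attributes.
+   *
+   * @param $attr
+   *   The attribute portion of a single HTML tag, i.e. the text that follows
+   *   the element name.
+   *
+   * @return
+   *   An array with the cleaned up version of the HTML attributes, one
+   *   attribute per entry.
+   */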
+  private function _filter_xss_attributes($attr) {
+    // Cleaned attributes collected so far, one string per attribute.
+    $attrarr = array();
+    // Parser state: 0 = expecting an attribute name, 1 = expecting "=" or
+    // whitespace after a name, 2 = expecting an attribute value.
+    $mode = 0;
+    $attrname = '';
+
+    // Consume $attr from the front until nothing is left.
+    while (strlen($attr) != 0) {
+      // Was the last operation successful?
+      $working = 0;
+
+      switch ($mode) {
+        case 0:
+          // Attribute name, href for instance.
+          if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {
+            $attrname = strtolower($match[1]);
+            // "style" and every attribute starting with "on" are parsed like
+            // any other attribute but never added to the result.
+            $skip = ($attrname == 'style' || substr($attrname, 0, 2) == 'on');
+            $working = $mode = 1;
+            $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
+          }
+          break;
+
+        case 1:
+          // Equals sign or valueless ("selected").
+          if (preg_match('/^\s*=\s*/', $attr)) {
+            $working = 1; $mode = 2;
+            $attr = preg_replace('/^\s*=\s*/', '', $attr);
+            break;
+          }
+
+          // Whitespace instead of "=": the attribute has no value.
+          if (preg_match('/^\s+/', $attr)) {
+            $working = 1; $mode = 0;
+            if (!$skip) {
+              $attrarr[] = $attrname;
+            }
+            $attr = preg_replace('/^\s+/', '', $attr);
+          }
+          break;
+
+        case 2:
+          // Attribute value, a URL after href= for instance.
+          // Double-quoted value.
+          if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) {
+            $thisval = $this->filter_xss_bad_protocol($match[1]);
+
+            if (!$skip) {
+              $attrarr[] = "$attrname=\"$thisval\"";
+            }
+            $working = 1;
+            $mode = 0;
+            $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
+            break;
+          }
+
+          // Single-quoted value; the single quotes are kept in the output.
+          if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) {
+            $thisval = $this->filter_xss_bad_protocol($match[1]);
+
+            if (!$skip) {
+              $attrarr[] = "$attrname='$thisval'";
+            }
+            $working = 1; $mode = 0;
+            $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
+            break;
+          }
+
+          // Unquoted value; it is emitted wrapped in double quotes.
+          if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) {
+            $thisval = $this->filter_xss_bad_protocol($match[1]);
+
+            if (!$skip) {
+              $attrarr[] = "$attrname=\"$thisval\"";
+            }
+            $working = 1; $mode = 0;
+            $attr = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr);
+          }
+          break;
+      }
+
+      if ($working == 0) {
+        // Not well formed; remove and try again. The offending run is cut
+        // from the front of $attr and parsing restarts at an attribute name.
+        $attr = preg_replace('/
+          ^
+          (
+          "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
+          |               # or
+          \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
+          |               # or
+          \S              # - a non-whitespace character
+          )*              # any number of the above three
+          \s*             # any number of whitespaces
+          /x', '', $attr);
+        $mode = 0;
+      }
+    }
+
+    // The attribute list ends with a valueless attribute like "selected".
+    // $mode is only 1 after a name was read in case 0, so $skip is set here.
+    if ($mode == 1 && !$skip) {
+      $attrarr[] = $attrname;
+    }
+    return $attrarr;
+  }
+
+  /**
+   * Processes an HTML attribute value and strips dangerous protocols from URLs.
+   *
+   * @param $string
+   *   The string with the attribute value.
+   * @param $decode
+   *   (deprecated) Whether to decode entities in the $string. Set to FALSE if the
+   *   $string is in plain text, TRUE otherwise. Defaults to TRUE. This parameter
+   *   is deprecated and will be removed in Drupal 8. To process a plain-text URI,
+   *   call _strip_dangerous_protocols() or check_url() instead.
+   *
+   * @return
+   *   Cleaned up and HTML-escaped version of $string.
+   */
+  private function filter_xss_bad_protocol($string, $decode = TRUE) {
+    // Get the plain text representation of the attribute value (i.e. its
+    // meaning), so that a protocol hidden behind HTML entities is visible to
+    // the protocol check below. $decode is kept for signature parity with the
+    // Drupal 7 original; pass FALSE only when $string is already plain text.
+    if ($decode) {
+      $string = html_entity_decode($string, ENT_QUOTES, 'UTF-8');
+    }
+    // Strip disallowed protocols, then HTML-escape the result again. This is
+    // the flat PHP equivalent of Drupal 7's check_plain(), which is not
+    // defined when this library runs outside Drupal.
+    return htmlspecialchars($this->_strip_dangerous_protocols($string), ENT_QUOTES, 'UTF-8');
+  }
+
+  /**
+   * Strips dangerous protocols (e.g. 'javascript:') from a URI.
+   *
+   * This function must be called for all URIs within user-entered input prior
+   * to being output to an HTML attribute value. It is often called as part of
+   * check_url() or filter_xss(), but those functions return an HTML-encoded
+   * string, so this function can be called independently when the output needs to
+   * be a plain-text string for passing to t(), l(), drupal_attributes(), or
+   * another function that will call check_plain() separately.
+   *
+   * @param $uri
+   *   A plain-text URI that might contain dangerous protocols.
+   *
+   * @return
+   *   A plain-text URI stripped of dangerous protocols. As with all plain-text
+   *   strings, this return value must not be output to an HTML page without
+   *   check_plain() being called on it. However, it can be passed to functions
+   *   expecting plain-text strings.
+   *
+   * @see check_url()
+   */
+  private function _strip_dangerous_protocols($uri) {
+    // Schemes that may stay in front of a URI; only the keys are consulted.
+    static $safe_schemes = array('ftp' => TRUE, 'http' => TRUE, 'https' => TRUE, 'mailto' => TRUE);
+
+    // Peel off one disallowed scheme per pass until the URI stops changing.
+    while (TRUE) {
+      $colon = strpos($uri, ':');
+      if (!($colon > 0)) {
+        // No colon at all, or a colon in first position: nothing to strip.
+        break;
+      }
+
+      // Everything before the first colon is the candidate scheme.
+      $scheme = substr($uri, 0, $colon);
+
+      // A slash, question mark or hash before the colon means the colon
+      // cannot belong to a URL scheme. This is a relative URL, which
+      // inherits the (safe) protocol of the base document.
+      if (preg_match('![/?#]!', $scheme)) {
+        break;
+      }
+
+      // Scheme comparison is case-insensitive (RFC2616, section 3.2.3).
+      if (isset($safe_schemes[strtolower($scheme)])) {
+        break;
+      }
+
+      // Disallowed scheme: drop it together with its colon and look again.
+      $uri = substr($uri, $colon + 1);
+    }
+
+    return $uri;
+  }
+
 }
 ?>