<?php

/**
 * @file
 * $Id$.
 */

// vim: expandtab sw=4 ts=4 sts=4:
// ***** BEGIN LICENSE BLOCK *****
// This file is part of HTML Sanitizer.
// Copyright (c) 2005-2011 Frederic Minne <zefredz@gmail.com>.
// All rights reserved.
//
// HTML Sanitizer is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation; either version 3 of the License, or
// (at your option) any later version.
//
// HTML Sanitizer is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with HTML Sanitizer; if not, see <http://www.gnu.org/licenses/>.
//
// ***** END LICENSE BLOCK *****.
/**
 * Sanitize HTML contents :
 * Remove dangerous tags and attributes that can lead to security issues like
 * XSS or HTTP response splitting.
 *
 * @author Frederic Minne <zefredz@gmail.com>
 * @copyright Copyright &copy; 2005-2011, Frederic Minne
 * @license http://www.gnu.org/licenses/lgpl.txt GNU Lesser General Public License version 3 or later
 * @version 1.1
 */
class HTML_Sanitizer {
  /**
   * Private fields.
   */
  private $_allowedTags;
  private $_allowJavascriptEvents;
  private $_allowJavascriptInUrls;
  private $_allowObjects;
  private $_allowScript;
  private $_allowStyle;
  private $_additionalTags;

  /**
   * Constructor.
   */
  public function __construct() {
    $this->resetAll();
  }

  /**
   * (re)set all options to default value.
   */
  public function resetAll() {
    $this->_allowDOMEvents = FALSE;
    $this->_allowJavascriptInUrls = FALSE;
    $this->_allowStyle = FALSE;
    $this->_allowScript = FALSE;
    $this->_allowObjects = FALSE;
    $this->_allowStyle = FALSE;

    $this->_allowedTags = '<a><br><b><h1><h2><h3><h4><h5><h6>'
            . '<img><li><ol><p><strong><table><tr><td><th><u><ul><thead>'
            . '<tbody><tfoot><em><dd><dt><dl><span><div><del><add><i><hr>'
            . '<pre><br><blockquote><address><code><caption><abbr><acronym>'
            . '<cite><dfn><q><ins><sup><sub><kbd><samp><var><tt><small><big>';

    $this->_additionalTags = '';
  }

  /**
   * Add additional tags to allowed tags.
   *
   * @param string
   *
   * @access public
   */
  public function addAdditionalTags($tags) {
    $this->_additionalTags .= $tags;
  }

  /**
   * Allow iframes.
   *
   * @access public
   */
  public function allowIframes() {
    $this->addAdditionalTags('<iframe>');
  }

  /**
   * Allow HTML5 media tags.
   *
   * @access public
   */
  public function allowHtml5Media() {
    $this->addAdditionalTags('<canvas><video><audio>');
  }

  /**
   * Allow object, embed, applet and param tags in html.
   *
   * @access public
   */
  public function allowObjects() {
    $this->_allowObjects = TRUE;
  }

  /**
   * Allow DOM event on DOM elements.
   *
   * @access public
   */
  public function allowDOMEvents() {
    $this->_allowDOMEvents = TRUE;
  }

  /**
   * Allow script tags.
   *
   * @access public
   */
  public function allowScript() {
    $this->_allowScript = TRUE;
  }

  /**
   * Allow the use of javascript: in urls.
   *
   * @access public
   */
  public function allowJavascriptInUrls() {
    $this->_allowJavascriptInUrls = TRUE;
  }

  /**
   * Allow style tags and attributes.
   *
   * @access public
   */
  public function allowStyle() {
    $this->_allowStyle = TRUE;
  }

  /**
   * Helper to allow all javascript related tags and attributes.
   *
   * @access public
   */
  public function allowAllJavascript() {
    $this->allowDOMEvents();
    $this->allowScript();
    $this->allowJavascriptInUrls();
  }

  /**
   * Allow all tags and attributes.
   *
   * @access public
   */
  public function allowAll() {
    $this->allowAllJavascript();
    $this->allowObjects();
    $this->allowStyle();
    $this->allowIframes();
    $this->allowHtml5Media();
  }

  /**
   * Filter URLs to avoid HTTP response splitting attacks.
   *
   * @access public
   * @param string url
   *
   * @return string filtered url
   */
  public function filterHTTPResponseSplitting($url) {
    $dangerousCharactersPattern = '~(\r\n|\r|\n|%0a|%0d|%0D|%0A)~';
    return preg_replace($dangerousCharactersPattern, '', $url);
  }

  /**
   * Remove potential javascript in urls.
   *
   * @access public
   * @param string url
   *
   * @return string filtered url
   */
  public function removeJavascriptURL($str) {
    $HTML_Sanitizer_stripJavascriptURL = 'javascript:[^"]+';

    $str = preg_replace("/$HTML_Sanitizer_stripJavascriptURL/i", '__forbidden__', $str);

    return $str;
  }

  /**
   * Remove potential flaws in urls.
   *
   * @access private
   * @param string url
   *
   * @return string filtered url
   */
  private function sanitizeURL($url) {
    if (!$this->_allowJavascriptInUrls) {
      $url = $this->removeJavascriptURL($url);
    }

    $url = $this->filterHTTPResponseSplitting($url);

    return $url;
  }

  /**
   * Callback for PCRE.
   *
   * @access private
   * @param matches array
   *
   * @return string
   *
   * @see sanitizeURL
   */
  private function _sanitizeURLCallback($matches) {
    return 'href="' . $this->sanitizeURL($matches[1]) . '"';
  }

  /**
   * Remove potential flaws in href attributes.
   *
   * @access private
   * @param string html tag
   *
   * @return string filtered html tag
   */
  private function sanitizeHref($str) {
    $HTML_Sanitizer_URL = 'href="([^"]+)"';

    return preg_replace_callback("/$HTML_Sanitizer_URL/i", array(&$this, '_sanitizeURLCallback'), $str);
  }

  /**
   * Callback for PCRE.
   *
   * @access private
   * @param matches array
   *
   * @return string
   *
   * @see sanitizeURL
   */
  private function _sanitizeSrcCallback($matches) {
    return 'src="' . $this->sanitizeURL($matches[1]) . '"';
  }

  /**
   * Remove potential flaws in href attributes.
   *
   * @access private
   * @param string html tag
   *
   * @return string filtered html tag
   */
  private function sanitizeSrc($str) {
    $HTML_Sanitizer_URL = 'src="([^"]+)"';

    return preg_replace_callback("/$HTML_Sanitizer_URL/i", array(&$this, '_sanitizeSrcCallback'), $str);
  }

  /**
   * Remove dangerous attributes from html tags.
   *
   * @access private
   * @param string html tag
   *
   * @return string filtered html tag
   */
  private function removeEvilAttributes($str) {
    if (!$this->_allowDOMEvents) {
      $str = preg_replace_callback('/<(.*?)>/i', array(&$this, '_removeDOMEventsCallback'), $str);
    }

    if (!$this->_allowStyle) {
      $str = preg_replace_callback('/<(.*?)>/i', array(&$this, '_removeStyleCallback'), $str);
    }

    return $str;
  }

  /**
   * Remove DOM events attributes from html tags.
   *
   * @access private
   * @param string html tag
   *
   * @return string filtered html tag
   */
  private function removeDOMEvents($str) {
    $str = preg_replace('/\s*=\s*/', '=', $str);

    $HTML_Sanitizer_stripAttrib = '(onclick|ondblclick|onmousedown|'
            . 'onmouseup|onmouseover|onmousemove|onmouseout|onkeypress|onkeydown|'
            . 'onkeyup|onfocus|onblur|onabort|onerror|onload)';

    $str = stripslashes(preg_replace("/$HTML_Sanitizer_stripAttrib/i", 'forbidden', $str));

    return $str;
  }

  /**
   * Callback for PCRE.
   *
   * @access private
   * @param matches array
   *
   * @return string
   *
   * @see removeDOMEvents
   */
  private function _removeDOMEventsCallback($matches) {
    return '<' . $this->removeDOMEvents($matches[1]) . '>';
  }

  /**
   * Remove style attributes from html tags.
   *
   * @access private
   * @param string html tag
   *
   * @return string filtered html tag
   */
  private function removeStyle($str) {
    $str = preg_replace('/\s*=\s*/', '=', $str);

    $HTML_Sanitizer_stripAttrib = '(style)';

    $str = stripslashes(preg_replace("/$HTML_Sanitizer_stripAttrib/i", 'forbidden', $str));

    return $str;
  }

  /**
   * Callback for PCRE.
   *
   * @access private
   * @param matches array
   *
   * @return string
   *
   * @see removeStyle
   */
  private function _removeStyleCallback($matches) {
    return '<' . $this->removeStyle($matches[1]) . '>';
  }

  /**
   * Remove dangerous HTML tags.
   *
   * @access private
   * @param string html code
   *
   * @return string filtered url
   */
  private function removeEvilTags($str) {
    $allowedTags = $this->_allowedTags;

    if ($this->_allowScript) {
      $allowedTags .= '<script>';
    }

    if ($this->_allowStyle) {
      $allowedTags .= '<style>';
    }

    if ($this->_allowObjects) {
      $allowedTags .= '<object><embed><applet><param>';
    }

    $allowedTags .= $this->_additionalTags;

    $str = strip_tags($str, $allowedTags);

    return $str;
  }

  /**
   * Sanitize HTML
   *  remove dangerous tags and attributes
   *  clean urls.
   *
   * @access public
   * @param string html code
   *
   * @return string sanitized html code
   */
  public function sanitize($html) {
    $html = $this->removeEvilTags($html);

    $html = $this->removeEvilAttributes($html);

    $html = $this->sanitizeHref($html);

    $html = $this->sanitizeSrc($html);

    return $html;
  }

}

/**
 *
 */
function html_sanitize($str) {
  static $san = NULL;

  if (empty($san)) {
    $san = new HTML_Sanitizer();
  }

  return $san->sanitize($str);
}

/**
 *
 */
function html_loose_sanitize($str) {
  static $san = NULL;

  if (empty($san)) {
    $san = new HTML_Sanitizer();
    $san->allowAll();
  }

  return $san->sanitize($str);

}