SickGear/lib/feedparser/sanitizer.py

# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from __future__ import annotations

import re

from .html import BaseHTMLProcessor
from .urls import make_safe_absolute_uri


class HTMLSanitizer(BaseHTMLProcessor):
    acceptable_elements = {
        "a",
        "abbr",
        "acronym",
        "address",
        "area",
        "article",
        "aside",
        "audio",
        "b",
        "big",
        "blockquote",
        "br",
        "button",
        "canvas",
        "caption",
        "center",
        "cite",
        "code",
        "col",
        "colgroup",
        "command",
        "datagrid",
        "datalist",
        "dd",
        "del",
        "details",
        "dfn",
        "dialog",
        "dir",
        "div",
        "dl",
        "dt",
        "em",
        "event-source",
        "fieldset",
        "figcaption",
        "figure",
        "font",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hr",
        "i",
        "img",
        "input",
        "ins",
        "kbd",
        "keygen",
        "label",
        "legend",
        "li",
        "m",
        "map",
        "menu",
        "meter",
        "multicol",
        "nav",
        "nextid",
        "noscript",
        "ol",
        "optgroup",
        "option",
        "output",
        "p",
        "pre",
        "progress",
        "q",
        "s",
        "samp",
        "section",
        "select",
        "small",
        "sound",
        "source",
        "spacer",
        "span",
        "strike",
        "strong",
        "sub",
        "sup",
        "table",
        "tbody",
        "td",
        "textarea",
        "tfoot",
        "th",
        "thead",
        "time",
        "tr",
        "tt",
        "u",
        "ul",
        "var",
        "video",
    }

    acceptable_attributes = {
        "abbr",
        "accept",
        "accept-charset",
        "accesskey",
        "action",
        "align",
        "alt",
        "autocomplete",
        "autofocus",
        "axis",
        "background",
        "balance",
        "bgcolor",
        "bgproperties",
        "border",
        "bordercolor",
        "bordercolordark",
        "bordercolorlight",
        "bottompadding",
        "cellpadding",
        "cellspacing",
        "ch",
        "challenge",
        "char",
        "charoff",
        "charset",
        "checked",
        "choff",
        "cite",
        "class",
        "clear",
        "color",
        "cols",
        "colspan",
        "compact",
        "contenteditable",
        "controls",
        "coords",
        "data",
        "datafld",
        "datapagesize",
        "datasrc",
        "datetime",
        "default",
        "delay",
        "dir",
        "disabled",
        "draggable",
        "dynsrc",
        "enctype",
        "end",
        "face",
        "for",
        "form",
        "frame",
        "galleryimg",
        "gutter",
        "headers",
        "height",
        "hidden",
        "hidefocus",
        "high",
        "href",
        "hreflang",
        "hspace",
        "icon",
        "id",
        "inputmode",
        "ismap",
        "keytype",
        "label",
        "lang",
        "leftspacing",
        "list",
        "longdesc",
        "loop",
        "loopcount",
        "loopend",
        "loopstart",
        "low",
        "lowsrc",
        "max",
        "maxlength",
        "media",
        "method",
        "min",
        "multiple",
        "name",
        "nohref",
        "noshade",
        "nowrap",
        "open",
        "optimum",
        "pattern",
        "ping",
        "point-size",
        "poster",
        "pqg",
        "preload",
        "prompt",
        "radiogroup",
        "readonly",
        "rel",
        "repeat-max",
        "repeat-min",
        "replace",
        "required",
        "rev",
        "rightspacing",
        "rows",
        "rowspan",
        "rules",
        "scope",
        "selected",
        "shape",
        "size",
        "span",
        "src",
        "start",
        "step",
        "style",
        "summary",
        "suppress",
        "tabindex",
        "target",
        "template",
        "title",
        "toppadding",
        "type",
        "unselectable",
        "urn",
        "usemap",
        "valign",
        "value",
        "variable",
        "volume",
        "vrml",
        "vspace",
        "width",
        "wrap",
        "xml:lang",
    }

    unacceptable_elements_with_end_tag = {
        "applet",
        "script",
        "style",
    }

    acceptable_css_properties = {
        "azimuth",
        "background-color",
        "border-bottom-color",
        "border-collapse",
        "border-color",
        "border-left-color",
        "border-right-color",
        "border-top-color",
        "clear",
        "color",
        "cursor",
        "direction",
        "display",
        "elevation",
        "float",
        "font",
        "font-family",
        "font-size",
        "font-style",
        "font-variant",
        "font-weight",
        "height",
        "letter-spacing",
        "line-height",
        "overflow",
        "pause",
        "pause-after",
        "pause-before",
        "pitch",
        "pitch-range",
        "richness",
        "speak",
        "speak-header",
        "speak-numeral",
        "speak-punctuation",
        "speech-rate",
        "stress",
        "text-align",
        "text-decoration",
        "text-indent",
        "unicode-bidi",
        "vertical-align",
        "voice-family",
        "volume",
        "white-space",
        "width",
    }

    # survey of common keywords found in feeds
    acceptable_css_keywords = {
        "!important",
        "aqua",
        "auto",
        "black",
        "block",
        "blue",
        "bold",
        "both",
        "bottom",
        "brown",
        "center",
        "collapse",
        "dashed",
        "dotted",
        "fuchsia",
        "gray",
        "green",
        "italic",
        "left",
        "lime",
        "maroon",
        "medium",
        "navy",
        "none",
        "normal",
        "nowrap",
        "olive",
        "pointer",
        "purple",
        "red",
        "right",
        "silver",
        "solid",
        "teal",
        "top",
        "transparent",
        "underline",
        "white",
        "yellow",
    }

    valid_css_values = re.compile(
        r"^("
        r"#[0-9a-f]+"  # Hex values
        r"|rgb\(\d+%?,\d*%?,?\d*%?\)?"  # RGB values
        r"|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?"  # Sizes/widths
        r")$"
    )

    mathml_elements = {
        "annotation",
        "annotation-xml",
        "maction",
        "maligngroup",
        "malignmark",
        "math",
        "menclose",
        "merror",
        "mfenced",
        "mfrac",
        "mglyph",
        "mi",
        "mlabeledtr",
        "mlongdiv",
        "mmultiscripts",
        "mn",
        "mo",
        "mover",
        "mpadded",
        "mphantom",
        "mprescripts",
        "mroot",
        "mrow",
        "ms",
        "mscarries",
        "mscarry",
        "msgroup",
        "msline",
        "mspace",
        "msqrt",
        "msrow",
        "mstack",
        "mstyle",
        "msub",
        "msubsup",
        "msup",
        "mtable",
        "mtd",
        "mtext",
        "mtr",
        "munder",
        "munderover",
        "none",
        "semantics",
    }

    mathml_attributes = {
        "accent",
        "accentunder",
        "actiontype",
        "align",
        "alignmentscope",
        "altimg",
        "altimg-height",
        "altimg-valign",
        "altimg-width",
        "alttext",
        "bevelled",
        "charalign",
        "close",
        "columnalign",
        "columnlines",
        "columnspacing",
        "columnspan",
        "columnwidth",
        "crossout",
        "decimalpoint",
        "denomalign",
        "depth",
        "dir",
        "display",
        "displaystyle",
        "edge",
        "encoding",
        "equalcolumns",
        "equalrows",
        "fence",
        "fontstyle",
        "fontweight",
        "form",
        "frame",
        "framespacing",
        "groupalign",
        "height",
        "href",
        "id",
        "indentalign",
        "indentalignfirst",
        "indentalignlast",
        "indentshift",
        "indentshiftfirst",
        "indentshiftlast",
        "indenttarget",
        "infixlinebreakstyle",
        "largeop",
        "length",
        "linebreak",
        "linebreakmultchar",
        "linebreakstyle",
        "lineleading",
        "linethickness",
        "location",
        "longdivstyle",
        "lquote",
        "lspace",
        "mathbackground",
        "mathcolor",
        "mathsize",
        "mathvariant",
        "maxsize",
        "minlabelspacing",
        "minsize",
        "movablelimits",
        "notation",
        "numalign",
        "open",
        "other",
        "overflow",
        "position",
        "rowalign",
        "rowlines",
        "rowspacing",
        "rowspan",
        "rquote",
        "rspace",
        "scriptlevel",
        "scriptminsize",
        "scriptsizemultiplier",
        "selection",
        "separator",
        "separators",
        "shift",
        "side",
        "src",
        "stackalign",
        "stretchy",
        "subscriptshift",
        "superscriptshift",
        "symmetric",
        "voffset",
        "width",
        "xlink:href",
        "xlink:show",
        "xlink:type",
        "xmlns",
        "xmlns:xlink",
    }

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = {
        "a",
        "animate",
        "animateColor",
        "animateMotion",
        "animateTransform",
        "circle",
        "defs",
        "desc",
        "ellipse",
        "font-face",
        "font-face-name",
        "font-face-src",
        "foreignObject",
        "g",
        "glyph",
        "hkern",
        "line",
        "linearGradient",
        "marker",
        "metadata",
        "missing-glyph",
        "mpath",
        "path",
        "polygon",
        "polyline",
        "radialGradient",
        "rect",
        "set",
        "stop",
        "svg",
        "switch",
        "text",
        "title",
        "tspan",
        "use",
    }

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = {
        "accent-height",
        "accumulate",
        "additive",
        "alphabetic",
        "arabic-form",
        "ascent",
        "attributeName",
        "attributeType",
        "baseProfile",
        "bbox",
        "begin",
        "by",
        "calcMode",
        "cap-height",
        "class",
        "color",
        "color-rendering",
        "content",
        "cx",
        "cy",
        "d",
        "descent",
        "display",
        "dur",
        "dx",
        "dy",
        "end",
        "fill",
        "fill-opacity",
        "fill-rule",
        "font-family",
        "font-size",
        "font-stretch",
        "font-style",
        "font-variant",
        "font-weight",
        "from",
        "fx",
        "fy",
        "g1",
        "g2",
        "glyph-name",
        "gradientUnits",
        "hanging",
        "height",
        "horiz-adv-x",
        "horiz-origin-x",
        "id",
        "ideographic",
        "k",
        "keyPoints",
        "keySplines",
        "keyTimes",
        "lang",
        "marker-end",
        "marker-mid",
        "marker-start",
        "markerHeight",
        "markerUnits",
        "markerWidth",
        "mathematical",
        "max",
        "min",
        "name",
        "offset",
        "opacity",
        "orient",
        "origin",
        "overline-position",
        "overline-thickness",
        "panose-1",
        "path",
        "pathLength",
        "points",
        "preserveAspectRatio",
        "r",
        "refX",
        "refY",
        "repeatCount",
        "repeatDur",
        "requiredExtensions",
        "requiredFeatures",
        "restart",
        "rotate",
        "rx",
        "ry",
        "slope",
        "stemh",
        "stemv",
        "stop-color",
        "stop-opacity",
        "strikethrough-position",
        "strikethrough-thickness",
        "stroke",
        "stroke-dasharray",
        "stroke-dashoffset",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-miterlimit",
        "stroke-opacity",
        "stroke-width",
        "systemLanguage",
        "target",
        "text-anchor",
        "to",
        "transform",
        "type",
        "u1",
        "u2",
        "underline-position",
        "underline-thickness",
        "unicode",
        "unicode-range",
        "units-per-em",
        "values",
        "version",
        "viewBox",
        "visibility",
        "width",
        "widths",
        "x",
        "x-height",
        "x1",
        "x2",
        "xlink:actuate",
        "xlink:arcrole",
        "xlink:href",
        "xlink:role",
        "xlink:show",
        "xlink:title",
        "xlink:type",
        "xml:base",
        "xml:lang",
        "xml:space",
        "xmlns",
        "xmlns:xlink",
        "y",
        "y1",
        "y2",
        "zoomAndPan",
    }

    # Lookup tables from lowercase SVG attribute / element names back to their
    # original mixed-case spelling. They start out as None and are filled in by
    # unknown_starttag() the first time an SVG element is processed.
    svg_attr_map = None
    svg_elem_map = None

    acceptable_svg_properties = {
        "fill",
        "fill-opacity",
        "fill-rule",
        "stroke",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-opacity",
        "stroke-width",
    }

    def __init__(self, encoding=None, _type="application/xhtml+xml"):
        """Initialize the base processor, then zero the sanitizer counters.

        ``encoding`` and ``_type`` are passed through unchanged to the parent
        initializer.
        """
        super().__init__(encoding, _type)

        # unacceptablestack: depth of open elements whose text is discarded.
        # mathmlOK / svgOK: depth of open <math> / <svg> roots.
        self.unacceptablestack = self.mathmlOK = self.svgOK = 0

    def reset(self):
        """Reset the base processor and clear all sanitizer depth counters."""
        super().reset()
        for counter in ("unacceptablestack", "mathmlOK", "svgOK"):
            setattr(self, counter, 0)

    def unknown_starttag(self, tag, attrs):
        """Sanitize a start tag and forward it to the base class, or drop it.

        ``tag`` is the element name and ``attrs`` a list of ``(name, value)``
        pairs. ``attrs`` may be extended in place with implicit ``xmlns`` /
        ``xmlns:xlink`` declarations.

        Elements outside ``acceptable_elements`` are only kept when they are
        MathML or SVG elements inside a properly namespaced ``<math>`` /
        ``<svg>`` root; anything else returns early without emitting a tag.
        Attributes are filtered against the allow-list that matches the
        element's vocabulary, ``style`` values go through ``sanitize_style``
        and ``href`` values through ``make_safe_absolute_uri``.
        """
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if tag not in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith("html") and not dict(attrs).get("xmlns"):
                if tag == "svg":
                    attrs.append(("xmlns", "http://www.w3.org/2000/svg"))
                if tag == "math":
                    attrs.append(("xmlns", "http://www.w3.org/1998/Math/MathML"))

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if (
                tag == "math"
                and ("xmlns", "http://www.w3.org/1998/Math/MathML") in attrs
            ):
                self.mathmlOK += 1
            if tag == "svg" and ("xmlns", "http://www.w3.org/2000/svg") in attrs:
                self.svgOK += 1

            # chose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # For most vocabularies, lowercasing is a good idea. Many
                # svg elements, however, are camel case.
                if self.svg_attr_map is None:
                    # Build the lowercase -> mixed-case maps once per instance
                    # and keep the lowercased allow-lists as sets so that
                    # membership tests stay constant-time.
                    self.svg_attr_map = {
                        a.lower(): a for a in self.svg_attributes if a != a.lower()
                    }
                    self.svg_attributes = {a.lower() for a in self.svg_attributes}

                    self.svg_elem_map = {
                        e.lower(): e for e in self.svg_elements if e != e.lower()
                    }
                    self.svg_elements = {e.lower() for e in self.svg_elements}
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif tag not in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            xlink_ns = ("xmlns:xlink", "http://www.w3.org/1999/xlink")
            uses_xlink = any(a[0].startswith("xlink:") for a in attrs)
            if uses_xlink and xlink_ns not in attrs:
                attrs.append(xlink_ns)

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key == "style" and "style" in acceptable_attributes:
                clean_value = self.sanitize_style(value)
                if clean_value:
                    clean_attrs.append((key, clean_value))
            elif key in acceptable_attributes:
                # restore the mixed-case spelling of SVG attribute names
                key = keymap.get(key, key)
                # make sure the uri uses an acceptable uri scheme
                # NOTE(review): only ``href`` is scheme-checked here; other
                # URI-bearing attributes on the allow-lists (e.g. ``src``,
                # ``action``, ``xlink:href``) pass through unchanged --
                # confirm they are vetted elsewhere.
                if key == "href":
                    value = make_safe_absolute_uri(value)
                clean_attrs.append((key, value))
        super().unknown_starttag(tag, clean_attrs)

    def unknown_endtag(self, tag):
        """Forward an end tag to the base class, or drop it.

        End tags of elements outside ``acceptable_elements`` are forwarded
        only when they close a MathML or SVG element while the corresponding
        ``<math>`` / ``<svg>`` root is open; every other such end tag is
        dropped. The depth counters maintained by ``unknown_starttag`` are
        unwound here.
        """
        if tag not in self.acceptable_elements:
            # Only unwind a forbidden element that was actually opened. A
            # stray end tag must not push the depth below zero: a negative
            # depth suppresses all following text in handle_data(), and the
            # next real start tag would bring it back to zero, letting the
            # forbidden element's own text through.
            if (
                tag in self.unacceptable_elements_with_end_tag
                and self.unacceptablestack > 0
            ):
                self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == "math":
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                # restore the mixed-case spelling of the SVG element name
                tag = self.svg_elem_map.get(tag, tag)
                if tag == "svg":
                    self.svgOK -= 1
            else:
                return
        super().unknown_endtag(tag)

    def handle_pi(self, text):
        """Ignore processing instructions; ``text`` is not passed to the base class."""
        return None

    def handle_decl(self, text):
        """Ignore declarations; ``text`` is not passed to the base class."""
        return None

    def handle_data(self, text):
        """Pass text to the base class unless a forbidden element is open."""
        if self.unacceptablestack:
            # inside <script>/<style>/<applet>: discard the text
            return
        super().handle_data(text)

    def sanitize_style(self, style):
        """Return a cleaned copy of an inline ``style`` attribute value.

        ``url(...)`` references are blanked out first. The whole value is then
        rejected (``""`` is returned) if it contains characters outside a
        conservative allow-list, or any text that is not part of a
        ``property: value`` declaration. Of the remaining declarations, one is
        kept only when

        * its property is in ``acceptable_css_properties``; or
        * its property starts with ``background``, ``border``, ``margin`` or
          ``padding`` and every whitespace-separated token of the value is in
          ``acceptable_css_keywords`` or matches ``valid_css_values``; or
        * SVG content is currently open and the property is in
          ``acceptable_svg_properties``.

        Property names and shorthand value tokens are compared
        case-insensitively; kept declarations retain their original spelling.
        Surrounding whitespace is trimmed from each value. The result is the
        kept declarations joined as ``"prop: value;"`` separated by spaces.
        """
        # disallow urls
        style = re.sub(r"url\s*\(\s*[^\s)]+?\s*\)\s*", " ", style)

        # gauntlet
        if not re.match(
            r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""",
            style,
        ):
            return ""
        # This replaced a regexp that used re.match and was prone to
        # pathological back-tracking.
        if re.sub(r"\s*[-\w]+\s*:\s*[^:;]*;?", "", style).strip():
            return ""

        shorthand_prefixes = ("background", "border", "margin", "padding")
        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            # the capture group can carry trailing whitespace ("red ;")
            value = value.strip()
            if not value:
                continue
            name = prop.lower()
            if name in self.acceptable_css_properties:
                keep = True
            elif name.split("-")[0] in shorthand_prefixes:
                # Shorthand properties: vet every token of the value. Tokens
                # are lower-cased for the check only, so "SOLID" or "#FFF"
                # are treated like "solid" / "#fff".
                keep = all(
                    token in self.acceptable_css_keywords
                    or self.valid_css_values.match(token)
                    for token in value.lower().split()
                )
            else:
                keep = self.svgOK and name in self.acceptable_svg_properties
            if keep:
                clean.append(f"{prop}: {value};")

        return " ".join(clean)

    def parse_comment(self, i, report=1):
        """Parse the comment starting at ``i`` and return the index after it.

        When the base class cannot find the end of the comment (negative
        result), fall back to the next ``--...>`` sequence; if there is none,
        the rest of the buffer is consumed.
        """
        end = super().parse_comment(i, report)
        if end < 0:
            # A negative result may be a malicious attempt to circumvent
            # sanitization, or a page-destroying unclosed comment.
            closer = re.compile(r"--[^>]*>").search(self.rawdata, i + 4)
            # unclosed comment: deliberately swallow everything that is left
            # so that it never reaches handle_data()
            end = closer.end() if closer else len(self.rawdata)
        return end


def sanitize_html(html_source, encoding, _type):
    """Run ``html_source`` through ``HTMLSanitizer`` and return the result.

    ``encoding`` and ``_type`` are handed to the sanitizer unchanged. The
    output is stripped of surrounding whitespace and has ``\\r\\n`` line
    endings converted to ``\\n``.
    """
    sanitizer = HTMLSanitizer(encoding, _type)
    # Escape CDATA openers so they reach the parser as plain text.
    sanitizer.feed(html_source.replace("<![CDATA[", "&lt;![CDATA["))
    return sanitizer.output().strip().replace("\r\n", "\n")


# Match XML entity declarations.
# Example: <!ENTITY copyright "(C)">
RE_ENTITY_PATTERN = re.compile(rb"^\s*<!ENTITY([^>]*?)>", re.MULTILINE)

# Match XML DOCTYPE declarations.
# Example: <!DOCTYPE feed [ ]>
RE_DOCTYPE_PATTERN = re.compile(rb"^\s*<!DOCTYPE([^>]*?)>", re.MULTILINE)

# Match safe entity declarations.
# This will allow hexadecimal character references through,
# as well as text, but not arbitrary nested entities.
# Example: cubed "&#179;"
# Example: copyright "(C)"
# Forbidden: explode1 "&explode2;&explode2;"
RE_SAFE_ENTITY_PATTERN = re.compile(rb'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')


def replace_doctype(data: bytes) -> tuple[str | None, bytes, dict[str, str]]:
    """Strip and replaces the DOCTYPE.

    One RSS format -- Netscape's RSS 0.91 -- is identified within the XML declaration.
    Therefore, this function must identify that version while replacing the DOCTYPE.

    As a convenience to the loose XML parser, entities are pre-computed and returned.

    The tuple that is returned has the following values, in order:

    1.  The version extracted from the XML DOCTYPE.
        The value will either be "rss091n" or None.
    2.  Binary XML content with a replaced DOCTYPE.
    3.  A dictionary of entities and replacements.
    """

    # Verify this looks like an XML feed.
    if not re.match(rb"^\s*<", data):
        return None, data, {}

    # Divide the document into two groups by finding the location
    # of the first element that doesn't begin with '<?' or '<!'.
    match = re.search(rb"<\w", data)
    first_element = match.start() + 1 if match is not None else 0
    head, data = data[:first_element], data[first_element:]

    # Save, and then remove, any ENTITY declarations.
    entity_results = RE_ENTITY_PATTERN.findall(head)
    head = RE_ENTITY_PATTERN.sub(b"", head)

    # Find the DOCTYPE declaration and check the feed type.
    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
    doctype = doctype_results and doctype_results[0] or b""
    if b"netscape" in doctype.lower():
        version = "rss091n"
    else:
        version = None

    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
    replacement = b""
    if len(doctype_results) == 1 and entity_results:
        safe_entities = [e for e in entity_results if RE_SAFE_ENTITY_PATTERN.match(e)]
        if safe_entities:
            replacement = (
                b"<!DOCTYPE feed [\n<!ENTITY"
                + b">\n<!ENTITY ".join(safe_entities)
                + b">\n]>"
            )
    data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data

    # Precompute the safe entities for the loose parser.
    entities = {
        k.decode("utf-8"): v.decode("utf-8")
        for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)
    }
    return version, data, entities