# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # # This file is a part of feedparser. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
from __future__ import annotations

import re

from .html import BaseHTMLProcessor
from .urls import make_safe_absolute_uri

# Patterns used by HTMLSanitizer.  They are compiled once at import time
# instead of on every sanitize_style() / parse_comment() call.
_RE_CSS_URL = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*")
_RE_CSS_GAUNTLET = re.compile(
    r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$"""
)
_RE_CSS_DECLARATION = re.compile(r"\s*[-\w]+\s*:\s*[^:;]*;?")
_RE_CSS_PROPERTY_VALUE = re.compile(r"([-\w]+)\s*:\s*([^:;]*)")
_RE_COMMENT_END = re.compile(r"--[^>]*>")

# Property families whose values are accepted only if every
# whitespace-separated token is a known keyword or a valid CSS value.
_CSS_SHORTHAND_PREFIXES = frozenset({"background", "border", "margin", "padding"})

# Attributes whose value is a URI and must use an acceptable URI scheme.
_URI_ATTRIBUTES = frozenset({"href", "xlink:href"})


class HTMLSanitizer(BaseHTMLProcessor):
    """Whitelist-based sanitizer for HTML with inline MathML and SVG.

    Elements and attributes that are not whitelisted are dropped.  Text inside
    the elements named in ``unacceptable_elements_with_end_tag`` is dropped as
    well.  ``style`` attribute values are filtered by :meth:`sanitize_style`.

    The whitelists are plain class attributes so they can be inspected or
    adjusted before an instance is used.
    """

    acceptable_elements = {
        "a", "abbr", "acronym", "address", "area", "article", "aside", "audio",
        "b", "big", "blockquote", "br", "button", "canvas", "caption", "center",
        "cite", "code", "col", "colgroup", "command", "datagrid", "datalist",
        "dd", "del", "details", "dfn", "dialog", "dir", "div", "dl", "dt", "em",
        "event-source", "fieldset", "figcaption", "figure", "font", "footer",
        "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "i", "img",
        "input", "ins", "kbd", "keygen", "label", "legend", "li", "m", "map",
        "menu", "meter", "multicol", "nav", "nextid", "noscript", "ol",
        "optgroup", "option", "output", "p", "pre", "progress", "q", "s",
        "samp", "section", "select", "small", "sound", "source", "spacer",
        "span", "strike", "strong", "sub", "sup", "table", "tbody", "td",
        "textarea", "tfoot", "th", "thead", "time", "tr", "tt", "u", "ul",
        "var", "video",
    }

    acceptable_attributes = {
        "abbr", "accept", "accept-charset", "accesskey", "action", "align",
        "alt", "autocomplete", "autofocus", "axis", "background", "balance",
        "bgcolor", "bgproperties", "border", "bordercolor", "bordercolordark",
        "bordercolorlight", "bottompadding", "cellpadding", "cellspacing",
        "ch", "challenge", "char", "charoff", "charset", "checked", "choff",
        "cite", "class", "clear", "color", "cols", "colspan", "compact",
        "contenteditable", "controls", "coords", "data", "datafld",
        "datapagesize", "datasrc", "datetime", "default", "delay", "dir",
        "disabled", "draggable", "dynsrc", "enctype", "end", "face", "for",
        "form", "frame", "galleryimg", "gutter", "headers", "height", "hidden",
        "hidefocus", "high", "href", "hreflang", "hspace", "icon", "id",
        "inputmode", "ismap", "keytype", "label", "lang", "leftspacing",
        "list", "longdesc", "loop", "loopcount", "loopend", "loopstart", "low",
        "lowsrc", "max", "maxlength", "media", "method", "min", "multiple",
        "name", "nohref", "noshade", "nowrap", "open", "optimum", "pattern",
        "ping", "point-size", "poster", "pqg", "preload", "prompt",
        "radiogroup", "readonly", "rel", "repeat-max", "repeat-min", "replace",
        "required", "rev", "rightspacing", "rows", "rowspan", "rules", "scope",
        "selected", "shape", "size", "span", "src", "start", "step", "style",
        "summary", "suppress", "tabindex", "target", "template", "title",
        "toppadding", "type", "unselectable", "urn", "usemap", "valign",
        "value", "variable", "volume", "vrml", "vspace", "width", "wrap",
        "xml:lang",
    }

    # Elements whose text content is suppressed along with the tags.
    unacceptable_elements_with_end_tag = {
        "applet",
        "script",
        "style",
    }

    acceptable_css_properties = {
        "azimuth", "background-color", "border-bottom-color",
        "border-collapse", "border-color", "border-left-color",
        "border-right-color", "border-top-color", "clear", "color", "cursor",
        "direction", "display", "elevation", "float", "font", "font-family",
        "font-size", "font-style", "font-variant", "font-weight", "height",
        "letter-spacing", "line-height", "overflow", "pause", "pause-after",
        "pause-before", "pitch", "pitch-range", "richness", "speak",
        "speak-header", "speak-numeral", "speak-punctuation", "speech-rate",
        "stress", "text-align", "text-decoration", "text-indent",
        "unicode-bidi", "vertical-align", "voice-family", "volume",
        "white-space", "width",
    }

    # survey of common keywords found in feeds
    acceptable_css_keywords = {
        "!important", "aqua", "auto", "black", "block", "blue", "bold", "both",
        "bottom", "brown", "center", "collapse", "dashed", "dotted", "fuchsia",
        "gray", "green", "italic", "left", "lime", "maroon", "medium", "navy",
        "none", "normal", "nowrap", "olive", "pointer", "purple", "red",
        "right", "silver", "solid", "teal", "top", "transparent", "underline",
        "white", "yellow",
    }

    valid_css_values = re.compile(
        r"^("
        r"#[0-9a-f]+"  # Hex values
        r"|rgb\(\d+%?,\d*%?,?\d*%?\)?"  # RGB values
        r"|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?"  # Sizes/widths
        r")$"
    )

    mathml_elements = {
        "annotation", "annotation-xml", "maction", "maligngroup",
        "malignmark", "math", "menclose", "merror", "mfenced", "mfrac",
        "mglyph", "mi", "mlabeledtr", "mlongdiv", "mmultiscripts", "mn", "mo",
        "mover", "mpadded", "mphantom", "mprescripts", "mroot", "mrow", "ms",
        "mscarries", "mscarry", "msgroup", "msline", "mspace", "msqrt",
        "msrow", "mstack", "mstyle", "msub", "msubsup", "msup", "mtable",
        "mtd", "mtext", "mtr", "munder", "munderover", "none", "semantics",
    }

    mathml_attributes = {
        "accent", "accentunder", "actiontype", "align", "alignmentscope",
        "altimg", "altimg-height", "altimg-valign", "altimg-width", "alttext",
        "bevelled", "charalign", "close", "columnalign", "columnlines",
        "columnspacing", "columnspan", "columnwidth", "crossout",
        "decimalpoint", "denomalign", "depth", "dir", "display",
        "displaystyle", "edge", "encoding", "equalcolumns", "equalrows",
        "fence", "fontstyle", "fontweight", "form", "frame", "framespacing",
        "groupalign", "height", "href", "id", "indentalign",
        "indentalignfirst", "indentalignlast", "indentshift",
        "indentshiftfirst", "indentshiftlast", "indenttarget",
        "infixlinebreakstyle", "largeop", "length", "linebreak",
        "linebreakmultchar", "linebreakstyle", "lineleading", "linethickness",
        "location", "longdivstyle", "lquote", "lspace", "mathbackground",
        "mathcolor", "mathsize", "mathvariant", "maxsize", "minlabelspacing",
        "minsize", "movablelimits", "notation", "numalign", "open", "other",
        "overflow", "position", "rowalign", "rowlines", "rowspacing",
        "rowspan", "rquote", "rspace", "scriptlevel", "scriptminsize",
        "scriptsizemultiplier", "selection", "separator", "separators",
        "shift", "side", "src", "stackalign", "stretchy", "subscriptshift",
        "superscriptshift", "symmetric", "voffset", "width", "xlink:href",
        "xlink:show", "xlink:type", "xmlns", "xmlns:xlink",
    }

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = {
        "a", "animate", "animateColor", "animateMotion", "animateTransform",
        "circle", "defs", "desc", "ellipse", "font-face", "font-face-name",
        "font-face-src", "foreignObject", "g", "glyph", "hkern", "line",
        "linearGradient", "marker", "metadata", "missing-glyph", "mpath",
        "path", "polygon", "polyline", "radialGradient", "rect", "set", "stop",
        "svg", "switch", "text", "title", "tspan", "use",
    }

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = {
        "accent-height", "accumulate", "additive", "alphabetic", "arabic-form",
        "ascent", "attributeName", "attributeType", "baseProfile", "bbox",
        "begin", "by", "calcMode", "cap-height", "class", "color",
        "color-rendering", "content", "cx", "cy", "d", "descent", "display",
        "dur", "dx", "dy", "end", "fill", "fill-opacity", "fill-rule",
        "font-family", "font-size", "font-stretch", "font-style",
        "font-variant", "font-weight", "from", "fx", "fy", "g1", "g2",
        "glyph-name", "gradientUnits", "hanging", "height", "horiz-adv-x",
        "horiz-origin-x", "id", "ideographic", "k", "keyPoints", "keySplines",
        "keyTimes", "lang", "marker-end", "marker-mid", "marker-start",
        "markerHeight", "markerUnits", "markerWidth", "mathematical", "max",
        "min", "name", "offset", "opacity", "orient", "origin",
        "overline-position", "overline-thickness", "panose-1", "path",
        "pathLength", "points", "preserveAspectRatio", "r", "refX", "refY",
        "repeatCount", "repeatDur", "requiredExtensions", "requiredFeatures",
        "restart", "rotate", "rx", "ry", "slope", "stemh", "stemv",
        "stop-color", "stop-opacity", "strikethrough-position",
        "strikethrough-thickness", "stroke", "stroke-dasharray",
        "stroke-dashoffset", "stroke-linecap", "stroke-linejoin",
        "stroke-miterlimit", "stroke-opacity", "stroke-width",
        "systemLanguage", "target", "text-anchor", "to", "transform", "type",
        "u1", "u2", "underline-position", "underline-thickness", "unicode",
        "unicode-range", "units-per-em", "values", "version", "viewBox",
        "visibility", "width", "widths", "x", "x-height", "x1", "x2",
        "xlink:actuate", "xlink:arcrole", "xlink:href", "xlink:role",
        "xlink:show", "xlink:title", "xlink:type", "xml:base", "xml:lang",
        "xml:space", "xmlns", "xmlns:xlink", "y", "y1", "y2", "zoomAndPan",
    }

    # Lower-case -> camelCase lookup tables for SVG names.  They are built
    # lazily, per instance, the first time an SVG element is accepted.
    svg_attr_map = None
    svg_elem_map = None

    acceptable_svg_properties = {
        "fill",
        "fill-opacity",
        "fill-rule",
        "stroke",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-opacity",
        "stroke-width",
    }

    def __init__(self, encoding=None, _type="application/xhtml+xml"):
        """Create a sanitizer; both arguments are forwarded to the base class."""
        super().__init__(encoding, _type)
        # Depth of currently open script/style/applet elements.
        self.unacceptablestack = 0
        # Depth of currently open, correctly namespaced <math> / <svg> elements.
        self.mathmlOK = 0
        self.svgOK = 0

    def reset(self):
        """Reset the base parser and clear all nesting counters."""
        super().reset()
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def _build_svg_case_maps(self):
        """Replace the SVG whitelists with lower-cased sets and build case maps.

        After this call ``svg_attributes`` / ``svg_elements`` on the instance
        hold only lower-case names, and ``svg_attr_map`` / ``svg_elem_map`` map
        each lower-cased name back to its original mixed-case spelling.
        """
        lower_attrs = {name.lower() for name in self.svg_attributes}
        mixed_attrs = [name for name in self.svg_attributes if name not in lower_attrs]
        self.svg_attributes = lower_attrs
        self.svg_attr_map = {name.lower(): name for name in mixed_attrs}

        lower_elems = {name.lower() for name in self.svg_elements}
        mixed_elems = [name for name in self.svg_elements if name not in lower_elems]
        self.svg_elements = lower_elems
        self.svg_elem_map = {name.lower(): name for name in mixed_elems}

    def unknown_starttag(self, tag, attrs):
        """Filter a start tag and forward it to the base class if acceptable.

        ``attrs`` is a list of ``(name, value)`` tuples.  It may be extended in
        place with implicit ``xmlns`` / ``xmlns:xlink`` declarations.  Tags that
        are neither whitelisted HTML nor MathML/SVG inside a properly
        namespaced ``<math>`` / ``<svg>`` are dropped silently.
        """
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if tag not in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith("html"):
                if not dict(attrs).get("xmlns"):
                    if tag == "svg":
                        attrs.append(("xmlns", "http://www.w3.org/2000/svg"))
                    if tag == "math":
                        attrs.append(("xmlns", "http://www.w3.org/1998/Math/MathML"))

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if (
                tag == "math"
                and ("xmlns", "http://www.w3.org/1998/Math/MathML") in attrs
            ):
                self.mathmlOK += 1
            if tag == "svg" and ("xmlns", "http://www.w3.org/2000/svg") in attrs:
                self.svgOK += 1

            # chose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # For most vocabularies, lowercasing is a good idea.  Many
                # svg elements, however, are camel case.
                if not self.svg_attr_map:
                    self._build_svg_case_maps()
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif tag not in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            if any(name.startswith("xlink:") for name, *_ in attrs):
                if ("xmlns:xlink", "http://www.w3.org/1999/xlink") not in attrs:
                    attrs.append(("xmlns:xlink", "http://www.w3.org/1999/xlink"))

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key == "style" and "style" in acceptable_attributes:
                clean_value = self.sanitize_style(value)
                if clean_value:
                    clean_attrs.append((key, clean_value))
            elif key in acceptable_attributes:
                # Restore the camelCase spelling of SVG attribute names.
                key = keymap.get(key, key)
                # make sure the uri uses an acceptable uri scheme; xlink:href
                # is whitelisted for SVG and MathML and is a link target just
                # like href, so it gets the same check.
                if key in _URI_ATTRIBUTES:
                    value = make_safe_absolute_uri(value)
                clean_attrs.append((key, value))
        super().unknown_starttag(tag, clean_attrs)

    def unknown_endtag(self, tag):
        """Filter an end tag and forward it to the base class if acceptable."""
        if tag not in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                # Clamp at zero: a stray end tag must not drive the counter
                # negative, because handle_data() treats any non-zero value
                # as "inside an unacceptable element".
                if self.unacceptablestack > 0:
                    self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == "math" and self.mathmlOK:
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag, tag)
                if tag == "svg" and self.svgOK:
                    self.svgOK -= 1
            else:
                return
        super().unknown_endtag(tag)

    def handle_pi(self, text):
        """Discard processing instructions."""

    def handle_decl(self, text):
        """Discard declarations."""

    def handle_data(self, text):
        """Forward text unless it sits inside a script/style/applet element."""
        if not self.unacceptablestack:
            super().handle_data(text)

    def sanitize_style(self, style):
        """Return the acceptable declarations of a ``style`` attribute value.

        ``url(...)`` references are removed first.  The whole value is
        rejected (``""`` is returned) if it contains characters outside a
        conservative set or text that is not a ``property: value`` declaration.
        Otherwise each declaration is kept only if its property is whitelisted,
        or belongs to the background/border/margin/padding families with every
        value token acceptable, or is an SVG property while inside ``<svg>``.
        Kept declarations are re-joined as ``"prop: value;"`` separated by a
        single space.
        """
        # disallow urls
        style = _RE_CSS_URL.sub(" ", style)

        # gauntlet
        if not _RE_CSS_GAUNTLET.match(style):
            return ""
        # This replaced a regexp that used re.match and was prone to
        # pathological back-tracking.
        if _RE_CSS_DECLARATION.sub("", style).strip():
            return ""

        clean = []
        for prop, value in _RE_CSS_PROPERTY_VALUE.findall(style):
            if not value:
                continue
            name = prop.lower()
            declaration = prop + ": " + value + ";"
            if name in self.acceptable_css_properties:
                clean.append(declaration)
            elif name.split("-")[0] in _CSS_SHORTHAND_PREFIXES:
                if all(
                    keyword in self.acceptable_css_keywords
                    or self.valid_css_values.match(keyword)
                    for keyword in value.split()
                ):
                    clean.append(declaration)
            elif self.svgOK and name in self.acceptable_svg_properties:
                clean.append(declaration)

        return " ".join(clean)

    def parse_comment(self, i, report=1):
        """Parse a comment at offset ``i``, tolerating malformed terminators.

        Returns the base-class result when it is non-negative.  Otherwise the
        raw data is searched from ``i + 4`` for ``--`` followed by the next
        ``>``; the offset just past that match is returned, or the length of
        the raw data when nothing matches, so the remainder is swallowed.
        """
        ret = super().parse_comment(i, report)
        if ret >= 0:
            return ret
        # if ret == -1, this may be a malicious attempt to circumvent
        # sanitization, or a page-destroying unclosed comment
        match = _RE_COMMENT_END.search(self.rawdata, i + 4)
        if match:
            return match.end()
        # unclosed comment; deliberately fail to handle_data()
        return len(self.rawdata)


def sanitize_html(html_source, encoding, _type):
    """Run ``html_source`` through :class:`HTMLSanitizer` and return the result.

    NOTE(review): everything after the sanitizer is constructed was lost to
    markup stripping in the reviewed text and has been reconstructed here;
    confirm the body against version control.
    """
    p = HTMLSanitizer(encoding, _type)
    # Neutralize CDATA openers so the parser treats them as plain text.
    html_source = html_source.replace("<![CDATA[", "&lt;![CDATA[")
    p.feed(html_source)
    data = p.output()
    data = data.strip().replace("\r\n", "\n")
    return data


# NOTE(review): the "<!ENTITY" / "<!DOCTYPE" portions of the two patterns and
# their example comments were stripped from the reviewed text and have been
# reconstructed; confirm against version control.

# Match XML entity declarations.
# Example: <!ENTITY copyright "(C)">
RE_ENTITY_PATTERN = re.compile(rb"^\s*<!ENTITY([^>]*?)>", re.MULTILINE)

# Match XML DOCTYPE declarations.
# Example: <!DOCTYPE feed [ ]>
RE_DOCTYPE_PATTERN = re.compile(rb"^\s*<!DOCTYPE([^>]*?)>", re.MULTILINE)

# Match safe entity declarations.
# This will allow hexadecimal character references through,
# as well as text, but not arbitrary nested entities.
# Example: cubed "&#179;"
# Example: copyright "(C)"
# Forbidden: explode1 "&explode2;&explode2;"
RE_SAFE_ENTITY_PATTERN = re.compile(rb'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')


def replace_doctype(data: bytes) -> tuple[str | None, bytes, dict[str, str]]:
    """Strip and replace the DOCTYPE.

    One RSS format -- Netscape's RSS 0.91 -- is identified by its DOCTYPE
    declaration.  Therefore, this function must identify that version while
    replacing the DOCTYPE.

    As a convenience to the loose XML parser, entities are pre-computed and
    returned.

    The tuple that is returned has the following values, in order:

    1.  The version extracted from the XML DOCTYPE.
        The value will either be "rss091n" or None.
    2.  Binary XML content with a replaced DOCTYPE.
    3.  A dictionary of entities and replacements.

    NOTE(review): the statements between the head/body split and the
    construction of the replacement DOCTYPE were stripped from the reviewed
    text and have been reconstructed; confirm against version control.
    """

    # Verify this looks like an XML feed.
    if not re.match(rb"^\s*<", data):
        return None, data, {}

    # Divide the document into two groups by finding the location
    # of the first element that doesn't begin with '<?' or '<!'.
    match = re.search(rb"<\w", data)
    first_element = match.start() + 1 if match is not None else len(data)
    head, data = data[:first_element], data[first_element:]

    # Save, and then remove, any ENTITY declarations.
    entity_results = RE_ENTITY_PATTERN.findall(head)
    head = RE_ENTITY_PATTERN.sub(b"", head)

    # Find the DOCTYPE declaration and check the feed type.
    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
    doctype = doctype_results[0] if doctype_results else b""
    version = "rss091n" if b"netscape" in doctype.lower() else None

    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
    replacement = b""
    if len(doctype_results) == 1 and entity_results:
        safe_entities = [
            entity for entity in entity_results if RE_SAFE_ENTITY_PATTERN.match(entity)
        ]
        if safe_entities:
            # Each captured entity keeps its own leading whitespace, so the
            # first "<!ENTITY" needs no trailing space.
            replacement = (
                b"<!DOCTYPE feed [\n<!ENTITY"
                + b">\n<!ENTITY ".join(safe_entities)
                + b">\n]>"
            )
    data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data

    # Precompute the safe entities for the loose parser.
    entities = {
        k.decode("utf-8"): v.decode("utf-8")
        for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)
    }
    return version, data, entities