SickGear/lib/feedparser/sanitizer.py

974 lines
24 KiB
Python

# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from __future__ import annotations
import re
from .html import BaseHTMLProcessor
from .urls import make_safe_absolute_uri
class HTMLSanitizer(BaseHTMLProcessor):
    """Allow-list based sanitizer for HTML, inline MathML and inline SVG.

    Start and end tags are forwarded to the base processor only when the
    element is in one of the allow-lists below; attributes are filtered the
    same way, ``style`` values are rewritten by :meth:`sanitize_style`, and
    ``href`` values are passed through ``make_safe_absolute_uri``.  Text that
    appears inside ``applet``, ``script`` or ``style`` elements is dropped,
    as are processing instructions and declarations.
    """

    # HTML elements that are passed through.
    acceptable_elements = {
        "a",
        "abbr",
        "acronym",
        "address",
        "area",
        "article",
        "aside",
        "audio",
        "b",
        "big",
        "blockquote",
        "br",
        "button",
        "canvas",
        "caption",
        "center",
        "cite",
        "code",
        "col",
        "colgroup",
        "command",
        "datagrid",
        "datalist",
        "dd",
        "del",
        "details",
        "dfn",
        "dialog",
        "dir",
        "div",
        "dl",
        "dt",
        "em",
        "event-source",
        "fieldset",
        "figcaption",
        "figure",
        "font",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hr",
        "i",
        "img",
        "input",
        "ins",
        "kbd",
        "keygen",
        "label",
        "legend",
        "li",
        "m",
        "map",
        "menu",
        "meter",
        "multicol",
        "nav",
        "nextid",
        "noscript",
        "ol",
        "optgroup",
        "option",
        "output",
        "p",
        "pre",
        "progress",
        "q",
        "s",
        "samp",
        "section",
        "select",
        "small",
        "sound",
        "source",
        "spacer",
        "span",
        "strike",
        "strong",
        "sub",
        "sup",
        "table",
        "tbody",
        "td",
        "textarea",
        "tfoot",
        "th",
        "thead",
        "time",
        "tr",
        "tt",
        "u",
        "ul",
        "var",
        "video",
    }

    # Attributes that are kept on HTML elements.
    acceptable_attributes = {
        "abbr",
        "accept",
        "accept-charset",
        "accesskey",
        "action",
        "align",
        "alt",
        "autocomplete",
        "autofocus",
        "axis",
        "background",
        "balance",
        "bgcolor",
        "bgproperties",
        "border",
        "bordercolor",
        "bordercolordark",
        "bordercolorlight",
        "bottompadding",
        "cellpadding",
        "cellspacing",
        "ch",
        "challenge",
        "char",
        "charoff",
        "charset",
        "checked",
        "choff",
        "cite",
        "class",
        "clear",
        "color",
        "cols",
        "colspan",
        "compact",
        "contenteditable",
        "controls",
        "coords",
        "data",
        "datafld",
        "datapagesize",
        "datasrc",
        "datetime",
        "default",
        "delay",
        "dir",
        "disabled",
        "draggable",
        "dynsrc",
        "enctype",
        "end",
        "face",
        "for",
        "form",
        "frame",
        "galleryimg",
        "gutter",
        "headers",
        "height",
        "hidden",
        "hidefocus",
        "high",
        "href",
        "hreflang",
        "hspace",
        "icon",
        "id",
        "inputmode",
        "ismap",
        "keytype",
        "label",
        "lang",
        "leftspacing",
        "list",
        "longdesc",
        "loop",
        "loopcount",
        "loopend",
        "loopstart",
        "low",
        "lowsrc",
        "max",
        "maxlength",
        "media",
        "method",
        "min",
        "multiple",
        "name",
        "nohref",
        "noshade",
        "nowrap",
        "open",
        "optimum",
        "pattern",
        "ping",
        "point-size",
        "poster",
        "pqg",
        "preload",
        "prompt",
        "radiogroup",
        "readonly",
        "rel",
        "repeat-max",
        "repeat-min",
        "replace",
        "required",
        "rev",
        "rightspacing",
        "rows",
        "rowspan",
        "rules",
        "scope",
        "selected",
        "shape",
        "size",
        "span",
        "src",
        "start",
        "step",
        "style",
        "summary",
        "suppress",
        "tabindex",
        "target",
        "template",
        "title",
        "toppadding",
        "type",
        "unselectable",
        "urn",
        "usemap",
        "valign",
        "value",
        "variable",
        "volume",
        "vrml",
        "vspace",
        "width",
        "wrap",
        "xml:lang",
    }

    # Elements whose text content is suppressed as well as their tags.
    unacceptable_elements_with_end_tag = {
        "applet",
        "script",
        "style",
    }

    # CSS properties that are kept verbatim by sanitize_style().
    acceptable_css_properties = {
        "azimuth",
        "background-color",
        "border-bottom-color",
        "border-collapse",
        "border-color",
        "border-left-color",
        "border-right-color",
        "border-top-color",
        "clear",
        "color",
        "cursor",
        "direction",
        "display",
        "elevation",
        "float",
        "font",
        "font-family",
        "font-size",
        "font-style",
        "font-variant",
        "font-weight",
        "height",
        "letter-spacing",
        "line-height",
        "overflow",
        "pause",
        "pause-after",
        "pause-before",
        "pitch",
        "pitch-range",
        "richness",
        "speak",
        "speak-header",
        "speak-numeral",
        "speak-punctuation",
        "speech-rate",
        "stress",
        "text-align",
        "text-decoration",
        "text-indent",
        "unicode-bidi",
        "vertical-align",
        "voice-family",
        "volume",
        "white-space",
        "width",
    }

    # survey of common keywords found in feeds
    acceptable_css_keywords = {
        "!important",
        "aqua",
        "auto",
        "black",
        "block",
        "blue",
        "bold",
        "both",
        "bottom",
        "brown",
        "center",
        "collapse",
        "dashed",
        "dotted",
        "fuchsia",
        "gray",
        "green",
        "italic",
        "left",
        "lime",
        "maroon",
        "medium",
        "navy",
        "none",
        "normal",
        "nowrap",
        "olive",
        "pointer",
        "purple",
        "red",
        "right",
        "silver",
        "solid",
        "teal",
        "top",
        "transparent",
        "underline",
        "white",
        "yellow",
    }

    # Values accepted (besides the keywords above) for the shorthand
    # background/border/margin/padding properties.
    valid_css_values = re.compile(
        r"^("
        r"#[0-9a-f]+"  # Hex values
        r"|rgb\(\d+%?,\d*%?,?\d*%?\)?"  # RGB values
        r"|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?"  # Sizes/widths
        r")$"
    )

    # MathML elements accepted while inside a namespaced <math> element.
    mathml_elements = {
        "annotation",
        "annotation-xml",
        "maction",
        "maligngroup",
        "malignmark",
        "math",
        "menclose",
        "merror",
        "mfenced",
        "mfrac",
        "mglyph",
        "mi",
        "mlabeledtr",
        "mlongdiv",
        "mmultiscripts",
        "mn",
        "mo",
        "mover",
        "mpadded",
        "mphantom",
        "mprescripts",
        "mroot",
        "mrow",
        "ms",
        "mscarries",
        "mscarry",
        "msgroup",
        "msline",
        "mspace",
        "msqrt",
        "msrow",
        "mstack",
        "mstyle",
        "msub",
        "msubsup",
        "msup",
        "mtable",
        "mtd",
        "mtext",
        "mtr",
        "munder",
        "munderover",
        "none",
        "semantics",
    }

    # Attributes kept on MathML elements.
    mathml_attributes = {
        "accent",
        "accentunder",
        "actiontype",
        "align",
        "alignmentscope",
        "altimg",
        "altimg-height",
        "altimg-valign",
        "altimg-width",
        "alttext",
        "bevelled",
        "charalign",
        "close",
        "columnalign",
        "columnlines",
        "columnspacing",
        "columnspan",
        "columnwidth",
        "crossout",
        "decimalpoint",
        "denomalign",
        "depth",
        "dir",
        "display",
        "displaystyle",
        "edge",
        "encoding",
        "equalcolumns",
        "equalrows",
        "fence",
        "fontstyle",
        "fontweight",
        "form",
        "frame",
        "framespacing",
        "groupalign",
        "height",
        "href",
        "id",
        "indentalign",
        "indentalignfirst",
        "indentalignlast",
        "indentshift",
        "indentshiftfirst",
        "indentshiftlast",
        "indenttarget",
        "infixlinebreakstyle",
        "largeop",
        "length",
        "linebreak",
        "linebreakmultchar",
        "linebreakstyle",
        "lineleading",
        "linethickness",
        "location",
        "longdivstyle",
        "lquote",
        "lspace",
        "mathbackground",
        "mathcolor",
        "mathsize",
        "mathvariant",
        "maxsize",
        "minlabelspacing",
        "minsize",
        "movablelimits",
        "notation",
        "numalign",
        "open",
        "other",
        "overflow",
        "position",
        "rowalign",
        "rowlines",
        "rowspacing",
        "rowspan",
        "rquote",
        "rspace",
        "scriptlevel",
        "scriptminsize",
        "scriptsizemultiplier",
        "selection",
        "separator",
        "separators",
        "shift",
        "side",
        "src",
        "stackalign",
        "stretchy",
        "subscriptshift",
        "superscriptshift",
        "symmetric",
        "voffset",
        "width",
        "xlink:href",
        "xlink:show",
        "xlink:type",
        "xmlns",
        "xmlns:xlink",
    }

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = {
        "a",
        "animate",
        "animateColor",
        "animateMotion",
        "animateTransform",
        "circle",
        "defs",
        "desc",
        "ellipse",
        "font-face",
        "font-face-name",
        "font-face-src",
        "foreignObject",
        "g",
        "glyph",
        "hkern",
        "line",
        "linearGradient",
        "marker",
        "metadata",
        "missing-glyph",
        "mpath",
        "path",
        "polygon",
        "polyline",
        "radialGradient",
        "rect",
        "set",
        "stop",
        "svg",
        "switch",
        "text",
        "title",
        "tspan",
        "use",
    }

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = {
        "accent-height",
        "accumulate",
        "additive",
        "alphabetic",
        "arabic-form",
        "ascent",
        "attributeName",
        "attributeType",
        "baseProfile",
        "bbox",
        "begin",
        "by",
        "calcMode",
        "cap-height",
        "class",
        "color",
        "color-rendering",
        "content",
        "cx",
        "cy",
        "d",
        "descent",
        "display",
        "dur",
        "dx",
        "dy",
        "end",
        "fill",
        "fill-opacity",
        "fill-rule",
        "font-family",
        "font-size",
        "font-stretch",
        "font-style",
        "font-variant",
        "font-weight",
        "from",
        "fx",
        "fy",
        "g1",
        "g2",
        "glyph-name",
        "gradientUnits",
        "hanging",
        "height",
        "horiz-adv-x",
        "horiz-origin-x",
        "id",
        "ideographic",
        "k",
        "keyPoints",
        "keySplines",
        "keyTimes",
        "lang",
        "marker-end",
        "marker-mid",
        "marker-start",
        "markerHeight",
        "markerUnits",
        "markerWidth",
        "mathematical",
        "max",
        "min",
        "name",
        "offset",
        "opacity",
        "orient",
        "origin",
        "overline-position",
        "overline-thickness",
        "panose-1",
        "path",
        "pathLength",
        "points",
        "preserveAspectRatio",
        "r",
        "refX",
        "refY",
        "repeatCount",
        "repeatDur",
        "requiredExtensions",
        "requiredFeatures",
        "restart",
        "rotate",
        "rx",
        "ry",
        "slope",
        "stemh",
        "stemv",
        "stop-color",
        "stop-opacity",
        "strikethrough-position",
        "strikethrough-thickness",
        "stroke",
        "stroke-dasharray",
        "stroke-dashoffset",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-miterlimit",
        "stroke-opacity",
        "stroke-width",
        "systemLanguage",
        "target",
        "text-anchor",
        "to",
        "transform",
        "type",
        "u1",
        "u2",
        "underline-position",
        "underline-thickness",
        "unicode",
        "unicode-range",
        "units-per-em",
        "values",
        "version",
        "viewBox",
        "visibility",
        "width",
        "widths",
        "x",
        "x-height",
        "x1",
        "x2",
        "xlink:actuate",
        "xlink:arcrole",
        "xlink:href",
        "xlink:role",
        "xlink:show",
        "xlink:title",
        "xlink:type",
        "xml:base",
        "xml:lang",
        "xml:space",
        "xmlns",
        "xmlns:xlink",
        "y",
        "y1",
        "y2",
        "zoomAndPan",
    }

    # Lowercase -> camelCase lookup tables for SVG attribute and element
    # names; built lazily by unknown_starttag() the first time SVG is seen.
    svg_attr_map = None
    svg_elem_map = None

    # CSS properties additionally allowed in style attributes inside SVG.
    acceptable_svg_properties = {
        "fill",
        "fill-opacity",
        "fill-rule",
        "stroke",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-opacity",
        "stroke-width",
    }

    def __init__(self, encoding=None, _type="application/xhtml+xml"):
        """Create a sanitizer; both arguments are forwarded to the base class."""
        super().__init__(encoding, _type)
        # Number of currently open applet/script/style elements.
        self.unacceptablestack = 0
        # Nesting depth of namespaced <math> / <svg> elements.
        self.mathmlOK = 0
        self.svgOK = 0

    def reset(self):
        """Reset the base processor and clear the nesting counters."""
        super().reset()
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def unknown_starttag(self, tag, attrs):
        """Forward an allowed start tag with only its allowed attributes.

        ``attrs`` is a list of ``(name, value)`` pairs.  It may be extended
        in place with implicit ``xmlns`` / ``xmlns:xlink`` declarations.
        Tags that are not allowed in the current context are dropped.
        """
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if tag not in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith("html"):
                if not dict(attrs).get("xmlns"):
                    if tag == "svg":
                        attrs.append(("xmlns", "http://www.w3.org/2000/svg"))
                    if tag == "math":
                        attrs.append(("xmlns", "http://www.w3.org/1998/Math/MathML"))

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if (
                tag == "math"
                and ("xmlns", "http://www.w3.org/1998/Math/MathML") in attrs
            ):
                self.mathmlOK += 1
            if tag == "svg" and ("xmlns", "http://www.w3.org/2000/svg") in attrs:
                self.svgOK += 1

            # chose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # For most vocabularies, lowercasing is a good idea. Many
                # svg elements, however, are camel case.
                if self.svg_attr_map is None:
                    # Keep these as sets so later membership tests stay O(1);
                    # the instance attributes shadow the class-level sets.
                    lower = {attr.lower() for attr in self.svg_attributes}
                    mix = [a for a in self.svg_attributes if a not in lower]
                    self.svg_attributes = lower
                    self.svg_attr_map = {a.lower(): a for a in mix}

                    lower = {elem.lower() for elem in self.svg_elements}
                    mix = [e for e in self.svg_elements if e not in lower]
                    self.svg_elements = lower
                    self.svg_elem_map = {e.lower(): e for e in mix}
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif tag not in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            if any(name.startswith("xlink:") for name, _ in attrs):
                if ("xmlns:xlink", "http://www.w3.org/1999/xlink") not in attrs:
                    attrs.append(("xmlns:xlink", "http://www.w3.org/1999/xlink"))

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key == "style" and "style" in acceptable_attributes:
                clean_value = self.sanitize_style(value)
                if clean_value:
                    clean_attrs.append((key, clean_value))
            elif key in acceptable_attributes:
                # Restore the camelCase spelling of SVG attribute names.
                key = keymap.get(key, key)
                # make sure the uri uses an acceptable uri scheme
                if key == "href":
                    value = make_safe_absolute_uri(value)
                clean_attrs.append((key, value))
        super().unknown_starttag(tag, clean_attrs)

    def unknown_endtag(self, tag):
        """Forward an allowed end tag and unwind the nesting counters."""
        if tag not in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                # A stray end tag must not push the counter below zero: a
                # negative value is truthy, which would make handle_data()
                # drop all following text and let the body of a later
                # <script> through once the counter climbed back to zero.
                if self.unacceptablestack > 0:
                    self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == "math" and self.mathmlOK:
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag, tag)
                if tag == "svg" and self.svgOK:
                    self.svgOK -= 1
            else:
                return
        super().unknown_endtag(tag)

    def handle_pi(self, text):
        """Drop processing instructions."""
        pass

    def handle_decl(self, text):
        """Drop declarations."""
        pass

    def handle_data(self, text):
        """Forward text unless it is inside an applet/script/style element."""
        if not self.unacceptablestack:
            super().handle_data(text)

    def sanitize_style(self, style):
        """Return ``style`` reduced to allowed declarations.

        Returns an empty string when the value as a whole fails the
        character/shape checks; otherwise returns the surviving
        ``"prop: value;"`` declarations joined by single spaces.
        """
        # disallow urls
        style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)

        # gauntlet
        if not re.match(
            r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""",
            style,
        ):
            return ""
        # This replaced a regexp that used re.match and was prone to
        # pathological back-tracking.
        if re.sub(r"\s*[-\w]+\s*:\s*[^:;]*;?", "", style).strip():
            return ""

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.acceptable_css_properties:
                clean.append(prop + ": " + value + ";")
            elif prop.split("-")[0].lower() in (
                "background",
                "border",
                "margin",
                "padding",
            ):
                # Shorthand families: keep the declaration only when every
                # whitespace-separated token is a known keyword or value.
                for keyword in value.split():
                    if (
                        keyword not in self.acceptable_css_keywords
                        and not self.valid_css_values.match(keyword)
                    ):
                        break
                else:
                    clean.append(prop + ": " + value + ";")
            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
                clean.append(prop + ": " + value + ";")

        return " ".join(clean)

    def parse_comment(self, i, report=1):
        """Parse a comment at offset ``i``, tolerating malformed comments.

        Returns the offset at which parsing should resume.
        """
        ret = super().parse_comment(i, report)
        if ret >= 0:
            return ret
        # if ret == -1, this may be a malicious attempt to circumvent
        # sanitization, or a page-destroying unclosed comment
        match = re.compile(r"--[^>]*>").search(self.rawdata, i + 4)
        if match:
            return match.end()
        # unclosed comment; deliberately fail to handle_data()
        return len(self.rawdata)
def sanitize_html(html_source, encoding, _type):
    """Run ``html_source`` through :class:`HTMLSanitizer` and return the result.

    ``encoding`` and ``_type`` are handed to the sanitizer unchanged.  The
    returned text is stripped of surrounding whitespace and uses ``\\n``
    in place of ``\\r\\n``.
    """
    sanitizer = HTMLSanitizer(encoding, _type)
    # Escape the opening of CDATA sections before the text is parsed.
    sanitizer.feed(html_source.replace("<![CDATA[", "&lt;![CDATA["))
    return sanitizer.output().strip().replace("\r\n", "\n")
# Match XML entity declarations.
# Example: <!ENTITY copyright "(C)">
RE_ENTITY_PATTERN = re.compile(rb"^\s*<!ENTITY([^>]*?)>", re.MULTILINE)

# Match XML DOCTYPE declarations.
# Example: <!DOCTYPE feed [ ]>
RE_DOCTYPE_PATTERN = re.compile(rb"^\s*<!DOCTYPE([^>]*?)>", re.MULTILINE)

# Match safe entity declarations.
# This will allow hexadecimal character references through,
# as well as text, but not arbitrary nested entities.
# Example: cubed "&#179;"
# Example: copyright "(C)"
# Forbidden: explode1 "&explode2;&explode2;"
RE_SAFE_ENTITY_PATTERN = re.compile(rb'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')


def replace_doctype(data: bytes) -> tuple[str | None, bytes, dict[str, str]]:
    """Strip and replace the DOCTYPE.

    One RSS format -- Netscape's RSS 0.91 -- is identified within the XML
    declaration.  Therefore, this function must identify that version while
    replacing the DOCTYPE.

    As a convenience to the loose XML parser, entities are pre-computed and
    returned.

    The tuple that is returned has the following values, in order:

    1. The version extracted from the XML DOCTYPE.
       The value will either be "rss091n" or None.
    2. Binary XML content with a replaced DOCTYPE.
    3. A dictionary of entities and replacements.
    """

    # Verify this looks like an XML feed.
    if not re.match(rb"^\s*<", data):
        return None, data, {}

    # Divide the document into two groups by finding the location
    # of the first element that doesn't begin with '<?' or '<!'.
    # The head keeps that element's leading "<".
    match = re.search(rb"<\w", data)
    first_element = match.start() + 1 if match is not None else 0
    head, data = data[:first_element], data[first_element:]

    # Save, and then remove, any ENTITY declarations.
    entity_results = RE_ENTITY_PATTERN.findall(head)
    head = RE_ENTITY_PATTERN.sub(b"", head)

    # Find the DOCTYPE declaration and check the feed type.
    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
    doctype = doctype_results[0] if doctype_results else b""
    version = "rss091n" if b"netscape" in doctype.lower() else None

    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
    replacement = b""
    if len(doctype_results) == 1 and entity_results:
        safe_entities = [e for e in entity_results if RE_SAFE_ENTITY_PATTERN.match(e)]
        if safe_entities:
            # Each captured declaration keeps its leading whitespace, which
            # is why the first "<!ENTITY" carries no trailing space.
            replacement = (
                b"<!DOCTYPE feed [\n<!ENTITY"
                + b">\n<!ENTITY ".join(safe_entities)
                + b">\n]>"
            )

    # Substitute through a callable so the replacement is inserted literally.
    # Passed as a template, a backslash in an entity value would be treated
    # as a regex escape or group reference and could raise re.error.
    data = RE_DOCTYPE_PATTERN.sub(lambda _match: replacement, head) + data

    # Precompute the safe entities for the loose parser.
    entities = {
        k.decode("utf-8"): v.decode("utf-8")
        for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)
    }
    return version, data, entities