mirror of
https://github.com/SickGear/SickGear.git
synced 2024-11-24 05:45:05 +00:00
974 lines
24 KiB
Python
974 lines
24 KiB
Python
# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
|
|
# Copyright 2002-2008 Mark Pilgrim
|
|
# All rights reserved.
|
|
#
|
|
# This file is a part of feedparser.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
#
|
|
# * Redistributions of source code must retain the above copyright notice,
|
|
# this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
# this list of conditions and the following disclaimer in the documentation
|
|
# and/or other materials provided with the distribution.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
from .html import BaseHTMLProcessor
|
|
from .urls import make_safe_absolute_uri
|
|
|
|
|
|
class HTMLSanitizer(BaseHTMLProcessor):
    """HTML processor that only lets allow-listed markup through.

    Start tags, end tags, attributes and inline ``style`` declarations are
    checked against the allow-lists declared on this class.  MathML and SVG
    vocabularies are accepted only inside a ``<math>``/``<svg>`` element that
    carries the matching ``xmlns`` declaration.  Text found inside the
    elements listed in ``unacceptable_elements_with_end_tag`` is dropped.
    """

    acceptable_elements = {
        "a",
        "abbr",
        "acronym",
        "address",
        "area",
        "article",
        "aside",
        "audio",
        "b",
        "big",
        "blockquote",
        "br",
        "button",
        "canvas",
        "caption",
        "center",
        "cite",
        "code",
        "col",
        "colgroup",
        "command",
        "datagrid",
        "datalist",
        "dd",
        "del",
        "details",
        "dfn",
        "dialog",
        "dir",
        "div",
        "dl",
        "dt",
        "em",
        "event-source",
        "fieldset",
        "figcaption",
        "figure",
        "font",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hr",
        "i",
        "img",
        "input",
        "ins",
        "kbd",
        "keygen",
        "label",
        "legend",
        "li",
        "m",
        "map",
        "menu",
        "meter",
        "multicol",
        "nav",
        "nextid",
        "noscript",
        "ol",
        "optgroup",
        "option",
        "output",
        "p",
        "pre",
        "progress",
        "q",
        "s",
        "samp",
        "section",
        "select",
        "small",
        "sound",
        "source",
        "spacer",
        "span",
        "strike",
        "strong",
        "sub",
        "sup",
        "table",
        "tbody",
        "td",
        "textarea",
        "tfoot",
        "th",
        "thead",
        "time",
        "tr",
        "tt",
        "u",
        "ul",
        "var",
        "video",
    }

    acceptable_attributes = {
        "abbr",
        "accept",
        "accept-charset",
        "accesskey",
        "action",
        "align",
        "alt",
        "autocomplete",
        "autofocus",
        "axis",
        "background",
        "balance",
        "bgcolor",
        "bgproperties",
        "border",
        "bordercolor",
        "bordercolordark",
        "bordercolorlight",
        "bottompadding",
        "cellpadding",
        "cellspacing",
        "ch",
        "challenge",
        "char",
        "charoff",
        "charset",
        "checked",
        "choff",
        "cite",
        "class",
        "clear",
        "color",
        "cols",
        "colspan",
        "compact",
        "contenteditable",
        "controls",
        "coords",
        "data",
        "datafld",
        "datapagesize",
        "datasrc",
        "datetime",
        "default",
        "delay",
        "dir",
        "disabled",
        "draggable",
        "dynsrc",
        "enctype",
        "end",
        "face",
        "for",
        "form",
        "frame",
        "galleryimg",
        "gutter",
        "headers",
        "height",
        "hidden",
        "hidefocus",
        "high",
        "href",
        "hreflang",
        "hspace",
        "icon",
        "id",
        "inputmode",
        "ismap",
        "keytype",
        "label",
        "lang",
        "leftspacing",
        "list",
        "longdesc",
        "loop",
        "loopcount",
        "loopend",
        "loopstart",
        "low",
        "lowsrc",
        "max",
        "maxlength",
        "media",
        "method",
        "min",
        "multiple",
        "name",
        "nohref",
        "noshade",
        "nowrap",
        "open",
        "optimum",
        "pattern",
        "ping",
        "point-size",
        "poster",
        "pqg",
        "preload",
        "prompt",
        "radiogroup",
        "readonly",
        "rel",
        "repeat-max",
        "repeat-min",
        "replace",
        "required",
        "rev",
        "rightspacing",
        "rows",
        "rowspan",
        "rules",
        "scope",
        "selected",
        "shape",
        "size",
        "span",
        "src",
        "start",
        "step",
        "style",
        "summary",
        "suppress",
        "tabindex",
        "target",
        "template",
        "title",
        "toppadding",
        "type",
        "unselectable",
        "urn",
        "usemap",
        "valign",
        "value",
        "variable",
        "volume",
        "vrml",
        "vspace",
        "width",
        "wrap",
        "xml:lang",
    }

    # Elements whose start/end tags adjust ``unacceptablestack``; text is
    # discarded by ``handle_data`` while that counter is non-zero.
    unacceptable_elements_with_end_tag = {
        "applet",
        "script",
        "style",
    }

    acceptable_css_properties = {
        "azimuth",
        "background-color",
        "border-bottom-color",
        "border-collapse",
        "border-color",
        "border-left-color",
        "border-right-color",
        "border-top-color",
        "clear",
        "color",
        "cursor",
        "direction",
        "display",
        "elevation",
        "float",
        "font",
        "font-family",
        "font-size",
        "font-style",
        "font-variant",
        "font-weight",
        "height",
        "letter-spacing",
        "line-height",
        "overflow",
        "pause",
        "pause-after",
        "pause-before",
        "pitch",
        "pitch-range",
        "richness",
        "speak",
        "speak-header",
        "speak-numeral",
        "speak-punctuation",
        "speech-rate",
        "stress",
        "text-align",
        "text-decoration",
        "text-indent",
        "unicode-bidi",
        "vertical-align",
        "voice-family",
        "volume",
        "white-space",
        "width",
    }

    # survey of common keywords found in feeds
    acceptable_css_keywords = {
        "!important",
        "aqua",
        "auto",
        "black",
        "block",
        "blue",
        "bold",
        "both",
        "bottom",
        "brown",
        "center",
        "collapse",
        "dashed",
        "dotted",
        "fuchsia",
        "gray",
        "green",
        "italic",
        "left",
        "lime",
        "maroon",
        "medium",
        "navy",
        "none",
        "normal",
        "nowrap",
        "olive",
        "pointer",
        "purple",
        "red",
        "right",
        "silver",
        "solid",
        "teal",
        "top",
        "transparent",
        "underline",
        "white",
        "yellow",
    }

    valid_css_values = re.compile(
        r"^("
        r"#[0-9a-f]+"  # Hex values
        r"|rgb\(\d+%?,\d*%?,?\d*%?\)?"  # RGB values
        r"|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?"  # Sizes/widths
        r")$"
    )

    mathml_elements = {
        "annotation",
        "annotation-xml",
        "maction",
        "maligngroup",
        "malignmark",
        "math",
        "menclose",
        "merror",
        "mfenced",
        "mfrac",
        "mglyph",
        "mi",
        "mlabeledtr",
        "mlongdiv",
        "mmultiscripts",
        "mn",
        "mo",
        "mover",
        "mpadded",
        "mphantom",
        "mprescripts",
        "mroot",
        "mrow",
        "ms",
        "mscarries",
        "mscarry",
        "msgroup",
        "msline",
        "mspace",
        "msqrt",
        "msrow",
        "mstack",
        "mstyle",
        "msub",
        "msubsup",
        "msup",
        "mtable",
        "mtd",
        "mtext",
        "mtr",
        "munder",
        "munderover",
        "none",
        "semantics",
    }

    mathml_attributes = {
        "accent",
        "accentunder",
        "actiontype",
        "align",
        "alignmentscope",
        "altimg",
        "altimg-height",
        "altimg-valign",
        "altimg-width",
        "alttext",
        "bevelled",
        "charalign",
        "close",
        "columnalign",
        "columnlines",
        "columnspacing",
        "columnspan",
        "columnwidth",
        "crossout",
        "decimalpoint",
        "denomalign",
        "depth",
        "dir",
        "display",
        "displaystyle",
        "edge",
        "encoding",
        "equalcolumns",
        "equalrows",
        "fence",
        "fontstyle",
        "fontweight",
        "form",
        "frame",
        "framespacing",
        "groupalign",
        "height",
        "href",
        "id",
        "indentalign",
        "indentalignfirst",
        "indentalignlast",
        "indentshift",
        "indentshiftfirst",
        "indentshiftlast",
        "indenttarget",
        "infixlinebreakstyle",
        "largeop",
        "length",
        "linebreak",
        "linebreakmultchar",
        "linebreakstyle",
        "lineleading",
        "linethickness",
        "location",
        "longdivstyle",
        "lquote",
        "lspace",
        "mathbackground",
        "mathcolor",
        "mathsize",
        "mathvariant",
        "maxsize",
        "minlabelspacing",
        "minsize",
        "movablelimits",
        "notation",
        "numalign",
        "open",
        "other",
        "overflow",
        "position",
        "rowalign",
        "rowlines",
        "rowspacing",
        "rowspan",
        "rquote",
        "rspace",
        "scriptlevel",
        "scriptminsize",
        "scriptsizemultiplier",
        "selection",
        "separator",
        "separators",
        "shift",
        "side",
        "src",
        "stackalign",
        "stretchy",
        "subscriptshift",
        "superscriptshift",
        "symmetric",
        "voffset",
        "width",
        "xlink:href",
        "xlink:show",
        "xlink:type",
        "xmlns",
        "xmlns:xlink",
    }

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = {
        "a",
        "animate",
        "animateColor",
        "animateMotion",
        "animateTransform",
        "circle",
        "defs",
        "desc",
        "ellipse",
        "font-face",
        "font-face-name",
        "font-face-src",
        "foreignObject",
        "g",
        "glyph",
        "hkern",
        "line",
        "linearGradient",
        "marker",
        "metadata",
        "missing-glyph",
        "mpath",
        "path",
        "polygon",
        "polyline",
        "radialGradient",
        "rect",
        "set",
        "stop",
        "svg",
        "switch",
        "text",
        "title",
        "tspan",
        "use",
    }

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = {
        "accent-height",
        "accumulate",
        "additive",
        "alphabetic",
        "arabic-form",
        "ascent",
        "attributeName",
        "attributeType",
        "baseProfile",
        "bbox",
        "begin",
        "by",
        "calcMode",
        "cap-height",
        "class",
        "color",
        "color-rendering",
        "content",
        "cx",
        "cy",
        "d",
        "descent",
        "display",
        "dur",
        "dx",
        "dy",
        "end",
        "fill",
        "fill-opacity",
        "fill-rule",
        "font-family",
        "font-size",
        "font-stretch",
        "font-style",
        "font-variant",
        "font-weight",
        "from",
        "fx",
        "fy",
        "g1",
        "g2",
        "glyph-name",
        "gradientUnits",
        "hanging",
        "height",
        "horiz-adv-x",
        "horiz-origin-x",
        "id",
        "ideographic",
        "k",
        "keyPoints",
        "keySplines",
        "keyTimes",
        "lang",
        "marker-end",
        "marker-mid",
        "marker-start",
        "markerHeight",
        "markerUnits",
        "markerWidth",
        "mathematical",
        "max",
        "min",
        "name",
        "offset",
        "opacity",
        "orient",
        "origin",
        "overline-position",
        "overline-thickness",
        "panose-1",
        "path",
        "pathLength",
        "points",
        "preserveAspectRatio",
        "r",
        "refX",
        "refY",
        "repeatCount",
        "repeatDur",
        "requiredExtensions",
        "requiredFeatures",
        "restart",
        "rotate",
        "rx",
        "ry",
        "slope",
        "stemh",
        "stemv",
        "stop-color",
        "stop-opacity",
        "strikethrough-position",
        "strikethrough-thickness",
        "stroke",
        "stroke-dasharray",
        "stroke-dashoffset",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-miterlimit",
        "stroke-opacity",
        "stroke-width",
        "systemLanguage",
        "target",
        "text-anchor",
        "to",
        "transform",
        "type",
        "u1",
        "u2",
        "underline-position",
        "underline-thickness",
        "unicode",
        "unicode-range",
        "units-per-em",
        "values",
        "version",
        "viewBox",
        "visibility",
        "width",
        "widths",
        "x",
        "x-height",
        "x1",
        "x2",
        "xlink:actuate",
        "xlink:arcrole",
        "xlink:href",
        "xlink:role",
        "xlink:show",
        "xlink:title",
        "xlink:type",
        "xml:base",
        "xml:lang",
        "xml:space",
        "xmlns",
        "xmlns:xlink",
        "y",
        "y1",
        "y2",
        "zoomAndPan",
    }

    # Lowercase -> original-case lookup tables for SVG names; built lazily by
    # _build_svg_case_maps() the first time SVG content is accepted.
    svg_attr_map = None
    svg_elem_map = None

    acceptable_svg_properties = {
        "fill",
        "fill-opacity",
        "fill-rule",
        "stroke",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-opacity",
        "stroke-width",
    }

    # Patterns used by sanitize_style() and parse_comment(), compiled once.
    _css_url_re = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*")
    _css_gauntlet_re = re.compile(
        r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$"""
    )
    _css_declaration_strip_re = re.compile(r"\s*[-\w]+\s*:\s*[^:;]*;?")
    _css_declaration_re = re.compile(r"([-\w]+)\s*:\s*([^:;]*)")
    _comment_close_re = re.compile(r"--[^>]*>")

    # Property prefixes whose values are checked keyword by keyword.
    _css_shorthand_prefixes = frozenset(
        {"background", "border", "margin", "padding"}
    )

    def __init__(self, encoding=None, _type="application/xhtml+xml"):
        """Initialise the base processor and the sanitizer's nesting counters."""
        super().__init__(encoding, _type)

        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def reset(self):
        """Reset the base processor and zero the nesting counters."""
        super().reset()
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def _build_svg_case_maps(self):
        """Lowercase the SVG allow-lists and remember the original spellings.

        After this call ``svg_attributes`` and ``svg_elements`` are instance
        level sets of lowercase names, while ``svg_attr_map`` and
        ``svg_elem_map`` map each lowercased name back to its mixed-case form.
        """
        lower = {name.lower() for name in self.svg_attributes}
        mixed = [name for name in self.svg_attributes if name not in lower]
        self.svg_attributes = lower
        self.svg_attr_map = {name.lower(): name for name in mixed}

        lower = {name.lower() for name in self.svg_elements}
        mixed = [name for name in self.svg_elements if name not in lower]
        self.svg_elements = lower
        self.svg_elem_map = {name.lower(): name for name in mixed}

    def unknown_starttag(self, tag, attrs):
        """Filter a start tag and its attributes, then pass it to the base class.

        :param tag: element name as delivered by the parser
        :param attrs: sequence of ``(name, value)`` pairs; it is not modified

        Tags outside the allow-lists are dropped (the method returns without
        calling the base class).  Attributes not in the applicable allow-list
        are removed, ``style`` values go through ``sanitize_style`` and
        ``href`` values through ``make_safe_absolute_uri``.
        """
        # Work on a private copy: namespace declarations may be appended below
        # and the caller's sequence must not change as a side effect.
        attrs = list(attrs)
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if tag not in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith("html") and not dict(attrs).get("xmlns"):
                if tag == "svg":
                    attrs.append(("xmlns", "http://www.w3.org/2000/svg"))
                if tag == "math":
                    attrs.append(("xmlns", "http://www.w3.org/1998/Math/MathML"))

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if (
                tag == "math"
                and ("xmlns", "http://www.w3.org/1998/Math/MathML") in attrs
            ):
                self.mathmlOK += 1
            if tag == "svg" and ("xmlns", "http://www.w3.org/2000/svg") in attrs:
                self.svgOK += 1

            # choose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # For most vocabularies, lowercasing is a good idea. Many
                # svg elements, however, are camel case.
                # Test for None rather than truthiness: an empty map is a
                # valid result and must not trigger a rebuild from the
                # already-lowercased names.
                if self.svg_attr_map is None or self.svg_elem_map is None:
                    self._build_svg_case_maps()
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif tag not in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            xlink_declaration = ("xmlns:xlink", "http://www.w3.org/1999/xlink")
            uses_xlink = any(attr[0].startswith("xlink:") for attr in attrs)
            if uses_xlink and xlink_declaration not in attrs:
                attrs.append(xlink_declaration)

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key == "style" and "style" in acceptable_attributes:
                clean_value = self.sanitize_style(value)
                if clean_value:
                    clean_attrs.append((key, clean_value))
            elif key in acceptable_attributes:
                # restore the original (camel case) spelling for SVG names
                key = keymap.get(key, key)
                # make sure the uri uses an acceptable uri scheme
                if key == "href":
                    value = make_safe_absolute_uri(value)
                clean_attrs.append((key, value))
        super().unknown_starttag(tag, clean_attrs)

    def unknown_endtag(self, tag):
        """Filter an end tag, keeping the nesting counters in step.

        End tags of elements outside the allow-lists are dropped; accepted
        ones are passed to the base class.
        """
        if tag not in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                # Never go below zero: a stray end tag (one without a start
                # tag) would otherwise leave the counter negative, which makes
                # handle_data() drop all following text and lets the text of
                # the next real <script>/<style> through.
                if self.unacceptablestack > 0:
                    self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == "math":
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag, tag)
                if tag == "svg":
                    self.svgOK -= 1
            else:
                return
        super().unknown_endtag(tag)

    def handle_pi(self, text):
        """Ignore processing instructions; this override does nothing."""
        pass

    def handle_decl(self, text):
        """Ignore declarations; this override does nothing."""
        pass

    def handle_data(self, text):
        """Forward text to the base class unless inside an unacceptable element."""
        if not self.unacceptablestack:
            super().handle_data(text)

    def sanitize_style(self, style):
        """Return *style* reduced to its allow-listed CSS declarations.

        :param style: the raw value of a ``style`` attribute
        :returns: the surviving declarations, each formatted as
            ``"prop: value;"`` and joined by single spaces, or ``""`` when the
            input is rejected or nothing survives
        """
        # disallow urls
        style = self._css_url_re.sub(" ", style)

        # gauntlet
        if not self._css_gauntlet_re.match(style):
            return ""
        # This replaced a regexp that used re.match and was prone to
        # pathological back-tracking.
        # Anything left over after removing every "prop: value;" declaration
        # means the style is not a plain declaration list.
        if self._css_declaration_strip_re.sub("", style).strip():
            return ""

        clean = []
        for prop, value in self._css_declaration_re.findall(style):
            if not value:
                continue
            name = prop.lower()
            if name in self.acceptable_css_properties:
                keep = True
            elif name.split("-")[0] in self._css_shorthand_prefixes:
                # background/border/margin/padding families: every
                # whitespace-separated token must be a known keyword or match
                # valid_css_values.
                keep = all(
                    keyword in self.acceptable_css_keywords
                    or self.valid_css_values.match(keyword)
                    for keyword in value.split()
                )
            else:
                keep = bool(self.svgOK and name in self.acceptable_svg_properties)
            if keep:
                clean.append(prop + ": " + value + ";")

        return " ".join(clean)

    def parse_comment(self, i, report=1):
        """Parse a comment at offset *i*, consuming it even when malformed.

        :returns: the base class result when it is non-negative; otherwise the
            offset just past the next ``--...>`` sequence, or the length of
            the buffered data when no such sequence exists
        """
        ret = super().parse_comment(i, report)
        if ret >= 0:
            return ret
        # if ret == -1, this may be a malicious attempt to circumvent
        # sanitization, or a page-destroying unclosed comment
        match = self._comment_close_re.search(self.rawdata, i + 4)
        if match:
            return match.end()
        # unclosed comment; deliberately fail to handle_data()
        return len(self.rawdata)
|
|
|
|
|
|
def sanitize_html(html_source, encoding, _type):
    """Run *html_source* through ``HTMLSanitizer`` and return the result.

    :param html_source: markup to sanitize, as text
    :param encoding: passed to ``HTMLSanitizer`` unchanged
    :param _type: content type, passed to ``HTMLSanitizer`` unchanged
    :returns: the sanitizer's output, stripped of leading/trailing whitespace
        and with ``\\r\\n`` line endings converted to ``\\n``
    """
    p = HTMLSanitizer(encoding, _type)
    # Escape the opening "<" of CDATA section markers.  The replacement must
    # differ from the search string, otherwise this call is a no-op.
    # NOTE(review): presumably this makes the parser treat the marker as
    # plain text instead of a marked section -- confirm against the parser.
    html_source = html_source.replace("<![CDATA[", "&lt;![CDATA[")
    p.feed(html_source)
    data = p.output()
    return data.strip().replace("\r\n", "\n")
|
|
|
|
|
|
# Match XML entity declarations.
# Example: <!ENTITY copyright "(C)">
RE_ENTITY_PATTERN = re.compile(rb"^\s*<!ENTITY([^>]*?)>", re.MULTILINE)

# Match XML DOCTYPE declarations.
# Example: <!DOCTYPE feed [ ]>
RE_DOCTYPE_PATTERN = re.compile(rb"^\s*<!DOCTYPE([^>]*?)>", re.MULTILINE)

# Match safe entity declarations.
# This will allow hexadecimal character references through,
# as well as text, but not arbitrary nested entities.
# Example: cubed "&#179;"
# Example: copyright "(C)"
# Forbidden: explode1 "&explode2;&explode2;"
RE_SAFE_ENTITY_PATTERN = re.compile(rb'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')


def replace_doctype(data: bytes) -> tuple[str | None, bytes, dict[str, str]]:
    """Strip and replace the DOCTYPE.

    One RSS format -- Netscape's RSS 0.91 -- is identified within the DOCTYPE
    declaration. Therefore, this function must identify that version while
    replacing the DOCTYPE.

    As a convenience to the loose XML parser, entities are pre-computed and returned.

    The tuple that is returned has the following values, in order:

    1.  The version extracted from the XML DOCTYPE.
        The value will either be "rss091n" or None.
    2.  Binary XML content with a replaced DOCTYPE.
    3.  A dictionary of entities and replacements.

    Content that does not start (after optional whitespace) with ``<`` is
    returned unchanged with no version and no entities.
    """

    # Verify this looks like an XML feed.
    if not re.match(rb"^\s*<", data):
        return None, data, {}

    # Divide the document into two groups by finding the location
    # of the first element that doesn't begin with '<?' or '<!'.
    # The split falls just after that element's '<'; both halves are
    # concatenated again below, so no byte is lost.
    match = re.search(rb"<\w", data)
    first_element = match.start() + 1 if match is not None else 0
    head, data = data[:first_element], data[first_element:]

    # Save, and then remove, any ENTITY declarations.
    entity_results = RE_ENTITY_PATTERN.findall(head)
    head = RE_ENTITY_PATTERN.sub(b"", head)

    # Find the DOCTYPE declaration and check the feed type.
    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
    doctype = doctype_results[0] if doctype_results else b""
    version = "rss091n" if b"netscape" in doctype.lower() else None

    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
    replacement = b""
    if len(doctype_results) == 1 and entity_results:
        safe_entities = [e for e in entity_results if RE_SAFE_ENTITY_PATTERN.match(e)]
        if safe_entities:
            # Each captured entity starts with the whitespace that followed
            # "<!ENTITY", so the first one is appended without a space.
            replacement = (
                b"<!DOCTYPE feed [\n<!ENTITY"
                + b">\n<!ENTITY ".join(safe_entities)
                + b">\n]>"
            )

    # Use a callable so the replacement is inserted literally. Passing the
    # bytes directly would make ``re`` interpret backslashes from the feed's
    # entity values as template escapes or group references.
    data = RE_DOCTYPE_PATTERN.sub(lambda _match: replacement, head) + data

    # Precompute the safe entities for the loose parser.
    entities = {
        k.decode("utf-8"): v.decode("utf-8")
        for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)
    }
    return version, data, entities
|