2023-04-13 07:04:58 +00:00
|
|
|
# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
|
2023-01-12 01:04:47 +00:00
|
|
|
# Copyright 2002-2008 Mark Pilgrim
|
|
|
|
# All rights reserved.
|
|
|
|
#
|
|
|
|
# This file is a part of feedparser.
|
|
|
|
#
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
|
|
# modification, are permitted provided that the following conditions are met:
|
|
|
|
#
|
|
|
|
# * Redistributions of source code must retain the above copyright notice,
|
|
|
|
# this list of conditions and the following disclaimer.
|
|
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
# this list of conditions and the following disclaimer in the documentation
|
|
|
|
# and/or other materials provided with the distribution.
|
|
|
|
#
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
|
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
|
|
|
import html.entities
|
|
|
|
import re
|
|
|
|
|
2023-04-13 07:04:58 +00:00
|
|
|
# These items must all be imported into this module due to .__code__ replacements.
|
|
|
|
from .sgml import ( # noqa: F401
|
|
|
|
attrfind,
|
|
|
|
charref,
|
|
|
|
endbracket,
|
|
|
|
entityref,
|
|
|
|
incomplete,
|
|
|
|
interesting,
|
|
|
|
sgmllib,
|
|
|
|
shorttag,
|
|
|
|
shorttagopen,
|
|
|
|
starttagopen,
|
|
|
|
tagfind,
|
|
|
|
)
|
2023-01-12 01:04:47 +00:00
|
|
|
|
|
|
|
_cp1252 = {
|
2023-04-13 07:04:58 +00:00
|
|
|
128: "\u20ac", # euro sign
|
|
|
|
130: "\u201a", # single low-9 quotation mark
|
|
|
|
131: "\u0192", # latin small letter f with hook
|
|
|
|
132: "\u201e", # double low-9 quotation mark
|
|
|
|
133: "\u2026", # horizontal ellipsis
|
|
|
|
134: "\u2020", # dagger
|
|
|
|
135: "\u2021", # double dagger
|
|
|
|
136: "\u02c6", # modifier letter circumflex accent
|
|
|
|
137: "\u2030", # per mille sign
|
|
|
|
138: "\u0160", # latin capital letter s with caron
|
|
|
|
139: "\u2039", # single left-pointing angle quotation mark
|
|
|
|
140: "\u0152", # latin capital ligature oe
|
|
|
|
142: "\u017d", # latin capital letter z with caron
|
|
|
|
145: "\u2018", # left single quotation mark
|
|
|
|
146: "\u2019", # right single quotation mark
|
|
|
|
147: "\u201c", # left double quotation mark
|
|
|
|
148: "\u201d", # right double quotation mark
|
|
|
|
149: "\u2022", # bullet
|
|
|
|
150: "\u2013", # en dash
|
|
|
|
151: "\u2014", # em dash
|
|
|
|
152: "\u02dc", # small tilde
|
|
|
|
153: "\u2122", # trade mark sign
|
|
|
|
154: "\u0161", # latin small letter s with caron
|
|
|
|
155: "\u203a", # single right-pointing angle quotation mark
|
|
|
|
156: "\u0153", # latin small ligature oe
|
|
|
|
158: "\u017e", # latin small letter z with caron
|
|
|
|
159: "\u0178", # latin capital letter y with diaeresis
|
2023-01-12 01:04:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2023-01-13 20:16:45 +00:00
|
|
|
class BaseHTMLProcessor(sgmllib.SGMLParser):
    """Parse markup with sgmllib and rebuild it piece by piece.

    Every handler appends the text it reconstructs for one construct
    (tag, reference, comment, ...) to ``self.pieces``; :meth:`output`
    joins the pieces into the final string.
    """

    # Characters that are significant in markup.
    special = re.compile("""[<>'"]""")
    # An ampersand that does not start a decimal, hexadecimal, or named
    # reference terminated by a semicolon.
    bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    # Elements that are always serialized as "<tag />" and never get a
    # separate end tag.
    elements_no_end_tag = {
        "area",
        "base",
        "basefont",
        "br",
        "col",
        "command",
        "embed",
        "frame",
        "hr",
        "img",
        "input",
        "isindex",
        "keygen",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    }

    def __init__(self, encoding=None, _type="application/xhtml+xml"):
        """Initialize the processor.

        :param encoding: stored as ``self.encoding`` only when truthy
        :param _type: content type; ``parse_starttag`` treats
            self-closing tags specially for "application/xhtml+xml"
        """

        if encoding:
            self.encoding = encoding
        self._type = _type
        self.pieces = []
        super().__init__()

    def reset(self):
        """Discard all collected pieces and reset the underlying parser."""

        self.pieces = []
        super().reset()

    def _shorttag_replace(self, match):
        """Expand a self-closing tag matched by the regex used in ``feed``.

        Tags listed in ``elements_no_end_tag`` are normalized to
        ``<tag />``; any other tag becomes an explicit open/close pair.

        :type match: Match[str]
        :rtype: str
        """

        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return f"<{tag} />"
        return f"<{tag}></{tag}>"

    # By declaring these methods and overriding their compiled code
    # with the code from sgmllib, the original code will execute in
    # feedparser's scope instead of sgmllib's. This means that the
    # `tagfind` and `charref` regular expressions will be found as
    # they're declared above, not as they're declared in sgmllib.
    def goahead(self, i):
        raise NotImplementedError

    # Replace goahead with SGMLParser's goahead() code object.
    goahead.__code__ = sgmllib.SGMLParser.goahead.__code__

    def __parse_starttag(self, i):
        raise NotImplementedError

    # Replace __parse_starttag with SGMLParser's parse_starttag() code object.
    __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__

    def parse_starttag(self, i):
        """Parse a start tag and synthesize the end tag of ``<tag/>`` forms.

        Runs sgmllib's start-tag parser, then, for XHTML content only,
        reports an end tag when the parsed text ended in ``/>``.

        :type i: int
        :rtype: int
        """

        j = self.__parse_starttag(i)
        if self._type == "application/xhtml+xml":
            if j > 2 and self.rawdata[j - 2 : j] == "/>":
                self.unknown_endtag(self.lasttag)
        return j

    def feed(self, data):
        """Preprocess *data*, run it through the parser, and close the parser.

        :type data: str
        :rtype: None
        """

        # Escape "<!" unless it begins a DOCTYPE, a comment, or a marked
        # section, so that it reaches the output as text.
        data = re.sub(r"<!((?!DOCTYPE|--|\[))", r"&lt;!\1", data, flags=re.IGNORECASE)
        # Expand self-closing tags before parsing.
        data = re.sub(r"<([^<>\s]+?)\s*/>", self._shorttag_replace, data)
        # Decode numeric references for the two quote characters.
        data = data.replace("&#39;", "'")
        data = data.replace("&#34;", '"')
        super().feed(data)
        super().close()

    @staticmethod
    def normalize_attrs(attrs):
        """Lowercase attribute names, drop duplicates, and sort the result.

        Utility method to be called by descendants. When a name occurs
        more than once (compared case-insensitively) the last value wins.
        The values of ``rel`` and ``type`` are lowercased as well.

        :type attrs: List[Tuple[str, str]]
        :rtype: List[Tuple[str, str]]
        """

        if not attrs:
            return attrs
        # Collapse any duplicate attribute names and values by converting
        # *attrs* into a dictionary, then convert it back to a list.
        attrs_d = {k.lower(): v for k, v in attrs}
        attrs = [
            (k, v.lower() if k in ("rel", "type") else v)
            for k, v in attrs_d.items()
        ]
        attrs.sort()
        return attrs

    def unknown_starttag(self, tag, attrs):
        """Reconstruct a start tag with escaped attribute values.

        Called for each start tag. *attrs* is a list of (attr, value)
        tuples, e.g. for <pre class='screen'>, tag='pre' and
        attrs=[('class', 'screen')].

        :type tag: str
        :type attrs: List[Tuple[str, str]]
        :rtype: None
        """

        strattrs = ""
        if attrs:
            escaped = []
            for key, value in attrs:
                # Values are re-emitted inside double quotes, so angle
                # brackets, double quotes, and stray ampersands must be
                # escaped; existing references are left alone.
                value = value.replace(">", "&gt;")
                value = value.replace("<", "&lt;")
                value = value.replace('"', "&quot;")
                value = self.bare_ampersand.sub("&amp;", value)
                escaped.append((key, value))
            strattrs = "".join(f' {key}="{value}"' for key, value in escaped)
        if tag in self.elements_no_end_tag:
            self.pieces.append(f"<{tag}{strattrs} />")
        else:
            self.pieces.append(f"<{tag}{strattrs}>")

    def unknown_endtag(self, tag):
        """Reconstruct an end tag, e.g. for </pre>, *tag* will be 'pre'.

        Nothing is emitted for elements in ``elements_no_end_tag``
        because their start tag is already written as ``<tag />``.

        :type tag: str
        :rtype: None
        """

        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%s>" % tag)

    def handle_charref(self, ref):
        """Reconstruct a numeric character reference.

        Called for each character reference, e.g. '&#160;' will extract
        '160'. References whose value is a key of ``_cp1252`` are
        rewritten as a hexadecimal reference to the mapped character;
        all others are re-emitted in lowercase.

        :type ref: str
        :rtype: None
        """

        ref = ref.lower()
        if ref.startswith("x"):
            value = int(ref[1:], 16)
        else:
            value = int(ref)

        if value in _cp1252:
            # hex() yields e.g. "0x20ac"; dropping the leading "0" leaves
            # "x20ac", which forms the reference "&#x20ac;".
            self.pieces.append("&#%s;" % hex(ord(_cp1252[value]))[1:])
        else:
            self.pieces.append("&#%s;" % ref)

    def handle_entityref(self, ref):
        """Reconstruct a named entity reference.

        Called for each entity reference, e.g. '&copy;' will extract
        'copy'. Known entities (and 'apos') are re-emitted unchanged.
        For an unknown name the ampersand is escaped so the text is
        output literally instead of as a reference.

        :type ref: str
        :rtype: None
        """

        if ref in html.entities.name2codepoint or ref == "apos":
            self.pieces.append("&%s;" % ref)
        else:
            self.pieces.append("&amp;%s" % ref)

    def handle_data(self, text):
        """Store a block of plain text verbatim.

        Called for each block of plain text, i.e. outside of any tag and
        not containing any character or entity references.

        :type text: str
        :rtype: None
        """

        self.pieces.append(text)

    def handle_comment(self, text):
        """Reconstruct a comment, e.g. <!-- insert Javascript code here -->.

        :type text: str
        :rtype: None
        """

        self.pieces.append("<!--%s-->" % text)

    def handle_pi(self, text):
        """Reconstruct a processing instruction, e.g. <?instruction>.

        :type text: str
        :rtype: None
        """

        self.pieces.append("<?%s>" % text)

    def handle_decl(self, text):
        """Reconstruct a declaration such as the DOCTYPE, if present, e.g.

        <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
            "http://www.w3.org/TR/html4/loose.dtd">

        :type text: str
        :rtype: None
        """

        self.pieces.append("<!%s>" % text)

    # Matches a declaration name plus any trailing whitespace.
    _new_declname_match = re.compile(r"[a-zA-Z][-_.a-zA-Z0-9:]*\s*").match

    def _scan_name(self, i, declstartpos):
        """Scan a declaration name starting at index *i* of ``self.rawdata``.

        Returns ``(name, end)`` with the lowercased name and the index
        just past it (including trailing whitespace). Returns
        ``(None, -1)`` when *i* is at the end of the buffer, when the
        name runs to the end of the buffer, or when no name is found; in
        the last case the whole of ``self.rawdata`` is passed to
        ``handle_data`` first. *declstartpos* is not used.

        :type i: int
        :type declstartpos: int
        :rtype: Tuple[Optional[str], int]
        """

        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
            # self.updatepos(declstartpos, i)
            return None, -1

    def convert_charref(self, name):
        """Return the numeric reference ``&#name;`` as text.

        NOTE(review): presumably overrides sgmllib's conversion so that
        references are kept encoded rather than decoded -- confirm
        against the sgmllib implementation in use.

        :type name: str
        :rtype: str
        """

        return "&#%s;" % name

    def convert_entityref(self, name):
        """Return the entity reference ``&name;`` as text.

        NOTE(review): presumably overrides sgmllib's conversion so that
        references are kept encoded rather than decoded -- confirm
        against the sgmllib implementation in use.

        :type name: str
        :rtype: str
        """

        return "&%s;" % name

    def output(self):
        """Return processed HTML as a single string.

        :rtype: str
        """

        return "".join(self.pieces)

    def parse_declaration(self, i):
        """Parse a declaration, tolerating malformed input.

        Delegates to sgmllib. If that raises ``AssertionError`` or
        ``SGMLParseError``, the opening "<" is emitted as escaped text
        and parsing resumes at the following character.

        :type i: int
        :rtype: int
        """

        try:
            return sgmllib.SGMLParser.parse_declaration(self, i)
        except (AssertionError, sgmllib.SGMLParseError):
            # Escape the doctype declaration and continue parsing.
            self.handle_data("&lt;")
            return i + 1
|