Merge branch 'feature/UpdateHtml5lib' into dev

This commit is contained in:
JackDandy 2023-03-06 23:56:21 +00:00
commit 8239168a5a
11 changed files with 49 additions and 25 deletions

View file

@ -1,5 +1,6 @@
### 3.28.0 (2023-xx-xx xx:xx:00 UTC) ### 3.28.0 (2023-xx-xx xx:xx:00 UTC)
* Update html5lib 1.1 (f87487a) to 1.2-dev (3e500bb)
* Update package resource API 63.2.0 (3ae44cd) to 67.3.2 (b9bf2ec) * Update package resource API 63.2.0 (3ae44cd) to 67.3.2 (b9bf2ec)
* Change remove calls to legacy py2 fix encoding function * Change remove calls to legacy py2 fix encoding function
* Change requirements for pure py3 * Change requirements for pure py3

View file

@ -32,4 +32,4 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
# this has to be at the top level, see how setup.py parses this # this has to be at the top level, see how setup.py parses this
#: Distribution version number. #: Distribution version number.
__version__ = "1.1" __version__ = "1.2-dev"

View file

@ -104,18 +104,15 @@ def charStringToList(chars):
charRanges = [item.strip() for item in chars.split(" | ")] charRanges = [item.strip() for item in chars.split(" | ")]
rv = [] rv = []
for item in charRanges: for item in charRanges:
foundMatch = False
for regexp in (reChar, reCharRange): for regexp in (reChar, reCharRange):
match = regexp.match(item) match = regexp.match(item)
if match is not None: if match is not None:
rv.append([hexToInt(item) for item in match.groups()]) rv.append([hexToInt(item) for item in match.groups()])
if len(rv[-1]) == 1: if len(rv[-1]) == 1:
rv[-1] = rv[-1] * 2 rv[-1] = rv[-1] * 2
foundMatch = True
break break
if not foundMatch: else:
assert len(item) == 1 assert len(item) == 1
rv.append([ord(item)] * 2) rv.append([ord(item)] * 2)
rv = normaliseCharList(rv) rv = normaliseCharList(rv)
return rv return rv

View file

@ -324,7 +324,7 @@ class HTMLUnicodeInputStream(object):
except KeyError: except KeyError:
if __debug__: if __debug__:
for c in characters: for c in characters:
assert(ord(c) < 128) assert ord(c) < 128
regex = "".join(["\\x%02x" % ord(c) for c in characters]) regex = "".join(["\\x%02x" % ord(c) for c in characters])
if not opposite: if not opposite:
regex = "^%s" % regex regex = "^%s" % regex

View file

@ -557,23 +557,36 @@ headingElements = (
) )
voidElements = frozenset([ voidElements = frozenset([
"area",
"base", "base",
"command", "br",
"event-source", "col",
"command", # removed ^1
"embed",
"event-source", # renamed and later removed ^2
"hr",
"img",
"input",
"link", "link",
"meta", "meta",
"hr", "param", # deprecated ^3
"br",
"img",
"embed",
"param",
"area",
"col",
"input",
"source", "source",
"track" "track",
"wbr",
]) ])
# Removals and deprecations in the HTML 5 spec:
# ^1: command
# http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2012-December/038472.html
# https://github.com/whatwg/html/commit/9e2e25f4ae90969a7c64e0763c98548a35b50af8
# ^2: event-source
# renamed to eventsource in 7/2008:
# https://github.com/whatwg/html/commit/d157945d0285b4463a04b57318da0c4b300a99e7
# removed entirely in 2/2009:
# https://github.com/whatwg/html/commit/43cbdbfbb7eb74b0d65e0f4caab2020c0b2a16ff
# ^3: param
# https://developer.mozilla.org/en-US/docs/Web/HTML/Element/param
cdataElements = frozenset(['title', 'textarea']) cdataElements = frozenset(['title', 'textarea'])
rcdataElements = frozenset([ rcdataElements = frozenset([
@ -604,6 +617,7 @@ booleanAttributes = {
"button": frozenset(["disabled", "autofocus"]), "button": frozenset(["disabled", "autofocus"]),
"input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]), "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
"select": frozenset(["disabled", "readonly", "autofocus", "multiple"]), "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
"ol": frozenset(["reversed"]),
"output": frozenset(["disabled", "readonly"]), "output": frozenset(["disabled", "readonly"]),
"iframe": frozenset(["seamless"]), "iframe": frozenset(["seamless"]),
} }

View file

@ -113,6 +113,7 @@ allowed_elements = frozenset((
(namespaces['html'], 'strike'), (namespaces['html'], 'strike'),
(namespaces['html'], 'strong'), (namespaces['html'], 'strong'),
(namespaces['html'], 'sub'), (namespaces['html'], 'sub'),
(namespaces['html'], 'summary'),
(namespaces['html'], 'sup'), (namespaces['html'], 'sup'),
(namespaces['html'], 'table'), (namespaces['html'], 'table'),
(namespaces['html'], 'tbody'), (namespaces['html'], 'tbody'),
@ -128,6 +129,7 @@ allowed_elements = frozenset((
(namespaces['html'], 'ul'), (namespaces['html'], 'ul'),
(namespaces['html'], 'var'), (namespaces['html'], 'var'),
(namespaces['html'], 'video'), (namespaces['html'], 'video'),
(namespaces['html'], 'wbr'),
(namespaces['mathml'], 'maction'), (namespaces['mathml'], 'maction'),
(namespaces['mathml'], 'math'), (namespaces['mathml'], 'math'),
(namespaces['mathml'], 'merror'), (namespaces['mathml'], 'merror'),
@ -363,6 +365,7 @@ allowed_attributes = frozenset((
(None, 'maxsize'), (None, 'maxsize'),
(None, 'minsize'), (None, 'minsize'),
(None, 'other'), (None, 'other'),
(None, 'reversed'),
(None, 'rowalign'), (None, 'rowalign'),
(None, 'rowalign'), (None, 'rowalign'),
(None, 'rowalign'), (None, 'rowalign'),
@ -373,6 +376,7 @@ allowed_attributes = frozenset((
(None, 'scriptlevel'), (None, 'scriptlevel'),
(None, 'selection'), (None, 'selection'),
(None, 'separator'), (None, 'separator'),
(None, 'start'),
(None, 'stretchy'), (None, 'stretchy'),
(None, 'width'), (None, 'width'),
(None, 'width'), (None, 'width'),
@ -594,6 +598,10 @@ allowed_css_properties = frozenset((
'height', 'height',
'letter-spacing', 'letter-spacing',
'line-height', 'line-height',
'max-height',
'min-height',
'max-width',
'min-width',
'overflow', 'overflow',
'pause', 'pause',
'pause-after', 'pause-after',

View file

@ -115,6 +115,9 @@ class HTMLParser(object):
if tree is None: if tree is None:
tree = treebuilders.getTreeBuilder("etree") tree = treebuilders.getTreeBuilder("etree")
elif isinstance(tree, str):
tree = treebuilders.getTreeBuilder(tree)
self.tree = tree(namespaceHTMLElements) self.tree = tree(namespaceHTMLElements)
self.errors = [] self.errors = []
@ -1002,8 +1005,8 @@ def getPhases(debug):
self.tree.insertText(token["data"]) self.tree.insertText(token["data"])
# This must be bad for performance # This must be bad for performance
if (self.parser.framesetOK and if (self.parser.framesetOK and
any([char not in spaceCharacters any(char not in spaceCharacters
for char in token["data"]])): for char in token["data"])):
self.parser.framesetOK = False self.parser.framesetOK = False
def processSpaceCharactersNonPre(self, token): def processSpaceCharactersNonPre(self, token):
@ -1850,7 +1853,7 @@ def getPhases(debug):
def flushCharacters(self): def flushCharacters(self):
data = "".join([item["data"] for item in self.characterTokens]) data = "".join([item["data"] for item in self.characterTokens])
if any([item not in spaceCharacters for item in data]): if any(item not in spaceCharacters for item in data):
token = {"type": tokenTypes["Characters"], "data": data} token = {"type": tokenTypes["Characters"], "data": data}
self.parser.phases["inTable"].insertText(token) self.parser.phases["inTable"].insertText(token)
elif data: elif data:

View file

@ -222,14 +222,14 @@ class HTMLSerializer(object):
self.strict = False self.strict = False
def encode(self, string): def encode(self, string):
assert(isinstance(string, text_type)) assert isinstance(string, text_type)
if self.encoding: if self.encoding:
return string.encode(self.encoding, "htmlentityreplace") return string.encode(self.encoding, "htmlentityreplace")
else: else:
return string return string
def encodeStrict(self, string): def encodeStrict(self, string):
assert(isinstance(string, text_type)) assert isinstance(string, text_type)
if self.encoding: if self.encoding:
return string.encode(self.encoding, "strict") return string.encode(self.encoding, "strict")
else: else:

View file

@ -121,6 +121,7 @@ class Node(object):
class ActiveFormattingElements(list): class ActiveFormattingElements(list):
def append(self, node): def append(self, node):
"""Append node to the end of the list."""
equalCount = 0 equalCount = 0
if node != Marker: if node != Marker:
for element in self[::-1]: for element in self[::-1]:

View file

@ -108,7 +108,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
node.parent = None node.parent = None
def insertText(self, data, insertBefore=None): def insertText(self, data, insertBefore=None):
if not(len(self._element)): if not len(self._element):
if not self._element.text: if not self._element.text:
self._element.text = "" self._element.text = ""
self._element.text += data self._element.text += data
@ -201,7 +201,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
rv = [] rv = []
def serializeElement(element, indent=0): def serializeElement(element, indent=0):
if not(hasattr(element, "tag")): if not hasattr(element, "tag"):
element = element.getroot() element = element.getroot()
if element.tag == "<!DOCTYPE>": if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"): if element.get("publicId") or element.get("systemId"):

View file

@ -37,7 +37,7 @@ def getETreeBuilder(ElementTreeImplementation):
else: else:
node = elt node = elt
if not(hasattr(node, "tag")): if not hasattr(node, "tag"):
node = node.getroot() node = node.getroot()
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"): if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):