diff --git a/CHANGES.md b/CHANGES.md index e5107f87..75888e5b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ ### 3.28.0 (2023-xx-xx xx:xx:00 UTC) +* Update html5lib 1.1 (f87487a) to 1.2-dev (3e500bb) * Update package resource API 63.2.0 (3ae44cd) to 67.3.2 (b9bf2ec) * Change remove calls to legacy py2 fix encoding function * Change requirements for pure py3 diff --git a/lib/html5lib/__init__.py b/lib/html5lib/__init__.py index 320e0c3b..7b854f99 100644 --- a/lib/html5lib/__init__.py +++ b/lib/html5lib/__init__.py @@ -32,4 +32,4 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder", # this has to be at the top level, see how setup.py parses this #: Distribution version number. -__version__ = "1.1" +__version__ = "1.2-dev" diff --git a/lib/html5lib/_ihatexml.py b/lib/html5lib/_ihatexml.py index 3ff803c1..d725eabd 100644 --- a/lib/html5lib/_ihatexml.py +++ b/lib/html5lib/_ihatexml.py @@ -104,18 +104,15 @@ def charStringToList(chars): charRanges = [item.strip() for item in chars.split(" | ")] rv = [] for item in charRanges: - foundMatch = False for regexp in (reChar, reCharRange): match = regexp.match(item) if match is not None: rv.append([hexToInt(item) for item in match.groups()]) if len(rv[-1]) == 1: rv[-1] = rv[-1] * 2 - foundMatch = True break - if not foundMatch: + else: assert len(item) == 1 - rv.append([ord(item)] * 2) rv = normaliseCharList(rv) return rv diff --git a/lib/html5lib/_inputstream.py b/lib/html5lib/_inputstream.py index 0207dd21..a93b5a4e 100644 --- a/lib/html5lib/_inputstream.py +++ b/lib/html5lib/_inputstream.py @@ -324,7 +324,7 @@ class HTMLUnicodeInputStream(object): except KeyError: if __debug__: for c in characters: - assert(ord(c) < 128) + assert ord(c) < 128 regex = "".join(["\\x%02x" % ord(c) for c in characters]) if not opposite: regex = "^%s" % regex diff --git a/lib/html5lib/constants.py b/lib/html5lib/constants.py index fe3e237c..2fa4146d 100644 --- a/lib/html5lib/constants.py +++ b/lib/html5lib/constants.py @@ -557,23 +557,36 @@ headingElements = ( ) voidElements = frozenset([ + "area", "base", - "command", - "event-source", + "br", + "col", + "command", # removed ^1 + "embed", + "event-source", # renamed and later removed ^2 + "hr", + "img", + "input", "link", "meta", - "hr", - "br", - "img", - "embed", - "param", - "area", - "col", - "input", + "param", # deprecated ^3 "source", - "track" + "track", + "wbr", ]) +# Removals and deprecations in the HTML 5 spec: +# ^1: command +# http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2012-December/038472.html +# https://github.com/whatwg/html/commit/9e2e25f4ae90969a7c64e0763c98548a35b50af8 +# ^2: event-source +# renamed to eventsource in 7/2008: +# https://github.com/whatwg/html/commit/d157945d0285b4463a04b57318da0c4b300a99e7 +# removed entirely in 2/2009: +# https://github.com/whatwg/html/commit/43cbdbfbb7eb74b0d65e0f4caab2020c0b2a16ff +# ^3: param +# https://developer.mozilla.org/en-US/docs/Web/HTML/Element/param + cdataElements = frozenset(['title', 'textarea']) rcdataElements = frozenset([ @@ -604,6 +617,7 @@ booleanAttributes = { "button": frozenset(["disabled", "autofocus"]), "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]), "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]), + "ol": frozenset(["reversed"]), "output": frozenset(["disabled", "readonly"]), "iframe": frozenset(["seamless"]), } diff --git a/lib/html5lib/filters/sanitizer.py b/lib/html5lib/filters/sanitizer.py index 70ef9066..ea2c5dd3 100644 --- a/lib/html5lib/filters/sanitizer.py +++ b/lib/html5lib/filters/sanitizer.py @@ -113,6 +113,7 @@ allowed_elements = frozenset(( (namespaces['html'], 'strike'), (namespaces['html'], 'strong'), (namespaces['html'], 'sub'), + (namespaces['html'], 'summary'), (namespaces['html'], 'sup'), (namespaces['html'], 'table'), (namespaces['html'], 'tbody'), @@ -128,6 +129,7 @@ allowed_elements = frozenset(( (namespaces['html'], 'ul'), (namespaces['html'], 'var'), (namespaces['html'], 'video'), + (namespaces['html'], 'wbr'), (namespaces['mathml'], 'maction'), (namespaces['mathml'], 'math'), (namespaces['mathml'], 'merror'), @@ -363,6 +365,7 @@ allowed_attributes = frozenset(( (None, 'maxsize'), (None, 'minsize'), (None, 'other'), + (None, 'reversed'), (None, 'rowalign'), (None, 'rowalign'), (None, 'rowalign'), @@ -373,6 +376,7 @@ allowed_attributes = frozenset(( (None, 'scriptlevel'), (None, 'selection'), (None, 'separator'), + (None, 'start'), (None, 'stretchy'), (None, 'width'), (None, 'width'), @@ -594,6 +598,10 @@ allowed_css_properties = frozenset(( 'height', 'letter-spacing', 'line-height', + 'max-height', + 'min-height', + 'max-width', + 'min-width', 'overflow', 'pause', 'pause-after', diff --git a/lib/html5lib/html5parser.py b/lib/html5lib/html5parser.py index 74d829d9..4c2d4c75 100644 --- a/lib/html5lib/html5parser.py +++ b/lib/html5lib/html5parser.py @@ -115,6 +115,9 @@ class HTMLParser(object): if tree is None: tree = treebuilders.getTreeBuilder("etree") + elif isinstance(tree, str): + tree = treebuilders.getTreeBuilder(tree) + self.tree = tree(namespaceHTMLElements) self.errors = [] @@ -1002,8 +1005,8 @@ def getPhases(debug): self.tree.insertText(token["data"]) # This must be bad for performance if (self.parser.framesetOK and - any([char not in spaceCharacters - for char in token["data"]])): + any(char not in spaceCharacters + for char in token["data"])): self.parser.framesetOK = False def processSpaceCharactersNonPre(self, token): @@ -1850,7 +1853,7 @@ def getPhases(debug): def flushCharacters(self): data = "".join([item["data"] for item in self.characterTokens]) - if any([item not in spaceCharacters for item in data]): + if any(item not in spaceCharacters for item in data): token = {"type": tokenTypes["Characters"], "data": data} self.parser.phases["inTable"].insertText(token) elif data: diff --git a/lib/html5lib/serializer.py b/lib/html5lib/serializer.py index c66df683..a171ac1c 100644 --- a/lib/html5lib/serializer.py +++ b/lib/html5lib/serializer.py @@ -222,14 +222,14 @@ class HTMLSerializer(object): self.strict = False def encode(self, string): - assert(isinstance(string, text_type)) + assert isinstance(string, text_type) if self.encoding: return string.encode(self.encoding, "htmlentityreplace") else: return string def encodeStrict(self, string): - assert(isinstance(string, text_type)) + assert isinstance(string, text_type) if self.encoding: return string.encode(self.encoding, "strict") else: diff --git a/lib/html5lib/treebuilders/base.py b/lib/html5lib/treebuilders/base.py index e4a3d710..020d7e15 100644 --- a/lib/html5lib/treebuilders/base.py +++ b/lib/html5lib/treebuilders/base.py @@ -121,6 +121,7 @@ class Node(object): class ActiveFormattingElements(list): def append(self, node): + """Append node to the end of the list.""" equalCount = 0 if node != Marker: for element in self[::-1]: diff --git a/lib/html5lib/treebuilders/etree.py b/lib/html5lib/treebuilders/etree.py index 086bed4e..0b745081 100644 --- a/lib/html5lib/treebuilders/etree.py +++ b/lib/html5lib/treebuilders/etree.py @@ -108,7 +108,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False): node.parent = None def insertText(self, data, insertBefore=None): - if not(len(self._element)): + if not len(self._element): if not self._element.text: self._element.text = "" self._element.text += data @@ -201,7 +201,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False): rv = [] def serializeElement(element, indent=0): - if not(hasattr(element, "tag")): + if not hasattr(element, "tag"): element = element.getroot() if element.tag == "": if element.get("publicId") or element.get("systemId"): diff --git a/lib/html5lib/treewalkers/etree.py b/lib/html5lib/treewalkers/etree.py index 44653372..411a1d45 100644 --- a/lib/html5lib/treewalkers/etree.py +++ b/lib/html5lib/treewalkers/etree.py @@ -37,7 +37,7 @@ def getETreeBuilder(ElementTreeImplementation): else: node = elt - if not(hasattr(node, "tag")): + if not hasattr(node, "tag"): node = node.getroot() if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):