Merge branch 'feature/UpdateHtml5lib' into dev

2024-12-18 08:43:37 +00:00 · 2023-03-06 23:56:21 +00:00 · 2023-03-06 23:56:21 +00:00 · 8239168a5a
commit 8239168a5a
parent a0d379595c e08baa4f0b
11 changed files with 49 additions and 25 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -1,5 +1,6 @@
 ### 3.28.0 (2023-xx-xx xx:xx:00 UTC)
 * Update html5lib 1.1 (f87487a) to 1.2-dev (3e500bb)
 * Update package resource API 63.2.0 (3ae44cd) to 67.3.2 (b9bf2ec)
 * Change remove calls to legacy py2 fix encoding function
 * Change requirements for pure py3
--- a/lib/html5lib/__init__.py
+++ b/lib/html5lib/__init__.py
@ -32,4 +32,4 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
 # this has to be at the top level, see how setup.py parses this
 #: Distribution version number.
-__version__ = "1.1"
+__version__ = "1.2-dev"
--- a/lib/html5lib/_ihatexml.py
+++ b/lib/html5lib/_ihatexml.py
@ -104,18 +104,15 @@ def charStringToList(chars):
    charRanges = [item.strip() for item in chars.split(" | ")]
    rv = []
    for item in charRanges:
        foundMatch = False
        for regexp in (reChar, reCharRange):
            match = regexp.match(item)
            if match is not None:
                rv.append([hexToInt(item) for item in match.groups()])
                if len(rv[-1]) == 1:
                    rv[-1] = rv[-1] * 2
                foundMatch = True
                break
-        if not foundMatch:
+        else:
            assert len(item) == 1
            rv.append([ord(item)] * 2)
    rv = normaliseCharList(rv)
    return rv
--- a/lib/html5lib/_inputstream.py
+++ b/lib/html5lib/_inputstream.py
@ -324,7 +324,7 @@ class HTMLUnicodeInputStream(object):
        except KeyError:
            if __debug__:
                for c in characters:
-                    assert(ord(c) < 128)
+                    assert ord(c) < 128
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = "^%s" % regex
--- a/lib/html5lib/constants.py
+++ b/lib/html5lib/constants.py
@ -557,23 +557,36 @@ headingElements = (
 )
 voidElements = frozenset([
    "area",
    "base",
-    "command",
+    "br",
-    "event-source",
+    "col",
    "command",  # removed ^1
    "embed",
    "event-source",  # renamed and later removed ^2
    "hr",
    "img",
    "input",
    "link",
    "meta",
-    "hr",
+    "param",  # deprecated ^3
    "br",
    "img",
    "embed",
    "param",
    "area",
    "col",
    "input",
    "source",
-    "track"
+    "track",
    "wbr",
 ])
 # Removals and deprecations in the HTML 5 spec:
 # ^1: command
 #     http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2012-December/038472.html
 #     https://github.com/whatwg/html/commit/9e2e25f4ae90969a7c64e0763c98548a35b50af8
 # ^2: event-source
 #     renamed to eventsource in 7/2008:
 #     https://github.com/whatwg/html/commit/d157945d0285b4463a04b57318da0c4b300a99e7
 #     removed entirely in 2/2009:
 #     https://github.com/whatwg/html/commit/43cbdbfbb7eb74b0d65e0f4caab2020c0b2a16ff
 # ^3: param
 #     https://developer.mozilla.org/en-US/docs/Web/HTML/Element/param
 cdataElements = frozenset(['title', 'textarea'])
 rcdataElements = frozenset([
@ -604,6 +617,7 @@ booleanAttributes = {
    "button": frozenset(["disabled", "autofocus"]),
    "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
    "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
    "ol": frozenset(["reversed"]),
    "output": frozenset(["disabled", "readonly"]),
    "iframe": frozenset(["seamless"]),
 }
--- a/lib/html5lib/filters/sanitizer.py
+++ b/lib/html5lib/filters/sanitizer.py
@ -113,6 +113,7 @@ allowed_elements = frozenset((
    (namespaces['html'], 'strike'),
    (namespaces['html'], 'strong'),
    (namespaces['html'], 'sub'),
    (namespaces['html'], 'summary'),
    (namespaces['html'], 'sup'),
    (namespaces['html'], 'table'),
    (namespaces['html'], 'tbody'),
@ -128,6 +129,7 @@ allowed_elements = frozenset((
    (namespaces['html'], 'ul'),
    (namespaces['html'], 'var'),
    (namespaces['html'], 'video'),
    (namespaces['html'], 'wbr'),
    (namespaces['mathml'], 'maction'),
    (namespaces['mathml'], 'math'),
    (namespaces['mathml'], 'merror'),
@ -363,6 +365,7 @@ allowed_attributes = frozenset((
    (None, 'maxsize'),
    (None, 'minsize'),
    (None, 'other'),
    (None, 'reversed'),
    (None, 'rowalign'),
    (None, 'rowalign'),
    (None, 'rowalign'),
@ -373,6 +376,7 @@ allowed_attributes = frozenset((
    (None, 'scriptlevel'),
    (None, 'selection'),
    (None, 'separator'),
    (None, 'start'),
    (None, 'stretchy'),
    (None, 'width'),
    (None, 'width'),
@ -594,6 +598,10 @@ allowed_css_properties = frozenset((
    'height',
    'letter-spacing',
    'line-height',
    'max-height',
    'min-height',
    'max-width',
    'min-width',
    'overflow',
    'pause',
    'pause-after',
--- a/lib/html5lib/html5parser.py
+++ b/lib/html5lib/html5parser.py
@ -115,6 +115,9 @@ class HTMLParser(object):
        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        elif isinstance(tree, str):
            tree = treebuilders.getTreeBuilder(tree)
        self.tree = tree(namespaceHTMLElements)
        self.errors = []
@ -1002,8 +1005,8 @@ def getPhases(debug):
            self.tree.insertText(token["data"])
            # This must be bad for performance
            if (self.parser.framesetOK and
-                any([char not in spaceCharacters
+                any(char not in spaceCharacters
-                     for char in token["data"]])):
+                    for char in token["data"])):
                self.parser.framesetOK = False
        def processSpaceCharactersNonPre(self, token):
@ -1850,7 +1853,7 @@ def getPhases(debug):
        def flushCharacters(self):
            data = "".join([item["data"] for item in self.characterTokens])
-            if any([item not in spaceCharacters for item in data]):
+            if any(item not in spaceCharacters for item in data):
                token = {"type": tokenTypes["Characters"], "data": data}
                self.parser.phases["inTable"].insertText(token)
            elif data:
--- a/lib/html5lib/serializer.py
+++ b/lib/html5lib/serializer.py
@ -222,14 +222,14 @@ class HTMLSerializer(object):
        self.strict = False
    def encode(self, string):
-        assert(isinstance(string, text_type))
+        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        else:
            return string
    def encodeStrict(self, string):
-        assert(isinstance(string, text_type))
+        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
--- a/lib/html5lib/treebuilders/base.py
+++ b/lib/html5lib/treebuilders/base.py
@ -121,6 +121,7 @@ class Node(object):
 class ActiveFormattingElements(list):
    def append(self, node):
        """Append node to the end of the list."""
        equalCount = 0
        if node != Marker:
            for element in self[::-1]:
--- a/lib/html5lib/treebuilders/etree.py
+++ b/lib/html5lib/treebuilders/etree.py
@ -108,7 +108,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
            node.parent = None
        def insertText(self, data, insertBefore=None):
-            if not(len(self._element)):
+            if not len(self._element):
                if not self._element.text:
                    self._element.text = ""
                self._element.text += data
@ -201,7 +201,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
        rv = []
        def serializeElement(element, indent=0):
-            if not(hasattr(element, "tag")):
+            if not hasattr(element, "tag"):
                element = element.getroot()
            if element.tag == "<!DOCTYPE>":
                if element.get("publicId") or element.get("systemId"):
--- a/lib/html5lib/treewalkers/etree.py
+++ b/lib/html5lib/treewalkers/etree.py
@ -37,7 +37,7 @@ def getETreeBuilder(ElementTreeImplementation):
                else:
                    node = elt
-            if not(hasattr(node, "tag")):
+            if not hasattr(node, "tag"):
                node = node.getroot()
            if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):