Update html5lib 1.1 (f87487a) → 1.2-dev (3e500bb).

This commit is contained in:
JackDandy 2023-03-06 19:54:59 +00:00
parent a0d379595c
commit e08baa4f0b
11 changed files with 49 additions and 25 deletions

View file

@@ -1,5 +1,6 @@
### 3.28.0 (2023-xx-xx xx:xx:00 UTC)
* Update html5lib 1.1 (f87487a) to 1.2-dev (3e500bb)
* Update package resource API 63.2.0 (3ae44cd) to 67.3.2 (b9bf2ec)
* Change remove calls to the legacy py2 fix encoding function
* Change requirements for pure py3

View file

@@ -32,4 +32,4 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
# this has to be at the top level, see how setup.py parses this
#: Distribution version number.
__version__ = "1.1"
__version__ = "1.2-dev"

View file

@@ -104,18 +104,15 @@ def charStringToList(chars):
charRanges = [item.strip() for item in chars.split(" | ")]
rv = []
for item in charRanges:
foundMatch = False
for regexp in (reChar, reCharRange):
match = regexp.match(item)
if match is not None:
rv.append([hexToInt(item) for item in match.groups()])
if len(rv[-1]) == 1:
rv[-1] = rv[-1] * 2
foundMatch = True
break
if not foundMatch:
else:
assert len(item) == 1
rv.append([ord(item)] * 2)
rv = normaliseCharList(rv)
return rv

View file

@@ -324,7 +324,7 @@ class HTMLUnicodeInputStream(object):
except KeyError:
if __debug__:
for c in characters:
assert(ord(c) < 128)
assert ord(c) < 128
regex = "".join(["\\x%02x" % ord(c) for c in characters])
if not opposite:
regex = "^%s" % regex

View file

@@ -557,23 +557,36 @@ headingElements = (
)
voidElements = frozenset([
"area",
"base",
"command",
"event-source",
"br",
"col",
"command", # removed ^1
"embed",
"event-source", # renamed and later removed ^2
"hr",
"img",
"input",
"link",
"meta",
"hr",
"br",
"img",
"embed",
"param",
"area",
"col",
"input",
"param", # deprecated ^3
"source",
"track"
"track",
"wbr",
])
# Removals and deprecations in the HTML 5 spec:
# ^1: command
# http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2012-December/038472.html
# https://github.com/whatwg/html/commit/9e2e25f4ae90969a7c64e0763c98548a35b50af8
# ^2: event-source
# renamed to eventsource in 7/2008:
# https://github.com/whatwg/html/commit/d157945d0285b4463a04b57318da0c4b300a99e7
# removed entirely in 2/2009:
# https://github.com/whatwg/html/commit/43cbdbfbb7eb74b0d65e0f4caab2020c0b2a16ff
# ^3: param
# https://developer.mozilla.org/en-US/docs/Web/HTML/Element/param
cdataElements = frozenset(['title', 'textarea'])
rcdataElements = frozenset([
@@ -604,6 +617,7 @@ booleanAttributes = {
"button": frozenset(["disabled", "autofocus"]),
"input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
"select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
"ol": frozenset(["reversed"]),
"output": frozenset(["disabled", "readonly"]),
"iframe": frozenset(["seamless"]),
}

View file

@@ -113,6 +113,7 @@ allowed_elements = frozenset((
(namespaces['html'], 'strike'),
(namespaces['html'], 'strong'),
(namespaces['html'], 'sub'),
(namespaces['html'], 'summary'),
(namespaces['html'], 'sup'),
(namespaces['html'], 'table'),
(namespaces['html'], 'tbody'),
@@ -128,6 +129,7 @@ allowed_elements = frozenset((
(namespaces['html'], 'ul'),
(namespaces['html'], 'var'),
(namespaces['html'], 'video'),
(namespaces['html'], 'wbr'),
(namespaces['mathml'], 'maction'),
(namespaces['mathml'], 'math'),
(namespaces['mathml'], 'merror'),
@@ -363,6 +365,7 @@ allowed_attributes = frozenset((
(None, 'maxsize'),
(None, 'minsize'),
(None, 'other'),
(None, 'reversed'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowalign'),
@@ -373,6 +376,7 @@ allowed_attributes = frozenset((
(None, 'scriptlevel'),
(None, 'selection'),
(None, 'separator'),
(None, 'start'),
(None, 'stretchy'),
(None, 'width'),
(None, 'width'),
@@ -594,6 +598,10 @@ allowed_css_properties = frozenset((
'height',
'letter-spacing',
'line-height',
'max-height',
'min-height',
'max-width',
'min-width',
'overflow',
'pause',
'pause-after',

View file

@@ -115,6 +115,9 @@ class HTMLParser(object):
if tree is None:
tree = treebuilders.getTreeBuilder("etree")
elif isinstance(tree, str):
tree = treebuilders.getTreeBuilder(tree)
self.tree = tree(namespaceHTMLElements)
self.errors = []
@@ -1002,8 +1005,8 @@ def getPhases(debug):
self.tree.insertText(token["data"])
# This must be bad for performance
if (self.parser.framesetOK and
any([char not in spaceCharacters
for char in token["data"]])):
any(char not in spaceCharacters
for char in token["data"])):
self.parser.framesetOK = False
def processSpaceCharactersNonPre(self, token):
@@ -1850,7 +1853,7 @@ def getPhases(debug):
def flushCharacters(self):
data = "".join([item["data"] for item in self.characterTokens])
if any([item not in spaceCharacters for item in data]):
if any(item not in spaceCharacters for item in data):
token = {"type": tokenTypes["Characters"], "data": data}
self.parser.phases["inTable"].insertText(token)
elif data:

View file

@@ -222,14 +222,14 @@ class HTMLSerializer(object):
self.strict = False
def encode(self, string):
assert(isinstance(string, text_type))
assert isinstance(string, text_type)
if self.encoding:
return string.encode(self.encoding, "htmlentityreplace")
else:
return string
def encodeStrict(self, string):
assert(isinstance(string, text_type))
assert isinstance(string, text_type)
if self.encoding:
return string.encode(self.encoding, "strict")
else:

View file

@@ -121,6 +121,7 @@ class Node(object):
class ActiveFormattingElements(list):
def append(self, node):
"""Append node to the end of the list."""
equalCount = 0
if node != Marker:
for element in self[::-1]:

View file

@@ -108,7 +108,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
node.parent = None
def insertText(self, data, insertBefore=None):
if not(len(self._element)):
if not len(self._element):
if not self._element.text:
self._element.text = ""
self._element.text += data
@@ -201,7 +201,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
rv = []
def serializeElement(element, indent=0):
if not(hasattr(element, "tag")):
if not hasattr(element, "tag"):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"):

View file

@@ -37,7 +37,7 @@ def getETreeBuilder(ElementTreeImplementation):
else:
node = elt
if not(hasattr(node, "tag")):
if not hasattr(node, "tag"):
node = node.getroot()
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):