From 8c026d797738bbb718152cf5b7858a89df7d901b Mon Sep 17 00:00:00 2001 From: JackDandy Date: Tue, 12 Jan 2016 01:17:02 +0000 Subject: [PATCH] Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d). --- CHANGES.md | 1 + lib/html5lib/__init__.py | 4 +- lib/html5lib/constants.py | 388 +++++++++++----------- lib/html5lib/filters/lint.py | 41 ++- lib/html5lib/filters/optionaltags.py | 4 +- lib/html5lib/html5parser.py | 23 +- lib/html5lib/ihatexml.py | 3 + lib/html5lib/inputstream.py | 49 ++- lib/html5lib/sanitizer.py | 43 ++- lib/html5lib/serializer/htmlserializer.py | 19 +- lib/html5lib/treeadapters/__init__.py | 12 + lib/html5lib/treeadapters/genshi.py | 47 +++ lib/html5lib/treebuilders/dom.py | 2 +- lib/html5lib/treebuilders/etree_lxml.py | 13 +- lib/html5lib/treewalkers/__init__.py | 122 ++++++- lib/html5lib/treewalkers/_base.py | 8 +- lib/html5lib/treewalkers/dom.py | 3 - lib/html5lib/treewalkers/etree.py | 7 +- lib/html5lib/treewalkers/lxmletree.py | 11 +- lib/html5lib/treewalkers/pulldom.py | 63 ---- lib/html5lib/utils.py | 41 ++- 21 files changed, 534 insertions(+), 370 deletions(-) create mode 100644 lib/html5lib/treeadapters/genshi.py delete mode 100644 lib/html5lib/treewalkers/pulldom.py diff --git a/CHANGES.md b/CHANGES.md index 0fa591d1..37eb4921 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -13,6 +13,7 @@ * Update chardet packages 2.3.0 (26982c5) to 2.3.0 (d7fae98) * Update dateutil library 2.4.2 (083f666) to 2.4.2 (d4baf97) * Update Hachoir library 1.3.4 (r1383) to 1.3.4 (r1435) +* Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d) ### 0.11.0 (2016-01-10 22:30:00 UTC) diff --git a/lib/html5lib/__init__.py b/lib/html5lib/__init__.py index 19a4b7d6..3f17d83c 100644 --- a/lib/html5lib/__init__.py +++ b/lib/html5lib/__init__.py @@ -20,4 +20,6 @@ from .serializer import serialize __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder", "getTreeWalker", "serialize"] -__version__ = "0.999" + +# this has to be at the top level, see how setup.py parses this +__version__ = "0.99999999-dev" diff --git a/lib/html5lib/constants.py b/lib/html5lib/constants.py index e7089846..d938e0ae 100644 --- a/lib/html5lib/constants.py +++ b/lib/html5lib/constants.py @@ -1,292 +1,290 @@ from __future__ import absolute_import, division, unicode_literals import string -import gettext -_ = gettext.gettext EOF = None E = { "null-character": - _("Null character in input stream, replaced with U+FFFD."), + "Null character in input stream, replaced with U+FFFD.", "invalid-codepoint": - _("Invalid codepoint in stream."), + "Invalid codepoint in stream.", "incorrectly-placed-solidus": - _("Solidus (/) incorrectly placed in tag."), + "Solidus (/) incorrectly placed in tag.", "incorrect-cr-newline-entity": - _("Incorrect CR newline entity, replaced with LF."), + "Incorrect CR newline entity, replaced with LF.", "illegal-windows-1252-entity": - _("Entity used with illegal number (windows-1252 reference)."), + "Entity used with illegal number (windows-1252 reference).", "cant-convert-numeric-entity": - _("Numeric entity couldn't be converted to character " - "(codepoint U+%(charAsInt)08x)."), + "Numeric entity couldn't be converted to character " + "(codepoint U+%(charAsInt)08x).", "illegal-codepoint-for-numeric-entity": - _("Numeric entity represents an illegal codepoint: " - "U+%(charAsInt)08x."), + "Numeric entity represents an illegal codepoint: " + "U+%(charAsInt)08x.", "numeric-entity-without-semicolon": - _("Numeric entity didn't end with ';'."), + "Numeric entity didn't end with ';'.", "expected-numeric-entity-but-got-eof": - _("Numeric entity expected. Got end of file instead."), + "Numeric entity expected. Got end of file instead.", "expected-numeric-entity": - _("Numeric entity expected but none found."), + "Numeric entity expected but none found.", "named-entity-without-semicolon": - _("Named entity didn't end with ';'."), + "Named entity didn't end with ';'.", "expected-named-entity": - _("Named entity expected. Got none."), + "Named entity expected. Got none.", "attributes-in-end-tag": - _("End tag contains unexpected attributes."), + "End tag contains unexpected attributes.", 'self-closing-flag-on-end-tag': - _("End tag contains unexpected self-closing flag."), + "End tag contains unexpected self-closing flag.", "expected-tag-name-but-got-right-bracket": - _("Expected tag name. Got '>' instead."), + "Expected tag name. Got '>' instead.", "expected-tag-name-but-got-question-mark": - _("Expected tag name. Got '?' instead. (HTML doesn't " - "support processing instructions.)"), + "Expected tag name. Got '?' instead. (HTML doesn't " + "support processing instructions.)", "expected-tag-name": - _("Expected tag name. Got something else instead"), + "Expected tag name. Got something else instead", "expected-closing-tag-but-got-right-bracket": - _("Expected closing tag. Got '>' instead. Ignoring ''."), + "Expected closing tag. Got '>' instead. Ignoring ''.", "expected-closing-tag-but-got-eof": - _("Expected closing tag. Unexpected end of file."), + "Expected closing tag. Unexpected end of file.", "expected-closing-tag-but-got-char": - _("Expected closing tag. Unexpected character '%(data)s' found."), + "Expected closing tag. Unexpected character '%(data)s' found.", "eof-in-tag-name": - _("Unexpected end of file in the tag name."), + "Unexpected end of file in the tag name.", "expected-attribute-name-but-got-eof": - _("Unexpected end of file. Expected attribute name instead."), + "Unexpected end of file. Expected attribute name instead.", "eof-in-attribute-name": - _("Unexpected end of file in attribute name."), + "Unexpected end of file in attribute name.", "invalid-character-in-attribute-name": - _("Invalid character in attribute name"), + "Invalid character in attribute name", "duplicate-attribute": - _("Dropped duplicate attribute on tag."), + "Dropped duplicate attribute on tag.", "expected-end-of-tag-name-but-got-eof": - _("Unexpected end of file. Expected = or end of tag."), + "Unexpected end of file. Expected = or end of tag.", "expected-attribute-value-but-got-eof": - _("Unexpected end of file. Expected attribute value."), + "Unexpected end of file. Expected attribute value.", "expected-attribute-value-but-got-right-bracket": - _("Expected attribute value. Got '>' instead."), + "Expected attribute value. Got '>' instead.", 'equals-in-unquoted-attribute-value': - _("Unexpected = in unquoted attribute"), + "Unexpected = in unquoted attribute", 'unexpected-character-in-unquoted-attribute-value': - _("Unexpected character in unquoted attribute"), + "Unexpected character in unquoted attribute", "invalid-character-after-attribute-name": - _("Unexpected character after attribute name."), + "Unexpected character after attribute name.", "unexpected-character-after-attribute-value": - _("Unexpected character after attribute value."), + "Unexpected character after attribute value.", "eof-in-attribute-value-double-quote": - _("Unexpected end of file in attribute value (\")."), + "Unexpected end of file in attribute value (\").", "eof-in-attribute-value-single-quote": - _("Unexpected end of file in attribute value (')."), + "Unexpected end of file in attribute value (').", "eof-in-attribute-value-no-quotes": - _("Unexpected end of file in attribute value."), + "Unexpected end of file in attribute value.", "unexpected-EOF-after-solidus-in-tag": - _("Unexpected end of file in tag. Expected >"), + "Unexpected end of file in tag. Expected >", "unexpected-character-after-solidus-in-tag": - _("Unexpected character after / in tag. Expected >"), + "Unexpected character after / in tag. Expected >", "expected-dashes-or-doctype": - _("Expected '--' or 'DOCTYPE'. Not found."), + "Expected '--' or 'DOCTYPE'. Not found.", "unexpected-bang-after-double-dash-in-comment": - _("Unexpected ! after -- in comment"), + "Unexpected ! after -- in comment", "unexpected-space-after-double-dash-in-comment": - _("Unexpected space after -- in comment"), + "Unexpected space after -- in comment", "incorrect-comment": - _("Incorrect comment."), + "Incorrect comment.", "eof-in-comment": - _("Unexpected end of file in comment."), + "Unexpected end of file in comment.", "eof-in-comment-end-dash": - _("Unexpected end of file in comment (-)"), + "Unexpected end of file in comment (-)", "unexpected-dash-after-double-dash-in-comment": - _("Unexpected '-' after '--' found in comment."), + "Unexpected '-' after '--' found in comment.", "eof-in-comment-double-dash": - _("Unexpected end of file in comment (--)."), + "Unexpected end of file in comment (--).", "eof-in-comment-end-space-state": - _("Unexpected end of file in comment."), + "Unexpected end of file in comment.", "eof-in-comment-end-bang-state": - _("Unexpected end of file in comment."), + "Unexpected end of file in comment.", "unexpected-char-in-comment": - _("Unexpected character in comment found."), + "Unexpected character in comment found.", "need-space-after-doctype": - _("No space after literal string 'DOCTYPE'."), + "No space after literal string 'DOCTYPE'.", "expected-doctype-name-but-got-right-bracket": - _("Unexpected > character. Expected DOCTYPE name."), + "Unexpected > character. Expected DOCTYPE name.", "expected-doctype-name-but-got-eof": - _("Unexpected end of file. Expected DOCTYPE name."), + "Unexpected end of file. Expected DOCTYPE name.", "eof-in-doctype-name": - _("Unexpected end of file in DOCTYPE name."), + "Unexpected end of file in DOCTYPE name.", "eof-in-doctype": - _("Unexpected end of file in DOCTYPE."), + "Unexpected end of file in DOCTYPE.", "expected-space-or-right-bracket-in-doctype": - _("Expected space or '>'. Got '%(data)s'"), + "Expected space or '>'. Got '%(data)s'", "unexpected-end-of-doctype": - _("Unexpected end of DOCTYPE."), + "Unexpected end of DOCTYPE.", "unexpected-char-in-doctype": - _("Unexpected character in DOCTYPE."), + "Unexpected character in DOCTYPE.", "eof-in-innerhtml": - _("XXX innerHTML EOF"), + "XXX innerHTML EOF", "unexpected-doctype": - _("Unexpected DOCTYPE. Ignored."), + "Unexpected DOCTYPE. Ignored.", "non-html-root": - _("html needs to be the first start tag."), + "html needs to be the first start tag.", "expected-doctype-but-got-eof": - _("Unexpected End of file. Expected DOCTYPE."), + "Unexpected End of file. Expected DOCTYPE.", "unknown-doctype": - _("Erroneous DOCTYPE."), + "Erroneous DOCTYPE.", "expected-doctype-but-got-chars": - _("Unexpected non-space characters. Expected DOCTYPE."), + "Unexpected non-space characters. Expected DOCTYPE.", "expected-doctype-but-got-start-tag": - _("Unexpected start tag (%(name)s). Expected DOCTYPE."), + "Unexpected start tag (%(name)s). Expected DOCTYPE.", "expected-doctype-but-got-end-tag": - _("Unexpected end tag (%(name)s). Expected DOCTYPE."), + "Unexpected end tag (%(name)s). Expected DOCTYPE.", "end-tag-after-implied-root": - _("Unexpected end tag (%(name)s) after the (implied) root element."), + "Unexpected end tag (%(name)s) after the (implied) root element.", "expected-named-closing-tag-but-got-eof": - _("Unexpected end of file. Expected end tag (%(name)s)."), + "Unexpected end of file. Expected end tag (%(name)s).", "two-heads-are-not-better-than-one": - _("Unexpected start tag head in existing head. Ignored."), + "Unexpected start tag head in existing head. Ignored.", "unexpected-end-tag": - _("Unexpected end tag (%(name)s). Ignored."), + "Unexpected end tag (%(name)s). Ignored.", "unexpected-start-tag-out-of-my-head": - _("Unexpected start tag (%(name)s) that can be in head. Moved."), + "Unexpected start tag (%(name)s) that can be in head. Moved.", "unexpected-start-tag": - _("Unexpected start tag (%(name)s)."), + "Unexpected start tag (%(name)s).", "missing-end-tag": - _("Missing end tag (%(name)s)."), + "Missing end tag (%(name)s).", "missing-end-tags": - _("Missing end tags (%(name)s)."), + "Missing end tags (%(name)s).", "unexpected-start-tag-implies-end-tag": - _("Unexpected start tag (%(startName)s) " - "implies end tag (%(endName)s)."), + "Unexpected start tag (%(startName)s) " + "implies end tag (%(endName)s).", "unexpected-start-tag-treated-as": - _("Unexpected start tag (%(originalName)s). Treated as %(newName)s."), + "Unexpected start tag (%(originalName)s). Treated as %(newName)s.", "deprecated-tag": - _("Unexpected start tag %(name)s. Don't use it!"), + "Unexpected start tag %(name)s. Don't use it!", "unexpected-start-tag-ignored": - _("Unexpected start tag %(name)s. Ignored."), + "Unexpected start tag %(name)s. Ignored.", "expected-one-end-tag-but-got-another": - _("Unexpected end tag (%(gotName)s). " - "Missing end tag (%(expectedName)s)."), + "Unexpected end tag (%(gotName)s). " + "Missing end tag (%(expectedName)s).", "end-tag-too-early": - _("End tag (%(name)s) seen too early. Expected other end tag."), + "End tag (%(name)s) seen too early. Expected other end tag.", "end-tag-too-early-named": - _("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."), + "Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).", "end-tag-too-early-ignored": - _("End tag (%(name)s) seen too early. Ignored."), + "End tag (%(name)s) seen too early. Ignored.", "adoption-agency-1.1": - _("End tag (%(name)s) violates step 1, " - "paragraph 1 of the adoption agency algorithm."), + "End tag (%(name)s) violates step 1, " + "paragraph 1 of the adoption agency algorithm.", "adoption-agency-1.2": - _("End tag (%(name)s) violates step 1, " - "paragraph 2 of the adoption agency algorithm."), + "End tag (%(name)s) violates step 1, " + "paragraph 2 of the adoption agency algorithm.", "adoption-agency-1.3": - _("End tag (%(name)s) violates step 1, " - "paragraph 3 of the adoption agency algorithm."), + "End tag (%(name)s) violates step 1, " + "paragraph 3 of the adoption agency algorithm.", "adoption-agency-4.4": - _("End tag (%(name)s) violates step 4, " - "paragraph 4 of the adoption agency algorithm."), + "End tag (%(name)s) violates step 4, " + "paragraph 4 of the adoption agency algorithm.", "unexpected-end-tag-treated-as": - _("Unexpected end tag (%(originalName)s). Treated as %(newName)s."), + "Unexpected end tag (%(originalName)s). Treated as %(newName)s.", "no-end-tag": - _("This element (%(name)s) has no end tag."), + "This element (%(name)s) has no end tag.", "unexpected-implied-end-tag-in-table": - _("Unexpected implied end tag (%(name)s) in the table phase."), + "Unexpected implied end tag (%(name)s) in the table phase.", "unexpected-implied-end-tag-in-table-body": - _("Unexpected implied end tag (%(name)s) in the table body phase."), + "Unexpected implied end tag (%(name)s) in the table body phase.", "unexpected-char-implies-table-voodoo": - _("Unexpected non-space characters in " - "table context caused voodoo mode."), + "Unexpected non-space characters in " + "table context caused voodoo mode.", "unexpected-hidden-input-in-table": - _("Unexpected input with type hidden in table context."), + "Unexpected input with type hidden in table context.", "unexpected-form-in-table": - _("Unexpected form in table context."), + "Unexpected form in table context.", "unexpected-start-tag-implies-table-voodoo": - _("Unexpected start tag (%(name)s) in " - "table context caused voodoo mode."), + "Unexpected start tag (%(name)s) in " + "table context caused voodoo mode.", "unexpected-end-tag-implies-table-voodoo": - _("Unexpected end tag (%(name)s) in " - "table context caused voodoo mode."), + "Unexpected end tag (%(name)s) in " + "table context caused voodoo mode.", "unexpected-cell-in-table-body": - _("Unexpected table cell start tag (%(name)s) " - "in the table body phase."), + "Unexpected table cell start tag (%(name)s) " + "in the table body phase.", "unexpected-cell-end-tag": - _("Got table cell end tag (%(name)s) " - "while required end tags are missing."), + "Got table cell end tag (%(name)s) " + "while required end tags are missing.", "unexpected-end-tag-in-table-body": - _("Unexpected end tag (%(name)s) in the table body phase. Ignored."), + "Unexpected end tag (%(name)s) in the table body phase. Ignored.", "unexpected-implied-end-tag-in-table-row": - _("Unexpected implied end tag (%(name)s) in the table row phase."), + "Unexpected implied end tag (%(name)s) in the table row phase.", "unexpected-end-tag-in-table-row": - _("Unexpected end tag (%(name)s) in the table row phase. Ignored."), + "Unexpected end tag (%(name)s) in the table row phase. Ignored.", "unexpected-select-in-select": - _("Unexpected select start tag in the select phase " - "treated as select end tag."), + "Unexpected select start tag in the select phase " + "treated as select end tag.", "unexpected-input-in-select": - _("Unexpected input start tag in the select phase."), + "Unexpected input start tag in the select phase.", "unexpected-start-tag-in-select": - _("Unexpected start tag token (%(name)s in the select phase. " - "Ignored."), + "Unexpected start tag token (%(name)s in the select phase. " + "Ignored.", "unexpected-end-tag-in-select": - _("Unexpected end tag (%(name)s) in the select phase. Ignored."), + "Unexpected end tag (%(name)s) in the select phase. Ignored.", "unexpected-table-element-start-tag-in-select-in-table": - _("Unexpected table element start tag (%(name)s) in the select in table phase."), + "Unexpected table element start tag (%(name)s) in the select in table phase.", "unexpected-table-element-end-tag-in-select-in-table": - _("Unexpected table element end tag (%(name)s) in the select in table phase."), + "Unexpected table element end tag (%(name)s) in the select in table phase.", "unexpected-char-after-body": - _("Unexpected non-space characters in the after body phase."), + "Unexpected non-space characters in the after body phase.", "unexpected-start-tag-after-body": - _("Unexpected start tag token (%(name)s)" - " in the after body phase."), + "Unexpected start tag token (%(name)s)" + " in the after body phase.", "unexpected-end-tag-after-body": - _("Unexpected end tag token (%(name)s)" - " in the after body phase."), + "Unexpected end tag token (%(name)s)" + " in the after body phase.", "unexpected-char-in-frameset": - _("Unexpected characters in the frameset phase. Characters ignored."), + "Unexpected characters in the frameset phase. Characters ignored.", "unexpected-start-tag-in-frameset": - _("Unexpected start tag token (%(name)s)" - " in the frameset phase. Ignored."), + "Unexpected start tag token (%(name)s)" + " in the frameset phase. Ignored.", "unexpected-frameset-in-frameset-innerhtml": - _("Unexpected end tag token (frameset) " - "in the frameset phase (innerHTML)."), + "Unexpected end tag token (frameset) " + "in the frameset phase (innerHTML).", "unexpected-end-tag-in-frameset": - _("Unexpected end tag token (%(name)s)" - " in the frameset phase. Ignored."), + "Unexpected end tag token (%(name)s)" + " in the frameset phase. Ignored.", "unexpected-char-after-frameset": - _("Unexpected non-space characters in the " - "after frameset phase. Ignored."), + "Unexpected non-space characters in the " + "after frameset phase. Ignored.", "unexpected-start-tag-after-frameset": - _("Unexpected start tag (%(name)s)" - " in the after frameset phase. Ignored."), + "Unexpected start tag (%(name)s)" + " in the after frameset phase. Ignored.", "unexpected-end-tag-after-frameset": - _("Unexpected end tag (%(name)s)" - " in the after frameset phase. Ignored."), + "Unexpected end tag (%(name)s)" + " in the after frameset phase. Ignored.", "unexpected-end-tag-after-body-innerhtml": - _("Unexpected end tag after body(innerHtml)"), + "Unexpected end tag after body(innerHtml)", "expected-eof-but-got-char": - _("Unexpected non-space characters. Expected end of file."), + "Unexpected non-space characters. Expected end of file.", "expected-eof-but-got-start-tag": - _("Unexpected start tag (%(name)s)" - ". Expected end of file."), + "Unexpected start tag (%(name)s)" + ". Expected end of file.", "expected-eof-but-got-end-tag": - _("Unexpected end tag (%(name)s)" - ". Expected end of file."), + "Unexpected end tag (%(name)s)" + ". Expected end of file.", "eof-in-table": - _("Unexpected end of file. Expected table content."), + "Unexpected end of file. Expected table content.", "eof-in-select": - _("Unexpected end of file. Expected select content."), + "Unexpected end of file. Expected select content.", "eof-in-frameset": - _("Unexpected end of file. Expected frameset content."), + "Unexpected end of file. Expected frameset content.", "eof-in-script-in-script": - _("Unexpected end of file. Expected script content."), + "Unexpected end of file. Expected script content.", "eof-in-foreign-lands": - _("Unexpected end of file. Expected foreign content"), + "Unexpected end of file. Expected foreign content", "non-void-element-with-trailing-solidus": - _("Trailing solidus not allowed on element %(name)s"), + "Trailing solidus not allowed on element %(name)s", "unexpected-html-element-in-foreign-content": - _("Element %(name)s not allowed in a non-html context"), + "Element %(name)s not allowed in a non-html context", "unexpected-end-tag-before-html": - _("Unexpected end tag (%(name)s) before html."), + "Unexpected end tag (%(name)s) before html.", "XXX-undefined-error": - _("Undefined error (this sucks and should be fixed)"), + "Undefined error (this sucks and should be fixed)", } namespaces = { @@ -298,7 +296,7 @@ namespaces = { "xmlns": "http://www.w3.org/2000/xmlns/" } -scopingElements = frozenset(( +scopingElements = frozenset([ (namespaces["html"], "applet"), (namespaces["html"], "caption"), (namespaces["html"], "html"), @@ -316,9 +314,9 @@ scopingElements = frozenset(( (namespaces["svg"], "foreignObject"), (namespaces["svg"], "desc"), (namespaces["svg"], "title"), -)) +]) -formattingElements = frozenset(( +formattingElements = frozenset([ (namespaces["html"], "a"), (namespaces["html"], "b"), (namespaces["html"], "big"), @@ -333,9 +331,9 @@ formattingElements = frozenset(( (namespaces["html"], "strong"), (namespaces["html"], "tt"), (namespaces["html"], "u") -)) +]) -specialElements = frozenset(( +specialElements = frozenset([ (namespaces["html"], "address"), (namespaces["html"], "applet"), (namespaces["html"], "area"), @@ -416,22 +414,22 @@ specialElements = frozenset(( (namespaces["html"], "wbr"), (namespaces["html"], "xmp"), (namespaces["svg"], "foreignObject") -)) +]) -htmlIntegrationPointElements = frozenset(( +htmlIntegrationPointElements = frozenset([ (namespaces["mathml"], "annotaion-xml"), (namespaces["svg"], "foreignObject"), (namespaces["svg"], "desc"), (namespaces["svg"], "title") -)) +]) -mathmlTextIntegrationPointElements = frozenset(( +mathmlTextIntegrationPointElements = frozenset([ (namespaces["mathml"], "mi"), (namespaces["mathml"], "mo"), (namespaces["mathml"], "mn"), (namespaces["mathml"], "ms"), (namespaces["mathml"], "mtext") -)) +]) adjustForeignAttributes = { "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]), @@ -451,21 +449,21 @@ adjustForeignAttributes = { unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in adjustForeignAttributes.items()]) -spaceCharacters = frozenset(( +spaceCharacters = frozenset([ "\t", "\n", "\u000C", " ", "\r" -)) +]) -tableInsertModeElements = frozenset(( +tableInsertModeElements = frozenset([ "table", "tbody", "tfoot", "thead", "tr" -)) +]) asciiLowercase = frozenset(string.ascii_lowercase) asciiUppercase = frozenset(string.ascii_uppercase) @@ -486,7 +484,7 @@ headingElements = ( "h6" ) -voidElements = frozenset(( +voidElements = frozenset([ "base", "command", "event-source", @@ -502,11 +500,11 @@ voidElements = frozenset(( "input", "source", "track" -)) +]) -cdataElements = frozenset(('title', 'textarea')) +cdataElements = frozenset(['title', 'textarea']) -rcdataElements = frozenset(( +rcdataElements = frozenset([ 'style', 'script', 'xmp', @@ -514,27 +512,27 @@ rcdataElements = frozenset(( 'noembed', 'noframes', 'noscript' -)) +]) booleanAttributes = { - "": frozenset(("irrelevant",)), - "style": frozenset(("scoped",)), - "img": frozenset(("ismap",)), - "audio": frozenset(("autoplay", "controls")), - "video": frozenset(("autoplay", "controls")), - "script": frozenset(("defer", "async")), - "details": frozenset(("open",)), - "datagrid": frozenset(("multiple", "disabled")), - "command": frozenset(("hidden", "disabled", "checked", "default")), - "hr": frozenset(("noshade")), - "menu": frozenset(("autosubmit",)), - "fieldset": frozenset(("disabled", "readonly")), - "option": frozenset(("disabled", "readonly", "selected")), - "optgroup": frozenset(("disabled", "readonly")), - "button": frozenset(("disabled", "autofocus")), - "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")), - "select": frozenset(("disabled", "readonly", "autofocus", "multiple")), - "output": frozenset(("disabled", "readonly")), + "": frozenset(["irrelevant"]), + "style": frozenset(["scoped"]), + "img": frozenset(["ismap"]), + "audio": frozenset(["autoplay", "controls"]), + "video": frozenset(["autoplay", "controls"]), + "script": frozenset(["defer", "async"]), + "details": frozenset(["open"]), + "datagrid": frozenset(["multiple", "disabled"]), + "command": frozenset(["hidden", "disabled", "checked", "default"]), + "hr": frozenset(["noshade"]), + "menu": frozenset(["autosubmit"]), + "fieldset": frozenset(["disabled", "readonly"]), + "option": frozenset(["disabled", "readonly", "selected"]), + "optgroup": frozenset(["disabled", "readonly"]), + "button": frozenset(["disabled", "autofocus"]), + "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]), + "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]), + "output": frozenset(["disabled", "readonly"]), } # entitiesWindows1252 has to be _ordered_ and needs to have an index. It @@ -574,7 +572,7 @@ entitiesWindows1252 = ( 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS ) -xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;')) +xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;']) entities = { "AElig": "\xc6", @@ -3088,8 +3086,8 @@ tokenTypes = { "ParseError": 7 } -tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], - tokenTypes["EmptyTag"])) +tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"], + tokenTypes["EmptyTag"]]) prefixes = dict([(v, k) for k, v in namespaces.items()]) diff --git a/lib/html5lib/filters/lint.py b/lib/html5lib/filters/lint.py index 7cc99a4b..8884696d 100644 --- a/lib/html5lib/filters/lint.py +++ b/lib/html5lib/filters/lint.py @@ -1,8 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from gettext import gettext -_ = gettext - from . import _base from ..constants import cdataElements, rcdataElements, voidElements @@ -23,24 +20,24 @@ class Filter(_base.Filter): if type in ("StartTag", "EmptyTag"): name = token["name"] if contentModelFlag != "PCDATA": - raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name}) + raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name}) if not isinstance(name, str): - raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name}) + raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) if not name: - raise LintError(_("Empty tag name")) + raise LintError("Empty tag name") if type == "StartTag" and name in voidElements: - raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name}) + raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name}) elif type == "EmptyTag" and name not in voidElements: - raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]}) + raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]}) if type == "StartTag": open_elements.append(name) for name, value in token["data"]: if not isinstance(name, str): - raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name}) + raise LintError("Attribute name is not a string: %(name)r" % {"name": name}) if not name: - raise LintError(_("Empty attribute name")) + raise LintError("Empty attribute name") if not isinstance(value, str): - raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value}) + raise LintError("Attribute value is not a string: %(value)r" % {"value": value}) if name in cdataElements: contentModelFlag = "CDATA" elif name in rcdataElements: @@ -51,43 +48,43 @@ class Filter(_base.Filter): elif type == "EndTag": name = token["name"] if not isinstance(name, str): - raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name}) + raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) if not name: - raise LintError(_("Empty tag name")) + raise LintError("Empty tag name") if name in voidElements: - raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name}) + raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name}) start_name = open_elements.pop() if start_name != name: - raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name}) + raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name}) contentModelFlag = "PCDATA" elif type == "Comment": if contentModelFlag != "PCDATA": - raise LintError(_("Comment not in PCDATA content model flag")) + raise LintError("Comment not in PCDATA content model flag") elif type in ("Characters", "SpaceCharacters"): data = token["data"] if not isinstance(data, str): - raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data}) + raise LintError("Attribute name is not a string: %(name)r" % {"name": data}) if not data: - raise LintError(_("%(type)s token with empty data") % {"type": type}) + raise LintError("%(type)s token with empty data" % {"type": type}) if type == "SpaceCharacters": data = data.strip(spaceCharacters) if data: - raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data}) + raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data}) elif type == "Doctype": name = token["name"] if contentModelFlag != "PCDATA": - raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name}) + raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name}) if not isinstance(name, str): - raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name}) + raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) # XXX: what to do with token["data"] ? elif type in ("ParseError", "SerializeError"): pass else: - raise LintError(_("Unknown token type: %(type)s") % {"type": type}) + raise LintError("Unknown token type: %(type)s" % {"type": type}) yield token diff --git a/lib/html5lib/filters/optionaltags.py b/lib/html5lib/filters/optionaltags.py index fefe0b30..dab0574a 100644 --- a/lib/html5lib/filters/optionaltags.py +++ b/lib/html5lib/filters/optionaltags.py @@ -58,7 +58,7 @@ class Filter(_base.Filter): elif tagname == 'colgroup': # A colgroup element's start tag may be omitted if the first thing # inside the colgroup element is a col element, and if the element - # is not immediately preceeded by another colgroup element whose + # is not immediately preceded by another colgroup element whose # end tag has been omitted. if type in ("StartTag", "EmptyTag"): # XXX: we do not look at the preceding event, so instead we never @@ -70,7 +70,7 @@ class Filter(_base.Filter): elif tagname == 'tbody': # A tbody element's start tag may be omitted if the first thing # inside the tbody element is a tr element, and if the element is - # not immediately preceeded by a tbody, thead, or tfoot element + # not immediately preceded by a tbody, thead, or tfoot element # whose end tag has been omitted. if type == "StartTag": # omit the thead and tfoot elements' end tag when they are diff --git a/lib/html5lib/html5parser.py b/lib/html5lib/html5parser.py index b0f14f39..c2c30783 100644 --- a/lib/html5lib/html5parser.py +++ b/lib/html5lib/html5parser.py @@ -18,6 +18,7 @@ from .constants import cdataElements, rcdataElements from .constants import tokenTypes, ReparseException, namespaces from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements from .constants import adjustForeignAttributes as adjustForeignAttributesMap +from .constants import E def parse(doc, treebuilder="etree", encoding=None, @@ -129,6 +130,17 @@ class HTMLParser(object): self.framesetOK = True + @property + def documentEncoding(self): + """The name of the character encoding + that was used to decode the input stream, + or :obj:`None` if that is not determined yet. + + """ + if not hasattr(self, 'tokenizer'): + return None + return self.tokenizer.stream.charEncoding[0] + def isHTMLIntegrationPoint(self, element): if (element.name == "annotation-xml" and element.namespace == namespaces["mathml"]): @@ -245,7 +257,7 @@ class HTMLParser(object): # XXX The idea is to make errorcode mandatory. self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) if self.strict: - raise ParseError + raise ParseError(E[errorcode] % datavars) def normalizeToken(self, token): """ HTML5 specific normalizations to the token stream """ @@ -868,7 +880,7 @@ def getPhases(debug): self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), (("base", "basefont", "bgsound", "command", "link", "meta", - "noframes", "script", "style", "title"), + "script", "style", "title"), self.startTagProcessInHead), ("body", self.startTagBody), ("frameset", self.startTagFrameset), @@ -1205,8 +1217,7 @@ def getPhases(debug): attributes["name"] = "isindex" self.processStartTag(impliedTagToken("input", "StartTag", attributes=attributes, - selfClosing= - token["selfClosing"])) + selfClosing=token["selfClosing"])) self.processEndTag(impliedTagToken("label")) self.processStartTag(impliedTagToken("hr", "StartTag")) self.processEndTag(impliedTagToken("form")) @@ -1316,7 +1327,7 @@ def getPhases(debug): # Not sure this is the correct name for the parse error self.parser.parseError( "expected-one-end-tag-but-got-another", - {"expectedName": "body", "gotName": node.name}) + {"gotName": "body", "expectedName": node.name}) break self.parser.phase = self.parser.phases["afterBody"] @@ -2553,7 +2564,7 @@ def getPhases(debug): self.tree.openElements.pop() if (not self.parser.innerHTML and self.tree.openElements[-1].name != "frameset"): - # If we're not in innerHTML mode and the the current node is not a + # If we're not in innerHTML mode and the current node is not a # "frameset" element (anymore) then switch. self.parser.phase = self.parser.phases["afterFrameset"] diff --git a/lib/html5lib/ihatexml.py b/lib/html5lib/ihatexml.py index 0fc79308..5da5d938 100644 --- a/lib/html5lib/ihatexml.py +++ b/lib/html5lib/ihatexml.py @@ -225,6 +225,9 @@ class InfosetFilter(object): while "--" in data: warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning) data = data.replace("--", "- -") + if data.endswith("-"): + warnings.warn("Comments cannot end in a dash", DataLossWarning) + data += " " return data def coerceCharacters(self, data): diff --git a/lib/html5lib/inputstream.py b/lib/html5lib/inputstream.py index 9e03b931..63373db9 100644 --- a/lib/html5lib/inputstream.py +++ b/lib/html5lib/inputstream.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, unicode_literals + from six import text_type -from six.moves import http_client +from six.moves import http_client, urllib import codecs import re @@ -28,7 +29,18 @@ asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters]) asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase]) spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) -invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") + +invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" + +if utils.supports_lone_surrogates: + # Use one extra step of indirection and create surrogates with + # unichr. Not using this indirection would introduce an illegal + # unicode literal on platforms not supporting such lone + # surrogates. + invalid_unicode_re = re.compile(invalid_unicode_no_surrogate + + eval('"\\uD800-\\uDFFF"')) +else: + invalid_unicode_re = re.compile(invalid_unicode_no_surrogate) non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, @@ -119,9 +131,12 @@ class BufferedStream(object): def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): - if isinstance(source, http_client.HTTPResponse): - # Work around Python bug #20007: read(0) closes the connection. - # http://bugs.python.org/issue20007 + # Work around Python bug #20007: read(0) closes the connection. + # http://bugs.python.org/issue20007 + if (isinstance(source, http_client.HTTPResponse) or + # Also check for addinfourl wrapping HTTPResponse + (isinstance(source, urllib.response.addbase) and + isinstance(source.fp, http_client.HTTPResponse))): isUnicode = False elif hasattr(source, "read"): isUnicode = isinstance(source.read(0), text_type) @@ -164,13 +179,18 @@ class HTMLUnicodeInputStream(object): """ - # Craziness - if len("\U0010FFFF") == 1: + if not utils.supports_lone_surrogates: + # Such platforms will have already checked for such + # surrogate errors, so no need to do this checking. + self.reportCharacterErrors = None + self.replaceCharactersRegexp = None + elif len("\U0010FFFF") == 1: self.reportCharacterErrors = self.characterErrorsUCS4 - self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]") + self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"')) else: self.reportCharacterErrors = self.characterErrorsUCS2 - self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?/ + (?P[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+) + # Match any character set and encoding + (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?) + |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?) + # Assume the rest is data + ,.* + $ + ''', + re.VERBOSE) + + class HTMLSanitizerMixin(object): """ sanitization of XHTML+MathML+SVG and of inline style attributes.""" @@ -100,8 +115,8 @@ class HTMLSanitizerMixin(object): 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'] - attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', - 'xlink:href', 'xml:base'] + attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc', + 'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base'] svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill', 'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', @@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object): acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc', 'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal', 'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag', - 'ssh', 'sftp', 'rtsp', 'afs'] + 'ssh', 'sftp', 'rtsp', 'afs', 'data'] + + acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain'] # subclasses may define their own versions of these constants allowed_elements = acceptable_elements + mathml_elements + svg_elements @@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object): allowed_css_keywords = acceptable_css_keywords allowed_svg_properties = acceptable_svg_properties allowed_protocols = acceptable_protocols + allowed_content_types = acceptable_content_types # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style @@ -189,10 +207,21 @@ class HTMLSanitizerMixin(object): unescape(attrs[attr])).lower() # remove replacement characters from unescaped characters val_unescaped = val_unescaped.replace("\ufffd", "") - if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and - (val_unescaped.split(':')[0] not in - self.allowed_protocols)): + try: + uri = urlparse.urlparse(val_unescaped) + except ValueError: + uri = None del attrs[attr] + if uri and uri.scheme: + if uri.scheme not in self.allowed_protocols: + del attrs[attr] + if uri.scheme == 'data': + m = content_type_rgx.match(uri.path) + if not m: + del attrs[attr] + elif m.group('content_type') not in self.allowed_content_types: + del attrs[attr] + for attr in self.svg_attr_val_allows_ref: if attr in attrs: attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', @@ -245,7 +274,7 @@ class HTMLSanitizerMixin(object): elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 'padding']: for keyword in value.split(): - if not keyword in self.acceptable_css_keywords and \ + if keyword not in self.acceptable_css_keywords and \ not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): break else: diff --git a/lib/html5lib/serializer/htmlserializer.py b/lib/html5lib/serializer/htmlserializer.py index 412a5a22..be4d6344 100644 --- a/lib/html5lib/serializer/htmlserializer.py +++ b/lib/html5lib/serializer/htmlserializer.py @@ -1,9 +1,6 @@ from __future__ import absolute_import, division, unicode_literals from six import text_type -import gettext -_ = gettext.gettext - try: from functools import reduce except ImportError: @@ -35,7 +32,7 @@ else: v = utils.surrogatePairToCodepoint(v) else: v = ord(v) - if not v in encode_entity_map or k.islower(): + if v not in encode_entity_map or k.islower(): # prefer < over < and similarly for &, >, etc. encode_entity_map[v] = k @@ -208,7 +205,7 @@ class HTMLSerializer(object): if token["systemId"]: if token["systemId"].find('"') >= 0: if token["systemId"].find("'") >= 0: - self.serializeError(_("System identifer contains both single and double quote characters")) + self.serializeError("System identifer contains both single and double quote characters") quote_char = "'" else: quote_char = '"' @@ -220,7 +217,7 @@ class HTMLSerializer(object): elif type in ("Characters", "SpaceCharacters"): if type == "SpaceCharacters" or in_cdata: if in_cdata and token["data"].find("= 0: - self.serializeError(_("Unexpected " % name) elif type == "Comment": data = token["data"] if data.find("--") >= 0: - self.serializeError(_("Comment contains --")) + self.serializeError("Comment contains --") yield self.encodeStrict("" % token["data"]) elif type == "Entity": name = token["name"] key = name + ";" - if not key in entities: - self.serializeError(_("Entity %s not recognized" % name)) + if key not in entities: + self.serializeError("Entity %s not recognized" % name) if self.resolve_entities and key not in xmlEntities: data = entities[key] else: diff --git a/lib/html5lib/treeadapters/__init__.py b/lib/html5lib/treeadapters/__init__.py index e69de29b..57d71304 100644 --- a/lib/html5lib/treeadapters/__init__.py +++ b/lib/html5lib/treeadapters/__init__.py @@ -0,0 +1,12 @@ +from __future__ import absolute_import, division, unicode_literals + +from . import sax + +__all__ = ["sax"] + +try: + from . import genshi # flake8: noqa +except ImportError: + pass +else: + __all__.append("genshi") diff --git a/lib/html5lib/treeadapters/genshi.py b/lib/html5lib/treeadapters/genshi.py new file mode 100644 index 00000000..04e316df --- /dev/null +++ b/lib/html5lib/treeadapters/genshi.py @@ -0,0 +1,47 @@ +from __future__ import absolute_import, division, unicode_literals + +from genshi.core import QName, Attrs +from genshi.core import START, END, TEXT, COMMENT, DOCTYPE + + +def to_genshi(walker): + text = [] + for token in walker: + type = token["type"] + if type in ("Characters", "SpaceCharacters"): + text.append(token["data"]) + elif text: + yield TEXT, "".join(text), (None, -1, -1) + text = [] + + if type in ("StartTag", "EmptyTag"): + if token["namespace"]: + name = "{%s}%s" % (token["namespace"], token["name"]) + else: + name = token["name"] + attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value) + for attr, value in token["data"].items()]) + yield (START, (QName(name), attrs), (None, -1, -1)) + if type == "EmptyTag": + type = "EndTag" + + if type == "EndTag": + if token["namespace"]: + name = "{%s}%s" % (token["namespace"], token["name"]) + else: + name = token["name"] + + yield END, QName(name), (None, -1, -1) + + elif type == "Comment": + yield COMMENT, token["data"], (None, -1, -1) + + elif type == "Doctype": + yield DOCTYPE, (token["name"], token["publicId"], + token["systemId"]), (None, -1, -1) + + else: + pass # FIXME: What to do? + + if text: + yield TEXT, "".join(text), (None, -1, -1) diff --git a/lib/html5lib/treebuilders/dom.py b/lib/html5lib/treebuilders/dom.py index 61e5ed79..234233b7 100644 --- a/lib/html5lib/treebuilders/dom.py +++ b/lib/html5lib/treebuilders/dom.py @@ -158,7 +158,7 @@ def getDomBuilder(DomImplementation): else: # HACK: allow text nodes as children of the document node if hasattr(self.dom, '_child_node_types'): - if not Node.TEXT_NODE in self.dom._child_node_types: + if Node.TEXT_NODE not in self.dom._child_node_types: self.dom._child_node_types = list(self.dom._child_node_types) self.dom._child_node_types.append(Node.TEXT_NODE) self.dom.appendChild(self.dom.createTextNode(data)) diff --git a/lib/html5lib/treebuilders/etree_lxml.py b/lib/html5lib/treebuilders/etree_lxml.py index 35d08efa..138b30bd 100644 --- a/lib/html5lib/treebuilders/etree_lxml.py +++ b/lib/html5lib/treebuilders/etree_lxml.py @@ -54,7 +54,7 @@ class Document(object): def testSerializer(element): rv = [] finalText = None - infosetFilter = ihatexml.InfosetFilter() + infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True) def serializeElement(element, indent=0): if not hasattr(element, "tag"): @@ -79,7 +79,7 @@ def testSerializer(element): next_element = next_element.getnext() elif isinstance(element, str) or isinstance(element, bytes): # Text in a fragment - assert isinstance(element, str) or sys.version_info.major == 2 + assert isinstance(element, str) or sys.version_info[0] == 2 rv.append("|%s\"%s\"" % (' ' * indent, element)) else: # Fragment case @@ -189,7 +189,7 @@ class TreeBuilder(_base.TreeBuilder): def __init__(self, namespaceHTMLElements, fullTree=False): builder = etree_builders.getETreeModule(etree, fullTree=fullTree) - infosetFilter = self.infosetFilter = ihatexml.InfosetFilter() + infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True) self.namespaceHTMLElements = namespaceHTMLElements class Attributes(dict): @@ -257,7 +257,7 @@ class TreeBuilder(_base.TreeBuilder): data = property(_getData, _setData) self.elementClass = Element - self.commentClass = builder.Comment + self.commentClass = Comment # self.fragmentClass = builder.DocumentFragment _base.TreeBuilder.__init__(self, namespaceHTMLElements) @@ -315,7 +315,7 @@ class TreeBuilder(_base.TreeBuilder): """Create the document root""" # Because of the way libxml2 works, it doesn't seem to be possible to # alter information like the doctype after the tree has been parsed. - # Therefore we need to use the built-in parser to create our iniial + # Therefore we need to use the built-in parser to create our initial # tree, after which we can add elements like normal docStr = "" if self.doctype: @@ -344,7 +344,8 @@ class TreeBuilder(_base.TreeBuilder): # Append the initial comments: for comment_token in self.initial_comments: - root.addprevious(etree.Comment(comment_token["data"])) + comment = self.commentClass(comment_token["data"]) + root.addprevious(comment._element) # Create the root document and add the ElementTree to it self.document = self.documentClass() diff --git a/lib/html5lib/treewalkers/__init__.py b/lib/html5lib/treewalkers/__init__.py index 18124e75..21f46b01 100644 --- a/lib/html5lib/treewalkers/__init__.py +++ b/lib/html5lib/treewalkers/__init__.py @@ -10,8 +10,9 @@ returning an iterator generating tokens. from __future__ import absolute_import, division, unicode_literals -import sys +__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"] +from .. import constants from ..utils import default_etree treeWalkerCache = {} @@ -20,28 +21,27 @@ treeWalkerCache = {} def getTreeWalker(treeType, implementation=None, **kwargs): """Get a TreeWalker class for various types of tree with built-in support - treeType - the name of the tree type required (case-insensitive). Supported - values are: + Args: + treeType (str): the name of the tree type required (case-insensitive). + Supported values are: - "dom" - The xml.dom.minidom DOM implementation - "pulldom" - The xml.dom.pulldom event stream - "etree" - A generic walker for tree implementations exposing an - elementtree-like interface (known to work with - ElementTree, cElementTree and lxml.etree). - "lxml" - Optimized walker for lxml.etree - "genshi" - a Genshi stream + - "dom": The xml.dom.minidom DOM implementation + - "etree": A generic walker for tree implementations exposing an + elementtree-like interface (known to work with + ElementTree, cElementTree and lxml.etree). + - "lxml": Optimized walker for lxml.etree + - "genshi": a Genshi stream - implementation - (Currently applies to the "etree" tree type only). A module - implementing the tree type e.g. xml.etree.ElementTree or - cElementTree.""" + Implementation: A module implementing the tree type e.g. + xml.etree.ElementTree or cElementTree (Currently applies to the + "etree" tree type only). + """ treeType = treeType.lower() if treeType not in treeWalkerCache: - if treeType in ("dom", "pulldom"): - name = "%s.%s" % (__name__, treeType) - __import__(name) - mod = sys.modules[name] - treeWalkerCache[treeType] = mod.TreeWalker + if treeType == "dom": + from . import dom + treeWalkerCache[treeType] = dom.TreeWalker elif treeType == "genshi": from . import genshistream treeWalkerCache[treeType] = genshistream.TreeWalker @@ -55,3 +55,89 @@ def getTreeWalker(treeType, implementation=None, **kwargs): # XXX: NEVER cache here, caching is done in the etree submodule return etree.getETreeModule(implementation, **kwargs).TreeWalker return treeWalkerCache.get(treeType) + + +def concatenateCharacterTokens(tokens): + pendingCharacters = [] + for token in tokens: + type = token["type"] + if type in ("Characters", "SpaceCharacters"): + pendingCharacters.append(token["data"]) + else: + if pendingCharacters: + yield {"type": "Characters", "data": "".join(pendingCharacters)} + pendingCharacters = [] + yield token + if pendingCharacters: + yield {"type": "Characters", "data": "".join(pendingCharacters)} + + +def pprint(walker): + """Pretty printer for tree walkers""" + output = [] + indent = 0 + for token in concatenateCharacterTokens(walker): + type = token["type"] + if type in ("StartTag", "EmptyTag"): + # tag name + if token["namespace"] and token["namespace"] != constants.namespaces["html"]: + if token["namespace"] in constants.prefixes: + ns = constants.prefixes[token["namespace"]] + else: + ns = token["namespace"] + name = "%s %s" % (ns, token["name"]) + else: + name = token["name"] + output.append("%s<%s>" % (" " * indent, name)) + indent += 2 + # attributes (sorted for consistent ordering) + attrs = token["data"] + for (namespace, localname), value in sorted(attrs.items()): + if namespace: + if namespace in constants.prefixes: + ns = constants.prefixes[namespace] + else: + ns = namespace + name = "%s %s" % (ns, localname) + else: + name = localname + output.append("%s%s=\"%s\"" % (" " * indent, name, value)) + # self-closing + if type == "EmptyTag": + indent -= 2 + + elif type == "EndTag": + indent -= 2 + + elif type == "Comment": + output.append("%s" % (" " * indent, token["data"])) + + elif type == "Doctype": + if token["name"]: + if token["publicId"]: + output.append("""%s""" % + (" " * indent, + token["name"], + token["publicId"], + token["systemId"] if token["systemId"] else "")) + elif token["systemId"]: + output.append("""%s""" % + (" " * indent, + token["name"], + token["systemId"])) + else: + output.append("%s" % (" " * indent, + token["name"])) + else: + output.append("%s" % (" " * indent,)) + + elif type == "Characters": + output.append("%s\"%s\"" % (" " * indent, token["data"])) + + elif type == "SpaceCharacters": + assert False, "concatenateCharacterTokens should have got rid of all Space tokens" + + else: + raise ValueError("Unknown token type, %s" % type) + + return "\n".join(output) diff --git a/lib/html5lib/treewalkers/_base.py b/lib/html5lib/treewalkers/_base.py index 34252e50..4e11cd02 100644 --- a/lib/html5lib/treewalkers/_base.py +++ b/lib/html5lib/treewalkers/_base.py @@ -1,8 +1,8 @@ from __future__ import absolute_import, division, unicode_literals from six import text_type, string_types -import gettext -_ = gettext.gettext +__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN", + "TreeWalker", "NonRecursiveTreeWalker"] from xml.dom import Node @@ -58,7 +58,7 @@ class TreeWalker(object): "namespace": to_text(namespace), "data": attrs} if hasChildren: - yield self.error(_("Void element has children")) + yield self.error("Void element has children") def startTag(self, namespace, name, attrs): assert namespace is None or isinstance(namespace, string_types), type(namespace) @@ -122,7 +122,7 @@ class TreeWalker(object): return {"type": "Entity", "name": text_type(name)} def unknown(self, nodeType): - return self.error(_("Unknown node type: ") + nodeType) + return self.error("Unknown node type: " + nodeType) class NonRecursiveTreeWalker(TreeWalker): diff --git a/lib/html5lib/treewalkers/dom.py b/lib/html5lib/treewalkers/dom.py index a01287a9..ac4dcf31 100644 --- a/lib/html5lib/treewalkers/dom.py +++ b/lib/html5lib/treewalkers/dom.py @@ -2,9 +2,6 @@ from __future__ import absolute_import, division, unicode_literals from xml.dom import Node -import gettext -_ = gettext.gettext - from . import _base diff --git a/lib/html5lib/treewalkers/etree.py b/lib/html5lib/treewalkers/etree.py index fd8a9cc9..73c8e26a 100644 --- a/lib/html5lib/treewalkers/etree.py +++ b/lib/html5lib/treewalkers/etree.py @@ -7,12 +7,10 @@ except ImportError: from ordereddict import OrderedDict except ImportError: OrderedDict = dict -import gettext -_ = gettext.gettext import re -from six import text_type +from six import string_types from . import _base from ..utils import moduleFactoryFactory @@ -60,7 +58,7 @@ def getETreeBuilder(ElementTreeImplementation): return _base.COMMENT, node.text else: - assert type(node.tag) == text_type, type(node.tag) + assert isinstance(node.tag, string_types), type(node.tag) # This is assumed to be an ordinary element match = tag_regexp.match(node.tag) if match: @@ -131,6 +129,7 @@ def getETreeBuilder(ElementTreeImplementation): if not parents: return parent else: + assert list(parents[-1]).count(parent) == 1 return parent, list(parents[-1]).index(parent), parents, None return locals() diff --git a/lib/html5lib/treewalkers/lxmletree.py b/lib/html5lib/treewalkers/lxmletree.py index bc934ac0..90e116d3 100644 --- a/lib/html5lib/treewalkers/lxmletree.py +++ b/lib/html5lib/treewalkers/lxmletree.py @@ -4,9 +4,6 @@ from six import text_type from lxml import etree from ..treebuilders.etree import tag_regexp -from gettext import gettext -_ = gettext - from . import _base from .. import ihatexml @@ -130,7 +127,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker): def getNodeDetails(self, node): if isinstance(node, tuple): # Text node node, key = node - assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key return _base.TEXT, ensure_str(getattr(node, key)) elif isinstance(node, Root): @@ -169,7 +166,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker): attrs, len(node) > 0 or node.text) def getFirstChild(self, node): - assert not isinstance(node, tuple), _("Text nodes have no children") + assert not isinstance(node, tuple), "Text nodes have no children" assert len(node) or node.text, "Node has no children" if node.text: @@ -180,7 +177,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker): def getNextSibling(self, node): if isinstance(node, tuple): # Text node node, key = node - assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key if key == "text": # XXX: we cannot use a "bool(node) and node[0] or None" construct here # because node[0] might evaluate to False if it has no child element @@ -196,7 +193,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker): def getParentNode(self, node): if isinstance(node, tuple): # Text node node, key = node - assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key if key == "text": return node # else: fallback to "normal" processing diff --git a/lib/html5lib/treewalkers/pulldom.py b/lib/html5lib/treewalkers/pulldom.py deleted file mode 100644 index 0b0f515f..00000000 --- a/lib/html5lib/treewalkers/pulldom.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import absolute_import, division, unicode_literals - -from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \ - COMMENT, IGNORABLE_WHITESPACE, CHARACTERS - -from . import _base - -from ..constants import voidElements - - -class TreeWalker(_base.TreeWalker): - def __iter__(self): - ignore_until = None - previous = None - for event in self.tree: - if previous is not None and \ - (ignore_until is None or previous[1] is ignore_until): - if previous[1] is ignore_until: - ignore_until = None - for token in self.tokens(previous, event): - yield token - if token["type"] == "EmptyTag": - ignore_until = previous[1] - previous = event - if ignore_until is None or previous[1] is ignore_until: - for token in self.tokens(previous, None): - yield token - elif ignore_until is not None: - raise ValueError("Illformed DOM event stream: void element without END_ELEMENT") - - def tokens(self, event, next): - type, node = event - if type == START_ELEMENT: - name = node.nodeName - namespace = node.namespaceURI - attrs = {} - for attr in list(node.attributes.keys()): - attr = node.getAttributeNode(attr) - attrs[(attr.namespaceURI, attr.localName)] = attr.value - if name in voidElements: - for token in self.emptyTag(namespace, - name, - attrs, - not next or next[1] is not node): - yield token - else: - yield self.startTag(namespace, name, attrs) - - elif type == END_ELEMENT: - name = node.nodeName - namespace = node.namespaceURI - if name not in voidElements: - yield self.endTag(namespace, name) - - elif type == COMMENT: - yield self.comment(node.nodeValue) - - elif type in (IGNORABLE_WHITESPACE, CHARACTERS): - for token in self.text(node.nodeValue): - yield token - - else: - yield self.unknown(type) diff --git a/lib/html5lib/utils.py b/lib/html5lib/utils.py index 2f41f4df..c196821f 100644 --- a/lib/html5lib/utils.py +++ b/lib/html5lib/utils.py @@ -2,6 +2,8 @@ from __future__ import absolute_import, division, unicode_literals from types import ModuleType +from six import text_type + try: import xml.etree.cElementTree as default_etree except ImportError: @@ -9,7 +11,26 @@ except ImportError: __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", - "surrogatePairToCodepoint", "moduleFactoryFactory"] + "surrogatePairToCodepoint", "moduleFactoryFactory", + "supports_lone_surrogates"] + + +# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be +# caught by the below test. In general this would be any platform +# using UTF-16 as its encoding of unicode strings, such as +# Jython. This is because UTF-16 itself is based on the use of such +# surrogates, and there is no mechanism to further escape such +# escapes. +try: + _x = eval('"\\uD800"') + if not isinstance(_x, text_type): + # We need this with u"" because of http://bugs.jython.org/issue2039 + _x = eval('u"\\uD800"') + assert isinstance(_x, text_type) +except: + supports_lone_surrogates = False +else: + supports_lone_surrogates = True class MethodDispatcher(dict): @@ -43,7 +64,7 @@ class MethodDispatcher(dict): return dict.get(self, key, self.default) -# Some utility functions to dal with weirdness around UCS2 vs UCS4 +# Some utility functions to deal with weirdness around UCS2 vs UCS4 # python builds def isSurrogatePair(data): @@ -70,13 +91,21 @@ def moduleFactoryFactory(factory): else: name = b"_%s_factory" % baseModule.__name__ - if name in moduleCache: - return moduleCache[name] - else: + kwargs_tuple = tuple(kwargs.items()) + + try: + return moduleCache[name][args][kwargs_tuple] + except KeyError: mod = ModuleType(name) objs = factory(baseModule, *args, **kwargs) mod.__dict__.update(objs) - moduleCache[name] = mod + if "name" not in moduleCache: + moduleCache[name] = {} + if "args" not in moduleCache[name]: + moduleCache[name][args] = {} + if "kwargs" not in moduleCache[name][args]: + moduleCache[name][args][kwargs_tuple] = {} + moduleCache[name][args][kwargs_tuple] = mod return mod return moduleFactory