Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d).

JackDandy 2016-01-12 01:17:02 +00:00
parent 589d2544dd
commit 8c026d7977
21 changed files with 534 additions and 370 deletions

View file

@@ -13,6 +13,7 @@
 * Update chardet packages 2.3.0 (26982c5) to 2.3.0 (d7fae98)
 * Update dateutil library 2.4.2 (083f666) to 2.4.2 (d4baf97)
 * Update Hachoir library 1.3.4 (r1383) to 1.3.4 (r1435)
+* Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d)
 ### 0.11.0 (2016-01-10 22:30:00 UTC)

View file

@@ -20,4 +20,6 @@ from .serializer import serialize
 __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
 "getTreeWalker", "serialize"]
-__version__ = "0.999"
+# this has to be at the top level, see how setup.py parses this
+__version__ = "0.99999999-dev"

View file

@@ -1,292 +1,290 @@
 from __future__ import absolute_import, division, unicode_literals
 import string
-import gettext
-_ = gettext.gettext
 EOF = None
 E = {
"null-character": "null-character":
_("Null character in input stream, replaced with U+FFFD."), "Null character in input stream, replaced with U+FFFD.",
"invalid-codepoint": "invalid-codepoint":
_("Invalid codepoint in stream."), "Invalid codepoint in stream.",
"incorrectly-placed-solidus": "incorrectly-placed-solidus":
_("Solidus (/) incorrectly placed in tag."), "Solidus (/) incorrectly placed in tag.",
"incorrect-cr-newline-entity": "incorrect-cr-newline-entity":
_("Incorrect CR newline entity, replaced with LF."), "Incorrect CR newline entity, replaced with LF.",
"illegal-windows-1252-entity": "illegal-windows-1252-entity":
_("Entity used with illegal number (windows-1252 reference)."), "Entity used with illegal number (windows-1252 reference).",
"cant-convert-numeric-entity": "cant-convert-numeric-entity":
_("Numeric entity couldn't be converted to character " "Numeric entity couldn't be converted to character "
"(codepoint U+%(charAsInt)08x)."), "(codepoint U+%(charAsInt)08x).",
"illegal-codepoint-for-numeric-entity": "illegal-codepoint-for-numeric-entity":
_("Numeric entity represents an illegal codepoint: " "Numeric entity represents an illegal codepoint: "
"U+%(charAsInt)08x."), "U+%(charAsInt)08x.",
"numeric-entity-without-semicolon": "numeric-entity-without-semicolon":
_("Numeric entity didn't end with ';'."), "Numeric entity didn't end with ';'.",
"expected-numeric-entity-but-got-eof": "expected-numeric-entity-but-got-eof":
_("Numeric entity expected. Got end of file instead."), "Numeric entity expected. Got end of file instead.",
"expected-numeric-entity": "expected-numeric-entity":
_("Numeric entity expected but none found."), "Numeric entity expected but none found.",
"named-entity-without-semicolon": "named-entity-without-semicolon":
_("Named entity didn't end with ';'."), "Named entity didn't end with ';'.",
"expected-named-entity": "expected-named-entity":
_("Named entity expected. Got none."), "Named entity expected. Got none.",
"attributes-in-end-tag": "attributes-in-end-tag":
_("End tag contains unexpected attributes."), "End tag contains unexpected attributes.",
'self-closing-flag-on-end-tag': 'self-closing-flag-on-end-tag':
_("End tag contains unexpected self-closing flag."), "End tag contains unexpected self-closing flag.",
"expected-tag-name-but-got-right-bracket": "expected-tag-name-but-got-right-bracket":
_("Expected tag name. Got '>' instead."), "Expected tag name. Got '>' instead.",
"expected-tag-name-but-got-question-mark": "expected-tag-name-but-got-question-mark":
_("Expected tag name. Got '?' instead. (HTML doesn't " "Expected tag name. Got '?' instead. (HTML doesn't "
"support processing instructions.)"), "support processing instructions.)",
"expected-tag-name": "expected-tag-name":
_("Expected tag name. Got something else instead"), "Expected tag name. Got something else instead",
"expected-closing-tag-but-got-right-bracket": "expected-closing-tag-but-got-right-bracket":
_("Expected closing tag. Got '>' instead. Ignoring '</>'."), "Expected closing tag. Got '>' instead. Ignoring '</>'.",
"expected-closing-tag-but-got-eof": "expected-closing-tag-but-got-eof":
_("Expected closing tag. Unexpected end of file."), "Expected closing tag. Unexpected end of file.",
"expected-closing-tag-but-got-char": "expected-closing-tag-but-got-char":
_("Expected closing tag. Unexpected character '%(data)s' found."), "Expected closing tag. Unexpected character '%(data)s' found.",
"eof-in-tag-name": "eof-in-tag-name":
_("Unexpected end of file in the tag name."), "Unexpected end of file in the tag name.",
"expected-attribute-name-but-got-eof": "expected-attribute-name-but-got-eof":
_("Unexpected end of file. Expected attribute name instead."), "Unexpected end of file. Expected attribute name instead.",
"eof-in-attribute-name": "eof-in-attribute-name":
_("Unexpected end of file in attribute name."), "Unexpected end of file in attribute name.",
"invalid-character-in-attribute-name": "invalid-character-in-attribute-name":
_("Invalid character in attribute name"), "Invalid character in attribute name",
"duplicate-attribute": "duplicate-attribute":
_("Dropped duplicate attribute on tag."), "Dropped duplicate attribute on tag.",
"expected-end-of-tag-name-but-got-eof": "expected-end-of-tag-name-but-got-eof":
_("Unexpected end of file. Expected = or end of tag."), "Unexpected end of file. Expected = or end of tag.",
"expected-attribute-value-but-got-eof": "expected-attribute-value-but-got-eof":
_("Unexpected end of file. Expected attribute value."), "Unexpected end of file. Expected attribute value.",
"expected-attribute-value-but-got-right-bracket": "expected-attribute-value-but-got-right-bracket":
_("Expected attribute value. Got '>' instead."), "Expected attribute value. Got '>' instead.",
'equals-in-unquoted-attribute-value': 'equals-in-unquoted-attribute-value':
_("Unexpected = in unquoted attribute"), "Unexpected = in unquoted attribute",
'unexpected-character-in-unquoted-attribute-value': 'unexpected-character-in-unquoted-attribute-value':
_("Unexpected character in unquoted attribute"), "Unexpected character in unquoted attribute",
"invalid-character-after-attribute-name": "invalid-character-after-attribute-name":
_("Unexpected character after attribute name."), "Unexpected character after attribute name.",
"unexpected-character-after-attribute-value": "unexpected-character-after-attribute-value":
_("Unexpected character after attribute value."), "Unexpected character after attribute value.",
"eof-in-attribute-value-double-quote": "eof-in-attribute-value-double-quote":
_("Unexpected end of file in attribute value (\")."), "Unexpected end of file in attribute value (\").",
"eof-in-attribute-value-single-quote": "eof-in-attribute-value-single-quote":
_("Unexpected end of file in attribute value (')."), "Unexpected end of file in attribute value (').",
"eof-in-attribute-value-no-quotes": "eof-in-attribute-value-no-quotes":
_("Unexpected end of file in attribute value."), "Unexpected end of file in attribute value.",
"unexpected-EOF-after-solidus-in-tag": "unexpected-EOF-after-solidus-in-tag":
_("Unexpected end of file in tag. Expected >"), "Unexpected end of file in tag. Expected >",
"unexpected-character-after-solidus-in-tag": "unexpected-character-after-solidus-in-tag":
_("Unexpected character after / in tag. Expected >"), "Unexpected character after / in tag. Expected >",
"expected-dashes-or-doctype": "expected-dashes-or-doctype":
_("Expected '--' or 'DOCTYPE'. Not found."), "Expected '--' or 'DOCTYPE'. Not found.",
"unexpected-bang-after-double-dash-in-comment": "unexpected-bang-after-double-dash-in-comment":
_("Unexpected ! after -- in comment"), "Unexpected ! after -- in comment",
"unexpected-space-after-double-dash-in-comment": "unexpected-space-after-double-dash-in-comment":
_("Unexpected space after -- in comment"), "Unexpected space after -- in comment",
"incorrect-comment": "incorrect-comment":
_("Incorrect comment."), "Incorrect comment.",
"eof-in-comment": "eof-in-comment":
_("Unexpected end of file in comment."), "Unexpected end of file in comment.",
"eof-in-comment-end-dash": "eof-in-comment-end-dash":
_("Unexpected end of file in comment (-)"), "Unexpected end of file in comment (-)",
"unexpected-dash-after-double-dash-in-comment": "unexpected-dash-after-double-dash-in-comment":
_("Unexpected '-' after '--' found in comment."), "Unexpected '-' after '--' found in comment.",
"eof-in-comment-double-dash": "eof-in-comment-double-dash":
_("Unexpected end of file in comment (--)."), "Unexpected end of file in comment (--).",
"eof-in-comment-end-space-state": "eof-in-comment-end-space-state":
_("Unexpected end of file in comment."), "Unexpected end of file in comment.",
"eof-in-comment-end-bang-state": "eof-in-comment-end-bang-state":
_("Unexpected end of file in comment."), "Unexpected end of file in comment.",
"unexpected-char-in-comment": "unexpected-char-in-comment":
_("Unexpected character in comment found."), "Unexpected character in comment found.",
"need-space-after-doctype": "need-space-after-doctype":
_("No space after literal string 'DOCTYPE'."), "No space after literal string 'DOCTYPE'.",
"expected-doctype-name-but-got-right-bracket": "expected-doctype-name-but-got-right-bracket":
_("Unexpected > character. Expected DOCTYPE name."), "Unexpected > character. Expected DOCTYPE name.",
"expected-doctype-name-but-got-eof": "expected-doctype-name-but-got-eof":
_("Unexpected end of file. Expected DOCTYPE name."), "Unexpected end of file. Expected DOCTYPE name.",
"eof-in-doctype-name": "eof-in-doctype-name":
_("Unexpected end of file in DOCTYPE name."), "Unexpected end of file in DOCTYPE name.",
"eof-in-doctype": "eof-in-doctype":
_("Unexpected end of file in DOCTYPE."), "Unexpected end of file in DOCTYPE.",
"expected-space-or-right-bracket-in-doctype": "expected-space-or-right-bracket-in-doctype":
_("Expected space or '>'. Got '%(data)s'"), "Expected space or '>'. Got '%(data)s'",
"unexpected-end-of-doctype": "unexpected-end-of-doctype":
_("Unexpected end of DOCTYPE."), "Unexpected end of DOCTYPE.",
"unexpected-char-in-doctype": "unexpected-char-in-doctype":
_("Unexpected character in DOCTYPE."), "Unexpected character in DOCTYPE.",
"eof-in-innerhtml": "eof-in-innerhtml":
_("XXX innerHTML EOF"), "XXX innerHTML EOF",
"unexpected-doctype": "unexpected-doctype":
_("Unexpected DOCTYPE. Ignored."), "Unexpected DOCTYPE. Ignored.",
"non-html-root": "non-html-root":
_("html needs to be the first start tag."), "html needs to be the first start tag.",
"expected-doctype-but-got-eof": "expected-doctype-but-got-eof":
_("Unexpected End of file. Expected DOCTYPE."), "Unexpected End of file. Expected DOCTYPE.",
"unknown-doctype": "unknown-doctype":
_("Erroneous DOCTYPE."), "Erroneous DOCTYPE.",
"expected-doctype-but-got-chars": "expected-doctype-but-got-chars":
_("Unexpected non-space characters. Expected DOCTYPE."), "Unexpected non-space characters. Expected DOCTYPE.",
"expected-doctype-but-got-start-tag": "expected-doctype-but-got-start-tag":
_("Unexpected start tag (%(name)s). Expected DOCTYPE."), "Unexpected start tag (%(name)s). Expected DOCTYPE.",
"expected-doctype-but-got-end-tag": "expected-doctype-but-got-end-tag":
_("Unexpected end tag (%(name)s). Expected DOCTYPE."), "Unexpected end tag (%(name)s). Expected DOCTYPE.",
"end-tag-after-implied-root": "end-tag-after-implied-root":
_("Unexpected end tag (%(name)s) after the (implied) root element."), "Unexpected end tag (%(name)s) after the (implied) root element.",
"expected-named-closing-tag-but-got-eof": "expected-named-closing-tag-but-got-eof":
_("Unexpected end of file. Expected end tag (%(name)s)."), "Unexpected end of file. Expected end tag (%(name)s).",
"two-heads-are-not-better-than-one": "two-heads-are-not-better-than-one":
_("Unexpected start tag head in existing head. Ignored."), "Unexpected start tag head in existing head. Ignored.",
"unexpected-end-tag": "unexpected-end-tag":
_("Unexpected end tag (%(name)s). Ignored."), "Unexpected end tag (%(name)s). Ignored.",
"unexpected-start-tag-out-of-my-head": "unexpected-start-tag-out-of-my-head":
_("Unexpected start tag (%(name)s) that can be in head. Moved."), "Unexpected start tag (%(name)s) that can be in head. Moved.",
"unexpected-start-tag": "unexpected-start-tag":
_("Unexpected start tag (%(name)s)."), "Unexpected start tag (%(name)s).",
"missing-end-tag": "missing-end-tag":
_("Missing end tag (%(name)s)."), "Missing end tag (%(name)s).",
"missing-end-tags": "missing-end-tags":
_("Missing end tags (%(name)s)."), "Missing end tags (%(name)s).",
"unexpected-start-tag-implies-end-tag": "unexpected-start-tag-implies-end-tag":
_("Unexpected start tag (%(startName)s) " "Unexpected start tag (%(startName)s) "
"implies end tag (%(endName)s)."), "implies end tag (%(endName)s).",
"unexpected-start-tag-treated-as": "unexpected-start-tag-treated-as":
_("Unexpected start tag (%(originalName)s). Treated as %(newName)s."), "Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
"deprecated-tag": "deprecated-tag":
_("Unexpected start tag %(name)s. Don't use it!"), "Unexpected start tag %(name)s. Don't use it!",
"unexpected-start-tag-ignored": "unexpected-start-tag-ignored":
_("Unexpected start tag %(name)s. Ignored."), "Unexpected start tag %(name)s. Ignored.",
"expected-one-end-tag-but-got-another": "expected-one-end-tag-but-got-another":
_("Unexpected end tag (%(gotName)s). " "Unexpected end tag (%(gotName)s). "
"Missing end tag (%(expectedName)s)."), "Missing end tag (%(expectedName)s).",
"end-tag-too-early": "end-tag-too-early":
_("End tag (%(name)s) seen too early. Expected other end tag."), "End tag (%(name)s) seen too early. Expected other end tag.",
"end-tag-too-early-named": "end-tag-too-early-named":
_("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."), "Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).",
"end-tag-too-early-ignored": "end-tag-too-early-ignored":
_("End tag (%(name)s) seen too early. Ignored."), "End tag (%(name)s) seen too early. Ignored.",
"adoption-agency-1.1": "adoption-agency-1.1":
_("End tag (%(name)s) violates step 1, " "End tag (%(name)s) violates step 1, "
"paragraph 1 of the adoption agency algorithm."), "paragraph 1 of the adoption agency algorithm.",
"adoption-agency-1.2": "adoption-agency-1.2":
_("End tag (%(name)s) violates step 1, " "End tag (%(name)s) violates step 1, "
"paragraph 2 of the adoption agency algorithm."), "paragraph 2 of the adoption agency algorithm.",
"adoption-agency-1.3": "adoption-agency-1.3":
_("End tag (%(name)s) violates step 1, " "End tag (%(name)s) violates step 1, "
"paragraph 3 of the adoption agency algorithm."), "paragraph 3 of the adoption agency algorithm.",
"adoption-agency-4.4": "adoption-agency-4.4":
_("End tag (%(name)s) violates step 4, " "End tag (%(name)s) violates step 4, "
"paragraph 4 of the adoption agency algorithm."), "paragraph 4 of the adoption agency algorithm.",
"unexpected-end-tag-treated-as": "unexpected-end-tag-treated-as":
_("Unexpected end tag (%(originalName)s). Treated as %(newName)s."), "Unexpected end tag (%(originalName)s). Treated as %(newName)s.",
"no-end-tag": "no-end-tag":
_("This element (%(name)s) has no end tag."), "This element (%(name)s) has no end tag.",
"unexpected-implied-end-tag-in-table": "unexpected-implied-end-tag-in-table":
_("Unexpected implied end tag (%(name)s) in the table phase."), "Unexpected implied end tag (%(name)s) in the table phase.",
"unexpected-implied-end-tag-in-table-body": "unexpected-implied-end-tag-in-table-body":
_("Unexpected implied end tag (%(name)s) in the table body phase."), "Unexpected implied end tag (%(name)s) in the table body phase.",
"unexpected-char-implies-table-voodoo": "unexpected-char-implies-table-voodoo":
_("Unexpected non-space characters in " "Unexpected non-space characters in "
"table context caused voodoo mode."), "table context caused voodoo mode.",
"unexpected-hidden-input-in-table": "unexpected-hidden-input-in-table":
_("Unexpected input with type hidden in table context."), "Unexpected input with type hidden in table context.",
"unexpected-form-in-table": "unexpected-form-in-table":
_("Unexpected form in table context."), "Unexpected form in table context.",
"unexpected-start-tag-implies-table-voodoo": "unexpected-start-tag-implies-table-voodoo":
_("Unexpected start tag (%(name)s) in " "Unexpected start tag (%(name)s) in "
"table context caused voodoo mode."), "table context caused voodoo mode.",
"unexpected-end-tag-implies-table-voodoo": "unexpected-end-tag-implies-table-voodoo":
_("Unexpected end tag (%(name)s) in " "Unexpected end tag (%(name)s) in "
"table context caused voodoo mode."), "table context caused voodoo mode.",
"unexpected-cell-in-table-body": "unexpected-cell-in-table-body":
_("Unexpected table cell start tag (%(name)s) " "Unexpected table cell start tag (%(name)s) "
"in the table body phase."), "in the table body phase.",
"unexpected-cell-end-tag": "unexpected-cell-end-tag":
_("Got table cell end tag (%(name)s) " "Got table cell end tag (%(name)s) "
"while required end tags are missing."), "while required end tags are missing.",
"unexpected-end-tag-in-table-body": "unexpected-end-tag-in-table-body":
_("Unexpected end tag (%(name)s) in the table body phase. Ignored."), "Unexpected end tag (%(name)s) in the table body phase. Ignored.",
"unexpected-implied-end-tag-in-table-row": "unexpected-implied-end-tag-in-table-row":
_("Unexpected implied end tag (%(name)s) in the table row phase."), "Unexpected implied end tag (%(name)s) in the table row phase.",
"unexpected-end-tag-in-table-row": "unexpected-end-tag-in-table-row":
_("Unexpected end tag (%(name)s) in the table row phase. Ignored."), "Unexpected end tag (%(name)s) in the table row phase. Ignored.",
"unexpected-select-in-select": "unexpected-select-in-select":
_("Unexpected select start tag in the select phase " "Unexpected select start tag in the select phase "
"treated as select end tag."), "treated as select end tag.",
"unexpected-input-in-select": "unexpected-input-in-select":
_("Unexpected input start tag in the select phase."), "Unexpected input start tag in the select phase.",
"unexpected-start-tag-in-select": "unexpected-start-tag-in-select":
_("Unexpected start tag token (%(name)s in the select phase. " "Unexpected start tag token (%(name)s in the select phase. "
"Ignored."), "Ignored.",
"unexpected-end-tag-in-select": "unexpected-end-tag-in-select":
_("Unexpected end tag (%(name)s) in the select phase. Ignored."), "Unexpected end tag (%(name)s) in the select phase. Ignored.",
"unexpected-table-element-start-tag-in-select-in-table": "unexpected-table-element-start-tag-in-select-in-table":
_("Unexpected table element start tag (%(name)s) in the select in table phase."), "Unexpected table element start tag (%(name)s) in the select in table phase.",
"unexpected-table-element-end-tag-in-select-in-table": "unexpected-table-element-end-tag-in-select-in-table":
_("Unexpected table element end tag (%(name)s) in the select in table phase."), "Unexpected table element end tag (%(name)s) in the select in table phase.",
"unexpected-char-after-body": "unexpected-char-after-body":
_("Unexpected non-space characters in the after body phase."), "Unexpected non-space characters in the after body phase.",
"unexpected-start-tag-after-body": "unexpected-start-tag-after-body":
_("Unexpected start tag token (%(name)s)" "Unexpected start tag token (%(name)s)"
" in the after body phase."), " in the after body phase.",
"unexpected-end-tag-after-body": "unexpected-end-tag-after-body":
_("Unexpected end tag token (%(name)s)" "Unexpected end tag token (%(name)s)"
" in the after body phase."), " in the after body phase.",
"unexpected-char-in-frameset": "unexpected-char-in-frameset":
_("Unexpected characters in the frameset phase. Characters ignored."), "Unexpected characters in the frameset phase. Characters ignored.",
"unexpected-start-tag-in-frameset": "unexpected-start-tag-in-frameset":
_("Unexpected start tag token (%(name)s)" "Unexpected start tag token (%(name)s)"
" in the frameset phase. Ignored."), " in the frameset phase. Ignored.",
"unexpected-frameset-in-frameset-innerhtml": "unexpected-frameset-in-frameset-innerhtml":
_("Unexpected end tag token (frameset) " "Unexpected end tag token (frameset) "
"in the frameset phase (innerHTML)."), "in the frameset phase (innerHTML).",
"unexpected-end-tag-in-frameset": "unexpected-end-tag-in-frameset":
_("Unexpected end tag token (%(name)s)" "Unexpected end tag token (%(name)s)"
" in the frameset phase. Ignored."), " in the frameset phase. Ignored.",
"unexpected-char-after-frameset": "unexpected-char-after-frameset":
_("Unexpected non-space characters in the " "Unexpected non-space characters in the "
"after frameset phase. Ignored."), "after frameset phase. Ignored.",
"unexpected-start-tag-after-frameset": "unexpected-start-tag-after-frameset":
_("Unexpected start tag (%(name)s)" "Unexpected start tag (%(name)s)"
" in the after frameset phase. Ignored."), " in the after frameset phase. Ignored.",
"unexpected-end-tag-after-frameset": "unexpected-end-tag-after-frameset":
_("Unexpected end tag (%(name)s)" "Unexpected end tag (%(name)s)"
" in the after frameset phase. Ignored."), " in the after frameset phase. Ignored.",
"unexpected-end-tag-after-body-innerhtml": "unexpected-end-tag-after-body-innerhtml":
_("Unexpected end tag after body(innerHtml)"), "Unexpected end tag after body(innerHtml)",
"expected-eof-but-got-char": "expected-eof-but-got-char":
_("Unexpected non-space characters. Expected end of file."), "Unexpected non-space characters. Expected end of file.",
"expected-eof-but-got-start-tag": "expected-eof-but-got-start-tag":
_("Unexpected start tag (%(name)s)" "Unexpected start tag (%(name)s)"
". Expected end of file."), ". Expected end of file.",
"expected-eof-but-got-end-tag": "expected-eof-but-got-end-tag":
_("Unexpected end tag (%(name)s)" "Unexpected end tag (%(name)s)"
". Expected end of file."), ". Expected end of file.",
"eof-in-table": "eof-in-table":
_("Unexpected end of file. Expected table content."), "Unexpected end of file. Expected table content.",
"eof-in-select": "eof-in-select":
_("Unexpected end of file. Expected select content."), "Unexpected end of file. Expected select content.",
"eof-in-frameset": "eof-in-frameset":
_("Unexpected end of file. Expected frameset content."), "Unexpected end of file. Expected frameset content.",
"eof-in-script-in-script": "eof-in-script-in-script":
_("Unexpected end of file. Expected script content."), "Unexpected end of file. Expected script content.",
"eof-in-foreign-lands": "eof-in-foreign-lands":
_("Unexpected end of file. Expected foreign content"), "Unexpected end of file. Expected foreign content",
"non-void-element-with-trailing-solidus": "non-void-element-with-trailing-solidus":
_("Trailing solidus not allowed on element %(name)s"), "Trailing solidus not allowed on element %(name)s",
"unexpected-html-element-in-foreign-content": "unexpected-html-element-in-foreign-content":
_("Element %(name)s not allowed in a non-html context"), "Element %(name)s not allowed in a non-html context",
"unexpected-end-tag-before-html": "unexpected-end-tag-before-html":
_("Unexpected end tag (%(name)s) before html."), "Unexpected end tag (%(name)s) before html.",
"XXX-undefined-error": "XXX-undefined-error":
_("Undefined error (this sucks and should be fixed)"), "Undefined error (this sucks and should be fixed)",
} }
namespaces = { namespaces = {
@@ -298,7 +296,7 @@ namespaces = {
"xmlns": "http://www.w3.org/2000/xmlns/" "xmlns": "http://www.w3.org/2000/xmlns/"
} }
scopingElements = frozenset(( scopingElements = frozenset([
(namespaces["html"], "applet"), (namespaces["html"], "applet"),
(namespaces["html"], "caption"), (namespaces["html"], "caption"),
(namespaces["html"], "html"), (namespaces["html"], "html"),
@@ -316,9 +314,9 @@ scopingElements = frozenset((
(namespaces["svg"], "foreignObject"), (namespaces["svg"], "foreignObject"),
(namespaces["svg"], "desc"), (namespaces["svg"], "desc"),
(namespaces["svg"], "title"), (namespaces["svg"], "title"),
)) ])
formattingElements = frozenset(( formattingElements = frozenset([
(namespaces["html"], "a"), (namespaces["html"], "a"),
(namespaces["html"], "b"), (namespaces["html"], "b"),
(namespaces["html"], "big"), (namespaces["html"], "big"),
@@ -333,9 +331,9 @@ formattingElements = frozenset((
(namespaces["html"], "strong"), (namespaces["html"], "strong"),
(namespaces["html"], "tt"), (namespaces["html"], "tt"),
(namespaces["html"], "u") (namespaces["html"], "u")
)) ])
specialElements = frozenset(( specialElements = frozenset([
(namespaces["html"], "address"), (namespaces["html"], "address"),
(namespaces["html"], "applet"), (namespaces["html"], "applet"),
(namespaces["html"], "area"), (namespaces["html"], "area"),
@@ -416,22 +414,22 @@ specialElements = frozenset((
(namespaces["html"], "wbr"), (namespaces["html"], "wbr"),
(namespaces["html"], "xmp"), (namespaces["html"], "xmp"),
(namespaces["svg"], "foreignObject") (namespaces["svg"], "foreignObject")
)) ])
htmlIntegrationPointElements = frozenset(( htmlIntegrationPointElements = frozenset([
(namespaces["mathml"], "annotaion-xml"), (namespaces["mathml"], "annotaion-xml"),
(namespaces["svg"], "foreignObject"), (namespaces["svg"], "foreignObject"),
(namespaces["svg"], "desc"), (namespaces["svg"], "desc"),
(namespaces["svg"], "title") (namespaces["svg"], "title")
)) ])
mathmlTextIntegrationPointElements = frozenset(( mathmlTextIntegrationPointElements = frozenset([
(namespaces["mathml"], "mi"), (namespaces["mathml"], "mi"),
(namespaces["mathml"], "mo"), (namespaces["mathml"], "mo"),
(namespaces["mathml"], "mn"), (namespaces["mathml"], "mn"),
(namespaces["mathml"], "ms"), (namespaces["mathml"], "ms"),
(namespaces["mathml"], "mtext") (namespaces["mathml"], "mtext")
)) ])
adjustForeignAttributes = { adjustForeignAttributes = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]), "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
@@ -451,21 +449,21 @@ adjustForeignAttributes = {
unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
adjustForeignAttributes.items()]) adjustForeignAttributes.items()])
spaceCharacters = frozenset(( spaceCharacters = frozenset([
"\t", "\t",
"\n", "\n",
"\u000C", "\u000C",
" ", " ",
"\r" "\r"
)) ])
tableInsertModeElements = frozenset(( tableInsertModeElements = frozenset([
"table", "table",
"tbody", "tbody",
"tfoot", "tfoot",
"thead", "thead",
"tr" "tr"
)) ])
asciiLowercase = frozenset(string.ascii_lowercase) asciiLowercase = frozenset(string.ascii_lowercase)
asciiUppercase = frozenset(string.ascii_uppercase) asciiUppercase = frozenset(string.ascii_uppercase)
@@ -486,7 +484,7 @@ headingElements = (
"h6" "h6"
) )
voidElements = frozenset(( voidElements = frozenset([
"base", "base",
"command", "command",
"event-source", "event-source",
@@ -502,11 +500,11 @@ voidElements = frozenset((
"input", "input",
"source", "source",
"track" "track"
)) ])
cdataElements = frozenset(('title', 'textarea')) cdataElements = frozenset(['title', 'textarea'])
rcdataElements = frozenset(( rcdataElements = frozenset([
'style', 'style',
'script', 'script',
'xmp', 'xmp',
@@ -514,27 +512,27 @@ rcdataElements = frozenset((
'noembed', 'noembed',
'noframes', 'noframes',
'noscript' 'noscript'
)) ])
booleanAttributes = { booleanAttributes = {
"": frozenset(("irrelevant",)), "": frozenset(["irrelevant"]),
"style": frozenset(("scoped",)), "style": frozenset(["scoped"]),
"img": frozenset(("ismap",)), "img": frozenset(["ismap"]),
"audio": frozenset(("autoplay", "controls")), "audio": frozenset(["autoplay", "controls"]),
"video": frozenset(("autoplay", "controls")), "video": frozenset(["autoplay", "controls"]),
"script": frozenset(("defer", "async")), "script": frozenset(["defer", "async"]),
"details": frozenset(("open",)), "details": frozenset(["open"]),
"datagrid": frozenset(("multiple", "disabled")), "datagrid": frozenset(["multiple", "disabled"]),
"command": frozenset(("hidden", "disabled", "checked", "default")), "command": frozenset(["hidden", "disabled", "checked", "default"]),
"hr": frozenset(("noshade")), "hr": frozenset(["noshade"]),
"menu": frozenset(("autosubmit",)), "menu": frozenset(["autosubmit"]),
"fieldset": frozenset(("disabled", "readonly")), "fieldset": frozenset(["disabled", "readonly"]),
"option": frozenset(("disabled", "readonly", "selected")), "option": frozenset(["disabled", "readonly", "selected"]),
"optgroup": frozenset(("disabled", "readonly")), "optgroup": frozenset(["disabled", "readonly"]),
"button": frozenset(("disabled", "autofocus")), "button": frozenset(["disabled", "autofocus"]),
"input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")), "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
"select": frozenset(("disabled", "readonly", "autofocus", "multiple")), "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
"output": frozenset(("disabled", "readonly")), "output": frozenset(["disabled", "readonly"]),
} }
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It # entitiesWindows1252 has to be _ordered_ and needs to have an index. It
@@ -574,7 +572,7 @@ entitiesWindows1252 = (
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
) )
xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;')) xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;'])
entities = { entities = {
"AElig": "\xc6", "AElig": "\xc6",
@@ -3088,8 +3086,8 @@ tokenTypes = {
"ParseError": 7 "ParseError": 7
} }
tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"])) tokenTypes["EmptyTag"]])
prefixes = dict([(v, k) for k, v in namespaces.items()]) prefixes = dict([(v, k) for k, v in namespaces.items()])
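With the gettext _() wrappers removed, the messages in E above are plain %-format templates that callers format directly. A minimal sketch (not part of this commit) of rendering one of them, using a key and placeholder taken from the table above:

    from html5lib.constants import E

    message = E["expected-doctype-but-got-start-tag"] % {"name": "div"}
    print(message)  # Unexpected start tag (div). Expected DOCTYPE.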

View file

@@ -1,8 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
-from gettext import gettext
-_ = gettext
 from . import _base
 from ..constants import cdataElements, rcdataElements, voidElements
@@ -23,24 +20,24 @@ class Filter(_base.Filter):
if type in ("StartTag", "EmptyTag"): if type in ("StartTag", "EmptyTag"):
name = token["name"] name = token["name"]
if contentModelFlag != "PCDATA": if contentModelFlag != "PCDATA":
raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name}) raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
if not isinstance(name, str): if not isinstance(name, str):
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name}) raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
if not name: if not name:
raise LintError(_("Empty tag name")) raise LintError("Empty tag name")
if type == "StartTag" and name in voidElements: if type == "StartTag" and name in voidElements:
raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name}) raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
elif type == "EmptyTag" and name not in voidElements: elif type == "EmptyTag" and name not in voidElements:
raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]}) raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
if type == "StartTag": if type == "StartTag":
open_elements.append(name) open_elements.append(name)
for name, value in token["data"]: for name, value in token["data"]:
if not isinstance(name, str): if not isinstance(name, str):
raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name}) raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
if not name: if not name:
raise LintError(_("Empty attribute name")) raise LintError("Empty attribute name")
if not isinstance(value, str): if not isinstance(value, str):
raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value}) raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
if name in cdataElements: if name in cdataElements:
contentModelFlag = "CDATA" contentModelFlag = "CDATA"
elif name in rcdataElements: elif name in rcdataElements:
@@ -51,43 +48,43 @@ class Filter(_base.Filter):
elif type == "EndTag": elif type == "EndTag":
name = token["name"] name = token["name"]
if not isinstance(name, str): if not isinstance(name, str):
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name}) raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
if not name: if not name:
raise LintError(_("Empty tag name")) raise LintError("Empty tag name")
if name in voidElements: if name in voidElements:
raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name}) raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
start_name = open_elements.pop() start_name = open_elements.pop()
if start_name != name: if start_name != name:
raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name}) raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
contentModelFlag = "PCDATA" contentModelFlag = "PCDATA"
elif type == "Comment": elif type == "Comment":
if contentModelFlag != "PCDATA": if contentModelFlag != "PCDATA":
raise LintError(_("Comment not in PCDATA content model flag")) raise LintError("Comment not in PCDATA content model flag")
elif type in ("Characters", "SpaceCharacters"): elif type in ("Characters", "SpaceCharacters"):
data = token["data"] data = token["data"]
if not isinstance(data, str): if not isinstance(data, str):
raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data}) raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
if not data: if not data:
raise LintError(_("%(type)s token with empty data") % {"type": type}) raise LintError("%(type)s token with empty data" % {"type": type})
if type == "SpaceCharacters": if type == "SpaceCharacters":
data = data.strip(spaceCharacters) data = data.strip(spaceCharacters)
if data: if data:
raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data}) raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
elif type == "Doctype": elif type == "Doctype":
name = token["name"] name = token["name"]
if contentModelFlag != "PCDATA": if contentModelFlag != "PCDATA":
raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name}) raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
if not isinstance(name, str): if not isinstance(name, str):
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name}) raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
# XXX: what to do with token["data"] ? # XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"): elif type in ("ParseError", "SerializeError"):
pass pass
else: else:
raise LintError(_("Unknown token type: %(type)s") % {"type": type}) raise LintError("Unknown token type: %(type)s" % {"type": type})
yield token yield token

View file

@@ -58,7 +58,7 @@ class Filter(_base.Filter):
 elif tagname == 'colgroup':
 # A colgroup element's start tag may be omitted if the first thing
 # inside the colgroup element is a col element, and if the element
-# is not immediately preceeded by another colgroup element whose
+# is not immediately preceded by another colgroup element whose
 # end tag has been omitted.
 if type in ("StartTag", "EmptyTag"):
 # XXX: we do not look at the preceding event, so instead we never
@@ -70,7 +70,7 @@ class Filter(_base.Filter):
 elif tagname == 'tbody':
 # A tbody element's start tag may be omitted if the first thing
 # inside the tbody element is a tr element, and if the element is
-# not immediately preceeded by a tbody, thead, or tfoot element
+# not immediately preceded by a tbody, thead, or tfoot element
 # whose end tag has been omitted.
 if type == "StartTag":
 # omit the thead and tfoot elements' end tag when they are
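The start-tag omission these comments describe is what the serializer's omit_optional_tags option drives. A small sketch of the effect (public API assumed for this html5lib version; output shown is indicative):

    import html5lib

    table = html5lib.parse("<table><tr><td>1</td></tr></table>")
    # The parser inserts an implied <tbody>; with omit_optional_tags the
    # optionaltags filter drops its start tag again, as the comment above says.
    print(html5lib.serialize(table, tree="etree", omit_optional_tags=True))
    # e.g. '<table><tr><td>1'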

View file

@@ -18,6 +18,7 @@ from .constants import cdataElements, rcdataElements
 from .constants import tokenTypes, ReparseException, namespaces
 from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
 from .constants import adjustForeignAttributes as adjustForeignAttributesMap
+from .constants import E
 def parse(doc, treebuilder="etree", encoding=None,
@@ -129,6 +130,17 @@ class HTMLParser(object):
 self.framesetOK = True
+@property
+def documentEncoding(self):
+"""The name of the character encoding
+that was used to decode the input stream,
+or :obj:`None` if that is not determined yet.
+"""
+if not hasattr(self, 'tokenizer'):
+return None
+return self.tokenizer.stream.charEncoding[0]
 def isHTMLIntegrationPoint(self, element):
 if (element.name == "annotation-xml" and
 element.namespace == namespaces["mathml"]):
@@ -245,7 +257,7 @@ class HTMLParser(object):
 # XXX The idea is to make errorcode mandatory.
 self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
 if self.strict:
-raise ParseError
+raise ParseError(E[errorcode] % datavars)
 def normalizeToken(self, token):
 """ HTML5 specific normalizations to the token stream """
@@ -868,7 +880,7 @@ def getPhases(debug):
 self.startTagHandler = utils.MethodDispatcher([
 ("html", self.startTagHtml),
 (("base", "basefont", "bgsound", "command", "link", "meta",
-"noframes", "script", "style", "title"),
+"script", "style", "title"),
 self.startTagProcessInHead),
 ("body", self.startTagBody),
 ("frameset", self.startTagFrameset),
@@ -1205,8 +1217,7 @@ def getPhases(debug):
 attributes["name"] = "isindex"
 self.processStartTag(impliedTagToken("input", "StartTag",
 attributes=attributes,
-selfClosing=
-token["selfClosing"]))
+selfClosing=token["selfClosing"]))
 self.processEndTag(impliedTagToken("label"))
 self.processStartTag(impliedTagToken("hr", "StartTag"))
 self.processEndTag(impliedTagToken("form"))
@@ -1316,7 +1327,7 @@ def getPhases(debug):
 # Not sure this is the correct name for the parse error
 self.parser.parseError(
 "expected-one-end-tag-but-got-another",
-{"expectedName": "body", "gotName": node.name})
+{"gotName": "body", "expectedName": node.name})
 break
 self.parser.phase = self.parser.phases["afterBody"]
@@ -2553,7 +2564,7 @@ def getPhases(debug):
 self.tree.openElements.pop()
 if (not self.parser.innerHTML and
 self.tree.openElements[-1].name != "frameset"):
-# If we're not in innerHTML mode and the the current node is not a
+# If we're not in innerHTML mode and the current node is not a
 # "frameset" element (anymore) then switch.
 self.parser.phase = self.parser.phases["afterFrameset"]
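A short sketch of the two parser-level changes above: the new documentEncoding property exposes the encoding used to decode the input, and strict mode now raises ParseError with the formatted message from constants.E instead of a bare exception (names from this diff, usage assumed):

    import html5lib
    from html5lib.html5parser import ParseError

    parser = html5lib.HTMLParser()
    parser.parse(b'<meta charset="utf-8"><p>hi</p>')
    print(parser.documentEncoding)  # 'utf-8'

    strict = html5lib.HTMLParser(strict=True)
    try:
        strict.parse("<p>hi</p>")  # no doctype
    except ParseError as err:
        print(err)  # Unexpected start tag (p). Expected DOCTYPE.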

View file

@@ -225,6 +225,9 @@ class InfosetFilter(object):
 while "--" in data:
 warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
 data = data.replace("--", "- -")
+if data.endswith("-"):
+warnings.warn("Comments cannot end in a dash", DataLossWarning)
+data += " "
 return data
 def coerceCharacters(self, data):
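A quick sketch of the new coercion above, assuming this module's InfosetFilter.coerceComment method: comment data ending in "-" now gets a trailing space (and "--" is still split), each with a DataLossWarning:

    from html5lib.ihatexml import InfosetFilter

    f = InfosetFilter(preventDoubleDashComments=True)
    print(repr(f.coerceComment("ends with a dash-")))     # 'ends with a dash- '
    print(repr(f.coerceComment("has--adjacent dashes")))  # 'has- -adjacent dashes'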

View file

@@ -1,6 +1,7 @@
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type
-from six.moves import http_client
+from six.moves import http_client, urllib
 import codecs
 import re
@@ -28,7 +29,18 @@ asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase]) asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
if utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with
# unichr. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone
# surrogates.
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
eval('"\\uD800-\\uDFFF"'))
else:
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -119,9 +131,12 @@ class BufferedStream(object):
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
if isinstance(source, http_client.HTTPResponse): # Work around Python bug #20007: read(0) closes the connection.
# Work around Python bug #20007: read(0) closes the connection. # http://bugs.python.org/issue20007
# http://bugs.python.org/issue20007 if (isinstance(source, http_client.HTTPResponse) or
# Also check for addinfourl wrapping HTTPResponse
(isinstance(source, urllib.response.addbase) and
isinstance(source.fp, http_client.HTTPResponse))):
isUnicode = False isUnicode = False
elif hasattr(source, "read"): elif hasattr(source, "read"):
isUnicode = isinstance(source.read(0), text_type) isUnicode = isinstance(source.read(0), text_type)
@@ -164,13 +179,18 @@ class HTMLUnicodeInputStream(object):
""" """
# Craziness if not utils.supports_lone_surrogates:
if len("\U0010FFFF") == 1: # Such platforms will have already checked for such
# surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None
self.replaceCharactersRegexp = None
elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4 self.reportCharacterErrors = self.characterErrorsUCS4
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]") self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
else: else:
self.reportCharacterErrors = self.characterErrorsUCS2 self.reportCharacterErrors = self.characterErrorsUCS2
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])") self.replaceCharactersRegexp = re.compile(
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
# List of where new lines occur # List of where new lines occur
self.newLines = [0] self.newLines = [0]
@@ -265,11 +285,12 @@ class HTMLUnicodeInputStream(object):
self._bufferedCharacter = data[-1] self._bufferedCharacter = data[-1]
data = data[:-1] data = data[:-1]
self.reportCharacterErrors(data) if self.reportCharacterErrors:
self.reportCharacterErrors(data)
# Replace invalid characters # Replace invalid characters
# Note U+0000 is dealt with in the tokenizer # Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data) data = self.replaceCharactersRegexp.sub("\ufffd", data)
data = data.replace("\r\n", "\n") data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n") data = data.replace("\r", "\n")
@@ -452,7 +473,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
if encoding is None and parseMeta: if encoding is None and parseMeta:
encoding = self.detectEncodingMeta() encoding = self.detectEncodingMeta()
confidence = "tentative" confidence = "tentative"
# Guess with chardet, if avaliable # Guess with chardet, if available
if encoding is None and chardet: if encoding is None and chardet:
confidence = "tentative" confidence = "tentative"
try: try:
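The extra isinstance check above makes urllib responses (addbase wrappers around an HTTPResponse) take the same byte-stream path, so the read(0) workaround for Python bug #20007 now covers them too. A sketch with an illustrative URL (not from the commit):

    import html5lib
    from six.moves.urllib.request import urlopen

    response = urlopen("http://example.com/")  # assumed reachable URL
    document = html5lib.parse(response)        # parsed as a byte stream
    response.close()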

View file

@@ -2,11 +2,26 @@ from __future__ import absolute_import, division, unicode_literals
 import re
 from xml.sax.saxutils import escape, unescape
+from six.moves import urllib_parse as urlparse
 from .tokenizer import HTMLTokenizer
 from .constants import tokenTypes
content_type_rgx = re.compile(r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
re.VERBOSE)
class HTMLSanitizerMixin(object): class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes.""" """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
@@ -100,8 +115,8 @@ class HTMLSanitizerMixin(object):
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
'y1', 'y2', 'zoomAndPan'] 'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
'xlink:href', 'xml:base'] 'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill', svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc', acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal', 'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag', 'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs'] 'ssh', 'sftp', 'rtsp', 'afs', 'data']
acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
# subclasses may define their own versions of these constants # subclasses may define their own versions of these constants
allowed_elements = acceptable_elements + mathml_elements + svg_elements allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
allowed_css_keywords = acceptable_css_keywords allowed_css_keywords = acceptable_css_keywords
allowed_svg_properties = acceptable_svg_properties allowed_svg_properties = acceptable_svg_properties
allowed_protocols = acceptable_protocols allowed_protocols = acceptable_protocols
allowed_content_types = acceptable_content_types
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +207,21 @@ class HTMLSanitizerMixin(object):
unescape(attrs[attr])).lower() unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters # remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "") val_unescaped = val_unescaped.replace("\ufffd", "")
if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and try:
(val_unescaped.split(':')[0] not in uri = urlparse.urlparse(val_unescaped)
self.allowed_protocols)): except ValueError:
uri = None
del attrs[attr] del attrs[attr]
if uri and uri.scheme:
if uri.scheme not in self.allowed_protocols:
del attrs[attr]
if uri.scheme == 'data':
m = content_type_rgx.match(uri.path)
if not m:
del attrs[attr]
elif m.group('content_type') not in self.allowed_content_types:
del attrs[attr]
for attr in self.svg_attr_val_allows_ref: for attr in self.svg_attr_val_allows_ref:
if attr in attrs: if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
@@ -245,7 +274,7 @@ class HTMLSanitizerMixin(object):
elif prop.split('-')[0].lower() in ['background', 'border', 'margin', elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
'padding']: 'padding']:
for keyword in value.split(): for keyword in value.split():
if not keyword in self.acceptable_css_keywords and \ if keyword not in self.acceptable_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
break break
else: else:
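With 'data' added to the allowed protocols, the new content_type_rgx check above keeps only data: URIs whose content type is whitelisted. A sketch of the intended effect (tokenizer-based sanitizer usage assumed for this html5lib version):

    import html5lib
    from html5lib import sanitizer

    parser = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    ok = parser.parseFragment('<img src="data:image/png;base64,iVBORw0KGgo=">')
    bad = parser.parseFragment('<img src="data:text/html;base64,PHNjcmlwdD4=">')
    # 'ok' keeps its src attribute (image/png is in allowed_content_types),
    # 'bad' has src stripped (text/html is not).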

View file

@@ -1,9 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type
-import gettext
-_ = gettext.gettext
 try:
 from functools import reduce
 except ImportError:
@@ -35,7 +32,7 @@ else:
v = utils.surrogatePairToCodepoint(v) v = utils.surrogatePairToCodepoint(v)
else: else:
v = ord(v) v = ord(v)
if not v in encode_entity_map or k.islower(): if v not in encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc. # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k encode_entity_map[v] = k
@@ -208,7 +205,7 @@ class HTMLSerializer(object):
if token["systemId"]: if token["systemId"]:
if token["systemId"].find('"') >= 0: if token["systemId"].find('"') >= 0:
if token["systemId"].find("'") >= 0: if token["systemId"].find("'") >= 0:
self.serializeError(_("System identifer contains both single and double quote characters")) self.serializeError("System identifer contains both single and double quote characters")
quote_char = "'" quote_char = "'"
else: else:
quote_char = '"' quote_char = '"'
@@ -220,7 +217,7 @@ class HTMLSerializer(object):
elif type in ("Characters", "SpaceCharacters"): elif type in ("Characters", "SpaceCharacters"):
if type == "SpaceCharacters" or in_cdata: if type == "SpaceCharacters" or in_cdata:
if in_cdata and token["data"].find("</") >= 0: if in_cdata and token["data"].find("</") >= 0:
self.serializeError(_("Unexpected </ in CDATA")) self.serializeError("Unexpected </ in CDATA")
yield self.encode(token["data"]) yield self.encode(token["data"])
else: else:
yield self.encode(escape(token["data"])) yield self.encode(escape(token["data"]))
@@ -231,7 +228,7 @@ class HTMLSerializer(object):
if name in rcdataElements and not self.escape_rcdata: if name in rcdataElements and not self.escape_rcdata:
in_cdata = True in_cdata = True
elif in_cdata: elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element")) self.serializeError("Unexpected child element of a CDATA element")
for (attr_namespace, attr_name), attr_value in token["data"].items(): for (attr_namespace, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here # TODO: Add namespace support here
k = attr_name k = attr_name
@@ -279,20 +276,20 @@ class HTMLSerializer(object):
if name in rcdataElements: if name in rcdataElements:
in_cdata = False in_cdata = False
elif in_cdata: elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element")) self.serializeError("Unexpected child element of a CDATA element")
yield self.encodeStrict("</%s>" % name) yield self.encodeStrict("</%s>" % name)
elif type == "Comment": elif type == "Comment":
data = token["data"] data = token["data"]
if data.find("--") >= 0: if data.find("--") >= 0:
self.serializeError(_("Comment contains --")) self.serializeError("Comment contains --")
yield self.encodeStrict("<!--%s-->" % token["data"]) yield self.encodeStrict("<!--%s-->" % token["data"])
elif type == "Entity": elif type == "Entity":
name = token["name"] name = token["name"]
key = name + ";" key = name + ";"
if not key in entities: if key not in entities:
self.serializeError(_("Entity %s not recognized" % name)) self.serializeError("Entity %s not recognized" % name)
if self.resolve_entities and key not in xmlEntities: if self.resolve_entities and key not in xmlEntities:
data = entities[key] data = entities[key]
else: else:

View file

@@ -0,0 +1,12 @@
from __future__ import absolute_import, division, unicode_literals
from . import sax
__all__ = ["sax"]
try:
from . import genshi # flake8: noqa
except ImportError:
pass
else:
__all__.append("genshi")

View file

@@ -0,0 +1,47 @@
from __future__ import absolute_import, division, unicode_literals
from genshi.core import QName, Attrs
from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
def to_genshi(walker):
text = []
for token in walker:
type = token["type"]
if type in ("Characters", "SpaceCharacters"):
text.append(token["data"])
elif text:
yield TEXT, "".join(text), (None, -1, -1)
text = []
if type in ("StartTag", "EmptyTag"):
if token["namespace"]:
name = "{%s}%s" % (token["namespace"], token["name"])
else:
name = token["name"]
attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
for attr, value in token["data"].items()])
yield (START, (QName(name), attrs), (None, -1, -1))
if type == "EmptyTag":
type = "EndTag"
if type == "EndTag":
if token["namespace"]:
name = "{%s}%s" % (token["namespace"], token["name"])
else:
name = token["name"]
yield END, QName(name), (None, -1, -1)
elif type == "Comment":
yield COMMENT, token["data"], (None, -1, -1)
elif type == "Doctype":
yield DOCTYPE, (token["name"], token["publicId"],
token["systemId"]), (None, -1, -1)
else:
pass # FIXME: What to do?
if text:
yield TEXT, "".join(text), (None, -1, -1)
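to_genshi() above converts the token stream produced by an html5lib tree walker into Genshi's (kind, data, pos) event triples, folding runs of character tokens into single TEXT events; the position is always (None, -1, -1) because the walker carries no source locations. A rough usage sketch, assuming the optional genshi dependency is installed and that the adapter lives at html5lib.treeadapters.genshi as the new files suggest:

    import html5lib
    from html5lib.treeadapters.genshi import to_genshi

    tree = html5lib.parse("<p>Hello <b>world</b></p>")
    walker = html5lib.getTreeWalker("etree")
    for kind, data, pos in to_genshi(walker(tree)):
        print(kind, data)   # e.g. START/(QName, Attrs), TEXT/'Hello ', END/QName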

View file

@ -158,7 +158,7 @@ def getDomBuilder(DomImplementation):
else: else:
# HACK: allow text nodes as children of the document node # HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'): if hasattr(self.dom, '_child_node_types'):
if not Node.TEXT_NODE in self.dom._child_node_types: if Node.TEXT_NODE not in self.dom._child_node_types:
self.dom._child_node_types = list(self.dom._child_node_types) self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE) self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data)) self.dom.appendChild(self.dom.createTextNode(data))

View file

@ -54,7 +54,7 @@ class Document(object):
def testSerializer(element): def testSerializer(element):
rv = [] rv = []
finalText = None finalText = None
infosetFilter = ihatexml.InfosetFilter() infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
def serializeElement(element, indent=0): def serializeElement(element, indent=0):
if not hasattr(element, "tag"): if not hasattr(element, "tag"):
@ -79,7 +79,7 @@ def testSerializer(element):
next_element = next_element.getnext() next_element = next_element.getnext()
elif isinstance(element, str) or isinstance(element, bytes): elif isinstance(element, str) or isinstance(element, bytes):
# Text in a fragment # Text in a fragment
assert isinstance(element, str) or sys.version_info.major == 2 assert isinstance(element, str) or sys.version_info[0] == 2
rv.append("|%s\"%s\"" % (' ' * indent, element)) rv.append("|%s\"%s\"" % (' ' * indent, element))
else: else:
# Fragment case # Fragment case
@ -189,7 +189,7 @@ class TreeBuilder(_base.TreeBuilder):
def __init__(self, namespaceHTMLElements, fullTree=False): def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree) builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter() infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
self.namespaceHTMLElements = namespaceHTMLElements self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict): class Attributes(dict):
@ -257,7 +257,7 @@ class TreeBuilder(_base.TreeBuilder):
data = property(_getData, _setData) data = property(_getData, _setData)
self.elementClass = Element self.elementClass = Element
self.commentClass = builder.Comment self.commentClass = Comment
# self.fragmentClass = builder.DocumentFragment # self.fragmentClass = builder.DocumentFragment
_base.TreeBuilder.__init__(self, namespaceHTMLElements) _base.TreeBuilder.__init__(self, namespaceHTMLElements)
@ -315,7 +315,7 @@ class TreeBuilder(_base.TreeBuilder):
"""Create the document root""" """Create the document root"""
# Because of the way libxml2 works, it doesn't seem to be possible to # Because of the way libxml2 works, it doesn't seem to be possible to
# alter information like the doctype after the tree has been parsed. # alter information like the doctype after the tree has been parsed.
# Therefore we need to use the built-in parser to create our iniial # Therefore we need to use the built-in parser to create our initial
# tree, after which we can add elements like normal # tree, after which we can add elements like normal
docStr = "" docStr = ""
if self.doctype: if self.doctype:
@ -344,7 +344,8 @@ class TreeBuilder(_base.TreeBuilder):
# Append the initial comments: # Append the initial comments:
for comment_token in self.initial_comments: for comment_token in self.initial_comments:
root.addprevious(etree.Comment(comment_token["data"])) comment = self.commentClass(comment_token["data"])
root.addprevious(comment._element)
# Create the root document and add the ElementTree to it # Create the root document and add the ElementTree to it
self.document = self.documentClass() self.document = self.documentClass()
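In the etree_lxml hunks, comments that appear before the root element are now built through the tree builder's commentClass wrapper (and the InfosetFilter is constructed with preventDoubleDashComments=True) instead of calling etree.Comment directly, with the wrapped element attached via addprevious(). A hedged sketch of exercising that path, assuming lxml is installed; whether parse() hands back the full lxml document or only its root depends on the builder's fullTree flag, so the sketch tolerates both:

    import html5lib

    doc = html5lib.parse("<!--early comment--><!DOCTYPE html><title>t</title>",
                         treebuilder="lxml")
    root = doc.getroot() if hasattr(doc, "getroot") else doc
    # The comment parsed before <html> ends up as a preceding sibling of the root.
    print(root.getprevious())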

View file

@ -10,8 +10,9 @@ returning an iterator generating tokens.
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
import sys __all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"]
from .. import constants
from ..utils import default_etree from ..utils import default_etree
treeWalkerCache = {} treeWalkerCache = {}
@ -20,28 +21,27 @@ treeWalkerCache = {}
def getTreeWalker(treeType, implementation=None, **kwargs): def getTreeWalker(treeType, implementation=None, **kwargs):
"""Get a TreeWalker class for various types of tree with built-in support """Get a TreeWalker class for various types of tree with built-in support
treeType - the name of the tree type required (case-insensitive). Supported Args:
values are: treeType (str): the name of the tree type required (case-insensitive).
Supported values are:
"dom" - The xml.dom.minidom DOM implementation - "dom": The xml.dom.minidom DOM implementation
"pulldom" - The xml.dom.pulldom event stream - "etree": A generic walker for tree implementations exposing an
"etree" - A generic walker for tree implementations exposing an elementtree-like interface (known to work with
elementtree-like interface (known to work with ElementTree, cElementTree and lxml.etree).
ElementTree, cElementTree and lxml.etree). - "lxml": Optimized walker for lxml.etree
"lxml" - Optimized walker for lxml.etree - "genshi": a Genshi stream
"genshi" - a Genshi stream
implementation - (Currently applies to the "etree" tree type only). A module Implementation: A module implementing the tree type e.g.
implementing the tree type e.g. xml.etree.ElementTree or xml.etree.ElementTree or cElementTree (Currently applies to the
cElementTree.""" "etree" tree type only).
"""
treeType = treeType.lower() treeType = treeType.lower()
if treeType not in treeWalkerCache: if treeType not in treeWalkerCache:
if treeType in ("dom", "pulldom"): if treeType == "dom":
name = "%s.%s" % (__name__, treeType) from . import dom
__import__(name) treeWalkerCache[treeType] = dom.TreeWalker
mod = sys.modules[name]
treeWalkerCache[treeType] = mod.TreeWalker
elif treeType == "genshi": elif treeType == "genshi":
from . import genshistream from . import genshistream
treeWalkerCache[treeType] = genshistream.TreeWalker treeWalkerCache[treeType] = genshistream.TreeWalker
@ -55,3 +55,89 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
# XXX: NEVER cache here, caching is done in the etree submodule # XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeWalker return etree.getETreeModule(implementation, **kwargs).TreeWalker
return treeWalkerCache.get(treeType) return treeWalkerCache.get(treeType)
def concatenateCharacterTokens(tokens):
pendingCharacters = []
for token in tokens:
type = token["type"]
if type in ("Characters", "SpaceCharacters"):
pendingCharacters.append(token["data"])
else:
if pendingCharacters:
yield {"type": "Characters", "data": "".join(pendingCharacters)}
pendingCharacters = []
yield token
if pendingCharacters:
yield {"type": "Characters", "data": "".join(pendingCharacters)}
def pprint(walker):
"""Pretty printer for tree walkers"""
output = []
indent = 0
for token in concatenateCharacterTokens(walker):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
# tag name
if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
if token["namespace"] in constants.prefixes:
ns = constants.prefixes[token["namespace"]]
else:
ns = token["namespace"]
name = "%s %s" % (ns, token["name"])
else:
name = token["name"]
output.append("%s<%s>" % (" " * indent, name))
indent += 2
# attributes (sorted for consistent ordering)
attrs = token["data"]
for (namespace, localname), value in sorted(attrs.items()):
if namespace:
if namespace in constants.prefixes:
ns = constants.prefixes[namespace]
else:
ns = namespace
name = "%s %s" % (ns, localname)
else:
name = localname
output.append("%s%s=\"%s\"" % (" " * indent, name, value))
# self-closing
if type == "EmptyTag":
indent -= 2
elif type == "EndTag":
indent -= 2
elif type == "Comment":
output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
elif type == "Doctype":
if token["name"]:
if token["publicId"]:
output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
(" " * indent,
token["name"],
token["publicId"],
token["systemId"] if token["systemId"] else ""))
elif token["systemId"]:
output.append("""%s<!DOCTYPE %s "" "%s">""" %
(" " * indent,
token["name"],
token["systemId"]))
else:
output.append("%s<!DOCTYPE %s>" % (" " * indent,
token["name"]))
else:
output.append("%s<!DOCTYPE >" % (" " * indent,))
elif type == "Characters":
output.append("%s\"%s\"" % (" " * indent, token["data"]))
elif type == "SpaceCharacters":
assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
else:
raise ValueError("Unknown token type, %s" % type)
return "\n".join(output)
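The rewritten module docstring above spells out the supported treeType values (note that the "pulldom" walker has been dropped from the dispatch table), and pprint() is new here: it pretty-prints a walker's token stream with namespace prefixes, sorted attributes and quoted doctypes, relying on concatenateCharacterTokens() to merge adjacent character tokens first. A quick sketch of both entry points:

    import html5lib
    from html5lib.treewalkers import getTreeWalker, pprint

    tree = html5lib.parse('<p id=a>Hello<!-- hi --></p>')
    TreeWalker = getTreeWalker("etree")      # case-insensitive lookup, cached per tree type
    print(pprint(TreeWalker(tree)))          # indented dump of <html>/<head>/<body>/<p>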

View file

@ -1,8 +1,8 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from six import text_type, string_types from six import text_type, string_types
import gettext __all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
_ = gettext.gettext "TreeWalker", "NonRecursiveTreeWalker"]
from xml.dom import Node from xml.dom import Node
@ -58,7 +58,7 @@ class TreeWalker(object):
"namespace": to_text(namespace), "namespace": to_text(namespace),
"data": attrs} "data": attrs}
if hasChildren: if hasChildren:
yield self.error(_("Void element has children")) yield self.error("Void element has children")
def startTag(self, namespace, name, attrs): def startTag(self, namespace, name, attrs):
assert namespace is None or isinstance(namespace, string_types), type(namespace) assert namespace is None or isinstance(namespace, string_types), type(namespace)
@ -122,7 +122,7 @@ class TreeWalker(object):
return {"type": "Entity", "name": text_type(name)} return {"type": "Entity", "name": text_type(name)}
def unknown(self, nodeType): def unknown(self, nodeType):
return self.error(_("Unknown node type: ") + nodeType) return self.error("Unknown node type: " + nodeType)
class NonRecursiveTreeWalker(TreeWalker): class NonRecursiveTreeWalker(TreeWalker):

View file

@ -2,9 +2,6 @@ from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node from xml.dom import Node
import gettext
_ = gettext.gettext
from . import _base from . import _base

View file

@ -7,12 +7,10 @@ except ImportError:
from ordereddict import OrderedDict from ordereddict import OrderedDict
except ImportError: except ImportError:
OrderedDict = dict OrderedDict = dict
import gettext
_ = gettext.gettext
import re import re
from six import text_type from six import string_types
from . import _base from . import _base
from ..utils import moduleFactoryFactory from ..utils import moduleFactoryFactory
@ -60,7 +58,7 @@ def getETreeBuilder(ElementTreeImplementation):
return _base.COMMENT, node.text return _base.COMMENT, node.text
else: else:
assert type(node.tag) == text_type, type(node.tag) assert isinstance(node.tag, string_types), type(node.tag)
# This is assumed to be an ordinary element # This is assumed to be an ordinary element
match = tag_regexp.match(node.tag) match = tag_regexp.match(node.tag)
if match: if match:
@ -131,6 +129,7 @@ def getETreeBuilder(ElementTreeImplementation):
if not parents: if not parents:
return parent return parent
else: else:
assert list(parents[-1]).count(parent) == 1
return parent, list(parents[-1]).index(parent), parents, None return parent, list(parents[-1]).index(parent), parents, None
return locals() return locals()

View file

@ -4,9 +4,6 @@ from six import text_type
from lxml import etree from lxml import etree
from ..treebuilders.etree import tag_regexp from ..treebuilders.etree import tag_regexp
from gettext import gettext
_ = gettext
from . import _base from . import _base
from .. import ihatexml from .. import ihatexml
@ -130,7 +127,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
def getNodeDetails(self, node): def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node if isinstance(node, tuple): # Text node
node, key = node node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
return _base.TEXT, ensure_str(getattr(node, key)) return _base.TEXT, ensure_str(getattr(node, key))
elif isinstance(node, Root): elif isinstance(node, Root):
@ -169,7 +166,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
attrs, len(node) > 0 or node.text) attrs, len(node) > 0 or node.text)
def getFirstChild(self, node): def getFirstChild(self, node):
assert not isinstance(node, tuple), _("Text nodes have no children") assert not isinstance(node, tuple), "Text nodes have no children"
assert len(node) or node.text, "Node has no children" assert len(node) or node.text, "Node has no children"
if node.text: if node.text:
@ -180,7 +177,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
def getNextSibling(self, node): def getNextSibling(self, node):
if isinstance(node, tuple): # Text node if isinstance(node, tuple): # Text node
node, key = node node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text": if key == "text":
# XXX: we cannot use a "bool(node) and node[0] or None" construct here # XXX: we cannot use a "bool(node) and node[0] or None" construct here
# because node[0] might evaluate to False if it has no child element # because node[0] might evaluate to False if it has no child element
@ -196,7 +193,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
def getParentNode(self, node): def getParentNode(self, node):
if isinstance(node, tuple): # Text node if isinstance(node, tuple): # Text node
node, key = node node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text": if key == "text":
return node return node
# else: fallback to "normal" processing # else: fallback to "normal" processing
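The lxmletree walker represents text positionally as (element, "text") and (element, "tail") tuples, which is why the asserts above insist the key is one of those two strings and why getNextSibling/getParentNode special-case them. A short illustration of the underlying lxml text model the tuples mirror:

    from lxml import etree

    root = etree.fromstring("<div>intro<span>inner</span>outro</div>")
    span = root[0]
    print(root.text)   # 'intro'  -> walked as (root, 'text')
    print(span.text)   # 'inner'  -> walked as (span, 'text')
    print(span.tail)   # 'outro'  -> text after </span> belongs to span as (span, 'tail')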

View file

@ -1,63 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
from . import _base
from ..constants import voidElements
class TreeWalker(_base.TreeWalker):
def __iter__(self):
ignore_until = None
previous = None
for event in self.tree:
if previous is not None and \
(ignore_until is None or previous[1] is ignore_until):
if previous[1] is ignore_until:
ignore_until = None
for token in self.tokens(previous, event):
yield token
if token["type"] == "EmptyTag":
ignore_until = previous[1]
previous = event
if ignore_until is None or previous[1] is ignore_until:
for token in self.tokens(previous, None):
yield token
elif ignore_until is not None:
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
def tokens(self, event, next):
type, node = event
if type == START_ELEMENT:
name = node.nodeName
namespace = node.namespaceURI
attrs = {}
for attr in list(node.attributes.keys()):
attr = node.getAttributeNode(attr)
attrs[(attr.namespaceURI, attr.localName)] = attr.value
if name in voidElements:
for token in self.emptyTag(namespace,
name,
attrs,
not next or next[1] is not node):
yield token
else:
yield self.startTag(namespace, name, attrs)
elif type == END_ELEMENT:
name = node.nodeName
namespace = node.namespaceURI
if name not in voidElements:
yield self.endTag(namespace, name)
elif type == COMMENT:
yield self.comment(node.nodeValue)
elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
for token in self.text(node.nodeValue):
yield token
else:
yield self.unknown(type)

View file

@ -2,6 +2,8 @@ from __future__ import absolute_import, division, unicode_literals
from types import ModuleType from types import ModuleType
from six import text_type
try: try:
import xml.etree.cElementTree as default_etree import xml.etree.cElementTree as default_etree
except ImportError: except ImportError:
@ -9,7 +11,26 @@ except ImportError:
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory"] "surrogatePairToCodepoint", "moduleFactoryFactory",
"supports_lone_surrogates"]
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as
# Jython. This is because UTF-16 itself is based on the use of such
# surrogates, and there is no mechanism to further escape such
# escapes.
try:
_x = eval('"\\uD800"')
if not isinstance(_x, text_type):
# We need this with u"" because of http://bugs.jython.org/issue2039
_x = eval('u"\\uD800"')
assert isinstance(_x, text_type)
except:
supports_lone_surrogates = False
else:
supports_lone_surrogates = True
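supports_lone_surrogates above is computed once at import time: the eval probe checks whether the runtime can hold a lone surrogate such as U+D800 in a text string at all, which fails on UTF-16-based implementations like Jython (the eval presumably also keeps the surrogate literal out of the module's own source so importing it doesn't fail outright on such platforms; that is my reading of the intent, not a documented guarantee). A stripped-down, Python 3-only rendering of the same idea:

    try:
        _probe = eval('"\\uD800"')         # a lone high surrogate
        lone_surrogates_ok = isinstance(_probe, str) and len(_probe) == 1
    except Exception:                      # e.g. rejected outright on UTF-16 builds
        lone_surrogates_ok = False
    print(lone_surrogates_ok)              # True on CPython 3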
class MethodDispatcher(dict): class MethodDispatcher(dict):
@ -43,7 +64,7 @@ class MethodDispatcher(dict):
return dict.get(self, key, self.default) return dict.get(self, key, self.default)
# Some utility functions to dal with weirdness around UCS2 vs UCS4 # Some utility functions to deal with weirdness around UCS2 vs UCS4
# python builds # python builds
def isSurrogatePair(data): def isSurrogatePair(data):
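The "UCS2 vs UCS4" helpers referenced here deal with narrow builds exposing astral characters as two 16-bit code units. The arithmetic is standard UTF-16 decoding, sketched below; the formula mirrors what surrogatePairToCodepoint in this module computes, and the emoji constant is only an illustration:

    def surrogate_pair_to_codepoint(pair):
        # Standard UTF-16 decode: the high surrogate supplies the top bits,
        # the low surrogate the bottom 10, offset by U+10000.
        high, low = ord(pair[0]), ord(pair[1])
        return 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)

    assert surrogate_pair_to_codepoint("\uD83D\uDE00") == 0x1F600  # U+1F600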
@ -70,13 +91,21 @@ def moduleFactoryFactory(factory):
else: else:
name = b"_%s_factory" % baseModule.__name__ name = b"_%s_factory" % baseModule.__name__
if name in moduleCache: kwargs_tuple = tuple(kwargs.items())
return moduleCache[name]
else: try:
return moduleCache[name][args][kwargs_tuple]
except KeyError:
mod = ModuleType(name) mod = ModuleType(name)
objs = factory(baseModule, *args, **kwargs) objs = factory(baseModule, *args, **kwargs)
mod.__dict__.update(objs) mod.__dict__.update(objs)
moduleCache[name] = mod if "name" not in moduleCache:
moduleCache[name] = {}
if "args" not in moduleCache[name]:
moduleCache[name][args] = {}
if "kwargs" not in moduleCache[name][args]:
moduleCache[name][args][kwargs_tuple] = {}
moduleCache[name][args][kwargs_tuple] = mod
return mod return mod
return moduleFactory return moduleFactory
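The moduleFactoryFactory change replaces the single-key module cache with one keyed on the factory name, the positional args and a tuple of the keyword args, so differently parameterised tree modules (for example ElementTree vs cElementTree, or different InfosetFilter settings) no longer overwrite each other. A minimal, self-contained sketch of that memoisation pattern in isolation (the names and the stand-in module contents are illustrative, not html5lib's; the key is sorted here purely for determinism):

    from types import ModuleType

    _cache = {}

    def get_module(name, *args, **kwargs):
        # Key on name + positional args + keyword args, like kwargs_tuple above.
        key = (name, args, tuple(sorted(kwargs.items())))
        if key not in _cache:
            mod = ModuleType(name)
            mod.config = {"args": args, "kwargs": kwargs}   # stand-in for factory output
            _cache[key] = mod
        return _cache[key]

    assert get_module("walker", fullTree=True) is get_module("walker", fullTree=True)
    assert get_module("walker", fullTree=True) is not get_module("walker", fullTree=False)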