mirror of
https://github.com/SickGear/SickGear.git
synced 2025-01-07 10:33:38 +00:00
Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d).
This commit is contained in:
parent
589d2544dd
commit
8c026d7977
21 changed files with 534 additions and 370 deletions
|
@ -13,6 +13,7 @@
|
||||||
* Update chardet packages 2.3.0 (26982c5) to 2.3.0 (d7fae98)
|
* Update chardet packages 2.3.0 (26982c5) to 2.3.0 (d7fae98)
|
||||||
* Update dateutil library 2.4.2 (083f666) to 2.4.2 (d4baf97)
|
* Update dateutil library 2.4.2 (083f666) to 2.4.2 (d4baf97)
|
||||||
* Update Hachoir library 1.3.4 (r1383) to 1.3.4 (r1435)
|
* Update Hachoir library 1.3.4 (r1383) to 1.3.4 (r1435)
|
||||||
|
* Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d)
|
||||||
|
|
||||||
|
|
||||||
### 0.11.0 (2016-01-10 22:30:00 UTC)
|
### 0.11.0 (2016-01-10 22:30:00 UTC)
|
||||||
|
|
|
@ -20,4 +20,6 @@ from .serializer import serialize
|
||||||
|
|
||||||
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
|
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
|
||||||
"getTreeWalker", "serialize"]
|
"getTreeWalker", "serialize"]
|
||||||
__version__ = "0.999"
|
|
||||||
|
# this has to be at the top level, see how setup.py parses this
|
||||||
|
__version__ = "0.99999999-dev"
|
||||||
|
|
|
@ -1,292 +1,290 @@
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
import string
|
import string
|
||||||
import gettext
|
|
||||||
_ = gettext.gettext
|
|
||||||
|
|
||||||
EOF = None
|
EOF = None
|
||||||
|
|
||||||
E = {
|
E = {
|
||||||
"null-character":
|
"null-character":
|
||||||
_("Null character in input stream, replaced with U+FFFD."),
|
"Null character in input stream, replaced with U+FFFD.",
|
||||||
"invalid-codepoint":
|
"invalid-codepoint":
|
||||||
_("Invalid codepoint in stream."),
|
"Invalid codepoint in stream.",
|
||||||
"incorrectly-placed-solidus":
|
"incorrectly-placed-solidus":
|
||||||
_("Solidus (/) incorrectly placed in tag."),
|
"Solidus (/) incorrectly placed in tag.",
|
||||||
"incorrect-cr-newline-entity":
|
"incorrect-cr-newline-entity":
|
||||||
_("Incorrect CR newline entity, replaced with LF."),
|
"Incorrect CR newline entity, replaced with LF.",
|
||||||
"illegal-windows-1252-entity":
|
"illegal-windows-1252-entity":
|
||||||
_("Entity used with illegal number (windows-1252 reference)."),
|
"Entity used with illegal number (windows-1252 reference).",
|
||||||
"cant-convert-numeric-entity":
|
"cant-convert-numeric-entity":
|
||||||
_("Numeric entity couldn't be converted to character "
|
"Numeric entity couldn't be converted to character "
|
||||||
"(codepoint U+%(charAsInt)08x)."),
|
"(codepoint U+%(charAsInt)08x).",
|
||||||
"illegal-codepoint-for-numeric-entity":
|
"illegal-codepoint-for-numeric-entity":
|
||||||
_("Numeric entity represents an illegal codepoint: "
|
"Numeric entity represents an illegal codepoint: "
|
||||||
"U+%(charAsInt)08x."),
|
"U+%(charAsInt)08x.",
|
||||||
"numeric-entity-without-semicolon":
|
"numeric-entity-without-semicolon":
|
||||||
_("Numeric entity didn't end with ';'."),
|
"Numeric entity didn't end with ';'.",
|
||||||
"expected-numeric-entity-but-got-eof":
|
"expected-numeric-entity-but-got-eof":
|
||||||
_("Numeric entity expected. Got end of file instead."),
|
"Numeric entity expected. Got end of file instead.",
|
||||||
"expected-numeric-entity":
|
"expected-numeric-entity":
|
||||||
_("Numeric entity expected but none found."),
|
"Numeric entity expected but none found.",
|
||||||
"named-entity-without-semicolon":
|
"named-entity-without-semicolon":
|
||||||
_("Named entity didn't end with ';'."),
|
"Named entity didn't end with ';'.",
|
||||||
"expected-named-entity":
|
"expected-named-entity":
|
||||||
_("Named entity expected. Got none."),
|
"Named entity expected. Got none.",
|
||||||
"attributes-in-end-tag":
|
"attributes-in-end-tag":
|
||||||
_("End tag contains unexpected attributes."),
|
"End tag contains unexpected attributes.",
|
||||||
'self-closing-flag-on-end-tag':
|
'self-closing-flag-on-end-tag':
|
||||||
_("End tag contains unexpected self-closing flag."),
|
"End tag contains unexpected self-closing flag.",
|
||||||
"expected-tag-name-but-got-right-bracket":
|
"expected-tag-name-but-got-right-bracket":
|
||||||
_("Expected tag name. Got '>' instead."),
|
"Expected tag name. Got '>' instead.",
|
||||||
"expected-tag-name-but-got-question-mark":
|
"expected-tag-name-but-got-question-mark":
|
||||||
_("Expected tag name. Got '?' instead. (HTML doesn't "
|
"Expected tag name. Got '?' instead. (HTML doesn't "
|
||||||
"support processing instructions.)"),
|
"support processing instructions.)",
|
||||||
"expected-tag-name":
|
"expected-tag-name":
|
||||||
_("Expected tag name. Got something else instead"),
|
"Expected tag name. Got something else instead",
|
||||||
"expected-closing-tag-but-got-right-bracket":
|
"expected-closing-tag-but-got-right-bracket":
|
||||||
_("Expected closing tag. Got '>' instead. Ignoring '</>'."),
|
"Expected closing tag. Got '>' instead. Ignoring '</>'.",
|
||||||
"expected-closing-tag-but-got-eof":
|
"expected-closing-tag-but-got-eof":
|
||||||
_("Expected closing tag. Unexpected end of file."),
|
"Expected closing tag. Unexpected end of file.",
|
||||||
"expected-closing-tag-but-got-char":
|
"expected-closing-tag-but-got-char":
|
||||||
_("Expected closing tag. Unexpected character '%(data)s' found."),
|
"Expected closing tag. Unexpected character '%(data)s' found.",
|
||||||
"eof-in-tag-name":
|
"eof-in-tag-name":
|
||||||
_("Unexpected end of file in the tag name."),
|
"Unexpected end of file in the tag name.",
|
||||||
"expected-attribute-name-but-got-eof":
|
"expected-attribute-name-but-got-eof":
|
||||||
_("Unexpected end of file. Expected attribute name instead."),
|
"Unexpected end of file. Expected attribute name instead.",
|
||||||
"eof-in-attribute-name":
|
"eof-in-attribute-name":
|
||||||
_("Unexpected end of file in attribute name."),
|
"Unexpected end of file in attribute name.",
|
||||||
"invalid-character-in-attribute-name":
|
"invalid-character-in-attribute-name":
|
||||||
_("Invalid character in attribute name"),
|
"Invalid character in attribute name",
|
||||||
"duplicate-attribute":
|
"duplicate-attribute":
|
||||||
_("Dropped duplicate attribute on tag."),
|
"Dropped duplicate attribute on tag.",
|
||||||
"expected-end-of-tag-name-but-got-eof":
|
"expected-end-of-tag-name-but-got-eof":
|
||||||
_("Unexpected end of file. Expected = or end of tag."),
|
"Unexpected end of file. Expected = or end of tag.",
|
||||||
"expected-attribute-value-but-got-eof":
|
"expected-attribute-value-but-got-eof":
|
||||||
_("Unexpected end of file. Expected attribute value."),
|
"Unexpected end of file. Expected attribute value.",
|
||||||
"expected-attribute-value-but-got-right-bracket":
|
"expected-attribute-value-but-got-right-bracket":
|
||||||
_("Expected attribute value. Got '>' instead."),
|
"Expected attribute value. Got '>' instead.",
|
||||||
'equals-in-unquoted-attribute-value':
|
'equals-in-unquoted-attribute-value':
|
||||||
_("Unexpected = in unquoted attribute"),
|
"Unexpected = in unquoted attribute",
|
||||||
'unexpected-character-in-unquoted-attribute-value':
|
'unexpected-character-in-unquoted-attribute-value':
|
||||||
_("Unexpected character in unquoted attribute"),
|
"Unexpected character in unquoted attribute",
|
||||||
"invalid-character-after-attribute-name":
|
"invalid-character-after-attribute-name":
|
||||||
_("Unexpected character after attribute name."),
|
"Unexpected character after attribute name.",
|
||||||
"unexpected-character-after-attribute-value":
|
"unexpected-character-after-attribute-value":
|
||||||
_("Unexpected character after attribute value."),
|
"Unexpected character after attribute value.",
|
||||||
"eof-in-attribute-value-double-quote":
|
"eof-in-attribute-value-double-quote":
|
||||||
_("Unexpected end of file in attribute value (\")."),
|
"Unexpected end of file in attribute value (\").",
|
||||||
"eof-in-attribute-value-single-quote":
|
"eof-in-attribute-value-single-quote":
|
||||||
_("Unexpected end of file in attribute value (')."),
|
"Unexpected end of file in attribute value (').",
|
||||||
"eof-in-attribute-value-no-quotes":
|
"eof-in-attribute-value-no-quotes":
|
||||||
_("Unexpected end of file in attribute value."),
|
"Unexpected end of file in attribute value.",
|
||||||
"unexpected-EOF-after-solidus-in-tag":
|
"unexpected-EOF-after-solidus-in-tag":
|
||||||
_("Unexpected end of file in tag. Expected >"),
|
"Unexpected end of file in tag. Expected >",
|
||||||
"unexpected-character-after-solidus-in-tag":
|
"unexpected-character-after-solidus-in-tag":
|
||||||
_("Unexpected character after / in tag. Expected >"),
|
"Unexpected character after / in tag. Expected >",
|
||||||
"expected-dashes-or-doctype":
|
"expected-dashes-or-doctype":
|
||||||
_("Expected '--' or 'DOCTYPE'. Not found."),
|
"Expected '--' or 'DOCTYPE'. Not found.",
|
||||||
"unexpected-bang-after-double-dash-in-comment":
|
"unexpected-bang-after-double-dash-in-comment":
|
||||||
_("Unexpected ! after -- in comment"),
|
"Unexpected ! after -- in comment",
|
||||||
"unexpected-space-after-double-dash-in-comment":
|
"unexpected-space-after-double-dash-in-comment":
|
||||||
_("Unexpected space after -- in comment"),
|
"Unexpected space after -- in comment",
|
||||||
"incorrect-comment":
|
"incorrect-comment":
|
||||||
_("Incorrect comment."),
|
"Incorrect comment.",
|
||||||
"eof-in-comment":
|
"eof-in-comment":
|
||||||
_("Unexpected end of file in comment."),
|
"Unexpected end of file in comment.",
|
||||||
"eof-in-comment-end-dash":
|
"eof-in-comment-end-dash":
|
||||||
_("Unexpected end of file in comment (-)"),
|
"Unexpected end of file in comment (-)",
|
||||||
"unexpected-dash-after-double-dash-in-comment":
|
"unexpected-dash-after-double-dash-in-comment":
|
||||||
_("Unexpected '-' after '--' found in comment."),
|
"Unexpected '-' after '--' found in comment.",
|
||||||
"eof-in-comment-double-dash":
|
"eof-in-comment-double-dash":
|
||||||
_("Unexpected end of file in comment (--)."),
|
"Unexpected end of file in comment (--).",
|
||||||
"eof-in-comment-end-space-state":
|
"eof-in-comment-end-space-state":
|
||||||
_("Unexpected end of file in comment."),
|
"Unexpected end of file in comment.",
|
||||||
"eof-in-comment-end-bang-state":
|
"eof-in-comment-end-bang-state":
|
||||||
_("Unexpected end of file in comment."),
|
"Unexpected end of file in comment.",
|
||||||
"unexpected-char-in-comment":
|
"unexpected-char-in-comment":
|
||||||
_("Unexpected character in comment found."),
|
"Unexpected character in comment found.",
|
||||||
"need-space-after-doctype":
|
"need-space-after-doctype":
|
||||||
_("No space after literal string 'DOCTYPE'."),
|
"No space after literal string 'DOCTYPE'.",
|
||||||
"expected-doctype-name-but-got-right-bracket":
|
"expected-doctype-name-but-got-right-bracket":
|
||||||
_("Unexpected > character. Expected DOCTYPE name."),
|
"Unexpected > character. Expected DOCTYPE name.",
|
||||||
"expected-doctype-name-but-got-eof":
|
"expected-doctype-name-but-got-eof":
|
||||||
_("Unexpected end of file. Expected DOCTYPE name."),
|
"Unexpected end of file. Expected DOCTYPE name.",
|
||||||
"eof-in-doctype-name":
|
"eof-in-doctype-name":
|
||||||
_("Unexpected end of file in DOCTYPE name."),
|
"Unexpected end of file in DOCTYPE name.",
|
||||||
"eof-in-doctype":
|
"eof-in-doctype":
|
||||||
_("Unexpected end of file in DOCTYPE."),
|
"Unexpected end of file in DOCTYPE.",
|
||||||
"expected-space-or-right-bracket-in-doctype":
|
"expected-space-or-right-bracket-in-doctype":
|
||||||
_("Expected space or '>'. Got '%(data)s'"),
|
"Expected space or '>'. Got '%(data)s'",
|
||||||
"unexpected-end-of-doctype":
|
"unexpected-end-of-doctype":
|
||||||
_("Unexpected end of DOCTYPE."),
|
"Unexpected end of DOCTYPE.",
|
||||||
"unexpected-char-in-doctype":
|
"unexpected-char-in-doctype":
|
||||||
_("Unexpected character in DOCTYPE."),
|
"Unexpected character in DOCTYPE.",
|
||||||
"eof-in-innerhtml":
|
"eof-in-innerhtml":
|
||||||
_("XXX innerHTML EOF"),
|
"XXX innerHTML EOF",
|
||||||
"unexpected-doctype":
|
"unexpected-doctype":
|
||||||
_("Unexpected DOCTYPE. Ignored."),
|
"Unexpected DOCTYPE. Ignored.",
|
||||||
"non-html-root":
|
"non-html-root":
|
||||||
_("html needs to be the first start tag."),
|
"html needs to be the first start tag.",
|
||||||
"expected-doctype-but-got-eof":
|
"expected-doctype-but-got-eof":
|
||||||
_("Unexpected End of file. Expected DOCTYPE."),
|
"Unexpected End of file. Expected DOCTYPE.",
|
||||||
"unknown-doctype":
|
"unknown-doctype":
|
||||||
_("Erroneous DOCTYPE."),
|
"Erroneous DOCTYPE.",
|
||||||
"expected-doctype-but-got-chars":
|
"expected-doctype-but-got-chars":
|
||||||
_("Unexpected non-space characters. Expected DOCTYPE."),
|
"Unexpected non-space characters. Expected DOCTYPE.",
|
||||||
"expected-doctype-but-got-start-tag":
|
"expected-doctype-but-got-start-tag":
|
||||||
_("Unexpected start tag (%(name)s). Expected DOCTYPE."),
|
"Unexpected start tag (%(name)s). Expected DOCTYPE.",
|
||||||
"expected-doctype-but-got-end-tag":
|
"expected-doctype-but-got-end-tag":
|
||||||
_("Unexpected end tag (%(name)s). Expected DOCTYPE."),
|
"Unexpected end tag (%(name)s). Expected DOCTYPE.",
|
||||||
"end-tag-after-implied-root":
|
"end-tag-after-implied-root":
|
||||||
_("Unexpected end tag (%(name)s) after the (implied) root element."),
|
"Unexpected end tag (%(name)s) after the (implied) root element.",
|
||||||
"expected-named-closing-tag-but-got-eof":
|
"expected-named-closing-tag-but-got-eof":
|
||||||
_("Unexpected end of file. Expected end tag (%(name)s)."),
|
"Unexpected end of file. Expected end tag (%(name)s).",
|
||||||
"two-heads-are-not-better-than-one":
|
"two-heads-are-not-better-than-one":
|
||||||
_("Unexpected start tag head in existing head. Ignored."),
|
"Unexpected start tag head in existing head. Ignored.",
|
||||||
"unexpected-end-tag":
|
"unexpected-end-tag":
|
||||||
_("Unexpected end tag (%(name)s). Ignored."),
|
"Unexpected end tag (%(name)s). Ignored.",
|
||||||
"unexpected-start-tag-out-of-my-head":
|
"unexpected-start-tag-out-of-my-head":
|
||||||
_("Unexpected start tag (%(name)s) that can be in head. Moved."),
|
"Unexpected start tag (%(name)s) that can be in head. Moved.",
|
||||||
"unexpected-start-tag":
|
"unexpected-start-tag":
|
||||||
_("Unexpected start tag (%(name)s)."),
|
"Unexpected start tag (%(name)s).",
|
||||||
"missing-end-tag":
|
"missing-end-tag":
|
||||||
_("Missing end tag (%(name)s)."),
|
"Missing end tag (%(name)s).",
|
||||||
"missing-end-tags":
|
"missing-end-tags":
|
||||||
_("Missing end tags (%(name)s)."),
|
"Missing end tags (%(name)s).",
|
||||||
"unexpected-start-tag-implies-end-tag":
|
"unexpected-start-tag-implies-end-tag":
|
||||||
_("Unexpected start tag (%(startName)s) "
|
"Unexpected start tag (%(startName)s) "
|
||||||
"implies end tag (%(endName)s)."),
|
"implies end tag (%(endName)s).",
|
||||||
"unexpected-start-tag-treated-as":
|
"unexpected-start-tag-treated-as":
|
||||||
_("Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
|
"Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
|
||||||
"deprecated-tag":
|
"deprecated-tag":
|
||||||
_("Unexpected start tag %(name)s. Don't use it!"),
|
"Unexpected start tag %(name)s. Don't use it!",
|
||||||
"unexpected-start-tag-ignored":
|
"unexpected-start-tag-ignored":
|
||||||
_("Unexpected start tag %(name)s. Ignored."),
|
"Unexpected start tag %(name)s. Ignored.",
|
||||||
"expected-one-end-tag-but-got-another":
|
"expected-one-end-tag-but-got-another":
|
||||||
_("Unexpected end tag (%(gotName)s). "
|
"Unexpected end tag (%(gotName)s). "
|
||||||
"Missing end tag (%(expectedName)s)."),
|
"Missing end tag (%(expectedName)s).",
|
||||||
"end-tag-too-early":
|
"end-tag-too-early":
|
||||||
_("End tag (%(name)s) seen too early. Expected other end tag."),
|
"End tag (%(name)s) seen too early. Expected other end tag.",
|
||||||
"end-tag-too-early-named":
|
"end-tag-too-early-named":
|
||||||
_("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
|
"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).",
|
||||||
"end-tag-too-early-ignored":
|
"end-tag-too-early-ignored":
|
||||||
_("End tag (%(name)s) seen too early. Ignored."),
|
"End tag (%(name)s) seen too early. Ignored.",
|
||||||
"adoption-agency-1.1":
|
"adoption-agency-1.1":
|
||||||
_("End tag (%(name)s) violates step 1, "
|
"End tag (%(name)s) violates step 1, "
|
||||||
"paragraph 1 of the adoption agency algorithm."),
|
"paragraph 1 of the adoption agency algorithm.",
|
||||||
"adoption-agency-1.2":
|
"adoption-agency-1.2":
|
||||||
_("End tag (%(name)s) violates step 1, "
|
"End tag (%(name)s) violates step 1, "
|
||||||
"paragraph 2 of the adoption agency algorithm."),
|
"paragraph 2 of the adoption agency algorithm.",
|
||||||
"adoption-agency-1.3":
|
"adoption-agency-1.3":
|
||||||
_("End tag (%(name)s) violates step 1, "
|
"End tag (%(name)s) violates step 1, "
|
||||||
"paragraph 3 of the adoption agency algorithm."),
|
"paragraph 3 of the adoption agency algorithm.",
|
||||||
"adoption-agency-4.4":
|
"adoption-agency-4.4":
|
||||||
_("End tag (%(name)s) violates step 4, "
|
"End tag (%(name)s) violates step 4, "
|
||||||
"paragraph 4 of the adoption agency algorithm."),
|
"paragraph 4 of the adoption agency algorithm.",
|
||||||
"unexpected-end-tag-treated-as":
|
"unexpected-end-tag-treated-as":
|
||||||
_("Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
|
"Unexpected end tag (%(originalName)s). Treated as %(newName)s.",
|
||||||
"no-end-tag":
|
"no-end-tag":
|
||||||
_("This element (%(name)s) has no end tag."),
|
"This element (%(name)s) has no end tag.",
|
||||||
"unexpected-implied-end-tag-in-table":
|
"unexpected-implied-end-tag-in-table":
|
||||||
_("Unexpected implied end tag (%(name)s) in the table phase."),
|
"Unexpected implied end tag (%(name)s) in the table phase.",
|
||||||
"unexpected-implied-end-tag-in-table-body":
|
"unexpected-implied-end-tag-in-table-body":
|
||||||
_("Unexpected implied end tag (%(name)s) in the table body phase."),
|
"Unexpected implied end tag (%(name)s) in the table body phase.",
|
||||||
"unexpected-char-implies-table-voodoo":
|
"unexpected-char-implies-table-voodoo":
|
||||||
_("Unexpected non-space characters in "
|
"Unexpected non-space characters in "
|
||||||
"table context caused voodoo mode."),
|
"table context caused voodoo mode.",
|
||||||
"unexpected-hidden-input-in-table":
|
"unexpected-hidden-input-in-table":
|
||||||
_("Unexpected input with type hidden in table context."),
|
"Unexpected input with type hidden in table context.",
|
||||||
"unexpected-form-in-table":
|
"unexpected-form-in-table":
|
||||||
_("Unexpected form in table context."),
|
"Unexpected form in table context.",
|
||||||
"unexpected-start-tag-implies-table-voodoo":
|
"unexpected-start-tag-implies-table-voodoo":
|
||||||
_("Unexpected start tag (%(name)s) in "
|
"Unexpected start tag (%(name)s) in "
|
||||||
"table context caused voodoo mode."),
|
"table context caused voodoo mode.",
|
||||||
"unexpected-end-tag-implies-table-voodoo":
|
"unexpected-end-tag-implies-table-voodoo":
|
||||||
_("Unexpected end tag (%(name)s) in "
|
"Unexpected end tag (%(name)s) in "
|
||||||
"table context caused voodoo mode."),
|
"table context caused voodoo mode.",
|
||||||
"unexpected-cell-in-table-body":
|
"unexpected-cell-in-table-body":
|
||||||
_("Unexpected table cell start tag (%(name)s) "
|
"Unexpected table cell start tag (%(name)s) "
|
||||||
"in the table body phase."),
|
"in the table body phase.",
|
||||||
"unexpected-cell-end-tag":
|
"unexpected-cell-end-tag":
|
||||||
_("Got table cell end tag (%(name)s) "
|
"Got table cell end tag (%(name)s) "
|
||||||
"while required end tags are missing."),
|
"while required end tags are missing.",
|
||||||
"unexpected-end-tag-in-table-body":
|
"unexpected-end-tag-in-table-body":
|
||||||
_("Unexpected end tag (%(name)s) in the table body phase. Ignored."),
|
"Unexpected end tag (%(name)s) in the table body phase. Ignored.",
|
||||||
"unexpected-implied-end-tag-in-table-row":
|
"unexpected-implied-end-tag-in-table-row":
|
||||||
_("Unexpected implied end tag (%(name)s) in the table row phase."),
|
"Unexpected implied end tag (%(name)s) in the table row phase.",
|
||||||
"unexpected-end-tag-in-table-row":
|
"unexpected-end-tag-in-table-row":
|
||||||
_("Unexpected end tag (%(name)s) in the table row phase. Ignored."),
|
"Unexpected end tag (%(name)s) in the table row phase. Ignored.",
|
||||||
"unexpected-select-in-select":
|
"unexpected-select-in-select":
|
||||||
_("Unexpected select start tag in the select phase "
|
"Unexpected select start tag in the select phase "
|
||||||
"treated as select end tag."),
|
"treated as select end tag.",
|
||||||
"unexpected-input-in-select":
|
"unexpected-input-in-select":
|
||||||
_("Unexpected input start tag in the select phase."),
|
"Unexpected input start tag in the select phase.",
|
||||||
"unexpected-start-tag-in-select":
|
"unexpected-start-tag-in-select":
|
||||||
_("Unexpected start tag token (%(name)s in the select phase. "
|
"Unexpected start tag token (%(name)s in the select phase. "
|
||||||
"Ignored."),
|
"Ignored.",
|
||||||
"unexpected-end-tag-in-select":
|
"unexpected-end-tag-in-select":
|
||||||
_("Unexpected end tag (%(name)s) in the select phase. Ignored."),
|
"Unexpected end tag (%(name)s) in the select phase. Ignored.",
|
||||||
"unexpected-table-element-start-tag-in-select-in-table":
|
"unexpected-table-element-start-tag-in-select-in-table":
|
||||||
_("Unexpected table element start tag (%(name)s) in the select in table phase."),
|
"Unexpected table element start tag (%(name)s) in the select in table phase.",
|
||||||
"unexpected-table-element-end-tag-in-select-in-table":
|
"unexpected-table-element-end-tag-in-select-in-table":
|
||||||
_("Unexpected table element end tag (%(name)s) in the select in table phase."),
|
"Unexpected table element end tag (%(name)s) in the select in table phase.",
|
||||||
"unexpected-char-after-body":
|
"unexpected-char-after-body":
|
||||||
_("Unexpected non-space characters in the after body phase."),
|
"Unexpected non-space characters in the after body phase.",
|
||||||
"unexpected-start-tag-after-body":
|
"unexpected-start-tag-after-body":
|
||||||
_("Unexpected start tag token (%(name)s)"
|
"Unexpected start tag token (%(name)s)"
|
||||||
" in the after body phase."),
|
" in the after body phase.",
|
||||||
"unexpected-end-tag-after-body":
|
"unexpected-end-tag-after-body":
|
||||||
_("Unexpected end tag token (%(name)s)"
|
"Unexpected end tag token (%(name)s)"
|
||||||
" in the after body phase."),
|
" in the after body phase.",
|
||||||
"unexpected-char-in-frameset":
|
"unexpected-char-in-frameset":
|
||||||
_("Unexpected characters in the frameset phase. Characters ignored."),
|
"Unexpected characters in the frameset phase. Characters ignored.",
|
||||||
"unexpected-start-tag-in-frameset":
|
"unexpected-start-tag-in-frameset":
|
||||||
_("Unexpected start tag token (%(name)s)"
|
"Unexpected start tag token (%(name)s)"
|
||||||
" in the frameset phase. Ignored."),
|
" in the frameset phase. Ignored.",
|
||||||
"unexpected-frameset-in-frameset-innerhtml":
|
"unexpected-frameset-in-frameset-innerhtml":
|
||||||
_("Unexpected end tag token (frameset) "
|
"Unexpected end tag token (frameset) "
|
||||||
"in the frameset phase (innerHTML)."),
|
"in the frameset phase (innerHTML).",
|
||||||
"unexpected-end-tag-in-frameset":
|
"unexpected-end-tag-in-frameset":
|
||||||
_("Unexpected end tag token (%(name)s)"
|
"Unexpected end tag token (%(name)s)"
|
||||||
" in the frameset phase. Ignored."),
|
" in the frameset phase. Ignored.",
|
||||||
"unexpected-char-after-frameset":
|
"unexpected-char-after-frameset":
|
||||||
_("Unexpected non-space characters in the "
|
"Unexpected non-space characters in the "
|
||||||
"after frameset phase. Ignored."),
|
"after frameset phase. Ignored.",
|
||||||
"unexpected-start-tag-after-frameset":
|
"unexpected-start-tag-after-frameset":
|
||||||
_("Unexpected start tag (%(name)s)"
|
"Unexpected start tag (%(name)s)"
|
||||||
" in the after frameset phase. Ignored."),
|
" in the after frameset phase. Ignored.",
|
||||||
"unexpected-end-tag-after-frameset":
|
"unexpected-end-tag-after-frameset":
|
||||||
_("Unexpected end tag (%(name)s)"
|
"Unexpected end tag (%(name)s)"
|
||||||
" in the after frameset phase. Ignored."),
|
" in the after frameset phase. Ignored.",
|
||||||
"unexpected-end-tag-after-body-innerhtml":
|
"unexpected-end-tag-after-body-innerhtml":
|
||||||
_("Unexpected end tag after body(innerHtml)"),
|
"Unexpected end tag after body(innerHtml)",
|
||||||
"expected-eof-but-got-char":
|
"expected-eof-but-got-char":
|
||||||
_("Unexpected non-space characters. Expected end of file."),
|
"Unexpected non-space characters. Expected end of file.",
|
||||||
"expected-eof-but-got-start-tag":
|
"expected-eof-but-got-start-tag":
|
||||||
_("Unexpected start tag (%(name)s)"
|
"Unexpected start tag (%(name)s)"
|
||||||
". Expected end of file."),
|
". Expected end of file.",
|
||||||
"expected-eof-but-got-end-tag":
|
"expected-eof-but-got-end-tag":
|
||||||
_("Unexpected end tag (%(name)s)"
|
"Unexpected end tag (%(name)s)"
|
||||||
". Expected end of file."),
|
". Expected end of file.",
|
||||||
"eof-in-table":
|
"eof-in-table":
|
||||||
_("Unexpected end of file. Expected table content."),
|
"Unexpected end of file. Expected table content.",
|
||||||
"eof-in-select":
|
"eof-in-select":
|
||||||
_("Unexpected end of file. Expected select content."),
|
"Unexpected end of file. Expected select content.",
|
||||||
"eof-in-frameset":
|
"eof-in-frameset":
|
||||||
_("Unexpected end of file. Expected frameset content."),
|
"Unexpected end of file. Expected frameset content.",
|
||||||
"eof-in-script-in-script":
|
"eof-in-script-in-script":
|
||||||
_("Unexpected end of file. Expected script content."),
|
"Unexpected end of file. Expected script content.",
|
||||||
"eof-in-foreign-lands":
|
"eof-in-foreign-lands":
|
||||||
_("Unexpected end of file. Expected foreign content"),
|
"Unexpected end of file. Expected foreign content",
|
||||||
"non-void-element-with-trailing-solidus":
|
"non-void-element-with-trailing-solidus":
|
||||||
_("Trailing solidus not allowed on element %(name)s"),
|
"Trailing solidus not allowed on element %(name)s",
|
||||||
"unexpected-html-element-in-foreign-content":
|
"unexpected-html-element-in-foreign-content":
|
||||||
_("Element %(name)s not allowed in a non-html context"),
|
"Element %(name)s not allowed in a non-html context",
|
||||||
"unexpected-end-tag-before-html":
|
"unexpected-end-tag-before-html":
|
||||||
_("Unexpected end tag (%(name)s) before html."),
|
"Unexpected end tag (%(name)s) before html.",
|
||||||
"XXX-undefined-error":
|
"XXX-undefined-error":
|
||||||
_("Undefined error (this sucks and should be fixed)"),
|
"Undefined error (this sucks and should be fixed)",
|
||||||
}
|
}
|
||||||
|
|
||||||
namespaces = {
|
namespaces = {
|
||||||
|
@ -298,7 +296,7 @@ namespaces = {
|
||||||
"xmlns": "http://www.w3.org/2000/xmlns/"
|
"xmlns": "http://www.w3.org/2000/xmlns/"
|
||||||
}
|
}
|
||||||
|
|
||||||
scopingElements = frozenset((
|
scopingElements = frozenset([
|
||||||
(namespaces["html"], "applet"),
|
(namespaces["html"], "applet"),
|
||||||
(namespaces["html"], "caption"),
|
(namespaces["html"], "caption"),
|
||||||
(namespaces["html"], "html"),
|
(namespaces["html"], "html"),
|
||||||
|
@ -316,9 +314,9 @@ scopingElements = frozenset((
|
||||||
(namespaces["svg"], "foreignObject"),
|
(namespaces["svg"], "foreignObject"),
|
||||||
(namespaces["svg"], "desc"),
|
(namespaces["svg"], "desc"),
|
||||||
(namespaces["svg"], "title"),
|
(namespaces["svg"], "title"),
|
||||||
))
|
])
|
||||||
|
|
||||||
formattingElements = frozenset((
|
formattingElements = frozenset([
|
||||||
(namespaces["html"], "a"),
|
(namespaces["html"], "a"),
|
||||||
(namespaces["html"], "b"),
|
(namespaces["html"], "b"),
|
||||||
(namespaces["html"], "big"),
|
(namespaces["html"], "big"),
|
||||||
|
@ -333,9 +331,9 @@ formattingElements = frozenset((
|
||||||
(namespaces["html"], "strong"),
|
(namespaces["html"], "strong"),
|
||||||
(namespaces["html"], "tt"),
|
(namespaces["html"], "tt"),
|
||||||
(namespaces["html"], "u")
|
(namespaces["html"], "u")
|
||||||
))
|
])
|
||||||
|
|
||||||
specialElements = frozenset((
|
specialElements = frozenset([
|
||||||
(namespaces["html"], "address"),
|
(namespaces["html"], "address"),
|
||||||
(namespaces["html"], "applet"),
|
(namespaces["html"], "applet"),
|
||||||
(namespaces["html"], "area"),
|
(namespaces["html"], "area"),
|
||||||
|
@ -416,22 +414,22 @@ specialElements = frozenset((
|
||||||
(namespaces["html"], "wbr"),
|
(namespaces["html"], "wbr"),
|
||||||
(namespaces["html"], "xmp"),
|
(namespaces["html"], "xmp"),
|
||||||
(namespaces["svg"], "foreignObject")
|
(namespaces["svg"], "foreignObject")
|
||||||
))
|
])
|
||||||
|
|
||||||
htmlIntegrationPointElements = frozenset((
|
htmlIntegrationPointElements = frozenset([
|
||||||
(namespaces["mathml"], "annotaion-xml"),
|
(namespaces["mathml"], "annotaion-xml"),
|
||||||
(namespaces["svg"], "foreignObject"),
|
(namespaces["svg"], "foreignObject"),
|
||||||
(namespaces["svg"], "desc"),
|
(namespaces["svg"], "desc"),
|
||||||
(namespaces["svg"], "title")
|
(namespaces["svg"], "title")
|
||||||
))
|
])
|
||||||
|
|
||||||
mathmlTextIntegrationPointElements = frozenset((
|
mathmlTextIntegrationPointElements = frozenset([
|
||||||
(namespaces["mathml"], "mi"),
|
(namespaces["mathml"], "mi"),
|
||||||
(namespaces["mathml"], "mo"),
|
(namespaces["mathml"], "mo"),
|
||||||
(namespaces["mathml"], "mn"),
|
(namespaces["mathml"], "mn"),
|
||||||
(namespaces["mathml"], "ms"),
|
(namespaces["mathml"], "ms"),
|
||||||
(namespaces["mathml"], "mtext")
|
(namespaces["mathml"], "mtext")
|
||||||
))
|
])
|
||||||
|
|
||||||
adjustForeignAttributes = {
|
adjustForeignAttributes = {
|
||||||
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
|
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
|
||||||
|
@ -451,21 +449,21 @@ adjustForeignAttributes = {
|
||||||
unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
|
unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
|
||||||
adjustForeignAttributes.items()])
|
adjustForeignAttributes.items()])
|
||||||
|
|
||||||
spaceCharacters = frozenset((
|
spaceCharacters = frozenset([
|
||||||
"\t",
|
"\t",
|
||||||
"\n",
|
"\n",
|
||||||
"\u000C",
|
"\u000C",
|
||||||
" ",
|
" ",
|
||||||
"\r"
|
"\r"
|
||||||
))
|
])
|
||||||
|
|
||||||
tableInsertModeElements = frozenset((
|
tableInsertModeElements = frozenset([
|
||||||
"table",
|
"table",
|
||||||
"tbody",
|
"tbody",
|
||||||
"tfoot",
|
"tfoot",
|
||||||
"thead",
|
"thead",
|
||||||
"tr"
|
"tr"
|
||||||
))
|
])
|
||||||
|
|
||||||
asciiLowercase = frozenset(string.ascii_lowercase)
|
asciiLowercase = frozenset(string.ascii_lowercase)
|
||||||
asciiUppercase = frozenset(string.ascii_uppercase)
|
asciiUppercase = frozenset(string.ascii_uppercase)
|
||||||
|
@ -486,7 +484,7 @@ headingElements = (
|
||||||
"h6"
|
"h6"
|
||||||
)
|
)
|
||||||
|
|
||||||
voidElements = frozenset((
|
voidElements = frozenset([
|
||||||
"base",
|
"base",
|
||||||
"command",
|
"command",
|
||||||
"event-source",
|
"event-source",
|
||||||
|
@ -502,11 +500,11 @@ voidElements = frozenset((
|
||||||
"input",
|
"input",
|
||||||
"source",
|
"source",
|
||||||
"track"
|
"track"
|
||||||
))
|
])
|
||||||
|
|
||||||
cdataElements = frozenset(('title', 'textarea'))
|
cdataElements = frozenset(['title', 'textarea'])
|
||||||
|
|
||||||
rcdataElements = frozenset((
|
rcdataElements = frozenset([
|
||||||
'style',
|
'style',
|
||||||
'script',
|
'script',
|
||||||
'xmp',
|
'xmp',
|
||||||
|
@ -514,27 +512,27 @@ rcdataElements = frozenset((
|
||||||
'noembed',
|
'noembed',
|
||||||
'noframes',
|
'noframes',
|
||||||
'noscript'
|
'noscript'
|
||||||
))
|
])
|
||||||
|
|
||||||
booleanAttributes = {
|
booleanAttributes = {
|
||||||
"": frozenset(("irrelevant",)),
|
"": frozenset(["irrelevant"]),
|
||||||
"style": frozenset(("scoped",)),
|
"style": frozenset(["scoped"]),
|
||||||
"img": frozenset(("ismap",)),
|
"img": frozenset(["ismap"]),
|
||||||
"audio": frozenset(("autoplay", "controls")),
|
"audio": frozenset(["autoplay", "controls"]),
|
||||||
"video": frozenset(("autoplay", "controls")),
|
"video": frozenset(["autoplay", "controls"]),
|
||||||
"script": frozenset(("defer", "async")),
|
"script": frozenset(["defer", "async"]),
|
||||||
"details": frozenset(("open",)),
|
"details": frozenset(["open"]),
|
||||||
"datagrid": frozenset(("multiple", "disabled")),
|
"datagrid": frozenset(["multiple", "disabled"]),
|
||||||
"command": frozenset(("hidden", "disabled", "checked", "default")),
|
"command": frozenset(["hidden", "disabled", "checked", "default"]),
|
||||||
"hr": frozenset(("noshade")),
|
"hr": frozenset(["noshade"]),
|
||||||
"menu": frozenset(("autosubmit",)),
|
"menu": frozenset(["autosubmit"]),
|
||||||
"fieldset": frozenset(("disabled", "readonly")),
|
"fieldset": frozenset(["disabled", "readonly"]),
|
||||||
"option": frozenset(("disabled", "readonly", "selected")),
|
"option": frozenset(["disabled", "readonly", "selected"]),
|
||||||
"optgroup": frozenset(("disabled", "readonly")),
|
"optgroup": frozenset(["disabled", "readonly"]),
|
||||||
"button": frozenset(("disabled", "autofocus")),
|
"button": frozenset(["disabled", "autofocus"]),
|
||||||
"input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")),
|
"input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
|
||||||
"select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
|
"select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
|
||||||
"output": frozenset(("disabled", "readonly")),
|
"output": frozenset(["disabled", "readonly"]),
|
||||||
}
|
}
|
||||||
|
|
||||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
|
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
|
||||||
|
@ -574,7 +572,7 @@ entitiesWindows1252 = (
|
||||||
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||||
)
|
)
|
||||||
|
|
||||||
xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
|
xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;'])
|
||||||
|
|
||||||
entities = {
|
entities = {
|
||||||
"AElig": "\xc6",
|
"AElig": "\xc6",
|
||||||
|
@ -3088,8 +3086,8 @@ tokenTypes = {
|
||||||
"ParseError": 7
|
"ParseError": 7
|
||||||
}
|
}
|
||||||
|
|
||||||
tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
|
tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"],
|
||||||
tokenTypes["EmptyTag"]))
|
tokenTypes["EmptyTag"]])
|
||||||
|
|
||||||
|
|
||||||
prefixes = dict([(v, k) for k, v in namespaces.items()])
|
prefixes = dict([(v, k) for k, v in namespaces.items()])
|
||||||
|
|
|
@ -1,8 +1,5 @@
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
from gettext import gettext
|
|
||||||
_ = gettext
|
|
||||||
|
|
||||||
from . import _base
|
from . import _base
|
||||||
from ..constants import cdataElements, rcdataElements, voidElements
|
from ..constants import cdataElements, rcdataElements, voidElements
|
||||||
|
|
||||||
|
@ -23,24 +20,24 @@ class Filter(_base.Filter):
|
||||||
if type in ("StartTag", "EmptyTag"):
|
if type in ("StartTag", "EmptyTag"):
|
||||||
name = token["name"]
|
name = token["name"]
|
||||||
if contentModelFlag != "PCDATA":
|
if contentModelFlag != "PCDATA":
|
||||||
raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
|
raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
|
||||||
if not isinstance(name, str):
|
if not isinstance(name, str):
|
||||||
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
|
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
|
||||||
if not name:
|
if not name:
|
||||||
raise LintError(_("Empty tag name"))
|
raise LintError("Empty tag name")
|
||||||
if type == "StartTag" and name in voidElements:
|
if type == "StartTag" and name in voidElements:
|
||||||
raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
|
raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
|
||||||
elif type == "EmptyTag" and name not in voidElements:
|
elif type == "EmptyTag" and name not in voidElements:
|
||||||
raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
|
raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
|
||||||
if type == "StartTag":
|
if type == "StartTag":
|
||||||
open_elements.append(name)
|
open_elements.append(name)
|
||||||
for name, value in token["data"]:
|
for name, value in token["data"]:
|
||||||
if not isinstance(name, str):
|
if not isinstance(name, str):
|
||||||
raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
|
raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
|
||||||
if not name:
|
if not name:
|
||||||
raise LintError(_("Empty attribute name"))
|
raise LintError("Empty attribute name")
|
||||||
if not isinstance(value, str):
|
if not isinstance(value, str):
|
||||||
raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
|
raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
|
||||||
if name in cdataElements:
|
if name in cdataElements:
|
||||||
contentModelFlag = "CDATA"
|
contentModelFlag = "CDATA"
|
||||||
elif name in rcdataElements:
|
elif name in rcdataElements:
|
||||||
|
@ -51,43 +48,43 @@ class Filter(_base.Filter):
|
||||||
elif type == "EndTag":
|
elif type == "EndTag":
|
||||||
name = token["name"]
|
name = token["name"]
|
||||||
if not isinstance(name, str):
|
if not isinstance(name, str):
|
||||||
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
|
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
|
||||||
if not name:
|
if not name:
|
||||||
raise LintError(_("Empty tag name"))
|
raise LintError("Empty tag name")
|
||||||
if name in voidElements:
|
if name in voidElements:
|
||||||
raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
|
raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
|
||||||
start_name = open_elements.pop()
|
start_name = open_elements.pop()
|
||||||
if start_name != name:
|
if start_name != name:
|
||||||
raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
|
raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
|
||||||
contentModelFlag = "PCDATA"
|
contentModelFlag = "PCDATA"
|
||||||
|
|
||||||
elif type == "Comment":
|
elif type == "Comment":
|
||||||
if contentModelFlag != "PCDATA":
|
if contentModelFlag != "PCDATA":
|
||||||
raise LintError(_("Comment not in PCDATA content model flag"))
|
raise LintError("Comment not in PCDATA content model flag")
|
||||||
|
|
||||||
elif type in ("Characters", "SpaceCharacters"):
|
elif type in ("Characters", "SpaceCharacters"):
|
||||||
data = token["data"]
|
data = token["data"]
|
||||||
if not isinstance(data, str):
|
if not isinstance(data, str):
|
||||||
raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
|
raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
|
||||||
if not data:
|
if not data:
|
||||||
raise LintError(_("%(type)s token with empty data") % {"type": type})
|
raise LintError("%(type)s token with empty data" % {"type": type})
|
||||||
if type == "SpaceCharacters":
|
if type == "SpaceCharacters":
|
||||||
data = data.strip(spaceCharacters)
|
data = data.strip(spaceCharacters)
|
||||||
if data:
|
if data:
|
||||||
raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
|
raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
|
||||||
|
|
||||||
elif type == "Doctype":
|
elif type == "Doctype":
|
||||||
name = token["name"]
|
name = token["name"]
|
||||||
if contentModelFlag != "PCDATA":
|
if contentModelFlag != "PCDATA":
|
||||||
raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
|
raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
|
||||||
if not isinstance(name, str):
|
if not isinstance(name, str):
|
||||||
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
|
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
|
||||||
# XXX: what to do with token["data"] ?
|
# XXX: what to do with token["data"] ?
|
||||||
|
|
||||||
elif type in ("ParseError", "SerializeError"):
|
elif type in ("ParseError", "SerializeError"):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise LintError(_("Unknown token type: %(type)s") % {"type": type})
|
raise LintError("Unknown token type: %(type)s" % {"type": type})
|
||||||
|
|
||||||
yield token
|
yield token
|
||||||
|
|
|
@ -58,7 +58,7 @@ class Filter(_base.Filter):
|
||||||
elif tagname == 'colgroup':
|
elif tagname == 'colgroup':
|
||||||
# A colgroup element's start tag may be omitted if the first thing
|
# A colgroup element's start tag may be omitted if the first thing
|
||||||
# inside the colgroup element is a col element, and if the element
|
# inside the colgroup element is a col element, and if the element
|
||||||
# is not immediately preceeded by another colgroup element whose
|
# is not immediately preceded by another colgroup element whose
|
||||||
# end tag has been omitted.
|
# end tag has been omitted.
|
||||||
if type in ("StartTag", "EmptyTag"):
|
if type in ("StartTag", "EmptyTag"):
|
||||||
# XXX: we do not look at the preceding event, so instead we never
|
# XXX: we do not look at the preceding event, so instead we never
|
||||||
|
@ -70,7 +70,7 @@ class Filter(_base.Filter):
|
||||||
elif tagname == 'tbody':
|
elif tagname == 'tbody':
|
||||||
# A tbody element's start tag may be omitted if the first thing
|
# A tbody element's start tag may be omitted if the first thing
|
||||||
# inside the tbody element is a tr element, and if the element is
|
# inside the tbody element is a tr element, and if the element is
|
||||||
# not immediately preceeded by a tbody, thead, or tfoot element
|
# not immediately preceded by a tbody, thead, or tfoot element
|
||||||
# whose end tag has been omitted.
|
# whose end tag has been omitted.
|
||||||
if type == "StartTag":
|
if type == "StartTag":
|
||||||
# omit the thead and tfoot elements' end tag when they are
|
# omit the thead and tfoot elements' end tag when they are
|
||||||
|
|
|
@ -18,6 +18,7 @@ from .constants import cdataElements, rcdataElements
|
||||||
from .constants import tokenTypes, ReparseException, namespaces
|
from .constants import tokenTypes, ReparseException, namespaces
|
||||||
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
|
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
|
||||||
from .constants import adjustForeignAttributes as adjustForeignAttributesMap
|
from .constants import adjustForeignAttributes as adjustForeignAttributesMap
|
||||||
|
from .constants import E
|
||||||
|
|
||||||
|
|
||||||
def parse(doc, treebuilder="etree", encoding=None,
|
def parse(doc, treebuilder="etree", encoding=None,
|
||||||
|
@ -129,6 +130,17 @@ class HTMLParser(object):
|
||||||
|
|
||||||
self.framesetOK = True
|
self.framesetOK = True
|
||||||
|
|
||||||
|
@property
|
||||||
|
def documentEncoding(self):
|
||||||
|
"""The name of the character encoding
|
||||||
|
that was used to decode the input stream,
|
||||||
|
or :obj:`None` if that is not determined yet.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if not hasattr(self, 'tokenizer'):
|
||||||
|
return None
|
||||||
|
return self.tokenizer.stream.charEncoding[0]
|
||||||
|
|
||||||
def isHTMLIntegrationPoint(self, element):
|
def isHTMLIntegrationPoint(self, element):
|
||||||
if (element.name == "annotation-xml" and
|
if (element.name == "annotation-xml" and
|
||||||
element.namespace == namespaces["mathml"]):
|
element.namespace == namespaces["mathml"]):
|
||||||
|
@ -245,7 +257,7 @@ class HTMLParser(object):
|
||||||
# XXX The idea is to make errorcode mandatory.
|
# XXX The idea is to make errorcode mandatory.
|
||||||
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
|
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
|
||||||
if self.strict:
|
if self.strict:
|
||||||
raise ParseError
|
raise ParseError(E[errorcode] % datavars)
|
||||||
|
|
||||||
def normalizeToken(self, token):
|
def normalizeToken(self, token):
|
||||||
""" HTML5 specific normalizations to the token stream """
|
""" HTML5 specific normalizations to the token stream """
|
||||||
|
@ -868,7 +880,7 @@ def getPhases(debug):
|
||||||
self.startTagHandler = utils.MethodDispatcher([
|
self.startTagHandler = utils.MethodDispatcher([
|
||||||
("html", self.startTagHtml),
|
("html", self.startTagHtml),
|
||||||
(("base", "basefont", "bgsound", "command", "link", "meta",
|
(("base", "basefont", "bgsound", "command", "link", "meta",
|
||||||
"noframes", "script", "style", "title"),
|
"script", "style", "title"),
|
||||||
self.startTagProcessInHead),
|
self.startTagProcessInHead),
|
||||||
("body", self.startTagBody),
|
("body", self.startTagBody),
|
||||||
("frameset", self.startTagFrameset),
|
("frameset", self.startTagFrameset),
|
||||||
|
@ -1205,8 +1217,7 @@ def getPhases(debug):
|
||||||
attributes["name"] = "isindex"
|
attributes["name"] = "isindex"
|
||||||
self.processStartTag(impliedTagToken("input", "StartTag",
|
self.processStartTag(impliedTagToken("input", "StartTag",
|
||||||
attributes=attributes,
|
attributes=attributes,
|
||||||
selfClosing=
|
selfClosing=token["selfClosing"]))
|
||||||
token["selfClosing"]))
|
|
||||||
self.processEndTag(impliedTagToken("label"))
|
self.processEndTag(impliedTagToken("label"))
|
||||||
self.processStartTag(impliedTagToken("hr", "StartTag"))
|
self.processStartTag(impliedTagToken("hr", "StartTag"))
|
||||||
self.processEndTag(impliedTagToken("form"))
|
self.processEndTag(impliedTagToken("form"))
|
||||||
|
@ -1316,7 +1327,7 @@ def getPhases(debug):
|
||||||
# Not sure this is the correct name for the parse error
|
# Not sure this is the correct name for the parse error
|
||||||
self.parser.parseError(
|
self.parser.parseError(
|
||||||
"expected-one-end-tag-but-got-another",
|
"expected-one-end-tag-but-got-another",
|
||||||
{"expectedName": "body", "gotName": node.name})
|
{"gotName": "body", "expectedName": node.name})
|
||||||
break
|
break
|
||||||
self.parser.phase = self.parser.phases["afterBody"]
|
self.parser.phase = self.parser.phases["afterBody"]
|
||||||
|
|
||||||
|
@ -2553,7 +2564,7 @@ def getPhases(debug):
|
||||||
self.tree.openElements.pop()
|
self.tree.openElements.pop()
|
||||||
if (not self.parser.innerHTML and
|
if (not self.parser.innerHTML and
|
||||||
self.tree.openElements[-1].name != "frameset"):
|
self.tree.openElements[-1].name != "frameset"):
|
||||||
# If we're not in innerHTML mode and the the current node is not a
|
# If we're not in innerHTML mode and the current node is not a
|
||||||
# "frameset" element (anymore) then switch.
|
# "frameset" element (anymore) then switch.
|
||||||
self.parser.phase = self.parser.phases["afterFrameset"]
|
self.parser.phase = self.parser.phases["afterFrameset"]
|
||||||
|
|
||||||
|
|
|
@ -225,6 +225,9 @@ class InfosetFilter(object):
|
||||||
while "--" in data:
|
while "--" in data:
|
||||||
warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
|
warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
|
||||||
data = data.replace("--", "- -")
|
data = data.replace("--", "- -")
|
||||||
|
if data.endswith("-"):
|
||||||
|
warnings.warn("Comments cannot end in a dash", DataLossWarning)
|
||||||
|
data += " "
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def coerceCharacters(self, data):
|
def coerceCharacters(self, data):
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
from six import text_type
|
from six import text_type
|
||||||
from six.moves import http_client
|
from six.moves import http_client, urllib
|
||||||
|
|
||||||
import codecs
|
import codecs
|
||||||
import re
|
import re
|
||||||
|
@ -28,7 +29,18 @@ asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
|
||||||
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
|
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
|
||||||
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
|
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
|
||||||
|
|
||||||
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
|
|
||||||
|
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
|
||||||
|
|
||||||
|
if utils.supports_lone_surrogates:
|
||||||
|
# Use one extra step of indirection and create surrogates with
|
||||||
|
# eval. Not using this indirection would introduce an illegal
|
||||||
|
# unicode literal on platforms not supporting such lone
|
||||||
|
# surrogates.
|
||||||
|
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
|
||||||
|
eval('"\\uD800-\\uDFFF"'))
|
||||||
|
else:
|
||||||
|
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
|
||||||
|
|
||||||
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
||||||
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
||||||
|
@ -119,9 +131,12 @@ class BufferedStream(object):
|
||||||
|
|
||||||
|
|
||||||
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
|
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
|
||||||
if isinstance(source, http_client.HTTPResponse):
|
# Work around Python bug #20007: read(0) closes the connection.
|
||||||
# Work around Python bug #20007: read(0) closes the connection.
|
# http://bugs.python.org/issue20007
|
||||||
# http://bugs.python.org/issue20007
|
if (isinstance(source, http_client.HTTPResponse) or
|
||||||
|
# Also check for addinfourl wrapping HTTPResponse
|
||||||
|
(isinstance(source, urllib.response.addbase) and
|
||||||
|
isinstance(source.fp, http_client.HTTPResponse))):
|
||||||
isUnicode = False
|
isUnicode = False
|
||||||
elif hasattr(source, "read"):
|
elif hasattr(source, "read"):
|
||||||
isUnicode = isinstance(source.read(0), text_type)
|
isUnicode = isinstance(source.read(0), text_type)
|
||||||
|
@ -164,13 +179,18 @@ class HTMLUnicodeInputStream(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Craziness
|
if not utils.supports_lone_surrogates:
|
||||||
if len("\U0010FFFF") == 1:
|
# Such platforms will have already checked for such
|
||||||
|
# surrogate errors, so no need to do this checking.
|
||||||
|
self.reportCharacterErrors = None
|
||||||
|
self.replaceCharactersRegexp = None
|
||||||
|
elif len("\U0010FFFF") == 1:
|
||||||
self.reportCharacterErrors = self.characterErrorsUCS4
|
self.reportCharacterErrors = self.characterErrorsUCS4
|
||||||
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
|
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
|
||||||
else:
|
else:
|
||||||
self.reportCharacterErrors = self.characterErrorsUCS2
|
self.reportCharacterErrors = self.characterErrorsUCS2
|
||||||
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
|
self.replaceCharactersRegexp = re.compile(
|
||||||
|
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
|
||||||
|
|
||||||
# List of where new lines occur
|
# List of where new lines occur
|
||||||
self.newLines = [0]
|
self.newLines = [0]
|
||||||
|
@ -265,11 +285,12 @@ class HTMLUnicodeInputStream(object):
|
||||||
self._bufferedCharacter = data[-1]
|
self._bufferedCharacter = data[-1]
|
||||||
data = data[:-1]
|
data = data[:-1]
|
||||||
|
|
||||||
self.reportCharacterErrors(data)
|
if self.reportCharacterErrors:
|
||||||
|
self.reportCharacterErrors(data)
|
||||||
|
|
||||||
# Replace invalid characters
|
# Replace invalid characters
|
||||||
# Note U+0000 is dealt with in the tokenizer
|
# Note U+0000 is dealt with in the tokenizer
|
||||||
data = self.replaceCharactersRegexp.sub("\ufffd", data)
|
data = self.replaceCharactersRegexp.sub("\ufffd", data)
|
||||||
|
|
||||||
data = data.replace("\r\n", "\n")
|
data = data.replace("\r\n", "\n")
|
||||||
data = data.replace("\r", "\n")
|
data = data.replace("\r", "\n")
|
||||||
|
@ -452,7 +473,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
||||||
if encoding is None and parseMeta:
|
if encoding is None and parseMeta:
|
||||||
encoding = self.detectEncodingMeta()
|
encoding = self.detectEncodingMeta()
|
||||||
confidence = "tentative"
|
confidence = "tentative"
|
||||||
# Guess with chardet, if avaliable
|
# Guess with chardet, if available
|
||||||
if encoding is None and chardet:
|
if encoding is None and chardet:
|
||||||
confidence = "tentative"
|
confidence = "tentative"
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -2,11 +2,26 @@ from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from xml.sax.saxutils import escape, unescape
|
from xml.sax.saxutils import escape, unescape
|
||||||
|
from six.moves import urllib_parse as urlparse
|
||||||
|
|
||||||
from .tokenizer import HTMLTokenizer
|
from .tokenizer import HTMLTokenizer
|
||||||
from .constants import tokenTypes
|
from .constants import tokenTypes
|
||||||
|
|
||||||
|
|
||||||
|
content_type_rgx = re.compile(r'''
|
||||||
|
^
|
||||||
|
# Match a content type <application>/<type>
|
||||||
|
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
|
||||||
|
# Match any character set and encoding
|
||||||
|
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|
||||||
|
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
|
||||||
|
# Assume the rest is data
|
||||||
|
,.*
|
||||||
|
$
|
||||||
|
''',
|
||||||
|
re.VERBOSE)
|
||||||
|
|
||||||
|
|
||||||
class HTMLSanitizerMixin(object):
|
class HTMLSanitizerMixin(object):
|
||||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||||
|
|
||||||
|
@ -100,8 +115,8 @@ class HTMLSanitizerMixin(object):
|
||||||
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
|
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
|
||||||
'y1', 'y2', 'zoomAndPan']
|
'y1', 'y2', 'zoomAndPan']
|
||||||
|
|
||||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
|
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
|
||||||
'xlink:href', 'xml:base']
|
'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']
|
||||||
|
|
||||||
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
|
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
|
||||||
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
|
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
|
||||||
|
@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
|
||||||
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
|
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
|
||||||
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
|
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
|
||||||
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
|
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
|
||||||
'ssh', 'sftp', 'rtsp', 'afs']
|
'ssh', 'sftp', 'rtsp', 'afs', 'data']
|
||||||
|
|
||||||
|
acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
|
||||||
|
|
||||||
# subclasses may define their own versions of these constants
|
# subclasses may define their own versions of these constants
|
||||||
allowed_elements = acceptable_elements + mathml_elements + svg_elements
|
allowed_elements = acceptable_elements + mathml_elements + svg_elements
|
||||||
|
@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
|
||||||
allowed_css_keywords = acceptable_css_keywords
|
allowed_css_keywords = acceptable_css_keywords
|
||||||
allowed_svg_properties = acceptable_svg_properties
|
allowed_svg_properties = acceptable_svg_properties
|
||||||
allowed_protocols = acceptable_protocols
|
allowed_protocols = acceptable_protocols
|
||||||
|
allowed_content_types = acceptable_content_types
|
||||||
|
|
||||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
||||||
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
|
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
|
||||||
|
@ -189,10 +207,21 @@ class HTMLSanitizerMixin(object):
|
||||||
unescape(attrs[attr])).lower()
|
unescape(attrs[attr])).lower()
|
||||||
# remove replacement characters from unescaped characters
|
# remove replacement characters from unescaped characters
|
||||||
val_unescaped = val_unescaped.replace("\ufffd", "")
|
val_unescaped = val_unescaped.replace("\ufffd", "")
|
||||||
if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
|
try:
|
||||||
(val_unescaped.split(':')[0] not in
|
uri = urlparse.urlparse(val_unescaped)
|
||||||
self.allowed_protocols)):
|
except ValueError:
|
||||||
|
uri = None
|
||||||
del attrs[attr]
|
del attrs[attr]
|
||||||
|
if uri and uri.scheme:
|
||||||
|
if uri.scheme not in self.allowed_protocols:
|
||||||
|
del attrs[attr]
|
||||||
|
if uri.scheme == 'data':
|
||||||
|
m = content_type_rgx.match(uri.path)
|
||||||
|
if not m:
|
||||||
|
del attrs[attr]
|
||||||
|
elif m.group('content_type') not in self.allowed_content_types:
|
||||||
|
del attrs[attr]
|
||||||
|
|
||||||
for attr in self.svg_attr_val_allows_ref:
|
for attr in self.svg_attr_val_allows_ref:
|
||||||
if attr in attrs:
|
if attr in attrs:
|
||||||
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
|
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
|
||||||
|
@ -245,7 +274,7 @@ class HTMLSanitizerMixin(object):
|
||||||
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
|
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
|
||||||
'padding']:
|
'padding']:
|
||||||
for keyword in value.split():
|
for keyword in value.split():
|
||||||
if not keyword in self.acceptable_css_keywords and \
|
if keyword not in self.acceptable_css_keywords and \
|
||||||
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
|
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -1,9 +1,6 @@
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
from six import text_type
|
from six import text_type
|
||||||
|
|
||||||
import gettext
|
|
||||||
_ = gettext.gettext
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -35,7 +32,7 @@ else:
|
||||||
v = utils.surrogatePairToCodepoint(v)
|
v = utils.surrogatePairToCodepoint(v)
|
||||||
else:
|
else:
|
||||||
v = ord(v)
|
v = ord(v)
|
||||||
if not v in encode_entity_map or k.islower():
|
if v not in encode_entity_map or k.islower():
|
||||||
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
|
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
|
||||||
encode_entity_map[v] = k
|
encode_entity_map[v] = k
|
||||||
|
|
||||||
|
@ -208,7 +205,7 @@ class HTMLSerializer(object):
|
||||||
if token["systemId"]:
|
if token["systemId"]:
|
||||||
if token["systemId"].find('"') >= 0:
|
if token["systemId"].find('"') >= 0:
|
||||||
if token["systemId"].find("'") >= 0:
|
if token["systemId"].find("'") >= 0:
|
||||||
self.serializeError(_("System identifer contains both single and double quote characters"))
|
self.serializeError("System identifer contains both single and double quote characters")
|
||||||
quote_char = "'"
|
quote_char = "'"
|
||||||
else:
|
else:
|
||||||
quote_char = '"'
|
quote_char = '"'
|
||||||
|
@ -220,7 +217,7 @@ class HTMLSerializer(object):
|
||||||
elif type in ("Characters", "SpaceCharacters"):
|
elif type in ("Characters", "SpaceCharacters"):
|
||||||
if type == "SpaceCharacters" or in_cdata:
|
if type == "SpaceCharacters" or in_cdata:
|
||||||
if in_cdata and token["data"].find("</") >= 0:
|
if in_cdata and token["data"].find("</") >= 0:
|
||||||
self.serializeError(_("Unexpected </ in CDATA"))
|
self.serializeError("Unexpected </ in CDATA")
|
||||||
yield self.encode(token["data"])
|
yield self.encode(token["data"])
|
||||||
else:
|
else:
|
||||||
yield self.encode(escape(token["data"]))
|
yield self.encode(escape(token["data"]))
|
||||||
|
@ -231,7 +228,7 @@ class HTMLSerializer(object):
|
||||||
if name in rcdataElements and not self.escape_rcdata:
|
if name in rcdataElements and not self.escape_rcdata:
|
||||||
in_cdata = True
|
in_cdata = True
|
||||||
elif in_cdata:
|
elif in_cdata:
|
||||||
self.serializeError(_("Unexpected child element of a CDATA element"))
|
self.serializeError("Unexpected child element of a CDATA element")
|
||||||
for (attr_namespace, attr_name), attr_value in token["data"].items():
|
for (attr_namespace, attr_name), attr_value in token["data"].items():
|
||||||
# TODO: Add namespace support here
|
# TODO: Add namespace support here
|
||||||
k = attr_name
|
k = attr_name
|
||||||
|
@ -279,20 +276,20 @@ class HTMLSerializer(object):
|
||||||
if name in rcdataElements:
|
if name in rcdataElements:
|
||||||
in_cdata = False
|
in_cdata = False
|
||||||
elif in_cdata:
|
elif in_cdata:
|
||||||
self.serializeError(_("Unexpected child element of a CDATA element"))
|
self.serializeError("Unexpected child element of a CDATA element")
|
||||||
yield self.encodeStrict("</%s>" % name)
|
yield self.encodeStrict("</%s>" % name)
|
||||||
|
|
||||||
elif type == "Comment":
|
elif type == "Comment":
|
||||||
data = token["data"]
|
data = token["data"]
|
||||||
if data.find("--") >= 0:
|
if data.find("--") >= 0:
|
||||||
self.serializeError(_("Comment contains --"))
|
self.serializeError("Comment contains --")
|
||||||
yield self.encodeStrict("<!--%s-->" % token["data"])
|
yield self.encodeStrict("<!--%s-->" % token["data"])
|
||||||
|
|
||||||
elif type == "Entity":
|
elif type == "Entity":
|
||||||
name = token["name"]
|
name = token["name"]
|
||||||
key = name + ";"
|
key = name + ";"
|
||||||
if not key in entities:
|
if key not in entities:
|
||||||
self.serializeError(_("Entity %s not recognized" % name))
|
self.serializeError("Entity %s not recognized" % name)
|
||||||
if self.resolve_entities and key not in xmlEntities:
|
if self.resolve_entities and key not in xmlEntities:
|
||||||
data = entities[key]
|
data = entities[key]
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -0,0 +1,12 @@
|
||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
from . import sax
|
||||||
|
|
||||||
|
__all__ = ["sax"]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from . import genshi # flake8: noqa
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
__all__.append("genshi")
|
47
lib/html5lib/treeadapters/genshi.py
Normal file
47
lib/html5lib/treeadapters/genshi.py
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
from genshi.core import QName, Attrs
|
||||||
|
from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
|
||||||
|
|
||||||
|
|
||||||
|
def to_genshi(walker):
    """Convert a stream of html5lib tree-walker tokens into Genshi events.

    Each yielded item is a 3-tuple ``(kind, data, pos)`` built from the
    ``genshi.core`` event kinds imported by this module (START, END, TEXT,
    COMMENT, DOCTYPE).  The third element is always ``(None, -1, -1)``
    (presumably "position unknown" -- confirm against the Genshi stream
    documentation).

    Adjacent "Characters"/"SpaceCharacters" tokens are buffered and emitted
    as a single TEXT event as soon as a non-text token (or the end of the
    walker) is reached.

    :arg walker: an iterable of token dicts, each carrying a ``"type"`` key
        plus the keys read below for that type
    """
    text = []
    for token in walker:
        token_type = token["type"]
        if token_type in ("Characters", "SpaceCharacters"):
            text.append(token["data"])
        elif text:
            # A non-text token ends the current run of text: flush it first.
            yield TEXT, "".join(text), (None, -1, -1)
            text = []

        if token_type in ("StartTag", "EmptyTag"):
            if token["namespace"]:
                name = "{%s}%s" % (token["namespace"], token["name"])
            else:
                name = token["name"]
            # Attribute keys are (namespace, localname) pairs; namespaced
            # ones are written as "{namespace}localname".
            attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
                           for attr, value in token["data"].items()])
            yield (START, (QName(name), attrs), (None, -1, -1))
            if token_type == "EmptyTag":
                # An empty tag has no separate end token, so fall through to
                # the "EndTag" branch below and close the element right away.
                token_type = "EndTag"

        if token_type == "EndTag":
            if token["namespace"]:
                name = "{%s}%s" % (token["namespace"], token["name"])
            else:
                name = token["name"]

            yield END, QName(name), (None, -1, -1)

        elif token_type == "Comment":
            yield COMMENT, token["data"], (None, -1, -1)

        elif token_type == "Doctype":
            yield DOCTYPE, (token["name"], token["publicId"],
                            token["systemId"]), (None, -1, -1)

        else:
            pass  # FIXME: What to do?

    # Flush text that was still buffered when the walker ran out.
    if text:
        yield TEXT, "".join(text), (None, -1, -1)
|
|
@ -158,7 +158,7 @@ def getDomBuilder(DomImplementation):
|
||||||
else:
|
else:
|
||||||
# HACK: allow text nodes as children of the document node
|
# HACK: allow text nodes as children of the document node
|
||||||
if hasattr(self.dom, '_child_node_types'):
|
if hasattr(self.dom, '_child_node_types'):
|
||||||
if not Node.TEXT_NODE in self.dom._child_node_types:
|
if Node.TEXT_NODE not in self.dom._child_node_types:
|
||||||
self.dom._child_node_types = list(self.dom._child_node_types)
|
self.dom._child_node_types = list(self.dom._child_node_types)
|
||||||
self.dom._child_node_types.append(Node.TEXT_NODE)
|
self.dom._child_node_types.append(Node.TEXT_NODE)
|
||||||
self.dom.appendChild(self.dom.createTextNode(data))
|
self.dom.appendChild(self.dom.createTextNode(data))
|
||||||
|
|
|
@ -54,7 +54,7 @@ class Document(object):
|
||||||
def testSerializer(element):
|
def testSerializer(element):
|
||||||
rv = []
|
rv = []
|
||||||
finalText = None
|
finalText = None
|
||||||
infosetFilter = ihatexml.InfosetFilter()
|
infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
|
||||||
|
|
||||||
def serializeElement(element, indent=0):
|
def serializeElement(element, indent=0):
|
||||||
if not hasattr(element, "tag"):
|
if not hasattr(element, "tag"):
|
||||||
|
@ -79,7 +79,7 @@ def testSerializer(element):
|
||||||
next_element = next_element.getnext()
|
next_element = next_element.getnext()
|
||||||
elif isinstance(element, str) or isinstance(element, bytes):
|
elif isinstance(element, str) or isinstance(element, bytes):
|
||||||
# Text in a fragment
|
# Text in a fragment
|
||||||
assert isinstance(element, str) or sys.version_info.major == 2
|
assert isinstance(element, str) or sys.version_info[0] == 2
|
||||||
rv.append("|%s\"%s\"" % (' ' * indent, element))
|
rv.append("|%s\"%s\"" % (' ' * indent, element))
|
||||||
else:
|
else:
|
||||||
# Fragment case
|
# Fragment case
|
||||||
|
@ -189,7 +189,7 @@ class TreeBuilder(_base.TreeBuilder):
|
||||||
|
|
||||||
def __init__(self, namespaceHTMLElements, fullTree=False):
|
def __init__(self, namespaceHTMLElements, fullTree=False):
|
||||||
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
|
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
|
||||||
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
|
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
|
||||||
self.namespaceHTMLElements = namespaceHTMLElements
|
self.namespaceHTMLElements = namespaceHTMLElements
|
||||||
|
|
||||||
class Attributes(dict):
|
class Attributes(dict):
|
||||||
|
@ -257,7 +257,7 @@ class TreeBuilder(_base.TreeBuilder):
|
||||||
data = property(_getData, _setData)
|
data = property(_getData, _setData)
|
||||||
|
|
||||||
self.elementClass = Element
|
self.elementClass = Element
|
||||||
self.commentClass = builder.Comment
|
self.commentClass = Comment
|
||||||
# self.fragmentClass = builder.DocumentFragment
|
# self.fragmentClass = builder.DocumentFragment
|
||||||
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
|
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
|
||||||
|
|
||||||
|
@ -315,7 +315,7 @@ class TreeBuilder(_base.TreeBuilder):
|
||||||
"""Create the document root"""
|
"""Create the document root"""
|
||||||
# Because of the way libxml2 works, it doesn't seem to be possible to
|
# Because of the way libxml2 works, it doesn't seem to be possible to
|
||||||
# alter information like the doctype after the tree has been parsed.
|
# alter information like the doctype after the tree has been parsed.
|
||||||
# Therefore we need to use the built-in parser to create our iniial
|
# Therefore we need to use the built-in parser to create our initial
|
||||||
# tree, after which we can add elements like normal
|
# tree, after which we can add elements like normal
|
||||||
docStr = ""
|
docStr = ""
|
||||||
if self.doctype:
|
if self.doctype:
|
||||||
|
@ -344,7 +344,8 @@ class TreeBuilder(_base.TreeBuilder):
|
||||||
|
|
||||||
# Append the initial comments:
|
# Append the initial comments:
|
||||||
for comment_token in self.initial_comments:
|
for comment_token in self.initial_comments:
|
||||||
root.addprevious(etree.Comment(comment_token["data"]))
|
comment = self.commentClass(comment_token["data"])
|
||||||
|
root.addprevious(comment._element)
|
||||||
|
|
||||||
# Create the root document and add the ElementTree to it
|
# Create the root document and add the ElementTree to it
|
||||||
self.document = self.documentClass()
|
self.document = self.documentClass()
|
||||||
|
|
|
@ -10,8 +10,9 @@ returning an iterator generating tokens.
|
||||||
|
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
import sys
|
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"]
|
||||||
|
|
||||||
|
from .. import constants
|
||||||
from ..utils import default_etree
|
from ..utils import default_etree
|
||||||
|
|
||||||
treeWalkerCache = {}
|
treeWalkerCache = {}
|
||||||
|
@ -20,28 +21,27 @@ treeWalkerCache = {}
|
||||||
def getTreeWalker(treeType, implementation=None, **kwargs):
|
def getTreeWalker(treeType, implementation=None, **kwargs):
|
||||||
"""Get a TreeWalker class for various types of tree with built-in support
|
"""Get a TreeWalker class for various types of tree with built-in support
|
||||||
|
|
||||||
treeType - the name of the tree type required (case-insensitive). Supported
|
Args:
|
||||||
values are:
|
treeType (str): the name of the tree type required (case-insensitive).
|
||||||
|
Supported values are:
|
||||||
|
|
||||||
"dom" - The xml.dom.minidom DOM implementation
|
- "dom": The xml.dom.minidom DOM implementation
|
||||||
"pulldom" - The xml.dom.pulldom event stream
|
- "etree": A generic walker for tree implementations exposing an
|
||||||
"etree" - A generic walker for tree implementations exposing an
|
elementtree-like interface (known to work with
|
||||||
elementtree-like interface (known to work with
|
ElementTree, cElementTree and lxml.etree).
|
||||||
ElementTree, cElementTree and lxml.etree).
|
- "lxml": Optimized walker for lxml.etree
|
||||||
"lxml" - Optimized walker for lxml.etree
|
- "genshi": a Genshi stream
|
||||||
"genshi" - a Genshi stream
|
|
||||||
|
|
||||||
implementation - (Currently applies to the "etree" tree type only). A module
|
Implementation: A module implementing the tree type e.g.
|
||||||
implementing the tree type e.g. xml.etree.ElementTree or
|
xml.etree.ElementTree or cElementTree (Currently applies to the
|
||||||
cElementTree."""
|
"etree" tree type only).
|
||||||
|
"""
|
||||||
|
|
||||||
treeType = treeType.lower()
|
treeType = treeType.lower()
|
||||||
if treeType not in treeWalkerCache:
|
if treeType not in treeWalkerCache:
|
||||||
if treeType in ("dom", "pulldom"):
|
if treeType == "dom":
|
||||||
name = "%s.%s" % (__name__, treeType)
|
from . import dom
|
||||||
__import__(name)
|
treeWalkerCache[treeType] = dom.TreeWalker
|
||||||
mod = sys.modules[name]
|
|
||||||
treeWalkerCache[treeType] = mod.TreeWalker
|
|
||||||
elif treeType == "genshi":
|
elif treeType == "genshi":
|
||||||
from . import genshistream
|
from . import genshistream
|
||||||
treeWalkerCache[treeType] = genshistream.TreeWalker
|
treeWalkerCache[treeType] = genshistream.TreeWalker
|
||||||
|
@ -55,3 +55,89 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
|
||||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
# XXX: NEVER cache here, caching is done in the etree submodule
|
||||||
return etree.getETreeModule(implementation, **kwargs).TreeWalker
|
return etree.getETreeModule(implementation, **kwargs).TreeWalker
|
||||||
return treeWalkerCache.get(treeType)
|
return treeWalkerCache.get(treeType)
|
||||||
|
|
||||||
|
|
||||||
|
def concatenateCharacterTokens(tokens):
    """Merge every run of adjacent text tokens into one "Characters" token.

    Tokens whose ``"type"`` is "Characters" or "SpaceCharacters" are
    buffered; when a token of any other type arrives (or the input ends) the
    buffered data is joined and yielded as a single
    ``{"type": "Characters", "data": ...}`` dict.  All other tokens are
    yielded unchanged, in their original order.

    Note that a run made up only of "SpaceCharacters" tokens is also emitted
    with type "Characters", so no "SpaceCharacters" token is ever yielded.

    :arg tokens: an iterable of token dicts with at least a ``"type"`` key
        (and a ``"data"`` key for the two text token types)
    """
    pendingCharacters = []
    for token in tokens:
        if token["type"] in ("Characters", "SpaceCharacters"):
            pendingCharacters.append(token["data"])
            continue
        # A non-text token ends the current run: flush it before the token.
        if pendingCharacters:
            yield {"type": "Characters", "data": "".join(pendingCharacters)}
            pendingCharacters = []
        yield token
    # Flush a run that extends to the very end of the input.
    if pendingCharacters:
        yield {"type": "Characters", "data": "".join(pendingCharacters)}
|
||||||
|
|
||||||
|
|
||||||
|
def pprint(walker):
    """Pretty printer for tree walkers

    Renders the token stream as one line per start tag, attribute, comment,
    doctype and text run, indenting two spaces per open element, and returns
    the lines joined with newlines.

    :arg walker: an iterable of tree-walker token dicts

    :returns: the pretty-printed text

    :raises ValueError: if a token has an unrecognised ``"type"``
    """
    output = []
    indent = 0
    for token in concatenateCharacterTokens(walker):
        token_type = token["type"]
        if token_type in ("StartTag", "EmptyTag"):
            # tag name (the HTML namespace itself is left implicit)
            if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
                if token["namespace"] in constants.prefixes:
                    ns = constants.prefixes[token["namespace"]]
                else:
                    ns = token["namespace"]
                name = "%s %s" % (ns, token["name"])
            else:
                name = token["name"]
            output.append("%s<%s>" % (" " * indent, name))
            indent += 2
            # attributes (sorted for consistent ordering)
            # Keys are (namespace, localname) pairs and namespace may be
            # None.  Python 3 cannot order None against str, so sort on a
            # key with None mapped to "" -- this keeps un-namespaced
            # attributes first, as Python 2 ordered them.
            attrs = token["data"]
            ordered = sorted(attrs.items(),
                             key=lambda item: (item[0][0] or "", item[0][1]))
            for (namespace, localname), value in ordered:
                if namespace:
                    if namespace in constants.prefixes:
                        ns = constants.prefixes[namespace]
                    else:
                        ns = namespace
                    name = "%s %s" % (ns, localname)
                else:
                    name = localname
                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
            # self-closing: no EndTag token will follow, so dedent here
            if token_type == "EmptyTag":
                indent -= 2

        elif token_type == "EndTag":
            indent -= 2

        elif token_type == "Comment":
            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))

        elif token_type == "Doctype":
            if token["name"]:
                if token["publicId"]:
                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["publicId"],
                                   token["systemId"] if token["systemId"] else ""))
                elif token["systemId"]:
                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["systemId"]))
                else:
                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
                                                       token["name"]))
            else:
                output.append("%s<!DOCTYPE >" % (" " * indent,))

        elif token_type == "Characters":
            output.append("%s\"%s\"" % (" " * indent, token["data"]))

        elif token_type == "SpaceCharacters":
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % token_type)

    return "\n".join(output)
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
from six import text_type, string_types
|
from six import text_type, string_types
|
||||||
|
|
||||||
import gettext
|
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
|
||||||
_ = gettext.gettext
|
"TreeWalker", "NonRecursiveTreeWalker"]
|
||||||
|
|
||||||
from xml.dom import Node
|
from xml.dom import Node
|
||||||
|
|
||||||
|
@ -58,7 +58,7 @@ class TreeWalker(object):
|
||||||
"namespace": to_text(namespace),
|
"namespace": to_text(namespace),
|
||||||
"data": attrs}
|
"data": attrs}
|
||||||
if hasChildren:
|
if hasChildren:
|
||||||
yield self.error(_("Void element has children"))
|
yield self.error("Void element has children")
|
||||||
|
|
||||||
def startTag(self, namespace, name, attrs):
|
def startTag(self, namespace, name, attrs):
|
||||||
assert namespace is None or isinstance(namespace, string_types), type(namespace)
|
assert namespace is None or isinstance(namespace, string_types), type(namespace)
|
||||||
|
@ -122,7 +122,7 @@ class TreeWalker(object):
|
||||||
return {"type": "Entity", "name": text_type(name)}
|
return {"type": "Entity", "name": text_type(name)}
|
||||||
|
|
||||||
def unknown(self, nodeType):
|
def unknown(self, nodeType):
|
||||||
return self.error(_("Unknown node type: ") + nodeType)
|
return self.error("Unknown node type: " + nodeType)
|
||||||
|
|
||||||
|
|
||||||
class NonRecursiveTreeWalker(TreeWalker):
|
class NonRecursiveTreeWalker(TreeWalker):
|
||||||
|
|
|
@ -2,9 +2,6 @@ from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
from xml.dom import Node
|
from xml.dom import Node
|
||||||
|
|
||||||
import gettext
|
|
||||||
_ = gettext.gettext
|
|
||||||
|
|
||||||
from . import _base
|
from . import _base
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -7,12 +7,10 @@ except ImportError:
|
||||||
from ordereddict import OrderedDict
|
from ordereddict import OrderedDict
|
||||||
except ImportError:
|
except ImportError:
|
||||||
OrderedDict = dict
|
OrderedDict = dict
|
||||||
import gettext
|
|
||||||
_ = gettext.gettext
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from six import text_type
|
from six import string_types
|
||||||
|
|
||||||
from . import _base
|
from . import _base
|
||||||
from ..utils import moduleFactoryFactory
|
from ..utils import moduleFactoryFactory
|
||||||
|
@ -60,7 +58,7 @@ def getETreeBuilder(ElementTreeImplementation):
|
||||||
return _base.COMMENT, node.text
|
return _base.COMMENT, node.text
|
||||||
|
|
||||||
else:
|
else:
|
||||||
assert type(node.tag) == text_type, type(node.tag)
|
assert isinstance(node.tag, string_types), type(node.tag)
|
||||||
# This is assumed to be an ordinary element
|
# This is assumed to be an ordinary element
|
||||||
match = tag_regexp.match(node.tag)
|
match = tag_regexp.match(node.tag)
|
||||||
if match:
|
if match:
|
||||||
|
@ -131,6 +129,7 @@ def getETreeBuilder(ElementTreeImplementation):
|
||||||
if not parents:
|
if not parents:
|
||||||
return parent
|
return parent
|
||||||
else:
|
else:
|
||||||
|
assert list(parents[-1]).count(parent) == 1
|
||||||
return parent, list(parents[-1]).index(parent), parents, None
|
return parent, list(parents[-1]).index(parent), parents, None
|
||||||
|
|
||||||
return locals()
|
return locals()
|
||||||
|
|
|
@ -4,9 +4,6 @@ from six import text_type
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from ..treebuilders.etree import tag_regexp
|
from ..treebuilders.etree import tag_regexp
|
||||||
|
|
||||||
from gettext import gettext
|
|
||||||
_ = gettext
|
|
||||||
|
|
||||||
from . import _base
|
from . import _base
|
||||||
|
|
||||||
from .. import ihatexml
|
from .. import ihatexml
|
||||||
|
@ -130,7 +127,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||||
def getNodeDetails(self, node):
|
def getNodeDetails(self, node):
|
||||||
if isinstance(node, tuple): # Text node
|
if isinstance(node, tuple): # Text node
|
||||||
node, key = node
|
node, key = node
|
||||||
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
|
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
||||||
return _base.TEXT, ensure_str(getattr(node, key))
|
return _base.TEXT, ensure_str(getattr(node, key))
|
||||||
|
|
||||||
elif isinstance(node, Root):
|
elif isinstance(node, Root):
|
||||||
|
@ -169,7 +166,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||||
attrs, len(node) > 0 or node.text)
|
attrs, len(node) > 0 or node.text)
|
||||||
|
|
||||||
def getFirstChild(self, node):
|
def getFirstChild(self, node):
|
||||||
assert not isinstance(node, tuple), _("Text nodes have no children")
|
assert not isinstance(node, tuple), "Text nodes have no children"
|
||||||
|
|
||||||
assert len(node) or node.text, "Node has no children"
|
assert len(node) or node.text, "Node has no children"
|
||||||
if node.text:
|
if node.text:
|
||||||
|
@ -180,7 +177,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||||
def getNextSibling(self, node):
|
def getNextSibling(self, node):
|
||||||
if isinstance(node, tuple): # Text node
|
if isinstance(node, tuple): # Text node
|
||||||
node, key = node
|
node, key = node
|
||||||
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
|
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
||||||
if key == "text":
|
if key == "text":
|
||||||
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
|
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
|
||||||
# because node[0] might evaluate to False if it has no child element
|
# because node[0] might evaluate to False if it has no child element
|
||||||
|
@ -196,7 +193,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||||
def getParentNode(self, node):
|
def getParentNode(self, node):
|
||||||
if isinstance(node, tuple): # Text node
|
if isinstance(node, tuple): # Text node
|
||||||
node, key = node
|
node, key = node
|
||||||
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
|
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
||||||
if key == "text":
|
if key == "text":
|
||||||
return node
|
return node
|
||||||
# else: fallback to "normal" processing
|
# else: fallback to "normal" processing
|
||||||
|
|
|
@ -1,63 +0,0 @@
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
|
|
||||||
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
|
|
||||||
from ..constants import voidElements
|
|
||||||
|
|
||||||
|
|
||||||
class TreeWalker(_base.TreeWalker):
|
|
||||||
def __iter__(self):
|
|
||||||
ignore_until = None
|
|
||||||
previous = None
|
|
||||||
for event in self.tree:
|
|
||||||
if previous is not None and \
|
|
||||||
(ignore_until is None or previous[1] is ignore_until):
|
|
||||||
if previous[1] is ignore_until:
|
|
||||||
ignore_until = None
|
|
||||||
for token in self.tokens(previous, event):
|
|
||||||
yield token
|
|
||||||
if token["type"] == "EmptyTag":
|
|
||||||
ignore_until = previous[1]
|
|
||||||
previous = event
|
|
||||||
if ignore_until is None or previous[1] is ignore_until:
|
|
||||||
for token in self.tokens(previous, None):
|
|
||||||
yield token
|
|
||||||
elif ignore_until is not None:
|
|
||||||
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
|
|
||||||
|
|
||||||
def tokens(self, event, next):
|
|
||||||
type, node = event
|
|
||||||
if type == START_ELEMENT:
|
|
||||||
name = node.nodeName
|
|
||||||
namespace = node.namespaceURI
|
|
||||||
attrs = {}
|
|
||||||
for attr in list(node.attributes.keys()):
|
|
||||||
attr = node.getAttributeNode(attr)
|
|
||||||
attrs[(attr.namespaceURI, attr.localName)] = attr.value
|
|
||||||
if name in voidElements:
|
|
||||||
for token in self.emptyTag(namespace,
|
|
||||||
name,
|
|
||||||
attrs,
|
|
||||||
not next or next[1] is not node):
|
|
||||||
yield token
|
|
||||||
else:
|
|
||||||
yield self.startTag(namespace, name, attrs)
|
|
||||||
|
|
||||||
elif type == END_ELEMENT:
|
|
||||||
name = node.nodeName
|
|
||||||
namespace = node.namespaceURI
|
|
||||||
if name not in voidElements:
|
|
||||||
yield self.endTag(namespace, name)
|
|
||||||
|
|
||||||
elif type == COMMENT:
|
|
||||||
yield self.comment(node.nodeValue)
|
|
||||||
|
|
||||||
elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
|
|
||||||
for token in self.text(node.nodeValue):
|
|
||||||
yield token
|
|
||||||
|
|
||||||
else:
|
|
||||||
yield self.unknown(type)
|
|
|
@ -2,6 +2,8 @@ from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
from types import ModuleType
|
from types import ModuleType
|
||||||
|
|
||||||
|
from six import text_type
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import xml.etree.cElementTree as default_etree
|
import xml.etree.cElementTree as default_etree
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -9,7 +11,26 @@ except ImportError:
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
|
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
|
||||||
"surrogatePairToCodepoint", "moduleFactoryFactory"]
|
"surrogatePairToCodepoint", "moduleFactoryFactory",
|
||||||
|
"supports_lone_surrogates"]
|
||||||
|
|
||||||
|
|
||||||
|
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
|
||||||
|
# caught by the below test. In general this would be any platform
|
||||||
|
# using UTF-16 as its encoding of unicode strings, such as
|
||||||
|
# Jython. This is because UTF-16 itself is based on the use of such
|
||||||
|
# surrogates, and there is no mechanism to further escape such
|
||||||
|
# escapes.
|
||||||
|
try:
|
||||||
|
_x = eval('"\\uD800"')
|
||||||
|
if not isinstance(_x, text_type):
|
||||||
|
# We need this with u"" because of http://bugs.jython.org/issue2039
|
||||||
|
_x = eval('u"\\uD800"')
|
||||||
|
assert isinstance(_x, text_type)
|
||||||
|
except:
|
||||||
|
supports_lone_surrogates = False
|
||||||
|
else:
|
||||||
|
supports_lone_surrogates = True
|
||||||
|
|
||||||
|
|
||||||
class MethodDispatcher(dict):
|
class MethodDispatcher(dict):
|
||||||
|
@ -43,7 +64,7 @@ class MethodDispatcher(dict):
|
||||||
return dict.get(self, key, self.default)
|
return dict.get(self, key, self.default)
|
||||||
|
|
||||||
|
|
||||||
# Some utility functions to dal with weirdness around UCS2 vs UCS4
|
# Some utility functions to deal with weirdness around UCS2 vs UCS4
|
||||||
# python builds
|
# python builds
|
||||||
|
|
||||||
def isSurrogatePair(data):
|
def isSurrogatePair(data):
|
||||||
|
@ -70,13 +91,21 @@ def moduleFactoryFactory(factory):
|
||||||
else:
|
else:
|
||||||
name = b"_%s_factory" % baseModule.__name__
|
name = b"_%s_factory" % baseModule.__name__
|
||||||
|
|
||||||
if name in moduleCache:
|
kwargs_tuple = tuple(kwargs.items())
|
||||||
return moduleCache[name]
|
|
||||||
else:
|
try:
|
||||||
|
return moduleCache[name][args][kwargs_tuple]
|
||||||
|
except KeyError:
|
||||||
mod = ModuleType(name)
|
mod = ModuleType(name)
|
||||||
objs = factory(baseModule, *args, **kwargs)
|
objs = factory(baseModule, *args, **kwargs)
|
||||||
mod.__dict__.update(objs)
|
mod.__dict__.update(objs)
|
||||||
moduleCache[name] = mod
|
if "name" not in moduleCache:
|
||||||
|
moduleCache[name] = {}
|
||||||
|
if "args" not in moduleCache[name]:
|
||||||
|
moduleCache[name][args] = {}
|
||||||
|
if "kwargs" not in moduleCache[name][args]:
|
||||||
|
moduleCache[name][args][kwargs_tuple] = {}
|
||||||
|
moduleCache[name][args][kwargs_tuple] = mod
|
||||||
return mod
|
return mod
|
||||||
|
|
||||||
return moduleFactory
|
return moduleFactory
|
||||||
|
|
Loading…
Reference in a new issue