mirror of
https://github.com/SickGear/SickGear.git
synced 2025-01-07 10:33:38 +00:00
Merge branch 'feature/UpdateHtml5lib' into develop
This commit is contained in:
commit
18c400acec
24 changed files with 588 additions and 203 deletions
|
@ -6,6 +6,7 @@
|
||||||
* Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f)
|
* Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f)
|
||||||
* Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed
|
* Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed
|
||||||
* Change Hachoir can't support PY2 so backport their PY3 to prevent a need for system dependent external binaries like mediainfo
|
* Change Hachoir can't support PY2 so backport their PY3 to prevent a need for system dependent external binaries like mediainfo
|
||||||
|
* Update html5lib 0.99999999/1.0b9 (1a28d72) to 1.1-dev (e9ef538)
|
||||||
|
|
||||||
[develop changelog]
|
[develop changelog]
|
||||||
|
|
||||||
|
|
|
@ -1,14 +1,23 @@
|
||||||
"""
|
"""
|
||||||
HTML parsing library based on the WHATWG "HTML5"
|
HTML parsing library based on the `WHATWG HTML specification
|
||||||
specification. The parser is designed to be compatible with existing
|
<https://whatwg.org/html>`_. The parser is designed to be compatible with
|
||||||
HTML found in the wild and implements well-defined error recovery that
|
existing HTML found in the wild and implements well-defined error recovery that
|
||||||
is largely compatible with modern desktop web browsers.
|
is largely compatible with modern desktop web browsers.
|
||||||
|
|
||||||
Example usage:
|
Example usage::
|
||||||
|
|
||||||
import html5lib
|
import html5lib
|
||||||
f = open("my_document.html")
|
with open("my_document.html", "rb") as f:
|
||||||
tree = html5lib.parse(f)
|
tree = html5lib.parse(f)
|
||||||
|
|
||||||
|
For convenience, this module re-exports the following names:
|
||||||
|
|
||||||
|
* :func:`~.html5parser.parse`
|
||||||
|
* :func:`~.html5parser.parseFragment`
|
||||||
|
* :class:`~.html5parser.HTMLParser`
|
||||||
|
* :func:`~.treebuilders.getTreeBuilder`
|
||||||
|
* :func:`~.treewalkers.getTreeWalker`
|
||||||
|
* :func:`~.serializer.serialize`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
@ -22,4 +31,5 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
|
||||||
"getTreeWalker", "serialize"]
|
"getTreeWalker", "serialize"]
|
||||||
|
|
||||||
# this has to be at the top level, see how setup.py parses this
|
# this has to be at the top level, see how setup.py parses this
|
||||||
__version__ = "0.9999999999-dev"
|
#: Distribution version number.
|
||||||
|
__version__ = "1.1-dev"
|
||||||
|
|
|
@ -180,7 +180,7 @@ nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u
|
||||||
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
|
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
|
||||||
|
|
||||||
# Simpler things
|
# Simpler things
|
||||||
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
|
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
|
||||||
|
|
||||||
|
|
||||||
class InfosetFilter(object):
|
class InfosetFilter(object):
|
||||||
|
|
|
@ -9,7 +9,7 @@ import re
|
||||||
import webencodings
|
import webencodings
|
||||||
|
|
||||||
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
||||||
from .constants import ReparseException
|
from .constants import _ReparseException
|
||||||
from . import _utils
|
from . import _utils
|
||||||
|
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
|
@ -48,7 +48,7 @@ non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
||||||
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
|
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
|
||||||
0x10FFFE, 0x10FFFF])
|
0x10FFFE, 0x10FFFF])
|
||||||
|
|
||||||
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
|
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
|
||||||
|
|
||||||
# Cache for charsUntil()
|
# Cache for charsUntil()
|
||||||
charsUntilRegEx = {}
|
charsUntilRegEx = {}
|
||||||
|
@ -367,7 +367,7 @@ class HTMLUnicodeInputStream(object):
|
||||||
def unget(self, char):
|
def unget(self, char):
|
||||||
# Only one character is allowed to be ungotten at once - it must
|
# Only one character is allowed to be ungotten at once - it must
|
||||||
# be consumed again before any further call to unget
|
# be consumed again before any further call to unget
|
||||||
if char is not None:
|
if char is not EOF:
|
||||||
if self.chunkOffset == 0:
|
if self.chunkOffset == 0:
|
||||||
# unget is called quite rarely, so it's a good idea to do
|
# unget is called quite rarely, so it's a good idea to do
|
||||||
# more work here if it saves a bit of work in the frequently
|
# more work here if it saves a bit of work in the frequently
|
||||||
|
@ -461,7 +461,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
||||||
if charEncoding[0] is not None:
|
if charEncoding[0] is not None:
|
||||||
return charEncoding
|
return charEncoding
|
||||||
|
|
||||||
# If we've been overriden, we've been overriden
|
# If we've been overridden, we've been overridden
|
||||||
charEncoding = lookupEncoding(self.override_encoding), "certain"
|
charEncoding = lookupEncoding(self.override_encoding), "certain"
|
||||||
if charEncoding[0] is not None:
|
if charEncoding[0] is not None:
|
||||||
return charEncoding
|
return charEncoding
|
||||||
|
@ -530,7 +530,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
||||||
self.rawStream.seek(0)
|
self.rawStream.seek(0)
|
||||||
self.charEncoding = (newEncoding, "certain")
|
self.charEncoding = (newEncoding, "certain")
|
||||||
self.reset()
|
self.reset()
|
||||||
raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
|
raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
|
||||||
|
|
||||||
def detectBOM(self):
|
def detectBOM(self):
|
||||||
"""Attempts to detect a BOM at the start of the stream. If
|
"""Attempts to detect a BOM at the start of the stream. If
|
||||||
|
|
|
@ -13,8 +13,7 @@ class Trie(Mapping):
|
||||||
if prefix is None:
|
if prefix is None:
|
||||||
return set(keys)
|
return set(keys)
|
||||||
|
|
||||||
# Python 2.6: no set comprehensions
|
return {x for x in keys if x.startswith(prefix)}
|
||||||
return set([x for x in keys if x.startswith(prefix)])
|
|
||||||
|
|
||||||
def has_keys_with_prefix(self, prefix):
|
def has_keys_with_prefix(self, prefix):
|
||||||
for key in self.keys():
|
for key in self.keys():
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
import sys
|
|
||||||
from types import ModuleType
|
from types import ModuleType
|
||||||
|
|
||||||
from six import text_type
|
from six import text_type
|
||||||
|
@ -13,11 +12,9 @@ except ImportError:
|
||||||
|
|
||||||
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
|
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
|
||||||
"surrogatePairToCodepoint", "moduleFactoryFactory",
|
"surrogatePairToCodepoint", "moduleFactoryFactory",
|
||||||
"supports_lone_surrogates", "PY27"]
|
"supports_lone_surrogates"]
|
||||||
|
|
||||||
|
|
||||||
PY27 = sys.version_info[0] == 2 and sys.version_info[1] >= 7
|
|
||||||
|
|
||||||
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
|
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
|
||||||
# caught by the below test. In general this would be any platform
|
# caught by the below test. In general this would be any platform
|
||||||
# using UTF-16 as its encoding of unicode strings, such as
|
# using UTF-16 as its encoding of unicode strings, such as
|
||||||
|
|
|
@ -423,7 +423,7 @@ specialElements = frozenset([
|
||||||
])
|
])
|
||||||
|
|
||||||
htmlIntegrationPointElements = frozenset([
|
htmlIntegrationPointElements = frozenset([
|
||||||
(namespaces["mathml"], "annotaion-xml"),
|
(namespaces["mathml"], "annotation-xml"),
|
||||||
(namespaces["svg"], "foreignObject"),
|
(namespaces["svg"], "foreignObject"),
|
||||||
(namespaces["svg"], "desc"),
|
(namespaces["svg"], "desc"),
|
||||||
(namespaces["svg"], "title")
|
(namespaces["svg"], "title")
|
||||||
|
@ -588,7 +588,7 @@ rcdataElements = frozenset([
|
||||||
])
|
])
|
||||||
|
|
||||||
booleanAttributes = {
|
booleanAttributes = {
|
||||||
"": frozenset(["irrelevant"]),
|
"": frozenset(["irrelevant", "itemscope"]),
|
||||||
"style": frozenset(["scoped"]),
|
"style": frozenset(["scoped"]),
|
||||||
"img": frozenset(["ismap"]),
|
"img": frozenset(["ismap"]),
|
||||||
"audio": frozenset(["autoplay", "controls"]),
|
"audio": frozenset(["autoplay", "controls"]),
|
||||||
|
@ -606,6 +606,7 @@ booleanAttributes = {
|
||||||
"input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
|
"input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
|
||||||
"select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
|
"select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
|
||||||
"output": frozenset(["disabled", "readonly"]),
|
"output": frozenset(["disabled", "readonly"]),
|
||||||
|
"iframe": frozenset(["seamless"]),
|
||||||
}
|
}
|
||||||
|
|
||||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
|
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
|
||||||
|
@ -2938,8 +2939,9 @@ prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
|
||||||
|
|
||||||
|
|
||||||
class DataLossWarning(UserWarning):
|
class DataLossWarning(UserWarning):
|
||||||
|
"""Raised when the current tree is unable to represent the input data"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class ReparseException(Exception):
|
class _ReparseException(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -2,19 +2,28 @@ from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
from . import base
|
from . import base
|
||||||
|
|
||||||
try:
|
from collections import OrderedDict
|
||||||
from collections import OrderedDict
|
|
||||||
except ImportError:
|
|
||||||
from ordereddict import OrderedDict
|
def _attr_key(attr):
|
||||||
|
"""Return an appropriate key for an attribute for sorting
|
||||||
|
|
||||||
|
Attributes have a namespace that can be either ``None`` or a string. We
|
||||||
|
can't compare the two because they're different types, so we convert
|
||||||
|
``None`` to an empty string first.
|
||||||
|
|
||||||
|
"""
|
||||||
|
return (attr[0][0] or ''), attr[0][1]
|
||||||
|
|
||||||
|
|
||||||
class Filter(base.Filter):
|
class Filter(base.Filter):
|
||||||
|
"""Alphabetizes attributes for elements"""
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
for token in base.Filter.__iter__(self):
|
for token in base.Filter.__iter__(self):
|
||||||
if token["type"] in ("StartTag", "EmptyTag"):
|
if token["type"] in ("StartTag", "EmptyTag"):
|
||||||
attrs = OrderedDict()
|
attrs = OrderedDict()
|
||||||
for name, value in sorted(token["data"].items(),
|
for name, value in sorted(token["data"].items(),
|
||||||
key=lambda x: x[0]):
|
key=_attr_key):
|
||||||
attrs[name] = value
|
attrs[name] = value
|
||||||
token["data"] = attrs
|
token["data"] = attrs
|
||||||
yield token
|
yield token
|
||||||
|
|
|
@ -4,7 +4,15 @@ from . import base
|
||||||
|
|
||||||
|
|
||||||
class Filter(base.Filter):
|
class Filter(base.Filter):
|
||||||
|
"""Injects ``<meta charset=ENCODING>`` tag into head of document"""
|
||||||
def __init__(self, source, encoding):
|
def __init__(self, source, encoding):
|
||||||
|
"""Creates a Filter
|
||||||
|
|
||||||
|
:arg source: the source token stream
|
||||||
|
|
||||||
|
:arg encoding: the encoding to set
|
||||||
|
|
||||||
|
"""
|
||||||
base.Filter.__init__(self, source)
|
base.Filter.__init__(self, source)
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,19 @@ spaceCharacters = "".join(spaceCharacters)
|
||||||
|
|
||||||
|
|
||||||
class Filter(base.Filter):
|
class Filter(base.Filter):
|
||||||
|
"""Lints the token stream for errors
|
||||||
|
|
||||||
|
If it finds any errors, it'll raise an ``AssertionError``.
|
||||||
|
|
||||||
|
"""
|
||||||
def __init__(self, source, require_matching_tags=True):
|
def __init__(self, source, require_matching_tags=True):
|
||||||
|
"""Creates a Filter
|
||||||
|
|
||||||
|
:arg source: the source token stream
|
||||||
|
|
||||||
|
:arg require_matching_tags: whether or not to require matching tags
|
||||||
|
|
||||||
|
"""
|
||||||
super(Filter, self).__init__(source)
|
super(Filter, self).__init__(source)
|
||||||
self.require_matching_tags = require_matching_tags
|
self.require_matching_tags = require_matching_tags
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,7 @@ from . import base
|
||||||
|
|
||||||
|
|
||||||
class Filter(base.Filter):
|
class Filter(base.Filter):
|
||||||
|
"""Removes optional tags from the token stream"""
|
||||||
def slider(self):
|
def slider(self):
|
||||||
previous1 = previous2 = None
|
previous1 = previous2 = None
|
||||||
for token in self.source:
|
for token in self.source:
|
||||||
|
|
|
@ -705,7 +705,7 @@ data_content_type = re.compile(r'''
|
||||||
|
|
||||||
|
|
||||||
class Filter(base.Filter):
|
class Filter(base.Filter):
|
||||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
"""Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
source,
|
source,
|
||||||
allowed_elements=allowed_elements,
|
allowed_elements=allowed_elements,
|
||||||
|
@ -718,6 +718,37 @@ class Filter(base.Filter):
|
||||||
attr_val_is_uri=attr_val_is_uri,
|
attr_val_is_uri=attr_val_is_uri,
|
||||||
svg_attr_val_allows_ref=svg_attr_val_allows_ref,
|
svg_attr_val_allows_ref=svg_attr_val_allows_ref,
|
||||||
svg_allow_local_href=svg_allow_local_href):
|
svg_allow_local_href=svg_allow_local_href):
|
||||||
|
"""Creates a Filter
|
||||||
|
|
||||||
|
:arg allowed_elements: set of elements to allow--everything else will
|
||||||
|
be escaped
|
||||||
|
|
||||||
|
:arg allowed_attributes: set of attributes to allow in
|
||||||
|
elements--everything else will be stripped
|
||||||
|
|
||||||
|
:arg allowed_css_properties: set of CSS properties to allow--everything
|
||||||
|
else will be stripped
|
||||||
|
|
||||||
|
:arg allowed_css_keywords: set of CSS keywords to allow--everything
|
||||||
|
else will be stripped
|
||||||
|
|
||||||
|
:arg allowed_svg_properties: set of SVG properties to allow--everything
|
||||||
|
else will be removed
|
||||||
|
|
||||||
|
:arg allowed_protocols: set of allowed protocols for URIs
|
||||||
|
|
||||||
|
:arg allowed_content_types: set of allowed content types for ``data`` URIs.
|
||||||
|
|
||||||
|
:arg attr_val_is_uri: set of attributes that have URI values--values
|
||||||
|
that have a scheme not listed in ``allowed_protocols`` are removed
|
||||||
|
|
||||||
|
:arg svg_attr_val_allows_ref: set of SVG attributes that can have
|
||||||
|
references
|
||||||
|
|
||||||
|
:arg svg_allow_local_href: set of SVG elements that can have local
|
||||||
|
hrefs--these are removed
|
||||||
|
|
||||||
|
"""
|
||||||
super(Filter, self).__init__(source)
|
super(Filter, self).__init__(source)
|
||||||
self.allowed_elements = allowed_elements
|
self.allowed_elements = allowed_elements
|
||||||
self.allowed_attributes = allowed_attributes
|
self.allowed_attributes = allowed_attributes
|
||||||
|
@ -737,11 +768,11 @@ class Filter(base.Filter):
|
||||||
yield token
|
yield token
|
||||||
|
|
||||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
||||||
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
|
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
|
||||||
# attributes are parsed, and a restricted set, # specified by
|
# are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
|
||||||
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
|
# ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
|
||||||
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
|
# are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
|
||||||
# in ALLOWED_PROTOCOLS are allowed.
|
# allowed.
|
||||||
#
|
#
|
||||||
# sanitize_html('<script> do_nasty_stuff() </script>')
|
# sanitize_html('<script> do_nasty_stuff() </script>')
|
||||||
# => <script> do_nasty_stuff() </script>
|
# => <script> do_nasty_stuff() </script>
|
||||||
|
@ -782,7 +813,7 @@ class Filter(base.Filter):
|
||||||
# characters, nor why we call unescape. I just know it's always been here.
|
# characters, nor why we call unescape. I just know it's always been here.
|
||||||
# Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
|
# Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
|
||||||
# this will do is remove *more* than it otherwise would.
|
# this will do is remove *more* than it otherwise would.
|
||||||
val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '',
|
val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
|
||||||
unescape(attrs[attr])).lower()
|
unescape(attrs[attr])).lower()
|
||||||
# remove replacement characters from unescaped characters
|
# remove replacement characters from unescaped characters
|
||||||
val_unescaped = val_unescaped.replace("\ufffd", "")
|
val_unescaped = val_unescaped.replace("\ufffd", "")
|
||||||
|
@ -807,7 +838,7 @@ class Filter(base.Filter):
|
||||||
' ',
|
' ',
|
||||||
unescape(attrs[attr]))
|
unescape(attrs[attr]))
|
||||||
if (token["name"] in self.svg_allow_local_href and
|
if (token["name"] in self.svg_allow_local_href and
|
||||||
(namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*',
|
(namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*',
|
||||||
attrs[(namespaces['xlink'], 'href')])):
|
attrs[(namespaces['xlink'], 'href')])):
|
||||||
del attrs[(namespaces['xlink'], 'href')]
|
del attrs[(namespaces['xlink'], 'href')]
|
||||||
if (None, 'style') in attrs:
|
if (None, 'style') in attrs:
|
||||||
|
@ -837,16 +868,16 @@ class Filter(base.Filter):
|
||||||
|
|
||||||
def sanitize_css(self, style):
|
def sanitize_css(self, style):
|
||||||
# disallow urls
|
# disallow urls
|
||||||
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
|
style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
|
||||||
|
|
||||||
# gauntlet
|
# gauntlet
|
||||||
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
|
if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
|
||||||
return ''
|
return ''
|
||||||
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
|
if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
clean = []
|
clean = []
|
||||||
for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
|
for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
|
||||||
if not value:
|
if not value:
|
||||||
continue
|
continue
|
||||||
if prop.lower() in self.allowed_css_properties:
|
if prop.lower() in self.allowed_css_properties:
|
||||||
|
@ -855,7 +886,7 @@ class Filter(base.Filter):
|
||||||
'padding']:
|
'padding']:
|
||||||
for keyword in value.split():
|
for keyword in value.split():
|
||||||
if keyword not in self.allowed_css_keywords and \
|
if keyword not in self.allowed_css_keywords and \
|
||||||
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
|
not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
clean.append(prop + ': ' + value + ';')
|
clean.append(prop + ': ' + value + ';')
|
||||||
|
|
|
@ -10,7 +10,7 @@ SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
|
||||||
|
|
||||||
|
|
||||||
class Filter(base.Filter):
|
class Filter(base.Filter):
|
||||||
|
"""Collapses whitespace except in pre, textarea, and script elements"""
|
||||||
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
|
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
|
|
|
@ -1,12 +1,8 @@
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
from six import with_metaclass, viewkeys, PY3
|
from six import with_metaclass, viewkeys
|
||||||
|
|
||||||
import types
|
import types
|
||||||
|
from collections import OrderedDict
|
||||||
try:
|
|
||||||
from collections import OrderedDict
|
|
||||||
except ImportError:
|
|
||||||
from ordereddict import OrderedDict
|
|
||||||
|
|
||||||
from . import _inputstream
|
from . import _inputstream
|
||||||
from . import _tokenizer
|
from . import _tokenizer
|
||||||
|
@ -24,18 +20,53 @@ from .constants import (
|
||||||
adjustForeignAttributes as adjustForeignAttributesMap,
|
adjustForeignAttributes as adjustForeignAttributesMap,
|
||||||
adjustMathMLAttributes, adjustSVGAttributes,
|
adjustMathMLAttributes, adjustSVGAttributes,
|
||||||
E,
|
E,
|
||||||
ReparseException
|
_ReparseException
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
|
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
|
||||||
"""Parse a string or file-like object into a tree"""
|
"""Parse an HTML document as a string or file-like object into a tree
|
||||||
|
|
||||||
|
:arg doc: the document to parse as a string or file-like object
|
||||||
|
|
||||||
|
:arg treebuilder: the treebuilder to use when parsing
|
||||||
|
|
||||||
|
:arg namespaceHTMLElements: whether or not to namespace HTML elements
|
||||||
|
|
||||||
|
:returns: parsed tree
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from html5lib.html5parser import parse
|
||||||
|
>>> parse('<html><body><p>This is a doc</p></body></html>')
|
||||||
|
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
|
||||||
|
|
||||||
|
"""
|
||||||
tb = treebuilders.getTreeBuilder(treebuilder)
|
tb = treebuilders.getTreeBuilder(treebuilder)
|
||||||
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
|
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
|
||||||
return p.parse(doc, **kwargs)
|
return p.parse(doc, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
|
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
|
||||||
|
"""Parse an HTML fragment as a string or file-like object into a tree
|
||||||
|
|
||||||
|
:arg doc: the fragment to parse as a string or file-like object
|
||||||
|
|
||||||
|
:arg container: the container context to parse the fragment in
|
||||||
|
|
||||||
|
:arg treebuilder: the treebuilder to use when parsing
|
||||||
|
|
||||||
|
:arg namespaceHTMLElements: whether or not to namespace HTML elements
|
||||||
|
|
||||||
|
:returns: parsed tree
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from html5lib.html5parser import parseFragment
|
||||||
|
>>> parseFragment('<b>this is a fragment</b>')
|
||||||
|
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
|
||||||
|
|
||||||
|
"""
|
||||||
tb = treebuilders.getTreeBuilder(treebuilder)
|
tb = treebuilders.getTreeBuilder(treebuilder)
|
||||||
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
|
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
|
||||||
return p.parseFragment(doc, container=container, **kwargs)
|
return p.parseFragment(doc, container=container, **kwargs)
|
||||||
|
@ -54,16 +85,30 @@ def method_decorator_metaclass(function):
|
||||||
|
|
||||||
|
|
||||||
class HTMLParser(object):
|
class HTMLParser(object):
|
||||||
"""HTML parser. Generates a tree structure from a stream of (possibly
|
"""HTML parser
|
||||||
malformed) HTML"""
|
|
||||||
|
Generates a tree structure from a stream of (possibly malformed) HTML.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
|
def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
|
||||||
"""
|
"""
|
||||||
strict - raise an exception when a parse error is encountered
|
:arg tree: a treebuilder class controlling the type of tree that will be
|
||||||
|
returned. Built in treebuilders can be accessed through
|
||||||
|
html5lib.treebuilders.getTreeBuilder(treeType)
|
||||||
|
|
||||||
|
:arg strict: raise an exception when a parse error is encountered
|
||||||
|
|
||||||
|
:arg namespaceHTMLElements: whether or not to namespace HTML elements
|
||||||
|
|
||||||
|
:arg debug: whether or not to enable debug mode which logs things
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from html5lib.html5parser import HTMLParser
|
||||||
|
>>> parser = HTMLParser() # generates parser with etree builder
|
||||||
|
>>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict
|
||||||
|
|
||||||
tree - a treebuilder class controlling the type of tree that will be
|
|
||||||
returned. Built in treebuilders can be accessed through
|
|
||||||
html5lib.treebuilders.getTreeBuilder(treeType)
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Raise an exception on the first error encountered
|
# Raise an exception on the first error encountered
|
||||||
|
@ -87,7 +132,7 @@ class HTMLParser(object):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.mainLoop()
|
self.mainLoop()
|
||||||
except ReparseException:
|
except _ReparseException:
|
||||||
self.reset()
|
self.reset()
|
||||||
self.mainLoop()
|
self.mainLoop()
|
||||||
|
|
||||||
|
@ -127,9 +172,8 @@ class HTMLParser(object):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def documentEncoding(self):
|
def documentEncoding(self):
|
||||||
"""The name of the character encoding
|
"""Name of the character encoding that was used to decode the input stream, or
|
||||||
that was used to decode the input stream,
|
:obj:`None` if that is not determined yet
|
||||||
or :obj:`None` if that is not determined yet.
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if not hasattr(self, 'tokenizer'):
|
if not hasattr(self, 'tokenizer'):
|
||||||
|
@ -223,14 +267,24 @@ class HTMLParser(object):
|
||||||
def parse(self, stream, *args, **kwargs):
|
def parse(self, stream, *args, **kwargs):
|
||||||
"""Parse a HTML document into a well-formed tree
|
"""Parse a HTML document into a well-formed tree
|
||||||
|
|
||||||
stream - a filelike object or string containing the HTML to be parsed
|
:arg stream: a file-like object or string containing the HTML to be parsed
|
||||||
|
|
||||||
The optional encoding parameter must be a string that indicates
|
The optional encoding parameter must be a string that indicates
|
||||||
the encoding. If specified, that encoding will be used,
|
the encoding. If specified, that encoding will be used,
|
||||||
regardless of any BOM or later declaration (such as in a meta
|
regardless of any BOM or later declaration (such as in a meta
|
||||||
element)
|
element).
|
||||||
|
|
||||||
|
:arg scripting: treat noscript elements as if JavaScript was turned on
|
||||||
|
|
||||||
|
:returns: parsed tree
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from html5lib.html5parser import HTMLParser
|
||||||
|
>>> parser = HTMLParser()
|
||||||
|
>>> parser.parse('<html><body><p>This is a doc</p></body></html>')
|
||||||
|
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
|
||||||
|
|
||||||
scripting - treat noscript elements as if javascript was turned on
|
|
||||||
"""
|
"""
|
||||||
self._parse(stream, False, None, *args, **kwargs)
|
self._parse(stream, False, None, *args, **kwargs)
|
||||||
return self.tree.getDocument()
|
return self.tree.getDocument()
|
||||||
|
@ -238,17 +292,27 @@ class HTMLParser(object):
|
||||||
def parseFragment(self, stream, *args, **kwargs):
|
def parseFragment(self, stream, *args, **kwargs):
|
||||||
"""Parse a HTML fragment into a well-formed tree fragment
|
"""Parse a HTML fragment into a well-formed tree fragment
|
||||||
|
|
||||||
container - name of the element we're setting the innerHTML property
|
:arg container: name of the element we're setting the innerHTML
|
||||||
if set to None, default to 'div'
|
property; if set to None, defaults to 'div'
|
||||||
|
|
||||||
stream - a filelike object or string containing the HTML to be parsed
|
:arg stream: a file-like object or string containing the HTML to be parsed
|
||||||
|
|
||||||
The optional encoding parameter must be a string that indicates
|
The optional encoding parameter must be a string that indicates
|
||||||
the encoding. If specified, that encoding will be used,
|
the encoding. If specified, that encoding will be used,
|
||||||
regardless of any BOM or later declaration (such as in a meta
|
regardless of any BOM or later declaration (such as in a meta
|
||||||
element)
|
element)
|
||||||
|
|
||||||
|
:arg scripting: treat noscript elements as if JavaScript was turned on
|
||||||
|
|
||||||
|
:returns: parsed tree
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from html5lib.html5parser import HTMLParser
|
||||||
|
>>> parser = HTMLParser()
|
||||||
|
>>> parser.parseFragment('<b>this is a fragment</b>')
|
||||||
|
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
|
||||||
|
|
||||||
scripting - treat noscript elements as if javascript was turned on
|
|
||||||
"""
|
"""
|
||||||
self._parse(stream, True, *args, **kwargs)
|
self._parse(stream, True, *args, **kwargs)
|
||||||
return self.tree.getFragment()
|
return self.tree.getFragment()
|
||||||
|
@ -262,8 +326,7 @@ class HTMLParser(object):
|
||||||
raise ParseError(E[errorcode] % datavars)
|
raise ParseError(E[errorcode] % datavars)
|
||||||
|
|
||||||
def normalizeToken(self, token):
|
def normalizeToken(self, token):
|
||||||
""" HTML5 specific normalizations to the token stream """
|
# HTML5 specific normalizations to the token stream
|
||||||
|
|
||||||
if token["type"] == tokenTypes["StartTag"]:
|
if token["type"] == tokenTypes["StartTag"]:
|
||||||
raw = token["data"]
|
raw = token["data"]
|
||||||
token["data"] = OrderedDict(raw)
|
token["data"] = OrderedDict(raw)
|
||||||
|
@ -331,9 +394,7 @@ class HTMLParser(object):
|
||||||
self.phase = new_phase
|
self.phase = new_phase
|
||||||
|
|
||||||
def parseRCDataRawtext(self, token, contentType):
|
def parseRCDataRawtext(self, token, contentType):
|
||||||
"""Generic RCDATA/RAWTEXT Parsing algorithm
|
# Generic RCDATA/RAWTEXT Parsing algorithm
|
||||||
contentType - RCDATA or RAWTEXT
|
|
||||||
"""
|
|
||||||
assert contentType in ("RAWTEXT", "RCDATA")
|
assert contentType in ("RAWTEXT", "RCDATA")
|
||||||
|
|
||||||
self.tree.insertElement(token)
|
self.tree.insertElement(token)
|
||||||
|
@ -2711,10 +2772,7 @@ def getPhases(debug):
|
||||||
|
|
||||||
|
|
||||||
def adjust_attributes(token, replacements):
|
def adjust_attributes(token, replacements):
|
||||||
if PY3 or _utils.PY27:
|
needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
|
||||||
needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
|
|
||||||
else:
|
|
||||||
needs_adjustment = frozenset(token['data']) & frozenset(replacements)
|
|
||||||
if needs_adjustment:
|
if needs_adjustment:
|
||||||
token['data'] = OrderedDict((replacements.get(k, k), v)
|
token['data'] = OrderedDict((replacements.get(k, k), v)
|
||||||
for k, v in token['data'].items())
|
for k, v in token['data'].items())
|
||||||
|
|
|
@ -68,10 +68,33 @@ def htmlentityreplace_errors(exc):
|
||||||
else:
|
else:
|
||||||
return xmlcharrefreplace_errors(exc)
|
return xmlcharrefreplace_errors(exc)
|
||||||
|
|
||||||
|
|
||||||
register_error("htmlentityreplace", htmlentityreplace_errors)
|
register_error("htmlentityreplace", htmlentityreplace_errors)
|
||||||
|
|
||||||
|
|
||||||
def serialize(input, tree="etree", encoding=None, **serializer_opts):
|
def serialize(input, tree="etree", encoding=None, **serializer_opts):
|
||||||
|
"""Serializes the input token stream using the specified treewalker
|
||||||
|
|
||||||
|
:arg input: the token stream to serialize
|
||||||
|
|
||||||
|
:arg tree: the treewalker to use
|
||||||
|
|
||||||
|
:arg encoding: the encoding to use
|
||||||
|
|
||||||
|
:arg serializer_opts: any options to pass to the
|
||||||
|
:py:class:`html5lib.serializer.HTMLSerializer` that gets created
|
||||||
|
|
||||||
|
:returns: the tree serialized as a string
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from html5lib.html5parser import parse
|
||||||
|
>>> from html5lib.serializer import serialize
|
||||||
|
>>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
|
||||||
|
>>> serialize(token_stream, omit_optional_tags=False)
|
||||||
|
'<html><head></head><body><p>Hi!</p></body></html>'
|
||||||
|
|
||||||
|
"""
|
||||||
# XXX: Should we cache this?
|
# XXX: Should we cache this?
|
||||||
walker = treewalkers.getTreeWalker(tree)
|
walker = treewalkers.getTreeWalker(tree)
|
||||||
s = HTMLSerializer(**serializer_opts)
|
s = HTMLSerializer(**serializer_opts)
|
||||||
|
@ -110,50 +133,83 @@ class HTMLSerializer(object):
|
||||||
"strip_whitespace", "sanitize")
|
"strip_whitespace", "sanitize")
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
"""Initialize HTMLSerializer.
|
"""Initialize HTMLSerializer
|
||||||
|
|
||||||
Keyword options (default given first unless specified) include:
|
:arg inject_meta_charset: Whether or not to inject the meta charset.
|
||||||
|
|
||||||
inject_meta_charset=True|False
|
Defaults to ``True``.
|
||||||
Whether it insert a meta element to define the character set of the
|
|
||||||
document.
|
:arg quote_attr_values: Whether to quote attribute values that don't
|
||||||
quote_attr_values="legacy"|"spec"|"always"
|
require quoting per legacy browser behavior (``"legacy"``), when
|
||||||
Whether to quote attribute values that don't require quoting
|
required by the standard (``"spec"``), or always (``"always"``).
|
||||||
per legacy browser behaviour, when required by the standard, or always.
|
|
||||||
quote_char=u'"'|u"'"
|
Defaults to ``"legacy"``.
|
||||||
Use given quote character for attribute quoting. Default is to
|
|
||||||
use double quote unless attribute value contains a double quote,
|
:arg quote_char: Use given quote character for attribute quoting.
|
||||||
in which case single quotes are used instead.
|
|
||||||
escape_lt_in_attrs=False|True
|
Defaults to ``"`` which will use double quotes unless attribute
|
||||||
Whether to escape < in attribute values.
|
value contains a double quote, in which case single quotes are
|
||||||
escape_rcdata=False|True
|
used.
|
||||||
Whether to escape characters that need to be escaped within normal
|
|
||||||
elements within rcdata elements such as style.
|
:arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
|
||||||
resolve_entities=True|False
|
values.
|
||||||
Whether to resolve named character entities that appear in the
|
|
||||||
source tree. The XML predefined entities < > & " '
|
Defaults to ``False``.
|
||||||
are unaffected by this setting.
|
|
||||||
strip_whitespace=False|True
|
:arg escape_rcdata: Whether to escape characters that need to be
|
||||||
Whether to remove semantically meaningless whitespace. (This
|
escaped within normal elements within rcdata elements such as
|
||||||
compresses all whitespace to a single space except within pre.)
|
style.
|
||||||
minimize_boolean_attributes=True|False
|
|
||||||
Shortens boolean attributes to give just the attribute value,
|
Defaults to ``False``.
|
||||||
for example <input disabled="disabled"> becomes <input disabled>.
|
|
||||||
use_trailing_solidus=False|True
|
:arg resolve_entities: Whether to resolve named character entities that
|
||||||
Includes a close-tag slash at the end of the start tag of void
|
appear in the source tree. The XML predefined entities < >
|
||||||
elements (empty elements whose end tag is forbidden). E.g. <hr/>.
|
& " ' are unaffected by this setting.
|
||||||
space_before_trailing_solidus=True|False
|
|
||||||
Places a space immediately before the closing slash in a tag
|
Defaults to ``True``.
|
||||||
using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
|
|
||||||
sanitize=False|True
|
:arg strip_whitespace: Whether to remove semantically meaningless
|
||||||
Strip all unsafe or unknown constructs from output.
|
whitespace. (This compresses all whitespace to a single space
|
||||||
See `html5lib user documentation`_
|
except within ``pre``.)
|
||||||
omit_optional_tags=True|False
|
|
||||||
Omit start/end tags that are optional.
|
Defaults to ``False``.
|
||||||
alphabetical_attributes=False|True
|
|
||||||
Reorder attributes to be in alphabetical order.
|
:arg minimize_boolean_attributes: Shortens boolean attributes to give
|
||||||
|
just the attribute value, for example::
|
||||||
|
|
||||||
|
<input disabled="disabled">
|
||||||
|
|
||||||
|
becomes::
|
||||||
|
|
||||||
|
<input disabled>
|
||||||
|
|
||||||
|
Defaults to ``True``.
|
||||||
|
|
||||||
|
:arg use_trailing_solidus: Includes a close-tag slash at the end of the
|
||||||
|
start tag of void elements (empty elements whose end tag is
|
||||||
|
forbidden). E.g. ``<hr/>``.
|
||||||
|
|
||||||
|
Defaults to ``False``.
|
||||||
|
|
||||||
|
:arg space_before_trailing_solidus: Places a space immediately before
|
||||||
|
the closing slash in a tag using a trailing solidus. E.g.
|
||||||
|
``<hr />``. Requires ``use_trailing_solidus=True``.
|
||||||
|
|
||||||
|
Defaults to ``True``.
|
||||||
|
|
||||||
|
:arg sanitize: Strip all unsafe or unknown constructs from output.
|
||||||
|
See :py:class:`html5lib.filters.sanitizer.Filter`.
|
||||||
|
|
||||||
|
Defaults to ``False``.
|
||||||
|
|
||||||
|
:arg omit_optional_tags: Omit start/end tags that are optional.
|
||||||
|
|
||||||
|
Defaults to ``True``.
|
||||||
|
|
||||||
|
:arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
|
||||||
|
|
||||||
|
Defaults to ``False``.
|
||||||
|
|
||||||
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
|
|
||||||
"""
|
"""
|
||||||
unexpected_args = frozenset(kwargs) - frozenset(self.options)
|
unexpected_args = frozenset(kwargs) - frozenset(self.options)
|
||||||
if len(unexpected_args) > 0:
|
if len(unexpected_args) > 0:
|
||||||
|
@ -218,7 +274,7 @@ class HTMLSerializer(object):
|
||||||
if token["systemId"]:
|
if token["systemId"]:
|
||||||
if token["systemId"].find('"') >= 0:
|
if token["systemId"].find('"') >= 0:
|
||||||
if token["systemId"].find("'") >= 0:
|
if token["systemId"].find("'") >= 0:
|
||||||
self.serializeError("System identifer contains both single and double quote characters")
|
self.serializeError("System identifier contains both single and double quote characters")
|
||||||
quote_char = "'"
|
quote_char = "'"
|
||||||
else:
|
else:
|
||||||
quote_char = '"'
|
quote_char = '"'
|
||||||
|
@ -317,6 +373,25 @@ class HTMLSerializer(object):
|
||||||
self.serializeError(token["data"])
|
self.serializeError(token["data"])
|
||||||
|
|
||||||
def render(self, treewalker, encoding=None):
|
def render(self, treewalker, encoding=None):
|
||||||
|
"""Serializes the stream from the treewalker into a string
|
||||||
|
|
||||||
|
:arg treewalker: the treewalker to serialize
|
||||||
|
|
||||||
|
:arg encoding: the string encoding to use
|
||||||
|
|
||||||
|
:returns: the serialized tree
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from html5lib import parse, getTreeWalker
|
||||||
|
>>> from html5lib.serializer import HTMLSerializer
|
||||||
|
>>> token_stream = parse('<html><body>Hi!</body></html>')
|
||||||
|
>>> walker = getTreeWalker('etree')
|
||||||
|
>>> serializer = HTMLSerializer(omit_optional_tags=False)
|
||||||
|
>>> serializer.render(walker(token_stream))
|
||||||
|
'<html><head></head><body>Hi!</body></html>'
|
||||||
|
|
||||||
|
"""
|
||||||
if encoding:
|
if encoding:
|
||||||
return b"".join(list(self.serialize(treewalker, encoding)))
|
return b"".join(list(self.serialize(treewalker, encoding)))
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -1,3 +1,21 @@
|
||||||
|
"""Tree adapters let you convert from one tree structure to another
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
import html5lib
|
||||||
|
from html5lib.treeadapters import genshi
|
||||||
|
|
||||||
|
doc = '<html><body>Hi!</body></html>'
|
||||||
|
treebuilder = html5lib.getTreeBuilder('etree')
|
||||||
|
parser = html5lib.HTMLParser(tree=treebuilder)
|
||||||
|
tree = parser.parse(doc)
|
||||||
|
TreeWalker = html5lib.getTreeWalker('etree')
|
||||||
|
|
||||||
|
genshi_tree = genshi.to_genshi(TreeWalker(tree))
|
||||||
|
|
||||||
|
"""
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
from . import sax
|
from . import sax
|
||||||
|
|
|
@ -5,6 +5,13 @@ from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
|
||||||
|
|
||||||
|
|
||||||
def to_genshi(walker):
|
def to_genshi(walker):
|
||||||
|
"""Convert a tree to a genshi tree
|
||||||
|
|
||||||
|
:arg walker: the treewalker to use to walk the tree to convert it
|
||||||
|
|
||||||
|
:returns: generator of genshi nodes
|
||||||
|
|
||||||
|
"""
|
||||||
text = []
|
text = []
|
||||||
for token in walker:
|
for token in walker:
|
||||||
type = token["type"]
|
type = token["type"]
|
||||||
|
|
|
@ -11,7 +11,13 @@ for prefix, localName, namespace in adjustForeignAttributes.values():
|
||||||
|
|
||||||
|
|
||||||
def to_sax(walker, handler):
|
def to_sax(walker, handler):
|
||||||
"""Call SAX-like content handler based on treewalker walker"""
|
"""Call SAX-like content handler based on treewalker walker
|
||||||
|
|
||||||
|
:arg walker: the treewalker to use to walk the tree to convert it
|
||||||
|
|
||||||
|
:arg handler: SAX handler to use
|
||||||
|
|
||||||
|
"""
|
||||||
handler.startDocument()
|
handler.startDocument()
|
||||||
for prefix, namespace in prefix_mapping.items():
|
for prefix, namespace in prefix_mapping.items():
|
||||||
handler.startPrefixMapping(prefix, namespace)
|
handler.startPrefixMapping(prefix, namespace)
|
||||||
|
|
|
@ -1,29 +1,32 @@
|
||||||
"""A collection of modules for building different kinds of tree from
|
"""A collection of modules for building different kinds of trees from HTML
|
||||||
HTML documents.
|
documents.
|
||||||
|
|
||||||
To create a treebuilder for a new type of tree, you need to
|
To create a treebuilder for a new type of tree, you need to
|
||||||
implement several things:
|
implement several things:
|
||||||
|
|
||||||
1) A set of classes for various types of elements: Document, Doctype,
|
1. A set of classes for various types of elements: Document, Doctype, Comment,
|
||||||
Comment, Element. These must implement the interface of
|
Element. These must implement the interface of ``treebuilders.base.Node``
|
||||||
_base.treebuilders.Node (although comment nodes have a different
|
(although comment nodes have a different signature for their constructor,
|
||||||
signature for their constructor, see treebuilders.etree.Comment)
|
see ``treebuilders.etree.Comment``). Textual content may also be implemented
|
||||||
Textual content may also be implemented as another node type, or not, as
|
as another node type, or not, as your tree implementation requires.
|
||||||
your tree implementation requires.
|
|
||||||
|
|
||||||
2) A treebuilder object (called TreeBuilder by convention) that
|
2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
|
||||||
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
|
from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:
|
||||||
documentClass - the class to use for the bottommost node of a document
|
|
||||||
elementClass - the class to use for HTML Elements
|
* ``documentClass`` - the class to use for the bottommost node of a document
|
||||||
commentClass - the class to use for comments
|
* ``elementClass`` - the class to use for HTML Elements
|
||||||
doctypeClass - the class to use for doctypes
|
* ``commentClass`` - the class to use for comments
|
||||||
It also has one required method:
|
* ``doctypeClass`` - the class to use for doctypes
|
||||||
getDocument - Returns the root node of the complete document tree
|
|
||||||
|
It also has one required method:
|
||||||
|
|
||||||
|
* ``getDocument`` - Returns the root node of the complete document tree
|
||||||
|
|
||||||
|
3. If you wish to run the unit tests, you must also create a ``testSerializer``
|
||||||
|
method on your treebuilder which accepts a node and returns a string
|
||||||
|
containing Node and its children serialized according to the format used in
|
||||||
|
the unittests
|
||||||
|
|
||||||
3) If you wish to run the unit tests, you must also create a
|
|
||||||
testSerializer method on your treebuilder which accepts a node and
|
|
||||||
returns a string containing Node and its children serialized according
|
|
||||||
to the format used in the unittests
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
@ -34,23 +37,32 @@ treeBuilderCache = {}
|
||||||
|
|
||||||
|
|
||||||
def getTreeBuilder(treeType, implementation=None, **kwargs):
|
def getTreeBuilder(treeType, implementation=None, **kwargs):
|
||||||
"""Get a TreeBuilder class for various types of tree with built-in support
|
"""Get a TreeBuilder class for various types of trees with built-in support
|
||||||
|
|
||||||
treeType - the name of the tree type required (case-insensitive). Supported
|
:arg treeType: the name of the tree type required (case-insensitive). Supported
|
||||||
values are:
|
values are:
|
||||||
|
|
||||||
"dom" - A generic builder for DOM implementations, defaulting to
|
* "dom" - A generic builder for DOM implementations, defaulting to a
|
||||||
a xml.dom.minidom based implementation.
|
xml.dom.minidom based implementation.
|
||||||
"etree" - A generic builder for tree implementations exposing an
|
* "etree" - A generic builder for tree implementations exposing an
|
||||||
ElementTree-like interface, defaulting to
|
ElementTree-like interface, defaulting to xml.etree.cElementTree if
|
||||||
xml.etree.cElementTree if available and
|
available and xml.etree.ElementTree if not.
|
||||||
xml.etree.ElementTree if not.
|
* "lxml" - A etree-based builder for lxml.etree, handling limitations
|
||||||
"lxml" - A etree-based builder for lxml.etree, handling
|
of lxml's implementation.
|
||||||
limitations of lxml's implementation.
|
|
||||||
|
|
||||||
implementation - (Currently applies to the "etree" and "dom" tree types). A
|
:arg implementation: (Currently applies to the "etree" and "dom" tree
|
||||||
module implementing the tree type e.g.
|
types). A module implementing the tree type e.g. xml.etree.ElementTree
|
||||||
xml.etree.ElementTree or xml.etree.cElementTree."""
|
or xml.etree.cElementTree.
|
||||||
|
|
||||||
|
:arg kwargs: Any additional options to pass to the TreeBuilder when
|
||||||
|
creating it.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from html5lib.treebuilders import getTreeBuilder
|
||||||
|
>>> builder = getTreeBuilder('etree')
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
treeType = treeType.lower()
|
treeType = treeType.lower()
|
||||||
if treeType not in treeBuilderCache:
|
if treeType not in treeBuilderCache:
|
||||||
|
|
|
@ -21,22 +21,25 @@ listElementsMap = {
|
||||||
|
|
||||||
|
|
||||||
class Node(object):
|
class Node(object):
|
||||||
|
"""Represents an item in the tree"""
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
"""Node representing an item in the tree.
|
"""Creates a Node
|
||||||
name - The tag name associated with the node
|
|
||||||
parent - The parent of the current node (or None for the document node)
|
:arg name: The tag name associated with the node
|
||||||
value - The value of the current node (applies to text nodes and
|
|
||||||
comments
|
|
||||||
attributes - a dict holding name, value pairs for attributes of the node
|
|
||||||
childNodes - a list of child nodes of the current node. This must
|
|
||||||
include all elements but not necessarily other node types
|
|
||||||
_flags - A list of miscellaneous flags that can be set on the node
|
|
||||||
"""
|
"""
|
||||||
|
# The tag name associated with the node
|
||||||
self.name = name
|
self.name = name
|
||||||
|
# The parent of the current node (or None for the document node)
|
||||||
self.parent = None
|
self.parent = None
|
||||||
|
# The value of the current node (applies to text nodes and comments)
|
||||||
self.value = None
|
self.value = None
|
||||||
|
# A dict holding name -> value pairs for attributes of the node
|
||||||
self.attributes = {}
|
self.attributes = {}
|
||||||
|
# A list of child nodes of the current node. This must include all
|
||||||
|
# elements but not necessarily other node types.
|
||||||
self.childNodes = []
|
self.childNodes = []
|
||||||
|
# A list of miscellaneous flags that can be set on the node.
|
||||||
self._flags = []
|
self._flags = []
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
|
@ -53,23 +56,41 @@ class Node(object):
|
||||||
|
|
||||||
def appendChild(self, node):
|
def appendChild(self, node):
|
||||||
"""Insert node as a child of the current node
|
"""Insert node as a child of the current node
|
||||||
|
|
||||||
|
:arg node: the node to insert
|
||||||
|
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
def insertText(self, data, insertBefore=None):
|
||||||
"""Insert data as text in the current node, positioned before the
|
"""Insert data as text in the current node, positioned before the
|
||||||
start of node insertBefore or to the end of the node's text.
|
start of node insertBefore or to the end of the node's text.
|
||||||
|
|
||||||
|
:arg data: the data to insert
|
||||||
|
|
||||||
|
:arg insertBefore: the node before whose start the text is inserted; if
|
||||||
|
None, the text is appended to the end of the node's text
|
||||||
|
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
def insertBefore(self, node, refNode):
|
||||||
"""Insert node as a child of the current node, before refNode in the
|
"""Insert node as a child of the current node, before refNode in the
|
||||||
list of child nodes. Raises ValueError if refNode is not a child of
|
list of child nodes. Raises ValueError if refNode is not a child of
|
||||||
the current node"""
|
the current node
|
||||||
|
|
||||||
|
:arg node: the node to insert
|
||||||
|
|
||||||
|
:arg refNode: the child node to insert the node before
|
||||||
|
|
||||||
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def removeChild(self, node):
|
def removeChild(self, node):
|
||||||
"""Remove node from the children of the current node
|
"""Remove node from the children of the current node
|
||||||
|
|
||||||
|
:arg node: the child node to remove
|
||||||
|
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@ -77,6 +98,9 @@ class Node(object):
|
||||||
"""Move all the children of the current node to newParent.
|
"""Move all the children of the current node to newParent.
|
||||||
This is needed so that trees that don't store text as nodes move the
|
This is needed so that trees that don't store text as nodes move the
|
||||||
text in the correct way
|
text in the correct way
|
||||||
|
|
||||||
|
:arg newParent: the node to move all this node's children to
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# XXX - should this method be made more general?
|
# XXX - should this method be made more general?
|
||||||
for child in self.childNodes:
|
for child in self.childNodes:
|
||||||
|
@ -121,10 +145,12 @@ class ActiveFormattingElements(list):
|
||||||
|
|
||||||
class TreeBuilder(object):
|
class TreeBuilder(object):
|
||||||
"""Base treebuilder implementation
|
"""Base treebuilder implementation
|
||||||
documentClass - the class to use for the bottommost node of a document
|
|
||||||
elementClass - the class to use for HTML Elements
|
* documentClass - the class to use for the bottommost node of a document
|
||||||
commentClass - the class to use for comments
|
* elementClass - the class to use for HTML Elements
|
||||||
doctypeClass - the class to use for doctypes
|
* commentClass - the class to use for comments
|
||||||
|
* doctypeClass - the class to use for doctypes
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# pylint:disable=not-callable
|
# pylint:disable=not-callable
|
||||||
|
|
||||||
|
@ -144,6 +170,11 @@ class TreeBuilder(object):
|
||||||
fragmentClass = None
|
fragmentClass = None
|
||||||
|
|
||||||
def __init__(self, namespaceHTMLElements):
|
def __init__(self, namespaceHTMLElements):
|
||||||
|
"""Create a TreeBuilder
|
||||||
|
|
||||||
|
:arg namespaceHTMLElements: whether or not to namespace HTML elements
|
||||||
|
|
||||||
|
"""
|
||||||
if namespaceHTMLElements:
|
if namespaceHTMLElements:
|
||||||
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
|
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
|
||||||
else:
|
else:
|
||||||
|
@ -367,11 +398,11 @@ class TreeBuilder(object):
|
||||||
self.generateImpliedEndTags(exclude)
|
self.generateImpliedEndTags(exclude)
|
||||||
|
|
||||||
def getDocument(self):
|
def getDocument(self):
|
||||||
"Return the final tree"
|
"""Return the final tree"""
|
||||||
return self.document
|
return self.document
|
||||||
|
|
||||||
def getFragment(self):
|
def getFragment(self):
|
||||||
"Return the final fragment"
|
"""Return the final fragment"""
|
||||||
# assert self.innerHTML
|
# assert self.innerHTML
|
||||||
fragment = self.fragmentClass()
|
fragment = self.fragmentClass()
|
||||||
self.openElements[0].reparentChildren(fragment)
|
self.openElements[0].reparentChildren(fragment)
|
||||||
|
@ -379,5 +410,8 @@ class TreeBuilder(object):
|
||||||
|
|
||||||
def testSerializer(self, node):
|
def testSerializer(self, node):
|
||||||
"""Serialize the subtree of node in the format required by unit tests
|
"""Serialize the subtree of node in the format required by unit tests
|
||||||
node - the node from which to start serializing"""
|
|
||||||
|
:arg node: the node from which to start serializing
|
||||||
|
|
||||||
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
|
@ -309,7 +309,6 @@ class TreeBuilder(base.TreeBuilder):
|
||||||
super(TreeBuilder, self).insertComment(data, parent)
|
super(TreeBuilder, self).insertComment(data, parent)
|
||||||
|
|
||||||
def insertRoot(self, token):
|
def insertRoot(self, token):
|
||||||
"""Create the document root"""
|
|
||||||
# Because of the way libxml2 works, it doesn't seem to be possible to
|
# Because of the way libxml2 works, it doesn't seem to be possible to
|
||||||
# alter information like the doctype after the tree has been parsed.
|
# alter information like the doctype after the tree has been parsed.
|
||||||
# Therefore we need to use the built-in parser to create our initial
|
# Therefore we need to use the built-in parser to create our initial
|
||||||
|
|
|
@ -2,10 +2,10 @@
|
||||||
tree, generating tokens identical to those produced by the tokenizer
|
tree, generating tokens identical to those produced by the tokenizer
|
||||||
module.
|
module.
|
||||||
|
|
||||||
To create a tree walker for a new type of tree, you need to do
|
To create a tree walker for a new type of tree, you need to
|
||||||
implement a tree walker object (called TreeWalker by convention) that
|
implement a tree walker object (called TreeWalker by convention) that
|
||||||
implements a 'serialize' method taking a tree as sole argument and
|
implements a 'serialize' method which takes a tree as sole argument and
|
||||||
returning an iterator generating tokens.
|
returns an iterator which generates tokens.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
@ -13,7 +13,7 @@ from __future__ import absolute_import, division, unicode_literals
|
||||||
from .. import constants
|
from .. import constants
|
||||||
from .._utils import default_etree
|
from .._utils import default_etree
|
||||||
|
|
||||||
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"]
|
__all__ = ["getTreeWalker", "pprint"]
|
||||||
|
|
||||||
treeWalkerCache = {}
|
treeWalkerCache = {}
|
||||||
|
|
||||||
|
@ -21,20 +21,25 @@ treeWalkerCache = {}
|
||||||
def getTreeWalker(treeType, implementation=None, **kwargs):
|
def getTreeWalker(treeType, implementation=None, **kwargs):
|
||||||
"""Get a TreeWalker class for various types of tree with built-in support
|
"""Get a TreeWalker class for various types of tree with built-in support
|
||||||
|
|
||||||
Args:
|
:arg str treeType: the name of the tree type required (case-insensitive).
|
||||||
treeType (str): the name of the tree type required (case-insensitive).
|
Supported values are:
|
||||||
Supported values are:
|
|
||||||
|
|
||||||
- "dom": The xml.dom.minidom DOM implementation
|
* "dom": The xml.dom.minidom DOM implementation
|
||||||
- "etree": A generic walker for tree implementations exposing an
|
* "etree": A generic walker for tree implementations exposing an
|
||||||
elementtree-like interface (known to work with
|
elementtree-like interface (known to work with ElementTree,
|
||||||
ElementTree, cElementTree and lxml.etree).
|
cElementTree and lxml.etree).
|
||||||
- "lxml": Optimized walker for lxml.etree
|
* "lxml": Optimized walker for lxml.etree
|
||||||
- "genshi": a Genshi stream
|
* "genshi": a Genshi stream
|
||||||
|
|
||||||
|
:arg implementation: A module implementing the tree type e.g.
|
||||||
|
xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
|
||||||
|
tree type only).
|
||||||
|
|
||||||
|
:arg kwargs: keyword arguments passed to the etree walker--for other
|
||||||
|
walkers, this has no effect
|
||||||
|
|
||||||
|
:returns: a TreeWalker class
|
||||||
|
|
||||||
Implementation: A module implementing the tree type e.g.
|
|
||||||
xml.etree.ElementTree or cElementTree (Currently applies to the
|
|
||||||
"etree" tree type only).
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
treeType = treeType.lower()
|
treeType = treeType.lower()
|
||||||
|
@ -73,7 +78,13 @@ def concatenateCharacterTokens(tokens):
|
||||||
|
|
||||||
|
|
||||||
def pprint(walker):
|
def pprint(walker):
|
||||||
"""Pretty printer for tree walkers"""
|
"""Pretty printer for tree walkers
|
||||||
|
|
||||||
|
Takes a TreeWalker instance and pretty prints the output of walking the tree.
|
||||||
|
|
||||||
|
:arg walker: a TreeWalker instance
|
||||||
|
|
||||||
|
"""
|
||||||
output = []
|
output = []
|
||||||
indent = 0
|
indent = 0
|
||||||
for token in concatenateCharacterTokens(walker):
|
for token in concatenateCharacterTokens(walker):
|
||||||
|
|
|
@ -18,16 +18,48 @@ spaceCharacters = "".join(spaceCharacters)
|
||||||
|
|
||||||
|
|
||||||
class TreeWalker(object):
|
class TreeWalker(object):
|
||||||
|
"""Walks a tree yielding tokens
|
||||||
|
|
||||||
|
Tokens are dicts that all have a ``type`` field specifying the type of the
|
||||||
|
token.
|
||||||
|
|
||||||
|
"""
|
||||||
def __init__(self, tree):
|
def __init__(self, tree):
|
||||||
|
"""Creates a TreeWalker
|
||||||
|
|
||||||
|
:arg tree: the tree to walk
|
||||||
|
|
||||||
|
"""
|
||||||
self.tree = tree
|
self.tree = tree
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def error(self, msg):
|
def error(self, msg):
|
||||||
|
"""Generates an error token with the given message
|
||||||
|
|
||||||
|
:arg msg: the error message
|
||||||
|
|
||||||
|
:returns: SerializeError token
|
||||||
|
|
||||||
|
"""
|
||||||
return {"type": "SerializeError", "data": msg}
|
return {"type": "SerializeError", "data": msg}
|
||||||
|
|
||||||
def emptyTag(self, namespace, name, attrs, hasChildren=False):
|
def emptyTag(self, namespace, name, attrs, hasChildren=False):
|
||||||
|
"""Generates an EmptyTag token
|
||||||
|
|
||||||
|
:arg namespace: the namespace of the token--can be ``None``
|
||||||
|
|
||||||
|
:arg name: the name of the element
|
||||||
|
|
||||||
|
:arg attrs: the attributes of the element as a dict
|
||||||
|
|
||||||
|
:arg hasChildren: whether or not to yield a SerializationError because
|
||||||
|
this tag shouldn't have children
|
||||||
|
|
||||||
|
:returns: EmptyTag token
|
||||||
|
|
||||||
|
"""
|
||||||
yield {"type": "EmptyTag", "name": name,
|
yield {"type": "EmptyTag", "name": name,
|
||||||
"namespace": namespace,
|
"namespace": namespace,
|
||||||
"data": attrs}
|
"data": attrs}
|
||||||
|
@ -35,17 +67,61 @@ class TreeWalker(object):
|
||||||
yield self.error("Void element has children")
|
yield self.error("Void element has children")
|
||||||
|
|
||||||
def startTag(self, namespace, name, attrs):
|
def startTag(self, namespace, name, attrs):
|
||||||
|
"""Generates a StartTag token
|
||||||
|
|
||||||
|
:arg namespace: the namespace of the token--can be ``None``
|
||||||
|
|
||||||
|
:arg name: the name of the element
|
||||||
|
|
||||||
|
:arg attrs: the attributes of the element as a dict
|
||||||
|
|
||||||
|
:returns: StartTag token
|
||||||
|
|
||||||
|
"""
|
||||||
return {"type": "StartTag",
|
return {"type": "StartTag",
|
||||||
"name": name,
|
"name": name,
|
||||||
"namespace": namespace,
|
"namespace": namespace,
|
||||||
"data": attrs}
|
"data": attrs}
|
||||||
|
|
||||||
def endTag(self, namespace, name):
|
def endTag(self, namespace, name):
|
||||||
|
"""Generates an EndTag token
|
||||||
|
|
||||||
|
:arg namespace: the namespace of the token--can be ``None``
|
||||||
|
|
||||||
|
:arg name: the name of the element
|
||||||
|
|
||||||
|
:returns: EndTag token
|
||||||
|
|
||||||
|
"""
|
||||||
return {"type": "EndTag",
|
return {"type": "EndTag",
|
||||||
"name": name,
|
"name": name,
|
||||||
"namespace": namespace}
|
"namespace": namespace}
|
||||||
|
|
||||||
def text(self, data):
|
def text(self, data):
|
||||||
|
"""Generates SpaceCharacters and Characters tokens
|
||||||
|
|
||||||
|
Depending on what's in the data, this generates one or more
|
||||||
|
``SpaceCharacters`` and ``Characters`` tokens.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
>>> from html5lib.treewalkers.base import TreeWalker
|
||||||
|
>>> # Give it an empty tree just so it instantiates
|
||||||
|
>>> walker = TreeWalker([])
|
||||||
|
>>> list(walker.text(''))
|
||||||
|
[]
|
||||||
|
>>> list(walker.text(' '))
|
||||||
|
[{u'data': ' ', u'type': u'SpaceCharacters'}]
|
||||||
|
>>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE
|
||||||
|
[{u'data': ' ', u'type': u'SpaceCharacters'},
|
||||||
|
{u'data': u'abc', u'type': u'Characters'},
|
||||||
|
{u'data': u' ', u'type': u'SpaceCharacters'}]
|
||||||
|
|
||||||
|
:arg data: the text data
|
||||||
|
|
||||||
|
:returns: one or more ``SpaceCharacters`` and ``Characters`` tokens
|
||||||
|
|
||||||
|
"""
|
||||||
data = data
|
data = data
|
||||||
middle = data.lstrip(spaceCharacters)
|
middle = data.lstrip(spaceCharacters)
|
||||||
left = data[:len(data) - len(middle)]
|
left = data[:len(data) - len(middle)]
|
||||||
|
@ -60,18 +136,44 @@ class TreeWalker(object):
|
||||||
yield {"type": "SpaceCharacters", "data": right}
|
yield {"type": "SpaceCharacters", "data": right}
|
||||||
|
|
||||||
def comment(self, data):
|
def comment(self, data):
|
||||||
|
"""Generates a Comment token
|
||||||
|
|
||||||
|
:arg data: the comment
|
||||||
|
|
||||||
|
:returns: Comment token
|
||||||
|
|
||||||
|
"""
|
||||||
return {"type": "Comment", "data": data}
|
return {"type": "Comment", "data": data}
|
||||||
|
|
||||||
def doctype(self, name, publicId=None, systemId=None):
|
def doctype(self, name, publicId=None, systemId=None):
|
||||||
|
"""Generates a Doctype token
|
||||||
|
|
||||||
|
:arg name:
|
||||||
|
|
||||||
|
:arg publicId:
|
||||||
|
|
||||||
|
:arg systemId:
|
||||||
|
|
||||||
|
:returns: the Doctype token
|
||||||
|
|
||||||
|
"""
|
||||||
return {"type": "Doctype",
|
return {"type": "Doctype",
|
||||||
"name": name,
|
"name": name,
|
||||||
"publicId": publicId,
|
"publicId": publicId,
|
||||||
"systemId": systemId}
|
"systemId": systemId}
|
||||||
|
|
||||||
def entity(self, name):
|
def entity(self, name):
|
||||||
|
"""Generates an Entity token
|
||||||
|
|
||||||
|
:arg name: the entity name
|
||||||
|
|
||||||
|
:returns: an Entity token
|
||||||
|
|
||||||
|
"""
|
||||||
return {"type": "Entity", "name": name}
|
return {"type": "Entity", "name": name}
|
||||||
|
|
||||||
def unknown(self, nodeType):
|
def unknown(self, nodeType):
|
||||||
|
"""Handles unknown node types"""
|
||||||
return self.error("Unknown node type: " + nodeType)
|
return self.error("Unknown node type: " + nodeType)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,6 @@
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
try:
|
from collections import OrderedDict
|
||||||
from collections import OrderedDict
|
|
||||||
except ImportError:
|
|
||||||
try:
|
|
||||||
from ordereddict import OrderedDict
|
|
||||||
except ImportError:
|
|
||||||
OrderedDict = dict
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from six import string_types
|
from six import string_types
|
||||||
|
|
Loading…
Reference in a new issue