Merge branch 'feature/UpdateHtml5lib' into develop

This commit is contained in:
JackDandy 2018-03-28 00:44:29 +01:00
commit 18c400acec
24 changed files with 588 additions and 203 deletions

View file

@ -6,6 +6,7 @@
* Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f) * Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f)
* Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed * Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed
* Change Hachoir can't support PY2 so backport their PY3 to prevent a need for system dependent external binaries like mediainfo * Change Hachoir can't support PY2 so backport their PY3 to prevent a need for system dependent external binaries like mediainfo
* Update html5lib 0.99999999/1.0b9 (1a28d72) to 1.1-dev (e9ef538)
[develop changelog] [develop changelog]

View file

@ -1,14 +1,23 @@
""" """
HTML parsing library based on the WHATWG "HTML5" HTML parsing library based on the `WHATWG HTML specification
specification. The parser is designed to be compatible with existing <https://whatwg.org/html>`_. The parser is designed to be compatible with
HTML found in the wild and implements well-defined error recovery that existing HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers. is largely compatible with modern desktop web browsers.
Example usage: Example usage::
import html5lib import html5lib
f = open("my_document.html") with open("my_document.html", "rb") as f:
tree = html5lib.parse(f) tree = html5lib.parse(f)
For convenience, this module re-exports the following names:
* :func:`~.html5parser.parse`
* :func:`~.html5parser.parseFragment`
* :class:`~.html5parser.HTMLParser`
* :func:`~.treebuilders.getTreeBuilder`
* :func:`~.treewalkers.getTreeWalker`
* :func:`~.serializer.serialize`
""" """
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
@ -22,4 +31,5 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"] "getTreeWalker", "serialize"]
# this has to be at the top level, see how setup.py parses this # this has to be at the top level, see how setup.py parses this
__version__ = "0.9999999999-dev" #: Distribution version number.
__version__ = "1.1-dev"

View file

@ -180,7 +180,7 @@ nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u
2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
# Simpler things # Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]") nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
class InfosetFilter(object): class InfosetFilter(object):

View file

@ -9,7 +9,7 @@ import re
import webencodings import webencodings
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import ReparseException from .constants import _ReparseException
from . import _utils from . import _utils
from io import StringIO from io import StringIO
@ -48,7 +48,7 @@ non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF]) 0x10FFFE, 0x10FFFF])
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]") ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
# Cache for charsUntil() # Cache for charsUntil()
charsUntilRegEx = {} charsUntilRegEx = {}
@ -367,7 +367,7 @@ class HTMLUnicodeInputStream(object):
def unget(self, char): def unget(self, char):
# Only one character is allowed to be ungotten at once - it must # Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget # be consumed again before any further call to unget
if char is not None: if char is not EOF:
if self.chunkOffset == 0: if self.chunkOffset == 0:
# unget is called quite rarely, so it's a good idea to do # unget is called quite rarely, so it's a good idea to do
# more work here if it saves a bit of work in the frequently # more work here if it saves a bit of work in the frequently
@ -461,7 +461,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
if charEncoding[0] is not None: if charEncoding[0] is not None:
return charEncoding return charEncoding
# If we've been overriden, we've been overriden # If we've been overridden, we've been overridden
charEncoding = lookupEncoding(self.override_encoding), "certain" charEncoding = lookupEncoding(self.override_encoding), "certain"
if charEncoding[0] is not None: if charEncoding[0] is not None:
return charEncoding return charEncoding
@ -530,7 +530,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
self.rawStream.seek(0) self.rawStream.seek(0)
self.charEncoding = (newEncoding, "certain") self.charEncoding = (newEncoding, "certain")
self.reset() self.reset()
raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding)) raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
def detectBOM(self): def detectBOM(self):
"""Attempts to detect a BOM at the start of the stream. If """Attempts to detect a BOM at the start of the stream. If

View file

@ -13,8 +13,7 @@ class Trie(Mapping):
if prefix is None: if prefix is None:
return set(keys) return set(keys)
# Python 2.6: no set comprehensions return {x for x in keys if x.startswith(prefix)}
return set([x for x in keys if x.startswith(prefix)])
def has_keys_with_prefix(self, prefix): def has_keys_with_prefix(self, prefix):
for key in self.keys(): for key in self.keys():

View file

@ -1,6 +1,5 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
import sys
from types import ModuleType from types import ModuleType
from six import text_type from six import text_type
@ -13,11 +12,9 @@ except ImportError:
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory", "surrogatePairToCodepoint", "moduleFactoryFactory",
"supports_lone_surrogates", "PY27"] "supports_lone_surrogates"]
PY27 = sys.version_info[0] == 2 and sys.version_info[1] >= 7
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be # Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform # caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as # using UTF-16 as its encoding of unicode strings, such as

View file

@ -423,7 +423,7 @@ specialElements = frozenset([
]) ])
htmlIntegrationPointElements = frozenset([ htmlIntegrationPointElements = frozenset([
(namespaces["mathml"], "annotaion-xml"), (namespaces["mathml"], "annotation-xml"),
(namespaces["svg"], "foreignObject"), (namespaces["svg"], "foreignObject"),
(namespaces["svg"], "desc"), (namespaces["svg"], "desc"),
(namespaces["svg"], "title") (namespaces["svg"], "title")
@ -588,7 +588,7 @@ rcdataElements = frozenset([
]) ])
booleanAttributes = { booleanAttributes = {
"": frozenset(["irrelevant"]), "": frozenset(["irrelevant", "itemscope"]),
"style": frozenset(["scoped"]), "style": frozenset(["scoped"]),
"img": frozenset(["ismap"]), "img": frozenset(["ismap"]),
"audio": frozenset(["autoplay", "controls"]), "audio": frozenset(["autoplay", "controls"]),
@ -606,6 +606,7 @@ booleanAttributes = {
"input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]), "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
"select": frozenset(["disabled", "readonly", "autofocus", "multiple"]), "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
"output": frozenset(["disabled", "readonly"]), "output": frozenset(["disabled", "readonly"]),
"iframe": frozenset(["seamless"]),
} }
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It # entitiesWindows1252 has to be _ordered_ and needs to have an index. It
@ -2938,8 +2939,9 @@ prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
class DataLossWarning(UserWarning): class DataLossWarning(UserWarning):
"""Raised when the current tree is unable to represent the input data"""
pass pass
class ReparseException(Exception): class _ReparseException(Exception):
pass pass

View file

@ -2,19 +2,28 @@ from __future__ import absolute_import, division, unicode_literals
from . import base from . import base
try: from collections import OrderedDict
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict def _attr_key(attr):
"""Return an appropriate key for an attribute for sorting
Attributes have a namespace that can be either ``None`` or a string. We
can't compare the two because they're different types, so we convert
``None`` to an empty string first.
"""
return (attr[0][0] or ''), attr[0][1]
class Filter(base.Filter): class Filter(base.Filter):
"""Alphabetizes attributes for elements"""
def __iter__(self): def __iter__(self):
for token in base.Filter.__iter__(self): for token in base.Filter.__iter__(self):
if token["type"] in ("StartTag", "EmptyTag"): if token["type"] in ("StartTag", "EmptyTag"):
attrs = OrderedDict() attrs = OrderedDict()
for name, value in sorted(token["data"].items(), for name, value in sorted(token["data"].items(),
key=lambda x: x[0]): key=_attr_key):
attrs[name] = value attrs[name] = value
token["data"] = attrs token["data"] = attrs
yield token yield token

View file

@ -4,7 +4,15 @@ from . import base
class Filter(base.Filter): class Filter(base.Filter):
"""Injects ``<meta charset=ENCODING>`` tag into head of document"""
def __init__(self, source, encoding): def __init__(self, source, encoding):
"""Creates a Filter
:arg source: the source token stream
:arg encoding: the encoding to set
"""
base.Filter.__init__(self, source) base.Filter.__init__(self, source)
self.encoding = encoding self.encoding = encoding

View file

@ -10,7 +10,19 @@ spaceCharacters = "".join(spaceCharacters)
class Filter(base.Filter): class Filter(base.Filter):
"""Lints the token stream for errors
If it finds any errors, it'll raise an ``AssertionError``.
"""
def __init__(self, source, require_matching_tags=True): def __init__(self, source, require_matching_tags=True):
"""Creates a Filter
:arg source: the source token stream
:arg require_matching_tags: whether or not to require matching tags
"""
super(Filter, self).__init__(source) super(Filter, self).__init__(source)
self.require_matching_tags = require_matching_tags self.require_matching_tags = require_matching_tags

View file

@ -4,6 +4,7 @@ from . import base
class Filter(base.Filter): class Filter(base.Filter):
"""Removes optional tags from the token stream"""
def slider(self): def slider(self):
previous1 = previous2 = None previous1 = previous2 = None
for token in self.source: for token in self.source:

View file

@ -705,7 +705,7 @@ data_content_type = re.compile(r'''
class Filter(base.Filter): class Filter(base.Filter):
""" sanitization of XHTML+MathML+SVG and of inline style attributes.""" """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
def __init__(self, def __init__(self,
source, source,
allowed_elements=allowed_elements, allowed_elements=allowed_elements,
@ -718,6 +718,37 @@ class Filter(base.Filter):
attr_val_is_uri=attr_val_is_uri, attr_val_is_uri=attr_val_is_uri,
svg_attr_val_allows_ref=svg_attr_val_allows_ref, svg_attr_val_allows_ref=svg_attr_val_allows_ref,
svg_allow_local_href=svg_allow_local_href): svg_allow_local_href=svg_allow_local_href):
"""Creates a Filter
:arg allowed_elements: set of elements to allow--everything else will
be escaped
:arg allowed_attributes: set of attributes to allow in
elements--everything else will be stripped
:arg allowed_css_properties: set of CSS properties to allow--everything
else will be stripped
:arg allowed_css_keywords: set of CSS keywords to allow--everything
else will be stripped
:arg allowed_svg_properties: set of SVG properties to allow--everything
else will be removed
:arg allowed_protocols: set of allowed protocols for URIs
:arg allowed_content_types: set of allowed content types for ``data`` URIs.
:arg attr_val_is_uri: set of attributes that have URI values--values
that have a scheme not listed in ``allowed_protocols`` are removed
:arg svg_attr_val_allows_ref: set of SVG attributes that can have
references
:arg svg_allow_local_href: set of SVG elements that can have local
hrefs--these are removed
"""
super(Filter, self).__init__(source) super(Filter, self).__init__(source)
self.allowed_elements = allowed_elements self.allowed_elements = allowed_elements
self.allowed_attributes = allowed_attributes self.allowed_attributes = allowed_attributes
@ -737,11 +768,11 @@ class Filter(base.Filter):
yield token yield token
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
# attributes are parsed, and a restricted set, # specified by # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through. # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
# in ALLOWED_PROTOCOLS are allowed. # allowed.
# #
# sanitize_html('<script> do_nasty_stuff() </script>') # sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script> # => &lt;script> do_nasty_stuff() &lt;/script>
@ -782,7 +813,7 @@ class Filter(base.Filter):
# characters, nor why we call unescape. I just know it's always been here. # characters, nor why we call unescape. I just know it's always been here.
# Should you be worried by this comment in a sanitizer? Yes. On the other hand, all # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
# this will do is remove *more* than it otherwise would. # this will do is remove *more* than it otherwise would.
val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '', val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
unescape(attrs[attr])).lower() unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters # remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "") val_unescaped = val_unescaped.replace("\ufffd", "")
@ -807,7 +838,7 @@ class Filter(base.Filter):
' ', ' ',
unescape(attrs[attr])) unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and if (token["name"] in self.svg_allow_local_href and
(namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*', (namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*',
attrs[(namespaces['xlink'], 'href')])): attrs[(namespaces['xlink'], 'href')])):
del attrs[(namespaces['xlink'], 'href')] del attrs[(namespaces['xlink'], 'href')]
if (None, 'style') in attrs: if (None, 'style') in attrs:
@ -837,16 +868,16 @@ class Filter(base.Filter):
def sanitize_css(self, style): def sanitize_css(self, style):
# disallow urls # disallow urls
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
# gauntlet # gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
return '' return ''
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
return '' return ''
clean = [] clean = []
for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style): for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
if not value: if not value:
continue continue
if prop.lower() in self.allowed_css_properties: if prop.lower() in self.allowed_css_properties:
@ -855,7 +886,7 @@ class Filter(base.Filter):
'padding']: 'padding']:
for keyword in value.split(): for keyword in value.split():
if keyword not in self.allowed_css_keywords and \ if keyword not in self.allowed_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
break break
else: else:
clean.append(prop + ': ' + value + ';') clean.append(prop + ': ' + value + ';')

View file

@ -10,7 +10,7 @@ SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
class Filter(base.Filter): class Filter(base.Filter):
"""Collapses whitespace except in pre, textarea, and script elements"""
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
def __iter__(self): def __iter__(self):

View file

@ -1,12 +1,8 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from six import with_metaclass, viewkeys, PY3 from six import with_metaclass, viewkeys
import types import types
from collections import OrderedDict
try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
from . import _inputstream from . import _inputstream
from . import _tokenizer from . import _tokenizer
@ -24,18 +20,53 @@ from .constants import (
adjustForeignAttributes as adjustForeignAttributesMap, adjustForeignAttributes as adjustForeignAttributesMap,
adjustMathMLAttributes, adjustSVGAttributes, adjustMathMLAttributes, adjustSVGAttributes,
E, E,
ReparseException _ReparseException
) )
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs): def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse a string or file-like object into a tree""" """Parse an HTML document as a string or file-like object into a tree
:arg doc: the document to parse as a string or file-like object
:arg treebuilder: the treebuilder to use when parsing
:arg namespaceHTMLElements: whether or not to namespace HTML elements
:returns: parsed tree
Example:
>>> from html5lib.html5parser import parse
>>> parse('<html><body><p>This is a doc</p></body></html>')
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
"""
tb = treebuilders.getTreeBuilder(treebuilder) tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parse(doc, **kwargs) return p.parse(doc, **kwargs)
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs): def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse an HTML fragment as a string or file-like object into a tree
:arg doc: the fragment to parse as a string or file-like object
:arg container: the container context to parse the fragment in
:arg treebuilder: the treebuilder to use when parsing
:arg namespaceHTMLElements: whether or not to namespace HTML elements
:returns: parsed tree
Example:
>>> from html5lib.html5parser import parseFragment
>>> parseFragment('<b>this is a fragment</b>')
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
"""
tb = treebuilders.getTreeBuilder(treebuilder) tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parseFragment(doc, container=container, **kwargs) return p.parseFragment(doc, container=container, **kwargs)
@ -54,16 +85,30 @@ def method_decorator_metaclass(function):
class HTMLParser(object): class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly """HTML parser
malformed) HTML"""
Generates a tree structure from a stream of (possibly malformed) HTML.
"""
def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False): def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
""" """
strict - raise an exception when a parse error is encountered :arg tree: a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)
:arg strict: raise an exception when a parse error is encountered
:arg namespaceHTMLElements: whether or not to namespace HTML elements
:arg debug: whether or not to enable debug mode which logs things
Example:
>>> from html5lib.html5parser import HTMLParser
>>> parser = HTMLParser() # generates parser with etree builder
>>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict
tree - a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)
""" """
# Raise an exception on the first error encountered # Raise an exception on the first error encountered
@ -87,7 +132,7 @@ class HTMLParser(object):
try: try:
self.mainLoop() self.mainLoop()
except ReparseException: except _ReparseException:
self.reset() self.reset()
self.mainLoop() self.mainLoop()
@ -127,9 +172,8 @@ class HTMLParser(object):
@property @property
def documentEncoding(self): def documentEncoding(self):
"""The name of the character encoding """Name of the character encoding that was used to decode the input stream, or
that was used to decode the input stream, :obj:`None` if that is not determined yet
or :obj:`None` if that is not determined yet.
""" """
if not hasattr(self, 'tokenizer'): if not hasattr(self, 'tokenizer'):
@ -223,14 +267,24 @@ class HTMLParser(object):
def parse(self, stream, *args, **kwargs): def parse(self, stream, *args, **kwargs):
"""Parse an HTML document into a well-formed tree """Parse an HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed :arg stream: a file-like object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used, the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta regardless of any BOM or later declaration (such as in a meta
element) element).
:arg scripting: treat noscript elements as if JavaScript was turned on
:returns: parsed tree
Example:
>>> from html5lib.html5parser import HTMLParser
>>> parser = HTMLParser()
>>> parser.parse('<html><body><p>This is a doc</p></body></html>')
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
scripting - treat noscript elements as if javascript was turned on
""" """
self._parse(stream, False, None, *args, **kwargs) self._parse(stream, False, None, *args, **kwargs)
return self.tree.getDocument() return self.tree.getDocument()
@ -238,17 +292,27 @@ class HTMLParser(object):
def parseFragment(self, stream, *args, **kwargs): def parseFragment(self, stream, *args, **kwargs):
"""Parse an HTML fragment into a well-formed tree fragment """Parse an HTML fragment into a well-formed tree fragment
container - name of the element we're setting the innerHTML property :arg container: name of the element we're setting the innerHTML
if set to None, default to 'div' property if set to None, default to 'div'
stream - a filelike object or string containing the HTML to be parsed :arg stream: a file-like object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used, the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta regardless of any BOM or later declaration (such as in a meta
element) element)
:arg scripting: treat noscript elements as if JavaScript was turned on
:returns: parsed tree
Example:
>>> from html5lib.html5parser import HTMLParser
>>> parser = HTMLParser()
>>> parser.parseFragment('<b>this is a fragment</b>')
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
scripting - treat noscript elements as if javascript was turned on
""" """
self._parse(stream, True, *args, **kwargs) self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment() return self.tree.getFragment()
@ -262,8 +326,7 @@ class HTMLParser(object):
raise ParseError(E[errorcode] % datavars) raise ParseError(E[errorcode] % datavars)
def normalizeToken(self, token): def normalizeToken(self, token):
""" HTML5 specific normalizations to the token stream """ # HTML5 specific normalizations to the token stream
if token["type"] == tokenTypes["StartTag"]: if token["type"] == tokenTypes["StartTag"]:
raw = token["data"] raw = token["data"]
token["data"] = OrderedDict(raw) token["data"] = OrderedDict(raw)
@ -331,9 +394,7 @@ class HTMLParser(object):
self.phase = new_phase self.phase = new_phase
def parseRCDataRawtext(self, token, contentType): def parseRCDataRawtext(self, token, contentType):
"""Generic RCDATA/RAWTEXT Parsing algorithm # Generic RCDATA/RAWTEXT Parsing algorithm
contentType - RCDATA or RAWTEXT
"""
assert contentType in ("RAWTEXT", "RCDATA") assert contentType in ("RAWTEXT", "RCDATA")
self.tree.insertElement(token) self.tree.insertElement(token)
@ -2711,10 +2772,7 @@ def getPhases(debug):
def adjust_attributes(token, replacements): def adjust_attributes(token, replacements):
if PY3 or _utils.PY27: needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
else:
needs_adjustment = frozenset(token['data']) & frozenset(replacements)
if needs_adjustment: if needs_adjustment:
token['data'] = OrderedDict((replacements.get(k, k), v) token['data'] = OrderedDict((replacements.get(k, k), v)
for k, v in token['data'].items()) for k, v in token['data'].items())

View file

@ -68,10 +68,33 @@ def htmlentityreplace_errors(exc):
else: else:
return xmlcharrefreplace_errors(exc) return xmlcharrefreplace_errors(exc)
register_error("htmlentityreplace", htmlentityreplace_errors) register_error("htmlentityreplace", htmlentityreplace_errors)
def serialize(input, tree="etree", encoding=None, **serializer_opts): def serialize(input, tree="etree", encoding=None, **serializer_opts):
"""Serializes the input token stream using the specified treewalker
:arg input: the token stream to serialize
:arg tree: the treewalker to use
:arg encoding: the encoding to use
:arg serializer_opts: any options to pass to the
:py:class:`html5lib.serializer.HTMLSerializer` that gets created
:returns: the tree serialized as a string
Example:
>>> from html5lib.html5parser import parse
>>> from html5lib.serializer import serialize
>>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
>>> serialize(token_stream, omit_optional_tags=False)
'<html><head></head><body><p>Hi!</p></body></html>'
"""
# XXX: Should we cache this? # XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree) walker = treewalkers.getTreeWalker(tree)
s = HTMLSerializer(**serializer_opts) s = HTMLSerializer(**serializer_opts)
@ -110,50 +133,83 @@ class HTMLSerializer(object):
"strip_whitespace", "sanitize") "strip_whitespace", "sanitize")
def __init__(self, **kwargs): def __init__(self, **kwargs):
"""Initialize HTMLSerializer. """Initialize HTMLSerializer
Keyword options (default given first unless specified) include: :arg inject_meta_charset: Whether or not to inject the meta charset.
inject_meta_charset=True|False Defaults to ``True``.
Whether it insert a meta element to define the character set of the
document. :arg quote_attr_values: Whether to quote attribute values that don't
quote_attr_values="legacy"|"spec"|"always" require quoting per legacy browser behavior (``"legacy"``), when
Whether to quote attribute values that don't require quoting required by the standard (``"spec"``), or always (``"always"``).
per legacy browser behaviour, when required by the standard, or always.
quote_char=u'"'|u"'" Defaults to ``"legacy"``.
Use given quote character for attribute quoting. Default is to
use double quote unless attribute value contains a double quote, :arg quote_char: Use given quote character for attribute quoting.
in which case single quotes are used instead.
escape_lt_in_attrs=False|True Defaults to ``"`` which will use double quotes unless attribute
Whether to escape < in attribute values. value contains a double quote, in which case single quotes are
escape_rcdata=False|True used.
Whether to escape characters that need to be escaped within normal
elements within rcdata elements such as style. :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
resolve_entities=True|False values.
Whether to resolve named character entities that appear in the
source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos; Defaults to ``False``.
are unaffected by this setting.
strip_whitespace=False|True :arg escape_rcdata: Whether to escape characters that need to be
Whether to remove semantically meaningless whitespace. (This escaped within normal elements within rcdata elements such as
compresses all whitespace to a single space except within pre.) style.
minimize_boolean_attributes=True|False
Shortens boolean attributes to give just the attribute value, Defaults to ``False``.
for example <input disabled="disabled"> becomes <input disabled>.
use_trailing_solidus=False|True :arg resolve_entities: Whether to resolve named character entities that
Includes a close-tag slash at the end of the start tag of void appear in the source tree. The XML predefined entities &lt; &gt;
elements (empty elements whose end tag is forbidden). E.g. <hr/>. &amp; &quot; &apos; are unaffected by this setting.
space_before_trailing_solidus=True|False
Places a space immediately before the closing slash in a tag Defaults to ``True``.
using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
sanitize=False|True :arg strip_whitespace: Whether to remove semantically meaningless
Strip all unsafe or unknown constructs from output. whitespace. (This compresses all whitespace to a single space
See `html5lib user documentation`_ except within ``pre``.)
omit_optional_tags=True|False
Omit start/end tags that are optional. Defaults to ``False``.
alphabetical_attributes=False|True
Reorder attributes to be in alphabetical order. :arg minimize_boolean_attributes: Shortens boolean attributes to give
just the attribute value, for example::
<input disabled="disabled">
becomes::
<input disabled>
Defaults to ``True``.
:arg use_trailing_solidus: Includes a close-tag slash at the end of the
start tag of void elements (empty elements whose end tag is
forbidden). E.g. ``<hr/>``.
Defaults to ``False``.
:arg space_before_trailing_solidus: Places a space immediately before
the closing slash in a tag using a trailing solidus. E.g.
``<hr />``. Requires ``use_trailing_solidus=True``.
Defaults to ``True``.
:arg sanitize: Strip all unsafe or unknown constructs from output.
See :py:class:`html5lib.filters.sanitizer.Filter`.
Defaults to ``False``.
:arg omit_optional_tags: Omit start/end tags that are optional.
Defaults to ``True``.
:arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
Defaults to ``False``.
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
""" """
unexpected_args = frozenset(kwargs) - frozenset(self.options) unexpected_args = frozenset(kwargs) - frozenset(self.options)
if len(unexpected_args) > 0: if len(unexpected_args) > 0:
@ -218,7 +274,7 @@ class HTMLSerializer(object):
if token["systemId"]: if token["systemId"]:
if token["systemId"].find('"') >= 0: if token["systemId"].find('"') >= 0:
if token["systemId"].find("'") >= 0: if token["systemId"].find("'") >= 0:
self.serializeError("System identifer contains both single and double quote characters") self.serializeError("System identifier contains both single and double quote characters")
quote_char = "'" quote_char = "'"
else: else:
quote_char = '"' quote_char = '"'
@ -317,6 +373,25 @@ class HTMLSerializer(object):
self.serializeError(token["data"]) self.serializeError(token["data"])
def render(self, treewalker, encoding=None): def render(self, treewalker, encoding=None):
"""Serializes the stream from the treewalker into a string
:arg treewalker: the treewalker to serialize
:arg encoding: the string encoding to use
:returns: the serialized tree
Example:
>>> from html5lib import parse, getTreeWalker
>>> from html5lib.serializer import HTMLSerializer
>>> token_stream = parse('<html><body>Hi!</body></html>')
>>> walker = getTreeWalker('etree')
>>> serializer = HTMLSerializer(omit_optional_tags=False)
>>> serializer.render(walker(token_stream))
'<html><head></head><body>Hi!</body></html>'
"""
if encoding: if encoding:
return b"".join(list(self.serialize(treewalker, encoding))) return b"".join(list(self.serialize(treewalker, encoding)))
else: else:

View file

@ -1,3 +1,21 @@
"""Tree adapters let you convert from one tree structure to another
Example:
.. code-block:: python
import html5lib
from html5lib.treeadapters import genshi
doc = '<html><body>Hi!</body></html>'
treebuilder = html5lib.getTreeBuilder('etree')
parser = html5lib.HTMLParser(tree=treebuilder)
tree = parser.parse(doc)
TreeWalker = html5lib.getTreeWalker('etree')
genshi_tree = genshi.to_genshi(TreeWalker(tree))
"""
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from . import sax from . import sax

View file

@ -5,6 +5,13 @@ from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
def to_genshi(walker): def to_genshi(walker):
"""Convert a tree to a genshi tree
:arg walker: the treewalker to use to walk the tree to convert it
:returns: generator of genshi nodes
"""
text = [] text = []
for token in walker: for token in walker:
type = token["type"] type = token["type"]

View file

@ -11,7 +11,13 @@ for prefix, localName, namespace in adjustForeignAttributes.values():
def to_sax(walker, handler): def to_sax(walker, handler):
"""Call SAX-like content handler based on treewalker walker""" """Call SAX-like content handler based on treewalker walker
:arg walker: the treewalker to use to walk the tree to convert it
:arg handler: SAX handler to use
"""
handler.startDocument() handler.startDocument()
for prefix, namespace in prefix_mapping.items(): for prefix, namespace in prefix_mapping.items():
handler.startPrefixMapping(prefix, namespace) handler.startPrefixMapping(prefix, namespace)

View file

@ -1,29 +1,32 @@
"""A collection of modules for building different kinds of tree from """A collection of modules for building different kinds of trees from HTML
HTML documents. documents.
To create a treebuilder for a new type of tree, you need to do To create a treebuilder for a new type of tree, you need to do
implement several things: implement several things:
1) A set of classes for various types of elements: Document, Doctype, 1. A set of classes for various types of elements: Document, Doctype, Comment,
Comment, Element. These must implement the interface of Element. These must implement the interface of ``base.treebuilders.Node``
_base.treebuilders.Node (although comment nodes have a different (although comment nodes have a different signature for their constructor,
signature for their constructor, see treebuilders.etree.Comment) see ``treebuilders.etree.Comment``) Textual content may also be implemented
Textual content may also be implemented as another node type, or not, as as another node type, or not, as your tree implementation requires.
your tree implementation requires.
2) A treebuilder object (called TreeBuilder by convention) that 2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes: from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements * ``documentClass`` - the class to use for the bottommost node of a document
commentClass - the class to use for comments * ``elementClass`` - the class to use for HTML Elements
doctypeClass - the class to use for doctypes * ``commentClass`` - the class to use for comments
It also has one required method: * ``doctypeClass`` - the class to use for doctypes
getDocument - Returns the root node of the complete document tree
It also has one required method:
* ``getDocument`` - Returns the root node of the complete document tree
3. If you wish to run the unit tests, you must also create a ``testSerializer``
method on your treebuilder which accepts a node and returns a string
containing Node and its children serialized according to the format used in
the unittests
3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
""" """
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
@ -34,23 +37,32 @@ treeBuilderCache = {}
def getTreeBuilder(treeType, implementation=None, **kwargs): def getTreeBuilder(treeType, implementation=None, **kwargs):
"""Get a TreeBuilder class for various types of tree with built-in support """Get a TreeBuilder class for various types of trees with built-in support
treeType - the name of the tree type required (case-insensitive). Supported :arg treeType: the name of the tree type required (case-insensitive). Supported
values are: values are:
"dom" - A generic builder for DOM implementations, defaulting to * "dom" - A generic builder for DOM implementations, defaulting to a
a xml.dom.minidom based implementation. xml.dom.minidom based implementation.
"etree" - A generic builder for tree implementations exposing an * "etree" - A generic builder for tree implementations exposing an
ElementTree-like interface, defaulting to ElementTree-like interface, defaulting to xml.etree.cElementTree if
xml.etree.cElementTree if available and available and xml.etree.ElementTree if not.
xml.etree.ElementTree if not. * "lxml" - A etree-based builder for lxml.etree, handling limitations
"lxml" - A etree-based builder for lxml.etree, handling of lxml's implementation.
limitations of lxml's implementation.
implementation - (Currently applies to the "etree" and "dom" tree types). A :arg implementation: (Currently applies to the "etree" and "dom" tree
module implementing the tree type e.g. types). A module implementing the tree type e.g. xml.etree.ElementTree
xml.etree.ElementTree or xml.etree.cElementTree.""" or xml.etree.cElementTree.
:arg kwargs: Any additional options to pass to the TreeBuilder when
creating it.
Example:
>>> from html5lib.treebuilders import getTreeBuilder
>>> builder = getTreeBuilder('etree')
"""
treeType = treeType.lower() treeType = treeType.lower()
if treeType not in treeBuilderCache: if treeType not in treeBuilderCache:

View file

@ -21,22 +21,25 @@ listElementsMap = {
class Node(object): class Node(object):
"""Represents an item in the tree"""
def __init__(self, name): def __init__(self, name):
"""Node representing an item in the tree. """Creates a Node
name - The tag name associated with the node
parent - The parent of the current node (or None for the document node) :arg name: The tag name associated with the node
value - The value of the current node (applies to text nodes and
comments
attributes - a dict holding name, value pairs for attributes of the node
childNodes - a list of child nodes of the current node. This must
include all elements but not necessarily other node types
_flags - A list of miscellaneous flags that can be set on the node
""" """
# The tag name associated with the node
self.name = name self.name = name
# The parent of the current node (or None for the document node)
self.parent = None self.parent = None
# The value of the current node (applies to text nodes and comments)
self.value = None self.value = None
# A dict holding name -> value pairs for attributes of the node
self.attributes = {} self.attributes = {}
# A list of child nodes of the current node. This must include all
# elements but not necessarily other node types.
self.childNodes = [] self.childNodes = []
# A list of miscellaneous flags that can be set on the node.
self._flags = [] self._flags = []
def __str__(self): def __str__(self):
@ -53,23 +56,41 @@ class Node(object):
def appendChild(self, node): def appendChild(self, node):
"""Insert node as a child of the current node """Insert node as a child of the current node
:arg node: the node to insert
""" """
raise NotImplementedError raise NotImplementedError
def insertText(self, data, insertBefore=None): def insertText(self, data, insertBefore=None):
"""Insert data as text in the current node, positioned before the """Insert data as text in the current node, positioned before the
start of node insertBefore or to the end of the node's text. start of node insertBefore or to the end of the node's text.
:arg data: the data to insert
:arg insertBefore: the node to insert the text before, or ``None`` to
append the text to the end of the current node's text
""" """
raise NotImplementedError raise NotImplementedError
def insertBefore(self, node, refNode): def insertBefore(self, node, refNode):
"""Insert node as a child of the current node, before refNode in the """Insert node as a child of the current node, before refNode in the
list of child nodes. Raises ValueError if refNode is not a child of list of child nodes. Raises ValueError if refNode is not a child of
the current node""" the current node
:arg node: the node to insert
:arg refNode: the child node to insert the node before
"""
raise NotImplementedError raise NotImplementedError
def removeChild(self, node): def removeChild(self, node):
"""Remove node from the children of the current node """Remove node from the children of the current node
:arg node: the child node to remove
""" """
raise NotImplementedError raise NotImplementedError
@ -77,6 +98,9 @@ class Node(object):
"""Move all the children of the current node to newParent. """Move all the children of the current node to newParent.
This is needed so that trees that don't store text as nodes move the This is needed so that trees that don't store text as nodes move the
text in the correct way text in the correct way
:arg newParent: the node to move all this node's children to
""" """
# XXX - should this method be made more general? # XXX - should this method be made more general?
for child in self.childNodes: for child in self.childNodes:
@ -121,10 +145,12 @@ class ActiveFormattingElements(list):
class TreeBuilder(object): class TreeBuilder(object):
"""Base treebuilder implementation """Base treebuilder implementation
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements * documentClass - the class to use for the bottommost node of a document
commentClass - the class to use for comments * elementClass - the class to use for HTML Elements
doctypeClass - the class to use for doctypes * commentClass - the class to use for comments
* doctypeClass - the class to use for doctypes
""" """
# pylint:disable=not-callable # pylint:disable=not-callable
@ -144,6 +170,11 @@ class TreeBuilder(object):
fragmentClass = None fragmentClass = None
def __init__(self, namespaceHTMLElements): def __init__(self, namespaceHTMLElements):
"""Create a TreeBuilder
:arg namespaceHTMLElements: whether or not to namespace HTML elements
"""
if namespaceHTMLElements: if namespaceHTMLElements:
self.defaultNamespace = "http://www.w3.org/1999/xhtml" self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else: else:
@ -367,11 +398,11 @@ class TreeBuilder(object):
self.generateImpliedEndTags(exclude) self.generateImpliedEndTags(exclude)
def getDocument(self): def getDocument(self):
"Return the final tree" """Return the final tree"""
return self.document return self.document
def getFragment(self): def getFragment(self):
"Return the final fragment" """Return the final fragment"""
# assert self.innerHTML # assert self.innerHTML
fragment = self.fragmentClass() fragment = self.fragmentClass()
self.openElements[0].reparentChildren(fragment) self.openElements[0].reparentChildren(fragment)
@ -379,5 +410,8 @@ class TreeBuilder(object):
def testSerializer(self, node): def testSerializer(self, node):
"""Serialize the subtree of node in the format required by unit tests """Serialize the subtree of node in the format required by unit tests
node - the node from which to start serializing"""
:arg node: the node from which to start serializing
"""
raise NotImplementedError raise NotImplementedError

View file

@ -309,7 +309,6 @@ class TreeBuilder(base.TreeBuilder):
super(TreeBuilder, self).insertComment(data, parent) super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token): def insertRoot(self, token):
"""Create the document root"""
# Because of the way libxml2 works, it doesn't seem to be possible to # Because of the way libxml2 works, it doesn't seem to be possible to
# alter information like the doctype after the tree has been parsed. # alter information like the doctype after the tree has been parsed.
# Therefore we need to use the built-in parser to create our initial # Therefore we need to use the built-in parser to create our initial

View file

@ -2,10 +2,10 @@
tree, generating tokens identical to those produced by the tokenizer tree, generating tokens identical to those produced by the tokenizer
module. module.
To create a tree walker for a new type of tree, you need to do To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and implements a 'serialize' method which takes a tree as sole argument and
returning an iterator generating tokens. returns an iterator which generates tokens.
""" """
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
@ -13,7 +13,7 @@ from __future__ import absolute_import, division, unicode_literals
from .. import constants from .. import constants
from .._utils import default_etree from .._utils import default_etree
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"] __all__ = ["getTreeWalker", "pprint"]
treeWalkerCache = {} treeWalkerCache = {}
@ -21,20 +21,25 @@ treeWalkerCache = {}
def getTreeWalker(treeType, implementation=None, **kwargs): def getTreeWalker(treeType, implementation=None, **kwargs):
"""Get a TreeWalker class for various types of tree with built-in support """Get a TreeWalker class for various types of tree with built-in support
Args: :arg str treeType: the name of the tree type required (case-insensitive).
treeType (str): the name of the tree type required (case-insensitive). Supported values are:
Supported values are:
- "dom": The xml.dom.minidom DOM implementation * "dom": The xml.dom.minidom DOM implementation
- "etree": A generic walker for tree implementations exposing an * "etree": A generic walker for tree implementations exposing an
elementtree-like interface (known to work with elementtree-like interface (known to work with ElementTree,
ElementTree, cElementTree and lxml.etree). cElementTree and lxml.etree).
- "lxml": Optimized walker for lxml.etree * "lxml": Optimized walker for lxml.etree
- "genshi": a Genshi stream * "genshi": a Genshi stream
:arg implementation: A module implementing the tree type e.g.
xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
tree type only).
:arg kwargs: keyword arguments passed to the etree walker--for other
walkers, this has no effect
:returns: a TreeWalker class
Implementation: A module implementing the tree type e.g.
xml.etree.ElementTree or cElementTree (Currently applies to the
"etree" tree type only).
""" """
treeType = treeType.lower() treeType = treeType.lower()
@ -73,7 +78,13 @@ def concatenateCharacterTokens(tokens):
def pprint(walker): def pprint(walker):
"""Pretty printer for tree walkers""" """Pretty printer for tree walkers
Takes a TreeWalker instance and pretty prints the output of walking the tree.
:arg walker: a TreeWalker instance
"""
output = [] output = []
indent = 0 indent = 0
for token in concatenateCharacterTokens(walker): for token in concatenateCharacterTokens(walker):

View file

@ -18,16 +18,48 @@ spaceCharacters = "".join(spaceCharacters)
class TreeWalker(object): class TreeWalker(object):
"""Walks a tree yielding tokens
Tokens are dicts that all have a ``type`` field specifying the type of the
token.
"""
def __init__(self, tree): def __init__(self, tree):
"""Creates a TreeWalker
:arg tree: the tree to walk
"""
self.tree = tree self.tree = tree
def __iter__(self): def __iter__(self):
raise NotImplementedError raise NotImplementedError
def error(self, msg): def error(self, msg):
"""Generates an error token with the given message
:arg msg: the error message
:returns: SerializeError token
"""
return {"type": "SerializeError", "data": msg} return {"type": "SerializeError", "data": msg}
def emptyTag(self, namespace, name, attrs, hasChildren=False): def emptyTag(self, namespace, name, attrs, hasChildren=False):
"""Generates an EmptyTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:arg attrs: the attributes of the element as a dict
:arg hasChildren: whether or not to yield a SerializeError because
this tag shouldn't have children
:returns: EmptyTag token
"""
yield {"type": "EmptyTag", "name": name, yield {"type": "EmptyTag", "name": name,
"namespace": namespace, "namespace": namespace,
"data": attrs} "data": attrs}
@ -35,17 +67,61 @@ class TreeWalker(object):
yield self.error("Void element has children") yield self.error("Void element has children")
def startTag(self, namespace, name, attrs): def startTag(self, namespace, name, attrs):
"""Generates a StartTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:arg attrs: the attributes of the element as a dict
:returns: StartTag token
"""
return {"type": "StartTag", return {"type": "StartTag",
"name": name, "name": name,
"namespace": namespace, "namespace": namespace,
"data": attrs} "data": attrs}
def endTag(self, namespace, name): def endTag(self, namespace, name):
"""Generates an EndTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:returns: EndTag token
"""
return {"type": "EndTag", return {"type": "EndTag",
"name": name, "name": name,
"namespace": namespace} "namespace": namespace}
def text(self, data): def text(self, data):
"""Generates SpaceCharacters and Characters tokens
Depending on what's in the data, this generates one or more
``SpaceCharacters`` and ``Characters`` tokens.
For example:
>>> from html5lib.treewalkers.base import TreeWalker
>>> # Give it an empty tree just so it instantiates
>>> walker = TreeWalker([])
>>> list(walker.text(''))
[]
>>> list(walker.text(' '))
[{u'data': ' ', u'type': u'SpaceCharacters'}]
>>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE
[{u'data': ' ', u'type': u'SpaceCharacters'},
{u'data': u'abc', u'type': u'Characters'},
{u'data': u' ', u'type': u'SpaceCharacters'}]
:arg data: the text data
:returns: one or more ``SpaceCharacters`` and ``Characters`` tokens
"""
data = data data = data
middle = data.lstrip(spaceCharacters) middle = data.lstrip(spaceCharacters)
left = data[:len(data) - len(middle)] left = data[:len(data) - len(middle)]
@ -60,18 +136,44 @@ class TreeWalker(object):
yield {"type": "SpaceCharacters", "data": right} yield {"type": "SpaceCharacters", "data": right}
def comment(self, data): def comment(self, data):
"""Generates a Comment token
:arg data: the comment
:returns: Comment token
"""
return {"type": "Comment", "data": data} return {"type": "Comment", "data": data}
def doctype(self, name, publicId=None, systemId=None): def doctype(self, name, publicId=None, systemId=None):
"""Generates a Doctype token
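:arg name: the name of the doctype
:arg publicId: the public identifier of the doctype--can be ``None``
:arg systemId: the system identifier of the doctype--can be ``None``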
:returns: the Doctype token
"""
return {"type": "Doctype", return {"type": "Doctype",
"name": name, "name": name,
"publicId": publicId, "publicId": publicId,
"systemId": systemId} "systemId": systemId}
def entity(self, name): def entity(self, name):
"""Generates an Entity token
:arg name: the entity name
:returns: an Entity token
"""
return {"type": "Entity", "name": name} return {"type": "Entity", "name": name}
def unknown(self, nodeType): def unknown(self, nodeType):
"""Handles unknown node types"""
return self.error("Unknown node type: " + nodeType) return self.error("Unknown node type: " + nodeType)

View file

@ -1,13 +1,6 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
try: from collections import OrderedDict
from collections import OrderedDict
except ImportError:
try:
from ordereddict import OrderedDict
except ImportError:
OrderedDict = dict
import re import re
from six import string_types from six import string_types