[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+ # Match any character set and encoding
+ (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
+ |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
+ # Assume the rest is data
+ ,.*
+ $
+ ''',
+ re.VERBOSE)
+
+
+class Filter(base.Filter):
+ """Sanitizes a token stream of XHTML+MathML+SVG and of inline style attributes."""
+ def __init__(self,
+ source,
+ allowed_elements=allowed_elements,
+ allowed_attributes=allowed_attributes,
+ allowed_css_properties=allowed_css_properties,
+ allowed_css_keywords=allowed_css_keywords,
+ allowed_svg_properties=allowed_svg_properties,
+ allowed_protocols=allowed_protocols,
+ allowed_content_types=allowed_content_types,
+ attr_val_is_uri=attr_val_is_uri,
+ svg_attr_val_allows_ref=svg_attr_val_allows_ref,
+ svg_allow_local_href=svg_allow_local_href):
+ super(Filter, self).__init__(source)
+ self.allowed_elements = allowed_elements
+ self.allowed_attributes = allowed_attributes
+ self.allowed_css_properties = allowed_css_properties
+ self.allowed_css_keywords = allowed_css_keywords
+ self.allowed_svg_properties = allowed_svg_properties
+ self.allowed_protocols = allowed_protocols
+ self.allowed_content_types = allowed_content_types
+ self.attr_val_is_uri = attr_val_is_uri
+ self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
+ self.svg_allow_local_href = svg_allow_local_href
+
def __iter__(self):
- for token in _base.Filter.__iter__(self):
+ for token in base.Filter.__iter__(self):
token = self.sanitize_token(token)
if token:
yield token
+
+ # Sanitize the HTML, escaping all elements not in ALLOWED_ELEMENTS, and
+ # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
+ # attributes are parsed, and only a restricted set, specified by
+ # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, is allowed through.
+ # Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
+ # in ALLOWED_PROTOCOLS are allowed.
+ #
+ # sanitize_html('<script> do_nasty_stuff() </script>')
+ # => &lt;script> do_nasty_stuff() &lt;/script>
+ # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
+ # => <a>Click here for $100</a>
+ def sanitize_token(self, token):
+
+ # accommodate filters which use token_type differently
+ token_type = token["type"]
+ if token_type in ("StartTag", "EndTag", "EmptyTag"):
+ name = token["name"]
+ namespace = token["namespace"]
+ if ((namespace, name) in self.allowed_elements or
+ (namespace is None and
+ (namespaces["html"], name) in self.allowed_elements)):
+ return self.allowed_token(token)
+ else:
+ return self.disallowed_token(token)
+ elif token_type == "Comment":
+ pass
+ else:
+ return token
+
+ def allowed_token(self, token):
+ if "data" in token:
+ attrs = token["data"]
+ attr_names = set(attrs.keys())
+
+ # Remove forbidden attributes
+ for to_remove in (attr_names - self.allowed_attributes):
+ del token["data"][to_remove]
+ attr_names.remove(to_remove)
+
+ # Remove attributes with disallowed URL values
+ for attr in (attr_names & self.attr_val_is_uri):
+ assert attr in attrs
+ # I don't have a clue where this regexp comes from or why it matches those
+ # characters, nor why we call unescape. I just know it's always been here.
+ # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
+ # this will do is remove *more* than it otherwise would.
+ val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '',
+ unescape(attrs[attr])).lower()
+ # remove replacement characters from unescaped characters
+ val_unescaped = val_unescaped.replace("\ufffd", "")
+ try:
+ uri = urlparse.urlparse(val_unescaped)
+ except ValueError:
+ uri = None
+ del attrs[attr]
+ if uri and uri.scheme:
+ if uri.scheme not in self.allowed_protocols:
+ del attrs[attr]
+ if uri.scheme == 'data':
+ m = data_content_type.match(uri.path)
+ if not m:
+ del attrs[attr]
+ elif m.group('content_type') not in self.allowed_content_types:
+ del attrs[attr]
+
+ for attr in self.svg_attr_val_allows_ref:
+ if attr in attrs:
+ attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+ ' ',
+ unescape(attrs[attr]))
+ if (token["name"] in self.svg_allow_local_href and
+ (namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*',
+ attrs[(namespaces['xlink'], 'href')])):
+ del attrs[(namespaces['xlink'], 'href')]
+ if (None, 'style') in attrs:
+ attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
+ token["data"] = attrs
+ return token
+
+ def disallowed_token(self, token):
+ token_type = token["type"]
+ if token_type == "EndTag":
+ token["data"] = "%s>" % token["name"]
+ elif token["data"]:
+ assert token_type in ("StartTag", "EmptyTag")
+ attrs = []
+ for (ns, name), v in token["data"].items():
+ attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
+ token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
+ else:
+ token["data"] = "<%s>" % token["name"]
+ if token.get("selfClosing"):
+ token["data"] = token["data"][:-1] + "/>"
+
+ token["type"] = "Characters"
+
+ del token["name"]
+ return token
+
+ def sanitize_css(self, style):
+ # disallow urls
+ style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+
+ # gauntlet
+ if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
+ return ''
+ if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+ return ''
+
+ clean = []
+ for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
+ if not value:
+ continue
+ if prop.lower() in self.allowed_css_properties:
+ clean.append(prop + ': ' + value + ';')
+ elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
+ 'padding']:
+ for keyword in value.split():
+ if keyword not in self.allowed_css_keywords and \
+ not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
+ break
+ else:
+ clean.append(prop + ': ' + value + ';')
+ elif prop.lower() in self.allowed_svg_properties:
+ clean.append(prop + ': ' + value + ';')
+
+ return ' '.join(clean)
diff --git a/lib/html5lib/filters/whitespace.py b/lib/html5lib/filters/whitespace.py
index dfc60eeb..89210528 100644
--- a/lib/html5lib/filters/whitespace.py
+++ b/lib/html5lib/filters/whitespace.py
@@ -2,20 +2,20 @@ from __future__ import absolute_import, division, unicode_literals
import re
-from . import _base
+from . import base
from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
-class Filter(_base.Filter):
+class Filter(base.Filter):
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
def __iter__(self):
preserve = 0
- for token in _base.Filter.__iter__(self):
+ for token in base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag" \
and (preserve or token["name"] in self.spacePreserveElements):
diff --git a/lib/html5lib/html5parser.py b/lib/html5lib/html5parser.py
index c2c30783..2abd63e4 100644
--- a/lib/html5lib/html5parser.py
+++ b/lib/html5lib/html5parser.py
@@ -1,39 +1,44 @@
from __future__ import absolute_import, division, unicode_literals
-from six import with_metaclass
+from six import with_metaclass, viewkeys, PY3
import types
-from . import inputstream
-from . import tokenizer
+try:
+ from collections import OrderedDict
+except ImportError:
+ from ordereddict import OrderedDict
+
+from . import _inputstream
+from . import _tokenizer
from . import treebuilders
-from .treebuilders._base import Marker
+from .treebuilders.base import Marker
-from . import utils
-from . import constants
-from .constants import spaceCharacters, asciiUpper2Lower
-from .constants import specialElements
-from .constants import headingElements
-from .constants import cdataElements, rcdataElements
-from .constants import tokenTypes, ReparseException, namespaces
-from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
-from .constants import adjustForeignAttributes as adjustForeignAttributesMap
-from .constants import E
+from . import _utils
+from .constants import (
+ spaceCharacters, asciiUpper2Lower,
+ specialElements, headingElements, cdataElements, rcdataElements,
+ tokenTypes, tagTokenTypes,
+ namespaces,
+ htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
+ adjustForeignAttributes as adjustForeignAttributesMap,
+ adjustMathMLAttributes, adjustSVGAttributes,
+ E,
+ ReparseException
+)
-def parse(doc, treebuilder="etree", encoding=None,
- namespaceHTMLElements=True):
+def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse a string or file-like object into a tree"""
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
- return p.parse(doc, encoding=encoding)
+ return p.parse(doc, **kwargs)
-def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
- namespaceHTMLElements=True):
+def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
- return p.parseFragment(doc, container=container, encoding=encoding)
+ return p.parseFragment(doc, container=container, **kwargs)
def method_decorator_metaclass(function):
@@ -52,18 +57,13 @@ class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""
- def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
- strict=False, namespaceHTMLElements=True, debug=False):
+ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
"""
strict - raise an exception when a parse error is encountered
tree - a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)
-
- tokenizer - a class that provides a stream of tokens to the treebuilder.
- This may be replaced for e.g. a sanitizer which converts some tags to
- text
"""
# Raise an exception on the first error encountered
@@ -72,29 +72,24 @@ class HTMLParser(object):
if tree is None:
tree = treebuilders.getTreeBuilder("etree")
self.tree = tree(namespaceHTMLElements)
- self.tokenizer_class = tokenizer
self.errors = []
self.phases = dict([(name, cls(self, self.tree)) for name, cls in
getPhases(debug).items()])
- def _parse(self, stream, innerHTML=False, container="div",
- encoding=None, parseMeta=True, useChardet=True, **kwargs):
+ def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
self.innerHTMLMode = innerHTML
self.container = container
- self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
- parseMeta=parseMeta,
- useChardet=useChardet,
- parser=self, **kwargs)
+ self.scripting = scripting
+ self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
self.reset()
- while True:
- try:
- self.mainLoop()
- break
- except ReparseException:
- self.reset()
+ try:
+ self.mainLoop()
+ except ReparseException:
+ self.reset()
+ self.mainLoop()
def reset(self):
self.tree.reset()
@@ -121,7 +116,7 @@ class HTMLParser(object):
self.phase.insertHtmlElement()
self.resetInsertionMode()
else:
- self.innerHTML = False
+ self.innerHTML = False # pylint:disable=redefined-variable-type
self.phase = self.phases["initial"]
self.lastPhase = None
@@ -139,7 +134,7 @@ class HTMLParser(object):
"""
if not hasattr(self, 'tokenizer'):
return None
- return self.tokenizer.stream.charEncoding[0]
+ return self.tokenizer.stream.charEncoding[0].name
def isHTMLIntegrationPoint(self, element):
if (element.name == "annotation-xml" and
@@ -164,8 +159,10 @@ class HTMLParser(object):
ParseErrorToken = tokenTypes["ParseError"]
for token in self.normalizedTokens():
+ prev_token = None
new_token = token
while new_token is not None:
+ prev_token = new_token
currentNode = self.tree.openElements[-1] if self.tree.openElements else None
currentNodeNamespace = currentNode.namespace if currentNode else None
currentNodeName = currentNode.name if currentNode else None
@@ -184,6 +181,7 @@ class HTMLParser(object):
type in (CharactersToken, SpaceCharactersToken))) or
(currentNodeNamespace == namespaces["mathml"] and
currentNodeName == "annotation-xml" and
+ type == StartTagToken and
token["name"] == "svg") or
(self.isHTMLIntegrationPoint(currentNode) and
type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
@@ -204,10 +202,10 @@ class HTMLParser(object):
elif type == DoctypeToken:
new_token = phase.processDoctype(new_token)
- if (type == StartTagToken and token["selfClosing"]
- and not token["selfClosingAcknowledged"]):
+ if (type == StartTagToken and prev_token["selfClosing"] and
+ not prev_token["selfClosingAcknowledged"]):
self.parseError("non-void-element-with-trailing-solidus",
- {"name": token["name"]})
+ {"name": prev_token["name"]})
# When the loop finishes it's EOF
reprocess = True
@@ -222,7 +220,7 @@ class HTMLParser(object):
for token in self.tokenizer:
yield self.normalizeToken(token)
- def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
+ def parse(self, stream, *args, **kwargs):
"""Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
@@ -231,13 +229,13 @@ class HTMLParser(object):
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
+
+ scripting - treat noscript elements as if javascript was turned on
"""
- self._parse(stream, innerHTML=False, encoding=encoding,
- parseMeta=parseMeta, useChardet=useChardet)
+ self._parse(stream, False, None, *args, **kwargs)
return self.tree.getDocument()
- def parseFragment(self, stream, container="div", encoding=None,
- parseMeta=False, useChardet=True):
+ def parseFragment(self, stream, *args, **kwargs):
"""Parse a HTML fragment into a well-formed tree fragment
container - name of the element we're setting the innerHTML property
@@ -249,12 +247,16 @@ class HTMLParser(object):
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
+
+ scripting - treat noscript elements as if javascript was turned on
"""
- self._parse(stream, True, container=container, encoding=encoding)
+ self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment()
- def parseError(self, errorcode="XXX-undefined-error", datavars={}):
+ def parseError(self, errorcode="XXX-undefined-error", datavars=None):
# XXX The idea is to make errorcode mandatory.
+ if datavars is None:
+ datavars = {}
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
if self.strict:
raise ParseError(E[errorcode] % datavars)
@@ -263,98 +265,25 @@ class HTMLParser(object):
""" HTML5 specific normalizations to the token stream """
if token["type"] == tokenTypes["StartTag"]:
- token["data"] = dict(token["data"][::-1])
+ raw = token["data"]
+ token["data"] = OrderedDict(raw)
+ if len(raw) > len(token["data"]):
+ # duplicate attribute names collapsed (last one won); re-apply reversed so the first occurrence wins
+ token["data"].update(raw[::-1])
return token
def adjustMathMLAttributes(self, token):
- replacements = {"definitionurl": "definitionURL"}
- for k, v in replacements.items():
- if k in token["data"]:
- token["data"][v] = token["data"][k]
- del token["data"][k]
+ adjust_attributes(token, adjustMathMLAttributes)
def adjustSVGAttributes(self, token):
- replacements = {
- "attributename": "attributeName",
- "attributetype": "attributeType",
- "basefrequency": "baseFrequency",
- "baseprofile": "baseProfile",
- "calcmode": "calcMode",
- "clippathunits": "clipPathUnits",
- "contentscripttype": "contentScriptType",
- "contentstyletype": "contentStyleType",
- "diffuseconstant": "diffuseConstant",
- "edgemode": "edgeMode",
- "externalresourcesrequired": "externalResourcesRequired",
- "filterres": "filterRes",
- "filterunits": "filterUnits",
- "glyphref": "glyphRef",
- "gradienttransform": "gradientTransform",
- "gradientunits": "gradientUnits",
- "kernelmatrix": "kernelMatrix",
- "kernelunitlength": "kernelUnitLength",
- "keypoints": "keyPoints",
- "keysplines": "keySplines",
- "keytimes": "keyTimes",
- "lengthadjust": "lengthAdjust",
- "limitingconeangle": "limitingConeAngle",
- "markerheight": "markerHeight",
- "markerunits": "markerUnits",
- "markerwidth": "markerWidth",
- "maskcontentunits": "maskContentUnits",
- "maskunits": "maskUnits",
- "numoctaves": "numOctaves",
- "pathlength": "pathLength",
- "patterncontentunits": "patternContentUnits",
- "patterntransform": "patternTransform",
- "patternunits": "patternUnits",
- "pointsatx": "pointsAtX",
- "pointsaty": "pointsAtY",
- "pointsatz": "pointsAtZ",
- "preservealpha": "preserveAlpha",
- "preserveaspectratio": "preserveAspectRatio",
- "primitiveunits": "primitiveUnits",
- "refx": "refX",
- "refy": "refY",
- "repeatcount": "repeatCount",
- "repeatdur": "repeatDur",
- "requiredextensions": "requiredExtensions",
- "requiredfeatures": "requiredFeatures",
- "specularconstant": "specularConstant",
- "specularexponent": "specularExponent",
- "spreadmethod": "spreadMethod",
- "startoffset": "startOffset",
- "stddeviation": "stdDeviation",
- "stitchtiles": "stitchTiles",
- "surfacescale": "surfaceScale",
- "systemlanguage": "systemLanguage",
- "tablevalues": "tableValues",
- "targetx": "targetX",
- "targety": "targetY",
- "textlength": "textLength",
- "viewbox": "viewBox",
- "viewtarget": "viewTarget",
- "xchannelselector": "xChannelSelector",
- "ychannelselector": "yChannelSelector",
- "zoomandpan": "zoomAndPan"
- }
- for originalName in list(token["data"].keys()):
- if originalName in replacements:
- svgName = replacements[originalName]
- token["data"][svgName] = token["data"][originalName]
- del token["data"][originalName]
+ adjust_attributes(token, adjustSVGAttributes)
def adjustForeignAttributes(self, token):
- replacements = adjustForeignAttributesMap
-
- for originalName in token["data"].keys():
- if originalName in replacements:
- foreignName = replacements[originalName]
- token["data"][foreignName] = token["data"][originalName]
- del token["data"][originalName]
+ adjust_attributes(token, adjustForeignAttributesMap)
def reparseTokenNormal(self, token):
+ # pylint:disable=unused-argument
self.parser.phase()
def resetInsertionMode(self):
@@ -419,11 +348,12 @@ class HTMLParser(object):
self.phase = self.phases["text"]
+@_utils.memoize
def getPhases(debug):
def log(function):
"""Logger that records which phase processes each token"""
type_names = dict((value, key) for key, value in
- constants.tokenTypes.items())
+ tokenTypes.items())
def wrapped(self, *args, **kwargs):
if function.__name__.startswith("process") and len(args) > 0:
@@ -432,7 +362,7 @@ def getPhases(debug):
info = {"type": type_names[token['type']]}
except:
raise
- if token['type'] in constants.tagTokenTypes:
+ if token['type'] in tagTokenTypes:
info["name"] = token['name']
self.parser.log.append((self.parser.tokenizer.state.__name__,
@@ -451,6 +381,7 @@ def getPhases(debug):
else:
return type
+ # pylint:disable=unused-argument
class Phase(with_metaclass(getMetaclass(debug, log))):
"""Base class for helper object that implements each phase of processing
"""
@@ -517,77 +448,76 @@ def getPhases(debug):
if publicId != "":
publicId = publicId.translate(asciiUpper2Lower)
- if (not correct or token["name"] != "html"
- or publicId.startswith(
- ("+//silmaril//dtd html pro v0r11 19970101//",
- "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
- "-//as//dtd html 3.0 aswedit + extensions//",
- "-//ietf//dtd html 2.0 level 1//",
- "-//ietf//dtd html 2.0 level 2//",
- "-//ietf//dtd html 2.0 strict level 1//",
- "-//ietf//dtd html 2.0 strict level 2//",
- "-//ietf//dtd html 2.0 strict//",
- "-//ietf//dtd html 2.0//",
- "-//ietf//dtd html 2.1e//",
- "-//ietf//dtd html 3.0//",
- "-//ietf//dtd html 3.2 final//",
- "-//ietf//dtd html 3.2//",
- "-//ietf//dtd html 3//",
- "-//ietf//dtd html level 0//",
- "-//ietf//dtd html level 1//",
- "-//ietf//dtd html level 2//",
- "-//ietf//dtd html level 3//",
- "-//ietf//dtd html strict level 0//",
- "-//ietf//dtd html strict level 1//",
- "-//ietf//dtd html strict level 2//",
- "-//ietf//dtd html strict level 3//",
- "-//ietf//dtd html strict//",
- "-//ietf//dtd html//",
- "-//metrius//dtd metrius presentational//",
- "-//microsoft//dtd internet explorer 2.0 html strict//",
- "-//microsoft//dtd internet explorer 2.0 html//",
- "-//microsoft//dtd internet explorer 2.0 tables//",
- "-//microsoft//dtd internet explorer 3.0 html strict//",
- "-//microsoft//dtd internet explorer 3.0 html//",
- "-//microsoft//dtd internet explorer 3.0 tables//",
- "-//netscape comm. corp.//dtd html//",
- "-//netscape comm. corp.//dtd strict html//",
- "-//o'reilly and associates//dtd html 2.0//",
- "-//o'reilly and associates//dtd html extended 1.0//",
- "-//o'reilly and associates//dtd html extended relaxed 1.0//",
- "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
- "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
- "-//spyglass//dtd html 2.0 extended//",
- "-//sq//dtd html 2.0 hotmetal + extensions//",
- "-//sun microsystems corp.//dtd hotjava html//",
- "-//sun microsystems corp.//dtd hotjava strict html//",
- "-//w3c//dtd html 3 1995-03-24//",
- "-//w3c//dtd html 3.2 draft//",
- "-//w3c//dtd html 3.2 final//",
- "-//w3c//dtd html 3.2//",
- "-//w3c//dtd html 3.2s draft//",
- "-//w3c//dtd html 4.0 frameset//",
- "-//w3c//dtd html 4.0 transitional//",
- "-//w3c//dtd html experimental 19960712//",
- "-//w3c//dtd html experimental 970421//",
- "-//w3c//dtd w3 html//",
- "-//w3o//dtd w3 html 3.0//",
- "-//webtechs//dtd mozilla html 2.0//",
- "-//webtechs//dtd mozilla html//"))
- or publicId in
- ("-//w3o//dtd w3 html strict 3.0//en//",
- "-/w3c/dtd html 4.0 transitional/en",
- "html")
- or publicId.startswith(
- ("-//w3c//dtd html 4.01 frameset//",
- "-//w3c//dtd html 4.01 transitional//")) and
- systemId is None
- or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
+ if (not correct or token["name"] != "html" or
+ publicId.startswith(
+ ("+//silmaril//dtd html pro v0r11 19970101//",
+ "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
+ "-//as//dtd html 3.0 aswedit + extensions//",
+ "-//ietf//dtd html 2.0 level 1//",
+ "-//ietf//dtd html 2.0 level 2//",
+ "-//ietf//dtd html 2.0 strict level 1//",
+ "-//ietf//dtd html 2.0 strict level 2//",
+ "-//ietf//dtd html 2.0 strict//",
+ "-//ietf//dtd html 2.0//",
+ "-//ietf//dtd html 2.1e//",
+ "-//ietf//dtd html 3.0//",
+ "-//ietf//dtd html 3.2 final//",
+ "-//ietf//dtd html 3.2//",
+ "-//ietf//dtd html 3//",
+ "-//ietf//dtd html level 0//",
+ "-//ietf//dtd html level 1//",
+ "-//ietf//dtd html level 2//",
+ "-//ietf//dtd html level 3//",
+ "-//ietf//dtd html strict level 0//",
+ "-//ietf//dtd html strict level 1//",
+ "-//ietf//dtd html strict level 2//",
+ "-//ietf//dtd html strict level 3//",
+ "-//ietf//dtd html strict//",
+ "-//ietf//dtd html//",
+ "-//metrius//dtd metrius presentational//",
+ "-//microsoft//dtd internet explorer 2.0 html strict//",
+ "-//microsoft//dtd internet explorer 2.0 html//",
+ "-//microsoft//dtd internet explorer 2.0 tables//",
+ "-//microsoft//dtd internet explorer 3.0 html strict//",
+ "-//microsoft//dtd internet explorer 3.0 html//",
+ "-//microsoft//dtd internet explorer 3.0 tables//",
+ "-//netscape comm. corp.//dtd html//",
+ "-//netscape comm. corp.//dtd strict html//",
+ "-//o'reilly and associates//dtd html 2.0//",
+ "-//o'reilly and associates//dtd html extended 1.0//",
+ "-//o'reilly and associates//dtd html extended relaxed 1.0//",
+ "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
+ "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
+ "-//spyglass//dtd html 2.0 extended//",
+ "-//sq//dtd html 2.0 hotmetal + extensions//",
+ "-//sun microsystems corp.//dtd hotjava html//",
+ "-//sun microsystems corp.//dtd hotjava strict html//",
+ "-//w3c//dtd html 3 1995-03-24//",
+ "-//w3c//dtd html 3.2 draft//",
+ "-//w3c//dtd html 3.2 final//",
+ "-//w3c//dtd html 3.2//",
+ "-//w3c//dtd html 3.2s draft//",
+ "-//w3c//dtd html 4.0 frameset//",
+ "-//w3c//dtd html 4.0 transitional//",
+ "-//w3c//dtd html experimental 19960712//",
+ "-//w3c//dtd html experimental 970421//",
+ "-//w3c//dtd w3 html//",
+ "-//w3o//dtd w3 html 3.0//",
+ "-//webtechs//dtd mozilla html 2.0//",
+ "-//webtechs//dtd mozilla html//")) or
+ publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
+ "-/w3c/dtd html 4.0 transitional/en",
+ "html") or
+ publicId.startswith(
+ ("-//w3c//dtd html 4.01 frameset//",
+ "-//w3c//dtd html 4.01 transitional//")) and
+ systemId is None or
+ systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
self.parser.compatMode = "quirks"
elif (publicId.startswith(
("-//w3c//dtd xhtml 1.0 frameset//",
- "-//w3c//dtd xhtml 1.0 transitional//"))
- or publicId.startswith(
+ "-//w3c//dtd xhtml 1.0 transitional//")) or
+ publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is not None):
@@ -660,13 +590,13 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
(("head", "body", "html", "br"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
@@ -706,10 +636,11 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("title", self.startTagTitle),
- (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
+ (("noframes", "style"), self.startTagNoFramesStyle),
+ ("noscript", self.startTagNoscript),
("script", self.startTagScript),
(("base", "basefont", "bgsound", "command", "link"),
self.startTagBaseLinkCommand),
@@ -718,7 +649,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
- self. endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
("head", self.endTagHead),
(("br", "html", "body"), self.endTagHtmlBodyBr)
])
@@ -760,18 +691,25 @@ def getPhases(debug):
# the abstract Unicode string, and just use the
# ContentAttrParser on that, but using UTF-8 allows all chars
# to be encoded and as a ASCII-superset works.
- data = inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
- parser = inputstream.ContentAttrParser(data)
+ data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
+ parser = _inputstream.ContentAttrParser(data)
codec = parser.parse()
self.parser.tokenizer.stream.changeEncoding(codec)
def startTagTitle(self, token):
self.parser.parseRCDataRawtext(token, "RCDATA")
- def startTagNoScriptNoFramesStyle(self, token):
+ def startTagNoFramesStyle(self, token):
# Need to decide whether to implement the scripting-disabled case
self.parser.parseRCDataRawtext(token, "RAWTEXT")
+ def startTagNoscript(self, token):
+ if self.parser.scripting:
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
+ else:
+ self.tree.insertElement(token)
+ self.parser.phase = self.parser.phases["inHeadNoscript"]
+
def startTagScript(self, token):
self.tree.insertElement(token)
self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
@@ -797,15 +735,75 @@ def getPhases(debug):
def anythingElse(self):
self.endTagHead(impliedTagToken("head"))
- # XXX If we implement a parser for which scripting is disabled we need to
- # implement this phase.
- #
- # class InHeadNoScriptPhase(Phase):
+ class InHeadNoscriptPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = _utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
+ (("head", "noscript"), self.startTagHeadNoscript),
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = _utils.MethodDispatcher([
+ ("noscript", self.endTagNoscript),
+ ("br", self.endTagBr),
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ def processEOF(self):
+ self.parser.parseError("eof-in-head-noscript")
+ self.anythingElse()
+ return True
+
+ def processComment(self, token):
+ return self.parser.phases["inHead"].processComment(token)
+
+ def processCharacters(self, token):
+ self.parser.parseError("char-in-head-noscript")
+ self.anythingElse()
+ return token
+
+ def processSpaceCharacters(self, token):
+ return self.parser.phases["inHead"].processSpaceCharacters(token)
+
+ def startTagHtml(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def startTagBaseLinkCommand(self, token):
+ return self.parser.phases["inHead"].processStartTag(token)
+
+ def startTagHeadNoscript(self, token):
+ self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
+
+ def startTagOther(self, token):
+ self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
+ self.anythingElse()
+ return token
+
+ def endTagNoscript(self, token):
+ node = self.parser.tree.openElements.pop()
+ assert node.name == "noscript", "Expected noscript got %s" % node.name
+ self.parser.phase = self.parser.phases["inHead"]
+
+ def endTagBr(self, token):
+ self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
+ self.anythingElse()
+ return token
+
+ def endTagOther(self, token):
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+ def anythingElse(self):
+ # Caller must raise parse error first!
+ self.endTagNoscript(impliedTagToken("noscript"))
+
class AfterHeadPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
@@ -815,8 +813,8 @@ def getPhases(debug):
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"),
- self.endTagHtmlBodyBr)])
+ self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
+ self.endTagHtmlBodyBr)])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
@@ -874,10 +872,10 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- # Keep a ref to this for special handling of whitespace in
- self.processSpaceCharactersNonPre = self.processSpaceCharacters
+ # Set this to the default handler
+ self.processSpaceCharacters = self.processSpaceCharactersNonPre
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta",
"script", "style", "title"),
@@ -885,7 +883,7 @@ def getPhases(debug):
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "details",
- "details", "dir", "div", "dl", "fieldset", "figcaption", "figure",
+ "dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
"section", "summary", "ul"),
self.startTagCloseP),
@@ -911,7 +909,8 @@ def getPhases(debug):
("isindex", self.startTagIsIndex),
("textarea", self.startTagTextarea),
("iframe", self.startTagIFrame),
- (("noembed", "noframes", "noscript"), self.startTagRawtext),
+ ("noscript", self.startTagNoscript),
+ (("noembed", "noframes"), self.startTagRawtext),
("select", self.startTagSelect),
(("rp", "rt"), self.startTagRpRt),
(("option", "optgroup"), self.startTagOpt),
@@ -923,7 +922,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
("body", self.endTagBody),
("html", self.endTagHtml),
(("address", "article", "aside", "blockquote", "button", "center",
@@ -942,17 +941,9 @@ def getPhases(debug):
self.endTagHandler.default = self.endTagOther
def isMatchingFormattingElement(self, node1, node2):
- if node1.name != node2.name or node1.namespace != node2.namespace:
- return False
- elif len(node1.attributes) != len(node2.attributes):
- return False
- else:
- attributes1 = sorted(node1.attributes.items())
- attributes2 = sorted(node2.attributes.items())
- for attr1, attr2 in zip(attributes1, attributes2):
- if attr1 != attr2:
- return False
- return True
+ return (node1.name == node2.name and
+ node1.namespace == node2.namespace and
+ node1.attributes == node2.attributes)
# helper
def addFormattingElement(self, token):
@@ -988,8 +979,8 @@ def getPhases(debug):
data = token["data"]
self.processSpaceCharacters = self.processSpaceCharactersNonPre
if (data.startswith("\n") and
- self.tree.openElements[-1].name in ("pre", "listing", "textarea")
- and not self.tree.openElements[-1].hasContent()):
+ self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
+ not self.tree.openElements[-1].hasContent()):
data = data[1:]
if data:
self.tree.reconstructActiveFormattingElements()
@@ -1007,7 +998,7 @@ def getPhases(debug):
for char in token["data"]])):
self.parser.framesetOK = False
- def processSpaceCharacters(self, token):
+ def processSpaceCharactersNonPre(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"])
@@ -1016,8 +1007,8 @@ def getPhases(debug):
def startTagBody(self, token):
self.parser.parseError("unexpected-start-tag", {"name": "body"})
- if (len(self.tree.openElements) == 1
- or self.tree.openElements[1].name != "body"):
+ if (len(self.tree.openElements) == 1 or
+ self.tree.openElements[1].name != "body"):
assert self.parser.innerHTML
else:
self.parser.framesetOK = False
@@ -1232,6 +1223,12 @@ def getPhases(debug):
self.parser.framesetOK = False
self.startTagRawtext(token)
+ def startTagNoscript(self, token):
+ if self.parser.scripting:
+ self.startTagRawtext(token)
+ else:
+ self.startTagOther(token)
+
def startTagRawtext(self, token):
"""iframe, noembed noframes, noscript(if scripting enabled)"""
self.parser.parseRCDataRawtext(token, "RAWTEXT")
@@ -1595,9 +1592,9 @@ def getPhases(debug):
class TextPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([])
+ self.startTagHandler = _utils.MethodDispatcher([])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
("script", self.endTagScript)])
self.endTagHandler.default = self.endTagOther
@@ -1629,7 +1626,7 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("caption", self.startTagCaption),
("colgroup", self.startTagColgroup),
@@ -1643,7 +1640,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "tbody", "td",
"tfoot", "th", "thead", "tr"), self.endTagIgnore)
@@ -1820,14 +1817,14 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableElement)
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
("caption", self.endTagCaption),
("table", self.endTagTable),
(("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
@@ -1892,13 +1889,13 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("col", self.startTagCol)
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
("colgroup", self.endTagColgroup),
("col", self.endTagCol)
])
@@ -1926,6 +1923,7 @@ def getPhases(debug):
def startTagCol(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
def startTagOther(self, token):
ignoreEndTag = self.ignoreEndTagColgroup()
@@ -1955,7 +1953,7 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("tr", self.startTagTr),
(("td", "th"), self.startTagTableCell),
@@ -1964,7 +1962,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "td", "th",
@@ -2053,7 +2051,7 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("td", "th"), self.startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead",
@@ -2061,7 +2059,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
("tr", self.endTagTr),
("table", self.endTagTable),
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
@@ -2142,14 +2140,14 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableOther)
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
(("td", "th"), self.endTagTableCell),
(("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
(("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
@@ -2218,7 +2216,7 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("option", self.startTagOption),
("optgroup", self.startTagOptgroup),
@@ -2228,7 +2226,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
("option", self.endTagOption),
("optgroup", self.endTagOptgroup),
("select", self.endTagSelect)
@@ -2318,13 +2316,13 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
self.startTagTable)
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
self.endTagTable)
])
@@ -2445,7 +2443,7 @@ def getPhases(debug):
def processEndTag(self, token):
nodeIndex = len(self.tree.openElements) - 1
node = self.tree.openElements[-1]
- if node.name != token["name"]:
+ if node.name.translate(asciiUpper2Lower) != token["name"]:
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
while True:
@@ -2472,12 +2470,12 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml)
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
+ self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
@@ -2520,7 +2518,7 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("frameset", self.startTagFrameset),
("frame", self.startTagFrame),
@@ -2528,7 +2526,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
("frameset", self.endTagFrameset)
])
self.endTagHandler.default = self.endTagOther
@@ -2577,13 +2575,13 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("noframes", self.startTagNoframes)
])
self.startTagHandler.default = self.startTagOther
- self.endTagHandler = utils.MethodDispatcher([
+ self.endTagHandler = _utils.MethodDispatcher([
("html", self.endTagHtml)
])
self.endTagHandler.default = self.endTagOther
@@ -2613,7 +2611,7 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml)
])
self.startTagHandler.default = self.startTagOther
@@ -2651,7 +2649,7 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
- self.startTagHandler = utils.MethodDispatcher([
+ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("noframes", self.startTagNoFrames)
])
@@ -2682,13 +2680,14 @@ def getPhases(debug):
def processEndTag(self, token):
self.parser.parseError("expected-eof-but-got-end-tag",
{"name": token["name"]})
+ # pylint:enable=unused-argument
return {
"initial": InitialPhase,
"beforeHtml": BeforeHtmlPhase,
"beforeHead": BeforeHeadPhase,
"inHead": InHeadPhase,
- # XXX "inHeadNoscript": InHeadNoScriptPhase,
+ "inHeadNoscript": InHeadNoscriptPhase,
"afterHead": AfterHeadPhase,
"inBody": InBodyPhase,
"text": TextPhase,
@@ -2711,6 +2710,16 @@ def getPhases(debug):
}
+def adjust_attributes(token, replacements):
+ if PY3 or _utils.PY27:
+ needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
+ else:
+ needs_adjustment = frozenset(token['data']) & frozenset(replacements)
+ if needs_adjustment:
+ token['data'] = OrderedDict((replacements.get(k, k), v)
+ for k, v in token['data'].items())
+
+
def impliedTagToken(name, type="EndTag", attributes=None,
selfClosing=False):
if attributes is None:
diff --git a/lib/html5lib/sanitizer.py b/lib/html5lib/sanitizer.py
deleted file mode 100644
index b714e8c9..00000000
--- a/lib/html5lib/sanitizer.py
+++ /dev/null
@@ -1,300 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-import re
-from xml.sax.saxutils import escape, unescape
-from six.moves import urllib_parse as urlparse
-
-from .tokenizer import HTMLTokenizer
-from .constants import tokenTypes
-
-
-content_type_rgx = re.compile(r'''
- ^
- # Match a content type <application>/<type>
- (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
- # Match any character set and encoding
- (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
- |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
- # Assume the rest is data
- ,.*
- $
- ''',
- re.VERBOSE)
-
-
-class HTMLSanitizerMixin(object):
- """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
-
- acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
- 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
- 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
- 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
- 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
- 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
- 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
- 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
- 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
- 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
- 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
- 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
- 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
-
- mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
- 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
- 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
- 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
- 'munderover', 'none']
-
- svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
- 'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
- 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
- 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
- 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
- 'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
-
- acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
- 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
- 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
- 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
- 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
- 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
- 'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
- 'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
- 'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
- 'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
- 'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
- 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
- 'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
- 'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
- 'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
- 'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
- 'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
- 'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
- 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
- 'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
- 'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
- 'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
- 'width', 'wrap', 'xml:lang']
-
- mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
- 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
- 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
- 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
- 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
- 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
- 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
- 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
- 'xlink:type', 'xmlns', 'xmlns:xlink']
-
- svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
- 'arabic-form', 'ascent', 'attributeName', 'attributeType',
- 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
- 'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
- 'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
- 'fill-opacity', 'fill-rule', 'font-family', 'font-size',
- 'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
- 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
- 'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
- 'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
- 'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
- 'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
- 'opacity', 'orient', 'origin', 'overline-position',
- 'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
- 'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
- 'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
- 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
- 'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
- 'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
- 'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
- 'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
- 'transform', 'type', 'u1', 'u2', 'underline-position',
- 'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
- 'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
- 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
- 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
- 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
- 'y1', 'y2', 'zoomAndPan']
-
- attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
- 'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']
-
- svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
- 'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
- 'mask', 'stroke']
-
- svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
- 'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
- 'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
- 'set', 'use']
-
- acceptable_css_properties = ['azimuth', 'background-color',
- 'border-bottom-color', 'border-collapse', 'border-color',
- 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
- 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
- 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
- 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
- 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
- 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
- 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
- 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
- 'white-space', 'width']
-
- acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
- 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
- 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
- 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
- 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
- 'transparent', 'underline', 'white', 'yellow']
-
- acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
- 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
- 'stroke-opacity']
-
- acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
- 'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
- 'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
- 'ssh', 'sftp', 'rtsp', 'afs', 'data']
-
- acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
-
- # subclasses may define their own versions of these constants
- allowed_elements = acceptable_elements + mathml_elements + svg_elements
- allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
- allowed_css_properties = acceptable_css_properties
- allowed_css_keywords = acceptable_css_keywords
- allowed_svg_properties = acceptable_svg_properties
- allowed_protocols = acceptable_protocols
- allowed_content_types = acceptable_content_types
-
- # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
- # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
- # attributes are parsed, and a restricted set, # specified by
- # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
- # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
- # in ALLOWED_PROTOCOLS are allowed.
- #
- # sanitize_html('<script> do_nasty_stuff() </script>')
- # => &lt;script> do_nasty_stuff() &lt;/script>
- # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
- # => <a>Click here for $100</a>
- def sanitize_token(self, token):
-
- # accommodate filters which use token_type differently
- token_type = token["type"]
- if token_type in list(tokenTypes.keys()):
- token_type = tokenTypes[token_type]
-
- if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
- tokenTypes["EmptyTag"]):
- if token["name"] in self.allowed_elements:
- return self.allowed_token(token, token_type)
- else:
- return self.disallowed_token(token, token_type)
- elif token_type == tokenTypes["Comment"]:
- pass
- else:
- return token
-
- def allowed_token(self, token, token_type):
- if "data" in token:
- attrs = dict([(name, val) for name, val in
- token["data"][::-1]
- if name in self.allowed_attributes])
- for attr in self.attr_val_is_uri:
- if attr not in attrs:
- continue
- val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
- unescape(attrs[attr])).lower()
- # remove replacement characters from unescaped characters
- val_unescaped = val_unescaped.replace("\ufffd", "")
- try:
- uri = urlparse.urlparse(val_unescaped)
- except ValueError:
- uri = None
- del attrs[attr]
- if uri and uri.scheme:
- if uri.scheme not in self.allowed_protocols:
- del attrs[attr]
- if uri.scheme == 'data':
- m = content_type_rgx.match(uri.path)
- if not m:
- del attrs[attr]
- elif m.group('content_type') not in self.allowed_content_types:
- del attrs[attr]
-
- for attr in self.svg_attr_val_allows_ref:
- if attr in attrs:
- attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
- ' ',
- unescape(attrs[attr]))
- if (token["name"] in self.svg_allow_local_href and
- 'xlink:href' in attrs and re.search('^\s*[^#\s].*',
- attrs['xlink:href'])):
- del attrs['xlink:href']
- if 'style' in attrs:
- attrs['style'] = self.sanitize_css(attrs['style'])
- token["data"] = [[name, val] for name, val in list(attrs.items())]
- return token
-
- def disallowed_token(self, token, token_type):
- if token_type == tokenTypes["EndTag"]:
- token["data"] = "</%s>" % token["name"]
- elif token["data"]:
- attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
- token["data"] = "<%s%s>" % (token["name"], attrs)
- else:
- token["data"] = "<%s>" % token["name"]
- if token.get("selfClosing"):
- token["data"] = token["data"][:-1] + "/>"
-
- if token["type"] in list(tokenTypes.keys()):
- token["type"] = "Characters"
- else:
- token["type"] = tokenTypes["Characters"]
-
- del token["name"]
- return token
-
- def sanitize_css(self, style):
- # disallow urls
- style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
-
- # gauntlet
- if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
- return ''
- if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
- return ''
-
- clean = []
- for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
- if not value:
- continue
- if prop.lower() in self.allowed_css_properties:
- clean.append(prop + ': ' + value + ';')
- elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
- 'padding']:
- for keyword in value.split():
- if keyword not in self.acceptable_css_keywords and \
- not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
- break
- else:
- clean.append(prop + ': ' + value + ';')
- elif prop.lower() in self.allowed_svg_properties:
- clean.append(prop + ': ' + value + ';')
-
- return ' '.join(clean)
-
-
-class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
- def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
- lowercaseElementName=False, lowercaseAttrName=False, parser=None):
- # Change case matching defaults as we only output lowercase html anyway
- # This solution doesn't seem ideal...
- HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
- lowercaseElementName, lowercaseAttrName, parser=parser)
-
- def __iter__(self):
- for token in HTMLTokenizer.__iter__(self):
- token = self.sanitize_token(token)
- if token:
- yield token
diff --git a/lib/html5lib/serializer/htmlserializer.py b/lib/html5lib/serializer.py
similarity index 68%
rename from lib/html5lib/serializer/htmlserializer.py
rename to lib/html5lib/serializer.py
index be4d6344..103dd206 100644
--- a/lib/html5lib/serializer/htmlserializer.py
+++ b/lib/html5lib/serializer.py
@@ -1,79 +1,87 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
-try:
- from functools import reduce
-except ImportError:
- pass
+import re
-from ..constants import voidElements, booleanAttributes, spaceCharacters
-from ..constants import rcdataElements, entities, xmlEntities
-from .. import utils
+from codecs import register_error, xmlcharrefreplace_errors
+
+from .constants import voidElements, booleanAttributes, spaceCharacters
+from .constants import rcdataElements, entities, xmlEntities
+from . import treewalkers, _utils
from xml.sax.saxutils import escape
-spaceCharacters = "".join(spaceCharacters)
+_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
+_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
+_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
+ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
+ "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
+ "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+ "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
+ "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+ "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
+ "\u3000]")
-try:
- from codecs import register_error, xmlcharrefreplace_errors
-except ImportError:
- unicode_encode_errors = "strict"
-else:
- unicode_encode_errors = "htmlentityreplace"
- encode_entity_map = {}
- is_ucs4 = len("\U0010FFFF") == 1
- for k, v in list(entities.items()):
- # skip multi-character entities
- if ((is_ucs4 and len(v) > 1) or
- (not is_ucs4 and len(v) > 2)):
- continue
- if v != "&":
- if len(v) == 2:
- v = utils.surrogatePairToCodepoint(v)
- else:
- v = ord(v)
- if v not in encode_entity_map or k.islower():
- # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
- encode_entity_map[v] = k
-
- def htmlentityreplace_errors(exc):
- if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
- res = []
- codepoints = []
- skip = False
- for i, c in enumerate(exc.object[exc.start:exc.end]):
- if skip:
- skip = False
- continue
- index = i + exc.start
- if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
- codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
- skip = True
- else:
- codepoint = ord(c)
- codepoints.append(codepoint)
- for cp in codepoints:
- e = encode_entity_map.get(cp)
- if e:
- res.append("&")
- res.append(e)
- if not e.endswith(";"):
- res.append(";")
- else:
- res.append("&#x%s;" % (hex(cp)[2:]))
- return ("".join(res), exc.end)
+_encode_entity_map = {}
+_is_ucs4 = len("\U0010FFFF") == 1
+for k, v in list(entities.items()):
+ # skip multi-character entities
+ if ((_is_ucs4 and len(v) > 1) or
+ (not _is_ucs4 and len(v) > 2)):
+ continue
+ if v != "&":
+ if len(v) == 2:
+ v = _utils.surrogatePairToCodepoint(v)
else:
- return xmlcharrefreplace_errors(exc)
+ v = ord(v)
+ if v not in _encode_entity_map or k.islower():
+ # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
+ _encode_entity_map[v] = k
- register_error(unicode_encode_errors, htmlentityreplace_errors)
- del register_error
+def htmlentityreplace_errors(exc):
+ if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
+ res = []
+ codepoints = []
+ skip = False
+ for i, c in enumerate(exc.object[exc.start:exc.end]):
+ if skip:
+ skip = False
+ continue
+ index = i + exc.start
+ if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
+ codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
+ skip = True
+ else:
+ codepoint = ord(c)
+ codepoints.append(codepoint)
+ for cp in codepoints:
+ e = _encode_entity_map.get(cp)
+ if e:
+ res.append("&")
+ res.append(e)
+ if not e.endswith(";"):
+ res.append(";")
+ else:
+ res.append("&#x%s;" % (hex(cp)[2:]))
+ return ("".join(res), exc.end)
+ else:
+ return xmlcharrefreplace_errors(exc)
+
+register_error("htmlentityreplace", htmlentityreplace_errors)
+
+
+def serialize(input, tree="etree", encoding=None, **serializer_opts):
+ # XXX: Should we cache this?
+ walker = treewalkers.getTreeWalker(tree)
+ s = HTMLSerializer(**serializer_opts)
+ return s.render(walker(input), encoding)
class HTMLSerializer(object):
# attribute quoting options
- quote_attr_values = False
+ quote_attr_values = "legacy" # be secure by default
quote_char = '"'
use_best_quote_char = True
@@ -109,9 +117,9 @@ class HTMLSerializer(object):
inject_meta_charset=True|False
Whether it insert a meta element to define the character set of the
document.
- quote_attr_values=True|False
+ quote_attr_values="legacy"|"spec"|"always"
Whether to quote attribute values that don't require quoting
- per HTML5 parsing rules.
+ per legacy browser behaviour, when required by the standard, or always.
quote_char=u'"'|u"'"
Use given quote character for attribute quoting. Default is to
use double quote unless attribute value contains a double quote,
@@ -147,6 +155,9 @@ class HTMLSerializer(object):
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
"""
+ unexpected_args = frozenset(kwargs) - frozenset(self.options)
+ if len(unexpected_args) > 0:
+ raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
if 'quote_char' in kwargs:
self.use_best_quote_char = False
for attr in self.options:
@@ -157,7 +168,7 @@ class HTMLSerializer(object):
def encode(self, string):
assert(isinstance(string, text_type))
if self.encoding:
- return string.encode(self.encoding, unicode_encode_errors)
+ return string.encode(self.encoding, "htmlentityreplace")
else:
return string
@@ -169,28 +180,30 @@ class HTMLSerializer(object):
return string
def serialize(self, treewalker, encoding=None):
+ # pylint:disable=too-many-nested-blocks
self.encoding = encoding
in_cdata = False
self.errors = []
if encoding and self.inject_meta_charset:
- from ..filters.inject_meta_charset import Filter
+ from .filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
+ # Alphabetical attributes is here under the assumption that none of
+ # the later filters add or change order of attributes; it needs to be
+ # before the sanitizer so escaped elements come out correctly
+ if self.alphabetical_attributes:
+ from .filters.alphabeticalattributes import Filter
+ treewalker = Filter(treewalker)
# WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiently of this latter filter
if self.strip_whitespace:
- from ..filters.whitespace import Filter
+ from .filters.whitespace import Filter
treewalker = Filter(treewalker)
if self.sanitize:
- from ..filters.sanitizer import Filter
+ from .filters.sanitizer import Filter
treewalker = Filter(treewalker)
if self.omit_optional_tags:
- from ..filters.optionaltags import Filter
- treewalker = Filter(treewalker)
- # Alphabetical attributes must be last, as other filters
- # could add attributes and alter the order
- if self.alphabetical_attributes:
- from ..filters.alphabeticalattributes import Filter
+ from .filters.optionaltags import Filter
treewalker = Filter(treewalker)
for token in treewalker:
@@ -229,7 +242,7 @@ class HTMLSerializer(object):
in_cdata = True
elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element")
- for (attr_namespace, attr_name), attr_value in token["data"].items():
+ for (_, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here
k = attr_name
v = attr_value
@@ -237,14 +250,18 @@ class HTMLSerializer(object):
yield self.encodeStrict(k)
if not self.minimize_boolean_attributes or \
- (k not in booleanAttributes.get(name, tuple())
- and k not in booleanAttributes.get("", tuple())):
+ (k not in booleanAttributes.get(name, tuple()) and
+ k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict("=")
- if self.quote_attr_values or not v:
+ if self.quote_attr_values == "always" or len(v) == 0:
quote_attr = True
+ elif self.quote_attr_values == "spec":
+ quote_attr = _quoteAttributeSpec.search(v) is not None
+ elif self.quote_attr_values == "legacy":
+ quote_attr = _quoteAttributeLegacy.search(v) is not None
else:
- quote_attr = reduce(lambda x, y: x or (y in v),
- spaceCharacters + ">\"'=", False)
+ raise ValueError("quote_attr_values must be one of: "
+ "'always', 'spec', or 'legacy'")
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs:
v = v.replace("<", "&lt;")
@@ -312,6 +329,6 @@ class HTMLSerializer(object):
raise SerializeError
-def SerializeError(Exception):
+class SerializeError(Exception):
"""Error in serialized tree"""
pass
diff --git a/lib/html5lib/serializer/__init__.py b/lib/html5lib/serializer/__init__.py
deleted file mode 100644
index 8380839a..00000000
--- a/lib/html5lib/serializer/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from .. import treewalkers
-
-from .htmlserializer import HTMLSerializer
-
-
-def serialize(input, tree="etree", format="html", encoding=None,
- **serializer_opts):
- # XXX: Should we cache this?
- walker = treewalkers.getTreeWalker(tree)
- if format == "html":
- s = HTMLSerializer(**serializer_opts)
- else:
- raise ValueError("type must be html")
- return s.render(walker(input), encoding)
diff --git a/lib/html5lib/treeadapters/__init__.py b/lib/html5lib/treeadapters/__init__.py
index 57d71304..4f978466 100644
--- a/lib/html5lib/treeadapters/__init__.py
+++ b/lib/html5lib/treeadapters/__init__.py
@@ -5,7 +5,7 @@ from . import sax
__all__ = ["sax"]
try:
- from . import genshi # flake8: noqa
+ from . import genshi # noqa
except ImportError:
pass
else:
diff --git a/lib/html5lib/treebuilders/__init__.py b/lib/html5lib/treebuilders/__init__.py
index 6a6b2a4c..e2328847 100644
--- a/lib/html5lib/treebuilders/__init__.py
+++ b/lib/html5lib/treebuilders/__init__.py
@@ -28,7 +28,7 @@ to the format used in the unittests
from __future__ import absolute_import, division, unicode_literals
-from ..utils import default_etree
+from .._utils import default_etree
treeBuilderCache = {}
diff --git a/lib/html5lib/treebuilders/_base.py b/lib/html5lib/treebuilders/base.py
similarity index 97%
rename from lib/html5lib/treebuilders/_base.py
rename to lib/html5lib/treebuilders/base.py
index 8b97cc11..a4b2792a 100644
--- a/lib/html5lib/treebuilders/_base.py
+++ b/lib/html5lib/treebuilders/base.py
@@ -126,6 +126,7 @@ class TreeBuilder(object):
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
"""
+ # pylint:disable=not-callable
# Document class
documentClass = None
@@ -166,12 +167,17 @@ class TreeBuilder(object):
# If we pass a node in we match that. if we pass a string
# match any node with that name
exactNode = hasattr(target, "nameTuple")
+ if not exactNode:
+ if isinstance(target, text_type):
+ target = (namespaces["html"], target)
+ assert isinstance(target, tuple)
listElements, invert = listElementsMap[variant]
for node in reversed(self.openElements):
- if (node.name == target and not exactNode or
- node == target and exactNode):
+ if exactNode and node == target:
+ return True
+ elif not exactNode and node.nameTuple == target:
return True
elif (invert ^ (node.nameTuple in listElements)):
return False
@@ -353,8 +359,8 @@ class TreeBuilder(object):
def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name
# XXX td, th and tr are not actually needed
- if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
- and name != exclude):
+ if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
+ name != exclude):
self.openElements.pop()
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
diff --git a/lib/html5lib/treebuilders/dom.py b/lib/html5lib/treebuilders/dom.py
index 234233b7..dcfac220 100644
--- a/lib/html5lib/treebuilders/dom.py
+++ b/lib/html5lib/treebuilders/dom.py
@@ -1,54 +1,62 @@
from __future__ import absolute_import, division, unicode_literals
+from collections import MutableMapping
from xml.dom import minidom, Node
import weakref
-from . import _base
+from . import base
from .. import constants
from ..constants import namespaces
-from ..utils import moduleFactoryFactory
+from .._utils import moduleFactoryFactory
def getDomBuilder(DomImplementation):
Dom = DomImplementation
- class AttrList(object):
+ class AttrList(MutableMapping):
def __init__(self, element):
self.element = element
def __iter__(self):
- return list(self.element.attributes.items()).__iter__()
+ return iter(self.element.attributes.keys())
def __setitem__(self, name, value):
- self.element.setAttribute(name, value)
-
- def __len__(self):
- return len(list(self.element.attributes.items()))
-
- def items(self):
- return [(item[0], item[1]) for item in
- list(self.element.attributes.items())]
-
- def keys(self):
- return list(self.element.attributes.keys())
-
- def __getitem__(self, name):
- return self.element.getAttribute(name)
-
- def __contains__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
- return self.element.hasAttribute(name)
+ attr = self.element.ownerDocument.createAttribute(name)
+ attr.value = value
+ self.element.attributes[name] = attr
- class NodeBuilder(_base.Node):
+ def __len__(self):
+ return len(self.element.attributes)
+
+ def items(self):
+ return list(self.element.attributes.items())
+
+ def values(self):
+ return list(self.element.attributes.values())
+
+ def __getitem__(self, name):
+ if isinstance(name, tuple):
+ raise NotImplementedError
+ else:
+ return self.element.attributes[name].value
+
+ def __delitem__(self, name):
+ if isinstance(name, tuple):
+ raise NotImplementedError
+ else:
+ del self.element.attributes[name]
+
+ class NodeBuilder(base.Node):
def __init__(self, element):
- _base.Node.__init__(self, element.nodeName)
+ base.Node.__init__(self, element.nodeName)
self.element = element
- namespace = property(lambda self: hasattr(self.element, "namespaceURI")
- and self.element.namespaceURI or None)
+ namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
+ self.element.namespaceURI or None)
def appendChild(self, node):
node.parent = self
@@ -109,7 +117,7 @@ def getDomBuilder(DomImplementation):
nameTuple = property(getNameTuple)
- class TreeBuilder(_base.TreeBuilder):
+ class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
return weakref.proxy(self)
@@ -149,15 +157,16 @@ def getDomBuilder(DomImplementation):
return self.dom
def getFragment(self):
- return _base.TreeBuilder.getFragment(self).element
+ return base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data = data
if parent != self:
- _base.TreeBuilder.insertText(self, data, parent)
+ base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
+ # pylint:disable=protected-access
if Node.TEXT_NODE not in self.dom._child_node_types:
self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
diff --git a/lib/html5lib/treebuilders/etree.py b/lib/html5lib/treebuilders/etree.py
index 2c8ed19f..cb1d4aef 100644
--- a/lib/html5lib/treebuilders/etree.py
+++ b/lib/html5lib/treebuilders/etree.py
@@ -1,13 +1,15 @@
from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
+
from six import text_type
import re
-from . import _base
-from .. import ihatexml
+from . import base
+from .. import _ihatexml
from .. import constants
from ..constants import namespaces
-from ..utils import moduleFactoryFactory
+from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
@@ -16,7 +18,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
- class Element(_base.Node):
+ class Element(base.Node):
def __init__(self, name, namespace=None):
self._name = name
self._namespace = namespace
@@ -98,6 +100,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
node.parent = self
def removeChild(self, node):
+ self._childNodes.remove(node)
self._element.remove(node._element)
node.parent = None
@@ -139,7 +142,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
- _base.Node.reparentChildren(self, newParent)
+ base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
@@ -253,10 +256,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return "\n".join(rv)
- def tostring(element):
+ def tostring(element): # pylint:disable=unused-variable
"""Serialize an element and its child nodes to a string"""
rv = []
- filter = ihatexml.InfosetFilter()
+ filter = _ihatexml.InfosetFilter()
def serializeElement(element):
if isinstance(element, ElementTree.ElementTree):
@@ -307,7 +310,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return "".join(rv)
- class TreeBuilder(_base.TreeBuilder):
+ class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
@@ -329,7 +332,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return self.document._element.find("html")
def getFragment(self):
- return _base.TreeBuilder.getFragment(self)._element
+ return base.TreeBuilder.getFragment(self)._element
return locals()
diff --git a/lib/html5lib/treebuilders/etree_lxml.py b/lib/html5lib/treebuilders/etree_lxml.py
index 138b30bd..908820c0 100644
--- a/lib/html5lib/treebuilders/etree_lxml.py
+++ b/lib/html5lib/treebuilders/etree_lxml.py
@@ -10,16 +10,17 @@ When any of these things occur, we emit a DataLossWarning
"""
from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
import warnings
import re
import sys
-from . import _base
+from . import base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
-from .. import ihatexml
+from .. import _ihatexml
import lxml.etree as etree
@@ -53,8 +54,7 @@ class Document(object):
def testSerializer(element):
rv = []
- finalText = None
- infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
+ infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
@@ -128,16 +128,12 @@ def testSerializer(element):
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0)
- if finalText is not None:
- rv.append("|%s\"%s\"" % (' ' * 2, finalText))
-
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
- finalText = None
def serializeElement(element):
if not hasattr(element, "tag"):
@@ -173,13 +169,10 @@ def tostring(element):
serializeElement(element)
- if finalText is not None:
- rv.append("%s\"" % (' ' * 2, finalText))
-
return "".join(rv)
-class TreeBuilder(_base.TreeBuilder):
+class TreeBuilder(base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
@@ -189,13 +182,15 @@ class TreeBuilder(_base.TreeBuilder):
def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
- infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
+ infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
- def __init__(self, element, value={}):
+ def __init__(self, element, value=None):
+ if value is None:
+ value = {}
self._element = element
- dict.__init__(self, value)
+ dict.__init__(self, value) # pylint:disable=non-parent-init-called
for key, value in self.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
@@ -259,10 +254,10 @@ class TreeBuilder(_base.TreeBuilder):
self.elementClass = Element
self.commentClass = Comment
# self.fragmentClass = builder.DocumentFragment
- _base.TreeBuilder.__init__(self, namespaceHTMLElements)
+ base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
- _base.TreeBuilder.reset(self)
+ base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
self.initial_comments = []
self.doctype = None
@@ -303,12 +298,14 @@ class TreeBuilder(_base.TreeBuilder):
self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
+ assert parent is None or parent is self.document
+ assert self.document._elementTree is None
self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None):
if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type):
- warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
+ warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token):
diff --git a/lib/html5lib/treewalkers/__init__.py b/lib/html5lib/treewalkers/__init__.py
index 21f46b01..9e19a559 100644
--- a/lib/html5lib/treewalkers/__init__.py
+++ b/lib/html5lib/treewalkers/__init__.py
@@ -10,10 +10,10 @@ returning an iterator generating tokens.
from __future__ import absolute_import, division, unicode_literals
-__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"]
-
from .. import constants
-from ..utils import default_etree
+from .._utils import default_etree
+
+__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"]
treeWalkerCache = {}
@@ -43,11 +43,11 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
from . import dom
treeWalkerCache[treeType] = dom.TreeWalker
elif treeType == "genshi":
- from . import genshistream
- treeWalkerCache[treeType] = genshistream.TreeWalker
+ from . import genshi
+ treeWalkerCache[treeType] = genshi.TreeWalker
elif treeType == "lxml":
- from . import lxmletree
- treeWalkerCache[treeType] = lxmletree.TreeWalker
+ from . import etree_lxml
+ treeWalkerCache[treeType] = etree_lxml.TreeWalker
elif treeType == "etree":
from . import etree
if implementation is None:
diff --git a/lib/html5lib/treewalkers/_base.py b/lib/html5lib/treewalkers/base.py
similarity index 59%
rename from lib/html5lib/treewalkers/_base.py
rename to lib/html5lib/treewalkers/base.py
index 4e11cd02..36e1ba24 100644
--- a/lib/html5lib/treewalkers/_base.py
+++ b/lib/html5lib/treewalkers/base.py
@@ -1,11 +1,11 @@
from __future__ import absolute_import, division, unicode_literals
-from six import text_type, string_types
+
+from xml.dom import Node
+from ..constants import namespaces, voidElements, spaceCharacters
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
"TreeWalker", "NonRecursiveTreeWalker"]
-from xml.dom import Node
-
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
@@ -14,28 +14,9 @@ COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
-from ..constants import voidElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
-def to_text(s, blank_if_none=True):
- """Wrapper around six.text_type to convert None to empty string"""
- if s is None:
- if blank_if_none:
- return ""
- else:
- return None
- elif isinstance(s, text_type):
- return s
- else:
- return text_type(s)
-
-
-def is_text_or_none(string):
- """Wrapper around isinstance(string_types) or is None"""
- return string is None or isinstance(string, string_types)
-
-
class TreeWalker(object):
def __init__(self, tree):
self.tree = tree
@@ -47,47 +28,25 @@ class TreeWalker(object):
return {"type": "SerializeError", "data": msg}
def emptyTag(self, namespace, name, attrs, hasChildren=False):
- assert namespace is None or isinstance(namespace, string_types), type(namespace)
- assert isinstance(name, string_types), type(name)
- assert all((namespace is None or isinstance(namespace, string_types)) and
- isinstance(name, string_types) and
- isinstance(value, string_types)
- for (namespace, name), value in attrs.items())
-
- yield {"type": "EmptyTag", "name": to_text(name, False),
- "namespace": to_text(namespace),
+ yield {"type": "EmptyTag", "name": name,
+ "namespace": namespace,
"data": attrs}
if hasChildren:
yield self.error("Void element has children")
def startTag(self, namespace, name, attrs):
- assert namespace is None or isinstance(namespace, string_types), type(namespace)
- assert isinstance(name, string_types), type(name)
- assert all((namespace is None or isinstance(namespace, string_types)) and
- isinstance(name, string_types) and
- isinstance(value, string_types)
- for (namespace, name), value in attrs.items())
-
return {"type": "StartTag",
- "name": text_type(name),
- "namespace": to_text(namespace),
- "data": dict(((to_text(namespace, False), to_text(name)),
- to_text(value, False))
- for (namespace, name), value in attrs.items())}
+ "name": name,
+ "namespace": namespace,
+ "data": attrs}
def endTag(self, namespace, name):
- assert namespace is None or isinstance(namespace, string_types), type(namespace)
- assert isinstance(name, string_types), type(namespace)
-
return {"type": "EndTag",
- "name": to_text(name, False),
- "namespace": to_text(namespace),
- "data": {}}
+ "name": name,
+ "namespace": namespace}
def text(self, data):
- assert isinstance(data, string_types), type(data)
-
- data = to_text(data)
+ data = data
middle = data.lstrip(spaceCharacters)
left = data[:len(data) - len(middle)]
if left:
@@ -101,25 +60,16 @@ class TreeWalker(object):
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
- assert isinstance(data, string_types), type(data)
-
- return {"type": "Comment", "data": text_type(data)}
-
- def doctype(self, name, publicId=None, systemId=None, correct=True):
- assert is_text_or_none(name), type(name)
- assert is_text_or_none(publicId), type(publicId)
- assert is_text_or_none(systemId), type(systemId)
+ return {"type": "Comment", "data": data}
+ def doctype(self, name, publicId=None, systemId=None):
return {"type": "Doctype",
- "name": to_text(name),
- "publicId": to_text(publicId),
- "systemId": to_text(systemId),
- "correct": to_text(correct)}
+ "name": name,
+ "publicId": publicId,
+ "systemId": systemId}
def entity(self, name):
- assert isinstance(name, string_types), type(name)
-
- return {"type": "Entity", "name": text_type(name)}
+ return {"type": "Entity", "name": name}
def unknown(self, nodeType):
return self.error("Unknown node type: " + nodeType)
@@ -154,7 +104,7 @@ class NonRecursiveTreeWalker(TreeWalker):
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
- if name in voidElements:
+ if (not namespace or namespace == namespaces["html"]) and name in voidElements:
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
@@ -187,7 +137,7 @@ class NonRecursiveTreeWalker(TreeWalker):
type, details = details[0], details[1:]
if type == ELEMENT:
namespace, name, attributes, hasChildren = details
- if name not in voidElements:
+ if (namespace and namespace != namespaces["html"]) or name not in voidElements:
yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None
diff --git a/lib/html5lib/treewalkers/dom.py b/lib/html5lib/treewalkers/dom.py
index ac4dcf31..b0c89b00 100644
--- a/lib/html5lib/treewalkers/dom.py
+++ b/lib/html5lib/treewalkers/dom.py
@@ -2,16 +2,16 @@ from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
-from . import _base
+from . import base
-class TreeWalker(_base.NonRecursiveTreeWalker):
+class TreeWalker(base.NonRecursiveTreeWalker):
def getNodeDetails(self, node):
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
- return _base.DOCTYPE, node.name, node.publicId, node.systemId
+ return base.DOCTYPE, node.name, node.publicId, node.systemId
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
- return _base.TEXT, node.nodeValue
+ return base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
attrs = {}
@@ -21,17 +21,17 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
attrs[(attr.namespaceURI, attr.localName)] = attr.value
else:
attrs[(None, attr.name)] = attr.value
- return (_base.ELEMENT, node.namespaceURI, node.nodeName,
+ return (base.ELEMENT, node.namespaceURI, node.nodeName,
attrs, node.hasChildNodes())
elif node.nodeType == Node.COMMENT_NODE:
- return _base.COMMENT, node.nodeValue
+ return base.COMMENT, node.nodeValue
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
- return (_base.DOCUMENT,)
+ return (base.DOCUMENT,)
else:
- return _base.UNKNOWN, node.nodeType
+ return base.UNKNOWN, node.nodeType
def getFirstChild(self, node):
return node.firstChild
diff --git a/lib/html5lib/treewalkers/etree.py b/lib/html5lib/treewalkers/etree.py
index 73c8e26a..8f30f078 100644
--- a/lib/html5lib/treewalkers/etree.py
+++ b/lib/html5lib/treewalkers/etree.py
@@ -12,8 +12,8 @@ import re
from six import string_types
-from . import _base
-from ..utils import moduleFactoryFactory
+from . import base
+from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
@@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
- class TreeWalker(_base.NonRecursiveTreeWalker):
+ class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable
"""Given the particular ElementTree representation, this implementation,
to avoid using recursion, returns "nodes" as tuples with the following
content:
@@ -38,9 +38,9 @@ def getETreeBuilder(ElementTreeImplementation):
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
- elt, key, parents, flag = node
+ elt, _, _, flag = node
if flag in ("text", "tail"):
- return _base.TEXT, getattr(elt, flag)
+ return base.TEXT, getattr(elt, flag)
else:
node = elt
@@ -48,14 +48,14 @@ def getETreeBuilder(ElementTreeImplementation):
node = node.getroot()
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
- return (_base.DOCUMENT,)
+ return (base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>":
- return (_base.DOCTYPE, node.text,
+ return (base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif node.tag == ElementTreeCommentType:
- return _base.COMMENT, node.text
+ return base.COMMENT, node.text
else:
assert isinstance(node.tag, string_types), type(node.tag)
@@ -73,7 +73,7 @@ def getETreeBuilder(ElementTreeImplementation):
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
- return (_base.ELEMENT, namespace, tag,
+ return (base.ELEMENT, namespace, tag,
attrs, len(node) or node.text)
def getFirstChild(self, node):
diff --git a/lib/html5lib/treewalkers/lxmletree.py b/lib/html5lib/treewalkers/etree_lxml.py
similarity index 77%
rename from lib/html5lib/treewalkers/lxmletree.py
rename to lib/html5lib/treewalkers/etree_lxml.py
index 90e116d3..fb236311 100644
--- a/lib/html5lib/treewalkers/lxmletree.py
+++ b/lib/html5lib/treewalkers/etree_lxml.py
@@ -4,9 +4,9 @@ from six import text_type
from lxml import etree
from ..treebuilders.etree import tag_regexp
-from . import _base
+from . import base
-from .. import ihatexml
+from .. import _ihatexml
def ensure_str(s):
@@ -15,20 +15,27 @@ def ensure_str(s):
elif isinstance(s, text_type):
return s
else:
- return s.decode("utf-8", "strict")
+ return s.decode("ascii", "strict")
class Root(object):
def __init__(self, et):
self.elementtree = et
self.children = []
- if et.docinfo.internalDTD:
- self.children.append(Doctype(self,
- ensure_str(et.docinfo.root_name),
- ensure_str(et.docinfo.public_id),
- ensure_str(et.docinfo.system_url)))
- root = et.getroot()
- node = root
+
+ try:
+ if et.docinfo.internalDTD:
+ self.children.append(Doctype(self,
+ ensure_str(et.docinfo.root_name),
+ ensure_str(et.docinfo.public_id),
+ ensure_str(et.docinfo.system_url)))
+ except AttributeError:
+ pass
+
+ try:
+ node = et.getroot()
+ except AttributeError:
+ node = et
while node.getprevious() is not None:
node = node.getprevious()
@@ -115,35 +122,38 @@ class FragmentWrapper(object):
return len(self.obj)
-class TreeWalker(_base.NonRecursiveTreeWalker):
+class TreeWalker(base.NonRecursiveTreeWalker):
def __init__(self, tree):
- if hasattr(tree, "getroot"):
- tree = Root(tree)
- elif isinstance(tree, list):
+ # pylint:disable=redefined-variable-type
+ if isinstance(tree, list):
+ self.fragmentChildren = set(tree)
tree = FragmentRoot(tree)
- _base.NonRecursiveTreeWalker.__init__(self, tree)
- self.filter = ihatexml.InfosetFilter()
+ else:
+ self.fragmentChildren = set()
+ tree = Root(tree)
+ base.NonRecursiveTreeWalker.__init__(self, tree)
+ self.filter = _ihatexml.InfosetFilter()
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
- return _base.TEXT, ensure_str(getattr(node, key))
+ return base.TEXT, ensure_str(getattr(node, key))
elif isinstance(node, Root):
- return (_base.DOCUMENT,)
+ return (base.DOCUMENT,)
elif isinstance(node, Doctype):
- return _base.DOCTYPE, node.name, node.public_id, node.system_id
+ return base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
- return _base.TEXT, node.obj
+ return base.TEXT, ensure_str(node.obj)
elif node.tag == etree.Comment:
- return _base.COMMENT, ensure_str(node.text)
+ return base.COMMENT, ensure_str(node.text)
elif node.tag == etree.Entity:
- return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
+ return base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
else:
# This is assumed to be an ordinary element
@@ -162,7 +172,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
- return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
+ return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node):
@@ -197,5 +207,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
if key == "text":
return node
# else: fallback to "normal" processing
+ elif node in self.fragmentChildren:
+ return None
return node.getparent()
diff --git a/lib/html5lib/treewalkers/genshistream.py b/lib/html5lib/treewalkers/genshi.py
similarity index 90%
rename from lib/html5lib/treewalkers/genshistream.py
rename to lib/html5lib/treewalkers/genshi.py
index f559c45d..7483be27 100644
--- a/lib/html5lib/treewalkers/genshistream.py
+++ b/lib/html5lib/treewalkers/genshi.py
@@ -4,12 +4,12 @@ from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
-from . import _base
+from . import base
from ..constants import voidElements, namespaces
-class TreeWalker(_base.TreeWalker):
+class TreeWalker(base.TreeWalker):
def __iter__(self):
# Buffer the events so we can pass in the following one
previous = None
@@ -25,7 +25,7 @@ class TreeWalker(_base.TreeWalker):
yield token
def tokens(self, event, next):
- kind, data, pos = event
+ kind, data, _ = event
if kind == START:
tag, attribs = data
name = tag.localname
@@ -39,8 +39,8 @@ class TreeWalker(_base.TreeWalker):
if namespace == namespaces["html"] and name in voidElements:
for token in self.emptyTag(namespace, name, converted_attribs,
- not next or next[0] != END
- or next[1] != tag):
+ not next or next[0] != END or
+ next[1] != tag):
yield token
else:
yield self.startTag(namespace, name, converted_attribs)
@@ -48,7 +48,7 @@ class TreeWalker(_base.TreeWalker):
elif kind == END:
name = data.localname
namespace = data.namespace
- if name not in voidElements:
+ if namespace != namespaces["html"] or name not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT: