Mirror of https://github.com/SickGear/SickGear.git (synced 2024-12-01 00:43:37 +00:00)

Added lxml to our libs

This commit is contained in: parent 9ac649444d, commit 32c029c3cf
190 changed files with 55421 additions and 0 deletions
223  lib/lxml/ElementInclude.py  Normal file
@@ -0,0 +1,223 @@
#
# ElementTree
# $Id: ElementInclude.py 1862 2004-06-18 07:31:02Z Fredrik $
#
# limited xinclude support for element trees
#
# history:
# 2003-08-15 fl created
# 2003-11-14 fl fixed default loader
#
# Copyright (c) 2003-2004 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2004 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

"""
Limited XInclude support for the ElementTree package.

While lxml.etree has full support for XInclude (see
`etree.ElementTree.xinclude()`), this module provides a simpler, pure
Python, ElementTree compatible implementation that supports a simple
form of custom URL resolvers.
"""

from lxml import etree
import copy
try:
    from urlparse import urljoin
    from urllib2 import urlopen
except ImportError:
    # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlopen

try:
    set
except NameError:
    # Python 2.3
    from sets import Set as set

XINCLUDE = "{http://www.w3.org/2001/XInclude}"

XINCLUDE_INCLUDE = XINCLUDE + "include"
XINCLUDE_FALLBACK = XINCLUDE + "fallback"

##
# Fatal include error.

class FatalIncludeError(etree.LxmlSyntaxError):
    pass

##
# ET compatible default loader.
# This loader reads an included resource from disk.
#
# @param href Resource reference.
# @param parse Parse mode. Either "xml" or "text".
# @param encoding Optional text encoding.
# @return The expanded resource. If the parse mode is "xml", this
#    is an ElementTree instance. If the parse mode is "text", this
#    is a Unicode string. If the loader fails, it can return None
#    or raise an IOError exception.
# @throws IOError If the loader fails to load the resource.

def default_loader(href, parse, encoding=None):
    file = open(href, 'rb')
    if parse == "xml":
        data = etree.parse(file).getroot()
    else:
        data = file.read()
        if not encoding:
            encoding = 'utf-8'
        data = data.decode(encoding)
    file.close()
    return data

##
# Default loader used by lxml.etree - handles custom resolvers properly
#

def _lxml_default_loader(href, parse, encoding=None, parser=None):
    if parse == "xml":
        data = etree.parse(href, parser).getroot()
    else:
        if "://" in href:
            f = urlopen(href)
        else:
            f = open(href, 'rb')
        data = f.read()
        f.close()
        if not encoding:
            encoding = 'utf-8'
        data = data.decode(encoding)
    return data

##
# Wrapper for ET compatibility - drops the parser

def _wrap_et_loader(loader):
    def load(href, parse, encoding=None, parser=None):
        return loader(href, parse, encoding)
    return load


##
# Expand XInclude directives.
#
# @param elem Root element.
# @param loader Optional resource loader. If omitted, it defaults
#     to {@link default_loader}. If given, it should be a callable
#     that implements the same interface as <b>default_loader</b>.
# @throws FatalIncludeError If the function fails to include a given
#     resource, or if the tree contains malformed XInclude elements.
# @throws IOError If the function fails to load a given resource.
# @returns the node or its replacement if it was an XInclude node

def include(elem, loader=None, base_url=None):
    if base_url is None:
        if hasattr(elem, 'getroot'):
            tree = elem
            elem = elem.getroot()
        else:
            tree = elem.getroottree()
        if hasattr(tree, 'docinfo'):
            base_url = tree.docinfo.URL
    elif hasattr(elem, 'getroot'):
        elem = elem.getroot()
    _include(elem, loader, base_url=base_url)

def _include(elem, loader=None, _parent_hrefs=None, base_url=None):
    if loader is not None:
        load_include = _wrap_et_loader(loader)
    else:
        load_include = _lxml_default_loader

    if _parent_hrefs is None:
        _parent_hrefs = set()

    parser = elem.getroottree().parser

    include_elements = list(
        elem.iter('{http://www.w3.org/2001/XInclude}*'))

    for e in include_elements:
        if e.tag == XINCLUDE_INCLUDE:
            # process xinclude directive
            href = urljoin(base_url, e.get("href"))
            parse = e.get("parse", "xml")
            parent = e.getparent()
            if parse == "xml":
                if href in _parent_hrefs:
                    raise FatalIncludeError(
                        "recursive include of %r detected" % href
                        )
                _parent_hrefs.add(href)
                node = load_include(href, parse, parser=parser)
                if node is None:
                    raise FatalIncludeError(
                        "cannot load %r as %r" % (href, parse)
                        )
                node = _include(node, loader, _parent_hrefs)
                if e.tail:
                    node.tail = (node.tail or "") + e.tail
                if parent is None:
                    return node # replaced the root node!
                parent.replace(e, node)
            elif parse == "text":
                text = load_include(href, parse, encoding=e.get("encoding"))
                if text is None:
                    raise FatalIncludeError(
                        "cannot load %r as %r" % (href, parse)
                        )
                predecessor = e.getprevious()
                if predecessor is not None:
                    predecessor.tail = (predecessor.tail or "") + text
                elif parent is None:
                    return text # replaced the root node!
                else:
                    parent.text = (parent.text or "") + text + (e.tail or "")
                parent.remove(e)
            else:
                raise FatalIncludeError(
                    "unknown parse type in xi:include tag (%r)" % parse
                    )
        elif e.tag == XINCLUDE_FALLBACK:
            parent = e.getparent()
            if parent is not None and parent.tag != XINCLUDE_INCLUDE:
                raise FatalIncludeError(
                    "xi:fallback tag must be child of xi:include (%r)" % e.tag
                    )
        else:
            raise FatalIncludeError(
                "Invalid element found in XInclude namespace (%r)" % e.tag
                )
    return elem
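
For orientation, a minimal usage sketch of this module (the input file name and the custom loader are illustrative assumptions, not part of the commit):

from lxml import etree
from lxml import ElementInclude

def restricted_loader(href, parse, encoding=None):
    # hypothetical resolver: only allow includes below 'includes/'
    if not href.startswith('includes/'):
        raise IOError("refusing to load %r" % href)
    return ElementInclude.default_loader(href, parse, encoding)

tree = etree.parse('document.xml')   # assumed input document
ElementInclude.include(tree.getroot(), loader=restricted_loader)
print(etree.tostring(tree))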
20  lib/lxml/__init__.py  Normal file
@@ -0,0 +1,20 @@
# this is a package

def get_include():
    """
    Returns a list of header include paths (for lxml itself, libxml2
    and libxslt) needed to compile C code against lxml if it was built
    with statically linked libraries.
    """
    import os
    lxml_path = __path__[0]
    include_path = os.path.join(lxml_path, 'includes')
    includes = [include_path, lxml_path]

    for name in os.listdir(include_path):
        path = os.path.join(include_path, name)
        if os.path.isdir(path):
            includes.append(path)

    return includes
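
A sketch of what ``get_include()`` is for: handing the bundled header paths to a compiler when building a C extension against a statically linked lxml. Module and source names here are placeholders:

from setuptools import setup, Extension
import lxml

ext = Extension(
    'myext',                          # hypothetical extension name
    sources=['myext.c'],              # hypothetical C source
    include_dirs=lxml.get_include(),  # lxml, libxml2 and libxslt headers
)
setup(name='myext', ext_modules=[ext])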
306  lib/lxml/_elementpath.py  Normal file
@@ -0,0 +1,306 @@
#
# ElementTree
# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
#
# limited xpath support for element trees
#
# history:
# 2003-05-23 fl created
# 2003-05-28 fl added support for // etc
# 2003-08-27 fl fixed parsing of periods in element names
# 2007-09-10 fl new selection engine
# 2007-09-12 fl fixed parent selector
# 2007-09-13 fl added iterfind; changed findall to return a list
# 2007-11-30 fl added namespaces support
# 2009-10-30 fl added child element value filter
#
# Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2009 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

##
# Implementation module for XPath support. There's usually no reason
# to import this module directly; the <b>ElementTree</b> does this for
# you, if needed.
##

import re

xpath_tokenizer_re = re.compile(
    "("
    "'[^']*'|\"[^\"]*\"|"
    "::|"
    "//?|"
    "\.\.|"
    "\(\)|"
    "[/.*:\[\]\(\)@=])|"
    "((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
    "\s+"
    )

def xpath_tokenizer(pattern, namespaces=None):
    for token in xpath_tokenizer_re.findall(pattern):
        tag = token[1]
        if tag and tag[0] != "{" and ":" in tag:
            try:
                prefix, uri = tag.split(":", 1)
                if not namespaces:
                    raise KeyError
                yield token[0], "{%s}%s" % (namespaces[prefix], uri)
            except KeyError:
                raise SyntaxError("prefix %r not found in prefix map" % prefix)
        else:
            yield token


def prepare_child(next, token):
    tag = token[1]
    def select(result):
        for elem in result:
            for e in elem.iterchildren(tag):
                yield e
    return select

def prepare_star(next, token):
    def select(result):
        for elem in result:
            for e in elem.iterchildren('*'):
                yield e
    return select

def prepare_self(next, token):
    def select(result):
        return result
    return select

def prepare_descendant(next, token):
    token = next()
    if token[0] == "*":
        tag = "*"
    elif not token[0]:
        tag = token[1]
    else:
        raise SyntaxError("invalid descendant")
    def select(result):
        for elem in result:
            for e in elem.iterdescendants(tag):
                yield e
    return select

def prepare_parent(next, token):
    def select(result):
        for elem in result:
            parent = elem.getparent()
            if parent is not None:
                yield parent
    return select

def prepare_predicate(next, token):
    # FIXME: replace with real parser!!! refs:
    # http://effbot.org/zone/simple-iterator-parser.htm
    # http://javascript.crockford.com/tdop/tdop.html
    signature = []
    predicate = []
    while 1:
        token = next()
        if token[0] == "]":
            break
        if token[0] and token[0][:1] in "'\"":
            token = "'", token[0][1:-1]
        signature.append(token[0] or "-")
        predicate.append(token[1])
    signature = "".join(signature)
    # use signature to determine predicate type
    if signature == "@-":
        # [@attribute] predicate
        key = predicate[1]
        def select(result):
            for elem in result:
                if elem.get(key) is not None:
                    yield elem
        return select
    if signature == "@-='":
        # [@attribute='value']
        key = predicate[1]
        value = predicate[-1]
        def select(result):
            for elem in result:
                if elem.get(key) == value:
                    yield elem
        return select
    if signature == "-" and not re.match("-?\d+$", predicate[0]):
        # [tag]
        tag = predicate[0]
        def select(result):
            for elem in result:
                for _ in elem.iterchildren(tag):
                    yield elem
                    break
        return select
    if signature == "-='" and not re.match("-?\d+$", predicate[0]):
        # [tag='value']
        tag = predicate[0]
        value = predicate[-1]
        def select(result):
            for elem in result:
                for e in elem.iterchildren(tag):
                    if "".join(e.itertext()) == value:
                        yield elem
                        break
        return select
    if signature == "-" or signature == "-()" or signature == "-()-":
        # [index] or [last()] or [last()-index]
        if signature == "-":
            # [index]
            index = int(predicate[0]) - 1
            if index < 0:
                if index == -1:
                    raise SyntaxError(
                        "indices in path predicates are 1-based, not 0-based")
                else:
                    raise SyntaxError("path index >= 1 expected")
        else:
            if predicate[0] != "last":
                raise SyntaxError("unsupported function")
            if signature == "-()-":
                try:
                    index = int(predicate[2]) - 1
                except ValueError:
                    raise SyntaxError("unsupported expression")
            else:
                index = -1
        def select(result):
            for elem in result:
                parent = elem.getparent()
                if parent is None:
                    continue
                try:
                    # FIXME: what if the selector is "*" ?
                    elems = list(parent.iterchildren(elem.tag))
                    if elems[index] is elem:
                        yield elem
                except IndexError:
                    pass
        return select
    raise SyntaxError("invalid predicate")

ops = {
    "": prepare_child,
    "*": prepare_star,
    ".": prepare_self,
    "..": prepare_parent,
    "//": prepare_descendant,
    "[": prepare_predicate,
    }

_cache = {}

# --------------------------------------------------------------------

def _build_path_iterator(path, namespaces):
    # compile selector pattern
    if path[-1:] == "/":
        path = path + "*" # implicit all (FIXME: keep this?)
    try:
        return _cache[(path, namespaces and tuple(sorted(namespaces.items())) or None)]
    except KeyError:
        pass
    if len(_cache) > 100:
        _cache.clear()

    if path[:1] == "/":
        raise SyntaxError("cannot use absolute path on element")
    stream = iter(xpath_tokenizer(path, namespaces))
    try:
        _next = stream.next
    except AttributeError:
        # Python 3
        _next = stream.__next__
    try:
        token = _next()
    except StopIteration:
        raise SyntaxError("empty path expression")
    selector = []
    while 1:
        try:
            selector.append(ops[token[0]](_next, token))
        except StopIteration:
            raise SyntaxError("invalid path")
        try:
            token = _next()
            if token[0] == "/":
                token = _next()
        except StopIteration:
            break
    _cache[path] = selector
    return selector

##
# Iterate over the matching nodes

def iterfind(elem, path, namespaces=None):
    selector = _build_path_iterator(path, namespaces)
    result = iter((elem,))
    for select in selector:
        result = select(result)
    return result

##
# Find first matching object.

def find(elem, path, namespaces=None):
    it = iterfind(elem, path, namespaces)
    try:
        try:
            _next = it.next
        except AttributeError:
            return next(it)
        else:
            return _next()
    except StopIteration:
        return None

##
# Find all matching objects.

def findall(elem, path, namespaces=None):
    return list(iterfind(elem, path, namespaces))

##
# Find text for first matching object.

def findtext(elem, path, default=None, namespaces=None):
    el = find(elem, path, namespaces)
    if el is None:
        return default
    else:
        return el.text or ''
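
This module is normally reached through ``Element.find()``, ``findall()`` and friends rather than imported directly. A small sketch of the path syntax it implements:

from lxml import etree

root = etree.XML(
    '<root><a><b n="1"/><b n="2"/></a><c xmlns="urn:x">text</c></root>')

print(root.findall('a/b'))                 # child path: both <b> elements
print(root.find('.//b[@n="2"]').get('n'))  # attribute predicate -> '2'
print(root.findall('a/b[1]'))              # 1-based index predicate
print(root.findtext('{urn:x}c'))           # Clark-notation namespace -> 'text'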
1645  lib/lxml/apihelpers.pxi  Normal file
File diff suppressed because it is too large
238  lib/lxml/builder.py  Normal file
@@ -0,0 +1,238 @@
#
# Element generator factory by Fredrik Lundh.
#
# Source:
#    http://online.effbot.org/2006_11_01_archive.htm#et-builder
#    http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2004 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

"""
The ``E`` Element factory for generating XML documents.
"""

import lxml.etree as ET

try:
    from functools import partial
except ImportError:
    # fake it for pre-2.5 releases
    def partial(func, tag):
        return lambda *args, **kwargs: func(tag, *args, **kwargs)

try:
    callable
except NameError:
    # Python 3
    def callable(f):
        return hasattr(f, '__call__')

try:
    basestring
except NameError:
    basestring = str

try:
    unicode
except NameError:
    unicode = str


class ElementMaker(object):
    """Element generator factory.

    Unlike the ordinary Element factory, the E factory allows you to pass in
    more than just a tag and some optional attributes; you can also pass in
    text and other elements. The text is added as either text or tail
    attributes, and elements are inserted at the right spot. Some small
    examples::

        >>> from lxml import etree as ET
        >>> from lxml.builder import E

        >>> ET.tostring(E("tag"))
        '<tag/>'
        >>> ET.tostring(E("tag", "text"))
        '<tag>text</tag>'
        >>> ET.tostring(E("tag", "text", key="value"))
        '<tag key="value">text</tag>'
        >>> ET.tostring(E("tag", E("subtag", "text"), "tail"))
        '<tag><subtag>text</subtag>tail</tag>'

    For simple tags, the factory also allows you to write ``E.tag(...)`` instead
    of ``E('tag', ...)``::

        >>> ET.tostring(E.tag())
        '<tag/>'
        >>> ET.tostring(E.tag("text"))
        '<tag>text</tag>'
        >>> ET.tostring(E.tag(E.subtag("text"), "tail"))
        '<tag><subtag>text</subtag>tail</tag>'

    Here's a somewhat larger example; this shows how to generate HTML
    documents, using a mix of prepared factory functions for inline elements,
    nested ``E.tag`` calls, and embedded XHTML fragments::

        # some common inline elements
        A = E.a
        I = E.i
        B = E.b

        def CLASS(v):
            # helper function, 'class' is a reserved word
            return {'class': v}

        page = (
            E.html(
                E.head(
                    E.title("This is a sample document")
                ),
                E.body(
                    E.h1("Hello!", CLASS("title")),
                    E.p("This is a paragraph with ", B("bold"), " text in it!"),
                    E.p("This is another paragraph, with a ",
                        A("link", href="http://www.python.org"), "."),
                    E.p("Here are some reserved characters: <spam&egg>."),
                    ET.XML("<p>And finally, here is an embedded XHTML fragment.</p>"),
                )
            )
        )

        print ET.tostring(page)

    Here's a prettyprinted version of the output from the above script::

        <html>
          <head>
            <title>This is a sample document</title>
          </head>
          <body>
            <h1 class="title">Hello!</h1>
            <p>This is a paragraph with <b>bold</b> text in it!</p>
            <p>This is another paragraph, with a <a href="http://www.python.org">link</a>.</p>
            <p>Here are some reserved characters: &lt;spam&amp;egg&gt;.</p>
            <p>And finally, here is an embedded XHTML fragment.</p>
          </body>
        </html>

    For namespace support, you can pass a namespace map (``nsmap``)
    and/or a specific target ``namespace`` to the ElementMaker class::

        >>> E = ElementMaker(namespace="http://my.ns/")
        >>> print(ET.tostring( E.test ))
        <test xmlns="http://my.ns/"/>

        >>> E = ElementMaker(namespace="http://my.ns/", nsmap={'p':'http://my.ns/'})
        >>> print(ET.tostring( E.test ))
        <p:test xmlns:p="http://my.ns/"/>
    """

    def __init__(self, typemap=None,
                 namespace=None, nsmap=None, makeelement=None):
        if namespace is not None:
            self._namespace = '{' + namespace + '}'
        else:
            self._namespace = None

        if nsmap:
            self._nsmap = dict(nsmap)
        else:
            self._nsmap = None

        if makeelement is not None:
            assert callable(makeelement)
            self._makeelement = makeelement
        else:
            self._makeelement = ET.Element

        # initialize type map for this element factory

        if typemap:
            typemap = typemap.copy()
        else:
            typemap = {}

        def add_text(elem, item):
            try:
                elem[-1].tail = (elem[-1].tail or "") + item
            except IndexError:
                elem.text = (elem.text or "") + item
        if str not in typemap:
            typemap[str] = add_text
        if unicode not in typemap:
            typemap[unicode] = add_text

        def add_dict(elem, item):
            attrib = elem.attrib
            for k, v in item.items():
                if isinstance(v, basestring):
                    attrib[k] = v
                else:
                    attrib[k] = typemap[type(v)](None, v)
        if dict not in typemap:
            typemap[dict] = add_dict

        self._typemap = typemap

    def __call__(self, tag, *children, **attrib):
        get = self._typemap.get

        if self._namespace is not None and tag[0] != '{':
            tag = self._namespace + tag
        elem = self._makeelement(tag, nsmap=self._nsmap)
        if attrib:
            get(dict)(elem, attrib)

        for item in children:
            if callable(item):
                item = item()
            t = get(type(item))
            if t is None:
                if ET.iselement(item):
                    elem.append(item)
                    continue
                for basetype in type(item).__mro__:
                    # See if the typemap knows of any of this type's bases.
                    t = get(basetype)
                    if t is not None:
                        break
                else:
                    raise TypeError("bad argument type: %s(%r)" %
                                    (type(item).__name__, item))
            v = t(elem, item)
            if v:
                get(type(v))(elem, v)

        return elem

    def __getattr__(self, tag):
        return partial(self, tag)

# create factory object
E = ElementMaker()
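
The ``typemap`` argument is the one feature the docstring does not illustrate; a minimal sketch (serializing ``int`` children is an illustrative choice):

from lxml import etree
from lxml.builder import ElementMaker

def add_int(elem, item):
    # returning a string makes ElementMaker re-dispatch it to the str handler
    return str(item)

E = ElementMaker(typemap={int: add_int})
print(etree.tostring(E.price(42)))   # b'<price>42</price>'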
565  lib/lxml/classlookup.pxi  Normal file
@@ -0,0 +1,565 @@
# Configurable Element class lookup

################################################################################
# Custom Element classes

cdef public class ElementBase(_Element) [ type LxmlElementBaseType,
                                          object LxmlElementBase ]:
    u"""ElementBase(*children, attrib=None, nsmap=None, **_extra)

    The public Element class. All custom Element classes must inherit
    from this one. To create an Element, use the `Element()` factory.

    BIG FAT WARNING: Subclasses *must not* override __init__ or
    __new__ as it is absolutely undefined when these objects will be
    created or destroyed. All persistent state of Elements must be
    stored in the underlying XML. If you really need to initialize
    the object after creation, you can implement an ``_init(self)``
    method that will be called directly after object creation.

    Subclasses of this class can be instantiated to create a new
    Element. By default, the tag name will be the class name and the
    namespace will be empty. You can modify this with the following
    class attributes:

    * TAG - the tag name, possibly containing a namespace in Clark
      notation

    * NAMESPACE - the default namespace URI, unless provided as part
      of the TAG attribute.

    * HTML - flag if the class is an HTML tag, as opposed to an XML
      tag. This only applies to un-namespaced tags and defaults to
      false (i.e. XML).

    * PARSER - the parser that provides the configuration for the
      newly created document. Providing an HTML parser here will
      default to creating an HTML element.

    In user code, the latter three are commonly inherited in class
    hierarchies that implement a common namespace.
    """
    def __init__(self, *children, attrib=None, nsmap=None, **_extra):
        u"""ElementBase(*children, attrib=None, nsmap=None, **_extra)
        """
        cdef bint is_html = 0
        cdef _BaseParser parser
        cdef _Element last_child
        # don't use normal attribute access as it might be overridden
        _getattr = object.__getattribute__
        try:
            namespace = _utf8(_getattr(self, 'NAMESPACE'))
        except AttributeError:
            namespace = None
        try:
            ns, tag = _getNsTag(_getattr(self, 'TAG'))
            if ns is not None:
                namespace = ns
        except AttributeError:
            tag = _utf8(_getattr(_getattr(self, '__class__'), '__name__'))
            if b'.' in tag:
                tag = tag.split(b'.')[-1]
        try:
            parser = _getattr(self, 'PARSER')
        except AttributeError:
            parser = None
            for child in children:
                if isinstance(child, _Element):
                    parser = (<_Element>child)._doc._parser
                    break
        if isinstance(parser, HTMLParser):
            is_html = 1
        if namespace is None:
            try:
                is_html = _getattr(self, 'HTML')
            except AttributeError:
                pass
        _initNewElement(self, is_html, tag, namespace, parser,
                        attrib, nsmap, _extra)
        last_child = None
        for child in children:
            if _isString(child):
                if last_child is None:
                    _setNodeText(self._c_node,
                                 (_collectText(self._c_node.children) or '') + child)
                else:
                    _setTailText(last_child._c_node,
                                 (_collectText(last_child._c_node.next) or '') + child)
            elif isinstance(child, _Element):
                last_child = child
                _appendChild(self, last_child)
            elif isinstance(child, type) and issubclass(child, ElementBase):
                last_child = child()
                _appendChild(self, last_child)
            else:
                raise TypeError, "Invalid child type: %r" % type(child)


cdef class CommentBase(_Comment):
    u"""All custom Comment classes must inherit from this one.

    To create an XML Comment instance, use the ``Comment()`` factory.

    Subclasses *must not* override __init__ or __new__ as it is
    absolutely undefined when these objects will be created or
    destroyed. All persistent state of Comments must be stored in the
    underlying XML. If you really need to initialize the object after
    creation, you can implement an ``_init(self)`` method that will be
    called after object creation.
    """
    def __init__(self, text):
        # copied from Comment() factory
        cdef _Document doc
        cdef xmlDoc* c_doc
        if text is None:
            text = b''
        else:
            text = _utf8(text)
        c_doc = _newXMLDoc()
        doc = _documentFactory(c_doc, None)
        self._c_node = _createComment(c_doc, _xcstr(text))
        if self._c_node is NULL:
            raise MemoryError()
        tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
        _registerProxy(self, doc, self._c_node)
        self._init()


cdef class PIBase(_ProcessingInstruction):
    u"""All custom Processing Instruction classes must inherit from this one.

    To create an XML ProcessingInstruction instance, use the ``PI()``
    factory.

    Subclasses *must not* override __init__ or __new__ as it is
    absolutely undefined when these objects will be created or
    destroyed. All persistent state of PIs must be stored in the
    underlying XML. If you really need to initialize the object after
    creation, you can implement an ``_init(self)`` method that will be
    called after object creation.
    """
    def __init__(self, target, text=None):
        # copied from PI() factory
        cdef _Document doc
        cdef xmlDoc* c_doc
        target = _utf8(target)
        if text is None:
            text = b''
        else:
            text = _utf8(text)
        c_doc = _newXMLDoc()
        doc = _documentFactory(c_doc, None)
        self._c_node = _createPI(c_doc, _xcstr(target), _xcstr(text))
        if self._c_node is NULL:
            raise MemoryError()
        tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
        _registerProxy(self, doc, self._c_node)
        self._init()


cdef class EntityBase(_Entity):
    u"""All custom Entity classes must inherit from this one.

    To create an XML Entity instance, use the ``Entity()`` factory.

    Subclasses *must not* override __init__ or __new__ as it is
    absolutely undefined when these objects will be created or
    destroyed. All persistent state of Entities must be stored in the
    underlying XML. If you really need to initialize the object after
    creation, you can implement an ``_init(self)`` method that will be
    called after object creation.
    """
    def __init__(self, name):
        cdef _Document doc
        cdef xmlDoc* c_doc
        name_utf = _utf8(name)
        c_name = _xcstr(name_utf)
        if c_name[0] == c'#':
            if not _characterReferenceIsValid(c_name + 1):
                raise ValueError, u"Invalid character reference: '%s'" % name
        elif not _xmlNameIsValid(c_name):
            raise ValueError, u"Invalid entity reference: '%s'" % name
        c_doc = _newXMLDoc()
        doc = _documentFactory(c_doc, None)
        self._c_node = _createEntity(c_doc, c_name)
        if self._c_node is NULL:
            raise MemoryError()
        tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
        _registerProxy(self, doc, self._c_node)
        self._init()


cdef int _validateNodeClass(xmlNode* c_node, cls) except -1:
    if c_node.type == tree.XML_ELEMENT_NODE:
        expected = ElementBase
    elif c_node.type == tree.XML_COMMENT_NODE:
        expected = CommentBase
    elif c_node.type == tree.XML_ENTITY_REF_NODE:
        expected = EntityBase
    elif c_node.type == tree.XML_PI_NODE:
        expected = PIBase
    else:
        assert 0, u"Unknown node type: %s" % c_node.type

    if not (isinstance(cls, type) and issubclass(cls, expected)):
        raise TypeError(
            "result of class lookup must be subclass of %s, got %s"
            % (type(expected), type(cls)))
    return 0


################################################################################
# Element class lookup

ctypedef public object (*_element_class_lookup_function)(object, _Document, xmlNode*)

# class to store element class lookup functions
cdef public class ElementClassLookup [ type LxmlElementClassLookupType,
                                       object LxmlElementClassLookup ]:
    u"""ElementClassLookup(self)
    Superclass of Element class lookups.
    """
    cdef _element_class_lookup_function _lookup_function
    def __cinit__(self):
        self._lookup_function = NULL # use default lookup

cdef public class FallbackElementClassLookup(ElementClassLookup) \
         [ type LxmlFallbackElementClassLookupType,
           object LxmlFallbackElementClassLookup ]:
    u"""FallbackElementClassLookup(self, fallback=None)

    Superclass of Element class lookups with additional fallback.
    """
    cdef readonly ElementClassLookup fallback
    cdef _element_class_lookup_function _fallback_function
    def __cinit__(self):
        # fall back to default lookup
        self._fallback_function = _lookupDefaultElementClass

    def __init__(self, ElementClassLookup fallback=None):
        if fallback is not None:
            self._setFallback(fallback)
        else:
            self._fallback_function = _lookupDefaultElementClass

    cdef void _setFallback(self, ElementClassLookup lookup):
        u"""Sets the fallback scheme for this lookup method.
        """
        self.fallback = lookup
        self._fallback_function = lookup._lookup_function
        if self._fallback_function is NULL:
            self._fallback_function = _lookupDefaultElementClass

    def set_fallback(self, ElementClassLookup lookup not None):
        u"""set_fallback(self, lookup)

        Sets the fallback scheme for this lookup method.
        """
        self._setFallback(lookup)

cdef inline object _callLookupFallback(FallbackElementClassLookup lookup,
                                       _Document doc, xmlNode* c_node):
    return lookup._fallback_function(lookup.fallback, doc, c_node)


################################################################################
# default lookup scheme

cdef class ElementDefaultClassLookup(ElementClassLookup):
    u"""ElementDefaultClassLookup(self, element=None, comment=None, pi=None, entity=None)
    Element class lookup scheme that always returns the default Element
    class.

    The keyword arguments ``element``, ``comment``, ``pi`` and ``entity``
    accept the respective Element classes.
    """
    cdef readonly object element_class
    cdef readonly object comment_class
    cdef readonly object pi_class
    cdef readonly object entity_class
    def __cinit__(self):
        self._lookup_function = _lookupDefaultElementClass

    def __init__(self, element=None, comment=None, pi=None, entity=None):
        if element is None:
            self.element_class = _Element
        elif issubclass(element, ElementBase):
            self.element_class = element
        else:
            raise TypeError, u"element class must be subclass of ElementBase"

        if comment is None:
            self.comment_class = _Comment
        elif issubclass(comment, CommentBase):
            self.comment_class = comment
        else:
            raise TypeError, u"comment class must be subclass of CommentBase"

        if entity is None:
            self.entity_class = _Entity
        elif issubclass(entity, EntityBase):
            self.entity_class = entity
        else:
            raise TypeError, u"Entity class must be subclass of EntityBase"

        if pi is None:
            self.pi_class = None # special case, see below
        elif issubclass(pi, PIBase):
            self.pi_class = pi
        else:
            raise TypeError, u"PI class must be subclass of PIBase"

cdef object _lookupDefaultElementClass(state, _Document _doc, xmlNode* c_node):
    u"Trivial class lookup function that always returns the default class."
    if c_node.type == tree.XML_ELEMENT_NODE:
        if state is not None:
            return (<ElementDefaultClassLookup>state).element_class
        else:
            return _Element
    elif c_node.type == tree.XML_COMMENT_NODE:
        if state is not None:
            return (<ElementDefaultClassLookup>state).comment_class
        else:
            return _Comment
    elif c_node.type == tree.XML_ENTITY_REF_NODE:
        if state is not None:
            return (<ElementDefaultClassLookup>state).entity_class
        else:
            return _Entity
    elif c_node.type == tree.XML_PI_NODE:
        if state is None or (<ElementDefaultClassLookup>state).pi_class is None:
            # special case XSLT-PI
            if c_node.name is not NULL and c_node.content is not NULL:
                if tree.xmlStrcmp(c_node.name, <unsigned char*>"xml-stylesheet") == 0:
                    if tree.xmlStrstr(c_node.content, <unsigned char*>"text/xsl") is not NULL or \
                           tree.xmlStrstr(c_node.content, <unsigned char*>"text/xml") is not NULL:
                        return _XSLTProcessingInstruction
            return _ProcessingInstruction
        else:
            return (<ElementDefaultClassLookup>state).pi_class
    else:
        assert 0, u"Unknown node type: %s" % c_node.type


################################################################################
# attribute based lookup scheme

cdef class AttributeBasedElementClassLookup(FallbackElementClassLookup):
    u"""AttributeBasedElementClassLookup(self, attribute_name, class_mapping, fallback=None)
    Checks an attribute of an Element and looks up the value in a
    class dictionary.

    Arguments:
      - attribute name - '{ns}name' style string
      - class mapping - Python dict mapping attribute values to Element classes
      - fallback - optional fallback lookup mechanism

    A None key in the class mapping will be checked if the attribute is
    missing.
    """
    cdef object _class_mapping
    cdef tuple _pytag
    cdef const_xmlChar* _c_ns
    cdef const_xmlChar* _c_name
    def __cinit__(self):
        self._lookup_function = _attribute_class_lookup

    def __init__(self, attribute_name, class_mapping,
                 ElementClassLookup fallback=None):
        self._pytag = _getNsTag(attribute_name)
        ns, name = self._pytag
        if ns is None:
            self._c_ns = NULL
        else:
            self._c_ns = _xcstr(ns)
        self._c_name = _xcstr(name)
        self._class_mapping = dict(class_mapping)

        FallbackElementClassLookup.__init__(self, fallback)

cdef object _attribute_class_lookup(state, _Document doc, xmlNode* c_node):
    cdef AttributeBasedElementClassLookup lookup
    cdef python.PyObject* dict_result

    lookup = <AttributeBasedElementClassLookup>state
    if c_node.type == tree.XML_ELEMENT_NODE:
        value = _attributeValueFromNsName(
            c_node, lookup._c_ns, lookup._c_name)
        dict_result = python.PyDict_GetItem(lookup._class_mapping, value)
        if dict_result is not NULL:
            cls = <object>dict_result
            _validateNodeClass(c_node, cls)
            return cls
    return _callLookupFallback(lookup, doc, c_node)


################################################################################
# per-parser lookup scheme

cdef class ParserBasedElementClassLookup(FallbackElementClassLookup):
    u"""ParserBasedElementClassLookup(self, fallback=None)
    Element class lookup based on the XML parser.
    """
    def __cinit__(self):
        self._lookup_function = _parser_class_lookup

cdef object _parser_class_lookup(state, _Document doc, xmlNode* c_node):
    if doc._parser._class_lookup is not None:
        return doc._parser._class_lookup._lookup_function(
            doc._parser._class_lookup, doc, c_node)
    return _callLookupFallback(<FallbackElementClassLookup>state, doc, c_node)


################################################################################
# custom class lookup based on node type, namespace, name

cdef class CustomElementClassLookup(FallbackElementClassLookup):
    u"""CustomElementClassLookup(self, fallback=None)
    Element class lookup based on a subclass method.

    You can inherit from this class and override the method::

        lookup(self, type, doc, namespace, name)

    to lookup the element class for a node. Arguments of the method:
    * type: one of 'element', 'comment', 'PI', 'entity'
    * doc: document that the node is in
    * namespace: namespace URI of the node (or None for comments/PIs/entities)
    * name: name of the element/entity, None for comments, target for PIs

    If you return None from this method, the fallback will be called.
    """
    def __cinit__(self):
        self._lookup_function = _custom_class_lookup

    def lookup(self, type, doc, namespace, name):
        u"lookup(self, type, doc, namespace, name)"
        return None

cdef object _custom_class_lookup(state, _Document doc, xmlNode* c_node):
    cdef CustomElementClassLookup lookup

    lookup = <CustomElementClassLookup>state

    if c_node.type == tree.XML_ELEMENT_NODE:
        element_type = u"element"
    elif c_node.type == tree.XML_COMMENT_NODE:
        element_type = u"comment"
    elif c_node.type == tree.XML_PI_NODE:
        element_type = u"PI"
    elif c_node.type == tree.XML_ENTITY_REF_NODE:
        element_type = u"entity"
    else:
        element_type = u"element"
    if c_node.name is NULL:
        name = None
    else:
        name = funicode(c_node.name)
    c_str = tree._getNs(c_node)
    ns = funicode(c_str) if c_str is not NULL else None

    cls = lookup.lookup(element_type, doc, ns, name)
    if cls is not None:
        _validateNodeClass(c_node, cls)
        return cls
    return _callLookupFallback(lookup, doc, c_node)


################################################################################
# read-only tree based class lookup

cdef class PythonElementClassLookup(FallbackElementClassLookup):
    u"""PythonElementClassLookup(self, fallback=None)
    Element class lookup based on a subclass method.

    This class lookup scheme allows access to the entire XML tree in
    read-only mode. To use it, re-implement the ``lookup(self, doc,
    root)`` method in a subclass::

        from lxml import etree, pyclasslookup

        class MyElementClass(etree.ElementBase):
            honkey = True

        class MyLookup(pyclasslookup.PythonElementClassLookup):
            def lookup(self, doc, root):
                if root.tag == "sometag":
                    return MyElementClass
                else:
                    for child in root:
                        if child.tag == "someothertag":
                            return MyElementClass
                # delegate to default
                return None

    If you return None from this method, the fallback will be called.

    The first argument is the opaque document instance that contains
    the Element. The second argument is a lightweight Element proxy
    implementation that is only valid during the lookup. Do not try
    to keep a reference to it. Once the lookup is done, the proxy
    will be invalid.

    Also, you cannot wrap such a read-only Element in an ElementTree,
    and you must take care not to keep a reference to them outside of
    the `lookup()` method.

    Note that the API of the Element objects is not complete. It is
    purely read-only and does not support all features of the normal
    `lxml.etree` API (such as XPath, extended slicing or some
    iteration methods).

    See http://codespeak.net/lxml/element_classes.html
    """
    def __cinit__(self):
        self._lookup_function = _python_class_lookup

    def lookup(self, doc, element):
        u"""lookup(self, doc, element)

        Override this method to implement your own lookup scheme.
        """
        return None

cdef object _python_class_lookup(state, _Document doc, tree.xmlNode* c_node):
    cdef PythonElementClassLookup lookup
    cdef _ReadOnlyElementProxy proxy
    lookup = <PythonElementClassLookup>state

    proxy = _newReadOnlyProxy(None, c_node)
    cls = lookup.lookup(doc, proxy)
    _freeReadOnlyProxies(proxy)

    if cls is not None:
        _validateNodeClass(c_node, cls)
        return cls
    return _callLookupFallback(lookup, doc, c_node)

################################################################################
# Global setup

cdef _element_class_lookup_function LOOKUP_ELEMENT_CLASS
cdef object ELEMENT_CLASS_LOOKUP_STATE

cdef void _setElementClassLookupFunction(
    _element_class_lookup_function function, object state):
    global LOOKUP_ELEMENT_CLASS, ELEMENT_CLASS_LOOKUP_STATE
    if function is NULL:
        state = DEFAULT_ELEMENT_CLASS_LOOKUP
        function = DEFAULT_ELEMENT_CLASS_LOOKUP._lookup_function

    ELEMENT_CLASS_LOOKUP_STATE = state
    LOOKUP_ELEMENT_CLASS = function

def set_element_class_lookup(ElementClassLookup lookup = None):
    u"""set_element_class_lookup(lookup = None)

    Set the global default element class lookup method.
    """
    if lookup is None or lookup._lookup_function is NULL:
        _setElementClassLookupFunction(NULL, None)
    else:
        _setElementClassLookupFunction(lookup._lookup_function, lookup)

# default setup: parser delegation
cdef ParserBasedElementClassLookup DEFAULT_ELEMENT_CLASS_LOOKUP
DEFAULT_ELEMENT_CLASS_LOOKUP = ParserBasedElementClassLookup()

set_element_class_lookup(DEFAULT_ELEMENT_CLASS_LOOKUP)
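
A sketch of wiring one of these lookup schemes into a parser, here CustomElementClassLookup (the tag and class names are illustrative):

from lxml import etree

class HonkElement(etree.ElementBase):
    @property
    def honking(self):
        return self.get('honking') == 'true'

class MyLookup(etree.CustomElementClassLookup):
    def lookup(self, node_type, document, namespace, name):
        if node_type == 'element' and name == 'honk':
            return HonkElement
        return None   # fall through to the default lookup

parser = etree.XMLParser()
parser.set_element_class_lookup(MyLookup())
root = etree.XML('<root><honk honking="true"/></root>', parser)
print(root[0].honking)   # True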
210  lib/lxml/cleanup.pxi  Normal file
@@ -0,0 +1,210 @@
|
|||
# functions for tree cleanup and removing elements from subtrees
|
||||
|
||||
def cleanup_namespaces(tree_or_element):
|
||||
u"""cleanup_namespaces(tree_or_element)
|
||||
|
||||
Remove all namespace declarations from a subtree that are not used
|
||||
by any of the elements or attributes in that tree.
|
||||
"""
|
||||
cdef _Element element
|
||||
element = _rootNodeOrRaise(tree_or_element)
|
||||
_removeUnusedNamespaceDeclarations(element._c_node)
|
||||
|
||||
def strip_attributes(tree_or_element, *attribute_names):
|
||||
u"""strip_attributes(tree_or_element, *attribute_names)
|
||||
|
||||
Delete all attributes with the provided attribute names from an
|
||||
Element (or ElementTree) and its descendants.
|
||||
|
||||
Attribute names can contain wildcards as in `_Element.iter`.
|
||||
|
||||
Example usage::
|
||||
|
||||
strip_attributes(root_element,
|
||||
'simpleattr',
|
||||
'{http://some/ns}attrname',
|
||||
'{http://other/ns}*')
|
||||
"""
|
||||
cdef _MultiTagMatcher matcher
|
||||
cdef _Element element
|
||||
|
||||
element = _rootNodeOrRaise(tree_or_element)
|
||||
if not attribute_names:
|
||||
return
|
||||
|
||||
matcher = _MultiTagMatcher(attribute_names)
|
||||
matcher.cacheTags(element._doc)
|
||||
if matcher.rejectsAllAttributes():
|
||||
return
|
||||
_strip_attributes(element._c_node, matcher)
|
||||
|
||||
cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher):
|
||||
cdef xmlAttr* c_attr
|
||||
cdef xmlAttr* c_next_attr
|
||||
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
|
||||
if c_node.type == tree.XML_ELEMENT_NODE:
|
||||
c_attr = c_node.properties
|
||||
while c_attr is not NULL:
|
||||
c_next_attr = c_attr.next
|
||||
if matcher.matchesAttribute(c_attr):
|
||||
tree.xmlRemoveProp(c_attr)
|
||||
c_attr = c_next_attr
|
||||
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|
||||
|
||||
def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
|
||||
u"""strip_elements(tree_or_element, *tag_names, with_tail=True)
|
||||
|
||||
Delete all elements with the provided tag names from a tree or
|
||||
subtree. This will remove the elements and their entire subtree,
|
||||
including all their attributes, text content and descendants. It
|
||||
will also remove the tail text of the element unless you
|
||||
explicitly set the ``with_tail`` keyword argument option to False.
|
||||
|
||||
Tag names can contain wildcards as in `_Element.iter`.
|
||||
|
||||
Note that this will not delete the element (or ElementTree root
|
||||
element) that you passed even if it matches. It will only treat
|
||||
its descendants. If you want to include the root element, check
|
||||
its tag name directly before even calling this function.
|
||||
|
||||
Example usage::
|
||||
|
||||
strip_elements(some_element,
|
||||
'simpletagname', # non-namespaced tag
|
||||
'{http://some/ns}tagname', # namespaced tag
|
||||
'{http://some/other/ns}*' # any tag from a namespace
|
||||
lxml.etree.Comment # comments
|
||||
)
|
||||
"""
|
||||
cdef _MultiTagMatcher matcher
|
||||
cdef _Element element
|
||||
cdef _Document doc
|
||||
cdef list ns_tags
|
||||
cdef qname* c_ns_tags
|
||||
cdef Py_ssize_t c_tag_count
|
||||
cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0
|
||||
|
||||
doc = _documentOrRaise(tree_or_element)
|
||||
element = _rootNodeOrRaise(tree_or_element)
|
||||
if not tag_names:
|
||||
return
|
||||
|
||||
matcher = _MultiTagMatcher(tag_names)
|
||||
matcher.cacheTags(doc)
|
||||
if matcher.rejectsAll():
|
||||
return
|
||||
|
||||
if isinstance(tree_or_element, _ElementTree):
|
||||
# include PIs and comments next to the root node
|
||||
if matcher.matchesType(tree.XML_COMMENT_NODE):
|
||||
_removeSiblings(element._c_node, tree.XML_COMMENT_NODE, with_tail)
|
||||
if matcher.matchesType(tree.XML_PI_NODE):
|
||||
_removeSiblings(element._c_node, tree.XML_PI_NODE, with_tail)
|
||||
_strip_elements(doc, element._c_node, matcher, with_tail)
|
||||
|
||||
cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher,
|
||||
bint with_tail):
|
||||
cdef xmlNode* c_child
|
||||
cdef xmlNode* c_next
|
||||
|
||||
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
|
||||
if c_node.type == tree.XML_ELEMENT_NODE:
|
||||
        # we run through the children here to prevent any problems
        # with the tree iteration which would occur if we unlinked the
        # c_node itself
        c_child = _findChildForwards(c_node, 0)
        while c_child is not NULL:
            c_next = _nextElement(c_child)
            if matcher.matches(c_child):
                if c_child.type == tree.XML_ELEMENT_NODE:
                    if not with_tail:
                        tree.xmlUnlinkNode(c_child)
                    _removeNode(doc, c_child)
                else:
                    if with_tail:
                        _removeText(c_child.next)
                    tree.xmlUnlinkNode(c_child)
                    attemptDeallocation(c_child)
            c_child = c_next
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)


def strip_tags(tree_or_element, *tag_names):
    u"""strip_tags(tree_or_element, *tag_names)

    Delete all elements with the provided tag names from a tree or
    subtree.  This will remove the elements and their attributes, but
    *not* their text/tail content or descendants.  Instead, it will
    merge the text content and children of the element into its
    parent.

    Tag names can contain wildcards as in `_Element.iter`.

    Note that this will not delete the element (or ElementTree root
    element) that you passed even if it matches.  It will only treat
    its descendants.

    Example usage::

        strip_tags(some_element,
            'simpletagname',             # non-namespaced tag
            '{http://some/ns}tagname',   # namespaced tag
            '{http://some/other/ns}*',   # any tag from a namespace
            Comment                      # comments (including their text!)
            )
    """
    cdef _MultiTagMatcher matcher
    cdef _Element element
    cdef _Document doc
    cdef list ns_tags
    cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0
    cdef char** c_ns_tags
    cdef Py_ssize_t c_tag_count

    doc = _documentOrRaise(tree_or_element)
    element = _rootNodeOrRaise(tree_or_element)
    if not tag_names:
        return

    matcher = _MultiTagMatcher(tag_names)
    matcher.cacheTags(doc)
    if matcher.rejectsAll():
        return

    if isinstance(tree_or_element, _ElementTree):
        # include PIs and comments next to the root node
        if matcher.matchesType(tree.XML_COMMENT_NODE):
            _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, 0)
        if matcher.matchesType(tree.XML_PI_NODE):
            _removeSiblings(element._c_node, tree.XML_PI_NODE, 0)
    _strip_tags(doc, element._c_node, matcher)


cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher):
    cdef xmlNode* c_child
    cdef xmlNode* c_next
    cdef Py_ssize_t i

    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        # we run through the children here to prevent any problems
        # with the tree iteration which would occur if we unlinked the
        # c_node itself
        c_child = _findChildForwards(c_node, 0)
        while c_child is not NULL:
            if not matcher.matches(c_child):
                c_child = _nextElement(c_child)
                continue
            if c_child.type == tree.XML_ELEMENT_NODE:
                c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)
                _replaceNodeByChildren(doc, c_child)
                if not attemptDeallocation(c_child):
                    if c_child.nsDef is not NULL:
                        # make namespaces absolute
                        moveNodeToDocument(doc, doc._c_doc, c_child)
                c_child = c_next
            else:
                c_next = _nextElement(c_child)
                tree.xmlUnlinkNode(c_child)
                attemptDeallocation(c_child)
                c_child = c_next
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)

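The merge behaviour is easiest to see on a tiny document. A minimal usage sketch (illustrative; the expected output follows from the logic above):

from lxml import etree

root = etree.fromstring('<div>Hello <b>big <i>bad</i></b> world</div>')
etree.strip_tags(root, 'b')   # drop <b> itself, keep its text and children
print(etree.tostring(root))   # b'<div>Hello big <i>bad</i> world</div>'
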
103
lib/lxml/cssselect.py
Normal file
@ -0,0 +1,103 @@
"""CSS Selectors based on XPath.
|
||||
|
||||
This module supports selecting XML/HTML tags based on CSS selectors.
|
||||
See the `CSSSelector` class for details.
|
||||
|
||||
This is a thin wrapper around cssselect 0.7 or later.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from lxml import etree
|
||||
|
||||
## Work-around the lack of absolute import in Python 2.4
|
||||
#from __future__ import absolute_import
|
||||
#from cssselect import ...
|
||||
try:
|
||||
external_cssselect = __import__('cssselect')
|
||||
except ImportError:
|
||||
raise ImportError('cssselect seems not to be installed. '
|
||||
'See http://packages.python.org/cssselect/')
|
||||
|
||||
SelectorSyntaxError = external_cssselect.SelectorSyntaxError
|
||||
ExpressionError = external_cssselect.ExpressionError
|
||||
SelectorError = external_cssselect.SelectorError
|
||||
|
||||
|
||||
__all__ = ['SelectorSyntaxError', 'ExpressionError', 'SelectorError',
|
||||
'CSSSelector']
|
||||
|
||||
|
||||
class LxmlTranslator(external_cssselect.GenericTranslator):
|
||||
"""
|
||||
A custom CSS selector to XPath translator with lxml-specific extensions.
|
||||
"""
|
||||
def xpath_contains_function(self, xpath, function):
|
||||
# Defined there, removed in later drafts:
|
||||
# http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
|
||||
if function.argument_types() not in (['STRING'], ['IDENT']):
|
||||
raise ExpressionError(
|
||||
"Expected a single string or ident for :contains(), got %r"
|
||||
% function.arguments)
|
||||
value = function.arguments[0].value
|
||||
return xpath.add_condition(
|
||||
'contains(__lxml_internal_css:lower-case(string(.)), %s)'
|
||||
% self.xpath_literal(value.lower()))
|
||||
|
||||
|
||||
class LxmlHTMLTranslator(LxmlTranslator, external_cssselect.HTMLTranslator):
|
||||
"""
|
||||
lxml extensions + HTML support.
|
||||
"""
|
||||
|
||||
|
||||
def _make_lower_case(context, s):
|
||||
return s.lower()
|
||||
|
||||
ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
|
||||
ns.prefix = '__lxml_internal_css'
|
||||
ns['lower-case'] = _make_lower_case
|
||||
|
||||
|
||||
class CSSSelector(etree.XPath):
|
||||
"""A CSS selector.
|
||||
|
||||
Usage::
|
||||
|
||||
>>> from lxml import etree, cssselect
|
||||
>>> select = cssselect.CSSSelector("a tag > child")
|
||||
|
||||
>>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
|
||||
>>> [ el.tag for el in select(root) ]
|
||||
['child']
|
||||
|
||||
To use CSS namespaces, you need to pass a prefix-to-namespace
|
||||
mapping as ``namespaces`` keyword argument::
|
||||
|
||||
>>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
|
||||
>>> select_ns = cssselect.CSSSelector('root > rdf|Description',
|
||||
... namespaces={'rdf': rdfns})
|
||||
|
||||
>>> rdf = etree.XML((
|
||||
... '<root xmlns:rdf="%s">'
|
||||
... '<rdf:Description>blah</rdf:Description>'
|
||||
... '</root>') % rdfns)
|
||||
>>> [(el.tag, el.text) for el in select_ns(rdf)]
|
||||
[('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')]
|
||||
|
||||
"""
|
||||
def __init__(self, css, namespaces=None, translator='xml'):
|
||||
if translator == 'xml':
|
||||
translator = LxmlTranslator()
|
||||
elif translator == 'html':
|
||||
translator = LxmlHTMLTranslator()
|
||||
elif translator == 'xhtml':
|
||||
translator = LxmlHTMLTranslator(xhtml=True)
|
||||
path = translator.css_to_xpath(css)
|
||||
etree.XPath.__init__(self, path, namespaces=namespaces)
|
||||
self.css = css
|
||||
|
||||
def __repr__(self):
|
||||
return '<%s %s for %r>' % (
|
||||
self.__class__.__name__,
|
||||
hex(abs(id(self)))[2:],
|
||||
self.css)
|
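A quick illustration of the translator selection and the resulting `path`/`css` attributes (illustrative; the exact XPath text depends on the installed cssselect version):

from lxml.cssselect import CSSSelector

sel = CSSSelector('div.content p:contains("lxml")', translator='html')
print(sel.path)   # the XPath string produced by css_to_xpath()
print(sel.css)    # the original selector, kept for repr() and debugging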
8
lib/lxml/cvarargs.pxd
Normal file
@ -0,0 +1,8 @@
cdef extern from "stdarg.h":
    ctypedef void *va_list
    void va_start(va_list ap, void *last) nogil
    void va_end(va_list ap) nogil

cdef extern from "etree_defs.h":
    cdef int va_int(va_list ap) nogil
    cdef char *va_charptr(va_list ap) nogil
91
lib/lxml/debug.pxi
Normal file
@ -0,0 +1,91 @@
@cython.final
@cython.internal
cdef class _MemDebug:
    """Debugging support for the memory allocation in libxml2.
    """
    def bytes_used(self):
        """bytes_used(self)

        Returns the total amount of memory (in bytes) currently used by libxml2.
        Note that libxml2 constrains this value to a C int, which limits
        the accuracy on 64 bit systems.
        """
        return tree.xmlMemUsed()

    def blocks_used(self):
        """blocks_used(self)

        Returns the total number of memory blocks currently allocated by libxml2.
        Note that libxml2 constrains this value to a C int, which limits
        the accuracy on 64 bit systems.
        """
        return tree.xmlMemBlocks()

    def dict_size(self):
        """dict_size(self)

        Returns the current size of the global name dictionary used by libxml2
        for the current thread.  Each thread has its own dictionary.
        """
        c_dict = __GLOBAL_PARSER_CONTEXT._getThreadDict(NULL)
        if c_dict is NULL:
            raise MemoryError()
        return tree.xmlDictSize(c_dict)

    def dump(self, output_file=None, byte_count=None):
        """dump(self, output_file=None, byte_count=None)

        Dumps the current memory blocks allocated by libxml2 to a file.

        The optional parameter 'output_file' specifies the file path.  It defaults
        to the file ".memorylist" in the current directory.

        The optional parameter 'byte_count' limits the number of bytes in the dump.
        Note that this parameter is ignored when lxml is compiled against a libxml2
        version before 2.7.0.
        """
        cdef Py_ssize_t c_count
        if output_file is None:
            output_file = b'.memorylist'
        elif isinstance(output_file, unicode):
            # keep the encoded result; str.encode() does not modify in place
            output_file = output_file.encode(sys.getfilesystemencoding())

        f = stdio.fopen(output_file, "w")
        if f is NULL:
            raise IOError("Failed to create file %s" % output_file.decode(sys.getfilesystemencoding()))
        try:
            if byte_count is None:
                tree.xmlMemDisplay(f)
            else:
                c_count = byte_count
                tree.xmlMemDisplayLast(f, c_count)
        finally:
            stdio.fclose(f)

    def show(self, output_file=None, block_count=None):
        """show(self, output_file=None, block_count=None)

        Dumps the current memory blocks allocated by libxml2 to a file.
        The output file format is suitable for line diffing.

        The optional parameter 'output_file' specifies the file path.  It defaults
        to the file ".memorydump" in the current directory.

        The optional parameter 'block_count' limits the number of blocks
        in the dump.
        """
        if output_file is None:
            output_file = b'.memorydump'
        elif isinstance(output_file, unicode):
            # keep the encoded result; str.encode() does not modify in place
            output_file = output_file.encode(sys.getfilesystemencoding())

        f = stdio.fopen(output_file, "w")
        if f is NULL:
            raise IOError("Failed to create file %s" % output_file.decode(sys.getfilesystemencoding()))
        try:
            tree.xmlMemShow(f, block_count if block_count is not None else tree.xmlMemBlocks())
        finally:
            stdio.fclose(f)

memory_debugger = _MemDebug()
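A minimal sketch of using this debugger, assuming lxml was compiled against a libxml2 with memory debugging enabled and that the module-level `memory_debugger` above is reachable as `lxml.etree.memory_debugger`:

from lxml import etree

dbg = etree.memory_debugger
before = dbg.bytes_used()
doc = etree.XML('<root>' + '<leaf/>' * 1000 + '</root>')
print('libxml2 grew by ~%d bytes across %d blocks'
      % (dbg.bytes_used() - before, dbg.blocks_used()))
dbg.dump('.memorylist')   # write the raw block list for offline inspection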
175
lib/lxml/docloader.pxi
Normal file
@ -0,0 +1,175 @@
# Custom resolver API

ctypedef enum _InputDocumentDataType:
    PARSER_DATA_INVALID
    PARSER_DATA_EMPTY
    PARSER_DATA_STRING
    PARSER_DATA_FILENAME
    PARSER_DATA_FILE

@cython.final
@cython.internal
cdef class _InputDocument:
    cdef _InputDocumentDataType _type
    cdef bytes _data_bytes
    cdef object _filename
    cdef object _file
    cdef bint _close_file

    def __cinit__(self):
        self._type = PARSER_DATA_INVALID


cdef class Resolver:
    u"This is the base class of all resolvers."
    def resolve(self, system_url, public_id, context):
        u"""resolve(self, system_url, public_id, context)

        Override this method to resolve an external source by
        ``system_url`` and ``public_id``.  The third argument is an
        opaque context object.

        Return the result of one of the ``resolve_*()`` methods.
        """
        return None

    def resolve_empty(self, context):
        u"""resolve_empty(self, context)

        Return an empty input document.

        Pass context as parameter.
        """
        cdef _InputDocument doc_ref
        doc_ref = _InputDocument()
        doc_ref._type = PARSER_DATA_EMPTY
        return doc_ref

    def resolve_string(self, string, context, *, base_url=None):
        u"""resolve_string(self, string, context, base_url=None)

        Return a parsable string as input document.

        Pass data string and context as parameters.  You can pass the
        source URL or filename through the ``base_url`` keyword
        argument.
        """
        cdef _InputDocument doc_ref
        if isinstance(string, unicode):
            string = (<unicode>string).encode('utf8')
        elif not isinstance(string, bytes):
            raise TypeError, "argument must be a byte string or unicode string"
        doc_ref = _InputDocument()
        doc_ref._type = PARSER_DATA_STRING
        doc_ref._data_bytes = string
        if base_url is not None:
            doc_ref._filename = _encodeFilename(base_url)
        return doc_ref

    def resolve_filename(self, filename, context):
        u"""resolve_filename(self, filename, context)

        Return the name of a parsable file as input document.

        Pass filename and context as parameters.  You can also pass a
        URL with an HTTP, FTP or file target.
        """
        cdef _InputDocument doc_ref
        doc_ref = _InputDocument()
        doc_ref._type = PARSER_DATA_FILENAME
        doc_ref._filename = _encodeFilename(filename)
        return doc_ref

    def resolve_file(self, f, context, *, base_url=None, bint close=True):
        u"""resolve_file(self, f, context, base_url=None, close=True)

        Return an open file-like object as input document.

        Pass open file and context as parameters.  You can pass the
        base URL or filename of the file through the ``base_url``
        keyword argument.  If the ``close`` flag is True (the
        default), the file will be closed after reading.

        Note that using ``.resolve_filename()`` is more efficient,
        especially in threaded environments.
        """
        cdef _InputDocument doc_ref
        try:
            f.read
        except AttributeError:
            raise TypeError, u"Argument is not a file-like object"
        doc_ref = _InputDocument()
        doc_ref._type = PARSER_DATA_FILE
        if base_url is not None:
            doc_ref._filename = _encodeFilename(base_url)
        else:
            doc_ref._filename = _getFilenameForFile(f)
        doc_ref._close_file = close
        doc_ref._file = f
        return doc_ref

@cython.final
@cython.internal
cdef class _ResolverRegistry:
    cdef object _resolvers
    cdef Resolver _default_resolver
    def __cinit__(self, Resolver default_resolver=None):
        self._resolvers = set()
        self._default_resolver = default_resolver

    def add(self, Resolver resolver not None):
        u"""add(self, resolver)

        Register a resolver.

        For each requested entity, the 'resolve' method of the resolver will
        be called and the result will be passed to the parser.  If this method
        returns None, the request will be delegated to other resolvers or the
        default resolver.  The resolvers will be tested in an arbitrary order
        until the first match is found.
        """
        self._resolvers.add(resolver)

    def remove(self, resolver):
        u"remove(self, resolver)"
        self._resolvers.discard(resolver)

    cdef _ResolverRegistry _copy(self):
        cdef _ResolverRegistry registry
        registry = _ResolverRegistry(self._default_resolver)
        registry._resolvers = self._resolvers.copy()
        return registry

    def copy(self):
        u"copy(self)"
        return self._copy()

    def resolve(self, system_url, public_id, context):
        u"resolve(self, system_url, public_id, context)"
        for resolver in self._resolvers:
            result = resolver.resolve(system_url, public_id, context)
            if result is not None:
                return result
        if self._default_resolver is None:
            return None
        return self._default_resolver.resolve(system_url, public_id, context)

    def __repr__(self):
        return repr(self._resolvers)

@cython.internal
cdef class _ResolverContext(_ExceptionContext):
    cdef _ResolverRegistry _resolvers
    cdef _TempStore _storage

    cdef void clear(self):
        _ExceptionContext.clear(self)
        self._storage.clear()

cdef _initResolverContext(_ResolverContext context,
                          _ResolverRegistry resolvers):
    if resolvers is None:
        context._resolvers = _ResolverRegistry()
    else:
        context._resolvers = resolvers
    context._storage = _TempStore()
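A short sketch of a custom resolver built on this API; the URL scheme and the canned DTD content are invented for illustration:

from lxml import etree

class DTDResolver(etree.Resolver):
    def resolve(self, system_url, public_id, context):
        # serve a canned DTD for a hypothetical internal URL scheme
        if system_url == 'internal://app.dtd':
            return self.resolve_string('<!ELEMENT root EMPTY>', context)
        return None   # delegate to other registered resolvers (or the default)

parser = etree.XMLParser(load_dtd=True)
parser.resolvers.add(DTDResolver())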
505
lib/lxml/doctestcompare.py
Normal file
@ -0,0 +1,505 @@
"""
|
||||
lxml-based doctest output comparison.
|
||||
|
||||
Note: normally, you should just import the `lxml.usedoctest` and
|
||||
`lxml.html.usedoctest` modules from within a doctest, instead of this
|
||||
one::
|
||||
|
||||
>>> import lxml.usedoctest # for XML output
|
||||
|
||||
>>> import lxml.html.usedoctest # for HTML output
|
||||
|
||||
To use this module directly, you must call ``lxmldoctest.install()``,
|
||||
which will cause doctest to use this in all subsequent calls.
|
||||
|
||||
This changes the way output is checked and comparisons are made for
|
||||
XML or HTML-like content.
|
||||
|
||||
XML or HTML content is noticed because the example starts with ``<``
|
||||
(it's HTML if it starts with ``<html``). You can also use the
|
||||
``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing.
|
||||
|
||||
Some rough wildcard-like things are allowed. Whitespace is generally
|
||||
ignored (except in attributes). In text (attributes and text in the
|
||||
body) you can use ``...`` as a wildcard. In an example it also
|
||||
matches any trailing tags in the element, though it does not match
|
||||
leading tags. You may create a tag ``<any>`` or include an ``any``
|
||||
attribute in the tag. An ``any`` tag matches any tag, while the
|
||||
attribute matches any and all attributes.
|
||||
|
||||
When a match fails, the reformatted example and gotten text is
|
||||
displayed (indented), and a rough diff-like output is given. Anything
|
||||
marked with ``-`` is in the output but wasn't supposed to be, and
|
||||
similarly ``+`` means its in the example but wasn't in the output.
|
||||
|
||||
You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP``
|
||||
"""
|
||||
|
||||
from lxml import etree
|
||||
import sys
|
||||
import re
|
||||
import doctest
|
||||
import cgi
|
||||
|
||||
__all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker',
|
||||
'LHTMLOutputChecker', 'install', 'temp_install']
|
||||
|
||||
try:
|
||||
_basestring = basestring
|
||||
except NameError:
|
||||
_basestring = (str, bytes)
|
||||
|
||||
_IS_PYTHON_3 = sys.version_info[0] >= 3
|
||||
|
||||
PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
|
||||
PARSE_XML = doctest.register_optionflag('PARSE_XML')
|
||||
NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP')
|
||||
|
||||
OutputChecker = doctest.OutputChecker
|
||||
|
||||
def strip(v):
|
||||
if v is None:
|
||||
return None
|
||||
else:
|
||||
return v.strip()
|
||||
|
||||
def norm_whitespace(v):
|
||||
return _norm_whitespace_re.sub(' ', v)
|
||||
|
||||
_html_parser = etree.HTMLParser(recover=False, remove_blank_text=True)
|
||||
|
||||
def html_fromstring(html):
|
||||
return etree.fromstring(html, _html_parser)
|
||||
|
||||
# We use this to distinguish repr()s from elements:
|
||||
_repr_re = re.compile(r'^<[^>]+ (at|object) ')
|
||||
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')
|
||||
|
||||
class LXMLOutputChecker(OutputChecker):
|
||||
|
||||
empty_tags = (
|
||||
'param', 'img', 'area', 'br', 'basefont', 'input',
|
||||
'base', 'meta', 'link', 'col')
|
||||
|
||||
def get_default_parser(self):
|
||||
return etree.XML
|
||||
|
||||
def check_output(self, want, got, optionflags):
|
||||
alt_self = getattr(self, '_temp_override_self', None)
|
||||
if alt_self is not None:
|
||||
super_method = self._temp_call_super_check_output
|
||||
self = alt_self
|
||||
else:
|
||||
super_method = OutputChecker.check_output
|
||||
parser = self.get_parser(want, got, optionflags)
|
||||
if not parser:
|
||||
return super_method(
|
||||
self, want, got, optionflags)
|
||||
try:
|
||||
want_doc = parser(want)
|
||||
except etree.XMLSyntaxError:
|
||||
return False
|
||||
try:
|
||||
got_doc = parser(got)
|
||||
except etree.XMLSyntaxError:
|
||||
return False
|
||||
return self.compare_docs(want_doc, got_doc)
|
||||
|
||||
def get_parser(self, want, got, optionflags):
|
||||
parser = None
|
||||
if NOPARSE_MARKUP & optionflags:
|
||||
return None
|
||||
if PARSE_HTML & optionflags:
|
||||
parser = html_fromstring
|
||||
elif PARSE_XML & optionflags:
|
||||
parser = etree.XML
|
||||
elif (want.strip().lower().startswith('<html')
|
||||
and got.strip().startswith('<html')):
|
||||
parser = html_fromstring
|
||||
elif (self._looks_like_markup(want)
|
||||
and self._looks_like_markup(got)):
|
||||
parser = self.get_default_parser()
|
||||
return parser
|
||||
|
||||
def _looks_like_markup(self, s):
|
||||
s = s.strip()
|
||||
return (s.startswith('<')
|
||||
and not _repr_re.search(s))
|
||||
|
||||
def compare_docs(self, want, got):
|
||||
if not self.tag_compare(want.tag, got.tag):
|
||||
return False
|
||||
if not self.text_compare(want.text, got.text, True):
|
||||
return False
|
||||
if not self.text_compare(want.tail, got.tail, True):
|
||||
return False
|
||||
if 'any' not in want.attrib:
|
||||
want_keys = sorted(want.attrib.keys())
|
||||
got_keys = sorted(got.attrib.keys())
|
||||
if want_keys != got_keys:
|
||||
return False
|
||||
for key in want_keys:
|
||||
if not self.text_compare(want.attrib[key], got.attrib[key], False):
|
||||
return False
|
||||
if want.text != '...' or len(want):
|
||||
want_children = list(want)
|
||||
got_children = list(got)
|
||||
while want_children or got_children:
|
||||
if not want_children or not got_children:
|
||||
return False
|
||||
want_first = want_children.pop(0)
|
||||
got_first = got_children.pop(0)
|
||||
if not self.compare_docs(want_first, got_first):
|
||||
return False
|
||||
if not got_children and want_first.tail == '...':
|
||||
break
|
||||
return True
|
||||
|
||||
def text_compare(self, want, got, strip):
|
||||
want = want or ''
|
||||
got = got or ''
|
||||
if strip:
|
||||
want = norm_whitespace(want).strip()
|
||||
got = norm_whitespace(got).strip()
|
||||
want = '^%s$' % re.escape(want)
|
||||
want = want.replace(r'\.\.\.', '.*')
|
||||
if re.search(want, got):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def tag_compare(self, want, got):
|
||||
if want == 'any':
|
||||
return True
|
||||
if (not isinstance(want, _basestring)
|
||||
or not isinstance(got, _basestring)):
|
||||
return want == got
|
||||
want = want or ''
|
||||
got = got or ''
|
||||
if want.startswith('{...}'):
|
||||
# Ellipsis on the namespace
|
||||
return want.split('}')[-1] == got.split('}')[-1]
|
||||
else:
|
||||
return want == got
|
||||
|
||||
def output_difference(self, example, got, optionflags):
|
||||
want = example.want
|
||||
parser = self.get_parser(want, got, optionflags)
|
||||
errors = []
|
||||
if parser is not None:
|
||||
try:
|
||||
want_doc = parser(want)
|
||||
except etree.XMLSyntaxError:
|
||||
e = sys.exc_info()[1]
|
||||
errors.append('In example: %s' % e)
|
||||
try:
|
||||
got_doc = parser(got)
|
||||
except etree.XMLSyntaxError:
|
||||
e = sys.exc_info()[1]
|
||||
errors.append('In actual output: %s' % e)
|
||||
if parser is None or errors:
|
||||
value = OutputChecker.output_difference(
|
||||
self, example, got, optionflags)
|
||||
if errors:
|
||||
errors.append(value)
|
||||
return '\n'.join(errors)
|
||||
else:
|
||||
return value
|
||||
html = parser is html_fromstring
|
||||
diff_parts = []
|
||||
diff_parts.append('Expected:')
|
||||
diff_parts.append(self.format_doc(want_doc, html, 2))
|
||||
diff_parts.append('Got:')
|
||||
diff_parts.append(self.format_doc(got_doc, html, 2))
|
||||
diff_parts.append('Diff:')
|
||||
diff_parts.append(self.collect_diff(want_doc, got_doc, html, 2))
|
||||
return '\n'.join(diff_parts)
|
||||
|
||||
def html_empty_tag(self, el, html=True):
|
||||
if not html:
|
||||
return False
|
||||
if el.tag not in self.empty_tags:
|
||||
return False
|
||||
if el.text or len(el):
|
||||
# This shouldn't happen (contents in an empty tag)
|
||||
return False
|
||||
return True
|
||||
|
||||
def format_doc(self, doc, html, indent, prefix=''):
|
||||
parts = []
|
||||
if not len(doc):
|
||||
# No children...
|
||||
parts.append(' '*indent)
|
||||
parts.append(prefix)
|
||||
parts.append(self.format_tag(doc))
|
||||
if not self.html_empty_tag(doc, html):
|
||||
if strip(doc.text):
|
||||
parts.append(self.format_text(doc.text))
|
||||
parts.append(self.format_end_tag(doc))
|
||||
if strip(doc.tail):
|
||||
parts.append(self.format_text(doc.tail))
|
||||
parts.append('\n')
|
||||
return ''.join(parts)
|
||||
parts.append(' '*indent)
|
||||
parts.append(prefix)
|
||||
parts.append(self.format_tag(doc))
|
||||
if not self.html_empty_tag(doc, html):
|
||||
parts.append('\n')
|
||||
if strip(doc.text):
|
||||
parts.append(' '*indent)
|
||||
parts.append(self.format_text(doc.text))
|
||||
parts.append('\n')
|
||||
for el in doc:
|
||||
parts.append(self.format_doc(el, html, indent+2))
|
||||
parts.append(' '*indent)
|
||||
parts.append(self.format_end_tag(doc))
|
||||
parts.append('\n')
|
||||
if strip(doc.tail):
|
||||
parts.append(' '*indent)
|
||||
parts.append(self.format_text(doc.tail))
|
||||
parts.append('\n')
|
||||
return ''.join(parts)
|
||||
|
||||
def format_text(self, text, strip=True):
|
||||
if text is None:
|
||||
return ''
|
||||
if strip:
|
||||
text = text.strip()
|
||||
return cgi.escape(text, 1)
|
||||
|
||||
def format_tag(self, el):
|
||||
attrs = []
|
||||
if isinstance(el, etree.CommentBase):
|
||||
# FIXME: probably PIs should be handled specially too?
|
||||
return '<!--'
|
||||
for name, value in sorted(el.attrib.items()):
|
||||
attrs.append('%s="%s"' % (name, self.format_text(value, False)))
|
||||
if not attrs:
|
||||
return '<%s>' % el.tag
|
||||
return '<%s %s>' % (el.tag, ' '.join(attrs))
|
||||
|
||||
def format_end_tag(self, el):
|
||||
if isinstance(el, etree.CommentBase):
|
||||
# FIXME: probably PIs should be handled specially too?
|
||||
return '-->'
|
||||
return '</%s>' % el.tag
|
||||
|
||||
def collect_diff(self, want, got, html, indent):
|
||||
parts = []
|
||||
if not len(want) and not len(got):
|
||||
parts.append(' '*indent)
|
||||
parts.append(self.collect_diff_tag(want, got))
|
||||
if not self.html_empty_tag(got, html):
|
||||
parts.append(self.collect_diff_text(want.text, got.text))
|
||||
parts.append(self.collect_diff_end_tag(want, got))
|
||||
parts.append(self.collect_diff_text(want.tail, got.tail))
|
||||
parts.append('\n')
|
||||
return ''.join(parts)
|
||||
parts.append(' '*indent)
|
||||
parts.append(self.collect_diff_tag(want, got))
|
||||
parts.append('\n')
|
||||
if strip(want.text) or strip(got.text):
|
||||
parts.append(' '*indent)
|
||||
parts.append(self.collect_diff_text(want.text, got.text))
|
||||
parts.append('\n')
|
||||
want_children = list(want)
|
||||
got_children = list(got)
|
||||
while want_children or got_children:
|
||||
if not want_children:
|
||||
parts.append(self.format_doc(got_children.pop(0), html, indent+2, '-'))
|
||||
continue
|
||||
if not got_children:
|
||||
parts.append(self.format_doc(want_children.pop(0), html, indent+2, '+'))
|
||||
continue
|
||||
parts.append(self.collect_diff(
|
||||
want_children.pop(0), got_children.pop(0), html, indent+2))
|
||||
parts.append(' '*indent)
|
||||
parts.append(self.collect_diff_end_tag(want, got))
|
||||
parts.append('\n')
|
||||
if strip(want.tail) or strip(got.tail):
|
||||
parts.append(' '*indent)
|
||||
parts.append(self.collect_diff_text(want.tail, got.tail))
|
||||
parts.append('\n')
|
||||
return ''.join(parts)
|
||||
|
||||
def collect_diff_tag(self, want, got):
|
||||
if not self.tag_compare(want.tag, got.tag):
|
||||
tag = '%s (got: %s)' % (want.tag, got.tag)
|
||||
else:
|
||||
tag = got.tag
|
||||
attrs = []
|
||||
any = want.tag == 'any' or 'any' in want.attrib
|
||||
for name, value in sorted(got.attrib.items()):
|
||||
if name not in want.attrib and not any:
|
||||
attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
|
||||
else:
|
||||
if name in want.attrib:
|
||||
text = self.collect_diff_text(want.attrib[name], value, False)
|
||||
else:
|
||||
text = self.format_text(value, False)
|
||||
attrs.append('%s="%s"' % (name, text))
|
||||
if not any:
|
||||
for name, value in sorted(want.attrib.items()):
|
||||
if name in got.attrib:
|
||||
continue
|
||||
attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
|
||||
if attrs:
|
||||
tag = '<%s %s>' % (tag, ' '.join(attrs))
|
||||
else:
|
||||
tag = '<%s>' % tag
|
||||
return tag
|
||||
|
||||
def collect_diff_end_tag(self, want, got):
|
||||
if want.tag != got.tag:
|
||||
tag = '%s (got: %s)' % (want.tag, got.tag)
|
||||
else:
|
||||
tag = got.tag
|
||||
return '</%s>' % tag
|
||||
|
||||
def collect_diff_text(self, want, got, strip=True):
|
||||
if self.text_compare(want, got, strip):
|
||||
if not got:
|
||||
return ''
|
||||
return self.format_text(got, strip)
|
||||
text = '%s (got: %s)' % (want, got)
|
||||
return self.format_text(text, strip)
|
||||
|
||||
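The checker can also be driven directly, outside of a doctest run. A minimal sketch (illustrative, using only methods defined above):

from lxml.doctestcompare import LXMLOutputChecker

checker = LXMLOutputChecker()
# <any> in the expected output matches any tag name in the actual output
print(checker.check_output('<root><any/></root>',
                           '<root><item/></root>', 0))   # True
print(checker.check_output('<root>he...</root>',
                           '<root>hello</root>', 0))     # True: '...' is a text wildcard
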
class LHTMLOutputChecker(LXMLOutputChecker):
    def get_default_parser(self):
        return html_fromstring

def install(html=False):
    """
    Install doctestcompare for all future doctests.

    If html is true, then by default the HTML parser will be used;
    otherwise the XML parser is used.
    """
    if html:
        doctest.OutputChecker = LHTMLOutputChecker
    else:
        doctest.OutputChecker = LXMLOutputChecker

def temp_install(html=False, del_module=None):
    """
    Use this *inside* a doctest to enable this checker for this
    doctest only.

    If html is true, then by default the HTML parser will be used;
    otherwise the XML parser is used.
    """
    if html:
        Checker = LHTMLOutputChecker
    else:
        Checker = LXMLOutputChecker
    frame = _find_doctest_frame()
    dt_self = frame.f_locals['self']
    checker = Checker()
    old_checker = dt_self._checker
    dt_self._checker = checker
    # The unfortunate thing is that there is a local variable 'check'
    # in the function that runs the doctests, that is a bound method
    # into the output checker.  We have to update that.  We can't
    # modify the frame, so we have to modify the object in place.  The
    # only way to do this is to actually change the func_code
    # attribute of the method.  We change it, and then wait for
    # __record_outcome to be run, which signals the end of the __run
    # method, at which point we restore the previous check_output
    # implementation.
    if _IS_PYTHON_3:
        check_func = frame.f_locals['check'].__func__
        checker_check_func = checker.check_output.__func__
    else:
        check_func = frame.f_locals['check'].im_func
        checker_check_func = checker.check_output.im_func
    # Because we can't patch up func_globals, this is the only global
    # in check_output that we care about:
    doctest.etree = etree
    _RestoreChecker(dt_self, old_checker, checker,
                    check_func, checker_check_func,
                    del_module)

class _RestoreChecker(object):
    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
                 del_module):
        self.dt_self = dt_self
        self.checker = old_checker
        self.checker._temp_call_super_check_output = self.call_super
        self.checker._temp_override_self = new_checker
        self.check_func = check_func
        self.clone_func = clone_func
        self.del_module = del_module
        self.install_clone()
        self.install_dt_self()
    def install_clone(self):
        if _IS_PYTHON_3:
            self.func_code = self.check_func.__code__
            self.func_globals = self.check_func.__globals__
            self.check_func.__code__ = self.clone_func.__code__
        else:
            self.func_code = self.check_func.func_code
            self.func_globals = self.check_func.func_globals
            self.check_func.func_code = self.clone_func.func_code
    def uninstall_clone(self):
        if _IS_PYTHON_3:
            self.check_func.__code__ = self.func_code
        else:
            self.check_func.func_code = self.func_code
    def install_dt_self(self):
        self.prev_func = self.dt_self._DocTestRunner__record_outcome
        self.dt_self._DocTestRunner__record_outcome = self
    def uninstall_dt_self(self):
        self.dt_self._DocTestRunner__record_outcome = self.prev_func
    def uninstall_module(self):
        if self.del_module:
            import sys
            del sys.modules[self.del_module]
            if '.' in self.del_module:
                package, module = self.del_module.rsplit('.', 1)
                package_mod = sys.modules[package]
                delattr(package_mod, module)
    def __call__(self, *args, **kw):
        self.uninstall_clone()
        self.uninstall_dt_self()
        del self.checker._temp_override_self
        del self.checker._temp_call_super_check_output
        result = self.prev_func(*args, **kw)
        self.uninstall_module()
        return result
    def call_super(self, *args, **kw):
        self.uninstall_clone()
        try:
            return self.check_func(*args, **kw)
        finally:
            self.install_clone()

def _find_doctest_frame():
    import sys
    frame = sys._getframe(1)
    while frame:
        l = frame.f_locals
        if 'BOOM' in l:
            # Sign of doctest
            return frame
        frame = frame.f_back
    raise LookupError(
        "Could not find doctest (only use this function *inside* a doctest)")

__test__ = {
    'basic': '''
    >>> temp_install()
    >>> print """<xml a="1" b="2">stuff</xml>"""
    <xml b="2" a="1">...</xml>
    >>> print """<xml xmlns="http://example.com"><tag attr="bar" /></xml>"""
    <xml xmlns="...">
      <tag attr="..." />
    </xml>
    >>> print """<xml>blahblahblah<foo /></xml>""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS
    <xml>...foo /></xml>
    '''}

if __name__ == '__main__':
    import doctest
    doctest.testmod()

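In everyday use the module is pulled in indirectly via `lxml.usedoctest`. A hedged sketch of a doctest that relies on it; attribute quoting, ordering and indentation differences are tolerated by the checker:

def render():
    """
    >>> import lxml.usedoctest
    >>> print('<item  id="1"><name>X</name></item>')
    <item id="1">
      <name>X</name>
    </item>
    """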
468
lib/lxml/dtd.pxi
Normal file
@ -0,0 +1,468 @@
# support for DTD validation
from lxml.includes cimport dtdvalid

class DTDError(LxmlError):
    u"""Base class for DTD errors.
    """
    pass

class DTDParseError(DTDError):
    u"""Error while parsing a DTD.
    """
    pass

class DTDValidateError(DTDError):
    u"""Error while validating an XML document with a DTD.
    """
    pass


cdef inline int _assertValidDTDNode(node, void *c_node) except -1:
    assert c_node is not NULL, u"invalid DTD proxy at %s" % id(node)


@cython.final
@cython.internal
@cython.freelist(8)
cdef class _DTDElementContentDecl:
    cdef DTD _dtd
    cdef tree.xmlElementContent* _c_node

    def __repr__(self):
        return "<%s.%s object name=%r type=%r occur=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.type, self.occur, id(self))

    property name:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            return funicode(self._c_node.name) if self._c_node.name is not NULL else None

    property type:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            cdef int type = self._c_node.type
            if type == tree.XML_ELEMENT_CONTENT_PCDATA:
                return "pcdata"
            elif type == tree.XML_ELEMENT_CONTENT_ELEMENT:
                return "element"
            elif type == tree.XML_ELEMENT_CONTENT_SEQ:
                return "seq"
            elif type == tree.XML_ELEMENT_CONTENT_OR:
                return "or"
            else:
                return None

    property occur:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            cdef int occur = self._c_node.ocur
            if occur == tree.XML_ELEMENT_CONTENT_ONCE:
                return "once"
            elif occur == tree.XML_ELEMENT_CONTENT_OPT:
                return "opt"
            elif occur == tree.XML_ELEMENT_CONTENT_MULT:
                return "mult"
            elif occur == tree.XML_ELEMENT_CONTENT_PLUS:
                return "plus"
            else:
                return None

    property left:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            c1 = self._c_node.c1
            if c1:
                node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
                node._dtd = self._dtd
                node._c_node = <tree.xmlElementContent*>c1
                return node
            else:
                return None

    property right:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            c2 = self._c_node.c2
            if c2:
                node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
                node._dtd = self._dtd
                node._c_node = <tree.xmlElementContent*>c2
                return node
            else:
                return None


@cython.final
@cython.internal
@cython.freelist(8)
cdef class _DTDAttributeDecl:
    cdef DTD _dtd
    cdef tree.xmlAttribute* _c_node

    def __repr__(self):
        return "<%s.%s object name=%r elemname=%r prefix=%r type=%r default=%r default_value=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.elemname, self.prefix, self.type, self.default, self.default_value, id(self))

    property name:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            return funicode(self._c_node.name) if self._c_node.name is not NULL else None

    property elemname:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            return funicode(self._c_node.elem) if self._c_node.elem is not NULL else None

    property prefix:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            return funicode(self._c_node.prefix) if self._c_node.prefix is not NULL else None

    property type:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            cdef int type = self._c_node.atype
            if type == tree.XML_ATTRIBUTE_CDATA:
                return "cdata"
            elif type == tree.XML_ATTRIBUTE_ID:
                return "id"
            elif type == tree.XML_ATTRIBUTE_IDREF:
                return "idref"
            elif type == tree.XML_ATTRIBUTE_IDREFS:
                return "idrefs"
            elif type == tree.XML_ATTRIBUTE_ENTITY:
                return "entity"
            elif type == tree.XML_ATTRIBUTE_ENTITIES:
                return "entities"
            elif type == tree.XML_ATTRIBUTE_NMTOKEN:
                return "nmtoken"
            elif type == tree.XML_ATTRIBUTE_NMTOKENS:
                return "nmtokens"
            elif type == tree.XML_ATTRIBUTE_ENUMERATION:
                return "enumeration"
            elif type == tree.XML_ATTRIBUTE_NOTATION:
                return "notation"
            else:
                return None

    property default:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            cdef int default = self._c_node.def_
            if default == tree.XML_ATTRIBUTE_NONE:
                return "none"
            elif default == tree.XML_ATTRIBUTE_REQUIRED:
                return "required"
            elif default == tree.XML_ATTRIBUTE_IMPLIED:
                return "implied"
            elif default == tree.XML_ATTRIBUTE_FIXED:
                return "fixed"
            else:
                return None

    property default_value:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            return funicode(self._c_node.defaultValue) if self._c_node.defaultValue is not NULL else None

    def itervalues(self):
        _assertValidDTDNode(self, self._c_node)
        cdef tree.xmlEnumeration *c_node = self._c_node.tree
        while c_node is not NULL:
            yield funicode(c_node.name)
            c_node = c_node.next

    def values(self):
        return list(self.itervalues())


@cython.final
@cython.internal
@cython.freelist(8)
cdef class _DTDElementDecl:
    cdef DTD _dtd
    cdef tree.xmlElement* _c_node

    def __repr__(self):
        return "<%s.%s object name=%r prefix=%r type=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.prefix, self.type, id(self))

    property name:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            return funicode(self._c_node.name) if self._c_node.name is not NULL else None

    property prefix:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            return funicode(self._c_node.prefix) if self._c_node.prefix is not NULL else None

    property type:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            cdef int type = self._c_node.etype
            if type == tree.XML_ELEMENT_TYPE_UNDEFINED:
                return "undefined"
            elif type == tree.XML_ELEMENT_TYPE_EMPTY:
                return "empty"
            elif type == tree.XML_ELEMENT_TYPE_ANY:
                return "any"
            elif type == tree.XML_ELEMENT_TYPE_MIXED:
                return "mixed"
            elif type == tree.XML_ELEMENT_TYPE_ELEMENT:
                return "element"
            else:
                return None

    property content:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            cdef tree.xmlElementContent *content = self._c_node.content
            if content:
                node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
                node._dtd = self._dtd
                node._c_node = content
                return node
            else:
                return None

    def iterattributes(self):
        _assertValidDTDNode(self, self._c_node)
        cdef tree.xmlAttribute *c_node = self._c_node.attributes
        while c_node:
            node = <_DTDAttributeDecl>_DTDAttributeDecl.__new__(_DTDAttributeDecl)
            node._dtd = self._dtd
            node._c_node = c_node
            yield node
            c_node = c_node.nexth

    def attributes(self):
        return list(self.iterattributes())


@cython.final
@cython.internal
@cython.freelist(8)
cdef class _DTDEntityDecl:
    cdef DTD _dtd
    cdef tree.xmlEntity* _c_node
    def __repr__(self):
        return "<%s.%s object name=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))

    property name:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            return funicode(self._c_node.name) if self._c_node.name is not NULL else None

    property orig:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            return funicode(self._c_node.orig) if self._c_node.orig is not NULL else None

    property content:
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            return funicode(self._c_node.content) if self._c_node.content is not NULL else None


################################################################################
# DTD

cdef class DTD(_Validator):
    u"""DTD(self, file=None, external_id=None)
    A DTD validator.

    Can load from filesystem directly given a filename or file-like object.
    Alternatively, pass the keyword parameter ``external_id`` to load from a
    catalog.
    """
    cdef tree.xmlDtd* _c_dtd
    def __init__(self, file=None, *, external_id=None):
        _Validator.__init__(self)
        if file is not None:
            if _isString(file):
                file = _encodeFilename(file)
                with self._error_log:
                    self._c_dtd = xmlparser.xmlParseDTD(NULL, _xcstr(file))
            elif hasattr(file, 'read'):
                self._c_dtd = _parseDtdFromFilelike(file)
            else:
                raise DTDParseError, u"file must be a filename or file-like object"
        elif external_id is not None:
            with self._error_log:
                self._c_dtd = xmlparser.xmlParseDTD(<const_xmlChar*>external_id, NULL)
        else:
            raise DTDParseError, u"either filename or external ID required"

        if self._c_dtd is NULL:
            raise DTDParseError(
                self._error_log._buildExceptionMessage(u"error parsing DTD"),
                self._error_log)

    property name:
        def __get__(self):
            if self._c_dtd is NULL:
                return None
            return funicodeOrNone(self._c_dtd.name)

    property external_id:
        def __get__(self):
            if self._c_dtd is NULL:
                return None
            return funicodeOrNone(self._c_dtd.ExternalID)

    property system_url:
        def __get__(self):
            if self._c_dtd is NULL:
                return None
            return funicodeOrNone(self._c_dtd.SystemID)

    def iterelements(self):
        cdef tree.xmlNode *c_node = self._c_dtd.children if self._c_dtd is not NULL else NULL
        while c_node is not NULL:
            if c_node.type == tree.XML_ELEMENT_DECL:
                node = _DTDElementDecl()
                node._dtd = self
                node._c_node = <tree.xmlElement*>c_node
                yield node
            c_node = c_node.next

    def elements(self):
        return list(self.iterelements())

    def iterentities(self):
        cdef tree.xmlNode *c_node = self._c_dtd.children if self._c_dtd is not NULL else NULL
        while c_node is not NULL:
            if c_node.type == tree.XML_ENTITY_DECL:
                node = _DTDEntityDecl()
                node._dtd = self
                node._c_node = <tree.xmlEntity*>c_node
                yield node
            c_node = c_node.next

    def entities(self):
        return list(self.iterentities())

    def __dealloc__(self):
        tree.xmlFreeDtd(self._c_dtd)

    def __call__(self, etree):
        u"""__call__(self, etree)

        Validate doc using the DTD.

        Returns true if the document is valid, false if not.
        """
        cdef _Document doc
        cdef _Element root_node
        cdef xmlDoc* c_doc
        cdef dtdvalid.xmlValidCtxt* valid_ctxt
        cdef int ret = -1

        assert self._c_dtd is not NULL, "DTD not initialised"
        doc = _documentOrRaise(etree)
        root_node = _rootNodeOrRaise(etree)

        valid_ctxt = dtdvalid.xmlNewValidCtxt()
        if valid_ctxt is NULL:
            raise DTDError(u"Failed to create validation context")

        # work around error reporting bug in libxml2 <= 2.9.1 (and later?)
        # https://bugzilla.gnome.org/show_bug.cgi?id=724903
        valid_ctxt.error = <dtdvalid.xmlValidityErrorFunc>_nullGenericErrorFunc
        valid_ctxt.userData = NULL

        try:
            with self._error_log:
                c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node)
                ret = dtdvalid.xmlValidateDtd(valid_ctxt, c_doc, self._c_dtd)
                _destroyFakeDoc(doc._c_doc, c_doc)
        finally:
            dtdvalid.xmlFreeValidCtxt(valid_ctxt)

        if ret == -1:
            raise DTDValidateError(u"Internal error in DTD validation",
                                   self._error_log)
        return ret == 1


cdef tree.xmlDtd* _parseDtdFromFilelike(file) except NULL:
    cdef _ExceptionContext exc_context
    cdef _FileReaderContext dtd_parser
    cdef _ErrorLog error_log
    cdef tree.xmlDtd* c_dtd
    exc_context = _ExceptionContext()
    dtd_parser = _FileReaderContext(file, exc_context, None)
    error_log = _ErrorLog()

    with error_log:
        c_dtd = dtd_parser._readDtd()

    exc_context._raise_if_stored()
    if c_dtd is NULL:
        raise DTDParseError(u"error parsing DTD", error_log)
    return c_dtd

cdef DTD _dtdFactory(tree.xmlDtd* c_dtd):
    # do not run through DTD.__init__()!
    cdef DTD dtd
    if c_dtd is NULL:
        return None
    dtd = DTD.__new__(DTD)
    dtd._c_dtd = _copyDtd(c_dtd)
    _Validator.__init__(dtd)
    return dtd


cdef tree.xmlDtd* _copyDtd(tree.xmlDtd* c_orig_dtd) except NULL:
    """
    Copy a DTD.  libxml2 (currently) fails to set up the element->attributes
    links when copying DTDs, so we have to rebuild them here.
    """
    c_dtd = tree.xmlCopyDtd(c_orig_dtd)
    if not c_dtd:
        raise MemoryError
    cdef tree.xmlNode* c_node = c_dtd.children
    while c_node:
        if c_node.type == tree.XML_ATTRIBUTE_DECL:
            _linkDtdAttribute(c_dtd, <tree.xmlAttribute*>c_node)
        c_node = c_node.next
    return c_dtd


cdef void _linkDtdAttribute(tree.xmlDtd* c_dtd, tree.xmlAttribute* c_attr):
    """
    Create the link to the DTD attribute declaration from the corresponding
    element declaration.
    """
    c_elem = dtdvalid.xmlGetDtdElementDesc(c_dtd, c_attr.elem)
    if not c_elem:
        # no such element?  something is wrong with the DTD ...
        return
    c_pos = c_elem.attributes
    if not c_pos:
        c_elem.attributes = c_attr
        c_attr.nexth = NULL
        return
    # libxml2 keeps namespace declarations first, and we need to make
    # sure we don't re-insert attributes that are already there
    if _isDtdNsDecl(c_attr):
        if not _isDtdNsDecl(c_pos):
            c_elem.attributes = c_attr
            c_attr.nexth = c_pos
            return
        while c_pos != c_attr and c_pos.nexth and _isDtdNsDecl(c_pos.nexth):
            c_pos = c_pos.nexth
    else:
        # append at end
        while c_pos != c_attr and c_pos.nexth:
            c_pos = c_pos.nexth
    if c_pos == c_attr:
        return
    c_attr.nexth = c_pos.nexth
    c_pos.nexth = c_attr


cdef bint _isDtdNsDecl(tree.xmlAttribute* c_attr):
    if cstring_h.strcmp(<const_char*>c_attr.name, "xmlns") == 0:
        return True
    if (c_attr.prefix is not NULL and
        cstring_h.strcmp(<const_char*>c_attr.prefix, "xmlns") == 0):
        return True
    return False
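A brief usage sketch for the DTD API above (the `validate()` alias comes from the `_Validator` base class; illustrative):

from io import StringIO
from lxml import etree

dtd = etree.DTD(StringIO('<!ELEMENT root (leaf*)> <!ELEMENT leaf EMPTY>'))
root = etree.XML('<root><leaf/></root>')
print(dtd(root))             # True: __call__ runs the validation
print(dtd.validate(root))    # same check via the _Validator base class
for el in dtd.iterelements():
    print(el.name, el.type)  # root 'element', leaf 'empty'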
855
lib/lxml/extensions.pxi
Normal file
@ -0,0 +1,855 @@
# support for extension functions in XPath and XSLT

class XPathError(LxmlError):
    u"""Base class of all XPath errors.
    """
    pass

class XPathEvalError(XPathError):
    u"""Error during XPath evaluation.
    """
    pass

class XPathFunctionError(XPathEvalError):
    u"""Internal error looking up an XPath extension function.
    """
    pass

class XPathResultError(XPathEvalError):
    u"""Error handling an XPath result.
    """
    pass

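Before the context machinery below, a sketch of what these pieces ultimately enable at the Python level: registering an extension function and calling it from XPath (the namespace URI and names are made up for illustration):

from lxml import etree

def _hello(context, name):
    return 'Hello %s' % name

ns = etree.FunctionNamespace('http://example.com/myfuncs')  # hypothetical URI
ns['hello'] = _hello

root = etree.XML('<root/>')
print(root.xpath('my:hello("world")',
                 namespaces={'my': 'http://example.com/myfuncs'}))
# 'Hello world'
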
# forward declarations

ctypedef int (*_register_function)(void* ctxt, name_utf, ns_uri_utf)
cdef class _ExsltRegExp

################################################################################
# Base class for XSLT and XPath evaluation contexts: functions, namespaces, ...

@cython.internal
cdef class _BaseContext:
    cdef xpath.xmlXPathContext* _xpathCtxt
    cdef _Document _doc
    cdef dict _extensions
    cdef list _namespaces
    cdef list _global_namespaces
    cdef dict _utf_refs
    cdef dict _function_cache
    cdef dict _eval_context_dict
    cdef bint _build_smart_strings
    # for exception handling and temporary reference keeping:
    cdef _TempStore _temp_refs
    cdef set _temp_documents
    cdef _ExceptionContext _exc
    cdef _ErrorLog _error_log

    def __cinit__(self):
        self._xpathCtxt = NULL

    def __init__(self, namespaces, extensions, error_log, enable_regexp,
                 build_smart_strings):
        cdef _ExsltRegExp _regexp
        cdef dict new_extensions
        cdef list ns
        self._utf_refs = {}
        self._global_namespaces = []
        self._function_cache = {}
        self._eval_context_dict = None
        self._error_log = error_log

        if extensions is not None:
            # convert extensions to UTF-8
            if isinstance(extensions, dict):
                extensions = (extensions,)
            # format: [ {(ns, name):function} ] -> {(ns_utf, name_utf):function}
            new_extensions = {}
            for extension in extensions:
                for (ns_uri, name), function in extension.items():
                    if name is None:
                        raise ValueError, u"extensions must have non empty names"
                    ns_utf = self._to_utf(ns_uri)
                    name_utf = self._to_utf(name)
                    new_extensions[(ns_utf, name_utf)] = function
            extensions = new_extensions or None

        if namespaces is not None:
            if isinstance(namespaces, dict):
                namespaces = namespaces.items()
            if namespaces:
                ns = []
                for prefix, ns_uri in namespaces:
                    if prefix is None or not prefix:
                        raise TypeError, \
                            u"empty namespace prefix is not supported in XPath"
                    if ns_uri is None or not ns_uri:
                        raise TypeError, \
                            u"setting default namespace is not supported in XPath"
                    prefix_utf = self._to_utf(prefix)
                    ns_uri_utf = self._to_utf(ns_uri)
                    ns.append( (prefix_utf, ns_uri_utf) )
                namespaces = ns
            else:
                namespaces = None

        self._doc = None
        self._exc = _ExceptionContext()
        self._extensions = extensions
        self._namespaces = namespaces
        self._temp_refs = _TempStore()
        self._temp_documents = set()
        self._build_smart_strings = build_smart_strings

        if enable_regexp:
            _regexp = _ExsltRegExp()
            _regexp._register_in_context(self)

    cdef _BaseContext _copy(self):
        cdef _BaseContext context
        if self._namespaces is not None:
            namespaces = self._namespaces[:]
        else:
            namespaces = None
        context = self.__class__(namespaces, None, self._error_log, False,
                                 self._build_smart_strings)
        if self._extensions is not None:
            context._extensions = self._extensions.copy()
        return context

    cdef bytes _to_utf(self, s):
        u"Convert to UTF-8 and keep a reference to the encoded string"
        cdef python.PyObject* dict_result
        if s is None:
            return None
        dict_result = python.PyDict_GetItem(self._utf_refs, s)
        if dict_result is not NULL:
            return <bytes>dict_result
        utf = _utf8(s)
        self._utf_refs[s] = utf
        if python.IS_PYPY:
            # use C level refs, PyPy refs are not enough!
            python.Py_INCREF(utf)
        return utf

    cdef void _set_xpath_context(self, xpath.xmlXPathContext* xpathCtxt):
        self._xpathCtxt = xpathCtxt
        xpathCtxt.userData = <void*>self
        xpathCtxt.error = _receiveXPathError

    @cython.final
    cdef _register_context(self, _Document doc):
        self._doc = doc
        self._exc.clear()

    @cython.final
    cdef _cleanup_context(self):
        #xpath.xmlXPathRegisteredNsCleanup(self._xpathCtxt)
        #self.unregisterGlobalNamespaces()
        if python.IS_PYPY:
            # clean up double refs in PyPy (see "_to_utf()" method)
            for ref in self._utf_refs.itervalues():
                python.Py_DECREF(ref)
        self._utf_refs.clear()
        self._eval_context_dict = None
        self._doc = None

    @cython.final
    cdef _release_context(self):
        if self._xpathCtxt is not NULL:
            self._xpathCtxt.userData = NULL
            self._xpathCtxt = NULL

    # namespaces (internal UTF-8 methods with leading '_')

    cdef addNamespace(self, prefix, ns_uri):
        cdef list namespaces
        if prefix is None:
            raise TypeError, u"empty prefix is not supported in XPath"
        prefix_utf = self._to_utf(prefix)
        ns_uri_utf = self._to_utf(ns_uri)
        new_item = (prefix_utf, ns_uri_utf)
        if self._namespaces is None:
            self._namespaces = [new_item]
        else:
            namespaces = []
            for item in self._namespaces:
                if item[0] == prefix_utf:
                    item = new_item
                    new_item = None
                namespaces.append(item)
            if new_item is not None:
                namespaces.append(new_item)
            self._namespaces = namespaces
        if self._xpathCtxt is not NULL:
            xpath.xmlXPathRegisterNs(
                self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))

    cdef registerNamespace(self, prefix, ns_uri):
        if prefix is None:
            raise TypeError, u"empty prefix is not supported in XPath"
        prefix_utf = self._to_utf(prefix)
        ns_uri_utf = self._to_utf(ns_uri)
        self._global_namespaces.append(prefix_utf)
        xpath.xmlXPathRegisterNs(self._xpathCtxt,
                                 _xcstr(prefix_utf), _xcstr(ns_uri_utf))

    cdef registerLocalNamespaces(self):
        if self._namespaces is None:
            return
        for prefix_utf, ns_uri_utf in self._namespaces:
            xpath.xmlXPathRegisterNs(
                self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))

    cdef registerGlobalNamespaces(self):
        cdef list ns_prefixes = _find_all_extension_prefixes()
        if python.PyList_GET_SIZE(ns_prefixes) > 0:
            for prefix_utf, ns_uri_utf in ns_prefixes:
                self._global_namespaces.append(prefix_utf)
                xpath.xmlXPathRegisterNs(
                    self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))

    cdef unregisterGlobalNamespaces(self):
        if python.PyList_GET_SIZE(self._global_namespaces) > 0:
            for prefix_utf in self._global_namespaces:
                xpath.xmlXPathRegisterNs(self._xpathCtxt,
                                         _xcstr(prefix_utf), NULL)
            del self._global_namespaces[:]

    cdef void _unregisterNamespace(self, prefix_utf):
        xpath.xmlXPathRegisterNs(self._xpathCtxt,
                                 _xcstr(prefix_utf), NULL)

    # extension functions

    cdef int _addLocalExtensionFunction(self, ns_utf, name_utf, function) except -1:
        if self._extensions is None:
            self._extensions = {}
        self._extensions[(ns_utf, name_utf)] = function
        return 0

    cdef registerGlobalFunctions(self, void* ctxt,
                                 _register_function reg_func):
        cdef python.PyObject* dict_result
        cdef dict d
        for ns_utf, ns_functions in __FUNCTION_NAMESPACE_REGISTRIES.iteritems():
            dict_result = python.PyDict_GetItem(
                self._function_cache, ns_utf)
            if dict_result is not NULL:
                d = <dict>dict_result
            else:
                d = {}
                self._function_cache[ns_utf] = d
            for name_utf, function in ns_functions.iteritems():
                d[name_utf] = function
                reg_func(ctxt, name_utf, ns_utf)

    cdef registerLocalFunctions(self, void* ctxt,
                                _register_function reg_func):
        cdef python.PyObject* dict_result
        cdef dict d
        if self._extensions is None:
            return # done
        last_ns = None
        d = None
        for (ns_utf, name_utf), function in self._extensions.iteritems():
            if ns_utf is not last_ns or d is None:
                last_ns = ns_utf
                dict_result = python.PyDict_GetItem(
                    self._function_cache, ns_utf)
                if dict_result is not NULL:
                    d = <dict>dict_result
                else:
                    d = {}
                    self._function_cache[ns_utf] = d
            d[name_utf] = function
            reg_func(ctxt, name_utf, ns_utf)

    cdef unregisterAllFunctions(self, void* ctxt,
                                _register_function unreg_func):
        for ns_utf, functions in self._function_cache.iteritems():
            for name_utf in functions:
                unreg_func(ctxt, name_utf, ns_utf)

    cdef unregisterGlobalFunctions(self, void* ctxt,
                                   _register_function unreg_func):
        for ns_utf, functions in self._function_cache.items():
            for name_utf in functions:
                if self._extensions is None or \
                       (ns_utf, name_utf) not in self._extensions:
                    unreg_func(ctxt, name_utf, ns_utf)

    @cython.final
    cdef _find_cached_function(self, const_xmlChar* c_ns_uri, const_xmlChar* c_name):
        u"""Lookup an extension function in the cache and return it.

        Parameters: c_ns_uri may be NULL, c_name must not be NULL
        """
        cdef python.PyObject* c_dict
        cdef python.PyObject* dict_result
        c_dict = python.PyDict_GetItem(
            self._function_cache, None if c_ns_uri is NULL else c_ns_uri)
        if c_dict is not NULL:
            dict_result = python.PyDict_GetItem(
                <object>c_dict, <unsigned char*>c_name)
            if dict_result is not NULL:
                return <object>dict_result
        return None

    # Python access to the XPath context for extension functions

    property context_node:
        def __get__(self):
            cdef xmlNode* c_node
            if self._xpathCtxt is NULL:
                raise XPathError, \
                    u"XPath context is only usable during the evaluation"
            c_node = self._xpathCtxt.node
            if c_node is NULL:
                raise XPathError, u"no context node"
            if c_node.doc != self._xpathCtxt.doc:
                raise XPathError, \
                    u"document-external context nodes are not supported"
            if self._doc is None:
                raise XPathError, u"document context is missing"
            return _elementFactory(self._doc, c_node)

    property eval_context:
        def __get__(self):
            if self._eval_context_dict is None:
                self._eval_context_dict = {}
            return self._eval_context_dict

    # Python reference keeping during XPath function evaluation

    @cython.final
    cdef _release_temp_refs(self):
        u"Free temporarily referenced objects from this context."
        self._temp_refs.clear()
        self._temp_documents.clear()

    @cython.final
    cdef _hold(self, obj):
        u"""A way to temporarily hold references to nodes in the evaluator.

        This is needed because otherwise nodes created in XPath extension
        functions would be reference counted too soon, during the XPath
        evaluation.  This is most important in the case of exceptions.
        """
        cdef _Element element
        if isinstance(obj, _Element):
            self._temp_refs.add(obj)
            self._temp_documents.add((<_Element>obj)._doc)
            return
        elif _isString(obj) or not python.PySequence_Check(obj):
            return
        for o in obj:
            if isinstance(o, _Element):
                #print "Holding element:", <int>element._c_node
                self._temp_refs.add(o)
                #print "Holding document:", <int>element._doc._c_doc
                self._temp_documents.add((<_Element>o)._doc)

    @cython.final
    cdef _Document _findDocumentForNode(self, xmlNode* c_node):
        u"""If an XPath expression returns an element from a different
        document than the current context document, we call this to
        see if it was possibly created by an extension and is a known
        document instance.
        """
        cdef _Document doc
        for doc in self._temp_documents:
            if doc is not None and doc._c_doc is c_node.doc:
                return doc
        return None
|
||||
# libxml2 keeps these error messages in a static array in its code
|
||||
# and doesn't give us access to them ...
|
||||
|
||||
cdef tuple LIBXML2_XPATH_ERROR_MESSAGES = (
|
||||
b"Ok",
|
||||
b"Number encoding",
|
||||
b"Unfinished literal",
|
||||
b"Start of literal",
|
||||
b"Expected $ for variable reference",
|
||||
b"Undefined variable",
|
||||
b"Invalid predicate",
|
||||
b"Invalid expression",
|
||||
b"Missing closing curly brace",
|
||||
b"Unregistered function",
|
||||
b"Invalid operand",
|
||||
b"Invalid type",
|
||||
b"Invalid number of arguments",
|
||||
b"Invalid context size",
|
||||
b"Invalid context position",
|
||||
b"Memory allocation error",
|
||||
b"Syntax error",
|
||||
b"Resource error",
|
||||
b"Sub resource error",
|
||||
b"Undefined namespace prefix",
|
||||
b"Encoding error",
|
||||
b"Char out of XML range",
|
||||
b"Invalid or incomplete context",
|
||||
b"Stack usage error",
|
||||
)
|
||||
|
||||
cdef void _forwardXPathError(void* c_ctxt, xmlerror.xmlError* c_error) with gil:
|
||||
cdef xmlerror.xmlError error
|
||||
cdef int xpath_code
|
||||
if c_error.message is not NULL:
|
||||
error.message = c_error.message
|
||||
else:
|
||||
xpath_code = c_error.code - xmlerror.XML_XPATH_EXPRESSION_OK
|
||||
if 0 <= xpath_code < len(LIBXML2_XPATH_ERROR_MESSAGES):
|
||||
error.message = _cstr(LIBXML2_XPATH_ERROR_MESSAGES[xpath_code])
|
||||
else:
|
||||
error.message = b"unknown error"
|
||||
error.domain = c_error.domain
|
||||
error.code = c_error.code
|
||||
error.level = c_error.level
|
||||
error.line = c_error.line
|
||||
error.int2 = c_error.int1 # column
|
||||
error.file = c_error.file
|
||||
|
||||
(<_BaseContext>c_ctxt)._error_log._receive(&error)
|
||||
|
||||
cdef void _receiveXPathError(void* c_context, xmlerror.xmlError* error) nogil:
|
||||
if not __DEBUG:
|
||||
return
|
||||
if c_context is NULL:
|
||||
_forwardError(NULL, error)
|
||||
else:
|
||||
_forwardXPathError(c_context, error)
|
||||
|
||||
|
||||
def Extension(module, function_mapping=None, *, ns=None):
    u"""Extension(module, function_mapping=None, ns=None)

    Build a dictionary of extension functions from the functions
    defined in a module or the methods of an object.

    As second argument, you can pass an additional mapping of
    attribute names to XPath function names, or a list of function
    names that should be taken.

    The ``ns`` keyword argument accepts a namespace URI for the XPath
    functions.
    """
    cdef dict functions = {}
    if isinstance(function_mapping, dict):
        for function_name, xpath_name in function_mapping.items():
            functions[(ns, xpath_name)] = getattr(module, function_name)
    else:
        if function_mapping is None:
            function_mapping = [ name for name in dir(module)
                                 if not name.startswith(u'_') ]
        for function_name in function_mapping:
            functions[(ns, function_name)] = getattr(module, function_name)
    return functions

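# A minimal usage sketch for Extension() (the module object and the
# namespace URI below are assumed for illustration, not defined here):
#
#     class MyFunctions(object):
#         def hello(self, context, s):
#             # XPath extension functions receive the evaluation
#             # context as their first argument
#             return u"Hello %s" % s
#
#     extensions = Extension(MyFunctions(), ns=u'http://example.org/myfns')
#     find = etree.XPath(u"my:hello('world')",
#                        namespaces={u'my': u'http://example.org/myfns'},
#                        extensions=extensions)
#     # find(root) -> u'Hello world'
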
################################################################################
# EXSLT regexp implementation

@cython.final
@cython.internal
cdef class _ExsltRegExp:
    cdef dict _compile_map
    def __cinit__(self):
        self._compile_map = {}

    cdef _make_string(self, value):
        if _isString(value):
            return value
        elif isinstance(value, list):
            # node set: take recursive text concatenation of first element
            if python.PyList_GET_SIZE(value) == 0:
                return u''
            firstnode = value[0]
            if _isString(firstnode):
                return firstnode
            elif isinstance(firstnode, _Element):
                c_text = tree.xmlNodeGetContent((<_Element>firstnode)._c_node)
                if c_text is NULL:
                    raise MemoryError()
                try:
                    return funicode(c_text)
                finally:
                    tree.xmlFree(c_text)
            else:
                return unicode(firstnode)
        else:
            return unicode(value)

    cdef _compile(self, rexp, ignore_case):
        cdef python.PyObject* c_result
        rexp = self._make_string(rexp)
        key = (rexp, ignore_case)
        c_result = python.PyDict_GetItem(self._compile_map, key)
        if c_result is not NULL:
            return <object>c_result
        py_flags = re.UNICODE
        if ignore_case:
            py_flags = py_flags | re.IGNORECASE
        rexp_compiled = re.compile(rexp, py_flags)
        self._compile_map[key] = rexp_compiled
        return rexp_compiled

    def test(self, ctxt, s, rexp, flags=u''):
        flags = self._make_string(flags)
        s = self._make_string(s)
        rexpc = self._compile(rexp, u'i' in flags)
        if rexpc.search(s) is None:
            return False
        else:
            return True

    def match(self, ctxt, s, rexp, flags=u''):
        cdef list result_list
        flags = self._make_string(flags)
        s = self._make_string(s)
        rexpc = self._compile(rexp, u'i' in flags)
        if u'g' in flags:
            results = rexpc.findall(s)
            if not results:
                return ()
        else:
            result = rexpc.search(s)
            if not result:
                return ()
            results = [ result.group() ]
            results.extend( result.groups(u'') )
        result_list = []
        root = Element(u'matches')
        join_groups = u''.join
        for s_match in results:
            if python.PyTuple_CheckExact(s_match):
                s_match = join_groups(s_match)
            elem = SubElement(root, u'match')
            elem.text = s_match
            result_list.append(elem)
        return result_list

    def replace(self, ctxt, s, rexp, flags, replacement):
        replacement = self._make_string(replacement)
        flags = self._make_string(flags)
        s = self._make_string(s)
        rexpc = self._compile(rexp, u'i' in flags)
        if u'g' in flags:
            count = 0
        else:
            count = 1
        return rexpc.sub(replacement, s, count)

    cdef _register_in_context(self, _BaseContext context):
        ns = b"http://exslt.org/regular-expressions"
        context._addLocalExtensionFunction(ns, b"test", self.test)
        context._addLocalExtensionFunction(ns, b"match", self.match)
        context._addLocalExtensionFunction(ns, b"replace", self.replace)

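# A minimal usage sketch: the class above backs the EXSLT regular
# expression functions in XPath evaluation (assuming a parsed `root`):
#
#     regexpNS = "http://exslt.org/regular-expressions"
#     find = etree.XPath("//*[re:test(text(), '^abc$', 'i')]",
#                        namespaces={'re': regexpNS})
#     find(root)  # elements whose text matches, case-insensitively
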
################################################################################
# helper functions

cdef xpath.xmlXPathObject* _wrapXPathObject(object obj, _Document doc,
                                            _BaseContext context) except NULL:
    cdef xpath.xmlNodeSet* resultSet
    cdef _Element fake_node = None
    cdef xmlNode* c_node

    if isinstance(obj, unicode):
        obj = _utf8(obj)
    if isinstance(obj, bytes):
        # libxml2 copies the string value
        return xpath.xmlXPathNewCString(_cstr(obj))
    if isinstance(obj, bool):
        return xpath.xmlXPathNewBoolean(obj)
    if python.PyNumber_Check(obj):
        return xpath.xmlXPathNewFloat(obj)
    if obj is None:
        resultSet = xpath.xmlXPathNodeSetCreate(NULL)
    elif isinstance(obj, _Element):
        resultSet = xpath.xmlXPathNodeSetCreate((<_Element>obj)._c_node)
    elif python.PySequence_Check(obj):
        resultSet = xpath.xmlXPathNodeSetCreate(NULL)
        try:
            for value in obj:
                if isinstance(value, _Element):
                    if context is not None:
                        context._hold(value)
                    xpath.xmlXPathNodeSetAdd(resultSet, (<_Element>value)._c_node)
                else:
                    if context is None or doc is None:
                        raise XPathResultError, \
                            u"Non-Element values not supported at this point - got %r" % value
                    # support strings by appending text nodes to an Element
                    if isinstance(value, unicode):
                        value = _utf8(value)
                    if isinstance(value, bytes):
                        if fake_node is None:
                            fake_node = _makeElement("text-root", NULL, doc, None,
                                                     None, None, None, None, None)
                            context._hold(fake_node)
                        else:
                            # append a comment node to keep the text nodes separate
                            c_node = tree.xmlNewDocComment(doc._c_doc, <unsigned char*>"")
                            if c_node is NULL:
                                raise MemoryError()
                            tree.xmlAddChild(fake_node._c_node, c_node)
                        context._hold(value)
                        c_node = tree.xmlNewDocText(doc._c_doc, _xcstr(value))
                        if c_node is NULL:
                            raise MemoryError()
                        tree.xmlAddChild(fake_node._c_node, c_node)
                        xpath.xmlXPathNodeSetAdd(resultSet, c_node)
                    else:
                        raise XPathResultError, \
                            u"This is not a supported node-set result: %r" % value
        except:
            xpath.xmlXPathFreeNodeSet(resultSet)
            raise
    else:
        raise XPathResultError, u"Unknown return type: %s" % \
            python._fqtypename(obj).decode('utf8')
    return xpath.xmlXPathWrapNodeSet(resultSet)

cdef object _unwrapXPathObject(xpath.xmlXPathObject* xpathObj,
                               _Document doc, _BaseContext context):
    if xpathObj.type == xpath.XPATH_UNDEFINED:
        raise XPathResultError, u"Undefined xpath result"
    elif xpathObj.type == xpath.XPATH_NODESET:
        return _createNodeSetResult(xpathObj, doc, context)
    elif xpathObj.type == xpath.XPATH_BOOLEAN:
        return xpathObj.boolval
    elif xpathObj.type == xpath.XPATH_NUMBER:
        return xpathObj.floatval
    elif xpathObj.type == xpath.XPATH_STRING:
        stringval = funicode(xpathObj.stringval)
        if context._build_smart_strings:
            stringval = _elementStringResultFactory(
                stringval, None, None, 0)
        return stringval
    elif xpathObj.type == xpath.XPATH_POINT:
        raise NotImplementedError, u"XPATH_POINT"
    elif xpathObj.type == xpath.XPATH_RANGE:
        raise NotImplementedError, u"XPATH_RANGE"
    elif xpathObj.type == xpath.XPATH_LOCATIONSET:
        raise NotImplementedError, u"XPATH_LOCATIONSET"
    elif xpathObj.type == xpath.XPATH_USERS:
        raise NotImplementedError, u"XPATH_USERS"
    elif xpathObj.type == xpath.XPATH_XSLT_TREE:
        return _createNodeSetResult(xpathObj, doc, context)
    else:
        raise XPathResultError, u"Unknown xpath result %s" % unicode(xpathObj.type)

cdef object _createNodeSetResult(xpath.xmlXPathObject* xpathObj, _Document doc,
                                 _BaseContext context):
    cdef xmlNode* c_node
    cdef int i
    cdef list result
    result = []
    if xpathObj.nodesetval is NULL:
        return result
    for i in range(xpathObj.nodesetval.nodeNr):
        c_node = xpathObj.nodesetval.nodeTab[i]
        _unpackNodeSetEntry(result, c_node, doc, context,
                            xpathObj.type == xpath.XPATH_XSLT_TREE)
    return result

cdef _unpackNodeSetEntry(list results, xmlNode* c_node, _Document doc,
                         _BaseContext context, bint is_fragment):
    cdef xmlNode* c_child
    if _isElement(c_node):
        if c_node.doc != doc._c_doc and c_node.doc._private is NULL:
            # XXX: works, but maybe not always the right thing to do?
            # XPath: only runs when extensions create or copy trees
            #        -> we store Python refs to these, so that is OK
            # XSLT: can it leak when merging trees from multiple sources?
            c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1)
            # FIXME: call _instantiateElementFromXPath() instead?
        results.append(
            _fakeDocElementFactory(doc, c_node))
    elif c_node.type == tree.XML_TEXT_NODE or \
            c_node.type == tree.XML_CDATA_SECTION_NODE or \
            c_node.type == tree.XML_ATTRIBUTE_NODE:
        results.append(
            _buildElementStringResult(doc, c_node, context))
    elif c_node.type == tree.XML_NAMESPACE_DECL:
        results.append( (funicodeOrNone((<xmlNs*>c_node).prefix),
                         funicodeOrNone((<xmlNs*>c_node).href)) )
    elif c_node.type == tree.XML_DOCUMENT_NODE or \
            c_node.type == tree.XML_HTML_DOCUMENT_NODE:
        # ignored for everything but result tree fragments
        if is_fragment:
            c_child = c_node.children
            while c_child is not NULL:
                _unpackNodeSetEntry(results, c_child, doc, context, 0)
                c_child = c_child.next
    elif c_node.type == tree.XML_XINCLUDE_START or \
            c_node.type == tree.XML_XINCLUDE_END:
        pass
    else:
        raise NotImplementedError, \
            u"Not yet implemented result node type: %d" % c_node.type

cdef void _freeXPathObject(xpath.xmlXPathObject* xpathObj):
    u"""Free the XPath object, but *never* free the *content* of node sets.
    Python dealloc will do that for us.
    """
    if xpathObj.nodesetval is not NULL:
        xpath.xmlXPathFreeNodeSet(xpathObj.nodesetval)
        xpathObj.nodesetval = NULL
    xpath.xmlXPathFreeObject(xpathObj)

cdef _Element _instantiateElementFromXPath(xmlNode* c_node, _Document doc,
                                           _BaseContext context):
    # NOTE: this may copy the element - only call this when it can't leak
    if c_node.doc != doc._c_doc and c_node.doc._private is NULL:
        # not from the context document and not from a fake document
        # either => may still be from a known document, e.g. one
        # created by an extension function
        node_doc = context._findDocumentForNode(c_node)
        if node_doc is None:
            # not from a known document at all! => can only make a
            # safety copy here (into the context document)
            c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1)
        else:
            doc = node_doc
    return _fakeDocElementFactory(doc, c_node)

################################################################################
# special str/unicode subclasses

@cython.final
cdef class _ElementUnicodeResult(unicode):
    cdef _Element _parent
    cdef readonly object attrname
    cdef readonly bint is_tail
    cdef readonly bint is_text
    cdef readonly bint is_attribute

    def getparent(self):
        return self._parent

class _ElementStringResult(bytes):
    # we need to use a Python class here, bytes cannot be C-subclassed
    # in Pyrex/Cython
    def getparent(self):
        return self._parent

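# A minimal usage sketch: these "smart string" results let XPath string
# results report their origin (assuming a parsed `root`):
#
#     result = root.xpath('//text()')[0]
#     result.getparent()   # -> the element that carries the text
#     result.is_text       # True for element text; is_tail, is_attribute
#                          # and attrname cover the other cases
#
# Building smart strings can be disabled per call with
# root.xpath(..., smart_strings=False).
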
cdef object _elementStringResultFactory(string_value, _Element parent,
                                        attrname, bint is_tail):
    cdef _ElementUnicodeResult uresult
    cdef bint is_text
    cdef bint is_attribute = attrname is not None
    if parent is None:
        is_text = 0
    else:
        is_text = not (is_tail or is_attribute)

    if type(string_value) is bytes:
        result = _ElementStringResult(string_value)
        result._parent = parent
        result.is_attribute = is_attribute
        result.is_tail = is_tail
        result.is_text = is_text
        result.attrname = attrname
        return result
    else:
        uresult = _ElementUnicodeResult(string_value)
        uresult._parent = parent
        uresult.is_attribute = is_attribute
        uresult.is_tail = is_tail
        uresult.is_text = is_text
        uresult.attrname = attrname
        return uresult

cdef object _buildElementStringResult(_Document doc, xmlNode* c_node,
                                      _BaseContext context):
    cdef _Element parent = None
    cdef object attrname = None
    cdef xmlNode* c_element
    cdef bint is_tail

    if c_node.type == tree.XML_ATTRIBUTE_NODE:
        attrname = _namespacedName(c_node)
        is_tail = 0
        s = tree.xmlNodeGetContent(c_node)
        try:
            value = funicode(s)
        finally:
            tree.xmlFree(s)
        c_element = NULL
    else:
        #assert c_node.type == tree.XML_TEXT_NODE or c_node.type == tree.XML_CDATA_SECTION_NODE, "invalid node type"
        # may be tail text or normal text
        value = funicode(c_node.content)
        c_element = _previousElement(c_node)
        is_tail = c_element is not NULL

    if not context._build_smart_strings:
        return value

    if c_element is NULL:
        # non-tail text or attribute text
        c_element = c_node.parent
        while c_element is not NULL and not _isElement(c_element):
            c_element = c_element.parent

    if c_element is not NULL:
        parent = _instantiateElementFromXPath(c_element, doc, context)

    return _elementStringResultFactory(
        value, parent, attrname, is_tail)

################################################################################
# callbacks for XPath/XSLT extension functions

cdef void _extension_function_call(_BaseContext context, function,
                                   xpath.xmlXPathParserContext* ctxt, int nargs):
    cdef _Document doc
    cdef xpath.xmlXPathObject* obj
    cdef list args
    cdef int i
    doc = context._doc
    try:
        args = []
        for i in range(nargs):
            obj = xpath.valuePop(ctxt)
            o = _unwrapXPathObject(obj, doc, context)
            _freeXPathObject(obj)
            args.append(o)
        args.reverse()

        res = function(context, *args)
        # wrap result for XPath consumption
        obj = _wrapXPathObject(res, doc, context)
        # prevent Python from deallocating elements handed to libxml2
        context._hold(res)
        xpath.valuePush(ctxt, obj)
    except:
        xpath.xmlXPathErr(ctxt, xpath.XPATH_EXPR_ERROR)
        context._exc._store_raised()
    finally:
        return  # swallow any further exceptions

# lookup the function by name and call it

cdef void _xpath_function_call(xpath.xmlXPathParserContext* ctxt,
                               int nargs) with gil:
    cdef _BaseContext context
    cdef xpath.xmlXPathContext* rctxt = ctxt.context
    context = <_BaseContext> rctxt.userData
    try:
        function = context._find_cached_function(rctxt.functionURI, rctxt.function)
        if function is not None:
            _extension_function_call(context, function, ctxt, nargs)
        else:
            xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR)
            context._exc._store_exception(
                XPathFunctionError(u"XPath function '%s' not found" %
                                   _namespacedNameFromNsName(rctxt.functionURI, rctxt.function)))
    except:
        # may not be the right error, but we need to tell libxml2 *something*
        xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR)
        context._exc._store_raised()
    finally:
        return  # swallow any further exceptions
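# A minimal usage sketch: globally registered extension functions reach
# the dispatcher above via etree.FunctionNamespace (the URI is assumed
# for illustration):
#
#     def hello(context, s):
#         return 'Hello %s' % s
#
#     ns = etree.FunctionNamespace('http://example.org/myfns')
#     ns['hello'] = hello
#     root.xpath("my:hello('world')",
#                namespaces={'my': 'http://example.org/myfns'})
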
10
lib/lxml/html/ElementSoup.py
Normal file
@@ -0,0 +1,10 @@
__doc__ = """Legacy interface to the BeautifulSoup HTML parser.
"""

__all__ = ["parse", "convert_tree"]

from soupparser import convert_tree, parse as _parse

def parse(file, beautifulsoup=None, makeelement=None):
    root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement)
    return root.getroot()
1697
lib/lxml/html/__init__.py
Normal file
File diff suppressed because it is too large
87
lib/lxml/html/_diffcommand.py
Normal file
@@ -0,0 +1,87 @@
import optparse
import sys
import re
import os
from lxml.html.diff import htmldiff

description = """\
"""

parser = optparse.OptionParser(
    usage="%prog [OPTIONS] FILE1 FILE2\n"
    "%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...",
    description=description,
    )

parser.add_option(
    '-o', '--output',
    metavar="FILE",
    dest="output",
    default="-",
    help="File to write the difference to",
    )

parser.add_option(
    '-a', '--annotation',
    action="store_true",
    dest="annotation",
    help="Do an annotation")

def main(args=None):
    if args is None:
        args = sys.argv[1:]
    options, args = parser.parse_args(args)
    if options.annotation:
        return annotate(options, args)
    if len(args) != 2:
        print('Error: you must give two files')
        parser.print_help()
        sys.exit(1)
    file1, file2 = args
    input1 = read_file(file1)
    input2 = read_file(file2)
    body1 = split_body(input1)[1]
    pre, body2, post = split_body(input2)
    result = htmldiff(body1, body2)
    result = pre + result + post
    if options.output == '-':
        if not result.endswith('\n'):
            result += '\n'
        sys.stdout.write(result)
    else:
        f = open(options.output, 'wb')
        f.write(result)
        f.close()

def read_file(filename):
    if filename == '-':
        c = sys.stdin.read()
    elif not os.path.exists(filename):
        raise OSError(
            "Input file %s does not exist" % filename)
    else:
        f = open(filename, 'rb')
        c = f.read()
        f.close()
    return c

body_start_re = re.compile(
    r"<body.*?>", re.I|re.S)
body_end_re = re.compile(
    r"</body.*?>", re.I|re.S)

def split_body(html):
    # default to empty wrappers when no <body> markers are found
    pre = post = ''
    match = body_start_re.search(html)
    if match:
        pre = html[:match.end()]
        html = html[match.end():]
    match = body_end_re.search(html)
    if match:
        post = html[match.start():]
        html = html[:match.start()]
    return pre, html, post

def annotate(options, args):
    print("Not yet implemented")
    sys.exit(1)
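# A minimal usage sketch: main() implements the command line interface,
# e.g. (file names assumed for illustration):
#
#     main(['old.html', 'new.html', '-o', 'diff.html'])
#
# which diffs the two <body> contents with htmldiff() and writes the
# result, re-wrapped in the second file's pre/post markup, to diff.html.
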
100
lib/lxml/html/_html5builder.py
Normal file
@@ -0,0 +1,100 @@
"""
Legacy module - don't use in new code!

html5lib now has its own proper implementation.

This module implements a tree builder for html5lib that generates lxml
html element trees.  This module uses camelCase as it follows the
html5lib style guide.
"""

from html5lib.treebuilders import _base, etree as etree_builders
from lxml import html, etree


class DocumentType(object):

    def __init__(self, name, publicId, systemId):
        self.name = name
        self.publicId = publicId
        self.systemId = systemId

class Document(object):

    def __init__(self):
        self._elementTree = None
        self.childNodes = []

    def appendChild(self, element):
        self._elementTree.getroot().addnext(element._element)


class TreeBuilder(_base.TreeBuilder):
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document

    def __init__(self, *args, **kwargs):
        html_builder = etree_builders.getETreeModule(html, fullTree=False)
        etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
        self.elementClass = html_builder.Element
        self.commentClass = etree_builder.Comment
        _base.TreeBuilder.__init__(self, *args, **kwargs)

    def reset(self):
        _base.TreeBuilder.reset(self)
        self.rootInserted = False
        self.initialComments = []
        self.doctype = None

    def getDocument(self):
        return self.document._elementTree

    def getFragment(self):
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(element.getchildren())
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, name, publicId, systemId):
        doctype = self.doctypeClass(name, publicId, systemId)
        self.doctype = doctype

    def insertComment(self, data, parent=None):
        if not self.rootInserted:
            self.initialComments.append(data)
        else:
            _base.TreeBuilder.insertComment(self, data, parent)

    def insertRoot(self, name):
        buf = []
        if self.doctype and self.doctype.name:
            buf.append('<!DOCTYPE %s' % self.doctype.name)
            if self.doctype.publicId is not None or self.doctype.systemId is not None:
                buf.append(' PUBLIC "%s" "%s"' % (self.doctype.publicId,
                                                  self.doctype.systemId))
            buf.append('>')
        buf.append('<html></html>')
        root = html.fromstring(''.join(buf))

        # Append the initial comments:
        for comment in self.initialComments:
            root.addprevious(etree.Comment(comment))

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name)
        root_element._element = root
        self.document.childNodes.append(root_element)
        self.openElements.append(root_element)

        self.rootInserted = True
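# A minimal usage sketch (requires an html5lib version that still ships
# `treebuilders._base`, as imported above):
#
#     import html5lib
#     parser = html5lib.HTMLParser(tree=TreeBuilder)
#     tree = parser.parse('<p>Hello</p>')   # -> lxml element tree
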
115
lib/lxml/html/_setmixin.py
Normal file
@@ -0,0 +1,115 @@
class SetMixin(object):

    """
    Mix-in for sets.  You must define __iter__, add, remove
    """

    def __len__(self):
        length = 0
        for item in self:
            length += 1
        return length

    def __contains__(self, item):
        for has_item in self:
            if item == has_item:
                return True
        return False

    def issubset(self, other):
        # self is a subset of other if every item of self is in other
        for item in self:
            if item not in other:
                return False
        return True

    __le__ = issubset

    def issuperset(self, other):
        # self is a superset of other if every item of other is in self
        for item in other:
            if item not in self:
                return False
        return True

    __ge__ = issuperset

    def union(self, other):
        return self | other

    def __or__(self, other):
        new = self.copy()
        new |= other
        return new

    def intersection(self, other):
        return self & other

    def __and__(self, other):
        new = self.copy()
        new &= other
        return new

    def difference(self, other):
        return self - other

    def __sub__(self, other):
        new = self.copy()
        new -= other
        return new

    def symmetric_difference(self, other):
        return self ^ other

    def __xor__(self, other):
        new = self.copy()
        new ^= other
        return new

    def copy(self):
        return set(self)

    def update(self, other):
        for item in other:
            self.add(item)

    def __ior__(self, other):
        self.update(other)
        return self

    def intersection_update(self, other):
        for item in self:
            if item not in other:
                self.remove(item)

    def __iand__(self, other):
        self.intersection_update(other)
        return self

    def difference_update(self, other):
        for item in other:
            if item in self:
                self.remove(item)

    def __isub__(self, other):
        self.difference_update(other)
        return self

    def symmetric_difference_update(self, other):
        for item in other:
            if item in self:
                self.remove(item)
            else:
                self.add(item)

    def __ixor__(self, other):
        self.symmetric_difference_update(other)
        return self

    def discard(self, item):
        try:
            self.remove(item)
        except KeyError:
            pass

    def clear(self):
        for item in list(self):
            self.remove(item)
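# A minimal usage sketch: a concrete class only provides the three
# primitives; SetMixin supplies the rest of the set API:
#
#     class ListSet(SetMixin):
#         def __init__(self):
#             self._items = []
#         def __iter__(self):
#             return iter(self._items)
#         def add(self, item):
#             if item not in self._items:
#                 self._items.append(item)
#         def remove(self, item):
#             try:
#                 self._items.remove(item)
#             except ValueError:
#                 raise KeyError(item)   # discard() expects KeyError
#
#     s = ListSet()
#     s.update([1, 2, 2, 3])
#     len(s), 2 in s   # -> (3, True)
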
133
lib/lxml/html/builder.py
Normal file
@@ -0,0 +1,133 @@
# --------------------------------------------------------------------
# The ElementTree toolkit is
# Copyright (c) 1999-2004 by Fredrik Lundh
# --------------------------------------------------------------------

"""
A set of HTML generator tags for building HTML documents.

Usage::

    >>> from lxml.html.builder import *
    >>> html = HTML(
    ...            HEAD( TITLE("Hello World") ),
    ...            BODY( CLASS("main"),
    ...                  H1("Hello World !")
    ...            )
    ...        )

    >>> import lxml.etree
    >>> print lxml.etree.tostring(html, pretty_print=True)
    <html>
      <head>
        <title>Hello World</title>
      </head>
      <body class="main">
        <h1>Hello World !</h1>
      </body>
    </html>

"""

from lxml.builder import ElementMaker
from lxml.html import html_parser

E = ElementMaker(makeelement=html_parser.makeelement)

# elements
A = E.a  # anchor
ABBR = E.abbr  # abbreviated form (e.g., WWW, HTTP, etc.)
ACRONYM = E.acronym  #
ADDRESS = E.address  # information on author
APPLET = E.applet  # Java applet (DEPRECATED)
AREA = E.area  # client-side image map area
B = E.b  # bold text style
BASE = E.base  # document base URI
BASEFONT = E.basefont  # base font size (DEPRECATED)
BDO = E.bdo  # I18N BiDi over-ride
BIG = E.big  # large text style
BLOCKQUOTE = E.blockquote  # long quotation
BODY = E.body  # document body
BR = E.br  # forced line break
BUTTON = E.button  # push button
CAPTION = E.caption  # table caption
CENTER = E.center  # shorthand for DIV align=center (DEPRECATED)
CITE = E.cite  # citation
CODE = E.code  # computer code fragment
COL = E.col  # table column
COLGROUP = E.colgroup  # table column group
DD = E.dd  # definition description
DEL = getattr(E, 'del')  # deleted text
DFN = E.dfn  # instance definition
DIR = E.dir  # directory list (DEPRECATED)
DIV = E.div  # generic language/style container
DL = E.dl  # definition list
DT = E.dt  # definition term
EM = E.em  # emphasis
FIELDSET = E.fieldset  # form control group
FONT = E.font  # local change to font (DEPRECATED)
FORM = E.form  # interactive form
FRAME = E.frame  # subwindow
FRAMESET = E.frameset  # window subdivision
H1 = E.h1  # heading
H2 = E.h2  # heading
H3 = E.h3  # heading
H4 = E.h4  # heading
H5 = E.h5  # heading
H6 = E.h6  # heading
HEAD = E.head  # document head
HR = E.hr  # horizontal rule
HTML = E.html  # document root element
I = E.i  # italic text style
IFRAME = E.iframe  # inline subwindow
IMG = E.img  # Embedded image
INPUT = E.input  # form control
INS = E.ins  # inserted text
ISINDEX = E.isindex  # single line prompt (DEPRECATED)
KBD = E.kbd  # text to be entered by the user
LABEL = E.label  # form field label text
LEGEND = E.legend  # fieldset legend
LI = E.li  # list item
LINK = E.link  # a media-independent link
MAP = E.map  # client-side image map
MENU = E.menu  # menu list (DEPRECATED)
META = E.meta  # generic metainformation
NOFRAMES = E.noframes  # alternate content container for non frame-based rendering
NOSCRIPT = E.noscript  # alternate content container for non script-based rendering
OBJECT = E.object  # generic embedded object
OL = E.ol  # ordered list
OPTGROUP = E.optgroup  # option group
OPTION = E.option  # selectable choice
P = E.p  # paragraph
PARAM = E.param  # named property value
PRE = E.pre  # preformatted text
Q = E.q  # short inline quotation
S = E.s  # strike-through text style (DEPRECATED)
SAMP = E.samp  # sample program output, scripts, etc.
SCRIPT = E.script  # script statements
SELECT = E.select  # option selector
SMALL = E.small  # small text style
SPAN = E.span  # generic language/style container
STRIKE = E.strike  # strike-through text (DEPRECATED)
STRONG = E.strong  # strong emphasis
STYLE = E.style  # style info
SUB = E.sub  # subscript
SUP = E.sup  # superscript
TABLE = E.table  #
TBODY = E.tbody  # table body
TD = E.td  # table data cell
TEXTAREA = E.textarea  # multi-line text field
TFOOT = E.tfoot  # table footer
TH = E.th  # table header cell
THEAD = E.thead  # table header
TITLE = E.title  # document title
TR = E.tr  # table row
TT = E.tt  # teletype or monospaced text style
U = E.u  # underlined text style (DEPRECATED)
UL = E.ul  # unordered list
VAR = E.var  # instance of a variable or program argument

# attributes (only reserved words are included here)
ATTR = dict
def CLASS(v): return {'class': v}
def FOR(v): return {'for': v}
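# A minimal usage sketch: ATTR, CLASS and FOR exist because `class` and
# `for` are reserved words in Python and cannot be keyword arguments:
#
#     DIV(CLASS('header'), 'hi')        # <div class="header">hi</div>
#     LABEL(FOR('name'), 'Name:')       # <label for="name">Name:</label>
#     A('lxml', href='http://lxml.de')  # other attributes pass as keywords
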
724
lib/lxml/html/clean.py
Normal file
@@ -0,0 +1,724 @@
"""A cleanup tool for HTML.

Removes unwanted tags and content.  See the `Cleaner` class for
details.
"""

import re
import copy
try:
    from urlparse import urlsplit
except ImportError:
    # Python 3
    from urllib.parse import urlsplit
from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
from lxml.html import xhtml_to_html, _transform_result

try:
    unichr
except NameError:
    # Python 3
    unichr = chr
try:
    unicode
except NameError:
    # Python 3
    unicode = str
try:
    bytes
except NameError:
    # Python < 2.6
    bytes = str
try:
    basestring
except NameError:
    basestring = (str, bytes)


__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']

# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
# Particularly the CSS cleaning; most of the tag cleaning is integrated now
# I have multiple kinds of schemes searched; but should schemes be
# whitelisted instead?
# max height?
# remove images?  Also in CSS?  background attribute?
# Some way to whitelist object, iframe, etc (e.g., if you want to
# allow *just* embedded YouTube movies)
# Log what was deleted and why?
# style="behavior: ..." might be bad in IE?
# Should we have something for just <meta http-equiv>?  That's the worst of the
# metas.
# UTF-7 detections?  Example:
#     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
# you don't always have to have the charset set, if the page has no charset
# and there's UTF7-like code in it.
# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php


# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Do I have to worry about @\nimport?
_css_import_re = re.compile(
    r'@\s*import', re.I)

# All kinds of schemes besides just javascript: that can cause
# execution:
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):', re.I)
_substitute_whitespace = re.compile(r'\s+').sub
# FIXME: should data: be blocked?

# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

_find_external_links = etree.XPath(
    ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})

class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.
    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute.  Also removes
        stylesheets as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).

        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    safe_attrs = defs.safe_attrs
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])

    def __init__(self, **kw):
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

    def __call__(self, doc):
        """
        Cleans the document.
        """
        if hasattr(doc, 'getroot'):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(self.safe_attrs)
            for el in doc.iter():
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not (self.safe_attrs_only and
                    self.safe_attrs == defs.safe_attrs):
                # safe_attrs handles events attributes itself
                for el in doc.iter():
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # If we're deleting style then we don't have to remove JS links
                # from styles, otherwise...
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                found_parent = False
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse()  # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    rel = el.get('rel')
                    if rel:
                        if ('nofollow' in rel
                                and ' nofollow ' in (' %s ' % rel)):
                            continue
                        rel = '%s nofollow' % rel
                    else:
                        rel = 'nofollow'
                    el.set('rel', rel)

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.
        """
        return False

    def allow_element(self, el):
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        if (self.whitelist_tags is not None
                and el.tag not in self.whitelist_tags):
            return False
        scheme, netloc, path, query, fragment = urlsplit(url)
        netloc = netloc.lower().split(':', 1)[0]
        if scheme not in ('http', 'https'):
            return False
        if netloc in self.host_whitelist:
            return True
        return False

    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        bad = []
        self._kill_elements(
            doc, lambda el: _conditional_comment_re.search(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        bad = []
        for el in doc.iter(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    def _remove_javascript_link(self, link):
        # links like "j a v a s c r i p t:" might be interpreted in IE
        new = _substitute_whitespace('', link)
        if _javascript_scheme_re.search(new):
            # FIXME: should this be None to delete?
            return ''
        return link

    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempts to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.
        """
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

    def clean_html(self, html):
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)

clean = Cleaner()
clean_html = clean.clean_html

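# A minimal usage sketch:
#
#     from lxml.html.clean import Cleaner, clean_html
#     clean_html('<p onclick="evil()">hi<script>bad()</script></p>')
#     # -> roughly '<p>hi</p>': script killed, event handler dropped
#
#     cleaner = Cleaner(style=True, links=False)   # override class defaults
#     cleaned = cleaner.clean_html(page_html)      # `page_html` assumed
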
############################################################
## Autolinking
############################################################

_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # Deliberately conservative: a missed address is better than a mangled one:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9._-]+[a-z]))', re.I),
]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
]

_avoid_classes = ['nolink']

def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    It will search for links identified by the given regular
    expressions (by default mailto and http(s) links).

    It won't link text in an element in avoid_elements, or an element
    with a class in avoid_classes.  It won't link to anything with a
    host that matches one of the regular expressions in avoid_hosts
    (by default localhost, 127.0.0.1, and the example.* domains).

    If you pass in an element, the element's tail will not be
    substituted, only the contents of the element.
    """
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_name = class_name.split()
        for match_class in avoid_classes:
            if match_class in class_name:
                return
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if child.tail:
            text, tail_children = _link_text(
                child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
            if tail_children:
                child.tail = text
                index = el.index(child)
                el[index+1:index+1] = tail_children
    if el.text:
        text, pre_children = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if pre_children:
            el.text = text
            el[:0] = pre_children

def _link_text(text, link_regexes, avoid_hosts, factory):
    leading_text = ''
    links = []
    last_pos = 0
    while 1:
        best_match, best_pos = None, None
        for regex in link_regexes:
            regex_pos = last_pos
            while 1:
                match = regex.search(text, pos=regex_pos)
                if match is None:
                    break
                host = match.group('host')
                for host_regex in avoid_hosts:
                    if host_regex.search(host):
                        regex_pos = match.end()
                        break
                else:
                    break
            if match is None:
                continue
            if best_pos is None or match.start() < best_pos:
                best_match = match
                best_pos = match.start()
        if best_match is None:
            # No more matches
            if links:
                assert not links[-1].tail
                links[-1].tail = text
            else:
                assert not leading_text
                leading_text = text
            break
        link = best_match.group(0)
        end = best_match.end()
        if link.endswith('.') or link.endswith(','):
            # These punctuation marks shouldn't end a link
            end -= 1
            link = link[:-1]
        prev_text = text[:best_match.start()]
        if links:
            assert not links[-1].tail
            links[-1].tail = prev_text
        else:
            assert not leading_text
            leading_text = prev_text
        anchor = factory('a')
        anchor.set('href', link)
        body = best_match.group('body')
        if not body:
            body = link
        if body.endswith('.') or body.endswith(','):
            body = body[:-1]
        anchor.text = body
        links.append(anchor)
        text = text[end:]
    return leading_text, links

def autolink_html(html, *args, **kw):
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    autolink(doc, *args, **kw)
    return _transform_result(result_type, doc)

autolink_html.__doc__ = autolink.__doc__

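A quick sketch of autolink_html; note that hosts matching avoid_hosts, such as example.com, are deliberately left unlinked:

    from lxml.html.clean import autolink_html

    print(autolink_html('<p>Docs: http://lxml.de/lxmlhtml.html</p>'))
    # Roughly: <p>Docs: <a href="http://lxml.de/lxmlhtml.html">http://lxml.de/lxmlhtml.html</a></p>

    print(autolink_html('<p>See http://example.com/demo</p>'))
    # Unchanged: example.com matches _avoid_hosts.
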
############################################################
## Word wrapping
############################################################

_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']

def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements, by default
    ``<pre>``, ``<textarea>`` and ``<code>``.

    Breaks words by inserting &#8203;, the Unicode zero-width space.
    This generally takes up no space in rendering, but does copy as a
    space, and in monospace contexts usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203; comes from:
    # http://www.cs.tut.fi/~jkorpela/html/nobr.html
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        dont_break = False
        class_name = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_name:
                dont_break = True
                break
        if dont_break:
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)

def word_break_html(html, *args, **kw):
    result_type = type(html)
    doc = fromstring(html)
    word_break(doc, *args, **kw)
    return _transform_result(result_type, doc)

def _break_text(text, max_width, break_character):
    words = text.split()
    for word in words:
        if len(word) > max_width:
            replacement = _insert_break(word, max_width, break_character)
            text = text.replace(word, replacement)
    return text

_break_prefer_re = re.compile(r'[^a-z]', re.I)

def _insert_break(word, width, break_character):
    result = ''
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nice break:
            if last_break.end() > width - 10:
                # FIXME: should the break character be at the end of the
                # chunk, or the beginning of the next chunk?
                start = word[:last_break.end()]
        result += start + break_character
        word = word[len(start):]
    result += word
    return result

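A short sketch of the wrapper defined above:

    from lxml.html.clean import word_break_html

    # Words longer than max_width get zero-width spaces inserted:
    result = word_break_html('<p>supercalifragilisticexpialidocious</p>', max_width=10)
    print(result)  # the long word now contains invisible U+200B break points
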
137
lib/lxml/html/defs.py
Normal file
@ -0,0 +1,137 @@
# FIXME: this should all be confirmed against what a DTD says
# (probably in a test; this may not match the DTD exactly, but we
# should document just how it differs).

# Data taken from http://www.w3.org/TR/html401/index/elements.html
# and http://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
# for html5_tags.

try:
    frozenset
except NameError:
    # Python 2.3
    from sets import Set as frozenset


empty_tags = frozenset([
    'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
    'img', 'input', 'isindex', 'link', 'meta', 'param'])

deprecated_tags = frozenset([
    'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
    'menu', 's', 'strike', 'u'])

# archive actually takes a space-separated list of URIs
link_attrs = frozenset([
    'action', 'archive', 'background', 'cite', 'classid',
    'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
    'usemap',
    # Not standard:
    'dynsrc', 'lowsrc',
    ])

# Not in the HTML 4 spec:
# onerror, onresize
event_attrs = frozenset([
    'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
    'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
    'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
    'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
    'onunload',
    ])

safe_attrs = frozenset([
    'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
    'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
    'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
    'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
    'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
    'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
    'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
    'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
    'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
    'type', 'usemap', 'valign', 'value', 'vspace', 'width'])

# From http://htmlhelp.com/reference/html40/olist.html
top_level_tags = frozenset([
    'html', 'head', 'body', 'frameset',
    ])

head_tags = frozenset([
    'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
    ])

general_block_tags = frozenset([
    'address',
    'blockquote',
    'center',
    'del',
    'div',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'ins',
    'isindex',
    'noscript',
    'p',
    'pre',
    ])

list_tags = frozenset([
    'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
    ])

table_tags = frozenset([
    'table', 'caption', 'colgroup', 'col',
    'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
    ])

# The partial form tags below are taken from
# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
block_tags = general_block_tags | list_tags | table_tags | frozenset([
    # Partial form tags
    'fieldset', 'form', 'legend', 'optgroup', 'option',
    ])

form_tags = frozenset([
    'form', 'button', 'fieldset', 'legend', 'input', 'label',
    'select', 'optgroup', 'option', 'textarea',
    ])

special_inline_tags = frozenset([
    'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
    'img', 'map', 'area', 'object', 'param', 'q', 'script',
    'span', 'sub', 'sup',
    ])

phrase_tags = frozenset([
    'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
    'ins', 'kbd', 'samp', 'strong', 'var',
    ])

font_style_tags = frozenset([
    'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
    ])

frame_tags = frozenset([
    'frameset', 'frame', 'noframes',
    ])

html5_tags = frozenset([
    'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
    'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
    'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
    'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
    'svg', 'time', 'track', 'video', 'wbr'
    ])

# These tags aren't standard
nonstandard_tags = frozenset(['blink', 'marquee'])


tags = (top_level_tags | head_tags | general_block_tags | list_tags
        | table_tags | form_tags | special_inline_tags | phrase_tags
        | font_style_tags | nonstandard_tags | html5_tags)
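For orientation, a sketch of how these sets compose:

    from lxml.html import defs

    print('p' in defs.block_tags)      # True: via general_block_tags
    print('video' in defs.html5_tags)  # True
    print('blink' in defs.tags)        # True: nonstandard, but known
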
881
lib/lxml/html/diff.py
Normal file
@ -0,0 +1,881 @@
import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)

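A doctest-style sketch of the public entry point (exact whitespace in the output may differ):

    from lxml.html.diff import htmldiff

    old = '<p>Here is some text.</p>'
    new = '<p>Here is a lot of text.</p>'
    print(htmldiff(old, new))
    # Roughly: <p>Here is <ins>a lot of</ins> <del>some</del> text.</p>
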
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced_start or unbalanced_end
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end with
        # a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like
    (DEL_START, 'Text</div>', DEL_END))"""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end

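For intuition, a doctest-style sketch of split_unbalanced:

    >>> split_unbalanced(['<a>', 'text', '</a>', '</div>'])
    ([], ['<a>', 'text', '</a>'], ['</div>'])
    >>> split_unbalanced(['<div>', '<b>', 'word', '</b>'])
    (['<div>'], ['<b>', 'word', '</b>'], [])
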
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found.  """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (
            _unicode.__repr__(self), self.pre_tags,
            self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)

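A doctest-style sketch of the token type (it subclasses unicode/str, so it compares as plain text):

    >>> tok = token('Hello', pre_tags=['<p>'], post_tags=['</p>'], trailing_whitespace=' ')
    >>> tok == 'Hello'
    True
    >>> tok.pre_tags, tok.post_tags, tok.trailing_whitespace
    (['<p>'], ['</p>'], ' ')
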
class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html


end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert 0

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

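A doctest-style sketch of the transform described above:

    >>> print(fixup_ins_del_tags('<ins><p>word</p></ins>'))
    <p><ins>word</ins></p>
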
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()
299
lib/lxml/html/formfill.py
Normal file
@ -0,0 +1,299 @@
from lxml.etree import XPath, ElementBase
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
from lxml.html import defs
import copy

try:
    basestring
except NameError:
    # Python 3
    basestring = str

__all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
           'insert_errors', 'insert_errors_html',
           'DefaultErrorCreator']

class FormNotFound(LookupError):
    """
    Raised when no form can be found
    """

_form_name_xpath = XPath('descendant-or-self::form[name=$name]|descendant-or-self::x:form[name=$name]', namespaces={'x':XHTML_NAMESPACE})
_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
                     namespaces={'x':XHTML_NAMESPACE})
_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]',
                         namespaces={'x':XHTML_NAMESPACE})
_name_xpath = XPath('descendant-or-self::*[@name=$name]')

def fill_form(
    el,
    values,
    form_id=None,
    form_index=None,
    ):
    el = _find_form(el, form_id=form_id, form_index=form_index)
    _fill_form(el, values)

def fill_form_html(html, values, form_id=None, form_index=None):
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    fill_form(doc, values, form_id=form_id, form_index=form_index)
    return _transform_result(result_type, doc)

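A minimal usage sketch (the form markup here is made up for illustration):

    from lxml.html.formfill import fill_form_html

    html = '''<form id="login">
      <input type="text" name="user">
      <input type="checkbox" name="remember" value="yes">
    </form>'''
    print(fill_form_html(html, {'user': 'alice', 'remember': 'yes'},
                         form_id='login'))
    # The text input gains value="alice"; the checkbox gains checked="".
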
def _fill_form(el, values):
    counts = {}
    if hasattr(values, 'mixed'):
        # For Paste request parameters
        values = values.mixed()
    inputs = _input_xpath(el)
    for input in inputs:
        name = input.get('name')
        if not name:
            continue
        if _takes_multiple(input):
            value = values.get(name, [])
            if not isinstance(value, (list, tuple)):
                value = [value]
            _fill_multiple(input, value)
        elif name not in values:
            continue
        else:
            index = counts.get(name, 0)
            counts[name] = index + 1
            value = values[name]
            if isinstance(value, (list, tuple)):
                try:
                    value = value[index]
                except IndexError:
                    continue
            elif index > 0:
                continue
            _fill_single(input, value)

def _takes_multiple(input):
    if _nons(input.tag) == 'select' and input.get('multiple'):
        # FIXME: multiple="0"?
        return True
    type = input.get('type', '').lower()
    if type in ('radio', 'checkbox'):
        return True
    return False

def _fill_multiple(input, value):
    type = input.get('type', '').lower()
    if type == 'checkbox':
        v = input.get('value')
        if v is None:
            if not value:
                result = False
            else:
                result = value[0]
                if isinstance(value, basestring):
                    # The only valid "on" value for an unnamed checkbox is 'on'
                    result = result == 'on'
            _check(input, result)
        else:
            _check(input, v in value)
    elif type == 'radio':
        v = input.get('value')
        _check(input, v in value)
    else:
        assert _nons(input.tag) == 'select'
        for option in _options_xpath(input):
            v = option.get('value')
            if v is None:
                # This seems to be the default, at least on IE
                # FIXME: but I'm not sure
                v = option.text_content()
            _select(option, v in value)

def _check(el, check):
    if check:
        el.set('checked', '')
    else:
        if 'checked' in el.attrib:
            del el.attrib['checked']

def _select(el, select):
    if select:
        el.set('selected', '')
    else:
        if 'selected' in el.attrib:
            del el.attrib['selected']

def _fill_single(input, value):
    if _nons(input.tag) == 'textarea':
        input.text = value
    else:
        input.set('value', value)

def _find_form(el, form_id=None, form_index=None):
    if form_id is None and form_index is None:
        forms = _forms_xpath(el)
        for form in forms:
            return form
        raise FormNotFound(
            "No forms in page")
    if form_id is not None:
        form = el.get_element_by_id(form_id)
        if form is not None:
            return form
        forms = _form_name_xpath(el, name=form_id)
        if forms:
            return forms[0]
        else:
            raise FormNotFound(
                "No form with the name or id of %r (forms: %s)"
                % (form_id, ', '.join(_find_form_ids(el))))
    if form_index is not None:
        forms = _forms_xpath(el)
        try:
            return forms[form_index]
        except IndexError:
            raise FormNotFound(
                "There is no form with the index %r (%i forms found)"
                % (form_index, len(forms)))

def _find_form_ids(el):
    forms = _forms_xpath(el)
    if not forms:
        yield '(no forms)'
        return
    for index, form in enumerate(forms):
        if form.get('id'):
            if form.get('name'):
                yield '%s or %s' % (form.get('id'),
                                    form.get('name'))
            else:
                yield form.get('id')
        elif form.get('name'):
            yield form.get('name')
        else:
            yield '(unnamed form %s)' % index

############################################################
## Error filling
############################################################

class DefaultErrorCreator(object):
    insert_before = True
    block_inside = True
    error_container_tag = 'div'
    error_message_class = 'error-message'
    error_block_class = 'error-block'
    default_message = "Invalid"

    def __init__(self, **kw):
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    "Unexpected keyword argument: %s" % name)
            setattr(self, name, value)

    def __call__(self, el, is_block, message):
        error_el = el.makeelement(self.error_container_tag)
        if self.error_message_class:
            error_el.set('class', self.error_message_class)
        if is_block and self.error_block_class:
            error_el.set('class', error_el.get('class', '')+' '+self.error_block_class)
        if message is None or message == '':
            message = self.default_message
        if isinstance(message, ElementBase):
            error_el.append(message)
        else:
            assert isinstance(message, basestring), (
                "Bad message; should be a string or element: %r" % message)
            error_el.text = message or self.default_message
        if is_block and self.block_inside:
            if self.insert_before:
                error_el.tail = el.text
                el.text = None
                el.insert(0, error_el)
            else:
                el.append(error_el)
        else:
            parent = el.getparent()
            pos = parent.index(el)
            if self.insert_before:
                parent.insert(pos, error_el)
            else:
                error_el.tail = el.tail
                el.tail = None
                parent.insert(pos+1, error_el)

default_error_creator = DefaultErrorCreator()


def insert_errors(
    el,
    errors,
    form_id=None,
    form_index=None,
    error_class="error",
    error_creator=default_error_creator,
    ):
    el = _find_form(el, form_id=form_id, form_index=form_index)
    for name, error in errors.items():
        if error is None:
            continue
        for error_el, message in _find_elements_for_name(el, name, error):
            assert isinstance(message, (basestring, type(None), ElementBase)), (
                "Bad message: %r" % message)
            _insert_error(error_el, message, error_class, error_creator)

def insert_errors_html(html, values, **kw):
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    insert_errors(doc, values, **kw)
    return _transform_result(result_type, doc)

def _insert_error(el, error, error_class, error_creator):
    if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea':
        is_block = False
    else:
        is_block = True
    if _nons(el.tag) != 'form' and error_class:
        _add_class(el, error_class)
    if el.get('id'):
        labels = _label_for_xpath(el, for_id=el.get('id'))
        if labels:
            for label in labels:
                _add_class(label, error_class)
    error_creator(el, is_block, error)

def _add_class(el, class_name):
    if el.get('class'):
        el.set('class', el.get('class')+' '+class_name)
    else:
        el.set('class', class_name)

def _find_elements_for_name(form, name, error):
    if name is None:
        # An error for the entire form
        yield form, error
        return
    if name.startswith('#'):
        # By id
        el = form.get_element_by_id(name[1:])
        if el is not None:
            yield el, error
        return
    els = _name_xpath(form, name=name)
    if not els:
        # FIXME: should this raise an exception?
        return
    if not isinstance(error, (list, tuple)):
        yield els[0], error
        return
    # FIXME: if error is longer than els, should it raise an error?
    for el, err in zip(els, error):
        if err is None:
            continue
        yield el, err
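A minimal usage sketch of the error-filling entry point (markup made up for illustration):

    from lxml.html.formfill import insert_errors_html

    html = '<form><input type="text" name="email"></form>'
    print(insert_errors_html(html, {'email': 'Address is required'}))
    # A <div class="error-message">Address is required</div> is inserted
    # before the input, which itself gains class="error".
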
207
lib/lxml/html/html5parser.py
Normal file
@ -0,0 +1,207 @@
"""
|
||||
An interface to html5lib that mimics the lxml.html interface.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import string
|
||||
|
||||
from html5lib import HTMLParser as _HTMLParser
|
||||
from html5lib.treebuilders.etree_lxml import TreeBuilder
|
||||
|
||||
from lxml import etree
|
||||
from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
|
||||
|
||||
# python3 compatibility
|
||||
try:
|
||||
_strings = basestring
|
||||
except NameError:
|
||||
_strings = (bytes, str)
|
||||
try:
|
||||
from urllib2 import urlopen
|
||||
except ImportError:
|
||||
from urllib.request import urlopen
|
||||
try:
|
||||
from urlparse import urlparse
|
||||
except ImportError:
|
||||
from urllib.parse import urlparse
|
||||
|
||||
class HTMLParser(_HTMLParser):
|
||||
"""An html5lib HTML parser with lxml as tree."""
|
||||
|
||||
def __init__(self, strict=False, **kwargs):
|
||||
_HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
|
||||
|
||||
|
||||
try:
|
||||
from html5lib import XHTMLParser as _XHTMLParser
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
class XHTMLParser(_XHTMLParser):
|
||||
"""An html5lib XHTML Parser with lxml as tree."""
|
||||
|
||||
def __init__(self, strict=False, **kwargs):
|
||||
_XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
|
||||
|
||||
xhtml_parser = XHTMLParser()
|
||||
|
||||
|
||||
def _find_tag(tree, tag):
|
||||
elem = tree.find(tag)
|
||||
if elem is not None:
|
||||
return elem
|
||||
return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
|
||||
|
||||
|
||||
def document_fromstring(html, guess_charset=True, parser=None):
|
||||
"""Parse a whole document into a string."""
|
||||
if not isinstance(html, _strings):
|
||||
raise TypeError('string required')
|
||||
|
||||
if parser is None:
|
||||
parser = html_parser
|
||||
|
||||
return parser.parse(html, useChardet=guess_charset).getroot()
|
||||
|
||||
|
||||
def fragments_fromstring(html, no_leading_text=False,
|
||||
guess_charset=False, parser=None):
|
||||
"""Parses several HTML elements, returning a list of elements.
|
||||
|
||||
The first item in the list may be a string. If no_leading_text is true,
|
||||
then it will be an error if there is leading text, and it will always be
|
||||
a list of only elements.
|
||||
|
||||
If `guess_charset` is `True` and the text was not unicode but a
|
||||
bytestring, the `chardet` library will perform charset guessing on the
|
||||
string.
|
||||
"""
|
||||
if not isinstance(html, _strings):
|
||||
raise TypeError('string required')
|
||||
|
||||
if parser is None:
|
||||
parser = html_parser
|
||||
|
||||
children = parser.parseFragment(html, 'div', useChardet=guess_charset)
|
||||
if children and isinstance(children[0], _strings):
|
||||
if no_leading_text:
|
||||
if children[0].strip():
|
||||
raise etree.ParserError('There is leading text: %r' %
|
||||
children[0])
|
||||
del children[0]
|
||||
return children
|
||||
|
||||
|
||||
def fragment_fromstring(html, create_parent=False,
|
||||
guess_charset=False, parser=None):
|
||||
"""Parses a single HTML element; it is an error if there is more than
|
||||
one element, or if anything but whitespace precedes or follows the
|
||||
element.
|
||||
|
||||
If create_parent is true (or is a tag name) then a parent node
|
||||
will be created to encapsulate the HTML in a single element. In
|
||||
this case, leading or trailing text is allowed.
|
||||
"""
|
||||
if not isinstance(html, _strings):
|
||||
raise TypeError('string required')
|
||||
|
||||
accept_leading_text = bool(create_parent)
|
||||
|
||||
elements = fragments_fromstring(
|
||||
html, guess_charset=guess_charset, parser=parser,
|
||||
no_leading_text=not accept_leading_text)
|
||||
|
||||
if create_parent:
|
||||
if not isinstance(create_parent, _strings):
|
||||
create_parent = 'div'
|
||||
new_root = Element(create_parent)
|
||||
if elements:
|
||||
if isinstance(elements[0], _strings):
|
||||
new_root.text = elements[0]
|
||||
del elements[0]
|
||||
new_root.extend(elements)
|
||||
return new_root
|
||||
|
||||
if not elements:
|
||||
raise etree.ParserError('No elements found')
|
||||
if len(elements) > 1:
|
||||
raise etree.ParserError('Multiple elements found')
|
||||
result = elements[0]
|
||||
if result.tail and result.tail.strip():
|
||||
raise etree.ParserError('Element followed by text: %r' % result.tail)
|
||||
result.tail = None
|
||||
return result


def fromstring(html, guess_charset=True, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    start = html[:50].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        return doc

    head = _find_tag(doc, 'head')

    # if the head is not empty we have a full document
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body


def parse(filename_url_or_file, guess_charset=True, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.
    """
    if parser is None:
        parser = html_parser
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
    else:
        fp = open(filename_url_or_file, 'rb')
    return parser.parse(fp, useChardet=guess_charset)


def _looks_like_url(str):
    scheme = urlparse(str)[0]
    if not scheme:
        return False
    elif (sys.platform == 'win32' and
          scheme in string.ascii_letters and
          len(scheme) == 1):
        # looks like a 'normal' absolute path
        return False
    else:
        return True


html_parser = HTMLParser()
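
# Illustrative sketch (not part of the module): fromstring() picks its
# return shape from the heuristics above -- a full document for doctype or
# <html> input, the single element for a one-element body, otherwise the
# body recast as a <div> (block-level content) or <span> container:
#
#   from lxml.html import html5parser
#   html5parser.fromstring('<!doctype html><title>t</title>')  # document
#   html5parser.fromstring('<p>hi</p>')                        # one element
#   html5parser.fromstring('a <b>b</b> c')                     # container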

125
lib/lxml/html/soupparser.py
Normal file
@@ -0,0 +1,125 @@
__doc__ = """External interface to the BeautifulSoup HTML parser.
|
||||
"""
|
||||
|
||||
__all__ = ["fromstring", "parse", "convert_tree"]
|
||||
|
||||
from lxml import etree, html
|
||||
from BeautifulSoup import \
|
||||
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
|
||||
|
||||
|
||||
def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
|
||||
"""Parse a string of HTML data into an Element tree using the
|
||||
BeautifulSoup parser.
|
||||
|
||||
Returns the root ``<html>`` Element of the tree.
|
||||
|
||||
You can pass a different BeautifulSoup parser through the
|
||||
`beautifulsoup` keyword, and a diffent Element factory function
|
||||
through the `makeelement` keyword. By default, the standard
|
||||
``BeautifulSoup`` class and the default factory of `lxml.html` are
|
||||
used.
|
||||
"""
|
||||
return _parse(data, beautifulsoup, makeelement, **bsargs)
|
||||
|
||||
def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
|
||||
"""Parse a file into an ElemenTree using the BeautifulSoup parser.
|
||||
|
||||
You can pass a different BeautifulSoup parser through the
|
||||
`beautifulsoup` keyword, and a diffent Element factory function
|
||||
through the `makeelement` keyword. By default, the standard
|
||||
``BeautifulSoup`` class and the default factory of `lxml.html` are
|
||||
used.
|
||||
"""
|
||||
if not hasattr(file, 'read'):
|
||||
file = open(file)
|
||||
root = _parse(file, beautifulsoup, makeelement, **bsargs)
|
||||
return etree.ElementTree(root)
|
||||
|
||||
def convert_tree(beautiful_soup_tree, makeelement=None):
|
||||
"""Convert a BeautifulSoup tree to a list of Element trees.
|
||||
|
||||
Returns a list instead of a single root Element to support
|
||||
HTML-like soup with more than one root element.
|
||||
|
||||
You can pass a different Element factory through the `makeelement`
|
||||
keyword.
|
||||
"""
|
||||
if makeelement is None:
|
||||
makeelement = html.html_parser.makeelement
|
||||
root = _convert_tree(beautiful_soup_tree, makeelement)
|
||||
children = root.getchildren()
|
||||
for child in children:
|
||||
root.remove(child)
|
||||
return children
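
# Illustrative sketch (not part of the module): assuming the old-style
# BeautifulSoup 3 package is importable, multi-rooted soup comes back from
# convert_tree() as a plain list of Elements, one per root:
#
#   from BeautifulSoup import BeautifulSoup
#   soup = BeautifulSoup('<p>one</p><p>two</p>')
#   elements = convert_tree(soup)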


# helpers

def _parse(source, beautifulsoup, makeelement, **bsargs):
    if beautifulsoup is None:
        beautifulsoup = BeautifulSoup
    if makeelement is None:
        makeelement = html.html_parser.makeelement
    if 'convertEntities' not in bsargs:
        bsargs['convertEntities'] = 'html'
    tree = beautifulsoup(source, **bsargs)
    root = _convert_tree(tree, makeelement)
    # from ET: wrap the document in a html root element, if necessary
    if len(root) == 1 and root[0].tag == "html":
        return root[0]
    root.tag = "html"
    return root

def _convert_tree(beautiful_soup_tree, makeelement):
    root = makeelement(beautiful_soup_tree.name,
                       attrib=dict(beautiful_soup_tree.attrs))
    _convert_children(root, beautiful_soup_tree, makeelement)
    return root

def _convert_children(parent, beautiful_soup_tree, makeelement):
    SubElement = etree.SubElement
    et_child = None
    for child in beautiful_soup_tree:
        if isinstance(child, Tag):
            et_child = SubElement(parent, child.name, attrib=dict(
                [(k, unescape(v)) for (k, v) in child.attrs]))
            _convert_children(et_child, child, makeelement)
        elif type(child) is NavigableString:
            _append_text(parent, et_child, unescape(child))
        else:
            if isinstance(child, Comment):
                parent.append(etree.Comment(child))
            elif isinstance(child, ProcessingInstruction):
                parent.append(etree.ProcessingInstruction(
                    *child.split(' ', 1)))
            else:  # CData
                _append_text(parent, et_child, unescape(child))

def _append_text(parent, element, text):
    if element is None:
        parent.text = (parent.text or '') + text
    else:
        element.tail = (element.tail or '') + text


# copied from ET's ElementSoup

try:
    from html.entities import name2codepoint  # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint
import re

handle_entities = re.compile("&(\w+);").sub

def unescape(string):
    if not string:
        return ''
    # work around oddities in BeautifulSoup's entity handling
    def unescape_entity(m):
        try:
            return unichr(name2codepoint[m.group(1)])
        except KeyError:
            return m.group(0)  # use as is
    return handle_entities(unescape_entity, string)
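
# Illustrative sketch (not part of the module): unescape() resolves named
# character references that BeautifulSoup leaves behind and passes unknown
# ones through untouched (Python 2 semantics, hence unichr above):
#
#   unescape('a &lt; b')       # -> u'a < b'
#   unescape('&bogus; &amp;')  # -> u'&bogus; &'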

1
lib/lxml/html/tests/__init__.py
Normal file
@@ -0,0 +1 @@
#

@@ -0,0 +1,7 @@
Description: entry content contains applet
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:

<div>safe<applet code="foo.class" codebase="http://example.com/"></applet> <b>description</b></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,8 @@
Description: entry content contains blink
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description'
Options:
Notes: <div> wrapper

<div><blink>safe</blink> description</div>
----------
<div>safe description</div>

84
lib/lxml/html/tests/feedparser-data/entry_content_crazy.data
Normal file
@@ -0,0 +1,84 @@
Description: entry content is crazy
Expect: not bozo and entries[0]['content'][0]['value'] == u'Crazy HTML -' + u'- Can Your Regex Parse This?\n\n\n\n<!-' + u'- <script> -' + u'->\n\n<!-' + u'- \n\t<script> \n-' + u'->\n\n\n\nfunction executeMe()\n{\n\n\n\n\n/* \n<h1>Did The Javascript Execute?</h1>\n<div>\nI will execute here, too, if you mouse over me\n</div>'
Options: -page_structure
Notes: for some reason the comments in the expected field are acting weird


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Crazy HTML -- Can Your Regex Parse This?</title>

</head>
<body notRealAttribute="value"onload="executeMe();"foo="bar"

>
<!-- <script> -->

<!--
<script>
-->


</script>


<script


>

function executeMe()
{




/* <script>
function am_i_javascript()
{
var str = "Some innocuously commented out stuff";
}
< /script>
*/









alert("Executed");
}

</script



>
<h1>Did The Javascript Execute?</h1>
<div notRealAttribute="value
"onmouseover="
executeMe();
"foo="bar">
I will execute here, too, if you mouse over me
</div>

</body>

</html>

----------
<html>
<head>
<title>Crazy HTML -- Can Your Regex Parse This?</title>
</head>
<body>
<h1>Did The Javascript Execute?</h1>
<div>
I will execute here, too, if you mouse over me
</div>
</body>
</html>
@@ -0,0 +1,8 @@
Description: entry content contains embed
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:
Notes: <div> wrapper, close <embed> tag (not closing it lost the <b> tag)

<div>safe<embed src="http://example.com/"></embed> <b>description</b></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,7 @@
Description: entry content contains frameset
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:

<div>safe<frameset rows="*"><frame src="http://example.com/"></frameset> <b>description</b></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,8 @@
Description: entry content contains iframe
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:
Notes: div wrapper, close <iframe>

<div>safe<iframe src="http://example.com/"></iframe> <b>description</b></iframe></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,7 @@
Description: entry content contains link
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:

<div>safe<link rel="stylesheet" type="text/css" href="http://example.com/evil.css"> <b>description</b></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,7 @@
Description: entry content contains meta
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:

<div>safe<meta http-equiv="Refresh" content="0; URL=http://example.com/"> <b>description</b></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,8 @@
Description: entry content contains object
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:
Notes: div wrapper, close <object>

<div>safe<object classid="clsid:C932BA85-4374-101B-A56C-00AA003668DC"></object> <b>description</b></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,7 @@
Description: entry content contains onabort
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onabort="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onblur
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onblur="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onchange
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onchange="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onclick
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onclick="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains ondblclick
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options: javascript

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" ondblclick="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onerror
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onerror="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onfocus
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onfocus="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onkeydown
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onkeydown="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onkeypress
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onkeypress="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onkeyup
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onkeyup="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onload
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onload="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onmousedown
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onmousedown="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onmouseout
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onmouseout="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onmouseover
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onmouseover="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onmouseup
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onmouseup="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onreset
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onreset="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onresize
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onresize="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onsubmit
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onsubmit="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onunload
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:

<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onunload="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains script
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description'
Options:

<div>safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>
----------
<div>safe description</div>

@@ -0,0 +1,13 @@
Description: entry content contains script (cdata)
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description'
Options:
Notes: div wrapper. Currently not working because of how HTML() is parsing the CDATA (not in a useful way)
The resulting code is safe, it just includes crap from the <script> tag (but not the script tag
itself).
Ignore: true

<div>
<![CDATA[safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description]]>
</div>
----------
<div>safe description</div>

@@ -0,0 +1,7 @@
Description: entry content contains script (inline)
Expect: not bozo and entries[0]['content'][0]['value'] == u'<div>safe description</div>'
Options:

<div xmlns="http://www.w3.org/1999/xhtml">safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>
----------
<div>safe description</div>

@@ -0,0 +1,7 @@
Description: entry content contains style
Expect: not bozo and entries[0]['content'][0]['value'] == u'<a href="http://www.ragingplatypus.com/">never trust your upstream platypus</a>'
Options: style

<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>
----------
<a href="http://www.ragingplatypus.com/">never trust your upstream platypus</a>

@@ -0,0 +1,8 @@
Description: I built a quick XSS fuzzer to detect any erroneous characters that are allowed after the open parenthesis but before the JavaScript directive in IE and Netscape 8.1 in secure site mode. These are in decimal but you can include hex and add padding of course. (Any of the following chars can be used: 1-32, 34, 39, 160, 8192-8.13, 12288, 65279)
http://ha.ckers.org/xss.html#XSS_DIV_background-image_plus
Options: -safe_attrs_only
Notes: As you see, the CSS gets corrupted, but I don't really care that much.

<DIV STYLE="background-image: url(javascript:alert('XSS'))">text</div>
----------
<div style="background-image: url(">text</div>

@@ -0,0 +1,10 @@
Description: exploit (this has been modified slightly to obfuscate the url parameter). The original vulnerability was found by Renaud Lifchitz as a vulnerability in Hotmail.
http://ha.ckers.org/xss.html#XSS_DIV_background_image_unicode
Options: -safe_attrs_only
Ignore: true
Notes: I don't understand how this exploit works. It seems like the description actually refers to
the unicode you'd import, but why that matters I don't know.

<DIV STYLE="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">text</div>
----------
<div style="background-image: ">text</div>

@@ -0,0 +1,9 @@
Description: Downlevel-Hidden block (only works in IE5.0 and later and Netscape 8.1 in IE rendering engine mode). Some websites consider anything inside a comment block to be safe and therefore does not need to be removed, which allows our Cross Site Scripting vector. Or the system could add comment tags around something to attempt to render it harmless. As we can see, that probably wouldn't do the job
http://ha.ckers.org/xss.html#XSS_Downlevel-Hidden
Options: -comments, -processing_instructions

<div><!--[if gte IE 4]>
<SCRIPT>alert('XSS');</SCRIPT>
<![endif]--></div>
----------
<div></div>

12
lib/lxml/html/tests/hackers-org-data/html-plus-time.data
Normal file
@@ -0,0 +1,12 @@
Description: HTML+TIME in XML. This is how Grey Magic hacked Hotmail and Yahoo!. This only works in Internet Explorer and Netscape 8.1 in IE rendering engine mode and remember that you need to be between HTML and BODY tags for this to work
http://ha.ckers.org/xss.html#XSS_HTML_plus_time
Ignore: true
Notes: I don't understand the vector here, or how this is supposed to work.

<div>
<t:set attributeName="innerHTML" to="XSS<SCRIPT DEFER>alert("XSS")</SCRIPT>">
</BODY></HTML></div>
----------
<div>
<t:set attributeName="innerHTML" to="XSS<SCRIPT DEFER>alert("XSS")</SCRIPT>">
</BODY></HTML>x</div>

15
lib/lxml/html/tests/hackers-org-data/javascript-link.data
Normal file
@@ -0,0 +1,15 @@
Description: javascript: in many forms

<div>
<a href="java
script:alert()">x</a>
<a href="j a v a s c r i p t:alert()">x</a>
<a href="jscript
:alert()">x</a>
</div>
----------
<div>
<a href="">x</a>
<a href="">x</a>
<a href="">x</a>
</div>

8
lib/lxml/html/tests/hackers-org-data/style-comment.data
Normal file
@@ -0,0 +1,8 @@
Description: to break up expression (Thanks to Roman Ivanov for this one)
http://ha.ckers.org/xss.html#XSS_STYLE_comment
Options: -safe_attrs_only
Notes: Because of the suspicious stuff in there, the style is removed entirely

<IMG STYLE="xss:expr/*XSS*/ession(alert('XSS'))">
----------
<img>

10
lib/lxml/html/tests/hackers-org-data/style-expression.data
Normal file
@@ -0,0 +1,10 @@
Description: (this is really a hybrid of the above XSS vectors, but it really does show how hard STYLE tags can be to parse apart, like above this can send IE into a loop)
http://ha.ckers.org/xss.html#XSS_IMG_STYLE_expression
Options: -safe_attrs_only
Notes: Modified to avoid a parsing in libxml2 that ruins the XSS (the " marks).
Also there seemed to be an extra "p" in exppression

<div><img style="xss: ex/*<A STYLE='no\xss:noxss(*//*);
xss:ex/*XSS*//*/*/pression(alert('XSS'))"></div>
----------
<div><img></div>

8
lib/lxml/html/tests/hackers-org-data/style-import.data
Normal file
@@ -0,0 +1,8 @@
Description: tags with broken up JavaScript for XSS (this XSS at times sends IE into an infinite loop of alerts)
http://ha.ckers.org/xss.html#XSS_STYLE
Options: -safe_attrs_only

<div><STYLE>@im\port'\ja\vasc\ript:alert("XSS")';</STYLE></div>
----------
<div><style>/* deleted */</style></div>

7
lib/lxml/html/tests/hackers-org-data/style-js-tag.data
Normal file
@@ -0,0 +1,7 @@
Description: (Older versions of Netscape only)
http://ha.ckers.org/xss.html#XSS_STYLE_tag
Options: -safe_attrs_only

<div><STYLE TYPE="text/javascript">alert('XSS');</STYLE></div>
----------
<div></div>

8
lib/lxml/html/tests/hackers-org-data/style-url-js.data
Normal file
@@ -0,0 +1,8 @@
Description: http://ha.ckers.org/xss.html#XSS_STYLE_background-image
Options: -style, -safe_attrs_only
Notes: The CSS is messed up here, but so it goes

<div><STYLE>.XSS{background-image:url("javascript:alert('XSS')");}</STYLE><A CLASS=XSS></A></div>
----------
<div><style>.XSS{background-image:url("");}</style><a class="XSS"></a></div>

10
lib/lxml/html/tests/hackers-org-data/xml-data-island.data
Normal file
@@ -0,0 +1,10 @@
Description: XML data island with comment obfuscation (this is another take on the same exploit that doesn't use CDATA fields, but rather uses comments to break up the javascript directive)
http://ha.ckers.org/xss.html#XSS_XML_data_island_comment
Ignore: true
Notes: I don't understand the vector here. Maybe datasrc should be filtered?

<div><XML ID="xss"><I><B><IMG SRC="javas<!-- -->cript:alert('XSS')"></B></I></XML>
<SPAN DATASRC="#xss" DATAFLD="B" DATAFORMATAS="HTML"></SPAN></div>
----------
<div><XML ID="xss"><I><B><IMG SRC="javas<!-- -->cript:alert('XSS')"></B></I></XML>
<SPAN DATASRC="#xss" DATAFLD="B" DATAFORMATAS="HTML"></SPAN>x</div>

@@ -0,0 +1,9 @@
Description: Locally hosted XML with embedded JavaScript#XSS_Local_XML that is generated using an XML data island. This is the same as above but instead refers to a locally hosted (must be on the same server) XML file that contains your cross site scripting vector. You can see the result here <http://ha.ckers.org/xssxmltest.html>
http://ha.ckers.org/xss.html#XSS_Local_XML

<div><XML SRC="xsstest.xml" ID=I></XML>
<SPAN DATASRC=#I DATAFLD=C DATAFORMATAS=HTML></SPAN></div>
----------
<div>
<span></span>
</div>

@@ -0,0 +1,16 @@
Description: XML namespace. The htc file must be located on the same server as your XSS vector
http://ha.ckers.org/xss.html#XSS_XML_namespace
Note: I don't completely understand the vector here. page_structure is what does this.

<HTML xmlns:xss>
<body>
<?import namespace="xss" implementation="http://ha.ckers.org/xss.htc">
<xss:xss>XSS</xss:xss>
</body>
</HTML>
----------
<HTML>
<body>
<div>XSS</div>
</body>
</HTML>
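
The ``.data`` files above share one plain-text layout: header fields
(``Description:``, ``Expect:``, ``Options:``, plus optional ``Notes:`` and
``Ignore:``), a blank line, the input HTML, a ``----------`` separator, and
the expected cleaned output.  A minimal reader sketch (a hypothetical helper
for illustration only; the real runner lives in lxml's test suite, and this
ignores multi-line notes and bare URL lines)::

    def read_case(path):
        # split one .data file into (headers, input HTML, expected output)
        text = open(path).read()
        raw, expected = text.split('\n----------\n', 1)
        head, html_input = raw.split('\n\n', 1)
        headers = dict(line.split(':', 1) for line in head.splitlines()
                       if ':' in line)
        return headers, html_input, expected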

11
lib/lxml/html/tests/test_autolink.py
Normal file
@@ -0,0 +1,11 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest

def test_suite():
    suite = unittest.TestSuite()
    if sys.version_info >= (2,4):
        suite.addTests([make_doctest('test_autolink.txt')])
    return suite

if __name__ == '__main__':
    unittest.main()

79
lib/lxml/html/tests/test_autolink.txt
Normal file
@@ -0,0 +1,79 @@
This tests autolink::

    >>> from lxml.html import usedoctest
    >>> from lxml.html.clean import autolink_html
    >>> print(autolink_html('''
    ... <div>Link here: http://test.com/foo.html.</div>
    ... '''))
    <div>Link here: <a href="http://test.com/foo.html">http://test.com/foo.html</a>.</div>
    >>> print(autolink_html('''
    ... <div>Mail me at mailto:ianb@test.com or http://myhome.com</div>
    ... '''))
    <div>Mail me at <a href="mailto:ianb@test.com">ianb@test.com</a>
    or <a href="http://myhome.com">http://myhome.com</a></div>
    >>> print(autolink_html('''
    ... <div>The <b>great</b> thing is the http://link.com links <i>and</i>
    ... the http://foobar.com links.</div>'''))
    <div>The <b>great</b> thing is the <a href="http://link.com">http://link.com</a> links <i>and</i>
    the <a href="http://foobar.com">http://foobar.com</a> links.</div>
    >>> print(autolink_html('''
    ... <div>Link: <http://foobar.com></div>'''))
    <div>Link: <<a href="http://foobar.com">http://foobar.com</a>></div>
    >>> print(autolink_html('''
    ... <div>Link: (http://foobar.com)</div>'''))
    <div>Link: (<a href="http://foobar.com">http://foobar.com</a>)</div>

Parenthesis are tricky, we'll do our best::

    >>> print(autolink_html('''
    ... <div>(Link: http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software))</div>
    ... '''))
    <div>(Link: <a href="http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software)">http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software)</a>)</div>
    >>> print(autolink_html('''
    ... <div>... a link: http://foo.com)</div>
    ... '''))
    <div>... a link: <a href="http://foo.com">http://foo.com</a>)</div>

Some cases that won't be caught (on purpose)::

    >>> print(autolink_html('''
    ... <div>A link to http://localhost/foo/bar won't, but a link to
    ... http://test.com will</div>'''))
    <div>A link to http://localhost/foo/bar won't, but a link to
    <a href="http://test.com">http://test.com</a> will</div>
    >>> print(autolink_html('''
    ... <div>A link in <textarea>http://test.com</textarea></div>'''))
    <div>A link in <textarea>http://test.com</textarea></div>
    >>> print(autolink_html('''
    ... <div>A link in <a href="http://foo.com">http://bar.com</a></div>'''))
    <div>A link in <a href="http://foo.com">http://bar.com</a></div>
    >>> print(autolink_html('''
    ... <div>A link in <code>http://foo.com</code> or
    ... <span class="nolink">http://bar.com</span></div>'''))
    <div>A link in <code>http://foo.com</code> or
    <span class="nolink">http://bar.com</span></div>

There's also a word wrapping function, that should probably be run
after autolink::

    >>> from lxml.html.clean import word_break_html
    >>> def pascii(s):
    ...     print(s.encode('ascii', 'xmlcharrefreplace').decode('ascii'))
    >>> pascii(word_break_html( u'''
    ... <div>Hey you
    ... 12345678901234567890123456789012345678901234567890</div>'''))
    <div>Hey you
    1234567890123456789012345678901234567890&#8203;1234567890</div>

Not everything is broken:

    >>> pascii(word_break_html('''
    ... <div>Hey you
    ... <code>12345678901234567890123456789012345678901234567890</code></div>'''))
    <div>Hey you
    <code>12345678901234567890123456789012345678901234567890</code></div>
    >>> pascii(word_break_html('''
    ... <a href="12345678901234567890123456789012345678901234567890">text</a>'''))
    <a href="12345678901234567890123456789012345678901234567890">text</a>

13
lib/lxml/html/tests/test_basic.py
Normal file
@@ -0,0 +1,13 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest, doctest
import lxml.html

def test_suite():
    suite = unittest.TestSuite()
    if sys.version_info >= (2,4):
        suite.addTests([make_doctest('test_basic.txt')])
    suite.addTests([doctest.DocTestSuite(lxml.html)])
    return suite

if __name__ == '__main__':
    unittest.main()

162
lib/lxml/html/tests/test_basic.txt
Normal file
@@ -0,0 +1,162 @@
lxml.html adds a find_class method to elements::

    >>> from lxml.etree import Comment
    >>> from lxml.html import document_fromstring, fragment_fromstring, tostring
    >>> from lxml.html import fragments_fromstring, fromstring
    >>> from lxml.html.clean import clean, clean_html
    >>> from lxml.html import usedoctest
    >>> try: unicode = unicode
    ... except NameError: unicode = str

    >>> h = document_fromstring('''
    ... <html><head></head>
    ... <body>
    ... <a class="vcard
    ... fn url" href="foobar">P1</a>
    ... <a class="not-fn vcard" href="baz">P2</a>
    ... </body></html>''')
    >>> print(tostring(h, encoding=unicode))
    <html>
    <head></head>
    <body>
    <a class="vcard
    fn url" href="foobar">P1</a>
    <a class="not-fn vcard" href="baz">P2</a>
    </body>
    </html>
    >>> print([e.text for e in h.find_class('fn')])
    ['P1']
    >>> print([e.text for e in h.find_class('vcard')])
    ['P1', 'P2']

Also added is a get_rel_links, which you can use to search for links
like ``<a rel="$something">``::

    >>> h = document_fromstring('''
    ... <a href="1">test 1</a>
    ... <a href="2" rel="tag">item 2</a>
    ... <a href="3" rel="tagging">item 3</a>
    ... <a href="4" rel="TAG">item 4</a>''')
    >>> print([e.attrib['href'] for e in h.find_rel_links('tag')])
    ['2', '4']
    >>> print([e.attrib['href'] for e in h.find_rel_links('nofollow')])
    []

Another method is ``get_element_by_id`` that does what it says::

    >>> print(tostring(fragment_fromstring('''
    ... <div>
    ... <span id="test">stuff</span>
    ... </div>''').get_element_by_id('test'), encoding=unicode))
    <span id="test">stuff</span>

Or to get the content of an element without the tags, use text_content()::

    >>> el = fragment_fromstring('''
    ... <div>This is <a href="foo">a <b>bold</b> link</a></div>''')
    >>> el.text_content()
    'This is a bold link'

Or drop an element (leaving its content) or the entire tree, like::

    >>> doc = document_fromstring('''
    ... <html>
    ... <body>
    ... <div id="body">
    ... This is a <a href="foo" id="link">test</a> of stuff.
    ... </div>
    ... <!-- a comment -->
    ... <div>footer</div>
    ... </body>
    ... </html>''')
    >>> doc.get_element_by_id('link').drop_tag()
    >>> print(tostring(doc, encoding=unicode))
    <html>
    <body>
    <div id="body">
    This is a test of stuff.
    </div>
    <!-- a comment -->
    <div>footer</div>
    </body>
    </html>
    >>> doc.get_element_by_id('body').drop_tree()
    >>> print(tostring(doc, encoding=unicode))
    <html>
    <body>
    <!-- a comment -->
    <div>footer</div>
    </body>
    </html>

Note, however, that comment text will not be merged into the tree when you
drop the comment.  Here, ``drop_tag()`` behaves exactly like ``drop_tree()``:

    >>> for comment in doc.getiterator(Comment):
    ...     comment.drop_tag()
    >>> print(tostring(doc, encoding=unicode))
    <html>
    <body>
    <div>footer</div>
    </body>
    </html>

In Python3 it should be possible to parse strings given as bytes objects, at
least if an encoding is given.

    >>> from lxml.html import HTMLParser
    >>> enc = 'utf-8'
    >>> html_parser = HTMLParser(encoding=enc)
    >>> src = '<html><body>Test</body></html>'.encode(enc)

    >>> doc = fromstring(src, parser=html_parser)
    >>> print(tostring(doc, encoding=unicode))
    <html><body>Test</body></html>

    >>> docs = fragments_fromstring(src, parser=html_parser)
    >>> len(docs)
    1
    >>> print(docs[0])
    Test

Bug 599318: Call fromstring with a frameset fragment should not raise an error,
the whole document is returned.

    >>> import lxml.html
    >>> content='''
    ... <frameset>
    ... <frame src="main.php" name="srcpg">
    ... </frameset>'''
    >>> etree_document = lxml.html.fromstring(content)
    >>> print(tostring(etree_document, encoding=unicode))
    <html><frameset><frame src="main.php" name="srcpg"></frameset></html>

Bug 599318: Call fromstring with a div fragment should not raise an error,
only the element is returned

    >>> import lxml.html
    >>> content='<div></div>'
    >>> etree_document = lxml.html.fromstring(content)
    >>> print(tostring(etree_document, encoding=unicode))
    <div></div>

Bug 599318: Call fromstring with a head fragment should not raise an error,
the whole document is returned.

    >>> import lxml.html
    >>> content='<head></head>'
    >>> etree_document = lxml.html.fromstring(content)
    >>> print(tostring(etree_document, encoding=unicode))
    <html><head></head></html>

Bug 690319: Leading whitespace before doctype declaration should not raise an error.

    >>> import lxml.html
    >>> content='''
    ... <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
    ... <html>
    ... </html>'''
    >>> etree_document = lxml.html.fromstring(content)
    >>> print(tostring(etree_document, encoding=unicode))
    <html></html>

80
lib/lxml/html/tests/test_clean.py
Normal file
@@ -0,0 +1,80 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest
from lxml.etree import LIBXML_VERSION

import lxml.html
from lxml.html.clean import Cleaner, clean_html


class CleanerTest(unittest.TestCase):
    def test_allow_tags(self):
        html = """
            <html>
            <head>
            </head>
            <body>
            <p>some text</p>
            <table>
            <tr>
            <td>hello</td><td>world</td>
            </tr>
            <tr>
            <td>hello</td><td>world</td>
            </tr>
            </table>
            <img>
            </body>
            </html>
            """

        html_root = lxml.html.document_fromstring(html)
        cleaner = Cleaner(
            remove_unknown_tags = False,
            allow_tags = ['table', 'tr', 'td'])
        result = cleaner.clean_html(html_root)

        self.assertEqual(12-5+1, len(list(result.iter())))

    def test_safe_attrs_included(self):
        html = """<p><span style="color: #00ffff;">Cyan</span></p>"""

        safe_attrs=set(lxml.html.defs.safe_attrs)
        safe_attrs.add('style')

        cleaner = Cleaner(
            safe_attrs_only=True,
            safe_attrs=safe_attrs)
        result = cleaner.clean_html(html)

        self.assertEqual(html, result)

    def test_safe_attrs_excluded(self):
        html = """<p><span style="color: #00ffff;">Cyan</span></p>"""
        expected = """<p><span>Cyan</span></p>"""

        safe_attrs=set()

        cleaner = Cleaner(
            safe_attrs_only=True,
            safe_attrs=safe_attrs)
        result = cleaner.clean_html(html)

        self.assertEqual(expected, result)

    def test_clean_invalid_root_tag(self):
        # only testing that cleaning with invalid root tags works at all
        s = lxml.html.fromstring('parent <invalid tag>child</another>')
        self.assertEqual('parent child', clean_html(s).text_content())

        s = lxml.html.fromstring('<invalid tag>child</another>')
        self.assertEqual('child', clean_html(s).text_content())


def test_suite():
    suite = unittest.TestSuite()
    if sys.version_info >= (2,4):
        suite.addTests([make_doctest('test_clean.txt')])
        if LIBXML_VERSION >= (2,6,31):
            suite.addTests([make_doctest('test_clean_embed.txt')])
    suite.addTests(unittest.makeSuite(CleanerTest))
    return suite

161
lib/lxml/html/tests/test_clean.txt
Normal file
@@ -0,0 +1,161 @@
>>> from lxml.html import fromstring, tostring
>>> from lxml.html.clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest

>>> doc = '''<html>
... <head>
... <script type="text/javascript" src="evil-site"></script>
... <link rel="alternate" type="text/rss" src="evil-rss">
... <link rel="alternate" type="text/rss" href="http://example.com">
... <link rel="stylesheet" type="text/rss" href="http://example.com">
... <style>
... body {background-image: url(javascript:do_evil)};
... div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
... div {color: expression(evil)};
... </style>
... </head>
... <body onload="evil_function()">
... <!-- I am interpreted for EVIL! -->
... <a href="javascript:evil_function()">a link</a>
... <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
... <a href="#" onclick="evil_function()">another link</a>
... <p onclick="evil_function()">a paragraph</p>
... <div style="display: none">secret EVIL!</div>
... <object> of EVIL! </object>
... <iframe src="evil-site"></iframe>
... <form action="evil-site">
... Password: <input type="password" name="password">
... </form>
... <a href="evil-site">spam spam SPAM!</a>
... <a href="http://example.com" rel="author">Author</a>
... <a href="http://example.com" rel="nofollow">Text</a>
... <img src="evil!">
... </body>
... </html>'''

>>> print(doc)
<html>
<head>
<script type="text/javascript" src="evil-site"></script>
<link rel="alternate" type="text/rss" src="evil-rss">
<link rel="alternate" type="text/rss" href="http://example.com">
<link rel="stylesheet" type="text/rss" href="http://example.com">
<style>
body {background-image: url(javascript:do_evil)};
div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
div {color: expression(evil)};
</style>
</head>
<body onload="evil_function()">
<!-- I am interpreted for EVIL! -->
<a href="javascript:evil_function()">a link</a>
<a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
<a href="#" onclick="evil_function()">another link</a>
<p onclick="evil_function()">a paragraph</p>
<div style="display: none">secret EVIL!</div>
<object> of EVIL! </object>
<iframe src="evil-site"></iframe>
<form action="evil-site">
Password: <input type="password" name="password">
</form>
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>

>>> print(tostring(fromstring(doc)).decode("utf-8"))
<html>
<head>
<script type="text/javascript" src="evil-site"></script>
<link rel="alternate" type="text/rss" src="evil-rss">
<link rel="alternate" type="text/rss" href="http://example.com">
<link rel="stylesheet" type="text/rss" href="http://example.com">
<style>
body {background-image: url(javascript:do_evil)};
div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
div {color: expression(evil)};
</style>
</head>
<body onload="evil_function()">
<!-- I am interpreted for EVIL! -->
<a href="javascript:evil_function()">a link</a>
<a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
<a href="#" onclick="evil_function()">another link</a>
<p onclick="evil_function()">a paragraph</p>
<div style="display: none">secret EVIL!</div>
<object> of EVIL! </object>
<iframe src="evil-site"></iframe>
<form action="evil-site">
Password: <input type="password" name="password">
</form>
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>

>>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
<head>
<style>/* deleted */</style>
</head>
<body>
<a href="">a link</a>
<a href="">data</a>
<a href="#">another link</a>
<p>a paragraph</p>
<div style="display: none">secret EVIL!</div>
of EVIL!
Password:
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>

>>> print(Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
<head>
</head>
<body>
<a href="">a link</a>
<a href="">data</a>
<a href="#">another link</a>
<p>a paragraph</p>
<div>secret EVIL!</div>
of EVIL!
Password:
<a href="evil-site" rel="nofollow">spam spam SPAM!</a>
<a href="http://example.com" rel="author nofollow">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>

>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
<html>
<head>
<link rel="alternate" type="text/rss" src="evil-rss">
<link rel="alternate" type="text/rss" href="http://example.com">
<link rel="stylesheet" type="text/rss" href="http://example.com">
<style>/* deleted */</style>
</head>
<body>
<a href="">a link</a>
<a href="">data</a>
<a href="#">another link</a>
<p>a paragraph</p>
<div>secret EVIL!</div>
of EVIL!
Password:
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>

39
lib/lxml/html/tests/test_clean_embed.txt
Normal file
@@ -0,0 +1,39 @@
THIS FAILS IN libxml2 2.6.29 AND 2.6.30 !!


>>> from lxml.html import fromstring, tostring
>>> from lxml.html.clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest

>>> def tostring(el):  # work-around for Py3 'bytes' type
...     from lxml.html import tostring
...     s = tostring(el)
...     if not isinstance(s, str):
...         s = s.decode('UTF-8')
...     return s

>>> doc_embed = '''<div>
... <embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
... <embed src="http://anothersite.com/v/another"></embed>
... <script src="http://www.youtube.com/example.js"></script>
... <script src="/something-else.js"></script>
... </div>'''
>>> print(tostring(fromstring(doc_embed)))
<div>
<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
<embed src="http://anothersite.com/v/another"></embed>
<script src="http://www.youtube.com/example.js"></script>
<script src="/something-else.js"></script>
</div>
>>> print(Cleaner().clean_html(doc_embed))
<div>
</div>
>>> print(Cleaner(host_whitelist=['www.youtube.com']).clean_html(doc_embed))
<div>
<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
</div>
>>> print(Cleaner(host_whitelist=['www.youtube.com'], whitelist_tags=None).clean_html(doc_embed))
<div>
<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
<script src="http://www.youtube.com/example.js"></script>
</div>

14
lib/lxml/html/tests/test_diff.py
Normal file
@@ -0,0 +1,14 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest, doctest

from lxml.html import diff

def test_suite():
    suite = unittest.TestSuite()
    if sys.version_info >= (2,4):
        suite.addTests([make_doctest('test_diff.txt'),
                        doctest.DocTestSuite(diff)])
    return suite

if __name__ == '__main__':
    unittest.main()

252
lib/lxml/html/tests/test_diff.txt
Normal file
@@ -0,0 +1,252 @@
lxml.html.diff does HTML comparisons. These are word-based comparisons.
|
||||
|
||||
First, a handy function for normalizing whitespace and doing word wrapping::
|
||||
|
||||
>>> import re, textwrap
|
||||
>>> def pwrapped(text):
|
||||
... text = re.sub(r'[ \n\t\r]+', ' ', text)
|
||||
... text = textwrap.fill(text)
|
||||
... print(text)
|
||||
>>> def pdiff(text1, text2):
|
||||
... pwrapped(htmldiff(text1, text2))
|
||||
|
||||
Example::
|
||||
|
||||
>>> from lxml.html.diff import htmldiff, html_annotate
|
||||
>>> html1 = '<p>This is some test text with some changes and some same stuff</p>'
|
||||
>>> html2 = '''<p>This is some test textual writing with some changed stuff
|
||||
... and some same stuff</p>'''
|
||||
>>> pdiff(html1, html2)
|
||||
<p>This is some test <ins>textual writing with some changed stuff
|
||||
</ins> <del>text with some changes</del> and some same stuff</p>
|
||||
|
||||
Style tags are largely ignored in terms of differences, though markup is not eliminated::
|
||||
|
||||
>>> html1 = '<p>Hi <i>you guys</i></p>'
|
||||
>>> html2 = '<p>Hi <i>you</i> guys</p>'
|
||||
>>> pdiff(html1, html2)
|
||||
<p>Hi <i>you</i> guys</p>
|
||||
>>> pdiff('text', '<p>text</p>')
|
||||
<p>text</p>
|
||||
>>> pdiff('<i>Hi guys</i> !!', '<i>Hi guy</i> !!')
|
||||
<i>Hi <ins>guy</ins> <del>guys</del> </i> !!
|
||||
>>> pdiff('H<i>i</i>', 'Hi')
|
||||
<ins>Hi</ins> <del>H<i>i</i></del>
|
||||
>>> pdiff('<i>A B</i> C', '<i>A</i> C')
|
||||
<i>A <del>B</del> </i> C
|
||||
>>> pdiff('<i>A B</i> C', '<i>B</i> C')
|
||||
<i> <del>A</del> B</i> C
|
||||
>>> pdiff('<p></p>', '<p></p>')
|
||||
<p></p>
|
||||
>>> pdiff('<p>Hi</p>', '<p>Bye</p>')
|
||||
<p><ins>Bye</ins></p> <p><del>Hi</del></p>
|
||||
>>> pdiff('<p>Hi Guy</p>', '<p>Bye Guy</p>')
|
||||
<p> <ins>Bye</ins> <del>Hi</del> Guy</p>
|
||||
>>> pdiff('<p>Hey there</p>', '')
|
||||
<ins></ins> <p><del>Hey there</del></p>
|
||||
|
||||
Movement between paragraphs is ignored, as tag-based changes are generally ignored::
|
||||
>>>
|
||||
>>> pdiff('<p>Hello</p><p>World</p>', '<p>Hello World</p>')
|
||||
<p>Hello World</p>
|
||||
|
||||
As a special case, changing the href of a link is displayed, and
|
||||
images are treated like words:
|
||||
|
||||
>>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://google.com">search</a>')
|
||||
<a href="http://google.com">search <ins> Link: http://google.com</ins>
|
||||
<del> Link: http://yahoo.com</del> </a>
|
||||
>>> pdiff('<p>Print this <img src="print.gif"></p>', '<p>Print this</p>')
|
||||
<p>Print this <del><img src="print.gif"></del> </p>
|
||||
>>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://yahoo.com">search</a>')
|
||||
<a href="http://yahoo.com">search</a>
|
||||
|
||||
Images may sometimes not have 'src' attributes:
|
||||
|
||||
>>> pdiff('<img src="tease"> <img> test <img src="test">', '<img> test <img src="toast">')
|
||||
<del><img src="tease"></del> <img> test <ins><img src="toast"></ins>
|
||||
<del><img src="test"></del>
|
||||
|
||||
A test of empty elements:
|
||||
|
||||
>>> pdiff('some <br> text', 'some <br> test')
|
||||
some <ins><br> test</ins> <del><br> text</del>
|
||||
|
||||
Whitespace is generally ignored for the diff but preserved during the diff:
|
||||
|
||||
>>> print(htmldiff('<p> first\nsecond\nthird</p>', '<p>   first\n second\nthird </p>'))
|
||||
<p>first
|
||||
second
|
||||
third </p>
|
||||
>>> print(htmldiff('<pre>first\nsecond\nthird</pre>', '<pre>first\nsecond\nthird</pre>'))
|
||||
<pre>first
|
||||
second
|
||||
third</pre>
|
||||
>>> print(htmldiff('<pre>first\nsecond</pre>', '<pre>first\nsecond\n third</pre>'))
|
||||
<pre>first
|
||||
second
|
||||
<ins>third</ins> </pre>
|
||||
|
||||
The sixteen combinations::
|
||||
|
||||
First "insert start" (del start/middle/end/none):
|
||||
|
||||
>>> pdiff('<b>A B C</b>', '<b>D B C</b')
|
||||
<b> <ins>D</ins> <del>A</del> B C</b>
|
||||
>>> pdiff('<b>A B C</b>', '<b>D A C</b>')
|
||||
<b> <ins>D</ins> A <del>B</del> C</b>
|
||||
>>> pdiff('<b>A B C</b>', '<b>D A B</b>')
|
||||
<b> <ins>D</ins> A B <del>C</del> </b>
|
||||
>>> pdiff('<b>A B C</b>', '<b>D A B C</b>')
|
||||
<b> <ins>D</ins> A B C</b>
|
||||
|
||||
Next, "insert middle" (del start/middle/end/none):
|
||||
|
||||
>>> pdiff('<b>A B C</b>', '<b>D B C</b>')
|
||||
<b> <ins>D</ins> <del>A</del> B C</b>
|
||||
>>> pdiff('<b>A B C</b>', '<b>A D C</b>')
|
||||
<b>A <ins>D</ins> <del>B</del> C</b>
|
||||
>>> pdiff('<b>A B C</b>', '<b>A D B</b>')
|
||||
<b>A <ins>D</ins> B <del>C</del> </b>
|
||||
|
||||
This one case hits the threshold of our insensitive matching:
|
||||
|
||||
>>> pdiff('<b>A B C</b>', '<b>A D B C</b>')
|
||||
<b> <ins>A D</ins> <del>A</del> B C</b>
|
||||
|
||||
|
||||
Then "insert end" (del start/middle/end/none):
|
||||
|
||||
>>> pdiff('<b>A B C</b>', '<b>B C D</b>')
|
||||
<b> <del>A</del> B C <ins>D</ins> </b>
|
||||
>>> pdiff('<b>A B C</b>', '<b>A C D</b>')
|
||||
<b>A <del>B</del> C <ins>D</ins> </b>
|
||||
>>> pdiff('<b>A B C</b>', '<b>A B D</b>')
|
||||
<b>A B <ins>D</ins> <del>C</del> </b>
|
||||
>>> pdiff('<b>A B C</b>', '<b>A B C D</b>')
|
||||
<b>A B C <ins>D</ins> </b>
|
||||
|
||||
Then no insert (del start/middle/end):
|
||||
|
||||
>>> pdiff('<b>A B C</b>', '<b>B C</b>')
|
||||
<b> <del>A</del> B C</b>
|
||||
>>> pdiff('<b>A B C</b>', '<b>A C</b>')
|
||||
<b>A <del>B</del> C</b>
|
||||
>>> pdiff('<b>A B C</b>', '<b>A B</b>')
|
||||
<b>A B <del>C</del> </b>
|
||||
|
||||
>>> pdiff('<b>A B</b> C', '<b>A B</b>')
|
||||
<b>A B</b> <del>C</del>
|
||||
>>> pdiff('<b>A B</b> <b>C</b>', '<b>A B</b>')
|
||||
<b>A B</b> <del><b>C</b></del>
|
||||
>>> pdiff('A <p><b>hey there</b> <i>how are you?</i></p>', 'A')
|
||||
A <p><del><b>hey there</b> <i>how are you?</i></del></p>
|
||||
|
||||
Testing a larger document, to make sure no weird, unnecessary
|
||||
parallels are found:
|
||||
|
||||
>>> pdiff('''
|
||||
... <p>This is a test document with many words in it that goes on
|
||||
... for a while and doesn't have anything to do with the next
|
||||
... document that we match this against</p>''', '''
|
||||
... <p>This is another document with few similarities to the preceding
|
||||
... one, but enough that it may have overlap that could turn into
|
||||
... a confusing series of deletes and inserts.
|
||||
... </p>''')
|
||||
<p><ins>This is another document with few similarities to the
|
||||
preceding one, but enough that it may have overlap that could turn
|
||||
into a confusing series of deletes and inserts. </ins></p>
|
||||
<p><del>This is a test document with many words in it that goes on for
|
||||
a while and doesn't have anything to do with the next document that we
|
||||
match this against</del></p>
|
||||
|
||||
|
||||
|
||||
Annotation of content can also be done, where every bit of content is
|
||||
marked up with information about where it came from.
|
||||
|
||||
First, some setup; note that html_annotate is called with a sequence
|
||||
of documents and the annotation associated with that document. We'll
|
||||
just use indexes, but you could use author or timestamp information.
|
||||
|
||||
>>> def markup(text, annotation):
|
||||
... return '<span version="%s">%s</span>' % (annotation, text)
|
||||
>>> def panno(*docs):
|
||||
... pwrapped(html_annotate([(doc, index) for index, doc in enumerate(docs)],
|
||||
... markup=markup))
|
||||
|
||||
Now, a sequence of documents:
|
||||
|
||||
>>> panno('Hello cruel world', 'Hi cruel world', 'Hi world')
|
||||
<span version="1">Hi</span> <span version="0">world</span>
|
||||
>>> panno('A similar document', 'A similar document',
|
||||
... 'A similar document here')
|
||||
<span version="0">A similar document</span> <span
|
||||
version="2">here</span>
|
||||
>>> panno('<p>P1 para</p><p>P2 para</p>', '<p>P1 para</p><p>P3 foo</p>')
|
||||
<p><span version="0">P1 para</span></p><p><span version="1">P3
|
||||
foo</span></p>
|
||||
>>> panno('Hello<p>There World</p>','Hello<p>There Town</p>')
|
||||
<span version="0">Hello</span><p><span version="0">There</span> <span
|
||||
version="1">Town</span></p>
|
||||
>>> panno('<p>Hello</p>There World','<p>Hello</p>There Town')
|
||||
<p><span version="0">Hello</span></p><span version="0">There</span>
|
||||
<span version="1">Town</span>
|
||||
>>> panno('<p>Hello</p><p>There World</p>','<p>Hello</p><p>There Town</p>')
|
||||
<p><span version="0">Hello</span></p><p><span version="0">There</span>
|
||||
<span version="1">Town</span></p>
|
||||
>>> panno('<p>Hi <img src="/foo"> You</p>',
|
||||
... '<p>Hi You</p>',
|
||||
... '<p>Hi You <img src="/bar"></p>')
|
||||
<p><span version="0">Hi You</span> <span version="2"><img
|
||||
src="/bar"></span></p>
|
||||
>>> panno('<p><a href="/foo">Hey</a></p>',
|
||||
... '<p><a href="/bar">Hey</a></p>')
|
||||
<p><a href="/bar"><span version="0">Hey</span></a></p>
|
||||
>>> panno('<p><a href="/foo">Hey You</a></p>',
|
||||
... '<p><a href="/foo">Hey Guy</a></p>')
|
||||
<p><a href="/foo"><span version="0">Hey</span> <span
|
||||
version="1">Guy</span></a></p>
|
||||
|
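A sketch of the same API with richer annotations (author names instead of
indexes), assuming nothing beyond the documented ``markup`` hook::

from lxml.html.diff import html_annotate

def markup_author(text, author):
    # wrap each chunk of text in a span naming who introduced it
    return '<span title="%s">%s</span>' % (author, text)

versions = [('Hello cruel world', 'alice'),
            ('Hi cruel world', 'bob'),
            ('Hi world', 'carol')]
print(html_annotate(versions, markup=markup_author))
# prints roughly: <span title="bob">Hi</span> <span title="alice">world</span>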
||||
Internals
|
||||
---------
|
||||
|
||||
|
||||
Some utility functions::
|
||||
|
||||
>>> from lxml.html.diff import fixup_ins_del_tags, split_unbalanced, split_trailing_whitespace
|
||||
>>> def pfixup(text):
|
||||
... print(fixup_ins_del_tags(text).strip())
|
||||
>>> pfixup('<ins><p>some text <b>and more text</b> and more</p></ins>')
|
||||
<p><ins>some text <b>and more text</b> and more</ins></p>
|
||||
>>> pfixup('<p><ins>Hi!</ins> you</p>')
|
||||
<p><ins>Hi!</ins> you</p>
|
||||
>>> pfixup('<div>Some text <ins>and <p>more text</p></ins> </div>')
|
||||
<div>Some text <ins>and </ins><p><ins>more text</ins></p> </div>
|
||||
>>> pfixup('''
|
||||
... <ins><table><tr><td>One table</td><td>More stuff</td></tr></table></ins>''')
|
||||
<table><tr><td><ins>One table</ins></td><td><ins>More stuff</ins></td></tr></table>
|
||||
|
||||
|
||||
Testing split_unbalanced::
|
||||
|
||||
>>> split_unbalanced(['<a href="blah">', 'hey', '</a>'])
|
||||
([], ['<a href="blah">', 'hey', '</a>'], [])
|
||||
>>> split_unbalanced(['<a href="blah">', 'hey'])
|
||||
(['<a href="blah">'], ['hey'], [])
|
||||
>>> split_unbalanced(['Hey', '</i>', 'You', '</b>'])
|
||||
([], ['Hey', 'You'], ['</i>', '</b>'])
|
||||
>>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There', '</b>'])
|
||||
([], ['So', 'Hi', '<b>', 'There', '</b>'], ['</i>'])
|
||||
>>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There'])
|
||||
(['<b>'], ['So', 'Hi', 'There'], ['</i>'])
|
||||
|
||||
|
||||
Testing split_trailing_whitespace::
|
||||
|
||||
>>> split_trailing_whitespace('test\n\n')
|
||||
('test', '\n\n')
|
||||
>>> split_trailing_whitespace(' test\n ')
|
||||
(' test', '\n ')
|
||||
>>> split_trailing_whitespace('test')
|
||||
('test', '')
|
33
lib/lxml/html/tests/test_elementsoup.py
Normal file
|
@@ -0,0 +1,33 @@
|
|||
import unittest, sys
|
||||
from lxml.tests.common_imports import make_doctest, HelperTestCase
|
||||
|
||||
try:
|
||||
import BeautifulSoup
|
||||
BS_INSTALLED = True
|
||||
except ImportError:
|
||||
BS_INSTALLED = False
|
||||
|
||||
if BS_INSTALLED:
|
||||
class SoupParserTestCase(HelperTestCase):
|
||||
from lxml.html import soupparser
|
||||
|
||||
def test_broken_attribute(self):
|
||||
html = """\
|
||||
<html><head></head><body>
|
||||
<form><input type='text' disabled size='10'></form>
|
||||
</body></html>
|
||||
"""
|
||||
root = self.soupparser.fromstring(html)
|
||||
self.assertTrue(root.find('.//input').get('disabled') is not None)
|
||||
|
||||
|
||||
def test_suite():
|
||||
suite = unittest.TestSuite()
|
||||
if BS_INSTALLED:
|
||||
suite.addTests([unittest.makeSuite(SoupParserTestCase)])
|
||||
if sys.version_info[0] < 3:
|
||||
suite.addTests([make_doctest('../../../../doc/elementsoup.txt')])
|
||||
return suite
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
98
lib/lxml/html/tests/test_feedparser_data.py
Normal file
|
@@ -0,0 +1,98 @@
|
|||
import sys
|
||||
import os
|
||||
import re
|
||||
try:
|
||||
from rfc822 import Message
|
||||
except ImportError:
|
||||
# Python 3
|
||||
from email import message_from_file as Message
|
||||
import unittest
|
||||
from lxml.tests.common_imports import doctest
|
||||
if sys.version_info >= (2,4):
|
||||
from lxml.doctestcompare import LHTMLOutputChecker
|
||||
|
||||
from lxml.html.clean import clean, Cleaner
|
||||
|
||||
feed_dirs = [
|
||||
os.path.join(os.path.dirname(__file__), 'feedparser-data'),
|
||||
os.path.join(os.path.dirname(__file__), 'hackers-org-data'),
|
||||
]
|
||||
bar_re = re.compile(r"-----+")
|
||||
|
||||
class DummyInput:
|
||||
def __init__(self, **kw):
|
||||
for name, value in kw.items():
|
||||
setattr(self, name, value)
|
||||
|
||||
class FeedTestCase(unittest.TestCase):
|
||||
|
||||
def __init__(self, filename):
|
||||
self.filename = filename
|
||||
unittest.TestCase.__init__(self)
|
||||
|
||||
def parse(self):
|
||||
f = open(self.filename, 'r')
|
||||
headers = Message(f)
|
||||
c = f.read()
|
||||
f.close()
|
||||
if not c.strip():
|
||||
c = headers.get_payload()
|
||||
if not headers.keys():
|
||||
raise Exception(
|
||||
"File %s has no headers" % self.filename)
|
||||
self.description = headers['Description']
|
||||
self.expect = headers.get('Expect', '')
|
||||
self.ignore = headers.get('Ignore')
|
||||
self.options = [
|
||||
o.strip() for o in headers.get('Options', '').split(',')
|
||||
if o.strip()]
|
||||
parts = bar_re.split(c)
|
||||
self.input = parts[0].rstrip() + '\n'
|
||||
if parts[1:]:
|
||||
self.expect = parts[1].rstrip() + '\n'
|
||||
else:
|
||||
self.expect = None
|
||||
|
||||
def runTest(self):
|
||||
self.parse()
|
||||
if self.ignore:
|
||||
# We've marked this test to be ignored.
|
||||
return
|
||||
kw = {}
|
||||
for name in self.options:
|
||||
if name.startswith('-'):
|
||||
kw[name[1:]] = False
|
||||
else:
|
||||
kw[name] = True
|
||||
if kw.get('clean', True):
|
||||
transformed = Cleaner(**kw).clean_html(self.input)
|
||||
else:
|
||||
transformed = self.input
|
||||
assert self.expect is not None, (
|
||||
"No expected output in %s" % self.filename)
|
||||
checker = LHTMLOutputChecker()
|
||||
if not checker.check_output(self.expect, transformed, 0):
|
||||
result = checker.output_difference(
|
||||
DummyInput(want=self.expect), transformed, 0)
|
||||
#result += '\noptions: %s %r' % (', '.join(self.options), kw)
|
||||
#result += repr(transformed)
|
||||
raise Exception("\n"+result)
|
||||
|
||||
def shortDescription(self):
|
||||
return self.filename
|
||||
|
||||
def test_suite():
|
||||
suite = unittest.TestSuite()
|
||||
if sys.version_info >= (2,4):
|
||||
for dir in feed_dirs:
|
||||
for fn in os.listdir(dir):
|
||||
fn = os.path.join(dir, fn)
|
||||
if fn.endswith('.data'):
|
||||
case = FeedTestCase(fn)
|
||||
suite.addTests([case])
|
||||
# This is my lazy way of stopping on first error:
|
||||
try:
|
||||
case.runTest()
|
||||
except:
|
||||
break
|
||||
return suite
|
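For reference, a sketch of the ``.data`` layout that ``FeedTestCase.parse()``
expects: rfc822-style headers, the input HTML, a dashed bar, then the
expected cleaned output (the content below is hypothetical)::

Description: strip the script tag
Options:

<div>safe<script>alert(1)</script></div>
----------
<div>safe</div>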
8
lib/lxml/html/tests/test_formfill.py
Normal file
|
@@ -0,0 +1,8 @@
|
|||
import unittest, sys
|
||||
from lxml.tests.common_imports import make_doctest
|
||||
|
||||
def test_suite():
|
||||
suite = unittest.TestSuite()
|
||||
if sys.version_info >= (2,4):
|
||||
suite.addTests([make_doctest('test_formfill.txt')])
|
||||
return suite
|
112
lib/lxml/html/tests/test_formfill.txt
Normal file
|
@@ -0,0 +1,112 @@
|
|||
Some basic imports:
|
||||
|
||||
>>> from lxml.html import usedoctest
|
||||
>>> from lxml.html.formfill import fill_form_html
|
||||
|
||||
The simplest kind of filling is just filling an input with a value:
|
||||
|
||||
>>> print(fill_form_html('''
|
||||
... <form><input type="text" name="foo"></form>''', dict(foo='bar')))
|
||||
<form><input type="text" name="foo" value="bar"></form>
|
||||
|
||||
You can also fill multiple inputs, like:
|
||||
|
||||
>>> print(fill_form_html('''
|
||||
... <form>
|
||||
... <input type="text" name="foo">
|
||||
... <input type="text" name="foo">
|
||||
... </form>''', dict(foo=['bar1', 'bar2'])))
|
||||
<form>
|
||||
<input type="text" name="foo" value="bar1">
|
||||
<input type="text" name="foo" value="bar2">
|
||||
</form>
|
||||
|
||||
Checkboxes can work either as boolean true/false, or be selected based
|
||||
on their inclusion in a set of values::
|
||||
|
||||
>>> print(fill_form_html('''
|
||||
... <form>
|
||||
... Would you like to be spammed?
|
||||
... <input type="checkbox" name="spam_me"> <br>
|
||||
... Spam you'd like to receive:<br>
|
||||
... Viagra spam:
|
||||
... <input type="checkbox" name="type" value="viagra"><br>
|
||||
... Stock spam:
|
||||
... <input type="checkbox" name="type" value="stock"><br>
|
||||
... Other spam:
|
||||
... <input type="checkbox" name="type" value="other"><br>
|
||||
... <input type="submit" value="Spam!">
|
||||
... </form>''', dict(spam_me=True, type=['viagra', 'other'])))
|
||||
<form>
|
||||
Would you like to be spammed?
|
||||
<input type="checkbox" name="spam_me" checked> <br>
|
||||
Spam you'd like to receive:<br>
|
||||
Viagra spam:
|
||||
<input type="checkbox" name="type" value="viagra" checked><br>
|
||||
Stock spam:
|
||||
<input type="checkbox" name="type" value="stock"><br>
|
||||
Other spam:
|
||||
<input type="checkbox" name="type" value="other" checked><br>
|
||||
<input type="submit" value="Spam!">
|
||||
</form>
|
||||
|
||||
FIXME: I need to test more of this. But I'm lazy and want to use the
|
||||
coverage report for some of this.
|
||||
|
||||
|
||||
This module also allows you to add error messages to the form. The errors
|
||||
add an "error" class to the input fields, and to their labels if the
|
||||
fields have labels. It also inserts an error message into the form, using
|
||||
a function you can provide (or the default function).
|
||||
|
||||
Example::
|
||||
|
||||
>>> from lxml.html.formfill import insert_errors_html
|
||||
>>> print(insert_errors_html('''
|
||||
... <form>
|
||||
... <fieldset id="fieldset">
|
||||
... <input name="v1"><br>
|
||||
... <label for="v2">label</label>
|
||||
... <input name="v2" id="v2"><br>
|
||||
... </fieldset>
|
||||
... <input name="v3" class="foo">
|
||||
... <input name="v3" class="foo">
|
||||
... <input name="v4">
|
||||
... <input name="v4">
|
||||
... </form>''', {
|
||||
... 'v1': "err1",
|
||||
... 'v2': "err2",
|
||||
... 'v3': [None, "err3-2"],
|
||||
... 'v4': "err4",
|
||||
... None: 'general error',
|
||||
... '#fieldset': 'area error',
|
||||
... }))
|
||||
<form>
|
||||
<div class="error-message error-block">general error</div>
|
||||
<fieldset id="fieldset" class="error">
|
||||
<div class="error-message error-block">area error</div>
|
||||
<div class="error-message">err1</div>
|
||||
<input name="v1" class="error"><br>
|
||||
<label for="v2" class="error">label</label>
|
||||
<div class="error-message">err2</div>
|
||||
<input name="v2" id="v2" class="error"><br>
|
||||
</fieldset>
|
||||
<input name="v3" class="foo">
|
||||
<div class="error-message">err3-2</div>
|
||||
<input name="v3" class="foo error">
|
||||
<div class="error-message">err4</div>
|
||||
<input name="v4" class="error">
|
||||
<input name="v4">
|
||||
</form>
|
||||
|
||||
|
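A sketch of swapping in your own error markup, assuming the
``error_creator`` keyword these functions accept (the hook is called with
the element, a flag for block-level placement, and the message; the <em>
markup below is hypothetical)::

from lxml.html.formfill import insert_errors_html

def my_error_creator(el, is_block, message):
    # hypothetical replacement for the default <div class="error-message">
    em = el.makeelement('em', {'class': 'error-message'})
    em.text = message
    el.addprevious(em)

print(insert_errors_html('<form><input name="v1"></form>',
                         {'v1': 'required'},
                         error_creator=my_error_creator))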
||||
REGRESSION: When filling textareas, the "name" attribute used to
|
||||
be removed. The "name" attribute should be kept::
|
||||
|
||||
>>> print(fill_form_html('''
|
||||
... <form>
|
||||
... <textarea name="foo">Initial value</textarea>
|
||||
... </form>''', dict(foo="Bar")))
|
||||
<form>
|
||||
<textarea name="foo">Bar</textarea>
|
||||
</form>
|
11
lib/lxml/html/tests/test_forms.py
Normal file
|
@@ -0,0 +1,11 @@
|
|||
import unittest, sys
|
||||
from lxml.tests.common_imports import make_doctest
|
||||
|
||||
def test_suite():
|
||||
suite = unittest.TestSuite()
|
||||
if sys.version_info >= (2,4):
|
||||
suite.addTests([make_doctest('test_forms.txt')])
|
||||
return suite
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
195
lib/lxml/html/tests/test_forms.txt
Normal file
|
@@ -0,0 +1,195 @@
|
|||
>>> from lxml.html import usedoctest
|
||||
>>> from lxml.html import fromstring, tostring
|
||||
>>> h = fromstring('''<html><body>
|
||||
... <form action="test">
|
||||
... <input type="hidden" name="hidden_field" value="hidden_value">
|
||||
... <input type="text" name="text_field" value="text_value">
|
||||
... <input type="checkbox" name="single_checkbox">
|
||||
... <input type="checkbox" name="single_checkbox2" value="good">
|
||||
... <input type="checkbox" name="check_group" value="1">
|
||||
... <input type="checkbox" name="check_group" value="2" checked>
|
||||
... <input type="checkbox" name="check_group" value="3" checked>
|
||||
... <input type="checkbox" name="check_group" value="4">
|
||||
... <textarea name="textarea_field">some text</textarea>
|
||||
... <label for="value1">value 1</label>
|
||||
... <input type="radio" name="radios" value="value1" id="value1">
|
||||
... <label for="value2">value 2</label>
|
||||
... <input type="radio" name="radios" value="value2" id="value2">
|
||||
... <label for="value3">value 3</label>
|
||||
... <input type="radio" name="radios" value="value3" id="value3" checked>
|
||||
... <select name="select1">
|
||||
... <option> No value </option>
|
||||
... <option value="">Empty</option>
|
||||
... <option value="1">number 1</option>
|
||||
... </select>
|
||||
... <select name="select2" multiple>
|
||||
... <option value="1">number 1</option>
|
||||
... <option value="2">number 2</option>
|
||||
... <option value="3">number 3</option>
|
||||
... <option>number 4</option>
|
||||
... </select>
|
||||
... <input type="submit" name="submit1" value="submit">
|
||||
... <input type="submit" name="submit2" value="submit">
|
||||
... <input type="reset" name="reset1">linksys
|
||||
... </form>
|
||||
... </body></html>''', base_url='http://example.org/form.html')
|
||||
>>> h.base_url
|
||||
u'http://example.org/form.html'
|
||||
>>> f = h.forms[0]
|
||||
>>> f.action
|
||||
u'http://example.org/test'
|
||||
>>> f.method
|
||||
'GET'
|
||||
>>> f.inputs # doctest:+NOPARSE_MARKUP
|
||||
<InputGetter for form 0>
|
||||
>>> hidden = f.inputs['hidden_field']
|
||||
>>> hidden.checkable
|
||||
False
|
||||
>>> hidden.value
|
||||
'hidden_value'
|
||||
>>> hidden.value = 'new value'
|
||||
>>> tostring(hidden, with_tail=False)
|
||||
b'<input type="hidden" name="hidden_field" value="new value">'
|
||||
>>> checkbox = f.inputs['single_checkbox']
|
||||
>>> checkbox.checkable
|
||||
True
|
||||
>>> checkbox.type
|
||||
'checkbox'
|
||||
>>> checkbox.checked
|
||||
False
|
||||
>>> print(checkbox.value)
|
||||
None
|
||||
>>> checkbox.checked = True
|
||||
>>> checkbox.value
|
||||
'on'
|
||||
>>> tostring(checkbox, with_tail=False)
|
||||
b'<input type="checkbox" name="single_checkbox" checked>'
|
||||
>>> checkbox2 = f.inputs['single_checkbox2']
|
||||
>>> checkbox2.checked = True
|
||||
>>> checkbox2.value
|
||||
'good'
|
||||
>>> group = f.inputs['check_group']
|
||||
>>> group.value # doctest:+NOPARSE_MARKUP
|
||||
<CheckboxValues {'2', '3'} for checkboxes name='check_group'>
|
||||
>>> group.value.add('1')
|
||||
>>> group.value # doctest:+NOPARSE_MARKUP
|
||||
<CheckboxValues {'1', '2', '3'} for checkboxes name='check_group'>
|
||||
>>> tostring(group[0], with_tail=False)
|
||||
b'<input type="checkbox" name="check_group" value="1" checked>'
|
||||
>>> group.value_options
|
||||
['1', '2', '3', '4']
|
||||
>>> group.value.add('doesnotexist')
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
KeyError: "No checkbox with value 'doesnotexist'"
|
||||
>>> textarea = f.inputs['textarea_field']
|
||||
>>> textarea.value
|
||||
'some text'
|
||||
>>> radios = f.inputs['radios']
|
||||
>>> radios[0].label.text
|
||||
'value 1'
|
||||
>>> radios.value
|
||||
'value3'
|
||||
>>> radios.value = 'value1'
|
||||
>>> radios.value
|
||||
'value1'
|
||||
>>> tostring(radios[0], with_tail=False)
|
||||
b'<input type="radio" name="radios" value="value1" id="value1" checked>'
|
||||
>>> radios.value = None
|
||||
>>> tostring(radios[0], with_tail=False)
|
||||
b'<input type="radio" name="radios" value="value1" id="value1">'
|
||||
>>> radios.value_options
|
||||
['value1', 'value2', 'value3']
|
||||
>>> select = f.inputs['select1']
|
||||
>>> print(select.value)
|
||||
None
|
||||
>>> select.value = ""
|
||||
>>> select.value
|
||||
''
|
||||
>>> select.value = 'asdf'
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: There is no option with the value of 'asdf'
|
||||
>>> select.value_options
|
||||
['No value', '', '1']
|
||||
>>> select.value = 'No value'
|
||||
>>> select.value
|
||||
'No value'
|
||||
>>> select = f.inputs['select2']
|
||||
>>> select.value # doctest:+NOPARSE_MARKUP
|
||||
<MultipleSelectOptions {} for select name='select2'>
|
||||
>>> select.value.update(['2', '3'])
|
||||
>>> select.value # doctest:+NOPARSE_MARKUP
|
||||
<MultipleSelectOptions {'2', '3'} for select name='select2'>
|
||||
>>> select.value.remove('3')
|
||||
>>> select.value.add('asdf')
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: There is no option with the value 'asdf'
|
||||
>>> select.value.add('number 4')
|
||||
>>> select.value # doctest:+NOPARSE_MARKUP
|
||||
<MultipleSelectOptions {'2', 'number 4'} for select name='select2'>
|
||||
>>> select.value.remove('number 4')
|
||||
>>> select.value_options
|
||||
['1', '2', '3', 'number 4']
|
||||
>>> try: from urllib import urlencode
|
||||
... except ImportError: from urllib.parse import urlencode
|
||||
>>> print(urlencode(f.form_values()))
|
||||
hidden_field=new+value&text_field=text_value&single_checkbox=on&single_checkbox2=good&check_group=1&check_group=2&check_group=3&textarea_field=some+text&select1=No+value&select2=2
|
||||
>>> fields = f.fields
|
||||
>>> fields # doctest:+NOPARSE_MARKUP
|
||||
<FieldsDict for form 0>
|
||||
>>> for name, value in sorted(fields.items()):
|
||||
... print('%s: %r' % (name, value))
|
||||
check_group: <CheckboxValues {'1', '2', '3'} for checkboxes name='check_group'>
|
||||
hidden_field: 'new value'
|
||||
radios: None
|
||||
reset1: None
|
||||
select1: 'No value'
|
||||
select2: <MultipleSelectOptions {'2'} for select name='select2'>
|
||||
single_checkbox: 'on'
|
||||
single_checkbox2: 'good'
|
||||
submit1: 'submit'
|
||||
submit2: 'submit'
|
||||
text_field: 'text_value'
|
||||
textarea_field: 'some text'
|
||||
|
||||
>>> import lxml.html
|
||||
>>> tree = lxml.html.fromstring('''
|
||||
... <html><body>
|
||||
... <form>
|
||||
... <input name="foo" value="bar"/>
|
||||
... <input type="submit" />
|
||||
... </form>
|
||||
... </body></html>
|
||||
... ''')
|
||||
>>> tree # doctest: +ELLIPSIS
|
||||
<Element html at ...>
|
||||
>>> tree.forms[0] # doctest: +ELLIPSIS
|
||||
<Element form at ...>
|
||||
>>> tree.forms[0].fields # doctest: +NOPARSE_MARKUP
|
||||
<FieldsDict for form 0>
|
||||
>>> list(tree.forms[0].fields.keys())
|
||||
['foo']
|
||||
>>> list(tree.forms[0].fields.items())
|
||||
[('foo', 'bar')]
|
||||
>>> list(tree.forms[0].fields.values())
|
||||
['bar']
|
||||
|
||||
>>> tree = lxml.html.fromstring('''
|
||||
... <html><body>
|
||||
... <form>
|
||||
... <textarea name="foo">some <b>text<br>content</b> with tags</textarea>
|
||||
... </form>
|
||||
... </body></html>
|
||||
... ''')
|
||||
>>> list(tree.forms[0].fields.keys())
|
||||
['foo']
|
||||
>>> ta = tree.forms[0].inputs['foo']
|
||||
>>> print(ta.value)
|
||||
some <b>text<br>content</b> with tags
|
||||
>>> ta.value = 'abc<br>def'
|
||||
>>> print(ta.value)
|
||||
abc<br>def
|
||||
>>> len(ta)
|
||||
0
|
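Forms parsed this way can also be submitted. A sketch using
``lxml.html.submit_form``, whose optional ``open_http`` argument supplies
the transport (``show_request`` below is a hypothetical stand-in that just
prints what would be sent)::

import lxml.html

def show_request(method, url, values):
    # hypothetical transport: report the request instead of opening it
    print(method, url, values)

page = lxml.html.fromstring(
    '<html><body><form action="http://example.org/test">'
    '<input name="q" value="x"></form></body></html>')
lxml.html.submit_form(page.forms[0], open_http=show_request)
# prints roughly: GET http://example.org/test [('q', 'x')]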
36
lib/lxml/html/tests/test_frames.py
Normal file
|
@@ -0,0 +1,36 @@
|
|||
import unittest, sys
|
||||
from lxml.tests.common_imports import make_doctest, doctest
|
||||
import lxml.html
|
||||
from lxml.html import html_parser, XHTML_NAMESPACE
|
||||
|
||||
class FrameTest(unittest.TestCase):
|
||||
|
||||
def test_parse_fragments_fromstring(self):
|
||||
parser = lxml.html.HTMLParser(encoding='utf-8', remove_comments=True)
|
||||
html = """<frameset>
|
||||
<frame src="main.php" name="srcpg" id="srcpg" frameborder="0" rolling="Auto" marginwidth="" marginheight="0">
|
||||
</frameset>"""
|
||||
etree_document = lxml.html.fragments_fromstring(html, parser=parser)
|
||||
self.assertEqual(len(etree_document), 1)
|
||||
root = etree_document[0]
|
||||
self.assertEqual(root.tag, "frameset")
|
||||
frame_element = root[0]
|
||||
self.assertEqual(frame_element.tag, 'frame')
|
||||
|
||||
def test_parse_fromstring(self):
|
||||
parser = lxml.html.HTMLParser(encoding='utf-8', remove_comments=True)
|
||||
html = """<html><frameset>
|
||||
<frame src="main.php" name="srcpg" id="srcpg" frameborder="0" rolling="Auto" marginwidth="" marginheight="0">
|
||||
</frameset></html>"""
|
||||
etree_document = lxml.html.fromstring(html, parser=parser)
|
||||
self.assertEqual(etree_document.tag, 'html')
|
||||
self.assertEqual(len(etree_document), 1)
|
||||
frameset_element = etree_document[0]
|
||||
self.assertEqual(len(frameset_element), 1)
|
||||
frame_element = frameset_element[0]
|
||||
self.assertEqual(frame_element.tag, 'frame')
|
||||
|
||||
|
||||
def test_suite():
|
||||
loader = unittest.TestLoader()
|
||||
return loader.loadTestsFromModule(sys.modules[__name__])
|
429
lib/lxml/html/tests/test_html5parser.py
Normal file
|
@@ -0,0 +1,429 @@
|
|||
import os
|
||||
import imp
|
||||
try:
|
||||
from StringIO import StringIO
|
||||
except ImportError: # python 3
|
||||
from io import StringIO
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
try:
|
||||
from unittest import skipUnless
|
||||
except ImportError:
|
||||
# sys.version < (2, 7)
|
||||
def skipUnless(condition, reason):
|
||||
return lambda f: condition and f or None
|
||||
|
||||
if sys.version_info < (2,6):
|
||||
class NamedTemporaryFile(object):
|
||||
def __init__(self, delete=True, **kwargs):
|
||||
self._tmpfile = tempfile.NamedTemporaryFile(**kwargs)
|
||||
def close(self):
|
||||
self._tmpfile.flush()
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._tmpfile, name)
|
||||
else:
|
||||
NamedTemporaryFile = tempfile.NamedTemporaryFile
|
||||
|
||||
from lxml.builder import ElementMaker
|
||||
from lxml.etree import Element, ElementTree, ParserError
|
||||
from lxml.html import html_parser, XHTML_NAMESPACE
|
||||
|
||||
try:
|
||||
import urlparse
|
||||
except ImportError:
|
||||
import urllib.parse as urlparse
|
||||
|
||||
try:
|
||||
from urllib import pathname2url
|
||||
except ImportError:
|
||||
from urllib.request import pathname2url
|
||||
|
||||
|
||||
def path2url(path):
|
||||
return urlparse.urljoin(
|
||||
'file:', pathname2url(path))
|
||||
|
||||
|
||||
try:
|
||||
import html5lib
|
||||
except ImportError:
|
||||
html5lib = None
|
||||
|
||||
class BogusModules(object):
|
||||
# See PEP 302 for details on how this works
|
||||
def __init__(self, mocks):
|
||||
self.mocks = mocks
|
||||
|
||||
def find_module(self, fullname, path=None):
|
||||
if fullname in self.mocks:
|
||||
return self
|
||||
return None
|
||||
|
||||
def load_module(self, fullname):
|
||||
mod = sys.modules.setdefault(fullname, imp.new_module(fullname))
|
||||
mod.__file__, mod.__loader__, mod.__path__ = "<dummy>", self, []
|
||||
mod.__dict__.update(self.mocks[fullname])
|
||||
return mod
|
||||
|
||||
# Fake just enough of html5lib so that html5parser.py is importable
|
||||
# without errors.
|
||||
sys.meta_path.append(BogusModules({
|
||||
'html5lib': {
|
||||
# A do-nothing HTMLParser class
|
||||
'HTMLParser': type('HTMLParser', (object,), {
|
||||
'__init__': lambda self, **kw: None,
|
||||
}),
|
||||
},
|
||||
'html5lib.treebuilders': {
|
||||
},
|
||||
'html5lib.treebuilders.etree_lxml': {
|
||||
'TreeBuilder': 'dummy treebuilder',
|
||||
},
|
||||
}))
|
||||
|
||||
|
||||
class Test_HTMLParser(unittest.TestCase):
|
||||
def make_one(self, **kwargs):
|
||||
from lxml.html.html5parser import HTMLParser
|
||||
return HTMLParser(**kwargs)
|
||||
|
||||
@skipUnless(html5lib, 'html5lib is not installed')
|
||||
def test_integration(self):
|
||||
parser = self.make_one(strict=True)
|
||||
tree = parser.parse(XHTML_TEST_DOCUMENT)
|
||||
root = tree.getroot()
|
||||
self.assertEqual(root.tag, xhtml_tag('html'))
|
||||
|
||||
|
||||
class Test_XHTMLParser(unittest.TestCase):
|
||||
def make_one(self, **kwargs):
|
||||
from lxml.html.html5parser import XHTMLParser
|
||||
return XHTMLParser(**kwargs)
|
||||
|
||||
@skipUnless(hasattr(html5lib, 'XHTMLParser'),
|
||||
'xhtml5lib does not have XHTMLParser')
|
||||
def test_integration(self):
|
||||
# XXX: This test is untested. (html5lib no longer has an XHTMLParser)
|
||||
parser = self.make_one(strict=True)
|
||||
tree = parser.parse(XHTML_TEST_DOCUMENT)
|
||||
root = tree.getroot()
|
||||
self.assertEqual(root.tag, xhtml_tag('html'))
|
||||
|
||||
|
||||
class Test_document_fromstring(unittest.TestCase):
|
||||
def call_it(self, *args, **kwargs):
|
||||
from lxml.html.html5parser import document_fromstring
|
||||
return document_fromstring(*args, **kwargs)
|
||||
|
||||
def test_basic(self):
|
||||
parser = DummyParser(doc=DummyElementTree(root='dummy root'))
|
||||
elem = self.call_it('dummy input', parser=parser)
|
||||
self.assertEqual(elem, 'dummy root')
|
||||
self.assertEqual(parser.parse_args, ('dummy input',))
|
||||
self.assertEqual(parser.parse_kwargs, {'useChardet': True})
|
||||
|
||||
def test_guess_charset_arg_gets_passed_to_parser(self):
|
||||
parser = DummyParser()
|
||||
elem = self.call_it('', guess_charset='gc_arg', parser=parser)
|
||||
self.assertEqual(parser.parse_kwargs, {'useChardet': 'gc_arg'})
|
||||
|
||||
def test_raises_type_error_on_nonstring_input(self):
|
||||
not_a_string = None
|
||||
self.assertRaises(TypeError, self.call_it, not_a_string)
|
||||
|
||||
@skipUnless(html5lib, 'html5lib is not installed')
|
||||
def test_integration(self):
|
||||
elem = self.call_it(XHTML_TEST_DOCUMENT)
|
||||
self.assertEqual(elem.tag, xhtml_tag('html'))
|
||||
|
||||
|
||||
class Test_fragments_fromstring(unittest.TestCase):
|
||||
def call_it(self, *args, **kwargs):
|
||||
from lxml.html.html5parser import fragments_fromstring
|
||||
return fragments_fromstring(*args, **kwargs)
|
||||
|
||||
def test_basic(self):
|
||||
parser = DummyParser(fragments='fragments')
|
||||
fragments = self.call_it('dummy input', parser=parser)
|
||||
self.assertEqual(fragments, 'fragments')
|
||||
|
||||
def test_guess_charset_arg_gets_passed_to_parser(self):
|
||||
parser = DummyParser()
|
||||
elem = self.call_it('', guess_charset='gc_arg', parser=parser)
|
||||
self.assertEqual(parser.parseFragment_kwargs, {'useChardet': 'gc_arg'})
|
||||
|
||||
def test_raises_type_error_on_nonstring_input(self):
|
||||
not_a_string = None
|
||||
self.assertRaises(TypeError, self.call_it, not_a_string)
|
||||
|
||||
def test_no_leading_text_strips_empty_leading_text(self):
|
||||
parser = DummyParser(fragments=['', 'tail'])
|
||||
fragments = self.call_it('', parser=parser, no_leading_text=True)
|
||||
self.assertEqual(fragments, ['tail'])
|
||||
|
||||
def test_no_leading_text_raises_error_if_leading_text(self):
|
||||
parser = DummyParser(fragments=['leading text', 'tail'])
|
||||
self.assertRaises(ParserError, self.call_it,
|
||||
'', parser=parser, no_leading_text=True)
|
||||
|
||||
@skipUnless(html5lib, 'html5lib is not installed')
|
||||
def test_integration(self):
|
||||
fragments = self.call_it('a<b>c</b>')
|
||||
self.assertEqual(len(fragments), 2)
|
||||
self.assertEqual(fragments[0], 'a')
|
||||
self.assertEqual(fragments[1].tag, xhtml_tag('b'))
|
||||
|
||||
|
||||
class Test_fragment_fromstring(unittest.TestCase):
|
||||
def call_it(self, *args, **kwargs):
|
||||
from lxml.html.html5parser import fragment_fromstring
|
||||
return fragment_fromstring(*args, **kwargs)
|
||||
|
||||
def test_basic(self):
|
||||
element = DummyElement()
|
||||
parser = DummyParser(fragments=[element])
|
||||
self.assertEqual(self.call_it('html', parser=parser), element)
|
||||
|
||||
def test_raises_type_error_on_nonstring_input(self):
|
||||
not_a_string = None
|
||||
self.assertRaises(TypeError, self.call_it, not_a_string)
|
||||
|
||||
def test_create_parent(self):
|
||||
parser = DummyParser(fragments=['head', Element('child')])
|
||||
elem = self.call_it('html', parser=parser, create_parent='parent')
|
||||
self.assertEqual(elem.tag, 'parent')
|
||||
self.assertEqual(elem.text, 'head')
|
||||
self.assertEqual(elem[0].tag, 'child')
|
||||
|
||||
def test_create_parent_default_type_no_ns(self):
|
||||
parser = DummyParser(fragments=[], namespaceHTMLElements=False)
|
||||
elem = self.call_it('html', parser=parser, create_parent=True)
|
||||
self.assertEqual(elem.tag, 'div')
|
||||
|
||||
def test_raises_error_on_leading_text(self):
|
||||
parser = DummyParser(fragments=['leading text'])
|
||||
self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
|
||||
|
||||
def test_raises_error_if_no_elements_found(self):
|
||||
parser = DummyParser(fragments=[])
|
||||
self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
|
||||
|
||||
def test_raises_error_if_multiple_elements_found(self):
|
||||
parser = DummyParser(fragments=[DummyElement(), DummyElement()])
|
||||
self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
|
||||
|
||||
def test_raises_error_if_tail(self):
|
||||
parser = DummyParser(fragments=[DummyElement(tail='tail')])
|
||||
self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
|
||||
|
||||
|
||||
class Test_fromstring(unittest.TestCase):
|
||||
def call_it(self, *args, **kwargs):
|
||||
from lxml.html.html5parser import fromstring
|
||||
return fromstring(*args, **kwargs)
|
||||
|
||||
def test_returns_whole_doc_if_input_contains_html_tag(self):
|
||||
parser = DummyParser(root='the doc')
|
||||
self.assertEqual(self.call_it('<html></html>', parser=parser),
|
||||
'the doc')
|
||||
|
||||
def test_returns_whole_doc_if_input_contains_doctype(self):
|
||||
parser = DummyParser(root='the doc')
|
||||
self.assertEqual(self.call_it('<!DOCTYPE html>', parser=parser),
|
||||
'the doc')
|
||||
|
||||
def test_returns_whole_doc_if_head_not_empty(self, use_ns=True):
|
||||
E = HTMLElementMaker(namespaceHTMLElements=use_ns)
|
||||
root = E.html(E.head(E.title()))
|
||||
parser = DummyParser(root=root)
|
||||
self.assertEqual(self.call_it('', parser=parser), root)
|
||||
|
||||
def test_returns_whole_doc_if_head_not_empty_no_ns(self):
|
||||
self.test_returns_whole_doc_if_head_not_empty(use_ns=False)
|
||||
|
||||
def test_returns_unwraps_body_if_single_element(self):
|
||||
E = HTMLElementMaker()
|
||||
elem = E.p('test')
|
||||
root = E.html(E.head(), E.body(elem))
|
||||
parser = DummyParser(root=root)
|
||||
self.assertEqual(self.call_it('', parser=parser), elem)
|
||||
|
||||
def test_returns_body_if_has_text(self):
|
||||
E = HTMLElementMaker()
|
||||
elem = E.p('test')
|
||||
body = E.body('text', elem)
|
||||
root = E.html(E.head(), body)
|
||||
parser = DummyParser(root=root)
|
||||
self.assertEqual(self.call_it('', parser=parser), body)
|
||||
|
||||
def test_returns_body_if_single_element_has_tail(self):
|
||||
E = HTMLElementMaker()
|
||||
elem = E.p('test')
|
||||
elem.tail = 'tail'
|
||||
body = E.body(elem)
|
||||
root = E.html(E.head(), body)
|
||||
parser = DummyParser(root=root)
|
||||
self.assertEqual(self.call_it('', parser=parser), body)
|
||||
|
||||
def test_wraps_multiple_fragments_in_div_no_ns(self):
|
||||
E = HTMLElementMaker(namespaceHTMLElements=False)
|
||||
parser = DummyParser(root=E.html(E.head(), E.body(E.h1(), E.p())),
|
||||
namespaceHTMLElements=False)
|
||||
elem = self.call_it('', parser=parser)
|
||||
self.assertEqual(elem.tag, 'div')
|
||||
|
||||
def test_wraps_multiple_fragments_in_span_no_ns(self):
|
||||
E = HTMLElementMaker(namespaceHTMLElements=False)
|
||||
parser = DummyParser(root=E.html(E.head(), E.body('foo', E.a('link'))),
|
||||
namespaceHTMLElements=False)
|
||||
elem = self.call_it('', parser=parser)
|
||||
self.assertEqual(elem.tag, 'span')
|
||||
|
||||
def test_raises_type_error_on_nonstring_input(self):
|
||||
not_a_string = None
|
||||
self.assertRaises(TypeError, self.call_it, not_a_string)
|
||||
|
||||
@skipUnless(html5lib, 'html5lib is not installed')
|
||||
def test_integration_whole_doc(self):
|
||||
elem = self.call_it(XHTML_TEST_DOCUMENT)
|
||||
self.assertEqual(elem.tag, xhtml_tag('html'))
|
||||
|
||||
@skipUnless(html5lib, 'html5lib is not installed')
|
||||
def test_integration_single_fragment(self):
|
||||
elem = self.call_it('<p></p>')
|
||||
self.assertEqual(elem.tag, xhtml_tag('p'))
|
||||
|
||||
|
||||
class Test_parse(unittest.TestCase):
|
||||
def call_it(self, *args, **kwargs):
|
||||
from lxml.html.html5parser import parse
|
||||
return parse(*args, **kwargs)
|
||||
|
||||
def make_temp_file(self, contents=''):
|
||||
tmpfile = NamedTemporaryFile(delete=False)
|
||||
try:
|
||||
tmpfile.write(contents.encode('utf8'))
|
||||
tmpfile.flush()
|
||||
tmpfile.seek(0)
|
||||
return tmpfile
|
||||
except Exception:
|
||||
try:
|
||||
tmpfile.close()
|
||||
finally:
|
||||
os.unlink(tmpfile.name)
|
||||
raise
|
||||
|
||||
def test_with_file_object(self):
|
||||
parser = DummyParser(doc='the doc')
|
||||
fp = open(__file__)
|
||||
try:
|
||||
self.assertEqual(self.call_it(fp, parser=parser), 'the doc')
|
||||
self.assertEqual(parser.parse_args, (fp,))
|
||||
finally:
|
||||
fp.close()
|
||||
|
||||
def test_with_file_name(self):
|
||||
parser = DummyParser(doc='the doc')
|
||||
tmpfile = self.make_temp_file('data')
|
||||
try:
|
||||
data = tmpfile.read()
|
||||
finally:
|
||||
tmpfile.close()
|
||||
try:
|
||||
self.assertEqual(self.call_it(tmpfile.name, parser=parser), 'the doc')
|
||||
fp, = parser.parse_args
|
||||
try:
|
||||
self.assertEqual(fp.read(), data)
|
||||
finally:
|
||||
fp.close()
|
||||
finally:
|
||||
os.unlink(tmpfile.name)
|
||||
|
||||
def test_with_url(self):
|
||||
parser = DummyParser(doc='the doc')
|
||||
tmpfile = self.make_temp_file('content')
|
||||
try:
|
||||
data = tmpfile.read()
|
||||
finally:
|
||||
tmpfile.close()
|
||||
try:
|
||||
url = path2url(tmpfile.name)
|
||||
self.assertEqual(self.call_it(url, parser=parser), 'the doc')
|
||||
fp, = parser.parse_args
|
||||
try:
|
||||
self.assertEqual(fp.read(), data)
|
||||
finally:
|
||||
fp.close()
|
||||
finally:
|
||||
os.unlink(tmpfile.name)
|
||||
|
||||
@skipUnless(html5lib, 'html5lib is not installed')
|
||||
def test_integration(self):
|
||||
doc = self.call_it(StringIO(XHTML_TEST_DOCUMENT))
|
||||
root = doc.getroot()
|
||||
self.assertEqual(root.tag, xhtml_tag('html'))
|
||||
|
||||
|
||||
def test_suite():
|
||||
loader = unittest.TestLoader()
|
||||
return loader.loadTestsFromModule(sys.modules[__name__])
|
||||
|
||||
|
||||
class HTMLElementMaker(ElementMaker):
|
||||
def __init__(self, namespaceHTMLElements=True):
|
||||
initargs = dict(makeelement=html_parser.makeelement)
|
||||
if namespaceHTMLElements:
|
||||
initargs.update(namespace=XHTML_NAMESPACE,
|
||||
nsmap={None: XHTML_NAMESPACE})
|
||||
ElementMaker.__init__(self, **initargs)
|
||||
|
||||
|
||||
class DummyParser(object):
|
||||
def __init__(self, doc=None, root=None,
|
||||
fragments=None, namespaceHTMLElements=True):
|
||||
self.doc = doc or DummyElementTree(root=root)
|
||||
self.fragments = fragments
|
||||
self.tree = DummyTreeBuilder(namespaceHTMLElements)
|
||||
|
||||
def parse(self, *args, **kwargs):
|
||||
self.parse_args = args
|
||||
self.parse_kwargs = kwargs
|
||||
return self.doc
|
||||
|
||||
def parseFragment(self, *args, **kwargs):
|
||||
self.parseFragment_args = args
|
||||
self.parseFragment_kwargs = kwargs
|
||||
return self.fragments
|
||||
|
||||
|
||||
class DummyTreeBuilder(object):
|
||||
def __init__(self, namespaceHTMLElements=True):
|
||||
self.namespaceHTMLElements = namespaceHTMLElements
|
||||
|
||||
|
||||
class DummyElementTree(object):
|
||||
def __init__(self, root):
|
||||
self.root = root
|
||||
|
||||
def getroot(self):
|
||||
return self.root
|
||||
|
||||
|
||||
class DummyElement(object):
|
||||
def __init__(self, tag='tag', tail=None):
|
||||
self.tag = tag
|
||||
self.tail = tail
|
||||
|
||||
|
||||
def xhtml_tag(tag):
|
||||
return '{%s}%s' % (XHTML_NAMESPACE, tag)
|
||||
|
||||
|
||||
XHTML_TEST_DOCUMENT = '''
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>TITLE</title></head>
|
||||
<body></body>
|
||||
</html>
|
||||
'''
|
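A sketch of the real API these tests mock out, assuming html5lib is
actually installed::

from lxml.html import html5parser

doc = html5parser.document_fromstring('<html><body><p>Hi</p></body></html>')
# html5lib builds namespaced XHTML elements by default
print(doc.tag)   # {http://www.w3.org/1999/xhtml}html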
11
lib/lxml/html/tests/test_rewritelinks.py
Normal file
|
@@ -0,0 +1,11 @@
|
|||
import unittest, sys
|
||||
from lxml.tests.common_imports import make_doctest
|
||||
|
||||
def test_suite():
|
||||
suite = unittest.TestSuite()
|
||||
if sys.version_info >= (2,4):
|
||||
suite.addTests([make_doctest('test_rewritelinks.txt')])
|
||||
return suite
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
245
lib/lxml/html/tests/test_rewritelinks.txt
Normal file
|
@@ -0,0 +1,245 @@
|
|||
|
||||
Setup::
|
||||
|
||||
>>> import lxml.html
|
||||
|
||||
We'll define a link translation function:
|
||||
|
||||
>>> base_href = 'http://old/base/path.html'
|
||||
>>> try: import urlparse
|
||||
... except ImportError: import urllib.parse as urlparse
|
||||
>>> def relocate_href(link):
|
||||
... link = urlparse.urljoin(base_href, link)
|
||||
... if link.startswith('http://old'):
|
||||
... return 'https://new' + link[len('http://old'):]
|
||||
... else:
|
||||
... return link
|
||||
|
||||
Now for content. To make comparisons easier, we use lxml's HTML
|
||||
doctest support to normalize the HTML we get from these functions.
|
||||
|
||||
Some basics::
|
||||
|
||||
>>> from lxml.html import usedoctest, tostring
|
||||
>>> from lxml.html import rewrite_links
|
||||
>>> print(rewrite_links(
|
||||
... '<a href="http://old/blah/blah.html">link</a>', relocate_href))
|
||||
<a href="https://new/blah/blah.html">link</a>
|
||||
>>> print(rewrite_links(
|
||||
... '<script src="http://old/foo.js"></script>', relocate_href))
|
||||
<script src="https://new/foo.js"></script>
|
||||
>>> print(rewrite_links(
|
||||
... '<link href="foo.css">', relocate_href))
|
||||
<link href="https://new/base/foo.css">
|
||||
>>> print(rewrite_links('''\
|
||||
... <base href="http://blah/stuff/index.html">
|
||||
... <link href="foo.css">
|
||||
... <a href="http://old/bar.html">x</a>\
|
||||
... ''', relocate_href))
|
||||
<link href="http://blah/stuff/foo.css">
|
||||
<a href="https://new/bar.html">x</a>
|
||||
|
||||
Links in CSS are also handled::
|
||||
|
||||
>>> print(rewrite_links('''
|
||||
... <style>
|
||||
... body {background-image: url(http://old/image.gif)};
|
||||
... @import "http://old/other-style.css";
|
||||
... </style>''', relocate_href))
|
||||
<html><head><style>
|
||||
body {background-image: url(https://new/image.gif)};
|
||||
@import "https://new/other-style.css";
|
||||
</style></head></html>
|
||||
>>> print(rewrite_links('''
|
||||
... <style>
|
||||
... body {background-image: url("http://old/image.gif")};
|
||||
... @import "http://old/other-style.css";
|
||||
... </style>''', relocate_href))
|
||||
<html><head><style>
|
||||
body {background-image: url("https://new/image.gif")};
|
||||
@import "https://new/other-style.css";
|
||||
</style></head></html>
|
||||
|
||||
Those links in style attributes are also rewritten::
|
||||
|
||||
>>> print(rewrite_links('''
|
||||
... <div style="background-image: url(http://old/image.gif)">text</div>
|
||||
... ''', relocate_href))
|
||||
<div style="background-image: url(https://new/image.gif)">text</div>
|
||||
|
||||
The ``<base href>`` tag is also respected (but also removed)::
|
||||
|
||||
>>> print(rewrite_links('''
|
||||
... <html><head>
|
||||
... <base href="http://old/">
|
||||
... </head>
|
||||
... <body>
|
||||
... <a href="foo.html">link</a>
|
||||
... </body></html>''', relocate_href))
|
||||
<html>
|
||||
<head></head>
|
||||
<body>
|
||||
<a href="https://new/foo.html">link</a>
|
||||
</body>
|
||||
</html>
|
||||
|
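The same ``<base href>`` handling is available on its own; a sketch using
``resolve_base_href``, which folds the base URL into every link and drops
the ``<base>`` tag::

from lxml.html import resolve_base_href

print(resolve_base_href(
    '<html><head><base href="http://old/"></head>'
    '<body><a href="foo.html">link</a></body></html>'))
# gives roughly: ... <a href="http://old/foo.html">link</a> ...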
||||
The ``iterlinks`` method (and function) gives you all the links in
|
||||
the document, along with the element and attribute the link comes
|
||||
from. This makes it fairly easy to see what resources the document
|
||||
references or embeds (an ``<a>`` tag is a reference, an ``<img>`` tag
|
||||
is something embedded). It returns a generator of ``(element, attrib,
|
||||
link, pos)`` tuples, which is awkward to test here, so we'll make a printer::
|
||||
|
||||
>>> from lxml.html import iterlinks, document_fromstring, tostring
|
||||
>>> def print_iter(seq):
|
||||
... for element, attrib, link, pos in seq:
|
||||
... if pos:
|
||||
... extra = '@%s' % pos
|
||||
... else:
|
||||
... extra = ''
|
||||
... print('%s %s="%s"%s' % (element.tag, attrib, link, extra))
|
||||
>>> print_iter(iterlinks('''
|
||||
... <html>
|
||||
... <head>
|
||||
... <link rel="stylesheet" href="style.css">
|
||||
... <style type="text/css">
|
||||
... body {
|
||||
... background-image: url(/bg.gif);
|
||||
... }
|
||||
... @import "/other-styles.css";
|
||||
... </style>
|
||||
... <script src="/js-funcs.js"></script>
|
||||
... </head>
|
||||
... <body>
|
||||
... <table>
|
||||
... <tr><td><ul>
|
||||
... <li><a href="/test.html">Test stuff</a></li>
|
||||
... <li><a href="/other.html">Other stuff</a></li>
|
||||
... </td></tr>
|
||||
... <td style="background-image: url(/td-bg.png)">
|
||||
... <img src="/logo.gif">
|
||||
... Hi world!
|
||||
... </td></tr>
|
||||
... </table>
|
||||
... </body></html>'''))
|
||||
link href="style.css"
|
||||
style None="/other-styles.css"@69
|
||||
style None="/bg.gif"@40
|
||||
script src="/js-funcs.js"
|
||||
a href="/test.html"
|
||||
a href="/other.html"
|
||||
td style="/td-bg.png"@22
|
||||
img src="/logo.gif"
|
||||
|
||||
An application of ``iterlinks()`` is ``make_links_absolute()``::
|
||||
|
||||
>>> from lxml.html import make_links_absolute
|
||||
>>> print(make_links_absolute('''
|
||||
... <html>
|
||||
... <head>
|
||||
... <link rel="stylesheet" href="style.css">
|
||||
... <style type="text/css">
|
||||
... body {
|
||||
... background-image: url(/bg.gif);
|
||||
... }
|
||||
... @import "/other-styles.css";
|
||||
... </style>
|
||||
... <script src="/js-funcs.js"></script>
|
||||
... </head>
|
||||
... <body>
|
||||
... <table>
|
||||
... <tr><td><ul>
|
||||
... <li><a href=" /test.html">Test stuff</a></li>
|
||||
... <li><a href="/other.html ">Other stuff</a></li>
|
||||
... </td></tr>
|
||||
... <tr><td style="background-image: url( /td-bg.png )">
|
||||
... <img src="logo.gif">
|
||||
... Hi world!
|
||||
... </td></tr>
|
||||
... </table>
|
||||
... </body></html>''',
|
||||
... base_url="http://my.little.server/url/"))
|
||||
<html>
|
||||
<head>
|
||||
<link rel="stylesheet" href="http://my.little.server/url/style.css">
|
||||
<style type="text/css">
|
||||
body {
|
||||
background-image: url(http://my.little.server/bg.gif);
|
||||
}
|
||||
@import "http://my.little.server/other-styles.css";
|
||||
</style>
|
||||
<script src="http://my.little.server/js-funcs.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
<table>
|
||||
<tr><td><ul>
|
||||
<li><a href="http://my.little.server/test.html">Test stuff</a></li>
|
||||
<li><a href="http://my.little.server/other.html">Other stuff</a></li>
|
||||
</ul></td></tr>
|
||||
<tr>
|
||||
<td style="background-image: url(http://my.little.server/td-bg.png)">
|
||||
<img src="http://my.little.server/url/logo.gif">
|
||||
Hi world!
|
||||
</td></tr>
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
### Test disabled to support Py2.6 and earlier
|
||||
#If the document contains invalid links, you may choose to "discard" or "ignore"
|
||||
#them by passing the respective option into the ``handle_failures`` argument::
|
||||
#
|
||||
# >>> html = lxml.html.fromstring ('''\
|
||||
# ... <html><body><div>
|
||||
# ... <a href="http://fancybase.com]Buy">test2</a>
|
||||
# ... </div></body></html>''')
|
||||
#
|
||||
# >>> html.make_links_absolute(base_url="http://my.little.server/url/",
|
||||
# ... handle_failures="discard")
|
||||
#
|
||||
# >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
|
||||
# <html><body><div>
|
||||
# <a>test2</a>
|
||||
# </div></body></html>
|
||||
|
||||
Check that we can replace multiple links inside the same text string::
|
||||
|
||||
>>> html = lxml.html.fromstring ("""\
|
||||
... <html>
|
||||
... <head>
|
||||
... <title>Test</title>
|
||||
... <style type='text/css'>
|
||||
... .bg1 {
|
||||
... background: url(images/bg1.png);
|
||||
... }
|
||||
... .bg2 {
|
||||
... background: url(images/bg2.png);
|
||||
... }
|
||||
... </style>
|
||||
... </head>
|
||||
... <body>
|
||||
... <p>Hi</p>
|
||||
... </body>
|
||||
... </html>
|
||||
... """,
|
||||
... base_url = 'http://www.example.com/')
|
||||
|
||||
>>> html.make_links_absolute ()
|
||||
|
||||
>>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
|
||||
<html>
|
||||
<head>
|
||||
<title>Test</title>
|
||||
<style type="text/css">
|
||||
.bg1 {
|
||||
background: url(http://www.example.com/images/bg1.png);
|
||||
}
|
||||
.bg2 {
|
||||
background: url(http://www.example.com/images/bg2.png);
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<p>Hi</p>
|
||||
</body>
|
||||
</html>
|
11
lib/lxml/html/tests/test_xhtml.py
Normal file
|
@@ -0,0 +1,11 @@
|
|||
import unittest, sys
|
||||
from lxml.tests.common_imports import make_doctest
|
||||
import lxml.html
|
||||
|
||||
def test_suite():
|
||||
suite = unittest.TestSuite()
|
||||
suite.addTests([make_doctest('test_xhtml.txt')])
|
||||
return suite
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
30
lib/lxml/html/tests/test_xhtml.txt
Normal file
|
@@ -0,0 +1,30 @@
|
|||
>>> from lxml.html import document_fromstring, fragment_fromstring, tostring
|
||||
|
||||
lxml.html has two parsers, one for HTML, one for XHTML:
|
||||
|
||||
>>> from lxml.html import HTMLParser, XHTMLParser
|
||||
>>> html = "<html><body><p>Hi!</p></body></html>"
|
||||
|
||||
>>> root = document_fromstring(html, parser=HTMLParser())
|
||||
>>> print(root.tag)
|
||||
html
|
||||
|
||||
>>> root = document_fromstring(html, parser=XHTMLParser())
|
||||
>>> print(root.tag)
|
||||
html
|
||||
|
||||
There are two functions for converting between HTML and XHTML:
|
||||
|
||||
>>> from lxml.html import xhtml_to_html, html_to_xhtml
|
||||
|
||||
>>> doc = document_fromstring(html, parser=HTMLParser())
|
||||
>>> tostring(doc)
|
||||
b'<html><body><p>Hi!</p></body></html>'
|
||||
|
||||
>>> html_to_xhtml(doc)
|
||||
>>> tostring(doc)
|
||||
b'<html:html xmlns:html="http://www.w3.org/1999/xhtml"><html:body><html:p>Hi!</html:p></html:body></html:html>'
|
||||
|
||||
>>> xhtml_to_html(doc)
|
||||
>>> tostring(doc)
|
||||
b'<html xmlns:html="http://www.w3.org/1999/xhtml"><body><p>Hi!</p></body></html>'
|
110
lib/lxml/html/tests/transform_feedparser_data.py
Normal file
|
@@ -0,0 +1,110 @@
|
|||
"""
|
||||
This takes the feedparser tests from here:
|
||||
|
||||
http://feedparser.org/tests/wellformed/sanitize/
|
||||
|
||||
and rewrites them to be easier to handle (not using the internal model
|
||||
of feedparser). The input format is::
|
||||
|
||||
<!--
|
||||
Description: {description}
|
||||
Expect: {expression}
|
||||
-->
|
||||
...
|
||||
<content ...>{content}</content>
|
||||
...
|
||||
|
||||
The Expect expression is checked for
|
||||
``entries[0]['content'][0]['value'] == {data}``.
|
||||
|
||||
The output format is::
|
||||
|
||||
Description: {description}
|
||||
Expect: {expression} (if data couldn't be parsed)
|
||||
Options:
|
||||
|
||||
{content, unescaped}
|
||||
----------
|
||||
{data, unescaped, if found}
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
import os
|
||||
import traceback
|
||||
|
||||
_desc_re = re.compile(r'\s*Description:\s*(.*)')
|
||||
_expect_re = re.compile(r'\s*Expect:\s*(.*)')
|
||||
_data_expect_re = re.compile(r"entries\[0\]\['[^']+'\](?:\[0\]\['value'\])?\s*==\s*(.*)")
|
||||
_feed_data_expect_re = re.compile(r"feed\['[^']+'\]\s*==\s*(.*)")
|
||||
|
||||
def parse_content(content):
|
||||
match = _desc_re.search(content)
|
||||
desc = match.group(1)
|
||||
match = _expect_re.search(content)
|
||||
expect = match.group(1)
|
||||
data = None
|
||||
for regex in [_data_expect_re, _feed_data_expect_re]:
|
||||
match = regex.search(expect)
|
||||
if match:
|
||||
# Icky, but I'll trust it
|
||||
data = eval(match.group(1).strip())
|
||||
break
|
||||
c = None
|
||||
for tag in ['content', 'summary', 'title', 'copyright', 'tagline', 'info', 'subtitle', 'fullitem', 'body', 'description', 'content:encoded']:
|
||||
regex = re.compile(r"<%s.*?>(.*)</%s>" % (tag, tag), re.S)
|
||||
match = regex.search(content)
|
||||
if match:
|
||||
c = match.group(1)
|
||||
break
|
||||
assert c is not None
|
||||
# Seems like body isn't quoted
|
||||
if tag != 'body':
|
||||
c = c.replace('&lt;', '<')
|
||||
c = c.replace('&amp;', '&')
|
||||
# FIXME: I should really do more unescaping...
|
||||
return {
|
||||
'Description': desc,
|
||||
'Expect': expect,
|
||||
'data': data,
|
||||
'content': c}
|
||||
|
||||
def serialize_content(d):
|
||||
s = '''\
|
||||
Description: %(Description)s
|
||||
Expect: %(Expect)s
|
||||
Options:
|
||||
|
||||
%(content)s
|
||||
''' % d
|
||||
if d.get('data') is not None:
|
||||
s += '----------\n%s' % d['data']
|
||||
return s
|
||||
|
||||
def translate_file(filename):
|
||||
f = open(filename, 'rb')
|
||||
c = f.read()
|
||||
f.close()
|
||||
try:
|
||||
output = serialize_content(parse_content(c))
|
||||
except:
|
||||
print('Bad data in %s:' % filename)
|
||||
print(c)
|
||||
traceback.print_exc()
|
||||
print('-'*60)
|
||||
return
|
||||
new = os.path.splitext(filename)[0] + '.data'
|
||||
f = open(new, 'wb')
|
||||
f.write(output)
|
||||
f.close()
|
||||
|
||||
def translate_all(dir):
|
||||
for fn in os.listdir(dir):
|
||||
fn = os.path.join(dir, fn)
|
||||
if fn.endswith('.xml'):
|
||||
translate_file(fn)
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
translate_all(os.path.join(os.path.dirname(__file__), 'feedparser-data'))
|
||||
|
13
lib/lxml/html/usedoctest.py
Normal file
|
@@ -0,0 +1,13 @@
|
|||
"""Doctest module for HTML comparison.
|
||||
|
||||
Usage::
|
||||
|
||||
>>> import lxml.html.usedoctest
|
||||
>>> # now do your HTML doctests ...
|
||||
|
||||
See `lxml.doctestcompare`.
|
||||
"""
|
||||
|
||||
from lxml import doctestcompare
|
||||
|
||||
doctestcompare.temp_install(html=True, del_module=__name__)
|
0
lib/lxml/includes/__init__.py
Normal file
26
lib/lxml/includes/c14n.pxd
Normal file
|
@@ -0,0 +1,26 @@
|
|||
from lxml.includes.tree cimport xmlDoc, xmlOutputBuffer, xmlChar
|
||||
from lxml.includes.xpath cimport xmlNodeSet
|
||||
|
||||
cdef extern from "libxml/c14n.h":
|
||||
cdef int xmlC14NDocDumpMemory(xmlDoc* doc,
|
||||
xmlNodeSet* nodes,
|
||||
int exclusive,
|
||||
xmlChar** inclusive_ns_prefixes,
|
||||
int with_comments,
|
||||
xmlChar** doc_txt_ptr) nogil
|
||||
|
||||
cdef int xmlC14NDocSave(xmlDoc* doc,
|
||||
xmlNodeSet* nodes,
|
||||
int exclusive,
|
||||
xmlChar** inclusive_ns_prefixes,
|
||||
int with_comments,
|
||||
char* filename,
|
||||
int compression) nogil
|
||||
|
||||
cdef int xmlC14NDocSaveTo(xmlDoc* doc,
|
||||
xmlNodeSet* nodes,
|
||||
int exclusive,
|
||||
xmlChar** inclusive_ns_prefixes,
|
||||
int with_comments,
|
||||
xmlOutputBuffer* buffer) nogil
|
||||
|
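These declarations back lxml's canonicalization support; a sketch of the
Python-level entry point::

from lxml import etree

doc = etree.fromstring('<root><a>text</a></root>')
# canonical (C14N) serialization; with_comments maps to the flag above
print(etree.tostring(doc, method='c14n', with_comments=False))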
3
lib/lxml/includes/config.pxd
Normal file
|
@@ -0,0 +1,3 @@
|
|||
cdef extern from "etree_defs.h":
|
||||
cdef bint ENABLE_THREADING
|
||||
cdef bint ENABLE_SCHEMATRON
|
18
lib/lxml/includes/dtdvalid.pxd
Normal file
|
@@ -0,0 +1,18 @@
|
|||
from lxml.includes cimport tree
|
||||
from lxml.includes.tree cimport xmlDoc, xmlDtd
|
||||
|
||||
cdef extern from "libxml/valid.h" nogil:
|
||||
ctypedef void (*xmlValidityErrorFunc)(void * ctx, const char * msg, ...)
|
||||
ctypedef void (*xmlValidityWarningFunc)(void * ctx, const char * msg, ...)
|
||||
|
||||
ctypedef struct xmlValidCtxt:
|
||||
void *userData
|
||||
xmlValidityErrorFunc error
|
||||
xmlValidityWarningFunc warning
|
||||
|
||||
cdef xmlValidCtxt* xmlNewValidCtxt()
|
||||
cdef void xmlFreeValidCtxt(xmlValidCtxt* cur)
|
||||
|
||||
cdef int xmlValidateDtd(xmlValidCtxt* ctxt, xmlDoc* doc, xmlDtd* dtd)
|
||||
cdef tree.xmlElement* xmlGetDtdElementDesc(
|
||||
xmlDtd* dtd, tree.const_xmlChar* name)
|
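These declarations map to lxml's ``etree.DTD`` class; a minimal sketch of
DTD validation from Python::

from io import StringIO
from lxml import etree

dtd = etree.DTD(StringIO('<!ELEMENT root EMPTY>'))
print(dtd.validate(etree.XML('<root/>')))         # True
print(dtd.validate(etree.XML('<root>x</root>')))  # False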
328
lib/lxml/includes/etree_defs.h
Normal file
|
@@ -0,0 +1,328 @@
|
|||
#ifndef HAS_ETREE_DEFS_H
|
||||
#define HAS_ETREE_DEFS_H
|
||||
|
||||
/* quick check for Python/libxml2/libxslt devel setup */
|
||||
#include "Python.h"
|
||||
#ifndef PY_VERSION_HEX
|
||||
# error the development package of Python (header files etc.) is not installed correctly
|
||||
#else
|
||||
# if PY_VERSION_HEX < 0x02060000 || PY_MAJOR_VERSION >= 3 && PY_VERSION_HEX < 0x03020000
|
||||
# error this version of lxml requires Python 2.6, 2.7, 3.2 or later
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include "libxml/xmlversion.h"
|
||||
#ifndef LIBXML_VERSION
|
||||
# error the development package of libxml2 (header files etc.) is not installed correctly
|
||||
#else
|
||||
#if LIBXML_VERSION < 20700
|
||||
# error minimum required version of libxml2 is 2.7.0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "libxslt/xsltconfig.h"
|
||||
#ifndef LIBXSLT_VERSION
|
||||
# error the development package of libxslt (header files etc.) is not installed correctly
|
||||
#else
|
||||
#if LIBXSLT_VERSION < 10123
|
||||
# error minimum required version of libxslt is 1.1.23
|
||||
#endif
|
||||
#endif
|
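The compile-time guards above have runtime counterparts: lxml reports both the library versions it was compiled against and the ones it actually loaded, which is handy when the two drift apart. For example (standard `lxml.etree` attributes):

    from lxml import etree

    print(etree.LXML_VERSION)             # lxml itself, e.g. (3, 3, 5, 0)
    print(etree.LIBXML_COMPILED_VERSION)  # libxml2 headers seen at build time
    print(etree.LIBXML_VERSION)           # libxml2 loaded at runtime
    print(etree.LIBXSLT_COMPILED_VERSION)
    print(etree.LIBXSLT_VERSION)
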

/* v_arg functions */
#define va_int(ap)     va_arg(ap, int)
#define va_charptr(ap) va_arg(ap, char *)

#ifdef PYPY_VERSION
#  define IS_PYPY 1
#else
#  define IS_PYPY 0
#endif

#if PY_MAJOR_VERSION >= 3
#  define IS_PYTHON3 1
#else
#  define IS_PYTHON3 0
#endif

#if IS_PYTHON3
#undef LXML_UNICODE_STRINGS
#define LXML_UNICODE_STRINGS 1
#else
#ifndef LXML_UNICODE_STRINGS
#define LXML_UNICODE_STRINGS 0
#endif
#endif

#if !IS_PYPY
#  define PyWeakref_LockObject(obj) (NULL)
#endif

/* Threading is not currently supported by PyPy */
#if IS_PYPY
#  ifndef WITHOUT_THREADING
#    define WITHOUT_THREADING
#  endif
#endif

/* Python 3 doesn't have PyFile_*() anymore */
#if PY_MAJOR_VERSION >= 3
#  define PyFile_AsFile(o) (NULL)
#else
#if IS_PYPY
#  undef PyFile_AsFile
#  define PyFile_AsFile(o) (NULL)
#  undef PyUnicode_FromFormat
#  define PyUnicode_FromFormat(s, a, b) (NULL)
#  undef PyByteArray_Check
#  define PyByteArray_Check(o) (0)
#endif
#endif

#if PY_VERSION_HEX <= 0x03030000 && !(defined(CYTHON_PEP393_ENABLED) && CYTHON_PEP393_ENABLED)
#define PyUnicode_IS_READY(op)    (0)
#define PyUnicode_GET_LENGTH(u)   PyUnicode_GET_SIZE(u)
#define PyUnicode_KIND(u)         (sizeof(Py_UNICODE))
#define PyUnicode_DATA(u)         ((void*)PyUnicode_AS_UNICODE(u))
#endif

/* PySlice_GetIndicesEx() has wrong signature in Py<=3.1 */
#if PY_VERSION_HEX >= 0x03020000
#  define _lx_PySlice_GetIndicesEx(o, l, b, e, s, sl) PySlice_GetIndicesEx(o, l, b, e, s, sl)
#else
#  define _lx_PySlice_GetIndicesEx(o, l, b, e, s, sl) PySlice_GetIndicesEx(((PySliceObject*)o), l, b, e, s, sl)
#endif

#ifdef WITHOUT_THREADING
#  define PyEval_SaveThread() (NULL)
#  define PyEval_RestoreThread(state)
#  define PyGILState_Ensure() (PyGILState_UNLOCKED)
#  define PyGILState_Release(state)
#  undef  Py_UNBLOCK_THREADS
#  define Py_UNBLOCK_THREADS
#  undef  Py_BLOCK_THREADS
#  define Py_BLOCK_THREADS
#endif

#ifdef WITHOUT_THREADING
#  define ENABLE_THREADING 0
#else
#  define ENABLE_THREADING 1
#endif

#if LIBXML_VERSION < 20704
/* FIXME: hack to make new error reporting compile in old libxml2 versions */
#  define xmlStructuredErrorContext NULL
#  define xmlXIncludeProcessTreeFlagsData(n,o,d) xmlXIncludeProcessTreeFlags(n,o)
#endif

/* schematron was added in libxml2 2.6.21 */
#ifdef LIBXML_SCHEMATRON_ENABLED
#  define ENABLE_SCHEMATRON 1
#else
#  define ENABLE_SCHEMATRON 0
#  define XML_SCHEMATRON_OUT_QUIET 0
#  define XML_SCHEMATRON_OUT_XML 0
#  define XML_SCHEMATRON_OUT_ERROR 0
   typedef void xmlSchematron;
   typedef void xmlSchematronParserCtxt;
   typedef void xmlSchematronValidCtxt;
#  define xmlSchematronNewDocParserCtxt(doc) NULL
#  define xmlSchematronNewParserCtxt(file) NULL
#  define xmlSchematronParse(ctxt) NULL
#  define xmlSchematronFreeParserCtxt(ctxt)
#  define xmlSchematronFree(schema)
#  define xmlSchematronNewValidCtxt(schema, options) NULL
#  define xmlSchematronValidateDoc(ctxt, doc) 0
#  define xmlSchematronFreeValidCtxt(ctxt)
#  define xmlSchematronSetValidStructuredErrors(ctxt, errorfunc, data)
#endif
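When libxml2 was built without `LIBXML_SCHEMATRON_ENABLED`, the stubs above let lxml compile with `ENABLE_SCHEMATRON` set to 0, and `etree.Schematron` then fails at runtime instead. On a Schematron-enabled build it works roughly as sketched below (a hypothetical minimal schema; standard lxml API):

    from lxml import etree

    # Minimal ISO Schematron schema: every <item> must carry an id attribute.
    schema = etree.Schematron(etree.XML(
        '<schema xmlns="http://purl.oclc.org/dsdl/schematron">'
        '  <pattern>'
        '    <rule context="item"><assert test="@id">item needs an id</assert></rule>'
        '  </pattern>'
        '</schema>'))
    print(schema.validate(etree.XML('<items><item id="1"/></items>')))  # True
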

#if LIBXML_VERSION < 20900
#  define XML_PARSE_BIG_LINES 4194304
#endif

#include "libxml/tree.h"
#ifndef LIBXML2_NEW_BUFFER
   typedef xmlBuffer xmlBuf;
#  define xmlBufContent(buf) xmlBufferContent(buf)
#  define xmlBufUse(buf) xmlBufferLength(buf)
#endif

/* libexslt 1.1.25+ support EXSLT functions in XPath */
#if LIBXSLT_VERSION < 10125
#define exsltDateXpathCtxtRegister(ctxt, prefix)
#define exsltSetsXpathCtxtRegister(ctxt, prefix)
#define exsltMathXpathCtxtRegister(ctxt, prefix)
#define exsltStrXpathCtxtRegister(ctxt, prefix)
#endif

/* work around MSDEV 6.0 */
#if (_MSC_VER == 1200) && (WINVER < 0x0500)
long _ftol( double ); //defined by VC6 C libs
long _ftol2( double dblSource ) { return _ftol( dblSource ); }
#endif

#ifdef __GNUC__
/* Test for GCC > 2.95 */
#if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95))
#define unlikely_condition(x) __builtin_expect((x), 0)
#else /* __GNUC__ > 2 ... */
#define unlikely_condition(x) (x)
#endif /* __GNUC__ > 2 ... */
#else /* __GNUC__ */
#define unlikely_condition(x) (x)
#endif /* __GNUC__ */

#ifndef Py_TYPE
#define Py_TYPE(ob)   (((PyObject*)(ob))->ob_type)
#endif

#define PY_NEW(T) \
     (((PyTypeObject*)(T))->tp_new( \
             (PyTypeObject*)(T), __pyx_empty_tuple, NULL))

#define _fqtypename(o)  ((Py_TYPE(o))->tp_name)

#if PY_MAJOR_VERSION < 3
#define _isString(obj)   (PyString_CheckExact(obj)  || \
                          PyUnicode_CheckExact(obj) || \
                          PyType_IsSubtype(Py_TYPE(obj), &PyBaseString_Type))
#else
/* builtin subtype type checks are almost as fast as exact checks in Py2.7+
 * and Unicode is more common in Py3 */
#define _isString(obj)   (PyUnicode_Check(obj) || PyBytes_Check(obj))
#endif

#define _isElement(c_node) \
        (((c_node)->type == XML_ELEMENT_NODE) || \
         ((c_node)->type == XML_COMMENT_NODE) || \
         ((c_node)->type == XML_ENTITY_REF_NODE) || \
         ((c_node)->type == XML_PI_NODE))

#define _isElementOrXInclude(c_node) \
        (_isElement(c_node) || \
         ((c_node)->type == XML_XINCLUDE_START) || \
         ((c_node)->type == XML_XINCLUDE_END))

#define _getNs(c_node) \
        (((c_node)->ns == 0) ? 0 : ((c_node)->ns->href))


/* Macro pair implementation of a depth first tree walker
 *
 * Calls the code block between the BEGIN and END macros for all elements
 * below c_tree_top (exclusively), starting at c_node (inclusively iff
 * 'inclusive' is 1).  The _ELEMENT_ variants will only stop on nodes
 * that match _isElement(), the normal variant will stop on every node
 * except text nodes.
 *
 * To traverse the node and all of its children and siblings in Pyrex, call
 *    cdef xmlNode* some_node
 *    BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 1)
 *    # do something with some_node
 *    END_FOR_EACH_ELEMENT_FROM(some_node)
 *
 * To traverse only the children and siblings of a node, call
 *    cdef xmlNode* some_node
 *    BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 0)
 *    # do something with some_node
 *    END_FOR_EACH_ELEMENT_FROM(some_node)
 *
 * To traverse only the children, do:
 *    cdef xmlNode* some_node
 *    some_node = parent_node.children
 *    BEGIN_FOR_EACH_ELEMENT_FROM(parent_node, some_node, 1)
 *    # do something with some_node
 *    END_FOR_EACH_ELEMENT_FROM(some_node)
 *
 * NOTE: 'some_node' MUST be a plain 'xmlNode*' !
 *
 * NOTE: parent modification during the walk can divert the iterator, but
 *       should not segfault !
 */

#define _LX__ELEMENT_MATCH(c_node, only_elements) \
    ((only_elements) ? (_isElement(c_node)) : 1)

#define _LX__ADVANCE_TO_NEXT(c_node, only_elements) \
    while ((c_node != 0) && (!_LX__ELEMENT_MATCH(c_node, only_elements))) \
        c_node = c_node->next;

#define _LX__TRAVERSE_TO_NEXT(c_stop_node, c_node, only_elements) \
{ \
    /* walk through children first */ \
    xmlNode* _lx__next = c_node->children; \
    if (_lx__next != 0) { \
        if (c_node->type == XML_ENTITY_REF_NODE || c_node->type == XML_DTD_NODE) { \
            _lx__next = 0; \
        } else { \
            _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
        } \
    } \
    if ((_lx__next == 0) && (c_node != c_stop_node)) { \
        /* try siblings */ \
        _lx__next = c_node->next; \
        _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
        /* back off through parents */ \
        while (_lx__next == 0) { \
            c_node = c_node->parent; \
            if (c_node == 0) \
                break; \
            if (c_node == c_stop_node) \
                break; \
            if ((only_elements) && !_isElement(c_node)) \
                break; \
            /* we already traversed the parents -> siblings */ \
            _lx__next = c_node->next; \
            _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
        } \
    } \
    c_node = _lx__next; \
}

#define _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, only_elements) \
{ \
    if (c_node != 0) { \
        const xmlNode* _lx__tree_top = (c_tree_top); \
        const int _lx__only_elements = (only_elements); \
        /* make sure we start at an element */ \
        if (!_LX__ELEMENT_MATCH(c_node, _lx__only_elements)) { \
            /* we skip the node, so 'inclusive' is irrelevant */ \
            if (c_node == _lx__tree_top) \
                c_node = 0; /* nothing to traverse */ \
            else { \
                c_node = c_node->next; \
                _LX__ADVANCE_TO_NEXT(c_node, _lx__only_elements) \
            } \
        } else if (! (inclusive)) { \
            /* skip the first node */ \
            _LX__TRAVERSE_TO_NEXT(_lx__tree_top, c_node, _lx__only_elements) \
        } \
        \
        /* now run the user code on the elements we find */ \
        while (c_node != 0) { \
            /* here goes the code to be run for each element */

#define _LX__END_FOR_EACH_FROM(c_node) \
            _LX__TRAVERSE_TO_NEXT(_lx__tree_top, c_node, _lx__only_elements) \
        } \
    } \
}


#define BEGIN_FOR_EACH_ELEMENT_FROM(c_tree_top, c_node, inclusive) \
    _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, 1)

#define END_FOR_EACH_ELEMENT_FROM(c_node) \
    _LX__END_FOR_EACH_FROM(c_node)

#define BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive) \
    _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, 0)

#define END_FOR_EACH_FROM(c_node) \
    _LX__END_FOR_EACH_FROM(c_node)


#endif /* HAS_ETREE_DEFS_H */

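The walker macro pair above is what lxml's Cython sources use internally for depth-first traversal; the closest public equivalent is `Element.iter()`, which also walks a subtree in document order and can filter by tag. A rough Python-level analogue:

    from lxml import etree

    root = etree.XML('<a><b><c/></b><d/></a>')
    # Like BEGIN_FOR_EACH_ELEMENT_FROM(root, node, 1): depth-first, inclusive.
    print([el.tag for el in root.iter()])          # ['a', 'b', 'c', 'd']
    # Tag filtering narrows the walk to matching elements only.
    print([el.tag for el in root.iter('b', 'd')])  # ['b', 'd']
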
Some files were not shown because too many files have changed in this diff.