Added lxml to our libs

echel0n 2014-03-28 21:32:46 -07:00
parent 9ac649444d
commit 32c029c3cf
190 changed files with 55421 additions and 0 deletions

lib/lxml/ElementInclude.py (new file, 223 lines)
@@ -0,0 +1,223 @@
#
# ElementTree
# $Id: ElementInclude.py 1862 2004-06-18 07:31:02Z Fredrik $
#
# limited xinclude support for element trees
#
# history:
# 2003-08-15 fl created
# 2003-11-14 fl fixed default loader
#
# Copyright (c) 2003-2004 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2004 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------
"""
Limited XInclude support for the ElementTree package.
While lxml.etree has full support for XInclude (see
`etree.ElementTree.xinclude()`), this module provides a simpler, pure
Python, ElementTree compatible implementation that supports a simple
form of custom URL resolvers.
"""
from lxml import etree
import copy
try:
from urlparse import urljoin
from urllib2 import urlopen
except ImportError:
# Python 3
from urllib.parse import urljoin
from urllib.request import urlopen
try:
set
except NameError:
# Python 2.3
from sets import Set as set
XINCLUDE = "{http://www.w3.org/2001/XInclude}"
XINCLUDE_INCLUDE = XINCLUDE + "include"
XINCLUDE_FALLBACK = XINCLUDE + "fallback"
##
# Fatal include error.
class FatalIncludeError(etree.LxmlSyntaxError):
pass
##
# ET compatible default loader.
# This loader reads an included resource from disk.
#
# @param href Resource reference.
# @param parse Parse mode. Either "xml" or "text".
# @param encoding Optional text encoding.
# @return The expanded resource. If the parse mode is "xml", this
# is an ElementTree instance. If the parse mode is "text", this
# is a Unicode string. If the loader fails, it can return None
# or raise an IOError exception.
# @throws IOError If the loader fails to load the resource.
def default_loader(href, parse, encoding=None):
file = open(href, 'rb')
if parse == "xml":
data = etree.parse(file).getroot()
else:
data = file.read()
if not encoding:
encoding = 'utf-8'
data = data.decode(encoding)
file.close()
return data
##
# Default loader used by lxml.etree - handles custom resolvers properly
#
def _lxml_default_loader(href, parse, encoding=None, parser=None):
if parse == "xml":
data = etree.parse(href, parser).getroot()
else:
if "://" in href:
f = urlopen(href)
else:
f = open(href, 'rb')
data = f.read()
f.close()
if not encoding:
encoding = 'utf-8'
data = data.decode(encoding)
return data
##
# Wrapper for ET compatibility - drops the parser
def _wrap_et_loader(loader):
def load(href, parse, encoding=None, parser=None):
return loader(href, parse, encoding)
return load
##
# Expand XInclude directives.
#
# @param elem Root element.
# @param loader Optional resource loader. If omitted, it defaults
# to {@link default_loader}. If given, it should be a callable
# that implements the same interface as <b>default_loader</b>.
# @throws FatalIncludeError If the function fails to include a given
# resource, or if the tree contains malformed XInclude elements.
# @throws IOError If the function fails to load a given resource.
# @returns the node or its replacement if it was an XInclude node
def include(elem, loader=None, base_url=None):
if base_url is None:
if hasattr(elem, 'getroot'):
tree = elem
elem = elem.getroot()
else:
tree = elem.getroottree()
if hasattr(tree, 'docinfo'):
base_url = tree.docinfo.URL
elif hasattr(elem, 'getroot'):
elem = elem.getroot()
_include(elem, loader, base_url=base_url)
def _include(elem, loader=None, _parent_hrefs=None, base_url=None):
if loader is not None:
load_include = _wrap_et_loader(loader)
else:
load_include = _lxml_default_loader
if _parent_hrefs is None:
_parent_hrefs = set()
parser = elem.getroottree().parser
include_elements = list(
elem.iter('{http://www.w3.org/2001/XInclude}*'))
for e in include_elements:
if e.tag == XINCLUDE_INCLUDE:
# process xinclude directive
href = urljoin(base_url, e.get("href"))
parse = e.get("parse", "xml")
parent = e.getparent()
if parse == "xml":
if href in _parent_hrefs:
raise FatalIncludeError(
"recursive include of %r detected" % href
)
_parent_hrefs.add(href)
node = load_include(href, parse, parser=parser)
if node is None:
raise FatalIncludeError(
"cannot load %r as %r" % (href, parse)
)
node = _include(node, loader, _parent_hrefs)
if e.tail:
node.tail = (node.tail or "") + e.tail
if parent is None:
return node # replaced the root node!
parent.replace(e, node)
elif parse == "text":
text = load_include(href, parse, encoding=e.get("encoding"))
if text is None:
raise FatalIncludeError(
"cannot load %r as %r" % (href, parse)
)
predecessor = e.getprevious()
if predecessor is not None:
predecessor.tail = (predecessor.tail or "") + text
elif parent is None:
return text # replaced the root node!
else:
parent.text = (parent.text or "") + text + (e.tail or "")
parent.remove(e)
else:
raise FatalIncludeError(
"unknown parse type in xi:include tag (%r)" % parse
)
elif e.tag == XINCLUDE_FALLBACK:
parent = e.getparent()
if parent is not None and parent.tag != XINCLUDE_INCLUDE:
raise FatalIncludeError(
"xi:fallback tag must be child of xi:include (%r)" % e.tag
)
else:
raise FatalIncludeError(
"Invalid element found in XInclude namespace (%r)" % e.tag
)
return elem
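
For context, a minimal usage sketch of the module above; the input filename and the pass-through loader are illustrative only::

    from lxml import etree
    from lxml import ElementInclude

    def loader(href, parse, encoding=None):
        # pass through to the default loader; a custom resolver could
        # fetch hrefs from a cache or rewrite them first (assumption)
        return ElementInclude.default_loader(href, parse, encoding)

    tree = etree.parse("doc.xml")  # hypothetical file with xi:include tags
    ElementInclude.include(tree.getroot(), loader=loader)
    print(etree.tostring(tree.getroot()))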

lib/lxml/__init__.py (new file, 20 lines)
@@ -0,0 +1,20 @@
# this is a package
def get_include():
"""
Returns a list of header include paths (for lxml itself, libxml2
and libxslt) needed to compile C code against lxml if it was built
with statically linked libraries.
"""
import os
lxml_path = __path__[0]
include_path = os.path.join(lxml_path, 'includes')
includes = [include_path, lxml_path]
for name in os.listdir(include_path):
path = os.path.join(include_path, name)
if os.path.isdir(path):
includes.append(path)
return includes
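
As an illustration, a build script could feed these paths to a compiler; the extension name and source file below are hypothetical::

    from setuptools import setup, Extension
    import lxml

    setup(
        name="myext",
        ext_modules=[
            # compile against the headers shipped inside the lxml package
            Extension("myext", ["myext.c"],
                      include_dirs=lxml.get_include()),
        ],
    )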

lib/lxml/_elementpath.py (new file, 306 lines)
@@ -0,0 +1,306 @@
#
# ElementTree
# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
#
# limited xpath support for element trees
#
# history:
# 2003-05-23 fl created
# 2003-05-28 fl added support for // etc
# 2003-08-27 fl fixed parsing of periods in element names
# 2007-09-10 fl new selection engine
# 2007-09-12 fl fixed parent selector
# 2007-09-13 fl added iterfind; changed findall to return a list
# 2007-11-30 fl added namespaces support
# 2009-10-30 fl added child element value filter
#
# Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2009 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------
##
# Implementation module for XPath support. There's usually no reason
# to import this module directly; the <b>ElementTree</b> does this for
# you, if needed.
##
import re
xpath_tokenizer_re = re.compile(
"("
"'[^']*'|\"[^\"]*\"|"
"::|"
"//?|"
"\.\.|"
"\(\)|"
"[/.*:\[\]\(\)@=])|"
"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
"\s+"
)
def xpath_tokenizer(pattern, namespaces=None):
for token in xpath_tokenizer_re.findall(pattern):
tag = token[1]
if tag and tag[0] != "{" and ":" in tag:
try:
prefix, uri = tag.split(":", 1)
if not namespaces:
raise KeyError
yield token[0], "{%s}%s" % (namespaces[prefix], uri)
except KeyError:
raise SyntaxError("prefix %r not found in prefix map" % prefix)
else:
yield token
def prepare_child(next, token):
tag = token[1]
def select(result):
for elem in result:
for e in elem.iterchildren(tag):
yield e
return select
def prepare_star(next, token):
def select(result):
for elem in result:
for e in elem.iterchildren('*'):
yield e
return select
def prepare_self(next, token):
def select(result):
return result
return select
def prepare_descendant(next, token):
token = next()
if token[0] == "*":
tag = "*"
elif not token[0]:
tag = token[1]
else:
raise SyntaxError("invalid descendant")
def select(result):
for elem in result:
for e in elem.iterdescendants(tag):
yield e
return select
def prepare_parent(next, token):
def select(result):
for elem in result:
parent = elem.getparent()
if parent is not None:
yield parent
return select
def prepare_predicate(next, token):
# FIXME: replace with real parser!!! refs:
# http://effbot.org/zone/simple-iterator-parser.htm
# http://javascript.crockford.com/tdop/tdop.html
signature = []
predicate = []
while 1:
token = next()
if token[0] == "]":
break
if token[0] and token[0][:1] in "'\"":
token = "'", token[0][1:-1]
signature.append(token[0] or "-")
predicate.append(token[1])
signature = "".join(signature)
# use signature to determine predicate type
if signature == "@-":
# [@attribute] predicate
key = predicate[1]
def select(result):
for elem in result:
if elem.get(key) is not None:
yield elem
return select
if signature == "@-='":
# [@attribute='value']
key = predicate[1]
value = predicate[-1]
def select(result):
for elem in result:
if elem.get(key) == value:
yield elem
return select
if signature == "-" and not re.match("-?\d+$", predicate[0]):
# [tag]
tag = predicate[0]
def select(result):
for elem in result:
for _ in elem.iterchildren(tag):
yield elem
break
return select
if signature == "-='" and not re.match("-?\d+$", predicate[0]):
# [tag='value']
tag = predicate[0]
value = predicate[-1]
def select(result):
for elem in result:
for e in elem.iterchildren(tag):
if "".join(e.itertext()) == value:
yield elem
break
return select
if signature == "-" or signature == "-()" or signature == "-()-":
# [index] or [last()] or [last()-index]
if signature == "-":
# [index]
index = int(predicate[0]) - 1
if index < 0:
if index == -1:
raise SyntaxError(
"indices in path predicates are 1-based, not 0-based")
else:
raise SyntaxError("path index >= 1 expected")
else:
if predicate[0] != "last":
raise SyntaxError("unsupported function")
if signature == "-()-":
try:
index = int(predicate[2]) - 1
except ValueError:
raise SyntaxError("unsupported expression")
else:
index = -1
def select(result):
for elem in result:
parent = elem.getparent()
if parent is None:
continue
try:
# FIXME: what if the selector is "*" ?
elems = list(parent.iterchildren(elem.tag))
if elems[index] is elem:
yield elem
except IndexError:
pass
return select
raise SyntaxError("invalid predicate")
ops = {
"": prepare_child,
"*": prepare_star,
".": prepare_self,
"..": prepare_parent,
"//": prepare_descendant,
"[": prepare_predicate,
}
_cache = {}
# --------------------------------------------------------------------
def _build_path_iterator(path, namespaces):
# compile selector pattern
if path[-1:] == "/":
path = path + "*" # implicit all (FIXME: keep this?)
# key the cache on the namespace mapping as well as the path
cache_key = (path, namespaces and tuple(sorted(namespaces.items())) or None)
try:
return _cache[cache_key]
except KeyError:
pass
if len(_cache) > 100:
_cache.clear()
if path[:1] == "/":
raise SyntaxError("cannot use absolute path on element")
stream = iter(xpath_tokenizer(path, namespaces))
try:
_next = stream.next
except AttributeError:
# Python 3
_next = stream.__next__
try:
token = _next()
except StopIteration:
raise SyntaxError("empty path expression")
selector = []
while 1:
try:
selector.append(ops[token[0]](_next, token))
except StopIteration:
raise SyntaxError("invalid path")
try:
token = _next()
if token[0] == "/":
token = _next()
except StopIteration:
break
_cache[cache_key] = selector  # must use the same key as the lookup above
return selector
##
# Iterate over the matching nodes
def iterfind(elem, path, namespaces=None):
selector = _build_path_iterator(path, namespaces)
result = iter((elem,))
for select in selector:
result = select(result)
return result
##
# Find first matching object.
def find(elem, path, namespaces=None):
it = iterfind(elem, path, namespaces)
try:
try:
_next = it.next
except AttributeError:
return next(it)
else:
return _next()
except StopIteration:
return None
##
# Find all matching objects.
def findall(elem, path, namespaces=None):
return list(iterfind(elem, path, namespaces))
##
# Find text for first matching object.
def findtext(elem, path, default=None, namespaces=None):
el = find(elem, path, namespaces)
if el is None:
return default
else:
return el.text or ''
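
These functions back the ``find()``/``findall()``/``findtext()`` methods on lxml elements. A small sketch calling them directly (importing the private module is for illustration only)::

    from lxml import etree, _elementpath

    root = etree.XML('<a xmlns:n="urn:x"><n:b>one</n:b><n:b>two</n:b></a>')
    ns = {'n': 'urn:x'}
    # the 'n:' prefix is expanded to '{urn:x}' via the namespaces mapping
    print(_elementpath.findall(root, 'n:b', ns))               # two elements
    print(_elementpath.findtext(root, 'n:b', namespaces=ns))   # 'one'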

lib/lxml/apihelpers.pxi (new file, 1645 lines)
Diff suppressed because it is too large.

lib/lxml/builder.py (new file, 238 lines)
@@ -0,0 +1,238 @@
#
# Element generator factory by Fredrik Lundh.
#
# Source:
# http://online.effbot.org/2006_11_01_archive.htm#et-builder
# http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2004 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------
"""
The ``E`` Element factory for generating XML documents.
"""
import lxml.etree as ET
try:
from functools import partial
except ImportError:
# fake it for pre-2.5 releases
def partial(func, tag):
return lambda *args, **kwargs: func(tag, *args, **kwargs)
try:
callable
except NameError:
# Python 3
def callable(f):
return hasattr(f, '__call__')
try:
basestring
except NameError:
basestring = str
try:
unicode
except NameError:
unicode = str
class ElementMaker(object):
"""Element generator factory.
Unlike the ordinary Element factory, the E factory allows you to pass in
more than just a tag and some optional attributes; you can also pass in
text and other elements. The text is added as either text or tail
attributes, and elements are inserted at the right spot. Some small
examples::
>>> from lxml import etree as ET
>>> from lxml.builder import E
>>> ET.tostring(E("tag"))
'<tag/>'
>>> ET.tostring(E("tag", "text"))
'<tag>text</tag>'
>>> ET.tostring(E("tag", "text", key="value"))
'<tag key="value">text</tag>'
>>> ET.tostring(E("tag", E("subtag", "text"), "tail"))
'<tag><subtag>text</subtag>tail</tag>'
For simple tags, the factory also allows you to write ``E.tag(...)`` instead
of ``E('tag', ...)``::
>>> ET.tostring(E.tag())
'<tag/>'
>>> ET.tostring(E.tag("text"))
'<tag>text</tag>'
>>> ET.tostring(E.tag(E.subtag("text"), "tail"))
'<tag><subtag>text</subtag>tail</tag>'
Here's a somewhat larger example; this shows how to generate HTML
documents, using a mix of prepared factory functions for inline elements,
nested ``E.tag`` calls, and embedded XHTML fragments::
# some common inline elements
A = E.a
I = E.i
B = E.b
def CLASS(v):
# helper function, 'class' is a reserved word
return {'class': v}
page = (
E.html(
E.head(
E.title("This is a sample document")
),
E.body(
E.h1("Hello!", CLASS("title")),
E.p("This is a paragraph with ", B("bold"), " text in it!"),
E.p("This is another paragraph, with a ",
A("link", href="http://www.python.org"), "."),
E.p("Here are some reservered characters: <spam&egg>."),
ET.XML("<p>And finally, here is an embedded XHTML fragment.</p>"),
)
)
)
print ET.tostring(page)
Here's a prettyprinted version of the output from the above script::
<html>
<head>
<title>This is a sample document</title>
</head>
<body>
<h1 class="title">Hello!</h1>
<p>This is a paragraph with <b>bold</b> text in it!</p>
<p>This is another paragraph, with a <a href="http://www.python.org">link</a>.</p>
<p>Here are some reserved characters: &lt;spam&amp;egg&gt;.</p>
<p>And finally, here is an embedded XHTML fragment.</p>
</body>
</html>
For namespace support, you can pass a namespace map (``nsmap``)
and/or a specific target ``namespace`` to the ElementMaker class::
>>> E = ElementMaker(namespace="http://my.ns/")
>>> print(ET.tostring( E.test ))
<test xmlns="http://my.ns/"/>
>>> E = ElementMaker(namespace="http://my.ns/", nsmap={'p':'http://my.ns/'})
>>> print(ET.tostring( E.test ))
<p:test xmlns:p="http://my.ns/"/>
"""
def __init__(self, typemap=None,
namespace=None, nsmap=None, makeelement=None):
if namespace is not None:
self._namespace = '{' + namespace + '}'
else:
self._namespace = None
if nsmap:
self._nsmap = dict(nsmap)
else:
self._nsmap = None
if makeelement is not None:
assert callable(makeelement)
self._makeelement = makeelement
else:
self._makeelement = ET.Element
# initialize type map for this element factory
if typemap:
typemap = typemap.copy()
else:
typemap = {}
def add_text(elem, item):
try:
elem[-1].tail = (elem[-1].tail or "") + item
except IndexError:
elem.text = (elem.text or "") + item
if str not in typemap:
typemap[str] = add_text
if unicode not in typemap:
typemap[unicode] = add_text
def add_dict(elem, item):
attrib = elem.attrib
for k, v in item.items():
if isinstance(v, basestring):
attrib[k] = v
else:
attrib[k] = typemap[type(v)](None, v)
if dict not in typemap:
typemap[dict] = add_dict
self._typemap = typemap
def __call__(self, tag, *children, **attrib):
get = self._typemap.get
if self._namespace is not None and tag[0] != '{':
tag = self._namespace + tag
elem = self._makeelement(tag, nsmap=self._nsmap)
if attrib:
get(dict)(elem, attrib)
for item in children:
if callable(item):
item = item()
t = get(type(item))
if t is None:
if ET.iselement(item):
elem.append(item)
continue
for basetype in type(item).__mro__:
# See if the typemap knows of any of this type's bases.
t = get(basetype)
if t is not None:
break
else:
raise TypeError("bad argument type: %s(%r)" %
(type(item).__name__, item))
v = t(elem, item)
if v:
get(type(v))(elem, v)
return elem
def __getattr__(self, tag):
return partial(self, tag)
# create factory object
E = ElementMaker()
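
The ``typemap`` argument lets the factory serialize arbitrary Python values; a small sketch that renders integers as text (the tag name is arbitrary)::

    from lxml import etree as ET
    from lxml.builder import ElementMaker

    # the int handler returns a string, which the built-in str handler
    # then appends as text content
    E = ElementMaker(typemap={int: lambda elem, value: str(value)})
    print(ET.tostring(E.count(42)))   # b'<count>42</count>'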

lib/lxml/classlookup.pxi (new file, 565 lines)
@@ -0,0 +1,565 @@
# Configurable Element class lookup
################################################################################
# Custom Element classes
cdef public class ElementBase(_Element) [ type LxmlElementBaseType,
object LxmlElementBase ]:
u"""ElementBase(*children, attrib=None, nsmap=None, **_extra)
The public Element class. All custom Element classes must inherit
from this one. To create an Element, use the `Element()` factory.
BIG FAT WARNING: Subclasses *must not* override __init__ or
__new__ as it is absolutely undefined when these objects will be
created or destroyed. All persistent state of Elements must be
stored in the underlying XML. If you really need to initialize
the object after creation, you can implement an ``_init(self)``
method that will be called directly after object creation.
Subclasses of this class can be instantiated to create a new
Element. By default, the tag name will be the class name and the
namespace will be empty. You can modify this with the following
class attributes:
* TAG - the tag name, possibly containing a namespace in Clark
notation
* NAMESPACE - the default namespace URI, unless provided as part
of the TAG attribute.
* HTML - flag if the class is an HTML tag, as opposed to an XML
tag. This only applies to un-namespaced tags and defaults to
false (i.e. XML).
* PARSER - the parser that provides the configuration for the
newly created document. Providing an HTML parser here will
default to creating an HTML element.
In user code, the latter three are commonly inherited in class
hierarchies that implement a common namespace.
"""
def __init__(self, *children, attrib=None, nsmap=None, **_extra):
u"""ElementBase(*children, attrib=None, nsmap=None, **_extra)
"""
cdef bint is_html = 0
cdef _BaseParser parser
cdef _Element last_child
# don't use normal attribute access as it might be overridden
_getattr = object.__getattribute__
try:
namespace = _utf8(_getattr(self, 'NAMESPACE'))
except AttributeError:
namespace = None
try:
ns, tag = _getNsTag(_getattr(self, 'TAG'))
if ns is not None:
namespace = ns
except AttributeError:
tag = _utf8(_getattr(_getattr(self, '__class__'), '__name__'))
if b'.' in tag:
tag = tag.split(b'.')[-1]
try:
parser = _getattr(self, 'PARSER')
except AttributeError:
parser = None
for child in children:
if isinstance(child, _Element):
parser = (<_Element>child)._doc._parser
break
if isinstance(parser, HTMLParser):
is_html = 1
if namespace is None:
try:
is_html = _getattr(self, 'HTML')
except AttributeError:
pass
_initNewElement(self, is_html, tag, namespace, parser,
attrib, nsmap, _extra)
last_child = None
for child in children:
if _isString(child):
if last_child is None:
_setNodeText(self._c_node,
(_collectText(self._c_node.children) or '') + child)
else:
_setTailText(last_child._c_node,
(_collectText(last_child._c_node.next) or '') + child)
elif isinstance(child, _Element):
last_child = child
_appendChild(self, last_child)
elif isinstance(child, type) and issubclass(child, ElementBase):
last_child = child()
_appendChild(self, last_child)
else:
raise TypeError, "Invalid child type: %r" % type(child)
cdef class CommentBase(_Comment):
u"""All custom Comment classes must inherit from this one.
To create an XML Comment instance, use the ``Comment()`` factory.
Subclasses *must not* override __init__ or __new__ as it is
absolutely undefined when these objects will be created or
destroyed. All persistent state of Comments must be stored in the
underlying XML. If you really need to initialize the object after
creation, you can implement an ``_init(self)`` method that will be
called after object creation.
"""
def __init__(self, text):
# copied from Comment() factory
cdef _Document doc
cdef xmlDoc* c_doc
if text is None:
text = b''
else:
text = _utf8(text)
c_doc = _newXMLDoc()
doc = _documentFactory(c_doc, None)
self._c_node = _createComment(c_doc, _xcstr(text))
if self._c_node is NULL:
raise MemoryError()
tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
_registerProxy(self, doc, self._c_node)
self._init()
cdef class PIBase(_ProcessingInstruction):
u"""All custom Processing Instruction classes must inherit from this one.
To create an XML ProcessingInstruction instance, use the ``PI()``
factory.
Subclasses *must not* override __init__ or __new__ as it is
absolutely undefined when these objects will be created or
destroyed. All persistent state of PIs must be stored in the
underlying XML. If you really need to initialize the object after
creation, you can implement an ``_init(self)`` method that will be
called after object creation.
"""
def __init__(self, target, text=None):
# copied from PI() factory
cdef _Document doc
cdef xmlDoc* c_doc
target = _utf8(target)
if text is None:
text = b''
else:
text = _utf8(text)
c_doc = _newXMLDoc()
doc = _documentFactory(c_doc, None)
self._c_node = _createPI(c_doc, _xcstr(target), _xcstr(text))
if self._c_node is NULL:
raise MemoryError()
tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
_registerProxy(self, doc, self._c_node)
self._init()
cdef class EntityBase(_Entity):
u"""All custom Entity classes must inherit from this one.
To create an XML Entity instance, use the ``Entity()`` factory.
Subclasses *must not* override __init__ or __new__ as it is
absolutely undefined when these objects will be created or
destroyed. All persistent state of Entities must be stored in the
underlying XML. If you really need to initialize the object after
creation, you can implement an ``_init(self)`` method that will be
called after object creation.
"""
def __init__(self, name):
cdef _Document doc
cdef xmlDoc* c_doc
name_utf = _utf8(name)
c_name = _xcstr(name_utf)
if c_name[0] == c'#':
if not _characterReferenceIsValid(c_name + 1):
raise ValueError, u"Invalid character reference: '%s'" % name
elif not _xmlNameIsValid(c_name):
raise ValueError, u"Invalid entity reference: '%s'" % name
c_doc = _newXMLDoc()
doc = _documentFactory(c_doc, None)
self._c_node = _createEntity(c_doc, c_name)
if self._c_node is NULL:
raise MemoryError()
tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
_registerProxy(self, doc, self._c_node)
self._init()
cdef int _validateNodeClass(xmlNode* c_node, cls) except -1:
if c_node.type == tree.XML_ELEMENT_NODE:
expected = ElementBase
elif c_node.type == tree.XML_COMMENT_NODE:
expected = CommentBase
elif c_node.type == tree.XML_ENTITY_REF_NODE:
expected = EntityBase
elif c_node.type == tree.XML_PI_NODE:
expected = PIBase
else:
assert 0, u"Unknown node type: %s" % c_node.type
if not (isinstance(cls, type) and issubclass(cls, expected)):
raise TypeError(
"result of class lookup must be subclass of %s, got %s"
% (type(expected), type(cls)))
return 0
################################################################################
# Element class lookup
ctypedef public object (*_element_class_lookup_function)(object, _Document, xmlNode*)
# class to store element class lookup functions
cdef public class ElementClassLookup [ type LxmlElementClassLookupType,
object LxmlElementClassLookup ]:
u"""ElementClassLookup(self)
Superclass of Element class lookups.
"""
cdef _element_class_lookup_function _lookup_function
def __cinit__(self):
self._lookup_function = NULL # use default lookup
cdef public class FallbackElementClassLookup(ElementClassLookup) \
[ type LxmlFallbackElementClassLookupType,
object LxmlFallbackElementClassLookup ]:
u"""FallbackElementClassLookup(self, fallback=None)
Superclass of Element class lookups with additional fallback.
"""
cdef readonly ElementClassLookup fallback
cdef _element_class_lookup_function _fallback_function
def __cinit__(self):
# fall back to default lookup
self._fallback_function = _lookupDefaultElementClass
def __init__(self, ElementClassLookup fallback=None):
if fallback is not None:
self._setFallback(fallback)
else:
self._fallback_function = _lookupDefaultElementClass
cdef void _setFallback(self, ElementClassLookup lookup):
u"""Sets the fallback scheme for this lookup method.
"""
self.fallback = lookup
self._fallback_function = lookup._lookup_function
if self._fallback_function is NULL:
self._fallback_function = _lookupDefaultElementClass
def set_fallback(self, ElementClassLookup lookup not None):
u"""set_fallback(self, lookup)
Sets the fallback scheme for this lookup method.
"""
self._setFallback(lookup)
cdef inline object _callLookupFallback(FallbackElementClassLookup lookup,
_Document doc, xmlNode* c_node):
return lookup._fallback_function(lookup.fallback, doc, c_node)
################################################################################
# default lookup scheme
cdef class ElementDefaultClassLookup(ElementClassLookup):
u"""ElementDefaultClassLookup(self, element=None, comment=None, pi=None, entity=None)
Element class lookup scheme that always returns the default Element
class.
The keyword arguments ``element``, ``comment``, ``pi`` and ``entity``
accept the respective Element classes.
"""
cdef readonly object element_class
cdef readonly object comment_class
cdef readonly object pi_class
cdef readonly object entity_class
def __cinit__(self):
self._lookup_function = _lookupDefaultElementClass
def __init__(self, element=None, comment=None, pi=None, entity=None):
if element is None:
self.element_class = _Element
elif issubclass(element, ElementBase):
self.element_class = element
else:
raise TypeError, u"element class must be subclass of ElementBase"
if comment is None:
self.comment_class = _Comment
elif issubclass(comment, CommentBase):
self.comment_class = comment
else:
raise TypeError, u"comment class must be subclass of CommentBase"
if entity is None:
self.entity_class = _Entity
elif issubclass(entity, EntityBase):
self.entity_class = entity
else:
raise TypeError, u"Entity class must be subclass of EntityBase"
if pi is None:
self.pi_class = None # special case, see below
elif issubclass(pi, PIBase):
self.pi_class = pi
else:
raise TypeError, u"PI class must be subclass of PIBase"
cdef object _lookupDefaultElementClass(state, _Document _doc, xmlNode* c_node):
u"Trivial class lookup function that always returns the default class."
if c_node.type == tree.XML_ELEMENT_NODE:
if state is not None:
return (<ElementDefaultClassLookup>state).element_class
else:
return _Element
elif c_node.type == tree.XML_COMMENT_NODE:
if state is not None:
return (<ElementDefaultClassLookup>state).comment_class
else:
return _Comment
elif c_node.type == tree.XML_ENTITY_REF_NODE:
if state is not None:
return (<ElementDefaultClassLookup>state).entity_class
else:
return _Entity
elif c_node.type == tree.XML_PI_NODE:
if state is None or (<ElementDefaultClassLookup>state).pi_class is None:
# special case XSLT-PI
if c_node.name is not NULL and c_node.content is not NULL:
if tree.xmlStrcmp(c_node.name, <unsigned char*>"xml-stylesheet") == 0:
if tree.xmlStrstr(c_node.content, <unsigned char*>"text/xsl") is not NULL or \
tree.xmlStrstr(c_node.content, <unsigned char*>"text/xml") is not NULL:
return _XSLTProcessingInstruction
return _ProcessingInstruction
else:
return (<ElementDefaultClassLookup>state).pi_class
else:
assert 0, u"Unknown node type: %s" % c_node.type
################################################################################
# attribute based lookup scheme
cdef class AttributeBasedElementClassLookup(FallbackElementClassLookup):
u"""AttributeBasedElementClassLookup(self, attribute_name, class_mapping, fallback=None)
Checks an attribute of an Element and looks up the value in a
class dictionary.
Arguments:
- attribute name - '{ns}name' style string
- class mapping - Python dict mapping attribute values to Element classes
- fallback - optional fallback lookup mechanism
A None key in the class mapping will be checked if the attribute is
missing.
"""
cdef object _class_mapping
cdef tuple _pytag
cdef const_xmlChar* _c_ns
cdef const_xmlChar* _c_name
def __cinit__(self):
self._lookup_function = _attribute_class_lookup
def __init__(self, attribute_name, class_mapping,
ElementClassLookup fallback=None):
self._pytag = _getNsTag(attribute_name)
ns, name = self._pytag
if ns is None:
self._c_ns = NULL
else:
self._c_ns = _xcstr(ns)
self._c_name = _xcstr(name)
self._class_mapping = dict(class_mapping)
FallbackElementClassLookup.__init__(self, fallback)
cdef object _attribute_class_lookup(state, _Document doc, xmlNode* c_node):
cdef AttributeBasedElementClassLookup lookup
cdef python.PyObject* dict_result
lookup = <AttributeBasedElementClassLookup>state
if c_node.type == tree.XML_ELEMENT_NODE:
value = _attributeValueFromNsName(
c_node, lookup._c_ns, lookup._c_name)
dict_result = python.PyDict_GetItem(lookup._class_mapping, value)
if dict_result is not NULL:
cls = <object>dict_result
_validateNodeClass(c_node, cls)
return cls
return _callLookupFallback(lookup, doc, c_node)
################################################################################
# per-parser lookup scheme
cdef class ParserBasedElementClassLookup(FallbackElementClassLookup):
u"""ParserBasedElementClassLookup(self, fallback=None)
Element class lookup based on the XML parser.
"""
def __cinit__(self):
self._lookup_function = _parser_class_lookup
cdef object _parser_class_lookup(state, _Document doc, xmlNode* c_node):
if doc._parser._class_lookup is not None:
return doc._parser._class_lookup._lookup_function(
doc._parser._class_lookup, doc, c_node)
return _callLookupFallback(<FallbackElementClassLookup>state, doc, c_node)
################################################################################
# custom class lookup based on node type, namespace, name
cdef class CustomElementClassLookup(FallbackElementClassLookup):
u"""CustomElementClassLookup(self, fallback=None)
Element class lookup based on a subclass method.
You can inherit from this class and override the method::
lookup(self, type, doc, namespace, name)
to look up the element class for a node. Arguments of the method:
* type: one of 'element', 'comment', 'PI', 'entity'
* doc: document that the node is in
* namespace: namespace URI of the node (or None for comments/PIs/entities)
* name: name of the element/entity, None for comments, target for PIs
If you return None from this method, the fallback will be called.
"""
def __cinit__(self):
self._lookup_function = _custom_class_lookup
def lookup(self, type, doc, namespace, name):
u"lookup(self, type, doc, namespace, name)"
return None
cdef object _custom_class_lookup(state, _Document doc, xmlNode* c_node):
cdef CustomElementClassLookup lookup
lookup = <CustomElementClassLookup>state
if c_node.type == tree.XML_ELEMENT_NODE:
element_type = u"element"
elif c_node.type == tree.XML_COMMENT_NODE:
element_type = u"comment"
elif c_node.type == tree.XML_PI_NODE:
element_type = u"PI"
elif c_node.type == tree.XML_ENTITY_REF_NODE:
element_type = u"entity"
else:
element_type = u"element"
if c_node.name is NULL:
name = None
else:
name = funicode(c_node.name)
c_str = tree._getNs(c_node)
ns = funicode(c_str) if c_str is not NULL else None
cls = lookup.lookup(element_type, doc, ns, name)
if cls is not None:
_validateNodeClass(c_node, cls)
return cls
return _callLookupFallback(lookup, doc, c_node)
################################################################################
# read-only tree based class lookup
cdef class PythonElementClassLookup(FallbackElementClassLookup):
u"""PythonElementClassLookup(self, fallback=None)
Element class lookup based on a subclass method.
This class lookup scheme allows access to the entire XML tree in
read-only mode. To use it, re-implement the ``lookup(self, doc,
root)`` method in a subclass::
from lxml import etree, pyclasslookup
class MyElementClass(etree.ElementBase):
honkey = True
class MyLookup(pyclasslookup.PythonElementClassLookup):
def lookup(self, doc, root):
if root.tag == "sometag":
return MyElementClass
else:
for child in root:
if child.tag == "someothertag":
return MyElementClass
# delegate to default
return None
If you return None from this method, the fallback will be called.
The first argument is the opaque document instance that contains
the Element. The second argument is a lightweight Element proxy
implementation that is only valid during the lookup. Do not try
to keep a reference to it. Once the lookup is done, the proxy
will be invalid.
Also, you cannot wrap such a read-only Element in an ElementTree,
and you must take care not to keep a reference to them outside of
the `lookup()` method.
Note that the API of the Element objects is not complete. It is
purely read-only and does not support all features of the normal
`lxml.etree` API (such as XPath, extended slicing or some
iteration methods).
See http://codespeak.net/lxml/element_classes.html
"""
def __cinit__(self):
self._lookup_function = _python_class_lookup
def lookup(self, doc, element):
u"""lookup(self, doc, element)
Override this method to implement your own lookup scheme.
"""
return None
cdef object _python_class_lookup(state, _Document doc, tree.xmlNode* c_node):
cdef PythonElementClassLookup lookup
cdef _ReadOnlyElementProxy proxy
lookup = <PythonElementClassLookup>state
proxy = _newReadOnlyProxy(None, c_node)
cls = lookup.lookup(doc, proxy)
_freeReadOnlyProxies(proxy)
if cls is not None:
_validateNodeClass(c_node, cls)
return cls
return _callLookupFallback(lookup, doc, c_node)
################################################################################
# Global setup
cdef _element_class_lookup_function LOOKUP_ELEMENT_CLASS
cdef object ELEMENT_CLASS_LOOKUP_STATE
cdef void _setElementClassLookupFunction(
_element_class_lookup_function function, object state):
global LOOKUP_ELEMENT_CLASS, ELEMENT_CLASS_LOOKUP_STATE
if function is NULL:
state = DEFAULT_ELEMENT_CLASS_LOOKUP
function = DEFAULT_ELEMENT_CLASS_LOOKUP._lookup_function
ELEMENT_CLASS_LOOKUP_STATE = state
LOOKUP_ELEMENT_CLASS = function
def set_element_class_lookup(ElementClassLookup lookup = None):
u"""set_element_class_lookup(lookup = None)
Set the global default element class lookup method.
"""
if lookup is None or lookup._lookup_function is NULL:
_setElementClassLookupFunction(NULL, None)
else:
_setElementClassLookupFunction(lookup._lookup_function, lookup)
# default setup: parser delegation
cdef ParserBasedElementClassLookup DEFAULT_ELEMENT_CLASS_LOOKUP
DEFAULT_ELEMENT_CLASS_LOOKUP = ParserBasedElementClassLookup()
set_element_class_lookup(DEFAULT_ELEMENT_CLASS_LOOKUP)
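
For reference, wiring a custom lookup into a parser from Python might look like this sketch (class and tag names are made up)::

    from lxml import etree

    class HonkElement(etree.ElementBase):
        @property
        def honking(self):
            return self.get('honking') == 'true'

    class MyLookup(etree.CustomElementClassLookup):
        def lookup(self, node_type, document, namespace, name):
            if node_type == 'element' and name == 'honk':
                return HonkElement
            return None   # fall back to the default element class

    parser = etree.XMLParser()
    parser.set_element_class_lookup(MyLookup())
    root = etree.XML(b'<honk honking="true"/>', parser)
    print(root.honking)   # True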

lib/lxml/cleanup.pxi (new file, 210 lines)
@@ -0,0 +1,210 @@
# functions for tree cleanup and removing elements from subtrees
def cleanup_namespaces(tree_or_element):
u"""cleanup_namespaces(tree_or_element)
Remove all namespace declarations from a subtree that are not used
by any of the elements or attributes in that tree.
"""
cdef _Element element
element = _rootNodeOrRaise(tree_or_element)
_removeUnusedNamespaceDeclarations(element._c_node)
def strip_attributes(tree_or_element, *attribute_names):
u"""strip_attributes(tree_or_element, *attribute_names)
Delete all attributes with the provided attribute names from an
Element (or ElementTree) and its descendants.
Attribute names can contain wildcards as in `_Element.iter`.
Example usage::
strip_attributes(root_element,
'simpleattr',
'{http://some/ns}attrname',
'{http://other/ns}*')
"""
cdef _MultiTagMatcher matcher
cdef _Element element
element = _rootNodeOrRaise(tree_or_element)
if not attribute_names:
return
matcher = _MultiTagMatcher(attribute_names)
matcher.cacheTags(element._doc)
if matcher.rejectsAllAttributes():
return
_strip_attributes(element._c_node, matcher)
cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher):
cdef xmlAttr* c_attr
cdef xmlAttr* c_next_attr
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
if c_node.type == tree.XML_ELEMENT_NODE:
c_attr = c_node.properties
while c_attr is not NULL:
c_next_attr = c_attr.next
if matcher.matchesAttribute(c_attr):
tree.xmlRemoveProp(c_attr)
c_attr = c_next_attr
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
u"""strip_elements(tree_or_element, *tag_names, with_tail=True)
Delete all elements with the provided tag names from a tree or
subtree. This will remove the elements and their entire subtree,
including all their attributes, text content and descendants. It
will also remove the tail text of the element unless you
explicitly set the ``with_tail`` keyword argument option to False.
Tag names can contain wildcards as in `_Element.iter`.
Note that this will not delete the element (or ElementTree root
element) that you passed even if it matches. It will only treat
its descendants. If you want to include the root element, check
its tag name directly before even calling this function.
Example usage::
strip_elements(some_element,
'simpletagname', # non-namespaced tag
'{http://some/ns}tagname', # namespaced tag
'{http://some/other/ns}*', # any tag from a namespace
lxml.etree.Comment # comments
)
"""
cdef _MultiTagMatcher matcher
cdef _Element element
cdef _Document doc
cdef list ns_tags
cdef qname* c_ns_tags
cdef Py_ssize_t c_tag_count
cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0
doc = _documentOrRaise(tree_or_element)
element = _rootNodeOrRaise(tree_or_element)
if not tag_names:
return
matcher = _MultiTagMatcher(tag_names)
matcher.cacheTags(doc)
if matcher.rejectsAll():
return
if isinstance(tree_or_element, _ElementTree):
# include PIs and comments next to the root node
if matcher.matchesType(tree.XML_COMMENT_NODE):
_removeSiblings(element._c_node, tree.XML_COMMENT_NODE, with_tail)
if matcher.matchesType(tree.XML_PI_NODE):
_removeSiblings(element._c_node, tree.XML_PI_NODE, with_tail)
_strip_elements(doc, element._c_node, matcher, with_tail)
cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher,
bint with_tail):
cdef xmlNode* c_child
cdef xmlNode* c_next
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
if c_node.type == tree.XML_ELEMENT_NODE:
# we run through the children here to prevent any problems
# with the tree iteration which would occur if we unlinked the
# c_node itself
c_child = _findChildForwards(c_node, 0)
while c_child is not NULL:
c_next = _nextElement(c_child)
if matcher.matches(c_child):
if c_child.type == tree.XML_ELEMENT_NODE:
if not with_tail:
tree.xmlUnlinkNode(c_child)
_removeNode(doc, c_child)
else:
if with_tail:
_removeText(c_child.next)
tree.xmlUnlinkNode(c_child)
attemptDeallocation(c_child)
c_child = c_next
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
def strip_tags(tree_or_element, *tag_names):
u"""strip_tags(tree_or_element, *tag_names)
Delete all elements with the provided tag names from a tree or
subtree. This will remove the elements and their attributes, but
*not* their text/tail content or descendants. Instead, it will
merge the text content and children of the element into its
parent.
Tag names can contain wildcards as in `_Element.iter`.
Note that this will not delete the element (or ElementTree root
element) that you passed even if it matches. It will only treat
its descendants.
Example usage::
strip_tags(some_element,
'simpletagname', # non-namespaced tag
'{http://some/ns}tagname', # namespaced tag
'{http://some/other/ns}*', # any tag from a namespace
Comment # comments (including their text!)
)
"""
cdef _MultiTagMatcher matcher
cdef _Element element
cdef _Document doc
cdef list ns_tags
cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0
cdef char** c_ns_tags
cdef Py_ssize_t c_tag_count
doc = _documentOrRaise(tree_or_element)
element = _rootNodeOrRaise(tree_or_element)
if not tag_names:
return
matcher = _MultiTagMatcher(tag_names)
matcher.cacheTags(doc)
if matcher.rejectsAll():
return
if isinstance(tree_or_element, _ElementTree):
# include PIs and comments next to the root node
if matcher.matchesType(tree.XML_COMMENT_NODE):
_removeSiblings(element._c_node, tree.XML_COMMENT_NODE, 0)
if matcher.matchesType(tree.XML_PI_NODE):
_removeSiblings(element._c_node, tree.XML_PI_NODE, 0)
_strip_tags(doc, element._c_node, matcher)
cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher):
cdef xmlNode* c_child
cdef xmlNode* c_next
cdef Py_ssize_t i
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
if c_node.type == tree.XML_ELEMENT_NODE:
# we run through the children here to prevent any problems
# with the tree iteration which would occur if we unlinked the
# c_node itself
c_child = _findChildForwards(c_node, 0)
while c_child is not NULL:
if not matcher.matches(c_child):
c_child = _nextElement(c_child)
continue
if c_child.type == tree.XML_ELEMENT_NODE:
c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)
_replaceNodeByChildren(doc, c_child)
if not attemptDeallocation(c_child):
if c_child.nsDef is not NULL:
# make namespaces absolute
moveNodeToDocument(doc, doc._c_doc, c_child)
c_child = c_next
else:
c_next = _nextElement(c_child)
tree.xmlUnlinkNode(c_child)
attemptDeallocation(c_child)
c_child = c_next
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
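
A quick sketch of the three cleanup helpers above on a small document::

    from lxml import etree

    root = etree.XML(
        '<a xmlns:x="urn:x"><b keep="1" drop="2"><c>text</c></b></a>')
    etree.strip_attributes(root, 'drop')
    etree.strip_tags(root, 'c')        # merges the text of <c> into <b>
    etree.cleanup_namespaces(root)     # drops the unused urn:x declaration
    print(etree.tostring(root))        # b'<a><b keep="1">text</b></a>'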

lib/lxml/cssselect.py (new file, 103 lines)
@@ -0,0 +1,103 @@
"""CSS Selectors based on XPath.
This module supports selecting XML/HTML tags based on CSS selectors.
See the `CSSSelector` class for details.
This is a thin wrapper around cssselect 0.7 or later.
"""
import sys
from lxml import etree
## Work-around the lack of absolute import in Python 2.4
#from __future__ import absolute_import
#from cssselect import ...
try:
external_cssselect = __import__('cssselect')
except ImportError:
raise ImportError('cssselect seems not to be installed. '
'See http://packages.python.org/cssselect/')
SelectorSyntaxError = external_cssselect.SelectorSyntaxError
ExpressionError = external_cssselect.ExpressionError
SelectorError = external_cssselect.SelectorError
__all__ = ['SelectorSyntaxError', 'ExpressionError', 'SelectorError',
'CSSSelector']
class LxmlTranslator(external_cssselect.GenericTranslator):
"""
A custom CSS selector to XPath translator with lxml-specific extensions.
"""
def xpath_contains_function(self, xpath, function):
# Defined there, removed in later drafts:
# http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
if function.argument_types() not in (['STRING'], ['IDENT']):
raise ExpressionError(
"Expected a single string or ident for :contains(), got %r"
% function.arguments)
value = function.arguments[0].value
return xpath.add_condition(
'contains(__lxml_internal_css:lower-case(string(.)), %s)'
% self.xpath_literal(value.lower()))
class LxmlHTMLTranslator(LxmlTranslator, external_cssselect.HTMLTranslator):
"""
lxml extensions + HTML support.
"""
def _make_lower_case(context, s):
return s.lower()
ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
ns.prefix = '__lxml_internal_css'
ns['lower-case'] = _make_lower_case
class CSSSelector(etree.XPath):
"""A CSS selector.
Usage::
>>> from lxml import etree, cssselect
>>> select = cssselect.CSSSelector("a tag > child")
>>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
>>> [ el.tag for el in select(root) ]
['child']
To use CSS namespaces, you need to pass a prefix-to-namespace
mapping as ``namespaces`` keyword argument::
>>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
>>> select_ns = cssselect.CSSSelector('root > rdf|Description',
... namespaces={'rdf': rdfns})
>>> rdf = etree.XML((
... '<root xmlns:rdf="%s">'
... '<rdf:Description>blah</rdf:Description>'
... '</root>') % rdfns)
>>> [(el.tag, el.text) for el in select_ns(rdf)]
[('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')]
"""
def __init__(self, css, namespaces=None, translator='xml'):
if translator == 'xml':
translator = LxmlTranslator()
elif translator == 'html':
translator = LxmlHTMLTranslator()
elif translator == 'xhtml':
translator = LxmlHTMLTranslator(xhtml=True)
path = translator.css_to_xpath(css)
etree.XPath.__init__(self, path, namespaces=namespaces)
self.css = css
def __repr__(self):
return '<%s %s for %r>' % (
self.__class__.__name__,
hex(abs(id(self)))[2:],
self.css)
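
The ``translator`` argument selects the CSS dialect, e.g. HTML-specific selector semantics; a short sketch::

    from lxml import etree
    from lxml.cssselect import CSSSelector

    select = CSSSelector('div.content', translator='html')
    root = etree.XML('<div class="content extra">hi</div>')
    # the HTML translator matches 'content' as one of the space-separated
    # class names
    print([el.text for el in select(root)])   # ['hi']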

lib/lxml/cvarargs.pxd (new file, 8 lines)
@@ -0,0 +1,8 @@
cdef extern from "stdarg.h":
ctypedef void *va_list
void va_start(va_list ap, void *last) nogil
void va_end(va_list ap) nogil
cdef extern from "etree_defs.h":
cdef int va_int(va_list ap) nogil
cdef char *va_charptr(va_list ap) nogil

lib/lxml/debug.pxi (new file, 91 lines)
@@ -0,0 +1,91 @@
@cython.final
@cython.internal
cdef class _MemDebug:
"""Debugging support for the memory allocation in libxml2.
"""
def bytes_used(self):
"""bytes_used(self)
Returns the total amount of memory (in bytes) currently used by libxml2.
Note that libxml2 constrains this value to a C int, which limits
the accuracy on 64 bit systems.
"""
return tree.xmlMemUsed()
def blocks_used(self):
"""blocks_used(self)
Returns the total number of memory blocks currently allocated by libxml2.
Note that libxml2 constrains this value to a C int, which limits
the accuracy on 64 bit systems.
"""
return tree.xmlMemBlocks()
def dict_size(self):
"""dict_size(self)
Returns the current size of the global name dictionary used by libxml2
for the current thread. Each thread has its own dictionary.
"""
c_dict = __GLOBAL_PARSER_CONTEXT._getThreadDict(NULL)
if c_dict is NULL:
raise MemoryError()
return tree.xmlDictSize(c_dict)
def dump(self, output_file=None, byte_count=None):
"""dump(self, output_file=None, byte_count=None)
Dumps the current memory blocks allocated by libxml2 to a file.
The optional parameter 'output_file' specifies the file path. It defaults
to the file ".memorylist" in the current directory.
The optional parameter 'byte_count' limits the number of bytes in the dump.
Note that this parameter is ignored when lxml is compiled against a libxml2
version before 2.7.0.
"""
cdef Py_ssize_t c_count
if output_file is None:
output_file = b'.memorylist'
elif isinstance(output_file, unicode):
output_file = output_file.encode(sys.getfilesystemencoding())
f = stdio.fopen(output_file, "w")
if f is NULL:
raise IOError("Failed to create file %s" % output_file.decode(sys.getfilesystemencoding()))
try:
if byte_count is None:
tree.xmlMemDisplay(f)
else:
c_count = byte_count
tree.xmlMemDisplayLast(f, c_count)
finally:
stdio.fclose(f)
def show(self, output_file=None, block_count=None):
"""show(self, output_file=None, block_count=None)
Dumps the current memory blocks allocated by libxml2 to a file.
The output file format is suitable for line diffing.
The optional parameter 'output_file' specifies the file path. It defaults
to the file ".memorydump" in the current directory.
The optional parameter 'block_count' limits the number of blocks
in the dump.
"""
if output_file is None:
output_file = b'.memorydump'
elif isinstance(output_file, unicode):
output_file = output_file.encode(sys.getfilesystemencoding())
f = stdio.fopen(output_file, "w")
if f is NULL:
raise IOError("Failed to create file %s" % output_file.decode(sys.getfilesystemencoding()))
try:
tree.xmlMemShow(f, block_count if block_count is not None else tree.xmlMemBlocks())
finally:
stdio.fclose(f)
memory_debugger = _MemDebug()
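
This singleton is reachable as ``lxml.etree.memory_debugger`` in builds of this vintage; note the counters are only meaningful when libxml2 was compiled with memory debugging enabled. A sketch::

    from lxml import etree

    before = etree.memory_debugger.bytes_used()
    doc = etree.XML('<root><child/></root>')
    # the difference approximates the bytes libxml2 allocated for the tree
    print(etree.memory_debugger.bytes_used() - before)
    print(etree.memory_debugger.blocks_used())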

lib/lxml/docloader.pxi (new file, 175 lines)
@@ -0,0 +1,175 @@
# Custom resolver API
ctypedef enum _InputDocumentDataType:
PARSER_DATA_INVALID
PARSER_DATA_EMPTY
PARSER_DATA_STRING
PARSER_DATA_FILENAME
PARSER_DATA_FILE
@cython.final
@cython.internal
cdef class _InputDocument:
cdef _InputDocumentDataType _type
cdef bytes _data_bytes
cdef object _filename
cdef object _file
cdef bint _close_file
def __cinit__(self):
self._type = PARSER_DATA_INVALID
cdef class Resolver:
u"This is the base class of all resolvers."
def resolve(self, system_url, public_id, context):
u"""resolve(self, system_url, public_id, context)
Override this method to resolve an external source by
``system_url`` and ``public_id``. The third argument is an
opaque context object.
Return the result of one of the ``resolve_*()`` methods.
"""
return None
def resolve_empty(self, context):
u"""resolve_empty(self, context)
Return an empty input document.
Pass context as parameter.
"""
cdef _InputDocument doc_ref
doc_ref = _InputDocument()
doc_ref._type = PARSER_DATA_EMPTY
return doc_ref
def resolve_string(self, string, context, *, base_url=None):
u"""resolve_string(self, string, context, base_url=None)
Return a parsable string as input document.
Pass data string and context as parameters. You can pass the
source URL or filename through the ``base_url`` keyword
argument.
"""
cdef _InputDocument doc_ref
if isinstance(string, unicode):
string = (<unicode>string).encode('utf8')
elif not isinstance(string, bytes):
raise TypeError, "argument must be a byte string or unicode string"
doc_ref = _InputDocument()
doc_ref._type = PARSER_DATA_STRING
doc_ref._data_bytes = string
if base_url is not None:
doc_ref._filename = _encodeFilename(base_url)
return doc_ref
def resolve_filename(self, filename, context):
u"""resolve_filename(self, filename, context)
Return the name of a parsable file as input document.
Pass filename and context as parameters. You can also pass a
URL with an HTTP, FTP or file target.
"""
cdef _InputDocument doc_ref
doc_ref = _InputDocument()
doc_ref._type = PARSER_DATA_FILENAME
doc_ref._filename = _encodeFilename(filename)
return doc_ref
def resolve_file(self, f, context, *, base_url=None, bint close=True):
u"""resolve_file(self, f, context, base_url=None, close=True)
Return an open file-like object as input document.
Pass open file and context as parameters. You can pass the
base URL or filename of the file through the ``base_url``
keyword argument. If the ``close`` flag is True (the
default), the file will be closed after reading.
Note that using ``.resolve_filename()`` is more efficient,
especially in threaded environments.
"""
cdef _InputDocument doc_ref
try:
f.read
except AttributeError:
raise TypeError, u"Argument is not a file-like object"
doc_ref = _InputDocument()
doc_ref._type = PARSER_DATA_FILE
if base_url is not None:
doc_ref._filename = _encodeFilename(base_url)
else:
doc_ref._filename = _getFilenameForFile(f)
doc_ref._close_file = close
doc_ref._file = f
return doc_ref
@cython.final
@cython.internal
cdef class _ResolverRegistry:
cdef object _resolvers
cdef Resolver _default_resolver
def __cinit__(self, Resolver default_resolver=None):
self._resolvers = set()
self._default_resolver = default_resolver
def add(self, Resolver resolver not None):
u"""add(self, resolver)
Register a resolver.
For each requested entity, the 'resolve' method of the resolver will
be called and the result will be passed to the parser. If this method
returns None, the request will be delegated to other resolvers or the
default resolver. The resolvers will be tested in an arbitrary order
until the first match is found.
"""
self._resolvers.add(resolver)
def remove(self, resolver):
u"remove(self, resolver)"
self._resolvers.discard(resolver)
cdef _ResolverRegistry _copy(self):
cdef _ResolverRegistry registry
registry = _ResolverRegistry(self._default_resolver)
registry._resolvers = self._resolvers.copy()
return registry
def copy(self):
u"copy(self)"
return self._copy()
def resolve(self, system_url, public_id, context):
u"resolve(self, system_url, public_id, context)"
for resolver in self._resolvers:
result = resolver.resolve(system_url, public_id, context)
if result is not None:
return result
if self._default_resolver is None:
return None
return self._default_resolver.resolve(system_url, public_id, context)
def __repr__(self):
return repr(self._resolvers)
@cython.internal
cdef class _ResolverContext(_ExceptionContext):
cdef _ResolverRegistry _resolvers
cdef _TempStore _storage
cdef void clear(self):
_ExceptionContext.clear(self)
self._storage.clear()
cdef _initResolverContext(_ResolverContext context,
_ResolverRegistry resolvers):
if resolvers is None:
context._resolvers = _ResolverRegistry()
else:
context._resolvers = resolvers
context._storage = _TempStore()
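
A sketch of a custom resolver registered on a parser; the URL and the canned DTD are made up::

    from lxml import etree

    class DTDResolver(etree.Resolver):
        def resolve(self, system_url, public_id, context):
            if system_url == 'http://example.com/doc.dtd':
                # serve a canned DTD instead of fetching the URL
                return self.resolve_string('<!ELEMENT root EMPTY>', context)
            return None   # delegate to other resolvers

    parser = etree.XMLParser(load_dtd=True)
    parser.resolvers.add(DTDResolver())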

lib/lxml/doctestcompare.py (new file, 505 lines)
@@ -0,0 +1,505 @@
"""
lxml-based doctest output comparison.
Note: normally, you should just import the `lxml.usedoctest` and
`lxml.html.usedoctest` modules from within a doctest, instead of this
one::
>>> import lxml.usedoctest # for XML output
>>> import lxml.html.usedoctest # for HTML output
To use this module directly, you must call ``lxml.doctestcompare.install()``,
which will cause doctest to use this checker in all subsequent calls.
This changes the way output is checked and comparisons are made for
XML or HTML-like content.
XML or HTML content is noticed because the example starts with ``<``
(it's HTML if it starts with ``<html``). You can also use the
``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing.
Some rough wildcard-like things are allowed. Whitespace is generally
ignored (except in attributes). In text (attributes and text in the
body) you can use ``...`` as a wildcard. In an example it also
matches any trailing tags in the element, though it does not match
leading tags. You may create a tag ``<any>`` or include an ``any``
attribute in the tag. An ``any`` tag matches any tag, while the
attribute matches any and all attributes.
When a match fails, the reformatted example and the actual output are
displayed (indented), and a rough diff-like output is given. Anything
marked with ``-`` is in the output but wasn't supposed to be, and
similarly ``+`` means it's in the example but wasn't in the output.
You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP``
"""
from lxml import etree
import sys
import re
import doctest
import cgi
__all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker',
'LHTMLOutputChecker', 'install', 'temp_install']
try:
_basestring = basestring
except NameError:
_basestring = (str, bytes)
_IS_PYTHON_3 = sys.version_info[0] >= 3
PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
PARSE_XML = doctest.register_optionflag('PARSE_XML')
NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP')
OutputChecker = doctest.OutputChecker
def strip(v):
if v is None:
return None
else:
return v.strip()
def norm_whitespace(v):
return _norm_whitespace_re.sub(' ', v)
_html_parser = etree.HTMLParser(recover=False, remove_blank_text=True)
def html_fromstring(html):
return etree.fromstring(html, _html_parser)
# We use this to distinguish repr()s from elements:
_repr_re = re.compile(r'^<[^>]+ (at|object) ')
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')
class LXMLOutputChecker(OutputChecker):
empty_tags = (
'param', 'img', 'area', 'br', 'basefont', 'input',
'base', 'meta', 'link', 'col')
def get_default_parser(self):
return etree.XML
def check_output(self, want, got, optionflags):
alt_self = getattr(self, '_temp_override_self', None)
if alt_self is not None:
super_method = self._temp_call_super_check_output
self = alt_self
else:
super_method = OutputChecker.check_output
parser = self.get_parser(want, got, optionflags)
if not parser:
return super_method(
self, want, got, optionflags)
try:
want_doc = parser(want)
except etree.XMLSyntaxError:
return False
try:
got_doc = parser(got)
except etree.XMLSyntaxError:
return False
return self.compare_docs(want_doc, got_doc)
def get_parser(self, want, got, optionflags):
parser = None
if NOPARSE_MARKUP & optionflags:
return None
if PARSE_HTML & optionflags:
parser = html_fromstring
elif PARSE_XML & optionflags:
parser = etree.XML
elif (want.strip().lower().startswith('<html')
and got.strip().startswith('<html')):
parser = html_fromstring
elif (self._looks_like_markup(want)
and self._looks_like_markup(got)):
parser = self.get_default_parser()
return parser
def _looks_like_markup(self, s):
s = s.strip()
return (s.startswith('<')
and not _repr_re.search(s))
def compare_docs(self, want, got):
if not self.tag_compare(want.tag, got.tag):
return False
if not self.text_compare(want.text, got.text, True):
return False
if not self.text_compare(want.tail, got.tail, True):
return False
if 'any' not in want.attrib:
want_keys = sorted(want.attrib.keys())
got_keys = sorted(got.attrib.keys())
if want_keys != got_keys:
return False
for key in want_keys:
if not self.text_compare(want.attrib[key], got.attrib[key], False):
return False
if want.text != '...' or len(want):
want_children = list(want)
got_children = list(got)
while want_children or got_children:
if not want_children or not got_children:
return False
want_first = want_children.pop(0)
got_first = got_children.pop(0)
if not self.compare_docs(want_first, got_first):
return False
if not got_children and want_first.tail == '...':
break
return True
def text_compare(self, want, got, strip):
want = want or ''
got = got or ''
if strip:
want = norm_whitespace(want).strip()
got = norm_whitespace(got).strip()
want = '^%s$' % re.escape(want)
want = want.replace(r'\.\.\.', '.*')
if re.search(want, got):
return True
else:
return False
def tag_compare(self, want, got):
if want == 'any':
return True
if (not isinstance(want, _basestring)
or not isinstance(got, _basestring)):
return want == got
want = want or ''
got = got or ''
if want.startswith('{...}'):
# Ellipsis on the namespace
return want.split('}')[-1] == got.split('}')[-1]
else:
return want == got
def output_difference(self, example, got, optionflags):
want = example.want
parser = self.get_parser(want, got, optionflags)
errors = []
if parser is not None:
try:
want_doc = parser(want)
except etree.XMLSyntaxError:
e = sys.exc_info()[1]
errors.append('In example: %s' % e)
try:
got_doc = parser(got)
except etree.XMLSyntaxError:
e = sys.exc_info()[1]
errors.append('In actual output: %s' % e)
if parser is None or errors:
value = OutputChecker.output_difference(
self, example, got, optionflags)
if errors:
errors.append(value)
return '\n'.join(errors)
else:
return value
html = parser is html_fromstring
diff_parts = []
diff_parts.append('Expected:')
diff_parts.append(self.format_doc(want_doc, html, 2))
diff_parts.append('Got:')
diff_parts.append(self.format_doc(got_doc, html, 2))
diff_parts.append('Diff:')
diff_parts.append(self.collect_diff(want_doc, got_doc, html, 2))
return '\n'.join(diff_parts)
def html_empty_tag(self, el, html=True):
if not html:
return False
if el.tag not in self.empty_tags:
return False
if el.text or len(el):
# This shouldn't happen (contents in an empty tag)
return False
return True
def format_doc(self, doc, html, indent, prefix=''):
parts = []
if not len(doc):
# No children...
parts.append(' '*indent)
parts.append(prefix)
parts.append(self.format_tag(doc))
if not self.html_empty_tag(doc, html):
if strip(doc.text):
parts.append(self.format_text(doc.text))
parts.append(self.format_end_tag(doc))
if strip(doc.tail):
parts.append(self.format_text(doc.tail))
parts.append('\n')
return ''.join(parts)
parts.append(' '*indent)
parts.append(prefix)
parts.append(self.format_tag(doc))
if not self.html_empty_tag(doc, html):
parts.append('\n')
if strip(doc.text):
parts.append(' '*indent)
parts.append(self.format_text(doc.text))
parts.append('\n')
for el in doc:
parts.append(self.format_doc(el, html, indent+2))
parts.append(' '*indent)
parts.append(self.format_end_tag(doc))
parts.append('\n')
if strip(doc.tail):
parts.append(' '*indent)
parts.append(self.format_text(doc.tail))
parts.append('\n')
return ''.join(parts)
def format_text(self, text, strip=True):
if text is None:
return ''
if strip:
text = text.strip()
return cgi.escape(text, 1)
def format_tag(self, el):
attrs = []
if isinstance(el, etree.CommentBase):
# FIXME: probably PIs should be handled specially too?
return '<!--'
for name, value in sorted(el.attrib.items()):
attrs.append('%s="%s"' % (name, self.format_text(value, False)))
if not attrs:
return '<%s>' % el.tag
return '<%s %s>' % (el.tag, ' '.join(attrs))
def format_end_tag(self, el):
if isinstance(el, etree.CommentBase):
# FIXME: probably PIs should be handled specially too?
return '-->'
return '</%s>' % el.tag
def collect_diff(self, want, got, html, indent):
parts = []
if not len(want) and not len(got):
parts.append(' '*indent)
parts.append(self.collect_diff_tag(want, got))
if not self.html_empty_tag(got, html):
parts.append(self.collect_diff_text(want.text, got.text))
parts.append(self.collect_diff_end_tag(want, got))
parts.append(self.collect_diff_text(want.tail, got.tail))
parts.append('\n')
return ''.join(parts)
parts.append(' '*indent)
parts.append(self.collect_diff_tag(want, got))
parts.append('\n')
if strip(want.text) or strip(got.text):
parts.append(' '*indent)
parts.append(self.collect_diff_text(want.text, got.text))
parts.append('\n')
want_children = list(want)
got_children = list(got)
while want_children or got_children:
if not want_children:
parts.append(self.format_doc(got_children.pop(0), html, indent+2, '-'))
continue
if not got_children:
parts.append(self.format_doc(want_children.pop(0), html, indent+2, '+'))
continue
parts.append(self.collect_diff(
want_children.pop(0), got_children.pop(0), html, indent+2))
parts.append(' '*indent)
parts.append(self.collect_diff_end_tag(want, got))
parts.append('\n')
if strip(want.tail) or strip(got.tail):
parts.append(' '*indent)
parts.append(self.collect_diff_text(want.tail, got.tail))
parts.append('\n')
return ''.join(parts)
def collect_diff_tag(self, want, got):
if not self.tag_compare(want.tag, got.tag):
tag = '%s (got: %s)' % (want.tag, got.tag)
else:
tag = got.tag
attrs = []
any = want.tag == 'any' or 'any' in want.attrib
for name, value in sorted(got.attrib.items()):
if name not in want.attrib and not any:
attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
else:
if name in want.attrib:
text = self.collect_diff_text(want.attrib[name], value, False)
else:
text = self.format_text(value, False)
attrs.append('%s="%s"' % (name, text))
if not any:
for name, value in sorted(want.attrib.items()):
if name in got.attrib:
continue
attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
if attrs:
tag = '<%s %s>' % (tag, ' '.join(attrs))
else:
tag = '<%s>' % tag
return tag
def collect_diff_end_tag(self, want, got):
if want.tag != got.tag:
tag = '%s (got: %s)' % (want.tag, got.tag)
else:
tag = got.tag
return '</%s>' % tag
def collect_diff_text(self, want, got, strip=True):
if self.text_compare(want, got, strip):
if not got:
return ''
return self.format_text(got, strip)
text = '%s (got: %s)' % (want, got)
return self.format_text(text, strip)
class LHTMLOutputChecker(LXMLOutputChecker):
def get_default_parser(self):
return html_fromstring
def install(html=False):
"""
Install doctestcompare for all future doctests.
If html is true, then by default the HTML parser will be used;
otherwise the XML parser is used.
"""
if html:
doctest.OutputChecker = LHTMLOutputChecker
else:
doctest.OutputChecker = LXMLOutputChecker
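# Example (sketch): enable XML-aware output checking for a whole test run.
#
#     import doctest
#     import lxml.doctestcompare
#
#     lxml.doctestcompare.install()  # newly created checkers are XML-aware
#     doctest.testmod()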
def temp_install(html=False, del_module=None):
"""
Use this *inside* a doctest to enable this checker for this
doctest only.
If html is true, then by default the HTML parser will be used;
otherwise the XML parser is used.
"""
if html:
Checker = LHTMLOutputChecker
else:
Checker = LXMLOutputChecker
frame = _find_doctest_frame()
dt_self = frame.f_locals['self']
checker = Checker()
old_checker = dt_self._checker
dt_self._checker = checker
# The unfortunate thing is that there is a local variable 'check'
# in the function that runs the doctests, that is a bound method
# into the output checker. We have to update that. We can't
# modify the frame, so we have to modify the object in place. The
# only way to do this is to actually change the func_code
# attribute of the method. We change it, and then wait for
# __record_outcome to be run, which signals the end of the __run
# method, at which point we restore the previous check_output
# implementation.
if _IS_PYTHON_3:
check_func = frame.f_locals['check'].__func__
checker_check_func = checker.check_output.__func__
else:
check_func = frame.f_locals['check'].im_func
checker_check_func = checker.check_output.im_func
# Because we can't patch up func_globals, this is the only global
# in check_output that we care about:
doctest.etree = etree
_RestoreChecker(dt_self, old_checker, checker,
check_func, checker_check_func,
del_module)
class _RestoreChecker(object):
def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
del_module):
self.dt_self = dt_self
self.checker = old_checker
self.checker._temp_call_super_check_output = self.call_super
self.checker._temp_override_self = new_checker
self.check_func = check_func
self.clone_func = clone_func
self.del_module = del_module
self.install_clone()
self.install_dt_self()
def install_clone(self):
if _IS_PYTHON_3:
self.func_code = self.check_func.__code__
self.func_globals = self.check_func.__globals__
self.check_func.__code__ = self.clone_func.__code__
else:
self.func_code = self.check_func.func_code
self.func_globals = self.check_func.func_globals
self.check_func.func_code = self.clone_func.func_code
def uninstall_clone(self):
if _IS_PYTHON_3:
self.check_func.__code__ = self.func_code
else:
self.check_func.func_code = self.func_code
def install_dt_self(self):
self.prev_func = self.dt_self._DocTestRunner__record_outcome
self.dt_self._DocTestRunner__record_outcome = self
def uninstall_dt_self(self):
self.dt_self._DocTestRunner__record_outcome = self.prev_func
def uninstall_module(self):
if self.del_module:
import sys
del sys.modules[self.del_module]
if '.' in self.del_module:
package, module = self.del_module.rsplit('.', 1)
package_mod = sys.modules[package]
delattr(package_mod, module)
def __call__(self, *args, **kw):
self.uninstall_clone()
self.uninstall_dt_self()
del self.checker._temp_override_self
del self.checker._temp_call_super_check_output
result = self.prev_func(*args, **kw)
self.uninstall_module()
return result
def call_super(self, *args, **kw):
self.uninstall_clone()
try:
return self.check_func(*args, **kw)
finally:
self.install_clone()
def _find_doctest_frame():
import sys
frame = sys._getframe(1)
while frame:
l = frame.f_locals
if 'BOOM' in l:
# Sign of doctest
return frame
frame = frame.f_back
raise LookupError(
"Could not find doctest (only use this function *inside* a doctest)")
__test__ = {
'basic': '''
>>> temp_install()
>>> print """<xml a="1" b="2">stuff</xml>"""
<xml b="2" a="1">...</xml>
>>> print """<xml xmlns="http://example.com"><tag attr="bar" /></xml>"""
<xml xmlns="...">
<tag attr="..." />
</xml>
>>> print """<xml>blahblahblah<foo /></xml>""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS
<xml>...foo /></xml>
'''}
if __name__ == '__main__':
import doctest
doctest.testmod()

468
lib/lxml/dtd.pxi Normal file

@ -0,0 +1,468 @@
# support for DTD validation
from lxml.includes cimport dtdvalid
class DTDError(LxmlError):
u"""Base class for DTD errors.
"""
pass
class DTDParseError(DTDError):
u"""Error while parsing a DTD.
"""
pass
class DTDValidateError(DTDError):
u"""Error while validating an XML document with a DTD.
"""
pass
cdef inline int _assertValidDTDNode(node, void *c_node) except -1:
assert c_node is not NULL, u"invalid DTD proxy at %s" % id(node)
@cython.final
@cython.internal
@cython.freelist(8)
cdef class _DTDElementContentDecl:
cdef DTD _dtd
cdef tree.xmlElementContent* _c_node
def __repr__(self):
return "<%s.%s object name=%r type=%r occur=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.type, self.occur, id(self))
property name:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
return funicode(self._c_node.name) if self._c_node.name is not NULL else None
property type:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
cdef int type = self._c_node.type
if type == tree.XML_ELEMENT_CONTENT_PCDATA:
return "pcdata"
elif type == tree.XML_ELEMENT_CONTENT_ELEMENT:
return "element"
elif type == tree.XML_ELEMENT_CONTENT_SEQ:
return "seq"
elif type == tree.XML_ELEMENT_CONTENT_OR:
return "or"
else:
return None
property occur:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
cdef int occur = self._c_node.ocur
if occur == tree.XML_ELEMENT_CONTENT_ONCE:
return "once"
elif occur == tree.XML_ELEMENT_CONTENT_OPT:
return "opt"
elif occur == tree.XML_ELEMENT_CONTENT_MULT:
return "mult"
elif occur == tree.XML_ELEMENT_CONTENT_PLUS:
return "plus"
else:
return None
property left:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
c1 = self._c_node.c1
if c1:
node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
node._dtd = self._dtd
node._c_node = <tree.xmlElementContent*>c1
return node
else:
return None
property right:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
c2 = self._c_node.c2
if c2:
node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
node._dtd = self._dtd
node._c_node = <tree.xmlElementContent*>c2
return node
else:
return None
@cython.final
@cython.internal
@cython.freelist(8)
cdef class _DTDAttributeDecl:
cdef DTD _dtd
cdef tree.xmlAttribute* _c_node
def __repr__(self):
return "<%s.%s object name=%r elemname=%r prefix=%r type=%r default=%r default_value=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.elemname, self.prefix, self.type, self.default, self.default_value, id(self))
property name:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
return funicode(self._c_node.name) if self._c_node.name is not NULL else None
property elemname:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
return funicode(self._c_node.elem) if self._c_node.elem is not NULL else None
property prefix:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
return funicode(self._c_node.prefix) if self._c_node.prefix is not NULL else None
property type:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
cdef int type = self._c_node.atype
if type == tree.XML_ATTRIBUTE_CDATA:
return "cdata"
elif type == tree.XML_ATTRIBUTE_ID:
return "id"
elif type == tree.XML_ATTRIBUTE_IDREF:
return "idref"
elif type == tree.XML_ATTRIBUTE_IDREFS:
return "idrefs"
elif type == tree.XML_ATTRIBUTE_ENTITY:
return "entity"
elif type == tree.XML_ATTRIBUTE_ENTITIES:
return "entities"
elif type == tree.XML_ATTRIBUTE_NMTOKEN:
return "nmtoken"
elif type == tree.XML_ATTRIBUTE_NMTOKENS:
return "nmtokens"
elif type == tree.XML_ATTRIBUTE_ENUMERATION:
return "enumeration"
elif type == tree.XML_ATTRIBUTE_NOTATION:
return "notation"
else:
return None
property default:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
cdef int default = self._c_node.def_
if default == tree.XML_ATTRIBUTE_NONE:
return "none"
elif default == tree.XML_ATTRIBUTE_REQUIRED:
return "required"
elif default == tree.XML_ATTRIBUTE_IMPLIED:
return "implied"
elif default == tree.XML_ATTRIBUTE_FIXED:
return "fixed"
else:
return None
property default_value:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
return funicode(self._c_node.defaultValue) if self._c_node.defaultValue is not NULL else None
def itervalues(self):
_assertValidDTDNode(self, self._c_node)
cdef tree.xmlEnumeration *c_node = self._c_node.tree
while c_node is not NULL:
yield funicode(c_node.name)
c_node = c_node.next
def values(self):
return list(self.itervalues())
@cython.final
@cython.internal
@cython.freelist(8)
cdef class _DTDElementDecl:
cdef DTD _dtd
cdef tree.xmlElement* _c_node
def __repr__(self):
return "<%s.%s object name=%r prefix=%r type=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.prefix, self.type, id(self))
property name:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
return funicode(self._c_node.name) if self._c_node.name is not NULL else None
property prefix:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
return funicode(self._c_node.prefix) if self._c_node.prefix is not NULL else None
property type:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
cdef int type = self._c_node.etype
if type == tree.XML_ELEMENT_TYPE_UNDEFINED:
return "undefined"
elif type == tree.XML_ELEMENT_TYPE_EMPTY:
return "empty"
elif type == tree.XML_ELEMENT_TYPE_ANY:
return "any"
elif type == tree.XML_ELEMENT_TYPE_MIXED:
return "mixed"
elif type == tree.XML_ELEMENT_TYPE_ELEMENT:
return "element"
else:
return None
property content:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
cdef tree.xmlElementContent *content = self._c_node.content
if content:
node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
node._dtd = self._dtd
node._c_node = content
return node
else:
return None
def iterattributes(self):
_assertValidDTDNode(self, self._c_node)
cdef tree.xmlAttribute *c_node = self._c_node.attributes
while c_node:
node = <_DTDAttributeDecl>_DTDAttributeDecl.__new__(_DTDAttributeDecl)
node._dtd = self._dtd
node._c_node = c_node
yield node
c_node = c_node.nexth
def attributes(self):
return list(self.iterattributes())
@cython.final
@cython.internal
@cython.freelist(8)
cdef class _DTDEntityDecl:
cdef DTD _dtd
cdef tree.xmlEntity* _c_node
def __repr__(self):
return "<%s.%s object name=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
property name:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
return funicode(self._c_node.name) if self._c_node.name is not NULL else None
property orig:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
return funicode(self._c_node.orig) if self._c_node.orig is not NULL else None
property content:
def __get__(self):
_assertValidDTDNode(self, self._c_node)
return funicode(self._c_node.content) if self._c_node.content is not NULL else None
################################################################################
# DTD
cdef class DTD(_Validator):
u"""DTD(self, file=None, external_id=None)
A DTD validator.
Can load from filesystem directly given a filename or file-like object.
Alternatively, pass the keyword parameter ``external_id`` to load from a
catalog.
"""
cdef tree.xmlDtd* _c_dtd
def __init__(self, file=None, *, external_id=None):
_Validator.__init__(self)
if file is not None:
if _isString(file):
file = _encodeFilename(file)
with self._error_log:
self._c_dtd = xmlparser.xmlParseDTD(NULL, _xcstr(file))
elif hasattr(file, 'read'):
self._c_dtd = _parseDtdFromFilelike(file)
else:
raise DTDParseError, u"file must be a filename or file-like object"
elif external_id is not None:
with self._error_log:
self._c_dtd = xmlparser.xmlParseDTD(<const_xmlChar*>external_id, NULL)
else:
raise DTDParseError, u"either filename or external ID required"
if self._c_dtd is NULL:
raise DTDParseError(
self._error_log._buildExceptionMessage(u"error parsing DTD"),
self._error_log)
property name:
def __get__(self):
if self._c_dtd is NULL:
return None
return funicodeOrNone(self._c_dtd.name)
property external_id:
def __get__(self):
if self._c_dtd is NULL:
return None
return funicodeOrNone(self._c_dtd.ExternalID)
property system_url:
def __get__(self):
if self._c_dtd is NULL:
return None
return funicodeOrNone(self._c_dtd.SystemID)
def iterelements(self):
cdef tree.xmlNode *c_node = self._c_dtd.children if self._c_dtd is not NULL else NULL
while c_node is not NULL:
if c_node.type == tree.XML_ELEMENT_DECL:
node = _DTDElementDecl()
node._dtd = self
node._c_node = <tree.xmlElement*>c_node
yield node
c_node = c_node.next
def elements(self):
return list(self.iterelements())
def iterentities(self):
cdef tree.xmlNode *c_node = self._c_dtd.children if self._c_dtd is not NULL else NULL
while c_node is not NULL:
if c_node.type == tree.XML_ENTITY_DECL:
node = _DTDEntityDecl()
node._dtd = self
node._c_node = <tree.xmlEntity*>c_node
yield node
c_node = c_node.next
def entities(self):
return list(self.iterentities())
def __dealloc__(self):
tree.xmlFreeDtd(self._c_dtd)
def __call__(self, etree):
u"""__call__(self, etree)
Validate doc using the DTD.
Returns true if the document is valid, false if not.
"""
cdef _Document doc
cdef _Element root_node
cdef xmlDoc* c_doc
cdef dtdvalid.xmlValidCtxt* valid_ctxt
cdef int ret = -1
assert self._c_dtd is not NULL, "DTD not initialised"
doc = _documentOrRaise(etree)
root_node = _rootNodeOrRaise(etree)
valid_ctxt = dtdvalid.xmlNewValidCtxt()
if valid_ctxt is NULL:
raise DTDError(u"Failed to create validation context")
# work around error reporting bug in libxml2 <= 2.9.1 (and later?)
# https://bugzilla.gnome.org/show_bug.cgi?id=724903
valid_ctxt.error = <dtdvalid.xmlValidityErrorFunc>_nullGenericErrorFunc
valid_ctxt.userData = NULL
try:
with self._error_log:
c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node)
ret = dtdvalid.xmlValidateDtd(valid_ctxt, c_doc, self._c_dtd)
_destroyFakeDoc(doc._c_doc, c_doc)
finally:
dtdvalid.xmlFreeValidCtxt(valid_ctxt)
if ret == -1:
raise DTDValidateError(u"Internal error in DTD validation",
self._error_log)
return ret == 1
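# Usage sketch (illustrative, not part of this file): parse a DTD from a
# file-like object and validate documents against it.
#
#     from io import StringIO
#     from lxml import etree
#
#     dtd = etree.DTD(StringIO('<!ELEMENT root EMPTY>'))
#     print(dtd(etree.XML('<root/>')))           # True
#     print(dtd(etree.XML('<root>x</root>')))    # False
#     # details of the failure end up in dtd.error_log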
cdef tree.xmlDtd* _parseDtdFromFilelike(file) except NULL:
cdef _ExceptionContext exc_context
cdef _FileReaderContext dtd_parser
cdef _ErrorLog error_log
cdef tree.xmlDtd* c_dtd
exc_context = _ExceptionContext()
dtd_parser = _FileReaderContext(file, exc_context, None)
error_log = _ErrorLog()
with error_log:
c_dtd = dtd_parser._readDtd()
exc_context._raise_if_stored()
if c_dtd is NULL:
raise DTDParseError(u"error parsing DTD", error_log)
return c_dtd
cdef DTD _dtdFactory(tree.xmlDtd* c_dtd):
# do not run through DTD.__init__()!
cdef DTD dtd
if c_dtd is NULL:
return None
dtd = DTD.__new__(DTD)
dtd._c_dtd = _copyDtd(c_dtd)
_Validator.__init__(dtd)
return dtd
cdef tree.xmlDtd* _copyDtd(tree.xmlDtd* c_orig_dtd) except NULL:
"""
Copy a DTD. libxml2 (currently) fails to set up the element->attributes
links when copying DTDs, so we have to rebuild them here.
"""
c_dtd = tree.xmlCopyDtd(c_orig_dtd)
if not c_dtd:
raise MemoryError
cdef tree.xmlNode* c_node = c_dtd.children
while c_node:
if c_node.type == tree.XML_ATTRIBUTE_DECL:
_linkDtdAttribute(c_dtd, <tree.xmlAttribute*>c_node)
c_node = c_node.next
return c_dtd
cdef void _linkDtdAttribute(tree.xmlDtd* c_dtd, tree.xmlAttribute* c_attr):
"""
Create the link to the DTD attribute declaration from the corresponding
element declaration.
"""
c_elem = dtdvalid.xmlGetDtdElementDesc(c_dtd, c_attr.elem)
if not c_elem:
# no such element? something is wrong with the DTD ...
return
c_pos = c_elem.attributes
if not c_pos:
c_elem.attributes = c_attr
c_attr.nexth = NULL
return
# libxml2 keeps namespace declarations first, and we need to make
# sure we don't re-insert attributes that are already there
if _isDtdNsDecl(c_attr):
if not _isDtdNsDecl(c_pos):
c_elem.attributes = c_attr
c_attr.nexth = c_pos
return
while c_pos != c_attr and c_pos.nexth and _isDtdNsDecl(c_pos.nexth):
c_pos = c_pos.nexth
else:
# append at end
while c_pos != c_attr and c_pos.nexth:
c_pos = c_pos.nexth
if c_pos == c_attr:
return
c_attr.nexth = c_pos.nexth
c_pos.nexth = c_attr
cdef bint _isDtdNsDecl(tree.xmlAttribute* c_attr):
if cstring_h.strcmp(<const_char*>c_attr.name, "xmlns") == 0:
return True
if (c_attr.prefix is not NULL and
cstring_h.strcmp(<const_char*>c_attr.prefix, "xmlns") == 0):
return True
return False

855
lib/lxml/extensions.pxi Normal file

@ -0,0 +1,855 @@
# support for extension functions in XPath and XSLT
class XPathError(LxmlError):
u"""Base class of all XPath errors.
"""
pass
class XPathEvalError(XPathError):
u"""Error during XPath evaluation.
"""
pass
class XPathFunctionError(XPathEvalError):
u"""Internal error looking up an XPath extension function.
"""
pass
class XPathResultError(XPathEvalError):
u"""Error handling an XPath result.
"""
pass
# forward declarations
ctypedef int (*_register_function)(void* ctxt, name_utf, ns_uri_utf)
cdef class _ExsltRegExp
################################################################################
# Base class for XSLT and XPath evaluation contexts: functions, namespaces, ...
@cython.internal
cdef class _BaseContext:
cdef xpath.xmlXPathContext* _xpathCtxt
cdef _Document _doc
cdef dict _extensions
cdef list _namespaces
cdef list _global_namespaces
cdef dict _utf_refs
cdef dict _function_cache
cdef dict _eval_context_dict
cdef bint _build_smart_strings
# for exception handling and temporary reference keeping:
cdef _TempStore _temp_refs
cdef set _temp_documents
cdef _ExceptionContext _exc
cdef _ErrorLog _error_log
def __cinit__(self):
self._xpathCtxt = NULL
def __init__(self, namespaces, extensions, error_log, enable_regexp,
build_smart_strings):
cdef _ExsltRegExp _regexp
cdef dict new_extensions
cdef list ns
self._utf_refs = {}
self._global_namespaces = []
self._function_cache = {}
self._eval_context_dict = None
self._error_log = error_log
if extensions is not None:
# convert extensions to UTF-8
if isinstance(extensions, dict):
extensions = (extensions,)
# format: [ {(ns, name):function} ] -> {(ns_utf, name_utf):function}
new_extensions = {}
for extension in extensions:
for (ns_uri, name), function in extension.items():
if name is None:
raise ValueError, u"extensions must have non empty names"
ns_utf = self._to_utf(ns_uri)
name_utf = self._to_utf(name)
new_extensions[(ns_utf, name_utf)] = function
extensions = new_extensions or None
if namespaces is not None:
if isinstance(namespaces, dict):
namespaces = namespaces.items()
if namespaces:
ns = []
for prefix, ns_uri in namespaces:
if prefix is None or not prefix:
raise TypeError, \
u"empty namespace prefix is not supported in XPath"
if ns_uri is None or not ns_uri:
raise TypeError, \
u"setting default namespace is not supported in XPath"
prefix_utf = self._to_utf(prefix)
ns_uri_utf = self._to_utf(ns_uri)
ns.append( (prefix_utf, ns_uri_utf) )
namespaces = ns
else:
namespaces = None
self._doc = None
self._exc = _ExceptionContext()
self._extensions = extensions
self._namespaces = namespaces
self._temp_refs = _TempStore()
self._temp_documents = set()
self._build_smart_strings = build_smart_strings
if enable_regexp:
_regexp = _ExsltRegExp()
_regexp._register_in_context(self)
cdef _BaseContext _copy(self):
cdef _BaseContext context
if self._namespaces is not None:
namespaces = self._namespaces[:]
else:
namespaces = None
context = self.__class__(namespaces, None, self._error_log, False,
self._build_smart_strings)
if self._extensions is not None:
context._extensions = self._extensions.copy()
return context
cdef bytes _to_utf(self, s):
u"Convert to UTF-8 and keep a reference to the encoded string"
cdef python.PyObject* dict_result
if s is None:
return None
dict_result = python.PyDict_GetItem(self._utf_refs, s)
if dict_result is not NULL:
return <bytes>dict_result
utf = _utf8(s)
self._utf_refs[s] = utf
if python.IS_PYPY:
# use C level refs, PyPy refs are not enough!
python.Py_INCREF(utf)
return utf
cdef void _set_xpath_context(self, xpath.xmlXPathContext* xpathCtxt):
self._xpathCtxt = xpathCtxt
xpathCtxt.userData = <void*>self
xpathCtxt.error = _receiveXPathError
@cython.final
cdef _register_context(self, _Document doc):
self._doc = doc
self._exc.clear()
@cython.final
cdef _cleanup_context(self):
#xpath.xmlXPathRegisteredNsCleanup(self._xpathCtxt)
#self.unregisterGlobalNamespaces()
if python.IS_PYPY:
# clean up double refs in PyPy (see "_to_utf()" method)
for ref in self._utf_refs.itervalues():
python.Py_DECREF(ref)
self._utf_refs.clear()
self._eval_context_dict = None
self._doc = None
@cython.final
cdef _release_context(self):
if self._xpathCtxt is not NULL:
self._xpathCtxt.userData = NULL
self._xpathCtxt = NULL
# namespaces (internal UTF-8 methods with leading '_')
cdef addNamespace(self, prefix, ns_uri):
cdef list namespaces
if prefix is None:
raise TypeError, u"empty prefix is not supported in XPath"
prefix_utf = self._to_utf(prefix)
ns_uri_utf = self._to_utf(ns_uri)
new_item = (prefix_utf, ns_uri_utf)
if self._namespaces is None:
self._namespaces = [new_item]
else:
namespaces = []
for item in self._namespaces:
if item[0] == prefix_utf:
item = new_item
new_item = None
namespaces.append(item)
if new_item is not None:
namespaces.append(new_item)
self._namespaces = namespaces
if self._xpathCtxt is not NULL:
xpath.xmlXPathRegisterNs(
self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
cdef registerNamespace(self, prefix, ns_uri):
if prefix is None:
raise TypeError, u"empty prefix is not supported in XPath"
prefix_utf = self._to_utf(prefix)
ns_uri_utf = self._to_utf(ns_uri)
self._global_namespaces.append(prefix_utf)
xpath.xmlXPathRegisterNs(self._xpathCtxt,
_xcstr(prefix_utf), _xcstr(ns_uri_utf))
cdef registerLocalNamespaces(self):
if self._namespaces is None:
return
for prefix_utf, ns_uri_utf in self._namespaces:
xpath.xmlXPathRegisterNs(
self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
cdef registerGlobalNamespaces(self):
cdef list ns_prefixes = _find_all_extension_prefixes()
if python.PyList_GET_SIZE(ns_prefixes) > 0:
for prefix_utf, ns_uri_utf in ns_prefixes:
self._global_namespaces.append(prefix_utf)
xpath.xmlXPathRegisterNs(
self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
cdef unregisterGlobalNamespaces(self):
if python.PyList_GET_SIZE(self._global_namespaces) > 0:
for prefix_utf in self._global_namespaces:
xpath.xmlXPathRegisterNs(self._xpathCtxt,
_xcstr(prefix_utf), NULL)
del self._global_namespaces[:]
cdef void _unregisterNamespace(self, prefix_utf):
xpath.xmlXPathRegisterNs(self._xpathCtxt,
_xcstr(prefix_utf), NULL)
# extension functions
cdef int _addLocalExtensionFunction(self, ns_utf, name_utf, function) except -1:
if self._extensions is None:
self._extensions = {}
self._extensions[(ns_utf, name_utf)] = function
return 0
cdef registerGlobalFunctions(self, void* ctxt,
_register_function reg_func):
cdef python.PyObject* dict_result
cdef dict d
for ns_utf, ns_functions in __FUNCTION_NAMESPACE_REGISTRIES.iteritems():
dict_result = python.PyDict_GetItem(
self._function_cache, ns_utf)
if dict_result is not NULL:
d = <dict>dict_result
else:
d = {}
self._function_cache[ns_utf] = d
for name_utf, function in ns_functions.iteritems():
d[name_utf] = function
reg_func(ctxt, name_utf, ns_utf)
cdef registerLocalFunctions(self, void* ctxt,
_register_function reg_func):
cdef python.PyObject* dict_result
cdef dict d
if self._extensions is None:
return # done
last_ns = None
d = None
for (ns_utf, name_utf), function in self._extensions.iteritems():
if ns_utf is not last_ns or d is None:
last_ns = ns_utf
dict_result = python.PyDict_GetItem(
self._function_cache, ns_utf)
if dict_result is not NULL:
d = <dict>dict_result
else:
d = {}
self._function_cache[ns_utf] = d
d[name_utf] = function
reg_func(ctxt, name_utf, ns_utf)
cdef unregisterAllFunctions(self, void* ctxt,
_register_function unreg_func):
for ns_utf, functions in self._function_cache.iteritems():
for name_utf in functions:
unreg_func(ctxt, name_utf, ns_utf)
cdef unregisterGlobalFunctions(self, void* ctxt,
_register_function unreg_func):
for ns_utf, functions in self._function_cache.items():
for name_utf in functions:
if self._extensions is None or \
(ns_utf, name_utf) not in self._extensions:
unreg_func(ctxt, name_utf, ns_utf)
@cython.final
cdef _find_cached_function(self, const_xmlChar* c_ns_uri, const_xmlChar* c_name):
u"""Lookup an extension function in the cache and return it.
Parameters: c_ns_uri may be NULL, c_name must not be NULL
"""
cdef python.PyObject* c_dict
cdef python.PyObject* dict_result
c_dict = python.PyDict_GetItem(
self._function_cache, None if c_ns_uri is NULL else c_ns_uri)
if c_dict is not NULL:
dict_result = python.PyDict_GetItem(
<object>c_dict, <unsigned char*>c_name)
if dict_result is not NULL:
return <object>dict_result
return None
# Python access to the XPath context for extension functions
property context_node:
def __get__(self):
cdef xmlNode* c_node
if self._xpathCtxt is NULL:
raise XPathError, \
u"XPath context is only usable during the evaluation"
c_node = self._xpathCtxt.node
if c_node is NULL:
raise XPathError, u"no context node"
if c_node.doc != self._xpathCtxt.doc:
raise XPathError, \
u"document-external context nodes are not supported"
if self._doc is None:
raise XPathError, u"document context is missing"
return _elementFactory(self._doc, c_node)
property eval_context:
def __get__(self):
if self._eval_context_dict is None:
self._eval_context_dict = {}
return self._eval_context_dict
# Python reference keeping during XPath function evaluation
@cython.final
cdef _release_temp_refs(self):
u"Free temporarily referenced objects from this context."
self._temp_refs.clear()
self._temp_documents.clear()
@cython.final
cdef _hold(self, obj):
u"""A way to temporarily hold references to nodes in the evaluator.
This is needed because otherwise nodes created in XPath extension
functions would be reference counted too soon, during the XPath
evaluation. This is most important in the case of exceptions.
"""
cdef _Element element
if isinstance(obj, _Element):
self._temp_refs.add(obj)
self._temp_documents.add((<_Element>obj)._doc)
return
elif _isString(obj) or not python.PySequence_Check(obj):
return
for o in obj:
if isinstance(o, _Element):
#print "Holding element:", <int>element._c_node
self._temp_refs.add(o)
#print "Holding document:", <int>element._doc._c_doc
self._temp_documents.add((<_Element>o)._doc)
@cython.final
cdef _Document _findDocumentForNode(self, xmlNode* c_node):
u"""If an XPath expression returns an element from a different
document than the current context document, we call this to
see if it was possibly created by an extension and is a known
document instance.
"""
cdef _Document doc
for doc in self._temp_documents:
if doc is not None and doc._c_doc is c_node.doc:
return doc
return None
# libxml2 keeps these error messages in a static array in its code
# and doesn't give us access to them ...
cdef tuple LIBXML2_XPATH_ERROR_MESSAGES = (
b"Ok",
b"Number encoding",
b"Unfinished literal",
b"Start of literal",
b"Expected $ for variable reference",
b"Undefined variable",
b"Invalid predicate",
b"Invalid expression",
b"Missing closing curly brace",
b"Unregistered function",
b"Invalid operand",
b"Invalid type",
b"Invalid number of arguments",
b"Invalid context size",
b"Invalid context position",
b"Memory allocation error",
b"Syntax error",
b"Resource error",
b"Sub resource error",
b"Undefined namespace prefix",
b"Encoding error",
b"Char out of XML range",
b"Invalid or incomplete context",
b"Stack usage error",
)
cdef void _forwardXPathError(void* c_ctxt, xmlerror.xmlError* c_error) with gil:
cdef xmlerror.xmlError error
cdef int xpath_code
if c_error.message is not NULL:
error.message = c_error.message
else:
xpath_code = c_error.code - xmlerror.XML_XPATH_EXPRESSION_OK
if 0 <= xpath_code < len(LIBXML2_XPATH_ERROR_MESSAGES):
error.message = _cstr(LIBXML2_XPATH_ERROR_MESSAGES[xpath_code])
else:
error.message = b"unknown error"
error.domain = c_error.domain
error.code = c_error.code
error.level = c_error.level
error.line = c_error.line
error.int2 = c_error.int1 # column
error.file = c_error.file
(<_BaseContext>c_ctxt)._error_log._receive(&error)
cdef void _receiveXPathError(void* c_context, xmlerror.xmlError* error) nogil:
if not __DEBUG:
return
if c_context is NULL:
_forwardError(NULL, error)
else:
_forwardXPathError(c_context, error)
def Extension(module, function_mapping=None, *, ns=None):
u"""Extension(module, function_mapping=None, ns=None)
Build a dictionary of extension functions from the functions
defined in a module or the methods of an object.
As second argument, you can pass an additional mapping of
attribute names to XPath function names, or a list of function
names that should be taken.
The ``ns`` keyword argument accepts a namespace URI for the XPath
functions.
"""
cdef dict functions = {}
if isinstance(function_mapping, dict):
for function_name, xpath_name in function_mapping.items():
functions[(ns, xpath_name)] = getattr(module, function_name)
else:
if function_mapping is None:
function_mapping = [ name for name in dir(module)
if not name.startswith(u'_') ]
for function_name in function_mapping:
functions[(ns, function_name)] = getattr(module, function_name)
return functions
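# Hedged usage sketch (the namespace URI, class and function names are
# invented for the example):
#
#     from lxml import etree
#
#     class MyFunctions(object):
#         def hello(self, context, s):
#             # extension functions receive the evaluation context first
#             return 'Hello %s!' % s
#
#     ns = 'http://example.org/demo'
#     extensions = etree.Extension(MyFunctions(), ['hello'], ns=ns)
#     find = etree.XPath('d:hello(string(/r))', namespaces={'d': ns},
#                        extensions=extensions)
#     print(find(etree.XML('<r>world</r>')))     # Hello world!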
################################################################################
# EXSLT regexp implementation
@cython.final
@cython.internal
cdef class _ExsltRegExp:
cdef dict _compile_map
def __cinit__(self):
self._compile_map = {}
cdef _make_string(self, value):
if _isString(value):
return value
elif isinstance(value, list):
# node set: take recursive text concatenation of first element
if python.PyList_GET_SIZE(value) == 0:
return u''
firstnode = value[0]
if _isString(firstnode):
return firstnode
elif isinstance(firstnode, _Element):
c_text = tree.xmlNodeGetContent((<_Element>firstnode)._c_node)
if c_text is NULL:
raise MemoryError()
try:
return funicode(c_text)
finally:
tree.xmlFree(c_text)
else:
return unicode(firstnode)
else:
return unicode(value)
cdef _compile(self, rexp, ignore_case):
cdef python.PyObject* c_result
rexp = self._make_string(rexp)
key = (rexp, ignore_case)
c_result = python.PyDict_GetItem(self._compile_map, key)
if c_result is not NULL:
return <object>c_result
py_flags = re.UNICODE
if ignore_case:
py_flags = py_flags | re.IGNORECASE
rexp_compiled = re.compile(rexp, py_flags)
self._compile_map[key] = rexp_compiled
return rexp_compiled
def test(self, ctxt, s, rexp, flags=u''):
flags = self._make_string(flags)
s = self._make_string(s)
rexpc = self._compile(rexp, u'i' in flags)
if rexpc.search(s) is None:
return False
else:
return True
def match(self, ctxt, s, rexp, flags=u''):
cdef list result_list
flags = self._make_string(flags)
s = self._make_string(s)
rexpc = self._compile(rexp, u'i' in flags)
if u'g' in flags:
results = rexpc.findall(s)
if not results:
return ()
else:
result = rexpc.search(s)
if not result:
return ()
results = [ result.group() ]
results.extend( result.groups(u'') )
result_list = []
root = Element(u'matches')
join_groups = u''.join
for s_match in results:
if python.PyTuple_CheckExact(s_match):
s_match = join_groups(s_match)
elem = SubElement(root, u'match')
elem.text = s_match
result_list.append(elem)
return result_list
def replace(self, ctxt, s, rexp, flags, replacement):
replacement = self._make_string(replacement)
flags = self._make_string(flags)
s = self._make_string(s)
rexpc = self._compile(rexp, u'i' in flags)
if u'g' in flags:
count = 0
else:
count = 1
return rexpc.sub(replacement, s, count)
cdef _register_in_context(self, _BaseContext context):
ns = b"http://exslt.org/regular-expressions"
context._addLocalExtensionFunction(ns, b"test", self.test)
context._addLocalExtensionFunction(ns, b"match", self.match)
context._addLocalExtensionFunction(ns, b"replace", self.replace)
################################################################################
# helper functions
cdef xpath.xmlXPathObject* _wrapXPathObject(object obj, _Document doc,
_BaseContext context) except NULL:
cdef xpath.xmlNodeSet* resultSet
cdef _Element fake_node = None
cdef xmlNode* c_node
if isinstance(obj, unicode):
obj = _utf8(obj)
if isinstance(obj, bytes):
# libxml2 copies the string value
return xpath.xmlXPathNewCString(_cstr(obj))
if isinstance(obj, bool):
return xpath.xmlXPathNewBoolean(obj)
if python.PyNumber_Check(obj):
return xpath.xmlXPathNewFloat(obj)
if obj is None:
resultSet = xpath.xmlXPathNodeSetCreate(NULL)
elif isinstance(obj, _Element):
resultSet = xpath.xmlXPathNodeSetCreate((<_Element>obj)._c_node)
elif python.PySequence_Check(obj):
resultSet = xpath.xmlXPathNodeSetCreate(NULL)
try:
for value in obj:
if isinstance(value, _Element):
if context is not None:
context._hold(value)
xpath.xmlXPathNodeSetAdd(resultSet, (<_Element>value)._c_node)
else:
if context is None or doc is None:
raise XPathResultError, \
u"Non-Element values not supported at this point - got %r" % value
# support strings by appending text nodes to an Element
if isinstance(value, unicode):
value = _utf8(value)
if isinstance(value, bytes):
if fake_node is None:
fake_node = _makeElement("text-root", NULL, doc, None,
None, None, None, None, None)
context._hold(fake_node)
else:
# append a comment node to keep the text nodes separate
c_node = tree.xmlNewDocComment(doc._c_doc, <unsigned char*>"")
if c_node is NULL:
raise MemoryError()
tree.xmlAddChild(fake_node._c_node, c_node)
context._hold(value)
c_node = tree.xmlNewDocText(doc._c_doc, _xcstr(value))
if c_node is NULL:
raise MemoryError()
tree.xmlAddChild(fake_node._c_node, c_node)
xpath.xmlXPathNodeSetAdd(resultSet, c_node)
else:
raise XPathResultError, \
u"This is not a supported node-set result: %r" % value
except:
xpath.xmlXPathFreeNodeSet(resultSet)
raise
else:
raise XPathResultError, u"Unknown return type: %s" % \
python._fqtypename(obj).decode('utf8')
return xpath.xmlXPathWrapNodeSet(resultSet)
cdef object _unwrapXPathObject(xpath.xmlXPathObject* xpathObj,
_Document doc, _BaseContext context):
if xpathObj.type == xpath.XPATH_UNDEFINED:
raise XPathResultError, u"Undefined xpath result"
elif xpathObj.type == xpath.XPATH_NODESET:
return _createNodeSetResult(xpathObj, doc, context)
elif xpathObj.type == xpath.XPATH_BOOLEAN:
return xpathObj.boolval
elif xpathObj.type == xpath.XPATH_NUMBER:
return xpathObj.floatval
elif xpathObj.type == xpath.XPATH_STRING:
stringval = funicode(xpathObj.stringval)
if context._build_smart_strings:
stringval = _elementStringResultFactory(
stringval, None, None, 0)
return stringval
elif xpathObj.type == xpath.XPATH_POINT:
raise NotImplementedError, u"XPATH_POINT"
elif xpathObj.type == xpath.XPATH_RANGE:
raise NotImplementedError, u"XPATH_RANGE"
elif xpathObj.type == xpath.XPATH_LOCATIONSET:
raise NotImplementedError, u"XPATH_LOCATIONSET"
elif xpathObj.type == xpath.XPATH_USERS:
raise NotImplementedError, u"XPATH_USERS"
elif xpathObj.type == xpath.XPATH_XSLT_TREE:
return _createNodeSetResult(xpathObj, doc, context)
else:
raise XPathResultError, u"Unknown xpath result %s" % unicode(xpathObj.type)
cdef object _createNodeSetResult(xpath.xmlXPathObject* xpathObj, _Document doc,
_BaseContext context):
cdef xmlNode* c_node
cdef int i
cdef list result
result = []
if xpathObj.nodesetval is NULL:
return result
for i in range(xpathObj.nodesetval.nodeNr):
c_node = xpathObj.nodesetval.nodeTab[i]
_unpackNodeSetEntry(result, c_node, doc, context,
xpathObj.type == xpath.XPATH_XSLT_TREE)
return result
cdef _unpackNodeSetEntry(list results, xmlNode* c_node, _Document doc,
_BaseContext context, bint is_fragment):
cdef xmlNode* c_child
if _isElement(c_node):
if c_node.doc != doc._c_doc and c_node.doc._private is NULL:
# XXX: works, but maybe not always the right thing to do?
# XPath: only runs when extensions create or copy trees
# -> we store Python refs to these, so that is OK
# XSLT: can it leak when merging trees from multiple sources?
c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1)
# FIXME: call _instantiateElementFromXPath() instead?
results.append(
_fakeDocElementFactory(doc, c_node))
elif c_node.type == tree.XML_TEXT_NODE or \
c_node.type == tree.XML_CDATA_SECTION_NODE or \
c_node.type == tree.XML_ATTRIBUTE_NODE:
results.append(
_buildElementStringResult(doc, c_node, context))
elif c_node.type == tree.XML_NAMESPACE_DECL:
results.append( (funicodeOrNone((<xmlNs*>c_node).prefix),
funicodeOrNone((<xmlNs*>c_node).href)) )
elif c_node.type == tree.XML_DOCUMENT_NODE or \
c_node.type == tree.XML_HTML_DOCUMENT_NODE:
# ignored for everything but result tree fragments
if is_fragment:
c_child = c_node.children
while c_child is not NULL:
_unpackNodeSetEntry(results, c_child, doc, context, 0)
c_child = c_child.next
elif c_node.type == tree.XML_XINCLUDE_START or \
c_node.type == tree.XML_XINCLUDE_END:
pass
else:
raise NotImplementedError, \
u"Not yet implemented result node type: %d" % c_node.type
cdef void _freeXPathObject(xpath.xmlXPathObject* xpathObj):
u"""Free the XPath object, but *never* free the *content* of node sets.
Python dealloc will do that for us.
"""
if xpathObj.nodesetval is not NULL:
xpath.xmlXPathFreeNodeSet(xpathObj.nodesetval)
xpathObj.nodesetval = NULL
xpath.xmlXPathFreeObject(xpathObj)
cdef _Element _instantiateElementFromXPath(xmlNode* c_node, _Document doc,
_BaseContext context):
# NOTE: this may copy the element - only call this when it can't leak
if c_node.doc != doc._c_doc and c_node.doc._private is NULL:
# not from the context document and not from a fake document
# either => may still be from a known document, e.g. one
# created by an extension function
doc = context._findDocumentForNode(c_node)
if doc is None:
# not from a known document at all! => can only make a
# safety copy here
c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1)
return _fakeDocElementFactory(doc, c_node)
################################################################################
# special str/unicode subclasses
@cython.final
cdef class _ElementUnicodeResult(unicode):
cdef _Element _parent
cdef readonly object attrname
cdef readonly bint is_tail
cdef readonly bint is_text
cdef readonly bint is_attribute
def getparent(self):
return self._parent
class _ElementStringResult(bytes):
# we need to use a Python class here, bytes cannot be C-subclassed
# in Pyrex/Cython
def getparent(self):
return self._parent
cdef object _elementStringResultFactory(string_value, _Element parent,
attrname, bint is_tail):
cdef _ElementUnicodeResult uresult
cdef bint is_text
cdef bint is_attribute = attrname is not None
if parent is None:
is_text = 0
else:
is_text = not (is_tail or is_attribute)
if type(string_value) is bytes:
result = _ElementStringResult(string_value)
result._parent = parent
result.is_attribute = is_attribute
result.is_tail = is_tail
result.is_text = is_text
result.attrname = attrname
return result
else:
uresult = _ElementUnicodeResult(string_value)
uresult._parent = parent
uresult.is_attribute = is_attribute
uresult.is_tail = is_tail
uresult.is_text = is_text
uresult.attrname = attrname
return uresult
cdef object _buildElementStringResult(_Document doc, xmlNode* c_node,
_BaseContext context):
cdef _Element parent = None
cdef object attrname = None
cdef xmlNode* c_element
cdef bint is_tail
if c_node.type == tree.XML_ATTRIBUTE_NODE:
attrname = _namespacedName(c_node)
is_tail = 0
s = tree.xmlNodeGetContent(c_node)
try:
value = funicode(s)
finally:
tree.xmlFree(s)
c_element = NULL
else:
#assert c_node.type == tree.XML_TEXT_NODE or c_node.type == tree.XML_CDATA_SECTION_NODE, "invalid node type"
# may be tail text or normal text
value = funicode(c_node.content)
c_element = _previousElement(c_node)
is_tail = c_element is not NULL
if not context._build_smart_strings:
return value
if c_element is NULL:
# non-tail text or attribute text
c_element = c_node.parent
while c_element is not NULL and not _isElement(c_element):
c_element = c_element.parent
if c_element is not NULL:
parent = _instantiateElementFromXPath(c_element, doc, context)
return _elementStringResultFactory(
value, parent, attrname, is_tail)
################################################################################
# callbacks for XPath/XSLT extension functions
cdef void _extension_function_call(_BaseContext context, function,
xpath.xmlXPathParserContext* ctxt, int nargs):
cdef _Document doc
cdef xpath.xmlXPathObject* obj
cdef list args
cdef int i
doc = context._doc
try:
args = []
for i in range(nargs):
obj = xpath.valuePop(ctxt)
o = _unwrapXPathObject(obj, doc, context)
_freeXPathObject(obj)
args.append(o)
args.reverse()
res = function(context, *args)
# wrap result for XPath consumption
obj = _wrapXPathObject(res, doc, context)
# prevent Python from deallocating elements handed to libxml2
context._hold(res)
xpath.valuePush(ctxt, obj)
except:
xpath.xmlXPathErr(ctxt, xpath.XPATH_EXPR_ERROR)
context._exc._store_raised()
finally:
return # swallow any further exceptions
# lookup the function by name and call it
cdef void _xpath_function_call(xpath.xmlXPathParserContext* ctxt,
int nargs) with gil:
cdef _BaseContext context
cdef xpath.xmlXPathContext* rctxt = ctxt.context
context = <_BaseContext> rctxt.userData
try:
function = context._find_cached_function(rctxt.functionURI, rctxt.function)
if function is not None:
_extension_function_call(context, function, ctxt, nargs)
else:
xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR)
context._exc._store_exception(
XPathFunctionError(u"XPath function '%s' not found" %
_namespacedNameFromNsName(rctxt.functionURI, rctxt.function)))
except:
# may not be the right error, but we need to tell libxml2 *something*
xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR)
context._exc._store_raised()
finally:
return # swallow any further exceptions

10
lib/lxml/html/ElementSoup.py Normal file

@ -0,0 +1,10 @@
__doc__ = """Legacy interface to the BeautifulSoup HTML parser.
"""
__all__ = ["parse", "convert_tree"]
from soupparser import convert_tree, parse as _parse
def parse(file, beautifulsoup=None, makeelement=None):
root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement)
return root.getroot()
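# Hedged example (requires BeautifulSoup; the file name is invented):
#
#     from lxml.html import ElementSoup
#
#     with open('page.html') as f:
#         root = ElementSoup.parse(f)
#     print(root.tag)   # normally 'html'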

1697
lib/lxml/html/__init__.py Normal file

File diff suppressed because it is too large

87
lib/lxml/html/_diffcommand.py Normal file

@ -0,0 +1,87 @@
import optparse
import sys
import re
import os
from lxml.html.diff import htmldiff
description = """\
"""
parser = optparse.OptionParser(
usage="%prog [OPTIONS] FILE1 FILE2\n"
"%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...",
description=description,
)
parser.add_option(
'-o', '--output',
metavar="FILE",
dest="output",
default="-",
help="File to write the difference to",
)
parser.add_option(
'-a', '--annotation',
action="store_true",
dest="annotation",
help="Do an annotation")
def main(args=None):
if args is None:
args = sys.argv[1:]
options, args = parser.parse_args(args)
if options.annotation:
return annotate(options, args)
if len(args) != 2:
print('Error: you must give two files')
parser.print_help()
sys.exit(1)
file1, file2 = args
input1 = read_file(file1)
input2 = read_file(file2)
body1 = split_body(input1)[1]
pre, body2, post = split_body(input2)
result = htmldiff(body1, body2)
result = pre + result + post
if options.output == '-':
if not result.endswith('\n'):
result += '\n'
sys.stdout.write(result)
else:
f = open(options.output, 'wb')
f.write(result)
f.close()
def read_file(filename):
if filename == '-':
c = sys.stdin.read()
elif not os.path.exists(filename):
raise OSError(
"Input file %s does not exist" % filename)
else:
f = open(filename, 'rb')
c = f.read()
f.close()
return c
body_start_re = re.compile(
r"<body.*?>", re.I|re.S)
body_end_re = re.compile(
r"</body.*?>", re.I|re.S)
def split_body(html):
    pre = post = ''  # stay defined even when no <body> tags are present
    match = body_start_re.search(html)
if match:
pre = html[:match.end()]
html = html[match.end():]
match = body_end_re.search(html)
if match:
post = html[match.start():]
html = html[:match.start()]
return pre, html, post
def annotate(options, args):
print("Not yet implemented")
sys.exit(1)
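# Example invocation of the entry point above (file names invented):
#
#     from lxml.html._diffcommand import main
#
#     main(['old.html', 'new.html', '-o', 'diff.html'])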

100
lib/lxml/html/_html5builder.py Normal file

@ -0,0 +1,100 @@
"""
Legacy module - don't use in new code!
html5lib now has its own proper implementation.
This module implements a tree builder for html5lib that generates lxml
html element trees. This module uses camelCase as it follows the
html5lib style guide.
"""
from html5lib.treebuilders import _base, etree as etree_builders
from lxml import html, etree
class DocumentType(object):
def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId
class Document(object):
def __init__(self):
self._elementTree = None
self.childNodes = []
def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document
def __init__(self, *args, **kwargs):
html_builder = etree_builders.getETreeModule(html, fullTree=False)
etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
self.elementClass = html_builder.Element
self.commentClass = etree_builder.Comment
_base.TreeBuilder.__init__(self, *args, **kwargs)
def reset(self):
_base.TreeBuilder.reset(self)
self.rootInserted = False
self.initialComments = []
self.doctype = None
def getDocument(self):
return self.document._elementTree
def getFragment(self):
fragment = []
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(element.getchildren())
if element.tail:
fragment.append(element.tail)
return fragment
def insertDoctype(self, name, publicId, systemId):
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype
def insertComment(self, data, parent=None):
if not self.rootInserted:
self.initialComments.append(data)
else:
_base.TreeBuilder.insertComment(self, data, parent)
def insertRoot(self, name):
buf = []
if self.doctype and self.doctype.name:
buf.append('<!DOCTYPE %s' % self.doctype.name)
if self.doctype.publicId is not None or self.doctype.systemId is not None:
buf.append(' PUBLIC "%s" "%s"' % (self.doctype.publicId,
self.doctype.systemId))
buf.append('>')
buf.append('<html></html>')
root = html.fromstring(''.join(buf))
# Append the initial comments:
for comment in self.initialComments:
root.addprevious(etree.Comment(comment))
# Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
# Add the root element to the internal child/open data structures
root_element = self.elementClass(name)
root_element._element = root
self.document.childNodes.append(root_element)
self.openElements.append(root_element)
self.rootInserted = True
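# Usage sketch (assumes an html5lib version that still accepts custom tree
# builders this way; this module is legacy, see the docstring above):
#
#     import html5lib
#     from lxml.html._html5builder import TreeBuilder
#
#     parser = html5lib.HTMLParser(tree=TreeBuilder)
#     tree = parser.parse('<p>hello</p>')   # returns an lxml ElementTree
#     print(tree.getroot().tag)             # 'html'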

115
lib/lxml/html/_setmixin.py Normal file

@ -0,0 +1,115 @@
class SetMixin(object):
"""
Mix-in for sets. You must define __iter__, add, remove
"""
def __len__(self):
length = 0
for item in self:
length += 1
return length
def __contains__(self, item):
for has_item in self:
if item == has_item:
return True
return False
    def issubset(self, other):
        for item in self:
            if item not in other:
                return False
        return True
    __le__ = issubset
    def issuperset(self, other):
        for item in other:
            if item not in self:
                return False
        return True
    __ge__ = issuperset
def union(self, other):
return self | other
def __or__(self, other):
new = self.copy()
new |= other
return new
def intersection(self, other):
return self & other
def __and__(self, other):
new = self.copy()
new &= other
return new
def difference(self, other):
return self - other
def __sub__(self, other):
new = self.copy()
new -= other
return new
def symmetric_difference(self, other):
return self ^ other
def __xor__(self, other):
new = self.copy()
new ^= other
return new
def copy(self):
return set(self)
def update(self, other):
for item in other:
self.add(item)
def __ior__(self, other):
self.update(other)
return self
    def intersection_update(self, other):
        # Iterate over a copy: removing items from ``self`` while
        # iterating over it directly is unsafe for most containers.
        for item in list(self):
            if item not in other:
                self.remove(item)
def __iand__(self, other):
self.intersection_update(other)
return self
def difference_update(self, other):
for item in other:
if item in self:
self.remove(item)
def __isub__(self, other):
self.difference_update(other)
return self
def symmetric_difference_update(self, other):
for item in other:
if item in self:
self.remove(item)
else:
self.add(item)
def __ixor__(self, other):
self.symmetric_difference_update(other)
return self
def discard(self, item):
try:
self.remove(item)
except KeyError:
pass
def clear(self):
for item in list(self):
self.remove(item)
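# Minimal concrete subclass, sketching the contract stated in the
# class docstring (list-backed; ``_ListSet`` is illustrative and not
# part of lxml's public API):
class _ListSet(SetMixin):
    def __init__(self, items=()):
        self._items = []
        for item in items:
            self.add(item)
    def __iter__(self):
        return iter(self._items)
    def add(self, item):
        if item not in self._items:
            self._items.append(item)
    def remove(self, item):
        # Mirror set semantics: removing a missing item raises
        # KeyError, which ``discard`` above relies on.
        try:
            self._items.remove(item)
        except ValueError:
            raise KeyError(item)
# With just those three methods defined, the mix-in supplies the rest:
#
#   >>> s = _ListSet(['a', 'b'])
#   >>> s.update(['c'])
#   >>> sorted(s)
#   ['a', 'b', 'c']
#   >>> s.issuperset(['a', 'c'])
#   True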

133
lib/lxml/html/builder.py Normal file
View file

@ -0,0 +1,133 @@
# --------------------------------------------------------------------
# The ElementTree toolkit is
# Copyright (c) 1999-2004 by Fredrik Lundh
# --------------------------------------------------------------------
"""
A set of HTML generator tags for building HTML documents.
Usage::
>>> from lxml.html.builder import *
>>> html = HTML(
... HEAD( TITLE("Hello World") ),
... BODY( CLASS("main"),
... H1("Hello World !")
... )
... )
>>> import lxml.etree
    >>> print(lxml.etree.tostring(html, pretty_print=True))
<html>
<head>
<title>Hello World</title>
</head>
<body class="main">
<h1>Hello World !</h1>
</body>
</html>
"""
from lxml.builder import ElementMaker
from lxml.html import html_parser
E = ElementMaker(makeelement=html_parser.makeelement)
# elements
A = E.a # anchor
ABBR = E.abbr # abbreviated form (e.g., WWW, HTTP, etc.)
ACRONYM = E.acronym # acronym
ADDRESS = E.address # information on author
APPLET = E.applet # Java applet (DEPRECATED)
AREA = E.area # client-side image map area
B = E.b # bold text style
BASE = E.base # document base URI
BASEFONT = E.basefont # base font size (DEPRECATED)
BDO = E.bdo # I18N BiDi over-ride
BIG = E.big # large text style
BLOCKQUOTE = E.blockquote # long quotation
BODY = E.body # document body
BR = E.br # forced line break
BUTTON = E.button # push button
CAPTION = E.caption # table caption
CENTER = E.center # shorthand for DIV align=center (DEPRECATED)
CITE = E.cite # citation
CODE = E.code # computer code fragment
COL = E.col # table column
COLGROUP = E.colgroup # table column group
DD = E.dd # definition description
DEL = getattr(E, 'del') # deleted text
DFN = E.dfn # instance definition
DIR = E.dir # directory list (DEPRECATED)
DIV = E.div # generic language/style container
DL = E.dl # definition list
DT = E.dt # definition term
EM = E.em # emphasis
FIELDSET = E.fieldset # form control group
FONT = E.font # local change to font (DEPRECATED)
FORM = E.form # interactive form
FRAME = E.frame # subwindow
FRAMESET = E.frameset # window subdivision
H1 = E.h1 # heading
H2 = E.h2 # heading
H3 = E.h3 # heading
H4 = E.h4 # heading
H5 = E.h5 # heading
H6 = E.h6 # heading
HEAD = E.head # document head
HR = E.hr # horizontal rule
HTML = E.html # document root element
I = E.i # italic text style
IFRAME = E.iframe # inline subwindow
IMG = E.img # Embedded image
INPUT = E.input # form control
INS = E.ins # inserted text
ISINDEX = E.isindex # single line prompt (DEPRECATED)
KBD = E.kbd # text to be entered by the user
LABEL = E.label # form field label text
LEGEND = E.legend # fieldset legend
LI = E.li # list item
LINK = E.link # a media-independent link
MAP = E.map # client-side image map
MENU = E.menu # menu list (DEPRECATED)
META = E.meta # generic metainformation
NOFRAMES = E.noframes # alternate content container for non frame-based rendering
NOSCRIPT = E.noscript # alternate content container for non script-based rendering
OBJECT = E.object # generic embedded object
OL = E.ol # ordered list
OPTGROUP = E.optgroup # option group
OPTION = E.option # selectable choice
P = E.p # paragraph
PARAM = E.param # named property value
PRE = E.pre # preformatted text
Q = E.q # short inline quotation
S = E.s # strike-through text style (DEPRECATED)
SAMP = E.samp # sample program output, scripts, etc.
SCRIPT = E.script # script statements
SELECT = E.select # option selector
SMALL = E.small # small text style
SPAN = E.span # generic language/style container
STRIKE = E.strike # strike-through text (DEPRECATED)
STRONG = E.strong # strong emphasis
STYLE = E.style # style info
SUB = E.sub # subscript
SUP = E.sup # superscript
TABLE = E.table # table
TBODY = E.tbody # table body
TD = E.td # table data cell
TEXTAREA = E.textarea # multi-line text field
TFOOT = E.tfoot # table footer
TH = E.th # table header cell
THEAD = E.thead # table header
TITLE = E.title # document title
TR = E.tr # table row
TT = E.tt # teletype or monospaced text style
U = E.u # underlined text style (DEPRECATED)
UL = E.ul # unordered list
VAR = E.var # instance of a variable or program argument
# attributes (only reserved words are included here)
ATTR = dict
def CLASS(v): return {'class': v}
def FOR(v): return {'for': v}
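# Usage sketch, mirroring the module docstring above (illustrative;
# not part of upstream lxml):
if __name__ == '__main__':
    import lxml.etree
    page = HTML(
        HEAD(TITLE("Hello World")),
        BODY(CLASS("main"),
             H1("Hello World !"))
    )
    print(lxml.etree.tostring(page, pretty_print=True))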

724
lib/lxml/html/clean.py Normal file
View file

@ -0,0 +1,724 @@
"""A cleanup tool for HTML.
Removes unwanted tags and content. See the `Cleaner` class for
details.
"""
import re
import copy
try:
from urlparse import urlsplit
except ImportError:
# Python 3
from urllib.parse import urlsplit
from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
from lxml.html import xhtml_to_html, _transform_result
try:
unichr
except NameError:
# Python 3
unichr = chr
try:
unicode
except NameError:
# Python 3
unicode = str
try:
bytes
except NameError:
# Python < 2.6
bytes = str
try:
basestring
except NameError:
basestring = (str, bytes)
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
'word_break', 'word_break_html']
# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
# Particularly the CSS cleaning; most of the tag cleaning is integrated now
# I have multiple kinds of schemes searched; but should schemes be
# whitelisted instead?
# max height?
# remove images? Also in CSS? background attribute?
# Some way to whitelist object, iframe, etc (e.g., if you want to
# allow *just* embedded YouTube movies)
# Log what was deleted and why?
# style="behavior: ..." might be bad in IE?
# Should we have something for just <meta http-equiv>? That's the worst of the
# metas.
# UTF-7 detections? Example:
# <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
# you don't always have to have the charset set, if the page has no charset
# and there's UTF7-like code in it.
# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
_css_javascript_re = re.compile(
r'expression\s*\(.*?\)', re.S|re.I)
# Do I have to worry about @\nimport?
_css_import_re = re.compile(
r'@\s*import', re.I)
# All kinds of schemes besides just javascript: that can cause
# execution:
_javascript_scheme_re = re.compile(
r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):', re.I)
_substitute_whitespace = re.compile(r'\s+').sub
# FIXME: should data: be blocked?
# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
_conditional_comment_re = re.compile(
r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
_find_styled_elements = etree.XPath(
"descendant-or-self::*[@style]")
_find_external_links = etree.XPath(
("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
"descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
namespaces={'x':XHTML_NAMESPACE})
class Cleaner(object):
"""
    Instances clean the document of each of the possible offending
elements. The cleaning is controlled by attributes; you can
override attributes in a subclass, or set them in the constructor.
``scripts``:
Removes any ``<script>`` tags.
``javascript``:
Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
as they could contain Javascript.
``comments``:
Removes any comments.
``style``:
Removes any style tags or attributes.
``links``:
Removes any ``<link>`` tags
``meta``:
Removes any ``<meta>`` tags
``page_structure``:
Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
``processing_instructions``:
Removes any processing instructions.
``embedded``:
Removes any embedded objects (flash, iframes)
``frames``:
Removes any frame-related tags
``forms``:
Removes any form tags
``annoying_tags``:
Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
``remove_tags``:
A list of tags to remove. Only the tags will be removed,
their content will get pulled up into the parent tag.
``kill_tags``:
A list of tags to kill. Killing also removes the tag's content,
i.e. the whole subtree, not just the tag itself.
``allow_tags``:
A list of tags to include (default include all).
``remove_unknown_tags``:
Remove any tags that aren't standard parts of HTML.
``safe_attrs_only``:
If true, only include 'safe' attributes (specifically the list
from the feedparser HTML sanitisation web site).
``safe_attrs``:
A set of attribute names to override the default list of attributes
considered 'safe' (when safe_attrs_only=True).
``add_nofollow``:
If true, then any <a> tags will have ``rel="nofollow"`` added to them.
``host_whitelist``:
A list or set of hosts that you can use for embedded content
(for content like ``<object>``, ``<link rel="stylesheet">``, etc).
You can also implement/override the method
``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
implement more complex rules for what can be embedded.
Anything that passes this test will be shown, regardless of
the value of (for instance) ``embedded``.
Note that this parameter might not work as intended if you do not
make the links absolute before doing the cleaning.
Note that you may also need to set ``whitelist_tags``.
``whitelist_tags``:
A set of tags that can be included with ``host_whitelist``.
The default is ``iframe`` and ``embed``; you may wish to
include other tags like ``script``, or you may want to
implement ``allow_embedded_url`` for more control. Set to None to
include all tags.
This modifies the document *in place*.
"""
scripts = True
javascript = True
comments = True
style = False
links = True
meta = True
page_structure = True
processing_instructions = True
embedded = True
frames = True
forms = True
annoying_tags = True
remove_tags = None
allow_tags = None
kill_tags = None
remove_unknown_tags = True
safe_attrs_only = True
safe_attrs = defs.safe_attrs
add_nofollow = False
host_whitelist = ()
whitelist_tags = set(['iframe', 'embed'])
def __init__(self, **kw):
for name, value in kw.items():
if not hasattr(self, name):
raise TypeError(
"Unknown parameter: %s=%r" % (name, value))
setattr(self, name, value)
# Used to lookup the primary URL for a given tag that is up for
# removal:
_tag_link_attrs = dict(
script='src',
link='href',
# From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
# From what I can tell, both attributes can contain a link:
applet=['code', 'object'],
iframe='src',
embed='src',
layer='src',
# FIXME: there doesn't really seem like a general way to figure out what
# links an <object> tag uses; links often go in <param> tags with values
# that we don't really know. You'd have to have knowledge about specific
# kinds of plugins (probably keyed off classid), and match against those.
##object=?,
    # FIXME: not looking at the action currently, because it is more complex
    # than that -- if you keep the form, you should keep the form controls.
##form='action',
a='href',
)
def __call__(self, doc):
"""
Cleans the document.
"""
if hasattr(doc, 'getroot'):
# ElementTree instance, instead of an element
doc = doc.getroot()
# convert XHTML to HTML
xhtml_to_html(doc)
# Normalize a case that IE treats <image> like <img>, and that
# can confuse either this step or later steps.
for el in doc.iter('image'):
el.tag = 'img'
if not self.comments:
# Of course, if we were going to kill comments anyway, we don't
# need to worry about this
self.kill_conditional_comments(doc)
kill_tags = set(self.kill_tags or ())
remove_tags = set(self.remove_tags or ())
allow_tags = set(self.allow_tags or ())
if self.scripts:
kill_tags.add('script')
if self.safe_attrs_only:
safe_attrs = set(self.safe_attrs)
for el in doc.iter():
attrib = el.attrib
for aname in attrib.keys():
if aname not in safe_attrs:
del attrib[aname]
if self.javascript:
if not (self.safe_attrs_only and
self.safe_attrs == defs.safe_attrs):
# safe_attrs handles events attributes itself
for el in doc.iter():
attrib = el.attrib
for aname in attrib.keys():
if aname.startswith('on'):
del attrib[aname]
doc.rewrite_links(self._remove_javascript_link,
resolve_base_href=False)
if not self.style:
# If we're deleting style then we don't have to remove JS links
# from styles, otherwise...
for el in _find_styled_elements(doc):
old = el.get('style')
new = _css_javascript_re.sub('', old)
new = _css_import_re.sub('', new)
if self._has_sneaky_javascript(new):
# Something tricky is going on...
del el.attrib['style']
elif new != old:
el.set('style', new)
for el in list(doc.iter('style')):
if el.get('type', '').lower().strip() == 'text/javascript':
el.drop_tree()
continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow it:
                    new = _css_import_re.sub('', new)
if self._has_sneaky_javascript(new):
# Something tricky is going on...
el.text = '/* deleted */'
elif new != old:
el.text = new
if self.comments or self.processing_instructions:
# FIXME: why either? I feel like there's some obscure reason
# because you can put PIs in comments...? But I've already
# forgotten it
kill_tags.add(etree.Comment)
if self.processing_instructions:
kill_tags.add(etree.ProcessingInstruction)
if self.style:
kill_tags.add('style')
etree.strip_attributes(doc, 'style')
if self.links:
kill_tags.add('link')
elif self.style or self.javascript:
# We must get rid of included stylesheets if Javascript is not
# allowed, as you can put Javascript in them
for el in list(doc.iter('link')):
if 'stylesheet' in el.get('rel', '').lower():
# Note this kills alternate stylesheets as well
if not self.allow_element(el):
el.drop_tree()
if self.meta:
kill_tags.add('meta')
if self.page_structure:
remove_tags.update(('head', 'html', 'title'))
if self.embedded:
# FIXME: is <layer> really embedded?
# We should get rid of any <param> tags not inside <applet>;
# These are not really valid anyway.
            for el in list(doc.iter('param')):
                parent = el.getparent()
while parent is not None and parent.tag not in ('applet', 'object'):
parent = parent.getparent()
if parent is None:
el.drop_tree()
kill_tags.update(('applet',))
# The alternate contents that are in an iframe are a good fallback:
remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
if self.frames:
# FIXME: ideally we should look at the frame links, but
# generally frames don't mix properly with an HTML
# fragment anyway.
kill_tags.update(defs.frame_tags)
if self.forms:
remove_tags.add('form')
kill_tags.update(('button', 'input', 'select', 'textarea'))
if self.annoying_tags:
remove_tags.update(('blink', 'marquee'))
_remove = []
_kill = []
for el in doc.iter():
if el.tag in kill_tags:
if self.allow_element(el):
continue
_kill.append(el)
elif el.tag in remove_tags:
if self.allow_element(el):
continue
_remove.append(el)
if _remove and _remove[0] == doc:
# We have to drop the parent-most tag, which we can't
# do. Instead we'll rewrite it:
el = _remove.pop(0)
el.tag = 'div'
el.attrib.clear()
elif _kill and _kill[0] == doc:
# We have to drop the parent-most element, which we can't
# do. Instead we'll clear it:
el = _kill.pop(0)
if el.tag != 'html':
el.tag = 'div'
el.clear()
_kill.reverse() # start with innermost tags
for el in _kill:
el.drop_tree()
for el in _remove:
el.drop_tag()
if self.remove_unknown_tags:
if allow_tags:
raise ValueError(
"It does not make sense to pass in both allow_tags and remove_unknown_tags")
allow_tags = set(defs.tags)
if allow_tags:
bad = []
for el in doc.iter():
if el.tag not in allow_tags:
bad.append(el)
if bad:
if bad[0] is doc:
el = bad.pop(0)
el.tag = 'div'
el.attrib.clear()
for el in bad:
el.drop_tag()
if self.add_nofollow:
for el in _find_external_links(doc):
if not self.allow_follow(el):
rel = el.get('rel')
if rel:
if ('nofollow' in rel
and ' nofollow ' in (' %s ' % rel)):
continue
rel = '%s nofollow' % rel
else:
rel = 'nofollow'
el.set('rel', rel)
def allow_follow(self, anchor):
"""
Override to suppress rel="nofollow" on some anchors.
"""
return False
def allow_element(self, el):
if el.tag not in self._tag_link_attrs:
return False
attr = self._tag_link_attrs[el.tag]
if isinstance(attr, (list, tuple)):
for one_attr in attr:
url = el.get(one_attr)
if not url:
return False
if not self.allow_embedded_url(el, url):
return False
return True
else:
url = el.get(attr)
if not url:
return False
return self.allow_embedded_url(el, url)
def allow_embedded_url(self, el, url):
if (self.whitelist_tags is not None
and el.tag not in self.whitelist_tags):
return False
scheme, netloc, path, query, fragment = urlsplit(url)
netloc = netloc.lower().split(':', 1)[0]
if scheme not in ('http', 'https'):
return False
if netloc in self.host_whitelist:
return True
return False
def kill_conditional_comments(self, doc):
"""
IE conditional comments basically embed HTML that the parser
doesn't normally see. We can't allow anything like that, so
we'll kill any comments that could be conditional.
"""
        self._kill_elements(
            doc, lambda el: _conditional_comment_re.search(el.text),
            etree.Comment)
def _kill_elements(self, doc, condition, iterate=None):
bad = []
for el in doc.iter(iterate):
if condition(el):
bad.append(el)
for el in bad:
el.drop_tree()
def _remove_javascript_link(self, link):
# links like "j a v a s c r i p t:" might be interpreted in IE
new = _substitute_whitespace('', link)
if _javascript_scheme_re.search(new):
# FIXME: should this be None to delete?
return ''
return link
_substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
def _has_sneaky_javascript(self, style):
"""
Depending on the browser, stuff like ``e x p r e s s i o n(...)``
can get interpreted, or ``expre/* stuff */ssion(...)``. This
checks for attempt to do stuff like this.
Typically the response will be to kill the entire style; if you
have just a bit of Javascript in the style another rule will catch
that and remove only the Javascript from the style; this catches
more sneaky attempts.
"""
style = self._substitute_comments('', style)
style = style.replace('\\', '')
style = _substitute_whitespace('', style)
style = style.lower()
if 'javascript:' in style:
return True
if 'expression(' in style:
return True
return False
def clean_html(self, html):
result_type = type(html)
if isinstance(html, basestring):
doc = fromstring(html)
else:
doc = copy.deepcopy(html)
self(doc)
return _transform_result(result_type, doc)
clean = Cleaner()
clean_html = clean.clean_html
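# Usage sketch for the class above (doctest-style; the exact
# serialisation of the output may differ between lxml versions):
#
#   >>> clean_html('<p onmouseover="evil()">Hi<script>bad()</script></p>')
#   '<p>Hi</p>'
#
# A stricter instance that also strips styles and page structure:
#
#   >>> stricter = Cleaner(style=True, links=True, page_structure=True)
#   >>> cleaned = stricter.clean_html('<html><body><p>Hi</p></body></html>')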
############################################################
## Autolinking
############################################################
_link_regexes = [
re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This regex is conservative, but autolinking should err on the
    # conservative side:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
]
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
_avoid_hosts = [
re.compile(r'^localhost', re.I),
re.compile(r'\bexample\.(?:com|org|net)$', re.I),
re.compile(r'^127\.0\.0\.1$'),
]
_avoid_classes = ['nolink']
def autolink(el, link_regexes=_link_regexes,
avoid_elements=_avoid_elements,
avoid_hosts=_avoid_hosts,
avoid_classes=_avoid_classes):
"""
Turn any URLs into links.
It will search for links identified by the given regular
expressions (by default mailto and http(s) links).
It won't link text in an element in avoid_elements, or an element
with a class in avoid_classes. It won't link to anything with a
host that matches one of the regular expressions in avoid_hosts
(default localhost and 127.0.0.1).
If you pass in an element, the element's tail will not be
substituted, only the contents of the element.
"""
if el.tag in avoid_elements:
return
class_name = el.get('class')
if class_name:
class_name = class_name.split()
for match_class in avoid_classes:
if match_class in class_name:
return
for child in list(el):
autolink(child, link_regexes=link_regexes,
avoid_elements=avoid_elements,
avoid_hosts=avoid_hosts,
avoid_classes=avoid_classes)
if child.tail:
text, tail_children = _link_text(
child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
if tail_children:
child.tail = text
index = el.index(child)
el[index+1:index+1] = tail_children
if el.text:
text, pre_children = _link_text(
el.text, link_regexes, avoid_hosts, factory=el.makeelement)
if pre_children:
el.text = text
el[:0] = pre_children
def _link_text(text, link_regexes, avoid_hosts, factory):
leading_text = ''
links = []
last_pos = 0
while 1:
best_match, best_pos = None, None
for regex in link_regexes:
regex_pos = last_pos
while 1:
match = regex.search(text, pos=regex_pos)
if match is None:
break
host = match.group('host')
for host_regex in avoid_hosts:
if host_regex.search(host):
regex_pos = match.end()
break
else:
break
if match is None:
continue
if best_pos is None or match.start() < best_pos:
best_match = match
best_pos = match.start()
if best_match is None:
# No more matches
if links:
assert not links[-1].tail
links[-1].tail = text
else:
assert not leading_text
leading_text = text
break
link = best_match.group(0)
end = best_match.end()
if link.endswith('.') or link.endswith(','):
# These punctuation marks shouldn't end a link
end -= 1
link = link[:-1]
prev_text = text[:best_match.start()]
if links:
assert not links[-1].tail
links[-1].tail = prev_text
else:
assert not leading_text
leading_text = prev_text
anchor = factory('a')
anchor.set('href', link)
body = best_match.group('body')
if not body:
body = link
if body.endswith('.') or body.endswith(','):
body = body[:-1]
anchor.text = body
links.append(anchor)
text = text[end:]
return leading_text, links
def autolink_html(html, *args, **kw):
result_type = type(html)
if isinstance(html, basestring):
doc = fromstring(html)
else:
doc = copy.deepcopy(html)
autolink(doc, *args, **kw)
return _transform_result(result_type, doc)
autolink_html.__doc__ = autolink.__doc__
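# Usage sketch (doctest-style, output approximate): plain URLs become
# anchors, while hosts matching _avoid_hosts are left alone:
#
#   >>> autolink_html('<p>See http://lxml.de/ and http://example.com/</p>')
#   '<p>See <a href="http://lxml.de/">http://lxml.de/</a> and http://example.com/</p>'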
############################################################
## Word wrapping
############################################################
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']
def word_break(el, max_width=40,
avoid_elements=_avoid_word_break_elements,
avoid_classes=_avoid_word_break_classes,
break_character=unichr(0x200b)):
"""
Breaks any long words found in the body of the text (not attributes).
    Doesn't affect any of the tags in avoid_elements, by default
    ``<textarea>`` and ``<pre>``.
    Breaks words by inserting &#8203;, the Unicode zero-width space
    character. This generally takes up no space in rendering, but does
    copy as a space, and in monospace contexts usually takes up space.
See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
"""
# Character suggestion of &#8203 comes from:
# http://www.cs.tut.fi/~jkorpela/html/nobr.html
    if el.tag in avoid_elements:
return
class_name = el.get('class')
if class_name:
dont_break = False
class_name = class_name.split()
for avoid in avoid_classes:
if avoid in class_name:
dont_break = True
break
if dont_break:
return
if el.text:
el.text = _break_text(el.text, max_width, break_character)
for child in el:
word_break(child, max_width=max_width,
avoid_elements=avoid_elements,
avoid_classes=avoid_classes,
break_character=break_character)
if child.tail:
child.tail = _break_text(child.tail, max_width, break_character)
def word_break_html(html, *args, **kw):
result_type = type(html)
doc = fromstring(html)
word_break(doc, *args, **kw)
return _transform_result(result_type, doc)
def _break_text(text, max_width, break_character):
words = text.split()
for word in words:
if len(word) > max_width:
replacement = _insert_break(word, max_width, break_character)
text = text.replace(word, replacement)
return text
_break_prefer_re = re.compile(r'[^a-z]', re.I)
def _insert_break(word, width, break_character):
orig_word = word
result = ''
while len(word) > width:
start = word[:width]
breaks = list(_break_prefer_re.finditer(start))
if breaks:
last_break = breaks[-1]
# Only walk back up to 10 characters to find a nice break:
if last_break.end() > width-10:
# FIXME: should the break character be at the end of the
# chunk, or the beginning of the next chunk?
start = word[:last_break.end()]
result += start + break_character
word = word[len(start):]
result += word
return result
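if __name__ == '__main__':
    # Smoke-test sketch for the public helpers in this module
    # (illustrative; not part of upstream lxml):
    snippet = ('<p onmouseover="evil()">Visit http://lxml.de/ for '
               'Supercalifragilisticexpialidocious'
               '<script>bad()</script> fun</p>')
    cleaned = clean_html(snippet)
    print(cleaned)                                 # script and handler gone
    print(autolink_html(cleaned))                  # URL wrapped in <a>
    print(word_break_html(cleaned, max_width=10))  # zero-width breaks added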

137
lib/lxml/html/defs.py Normal file
View file

@ -0,0 +1,137 @@
# FIXME: this should all be confirmed against what a DTD says
# (probably in a test; this may not match the DTD exactly, but we
# should document just how it differs).
# Data taken from http://www.w3.org/TR/html401/index/elements.html
# and http://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
# for html5_tags.
try:
frozenset
except NameError:
from sets import Set as frozenset
empty_tags = frozenset([
'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
'img', 'input', 'isindex', 'link', 'meta', 'param'])
deprecated_tags = frozenset([
'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
'menu', 's', 'strike', 'u'])
# archive actually takes a space-separated list of URIs
link_attrs = frozenset([
'action', 'archive', 'background', 'cite', 'classid',
'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
'usemap',
# Not standard:
'dynsrc', 'lowsrc',
])
# Not in the HTML 4 spec:
# onerror, onresize
event_attrs = frozenset([
'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
'onunload',
])
safe_attrs = frozenset([
'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
# From http://htmlhelp.com/reference/html40/olist.html
top_level_tags = frozenset([
'html', 'head', 'body', 'frameset',
])
head_tags = frozenset([
'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
])
general_block_tags = frozenset([
'address',
'blockquote',
'center',
'del',
'div',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'hr',
'ins',
'isindex',
'noscript',
'p',
'pre',
])
list_tags = frozenset([
'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
])
table_tags = frozenset([
'table', 'caption', 'colgroup', 'col',
'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
])
# just this one from
# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
block_tags = general_block_tags | list_tags | table_tags | frozenset([
# Partial form tags
'fieldset', 'form', 'legend', 'optgroup', 'option',
])
form_tags = frozenset([
'form', 'button', 'fieldset', 'legend', 'input', 'label',
'select', 'optgroup', 'option', 'textarea',
])
special_inline_tags = frozenset([
'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
'img', 'map', 'area', 'object', 'param', 'q', 'script',
'span', 'sub', 'sup',
])
phrase_tags = frozenset([
'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
'ins', 'kbd', 'samp', 'strong', 'var',
])
font_style_tags = frozenset([
'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
])
frame_tags = frozenset([
'frameset', 'frame', 'noframes',
])
html5_tags = frozenset([
'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
'svg', 'time', 'track', 'video', 'wbr'
])
# These tags aren't standard
nonstandard_tags = frozenset(['blink', 'marquee'])
tags = (top_level_tags | head_tags | general_block_tags | list_tags
| table_tags | form_tags | special_inline_tags | phrase_tags
| font_style_tags | nonstandard_tags | html5_tags)

881
lib/lxml/html/diff.py Normal file
View file

@ -0,0 +1,881 @@
import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re
__all__ = ['html_annotate', 'htmldiff']
try:
from html import escape as html_escape
except ImportError:
from cgi import escape as html_escape
try:
_unicode = unicode
except NameError:
# Python 3
_unicode = str
try:
basestring
except NameError:
# Python 3
basestring = str
############################################################
## Annotation
############################################################
def default_markup(text, version):
return '<span title="%s">%s</span>' % (
html_escape(_unicode(version), 1), text)
def html_annotate(doclist, markup=default_markup):
"""
doclist should be ordered from oldest to newest, like::
>>> version1 = 'Hello World'
>>> version2 = 'Goodbye World'
>>> print(html_annotate([(version1, 'version 1'),
... (version2, 'version 2')]))
<span title="version 2">Goodbye</span> <span title="version 1">World</span>
The documents must be *fragments* (str/UTF8 or unicode), not
complete documents
The markup argument is a function to markup the spans of words.
This function is called like markup('Hello', 'version 2'), and
returns HTML. The first argument is text and never includes any
markup. The default uses a span with a title:
>>> print(default_markup('Some Text', 'by Joe'))
<span title="by Joe">Some Text</span>
"""
# The basic strategy we have is to split the documents up into
# logical tokens (which are words with attached markup). We then
# do diffs of each of the versions to track when a token first
# appeared in the document; the annotation attached to the token
# is the version where it first appeared.
tokenlist = [tokenize_annotated(doc, version)
for doc, version in doclist]
cur_tokens = tokenlist[0]
for tokens in tokenlist[1:]:
html_annotate_merge_annotations(cur_tokens, tokens)
cur_tokens = tokens
# After we've tracked all the tokens, we can combine spans of text
# that are adjacent and have the same annotation
cur_tokens = compress_tokens(cur_tokens)
# And finally add markup
result = markup_serialize_tokens(cur_tokens, markup)
return ''.join(result).strip()
def tokenize_annotated(doc, annotation):
"""Tokenize a document and add an annotation attribute to each token
"""
tokens = tokenize(doc, include_hrefs=False)
for tok in tokens:
tok.annotation = annotation
return tokens
def html_annotate_merge_annotations(tokens_old, tokens_new):
"""Merge the annotations from tokens_old into tokens_new, when the
tokens in the new document already existed in the old document.
"""
s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
commands = s.get_opcodes()
for command, i1, i2, j1, j2 in commands:
if command == 'equal':
eq_old = tokens_old[i1:i2]
eq_new = tokens_new[j1:j2]
copy_annotations(eq_old, eq_new)
def copy_annotations(src, dest):
"""
Copy annotations from the tokens listed in src to the tokens in dest
"""
assert len(src) == len(dest)
for src_tok, dest_tok in zip(src, dest):
dest_tok.annotation = src_tok.annotation
def compress_tokens(tokens):
"""
Combine adjacent tokens when there is no HTML between the tokens,
and they share an annotation
"""
result = [tokens[0]]
for tok in tokens[1:]:
if (not result[-1].post_tags and
not tok.pre_tags and
result[-1].annotation == tok.annotation):
compress_merge_back(result, tok)
else:
result.append(tok)
return result
def compress_merge_back(tokens, tok):
""" Merge tok into the last element of tokens (modifying the list of
tokens in-place). """
last = tokens[-1]
if type(last) is not token or type(tok) is not token:
tokens.append(tok)
else:
text = _unicode(last)
if last.trailing_whitespace:
text += last.trailing_whitespace
text += tok
merged = token(text,
pre_tags=last.pre_tags,
post_tags=tok.post_tags,
trailing_whitespace=tok.trailing_whitespace)
merged.annotation = last.annotation
tokens[-1] = merged
def markup_serialize_tokens(tokens, markup_func):
"""
Serialize the list of tokens into a list of text chunks, calling
markup_func around text to add annotations.
"""
for token in tokens:
for pre in token.pre_tags:
yield pre
html = token.html()
html = markup_func(html, token.annotation)
if token.trailing_whitespace:
html += token.trailing_whitespace
yield html
for post in token.post_tags:
yield post
############################################################
## HTML Diffs
############################################################
def htmldiff(old_html, new_html):
## FIXME: this should take parsed documents too, and use their body
## or other content.
""" Do a diff of the old and new document. The documents are HTML
*fragments* (str/UTF8 or unicode), they are not complete documents
(i.e., no <html> tag).
Returns HTML with <ins> and <del> tags added around the
appropriate text.
Markup is generally ignored, with the markup from new_html
preserved, and possibly some markup from old_html (though it is
considered acceptable to lose some of the old markup). Only the
words in the HTML are diffed. The exception is <img> tags, which
are treated like words, and the href attribute of <a> tags, which
are noted inside the tag itself when there are changes.
"""
old_html_tokens = tokenize(old_html)
new_html_tokens = tokenize(new_html)
result = htmldiff_tokens(old_html_tokens, new_html_tokens)
result = ''.join(result).strip()
return fixup_ins_del_tags(result)
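# Usage sketch (doctest-style; exact whitespace in the output may
# vary):
#
#   >>> htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>')
#   '<p><ins>Goodbye</ins> <del>Hello</del> World</p>'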
def htmldiff_tokens(html1_tokens, html2_tokens):
""" Does a diff on the tokens themselves, returning a list of text
chunks (not tokens).
"""
# There are several passes as we do the differences. The tokens
# isolate the portion of the content we care to diff; difflib does
# all the actual hard work at that point.
#
# Then we must create a valid document from pieces of both the old
# document and the new document. We generally prefer to take
# markup from the new document, and only do a best effort attempt
# to keep markup from the old document; anything that we can't
# resolve we throw away. Also we try to put the deletes as close
# to the location where we think they would have been -- because
# we are only keeping the markup from the new document, it can be
# fuzzy where in the new document the old text would have gone.
# Again we just do a best effort attempt.
s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
commands = s.get_opcodes()
result = []
for command, i1, i2, j1, j2 in commands:
if command == 'equal':
result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
continue
if command == 'insert' or command == 'replace':
ins_tokens = expand_tokens(html2_tokens[j1:j2])
merge_insert(ins_tokens, result)
if command == 'delete' or command == 'replace':
del_tokens = expand_tokens(html1_tokens[i1:i2])
merge_delete(del_tokens, result)
# If deletes were inserted directly as <del> then we'd have an
# invalid document at this point. Instead we put in special
# markers, and when the complete diffed document has been created
# we try to move the deletes around and resolve any problems.
result = cleanup_delete(result)
return result
def expand_tokens(tokens, equal=False):
"""Given a list of tokens, return a generator of the chunks of
text for the data in the tokens.
"""
for token in tokens:
for pre in token.pre_tags:
yield pre
if not equal or not token.hide_when_equal:
if token.trailing_whitespace:
yield token.html() + token.trailing_whitespace
else:
yield token.html()
for post in token.post_tags:
yield post
def merge_insert(ins_chunks, doc):
""" doc is the already-handled document (as a list of text chunks);
here we add <ins>ins_chunks</ins> to the end of that. """
# Though we don't throw away unbalanced_start or unbalanced_end
# (we assume there is accompanying markup later or earlier in the
# document), we only put <ins> around the balanced portion.
unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
doc.extend(unbalanced_start)
if doc and not doc[-1].endswith(' '):
# Fix up the case where the word before the insert didn't end with
# a space
doc[-1] += ' '
doc.append('<ins>')
if balanced and balanced[-1].endswith(' '):
# We move space outside of </ins>
balanced[-1] = balanced[-1][:-1]
doc.extend(balanced)
doc.append('</ins> ')
doc.extend(unbalanced_end)
# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
pass
class DEL_END:
pass
class NoDeletes(Exception):
""" Raised when the document no longer contains any pending deletes
(DEL_START/DEL_END) """
def merge_delete(del_chunks, doc):
""" Adds the text chunks in del_chunks to the document doc (another
list of text chunks) with marker to show it is a delete.
cleanup_delete later resolves these markers into <del> tags."""
doc.append(DEL_START)
doc.extend(del_chunks)
doc.append(DEL_END)
def cleanup_delete(chunks):
""" Cleans up any DEL_START/DEL_END markers in the document, replacing
them with <del></del>. To do this while keeping the document
valid, it may need to drop some tags (either start or end tags).
It may also move the del into adjacent tags to try to move it to a
similar location where it was originally located (e.g., moving a
delete into preceding <div> tag, if the del looks like (DEL_START,
'Text</div>', DEL_END)"""
while 1:
# Find a pending DEL_START/DEL_END, splitting the document
# into stuff-preceding-DEL_START, stuff-inside, and
# stuff-following-DEL_END
try:
pre_delete, delete, post_delete = split_delete(chunks)
except NoDeletes:
# Nothing found, we've cleaned up the entire doc
break
# The stuff-inside-DEL_START/END may not be well balanced
# markup. First we figure out what unbalanced portions there are:
unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
# Then we move the span forward and/or backward based on these
# unbalanced portions:
locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
doc = pre_delete
if doc and not doc[-1].endswith(' '):
# Fix up case where the word before us didn't have a trailing space
doc[-1] += ' '
doc.append('<del>')
if balanced and balanced[-1].endswith(' '):
# We move space outside of </del>
balanced[-1] = balanced[-1][:-1]
doc.extend(balanced)
doc.append('</del> ')
doc.extend(post_delete)
chunks = doc
return chunks
def split_unbalanced(chunks):
"""Return (unbalanced_start, balanced, unbalanced_end), where each is
a list of text and tag chunks.
unbalanced_start is a list of all the tags that are opened, but
not closed in this span. Similarly, unbalanced_end is a list of
tags that are closed but were not opened. Extracting these might
mean some reordering of the chunks."""
start = []
end = []
tag_stack = []
balanced = []
for chunk in chunks:
if not chunk.startswith('<'):
balanced.append(chunk)
continue
endtag = chunk[1] == '/'
name = chunk.split()[0].strip('<>/')
if name in empty_tags:
balanced.append(chunk)
continue
if endtag:
if tag_stack and tag_stack[-1][0] == name:
balanced.append(chunk)
name, pos, tag = tag_stack.pop()
balanced[pos] = tag
elif tag_stack:
start.extend([tag for name, pos, tag in tag_stack])
tag_stack = []
end.append(chunk)
else:
end.append(chunk)
else:
tag_stack.append((name, len(balanced), chunk))
balanced.append(None)
start.extend(
[chunk for name, pos, chunk in tag_stack])
balanced = [chunk for chunk in balanced if chunk is not None]
return start, balanced, end
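# Worked example (illustrative): the <b> is opened but never closed,
# the </i> is closed but never opened, and only the word is balanced:
#
#   >>> split_unbalanced(['<b>', 'hi', '</i>'])
#   (['<b>'], ['hi'], ['</i>'])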
def split_delete(chunks):
""" Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
stuff_after_DEL_END). Returns the first case found (there may be
more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if
there's no DEL_START found. """
try:
pos = chunks.index(DEL_START)
except ValueError:
raise NoDeletes
pos2 = chunks.index(DEL_END)
return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
""" pre_delete and post_delete implicitly point to a place in the
document (where the two were split). This moves that point (by
popping items from one and pushing them onto the other). It moves
the point to try to find a place where unbalanced_start applies.
As an example::
>>> unbalanced_start = ['<div>']
>>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
>>> pre, post = doc[:3], doc[3:]
>>> pre, post
(['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
>>> locate_unbalanced_start(unbalanced_start, pre, post)
>>> pre, post
(['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
As you can see, we moved the point so that the dangling <div> that
we found will be effectively replaced by the div in the original
document. If this doesn't work out, we just throw away
unbalanced_start without doing anything.
"""
while 1:
if not unbalanced_start:
            # We have totally succeeded in finding the position
break
finding = unbalanced_start[0]
finding_name = finding.split()[0].strip('<>')
if not post_delete:
break
next = post_delete[0]
if next is DEL_START or not next.startswith('<'):
# Reached a word, we can't move the delete text forward
break
if next[1] == '/':
# Reached a closing tag, can we go further? Maybe not...
break
name = next.split()[0].strip('<>')
if name == 'ins':
# Can't move into an insert
break
assert name != 'del', (
"Unexpected delete tag: %r" % next)
if name == finding_name:
unbalanced_start.pop(0)
pre_delete.append(post_delete.pop(0))
else:
# Found a tag that doesn't match
break
def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
""" like locate_unbalanced_start, except handling end tags and
possibly moving the point earlier in the document. """
while 1:
if not unbalanced_end:
# Success
break
finding = unbalanced_end[-1]
finding_name = finding.split()[0].strip('<>/')
if not pre_delete:
break
next = pre_delete[-1]
if next is DEL_END or not next.startswith('</'):
# A word or a start tag
break
name = next.split()[0].strip('<>/')
if name == 'ins' or name == 'del':
# Can't move into an insert or delete
break
if name == finding_name:
unbalanced_end.pop()
post_delete.insert(0, pre_delete.pop())
else:
# Found a tag that doesn't match
break
class token(_unicode):
""" Represents a diffable token, generally a word that is displayed to
the user. Opening tags are attached to this token when they are
adjacent (pre_tags) and closing tags that follow the word
(post_tags). Some exceptions occur when there are empty tags
adjacent to a word, so there may be close tags in pre_tags, or
open tags in post_tags.
We also keep track of whether the word was originally followed by
whitespace, even though we do not want to treat the word as
equivalent to a similar word that does not have a trailing
space."""
# When this is true, the token will be eliminated from the
# displayed diff if no change has occurred:
hide_when_equal = False
def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
obj = _unicode.__new__(cls, text)
if pre_tags is not None:
obj.pre_tags = pre_tags
else:
obj.pre_tags = []
if post_tags is not None:
obj.post_tags = post_tags
else:
obj.post_tags = []
obj.trailing_whitespace = trailing_whitespace
return obj
def __repr__(self):
return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
self.post_tags, self.trailing_whitespace)
def html(self):
return _unicode(self)
class tag_token(token):
""" Represents a token that is actually a tag. Currently this is just
the <img> tag, which takes up visible space just like a word but
is only represented in a document by a tag. """
def __new__(cls, tag, data, html_repr, pre_tags=None,
post_tags=None, trailing_whitespace=""):
obj = token.__new__(cls, "%s: %s" % (type, data),
pre_tags=pre_tags,
post_tags=post_tags,
trailing_whitespace=trailing_whitespace)
obj.tag = tag
obj.data = data
obj.html_repr = html_repr
return obj
def __repr__(self):
return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
self.tag,
self.data,
self.html_repr,
self.pre_tags,
self.post_tags,
self.trailing_whitespace)
def html(self):
return self.html_repr
class href_token(token):
""" Represents the href in an anchor tag. Unlike other words, we only
show the href when it changes. """
hide_when_equal = True
def html(self):
return ' Link: %s' % self
def tokenize(html, include_hrefs=True):
"""
    Parses the given HTML and returns token objects (words with attached tags).
This parses only the content of a page; anything in the head is
ignored, and the <head> and <body> elements are themselves
optional. The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).
<ins> and <del> tags are also eliminated from the document, as
that gets confusing.
If include_hrefs is true, then the href attribute of <a> tags is
included as a special kind of diffable token."""
if etree.iselement(html):
body_el = html
else:
body_el = parse_html(html, cleanup=True)
# Then we split the document into text chunks for each tag, word, and end tag:
chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
# Finally re-joining them into token objects:
return fixup_chunks(chunks)
def parse_html(html, cleanup=True):
"""
Parses an HTML fragment, returning an lxml element. Note that the HTML will be
wrapped in a <div> tag that was not in the original document.
If cleanup is true, make sure there's no <head> or <body>, and get
rid of any <ins> and <del> tags.
"""
if cleanup:
# This removes any extra markup or structure like <head>:
html = cleanup_html(html)
return fragment_fromstring(html, create_parent=True)
_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
def cleanup_html(html):
""" This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is a <body>).
Also <ins> and <del> tags are removed. """
match = _body_re.search(html)
if match:
html = html[match.end():]
match = _end_body_re.search(html)
if match:
html = html[:match.start()]
html = _ins_del_re.sub('', html)
return html
end_whitespace_re = re.compile(r'[ \t\n\r]$')
def split_trailing_whitespace(word):
"""
This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
"""
stripped_length = len(word.rstrip())
return word[0:stripped_length], word[stripped_length:]
def fixup_chunks(chunks):
"""
This function takes a list of chunks and produces a list of tokens.
"""
tag_accum = []
cur_word = None
result = []
for chunk in chunks:
if isinstance(chunk, tuple):
if chunk[0] == 'img':
src = chunk[1]
tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
cur_word = tag_token('img', src, html_repr=tag,
pre_tags=tag_accum,
trailing_whitespace=trailing_whitespace)
tag_accum = []
result.append(cur_word)
elif chunk[0] == 'href':
href = chunk[1]
cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
tag_accum = []
result.append(cur_word)
continue
if is_word(chunk):
chunk, trailing_whitespace = split_trailing_whitespace(chunk)
cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
tag_accum = []
result.append(cur_word)
elif is_start_tag(chunk):
tag_accum.append(chunk)
elif is_end_tag(chunk):
if tag_accum:
tag_accum.append(chunk)
else:
assert cur_word, (
"Weird state, cur_word=%r, result=%r, chunks=%r of %r"
% (cur_word, result, chunk, chunks))
cur_word.post_tags.append(chunk)
else:
assert(0)
if not result:
return [token('', pre_tags=tag_accum)]
else:
result[-1].post_tags.extend(tag_accum)
return result
# All the tags in HTML that don't require end tags:
empty_tags = (
'param', 'img', 'area', 'br', 'basefont', 'input',
'base', 'meta', 'link', 'col')
block_level_tags = (
'address',
'blockquote',
'center',
'dir',
'div',
'dl',
'fieldset',
'form',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'hr',
'isindex',
'menu',
'noframes',
'noscript',
'ol',
'p',
'pre',
'table',
'ul',
)
block_level_container_tags = (
'dd',
'dt',
'frameset',
'li',
'tbody',
'td',
'tfoot',
'th',
'thead',
'tr',
)
def flatten_el(el, include_hrefs, skip_tag=False):
""" Takes an lxml element el, and generates all the text chunks for
that tag. Each start tag is a chunk, each word is a chunk, and each
end tag is a chunk.
If skip_tag is true, then the outermost container tag is
not returned (just its contents)."""
if not skip_tag:
if el.tag == 'img':
yield ('img', el.get('src'), start_tag(el))
else:
yield start_tag(el)
if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
return
start_words = split_words(el.text)
for word in start_words:
yield html_escape(word)
for child in el:
for item in flatten_el(child, include_hrefs=include_hrefs):
yield item
if el.tag == 'a' and el.get('href') and include_hrefs:
yield ('href', el.get('href'))
if not skip_tag:
yield end_tag(el)
end_words = split_words(el.tail)
for word in end_words:
yield html_escape(word)
split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)
def split_words(text):
""" Splits some text into words. Includes trailing whitespace
on each word when appropriate. """
if not text or not text.strip():
return []
words = split_words_re.findall(text)
return words
start_whitespace_re = re.compile(r'^[ \t\n\r]')
def start_tag(el):
"""
The text representation of the start tag for a tag.
"""
return '<%s%s>' % (
el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
for name, value in el.attrib.items()]))
def end_tag(el):
""" The text representation of an end tag for a tag. Includes
trailing whitespace when appropriate. """
if el.tail and start_whitespace_re.search(el.tail):
extra = ' '
else:
extra = ''
return '</%s>%s' % (el.tag, extra)
def is_word(tok):
return not tok.startswith('<')
def is_end_tag(tok):
return tok.startswith('</')
def is_start_tag(tok):
return tok.startswith('<') and not tok.startswith('</')
def fixup_ins_del_tags(html):
""" Given an html string, move any <ins> or <del> tags inside of any
block-level elements, e.g. transform <ins><p>word</p></ins> to
<p><ins>word</ins></p> """
doc = parse_html(html, cleanup=False)
_fixup_ins_del_tags(doc)
html = serialize_html_fragment(doc, skip_outer=True)
return html
def serialize_html_fragment(el, skip_outer=False):
""" Serialize a single lxml element as HTML. The serialized form
includes the elements tail.
If skip_outer is true, then don't serialize the outermost tag
"""
assert not isinstance(el, basestring), (
"You should pass in an element, not a string like %r" % el)
html = etree.tostring(el, method="html", encoding=_unicode)
if skip_outer:
# Get rid of the extra starting tag:
html = html[html.find('>')+1:]
# Get rid of the extra end tag:
html = html[:html.rfind('<')]
return html.strip()
else:
return html
def _fixup_ins_del_tags(doc):
"""fixup_ins_del_tags that works on an lxml document in-place
"""
for tag in ['ins', 'del']:
for el in doc.xpath('descendant-or-self::%s' % tag):
if not _contains_block_level_tag(el):
continue
_move_el_inside_block(el, tag=tag)
el.drop_tag()
#_merge_element_contents(el)
def _contains_block_level_tag(el):
"""True if the element contains any block-level elements, like <p>, <td>, etc.
"""
if el.tag in block_level_tags or el.tag in block_level_container_tags:
return True
for child in el:
if _contains_block_level_tag(child):
return True
return False
def _move_el_inside_block(el, tag):
""" helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
and moves them inside any block-level tags. """
for child in el:
if _contains_block_level_tag(child):
break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
children_tag = etree.Element(tag)
children_tag.text = el.text
el.text = None
children_tag.extend(list(el))
el[:] = [children_tag]
return
for child in list(el):
if _contains_block_level_tag(child):
_move_el_inside_block(child, tag)
if child.tail:
tail_tag = etree.Element(tag)
tail_tag.text = child.tail
child.tail = None
el.insert(el.index(child)+1, tail_tag)
else:
child_tag = etree.Element(tag)
el.replace(child, child_tag)
child_tag.append(child)
if el.text:
text_tag = etree.Element(tag)
text_tag.text = el.text
el.text = None
el.insert(0, text_tag)
def _merge_element_contents(el):
"""
Removes an element, but merges its contents into its place, e.g.,
given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
<p>Hi there!</p>
"""
parent = el.getparent()
text = el.text or ''
if el.tail:
if not len(el):
text += el.tail
else:
if el[-1].tail:
el[-1].tail += el.tail
else:
el[-1].tail = el.tail
index = parent.index(el)
if text:
if index == 0:
previous = None
else:
previous = parent[index-1]
if previous is None:
if parent.text:
parent.text += text
else:
parent.text = text
else:
if previous.tail:
previous.tail += text
else:
previous.tail = text
parent[index:index+1] = el.getchildren()
class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
"""
Acts like SequenceMatcher, but tries not to find very small equal
blocks amidst large spans of changes
"""
threshold = 2
def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
threshold = min(self.threshold, size / 4)
actual = difflib.SequenceMatcher.get_matching_blocks(self)
return [item for item in actual
if item[2] > threshold
or not item[2]]
if __name__ == '__main__':
from lxml.html import _diffcommand
_diffcommand.main()

299
lib/lxml/html/formfill.py Normal file
View file

@ -0,0 +1,299 @@
from lxml.etree import XPath, ElementBase
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
from lxml.html import defs
import copy
try:
basestring
except NameError:
# Python 3
basestring = str
__all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
'insert_errors', 'insert_errors_html',
'DefaultErrorCreator']
class FormNotFound(LookupError):
"""
Raised when no form can be found
"""
_form_name_xpath = XPath('descendant-or-self::form[@name=$name]|descendant-or-self::x:form[@name=$name]', namespaces={'x':XHTML_NAMESPACE})
_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
namespaces={'x':XHTML_NAMESPACE})
_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]',
namespaces={'x':XHTML_NAMESPACE})
_name_xpath = XPath('descendant-or-self::*[@name=$name]')
def fill_form(
el,
values,
form_id=None,
form_index=None,
):
el = _find_form(el, form_id=form_id, form_index=form_index)
_fill_form(el, values)
def fill_form_html(html, values, form_id=None, form_index=None):
result_type = type(html)
if isinstance(html, basestring):
doc = fromstring(html)
else:
doc = copy.deepcopy(html)
fill_form(doc, values, form_id=form_id, form_index=form_index)
return _transform_result(result_type, doc)
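# Usage sketch (doctest-style; attribute ordering in the serialised
# output may vary):
#
#   >>> page = '<form id="f"><input type="text" name="who"></form>'
#   >>> fill_form_html(page, {'who': 'World'}, form_id='f')
#   '<form id="f"><input type="text" name="who" value="World"></form>'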
def _fill_form(el, values):
counts = {}
if hasattr(values, 'mixed'):
# For Paste request parameters
values = values.mixed()
inputs = _input_xpath(el)
for input in inputs:
name = input.get('name')
if not name:
continue
if _takes_multiple(input):
value = values.get(name, [])
if not isinstance(value, (list, tuple)):
value = [value]
_fill_multiple(input, value)
elif name not in values:
continue
else:
index = counts.get(name, 0)
counts[name] = index + 1
value = values[name]
if isinstance(value, (list, tuple)):
try:
value = value[index]
except IndexError:
continue
elif index > 0:
continue
_fill_single(input, value)
def _takes_multiple(input):
if _nons(input.tag) == 'select' and input.get('multiple'):
# FIXME: multiple="0"?
return True
type = input.get('type', '').lower()
if type in ('radio', 'checkbox'):
return True
return False
def _fill_multiple(input, value):
type = input.get('type', '').lower()
if type == 'checkbox':
v = input.get('value')
if v is None:
if not value:
result = False
else:
result = value[0]
                if isinstance(result, basestring):
# The only valid "on" value for an unnamed checkbox is 'on'
result = result == 'on'
_check(input, result)
else:
_check(input, v in value)
elif type == 'radio':
v = input.get('value')
_check(input, v in value)
else:
assert _nons(input.tag) == 'select'
for option in _options_xpath(input):
v = option.get('value')
if v is None:
# This seems to be the default, at least on IE
# FIXME: but I'm not sure
v = option.text_content()
_select(option, v in value)
def _check(el, check):
if check:
el.set('checked', '')
else:
if 'checked' in el.attrib:
del el.attrib['checked']
def _select(el, select):
if select:
el.set('selected', '')
else:
if 'selected' in el.attrib:
del el.attrib['selected']
def _fill_single(input, value):
if _nons(input.tag) == 'textarea':
input.text = value
else:
input.set('value', value)
def _find_form(el, form_id=None, form_index=None):
if form_id is None and form_index is None:
forms = _forms_xpath(el)
for form in forms:
return form
raise FormNotFound(
"No forms in page")
if form_id is not None:
form = el.get_element_by_id(form_id)
if form is not None:
return form
forms = _form_name_xpath(el, name=form_id)
if forms:
return forms[0]
else:
raise FormNotFound(
"No form with the name or id of %r (forms: %s)"
            % (form_id, ', '.join(_find_form_ids(el))))
if form_index is not None:
forms = _forms_xpath(el)
try:
return forms[form_index]
except IndexError:
raise FormNotFound(
"There is no form with the index %r (%i forms found)"
% (form_index, len(forms)))
def _find_form_ids(el):
forms = _forms_xpath(el)
if not forms:
yield '(no forms)'
return
for index, form in enumerate(forms):
if form.get('id'):
if form.get('name'):
yield '%s or %s' % (form.get('id'),
form.get('name'))
else:
yield form.get('id')
elif form.get('name'):
yield form.get('name')
else:
yield '(unnamed form %s)' % index
############################################################
## Error filling
############################################################
class DefaultErrorCreator(object):
insert_before = True
block_inside = True
error_container_tag = 'div'
error_message_class = 'error-message'
error_block_class = 'error-block'
default_message = "Invalid"
def __init__(self, **kw):
for name, value in kw.items():
if not hasattr(self, name):
raise TypeError(
"Unexpected keyword argument: %s" % name)
setattr(self, name, value)
def __call__(self, el, is_block, message):
error_el = el.makeelement(self.error_container_tag)
if self.error_message_class:
error_el.set('class', self.error_message_class)
if is_block and self.error_block_class:
error_el.set('class', error_el.get('class', '')+' '+self.error_block_class)
if message is None or message == '':
message = self.default_message
if isinstance(message, ElementBase):
error_el.append(message)
else:
assert isinstance(message, basestring), (
"Bad message; should be a string or element: %r" % message)
error_el.text = message or self.default_message
if is_block and self.block_inside:
if self.insert_before:
error_el.tail = el.text
el.text = None
el.insert(0, error_el)
else:
el.append(error_el)
else:
parent = el.getparent()
pos = parent.index(el)
if self.insert_before:
parent.insert(pos, error_el)
else:
error_el.tail = el.tail
el.tail = None
parent.insert(pos+1, error_el)
default_error_creator = DefaultErrorCreator()
def insert_errors(
el,
errors,
form_id=None,
form_index=None,
error_class="error",
error_creator=default_error_creator,
):
el = _find_form(el, form_id=form_id, form_index=form_index)
for name, error in errors.items():
if error is None:
continue
for error_el, message in _find_elements_for_name(el, name, error):
assert isinstance(message, (basestring, type(None), ElementBase)), (
"Bad message: %r" % message)
_insert_error(error_el, message, error_class, error_creator)
def insert_errors_html(html, values, **kw):
result_type = type(html)
if isinstance(html, basestring):
doc = fromstring(html)
else:
doc = copy.deepcopy(html)
insert_errors(doc, values, **kw)
return _transform_result(result_type, doc)
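# Illustrative sketch (not part of the original module): attaching an error
# message to a named field.  Markup and messages below are hypothetical.
def _demo_insert_errors():  # pragma: no cover - illustrative only
    doc = fromstring('<form><input type="text" name="user"></form>')
    insert_errors(doc, {'user': 'Required'})
    # the input is flagged with the error class and an error <div> is
    # inserted just before it
    assert doc.xpath('//input')[0].get('class') == 'error'
    assert doc.xpath('//div')[0].text == 'Required'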
def _insert_error(el, error, error_class, error_creator):
if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea':
is_block = False
else:
is_block = True
if _nons(el.tag) != 'form' and error_class:
_add_class(el, error_class)
if el.get('id'):
labels = _label_for_xpath(el, for_id=el.get('id'))
if labels:
for label in labels:
_add_class(label, error_class)
error_creator(el, is_block, error)
def _add_class(el, class_name):
if el.get('class'):
el.set('class', el.get('class')+' '+class_name)
else:
el.set('class', class_name)
def _find_elements_for_name(form, name, error):
if name is None:
# An error for the entire form
yield form, error
return
if name.startswith('#'):
# By id
el = form.get_element_by_id(name[1:])
if el is not None:
yield el, error
return
els = _name_xpath(form, name=name)
if not els:
# FIXME: should this raise an exception?
return
if not isinstance(error, (list, tuple)):
yield els[0], error
return
# FIXME: if error is longer than els, should it raise an error?
for el, err in zip(els, error):
if err is None:
continue
yield el, err

lib/lxml/html/html5parser.py Normal file
@@ -0,0 +1,207 @@
"""
An interface to html5lib that mimics the lxml.html interface.
"""
import sys
import string
from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder
from lxml import etree
from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
# python3 compatibility
try:
_strings = basestring
except NameError:
_strings = (bytes, str)
try:
from urllib2 import urlopen
except ImportError:
from urllib.request import urlopen
try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse
class HTMLParser(_HTMLParser):
"""An html5lib HTML parser with lxml as tree."""
def __init__(self, strict=False, **kwargs):
_HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
try:
from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
pass
else:
class XHTMLParser(_XHTMLParser):
"""An html5lib XHTML Parser with lxml as tree."""
def __init__(self, strict=False, **kwargs):
_XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
xhtml_parser = XHTMLParser()
def _find_tag(tree, tag):
elem = tree.find(tag)
if elem is not None:
return elem
return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
def document_fromstring(html, guess_charset=True, parser=None):
"""Parse a whole document into a string."""
if not isinstance(html, _strings):
raise TypeError('string required')
if parser is None:
parser = html_parser
return parser.parse(html, useChardet=guess_charset).getroot()
def fragments_fromstring(html, no_leading_text=False,
guess_charset=False, parser=None):
"""Parses several HTML elements, returning a list of elements.
The first item in the list may be a string. If no_leading_text is true,
then it will be an error if there is leading text, and it will always be
a list of only elements.
If `guess_charset` is `True` and the text was not unicode but a
bytestring, the `chardet` library will perform charset guessing on the
string.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
if parser is None:
parser = html_parser
children = parser.parseFragment(html, 'div', useChardet=guess_charset)
if children and isinstance(children[0], _strings):
if no_leading_text:
if children[0].strip():
raise etree.ParserError('There is leading text: %r' %
children[0])
del children[0]
return children
def fragment_fromstring(html, create_parent=False,
guess_charset=False, parser=None):
"""Parses a single HTML element; it is an error if there is more than
one element, or if anything but whitespace precedes or follows the
element.
If create_parent is true (or is a tag name) then a parent node
will be created to encapsulate the HTML in a single element. In
this case, leading or trailing text is allowed.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
accept_leading_text = bool(create_parent)
elements = fragments_fromstring(
html, guess_charset=guess_charset, parser=parser,
no_leading_text=not accept_leading_text)
if create_parent:
if not isinstance(create_parent, _strings):
create_parent = 'div'
new_root = Element(create_parent)
if elements:
if isinstance(elements[0], _strings):
new_root.text = elements[0]
del elements[0]
new_root.extend(elements)
return new_root
if not elements:
raise etree.ParserError('No elements found')
if len(elements) > 1:
raise etree.ParserError('Multiple elements found')
result = elements[0]
if result.tail and result.tail.strip():
raise etree.ParserError('Element followed by text: %r' % result.tail)
result.tail = None
return result
def fromstring(html, guess_charset=True, parser=None):
"""Parse the html, returning a single element/document.
This tries to minimally parse the chunk of text, without knowing if it
is a fragment or a document.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
doc = document_fromstring(html, parser=parser,
guess_charset=guess_charset)
# document starts with doctype or <html>, full document!
start = html[:50].lstrip().lower()
if start.startswith('<html') or start.startswith('<!doctype'):
return doc
head = _find_tag(doc, 'head')
# if the head is not empty we have a full document
if len(head):
return doc
body = _find_tag(doc, 'body')
# The body has just one element, so it was probably a single
# element passed in
if (len(body) == 1 and (not body.text or not body.text.strip())
and (not body[-1].tail or not body[-1].tail.strip())):
return body[0]
# Now we have a body which represents a bunch of tags which have the
# content that was passed in. We will create a fake container, which
# is the body tag, except <body> implies too much structure.
if _contains_block_level_tag(body):
body.tag = 'div'
else:
body.tag = 'span'
return body
def parse(filename_url_or_file, guess_charset=True, parser=None):
"""Parse a filename, URL, or file-like object into an HTML document
tree. Note: this returns a tree, not an element. Use
``parse(...).getroot()`` to get the document root.
"""
if parser is None:
parser = html_parser
if not isinstance(filename_url_or_file, _strings):
fp = filename_url_or_file
elif _looks_like_url(filename_url_or_file):
fp = urlopen(filename_url_or_file)
else:
fp = open(filename_url_or_file, 'rb')
return parser.parse(fp, useChardet=guess_charset)
def _looks_like_url(s):
    scheme = urlparse(s)[0]
if not scheme:
return False
elif (sys.platform == 'win32' and
scheme in string.ascii_letters
and len(scheme) == 1):
# looks like a 'normal' absolute path
return False
else:
return True
html_parser = HTMLParser()
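# Illustrative sketch (not part of the original module): how fromstring()
# decides what to return, assuming html5lib is installed.  guess_charset is
# disabled because the inputs are already unicode.
def _demo_fromstring():  # pragma: no cover - illustrative only
    assert fromstring('<html><body>x</body></html>', guess_charset=False).tag == 'html'
    assert fromstring('<p>one</p>', guess_charset=False).tag == 'p'
    # several top-level block elements come back in a synthetic <div> wrapper
    assert fromstring('<p>one</p><p>two</p>', guess_charset=False).tag == 'div'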

lib/lxml/html/soupparser.py Normal file
@@ -0,0 +1,125 @@
__doc__ = """External interface to the BeautifulSoup HTML parser.
"""
__all__ = ["fromstring", "parse", "convert_tree"]
from lxml import etree, html
from BeautifulSoup import \
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
"""Parse a string of HTML data into an Element tree using the
BeautifulSoup parser.
Returns the root ``<html>`` Element of the tree.
You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
through the `makeelement` keyword. By default, the standard
``BeautifulSoup`` class and the default factory of `lxml.html` are
used.
"""
return _parse(data, beautifulsoup, makeelement, **bsargs)
def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
"""Parse a file into an ElemenTree using the BeautifulSoup parser.
You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
through the `makeelement` keyword. By default, the standard
``BeautifulSoup`` class and the default factory of `lxml.html` are
used.
"""
if not hasattr(file, 'read'):
file = open(file)
root = _parse(file, beautifulsoup, makeelement, **bsargs)
return etree.ElementTree(root)
def convert_tree(beautiful_soup_tree, makeelement=None):
"""Convert a BeautifulSoup tree to a list of Element trees.
Returns a list instead of a single root Element to support
HTML-like soup with more than one root element.
You can pass a different Element factory through the `makeelement`
keyword.
"""
if makeelement is None:
makeelement = html.html_parser.makeelement
root = _convert_tree(beautiful_soup_tree, makeelement)
children = root.getchildren()
for child in children:
root.remove(child)
return children
# helpers
def _parse(source, beautifulsoup, makeelement, **bsargs):
if beautifulsoup is None:
beautifulsoup = BeautifulSoup
if makeelement is None:
makeelement = html.html_parser.makeelement
if 'convertEntities' not in bsargs:
bsargs['convertEntities'] = 'html'
tree = beautifulsoup(source, **bsargs)
root = _convert_tree(tree, makeelement)
# from ET: wrap the document in a html root element, if necessary
if len(root) == 1 and root[0].tag == "html":
return root[0]
root.tag = "html"
return root
def _convert_tree(beautiful_soup_tree, makeelement):
root = makeelement(beautiful_soup_tree.name,
attrib=dict(beautiful_soup_tree.attrs))
_convert_children(root, beautiful_soup_tree, makeelement)
return root
def _convert_children(parent, beautiful_soup_tree, makeelement):
SubElement = etree.SubElement
et_child = None
for child in beautiful_soup_tree:
if isinstance(child, Tag):
et_child = SubElement(parent, child.name, attrib=dict(
[(k, unescape(v)) for (k,v) in child.attrs]))
_convert_children(et_child, child, makeelement)
elif type(child) is NavigableString:
_append_text(parent, et_child, unescape(child))
else:
if isinstance(child, Comment):
parent.append(etree.Comment(child))
elif isinstance(child, ProcessingInstruction):
parent.append(etree.ProcessingInstruction(
*child.split(' ', 1)))
else: # CData
_append_text(parent, et_child, unescape(child))
def _append_text(parent, element, text):
if element is None:
parent.text = (parent.text or '') + text
else:
element.tail = (element.tail or '') + text
# copied from ET's ElementSoup
try:
    from html.entities import name2codepoint # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint
try:
    unichr
except NameError:
    # Python 3
    unichr = chr
import re
handle_entities = re.compile(r"&(\w+);").sub
def unescape(string):
if not string:
return ''
# work around oddities in BeautifulSoup's entity handling
def unescape_entity(m):
try:
return unichr(name2codepoint[m.group(1)])
except KeyError:
return m.group(0) # use as is
return handle_entities(unescape_entity, string)
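# Illustrative sketch (not part of the original module): known named entities
# are decoded, unknown ones pass through unchanged.
def _demo_unescape():  # pragma: no cover - illustrative only
    assert unescape('x &amp; y') == 'x & y'
    assert unescape('&bogus;') == '&bogus;'
    assert unescape(None) == ''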

@@ -0,0 +1 @@
#

@@ -0,0 +1,7 @@
Description: entry content contains applet
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:
<div>safe<applet code="foo.class" codebase="http://example.com/"></applet> <b>description</b></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,8 @@
Description: entry content contains blink
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description'
Options:
Notes: <div> wrapper
<div><blink>safe</blink> description</div>
----------
<div>safe description</div>

@@ -0,0 +1,84 @@
Description: entry content is crazy
Expect: not bozo and entries[0]['content'][0]['value'] == u'Crazy HTML -' + u'- Can Your Regex Parse This?\n\n\n\n<!-' + u'- <script> -' + u'->\n\n<!-' + u'- \n\t<script> \n-' + u'->\n\n\n\nfunction executeMe()\n{\n\n\n\n\n/* \n<h1>Did The Javascript Execute?</h1>\n<div>\nI will execute here, too, if you mouse over me\n</div>'
Options: -page_structure
Notes: for some reason the comments in the expected field are acting weird
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Crazy HTML -- Can Your Regex Parse This?</title>
</head>
<body notRealAttribute="value"onload="executeMe();"foo="bar"
>
<!-- <script> -->
<!--
<script>
-->
</script>
<script
>
function executeMe()
{
/* <script>
function am_i_javascript()
{
var str = "Some innocuously commented out stuff";
}
< /script>
*/
alert("Executed");
}
</script
>
<h1>Did The Javascript Execute?</h1>
<div notRealAttribute="value
"onmouseover="
executeMe();
"foo="bar">
I will execute here, too, if you mouse over me
</div>
</body>
</html>
----------
<html>
<head>
<title>Crazy HTML -- Can Your Regex Parse This?</title>
</head>
<body>
<h1>Did The Javascript Execute?</h1>
<div>
I will execute here, too, if you mouse over me
</div>
</body>
</html>

@@ -0,0 +1,8 @@
Description: entry content contains embed
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:
Notes: <div> wrapper, close <embed> tag (not closing it lost the <b> tag)
<div>safe<embed src="http://example.com/"></embed> <b>description</b></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,7 @@
Description: entry content contains frameset
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:
<div>safe<frameset rows="*"><frame src="http://example.com/"></frameset> <b>description</b></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,8 @@
Description: entry content contains iframe
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:
Notes: div wrapper, close <iframe>
<div>safe<iframe src="http://example.com/"></iframe> <b>description</b></iframe></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,7 @@
Description: entry content contains link
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:
<div>safe<link rel="stylesheet" type="text/css" href="http://example.com/evil.css"> <b>description</b></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,7 @@
Description: entry content contains meta
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:
<div>safe<meta http-equiv="Refresh" content="0; URL=http://example.com/"> <b>description</b></div>
----------
<div>safe <b>description</b></div>

View file

@ -0,0 +1,8 @@
Description: entry content contains object
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
Options:
Notes: div wrapper, close <object>
<div>safe<object classid="clsid:C932BA85-4374-101B-A56C-00AA003668DC"></object> <b>description</b></div>
----------
<div>safe <b>description</b></div>

@@ -0,0 +1,7 @@
Description: entry content contains onabort
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onabort="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onblur
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onblur="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onchange
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onchange="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onclick
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onclick="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains ondblclick
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options: javascript
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" ondblclick="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onerror
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onerror="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onfocus
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onfocus="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onkeydown
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onkeydown="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onkeypress
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onkeypress="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onkeyup
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onkeyup="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onload
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onload="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onmousedown
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onmousedown="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onmouseout
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onmouseout="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onmouseover
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onmouseover="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onmouseup
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onmouseup="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onreset
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onreset="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onresize
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onresize="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onsubmit
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onsubmit="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains onunload
Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
Options:
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onunload="location.href='http://www.ragingplatypus.com/';" />
----------
<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />

@@ -0,0 +1,7 @@
Description: entry content contains script
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description'
Options:
<div>safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>
----------
<div>safe description</div>

@@ -0,0 +1,13 @@
Description: entry content contains script (cdata)
Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description'
Options:
Notes: div wrapper. Currently not working because of how HTML() is parsing the CDATA (not in a useful way)
The resulting code is safe, it just includes crap from the <script> tag (but not the script tag
itself).
Ignore: true
<div>
<![CDATA[safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description]]>
</div>
----------
<div>safe description</div>

@@ -0,0 +1,7 @@
Description: entry content contains script (inline)
Expect: not bozo and entries[0]['content'][0]['value'] == u'<div>safe description</div>'
Options:
<div xmlns="http://www.w3.org/1999/xhtml">safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>
----------
<div>safe description</div>

@@ -0,0 +1,7 @@
Description: entry content contains style
Expect: not bozo and entries[0]['content'][0]['value'] == u'<a href="http://www.ragingplatypus.com/">never trust your upstream platypus</a>'
Options: style
<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>
----------
<a href="http://www.ragingplatypus.com/">never trust your upstream platypus</a>

@@ -0,0 +1,8 @@
Description: I built a quick XSS fuzzer to detect any erroneous characters that are allowed after the open parenthesis but before the JavaScript directive in IE and Netscape 8.1 in secure site mode. These are in decimal but you can include hex and add padding of course. (Any of the following chars can be used: 1-32, 34, 39, 160, 8192-8203, 12288, 65279)
http://ha.ckers.org/xss.html#XSS_DIV_background-image_plus
Options: -safe_attrs_only
Notes: As you see, the CSS gets corrupted, but I don't really care that much.
<DIV STYLE="background-image: url(&#1;javascript:alert('XSS'))">text</div>
----------
<div style="background-image: url(">text</div>

@@ -0,0 +1,10 @@
Description: exploit (this has been modified slightly to obfuscate the url parameter). The original vulnerability was found by Renaud Lifchitz as a vulnerability in Hotmail.
http://ha.ckers.org/xss.html#XSS_DIV_background_image_unicode
Options: -safe_attrs_only
Ignore: true
Notes: I don't understand how this exploit works. It seems like the description actually refers to
the unicode you'd import, but why that matters I don't know.
<DIV STYLE="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">text</div>
----------
<div style="background-image: ">text</div>

@@ -0,0 +1,9 @@
Description: Downlevel-Hidden block (only works in IE5.0 and later and Netscape 8.1 in IE rendering engine mode). Some websites consider anything inside a comment block to be safe, and therefore do not remove it, which allows our Cross Site Scripting vector. Or the system could add comment tags around something to attempt to render it harmless. As we can see, that probably wouldn't do the job.
http://ha.ckers.org/xss.html#XSS_Downlevel-Hidden
Options: -comments, -processing_instructions
<div><!--[if gte IE 4]>
<SCRIPT>alert('XSS');</SCRIPT>
<![endif]--></div>
----------
<div></div>

@@ -0,0 +1,12 @@
Description: HTML+TIME in XML. This is how Grey Magic hacked Hotmail and Yahoo!. This only works in Internet Explorer and Netscape 8.1 in IE rendering engine mode and remember that you need to be between HTML and BODY tags for this to work
http://ha.ckers.org/xss.html#XSS_HTML_plus_time
Ignore: true
Notes: I don't understand the vector here, or how this is supposed to work.
<div>
<t:set attributeName="innerHTML" to="XSS&lt;SCRIPT DEFER&gt;alert(&quot;XSS&quot;)&lt;/SCRIPT&gt;">
</BODY></HTML></div>
----------
<div>
<t:set attributeName="innerHTML" to="XSS&lt;SCRIPT DEFER&gt;alert(&quot;XSS&quot;)&lt;/SCRIPT&gt;">
</BODY></HTML>x</div>

@@ -0,0 +1,15 @@
Description: javascript: in many forms
<div>
<a href="java
script:alert()">x</a>
<a href="j a v a s c r i p t:alert()">x</a>
<a href="jscript
:alert()">x</a>
</div>
----------
<div>
<a href="">x</a>
<a href="">x</a>
<a href="">x</a>
</div>

@@ -0,0 +1,8 @@
Description: to break up expression (Thanks to Roman Ivanov for this one)
http://ha.ckers.org/xss.html#XSS_STYLE_comment
Options: -safe_attrs_only
Notes: Because of the suspicious stuff in there, the style is removed entirely
<IMG STYLE="xss:expr/*XSS*/ession(alert('XSS'))">
----------
<img>

@@ -0,0 +1,10 @@
Description: (this is really a hybrid of the above XSS vectors, but it really does show how hard STYLE tags can be to parse apart, like above this can send IE into a loop)
http://ha.ckers.org/xss.html#XSS_IMG_STYLE_expression
Options: -safe_attrs_only
Notes: Modified to avoid a parsing quirk in libxml2 that ruins the XSS (the " marks).
Also there seemed to be an extra "p" in exppression.
<div><img style="xss: ex/*<A STYLE='no\xss:noxss(*//*);
xss:&#101;x&#x2F;*XSS*//*/*/pression(alert('XSS'))"></div>
----------
<div><img></div>

@@ -0,0 +1,8 @@
Description: tags with broken up JavaScript for XSS (this XSS at times sends IE into an infinite loop of alerts)
http://ha.ckers.org/xss.html#XSS_STYLE
Options: -safe_attrs_only
<div><STYLE>@im\port'\ja\vasc\ript:alert("XSS")';</STYLE></div>
----------
<div><style>/* deleted */</style></div>

@@ -0,0 +1,7 @@
Description: (Older versions of Netscape only)
http://ha.ckers.org/xss.html#XSS_STYLE_tag
Options: -safe_attrs_only
<div><STYLE TYPE="text/javascript">alert('XSS');</STYLE></div>
----------
<div></div>

@@ -0,0 +1,8 @@
Description: http://ha.ckers.org/xss.html#XSS_STYLE_background-image
Options: -style, -safe_attrs_only
Notes: The CSS is messed up here, but so it goes
<div><STYLE>.XSS{background-image:url("javascript:alert('XSS')");}</STYLE><A CLASS=XSS></A></div>
----------
<div><style>.XSS{background-image:url("");}</style><a class="XSS"></a></div>

@@ -0,0 +1,10 @@
Description: XML data island with comment obfuscation (this is another take on the same exploit that doesn't use CDATA fields, but rather uses comments to break up the javascript directive)
http://ha.ckers.org/xss.html#XSS_XML_data_island_comment
Ignore: true
Notes: I don't understand the vector here. Maybe datasrc should be filtered?
<div><XML ID="xss"><I><B>&lt;IMG SRC="javas<!-- -->cript:alert('XSS')"&gt;</B></I></XML>
<SPAN DATASRC="#xss" DATAFLD="B" DATAFORMATAS="HTML"></SPAN></div>
----------
<div><XML ID="xss"><I><B>&lt;IMG SRC="javas<!-- -->cript:alert('XSS')"&gt;</B></I></XML>
<SPAN DATASRC="#xss" DATAFLD="B" DATAFORMATAS="HTML"></SPAN>x</div>

@@ -0,0 +1,9 @@
Description: Locally hosted XML with embedded JavaScript that is generated using an XML data island. This is the same as above but instead refers to a locally hosted (must be on the same server) XML file that contains your cross site scripting vector. You can see the result here <http://ha.ckers.org/xssxmltest.html>
http://ha.ckers.org/xss.html#XSS_Local_XML
<div><XML SRC="xsstest.xml" ID=I></XML>
<SPAN DATASRC=#I DATAFLD=C DATAFORMATAS=HTML></SPAN></div>
----------
<div>
<span></span>
</div>

@@ -0,0 +1,16 @@
Description: XML namespace. The htc file must be located on the same server as your XSS vector
http://ha.ckers.org/xss.html#XSS_XML_namespace
Notes: I don't completely understand the vector here. page_structure is what does this.
<HTML xmlns:xss>
<body>
<?import namespace="xss" implementation="http://ha.ckers.org/xss.htc">
<xss:xss>XSS</xss:xss>
</body>
</HTML>
----------
<HTML>
<body>
<div>XSS</div>
</body>
</HTML>

@@ -0,0 +1,11 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest
def test_suite():
suite = unittest.TestSuite()
if sys.version_info >= (2,4):
suite.addTests([make_doctest('test_autolink.txt')])
return suite
if __name__ == '__main__':
unittest.main()

@@ -0,0 +1,79 @@
This tests autolink::
>>> from lxml.html import usedoctest
>>> from lxml.html.clean import autolink_html
>>> print(autolink_html('''
... <div>Link here: http://test.com/foo.html.</div>
... '''))
<div>Link here: <a href="http://test.com/foo.html">http://test.com/foo.html</a>.</div>
>>> print(autolink_html('''
... <div>Mail me at mailto:ianb@test.com or http://myhome.com</div>
... '''))
<div>Mail me at <a href="mailto:ianb@test.com">ianb@test.com</a>
or <a href="http://myhome.com">http://myhome.com</a></div>
>>> print(autolink_html('''
... <div>The <b>great</b> thing is the http://link.com links <i>and</i>
... the http://foobar.com links.</div>'''))
<div>The <b>great</b> thing is the <a href="http://link.com">http://link.com</a> links <i>and</i>
the <a href="http://foobar.com">http://foobar.com</a> links.</div>
>>> print(autolink_html('''
... <div>Link: &lt;http://foobar.com&gt;</div>'''))
<div>Link: &lt;<a href="http://foobar.com">http://foobar.com</a>&gt;</div>
>>> print(autolink_html('''
... <div>Link: (http://foobar.com)</div>'''))
<div>Link: (<a href="http://foobar.com">http://foobar.com</a>)</div>
Parentheses are tricky; we'll do our best::
>>> print(autolink_html('''
... <div>(Link: http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software))</div>
... '''))
<div>(Link: <a href="http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software)">http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software)</a>)</div>
>>> print(autolink_html('''
... <div>... a link: http://foo.com)</div>
... '''))
<div>... a link: <a href="http://foo.com">http://foo.com</a>)</div>
Some cases that won't be caught (on purpose)::
>>> print(autolink_html('''
... <div>A link to http://localhost/foo/bar won't, but a link to
... http://test.com will</div>'''))
<div>A link to http://localhost/foo/bar won't, but a link to
<a href="http://test.com">http://test.com</a> will</div>
>>> print(autolink_html('''
... <div>A link in <textarea>http://test.com</textarea></div>'''))
<div>A link in <textarea>http://test.com</textarea></div>
>>> print(autolink_html('''
... <div>A link in <a href="http://foo.com">http://bar.com</a></div>'''))
<div>A link in <a href="http://foo.com">http://bar.com</a></div>
>>> print(autolink_html('''
... <div>A link in <code>http://foo.com</code> or
... <span class="nolink">http://bar.com</span></div>'''))
<div>A link in <code>http://foo.com</code> or
<span class="nolink">http://bar.com</span></div>
There's also a word-wrapping function, which should probably be run
after autolink::
>>> from lxml.html.clean import word_break_html
>>> def pascii(s):
... print(s.encode('ascii', 'xmlcharrefreplace').decode('ascii'))
>>> pascii(word_break_html(u'''
... <div>Hey you
... 12345678901234567890123456789012345678901234567890</div>'''))
<div>Hey you
1234567890123456789012345678901234567890&#8203;1234567890</div>
Not everything is broken::
>>> pascii(word_break_html('''
... <div>Hey you
... <code>12345678901234567890123456789012345678901234567890</code></div>'''))
<div>Hey you
<code>12345678901234567890123456789012345678901234567890</code></div>
>>> pascii(word_break_html('''
... <a href="12345678901234567890123456789012345678901234567890">text</a>'''))
<a href="12345678901234567890123456789012345678901234567890">text</a>

@@ -0,0 +1,13 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest, doctest
import lxml.html
def test_suite():
suite = unittest.TestSuite()
if sys.version_info >= (2,4):
suite.addTests([make_doctest('test_basic.txt')])
suite.addTests([doctest.DocTestSuite(lxml.html)])
return suite
if __name__ == '__main__':
unittest.main()

@@ -0,0 +1,162 @@
lxml.html adds a find_class method to elements::
>>> from lxml.etree import Comment
>>> from lxml.html import document_fromstring, fragment_fromstring, tostring
>>> from lxml.html import fragments_fromstring, fromstring
>>> from lxml.html.clean import clean, clean_html
>>> from lxml.html import usedoctest
>>> try: unicode = unicode
... except NameError: unicode = str
>>> h = document_fromstring('''
... <html><head></head>
... <body>
... <a class="vcard
... fn url" href="foobar">P1</a>
... <a class="not-fn vcard" href="baz">P2</a>
... </body></html>''')
>>> print(tostring(h, encoding=unicode))
<html>
<head></head>
<body>
<a class="vcard
fn url" href="foobar">P1</a>
<a class="not-fn vcard" href="baz">P2</a>
</body>
</html>
>>> print([e.text for e in h.find_class('fn')])
['P1']
>>> print([e.text for e in h.find_class('vcard')])
['P1', 'P2']
Also added is find_rel_links, which you can use to search for links
like ``<a rel="$something">``::
>>> h = document_fromstring('''
... <a href="1">test 1</a>
... <a href="2" rel="tag">item 2</a>
... <a href="3" rel="tagging">item 3</a>
... <a href="4" rel="TAG">item 4</a>''')
>>> print([e.attrib['href'] for e in h.find_rel_links('tag')])
['2', '4']
>>> print([e.attrib['href'] for e in h.find_rel_links('nofollow')])
[]
Another method is ``get_element_by_id`` that does what it says::
>>> print(tostring(fragment_fromstring('''
... <div>
... <span id="test">stuff</span>
... </div>''').get_element_by_id('test'), encoding=unicode))
<span id="test">stuff</span>
Or to get the content of an element without the tags, use text_content()::
>>> el = fragment_fromstring('''
... <div>This is <a href="foo">a <b>bold</b> link</a></div>''')
>>> el.text_content()
'This is a bold link'
Or drop an element (leaving its content) or the entire tree, like::
>>> doc = document_fromstring('''
... <html>
... <body>
... <div id="body">
... This is a <a href="foo" id="link">test</a> of stuff.
... </div>
... <!-- a comment -->
... <div>footer</div>
... </body>
... </html>''')
>>> doc.get_element_by_id('link').drop_tag()
>>> print(tostring(doc, encoding=unicode))
<html>
<body>
<div id="body">
This is a test of stuff.
</div>
<!-- a comment -->
<div>footer</div>
</body>
</html>
>>> doc.get_element_by_id('body').drop_tree()
>>> print(tostring(doc, encoding=unicode))
<html>
<body>
<!-- a comment -->
<div>footer</div>
</body>
</html>
Note, however, that comment text will not be merged into the tree when you
drop the comment. Here, ``drop_tag()`` behaves exactly like ``drop_tree()``::
>>> for comment in doc.getiterator(Comment):
... comment.drop_tag()
>>> print(tostring(doc, encoding=unicode))
<html>
<body>
<div>footer</div>
</body>
</html>
In Python 3 it should be possible to parse strings given as bytes objects, at
least if an encoding is given::
>>> from lxml.html import HTMLParser
>>> enc = 'utf-8'
>>> html_parser = HTMLParser(encoding=enc)
>>> src = '<html><body>Test</body></html>'.encode(enc)
>>> doc = fromstring(src, parser=html_parser)
>>> print(tostring(doc, encoding=unicode))
<html><body>Test</body></html>
>>> docs = fragments_fromstring(src, parser=html_parser)
>>> len(docs)
1
>>> print(docs[0])
Test
Bug 599318: Calling fromstring with a frameset fragment should not raise an error;
the whole document is returned.
>>> import lxml.html
>>> content='''
... <frameset>
... <frame src="main.php" name="srcpg">
... </frameset>'''
>>> etree_document = lxml.html.fromstring(content)
>>> print(tostring(etree_document, encoding=unicode))
<html><frameset><frame src="main.php" name="srcpg"></frameset></html>
Bug 599318: Calling fromstring with a div fragment should not raise an error;
only the element is returned.
>>> import lxml.html
>>> content='<div></div>'
>>> etree_document = lxml.html.fromstring(content)
>>> print(tostring(etree_document, encoding=unicode))
<div></div>
Bug 599318: Calling fromstring with a head fragment should not raise an error;
the whole document is returned.
>>> import lxml.html
>>> content='<head></head>'
>>> etree_document = lxml.html.fromstring(content)
>>> print(tostring(etree_document, encoding=unicode))
<html><head></head></html>
Bug 690319: Leading whitespace before doctype declaration should not raise an error.
>>> import lxml.html
>>> content='''
... <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
... <html>
... </html>'''
>>> etree_document = lxml.html.fromstring(content)
>>> print(tostring(etree_document, encoding=unicode))
<html></html>

@@ -0,0 +1,80 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest
from lxml.etree import LIBXML_VERSION
import lxml.html
from lxml.html.clean import Cleaner, clean_html
class CleanerTest(unittest.TestCase):
def test_allow_tags(self):
html = """
<html>
<head>
</head>
<body>
<p>some text</p>
<table>
<tr>
<td>hello</td><td>world</td>
</tr>
<tr>
<td>hello</td><td>world</td>
</tr>
</table>
<img>
</body>
</html>
"""
html_root = lxml.html.document_fromstring(html)
cleaner = Cleaner(
remove_unknown_tags = False,
allow_tags = ['table', 'tr', 'td'])
result = cleaner.clean_html(html_root)
self.assertEqual(12-5+1, len(list(result.iter())))
def test_safe_attrs_included(self):
html = """<p><span style="color: #00ffff;">Cyan</span></p>"""
safe_attrs=set(lxml.html.defs.safe_attrs)
safe_attrs.add('style')
cleaner = Cleaner(
safe_attrs_only=True,
safe_attrs=safe_attrs)
result = cleaner.clean_html(html)
self.assertEqual(html, result)
def test_safe_attrs_excluded(self):
html = """<p><span style="color: #00ffff;">Cyan</span></p>"""
expected = """<p><span>Cyan</span></p>"""
safe_attrs=set()
cleaner = Cleaner(
safe_attrs_only=True,
safe_attrs=safe_attrs)
result = cleaner.clean_html(html)
self.assertEqual(expected, result)
def test_clean_invalid_root_tag(self):
# only testing that cleaning with invalid root tags works at all
s = lxml.html.fromstring('parent <invalid tag>child</another>')
self.assertEqual('parent child', clean_html(s).text_content())
s = lxml.html.fromstring('<invalid tag>child</another>')
self.assertEqual('child', clean_html(s).text_content())
def test_suite():
suite = unittest.TestSuite()
if sys.version_info >= (2,4):
suite.addTests([make_doctest('test_clean.txt')])
if LIBXML_VERSION >= (2,6,31):
suite.addTests([make_doctest('test_clean_embed.txt')])
suite.addTests(unittest.makeSuite(CleanerTest))
return suite

@@ -0,0 +1,161 @@
>>> from lxml.html import fromstring, tostring
>>> from lxml.html.clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest
>>> doc = '''<html>
... <head>
... <script type="text/javascript" src="evil-site"></script>
... <link rel="alternate" type="text/rss" src="evil-rss">
... <link rel="alternate" type="text/rss" href="http://example.com">
... <link rel="stylesheet" type="text/rss" href="http://example.com">
... <style>
... body {background-image: url(javascript:do_evil)};
... div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
... div {color: expression(evil)};
... </style>
... </head>
... <body onload="evil_function()">
... <!-- I am interpreted for EVIL! -->
... <a href="javascript:evil_function()">a link</a>
... <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
... <a href="#" onclick="evil_function()">another link</a>
... <p onclick="evil_function()">a paragraph</p>
... <div style="display: none">secret EVIL!</div>
... <object> of EVIL! </object>
... <iframe src="evil-site"></iframe>
... <form action="evil-site">
... Password: <input type="password" name="password">
... </form>
... <a href="evil-site">spam spam SPAM!</a>
... <a href="http://example.com" rel="author">Author</a>
... <a href="http://example.com" rel="nofollow">Text</a>
... <img src="evil!">
... </body>
... </html>'''
>>> print(doc)
<html>
<head>
<script type="text/javascript" src="evil-site"></script>
<link rel="alternate" type="text/rss" src="evil-rss">
<link rel="alternate" type="text/rss" href="http://example.com">
<link rel="stylesheet" type="text/rss" href="http://example.com">
<style>
body {background-image: url(javascript:do_evil)};
div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
div {color: expression(evil)};
</style>
</head>
<body onload="evil_function()">
<!-- I am interpreted for EVIL! -->
<a href="javascript:evil_function()">a link</a>
<a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
<a href="#" onclick="evil_function()">another link</a>
<p onclick="evil_function()">a paragraph</p>
<div style="display: none">secret EVIL!</div>
<object> of EVIL! </object>
<iframe src="evil-site"></iframe>
<form action="evil-site">
Password: <input type="password" name="password">
</form>
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>
>>> print(tostring(fromstring(doc)).decode("utf-8"))
<html>
<head>
<script type="text/javascript" src="evil-site"></script>
<link rel="alternate" type="text/rss" src="evil-rss">
<link rel="alternate" type="text/rss" href="http://example.com">
<link rel="stylesheet" type="text/rss" href="http://example.com">
<style>
body {background-image: url(javascript:do_evil)};
div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
div {color: expression(evil)};
</style>
</head>
<body onload="evil_function()">
<!-- I am interpreted for EVIL! -->
<a href="javascript:evil_function()">a link</a>
<a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
<a href="#" onclick="evil_function()">another link</a>
<p onclick="evil_function()">a paragraph</p>
<div style="display: none">secret EVIL!</div>
<object> of EVIL! </object>
<iframe src="evil-site"></iframe>
<form action="evil-site">
Password: <input type="password" name="password">
</form>
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>
>>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
<head>
<style>/* deleted */</style>
</head>
<body>
<a href="">a link</a>
<a href="">data</a>
<a href="#">another link</a>
<p>a paragraph</p>
<div style="display: none">secret EVIL!</div>
of EVIL!
Password:
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>
>>> print(Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
<head>
</head>
<body>
<a href="">a link</a>
<a href="">data</a>
<a href="#">another link</a>
<p>a paragraph</p>
<div>secret EVIL!</div>
of EVIL!
Password:
<a href="evil-site" rel="nofollow">spam spam SPAM!</a>
<a href="http://example.com" rel="author nofollow">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>
>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
<html>
<head>
<link rel="alternate" type="text/rss" src="evil-rss">
<link rel="alternate" type="text/rss" href="http://example.com">
<link rel="stylesheet" type="text/rss" href="http://example.com">
<style>/* deleted */</style>
</head>
<body>
<a href="">a link</a>
<a href="">data</a>
<a href="#">another link</a>
<p>a paragraph</p>
<div>secret EVIL!</div>
of EVIL!
Password:
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>

@@ -0,0 +1,39 @@
THIS FAILS IN libxml2 2.6.29 AND 2.6.30 !!
>>> from lxml.html import fromstring, tostring
>>> from lxml.html.clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest
>>> def tostring(el): # work-around for Py3 'bytes' type
... from lxml.html import tostring
... s = tostring(el)
... if not isinstance(s, str):
... s = s.decode('UTF-8')
... return s
>>> doc_embed = '''<div>
... <embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
... <embed src="http://anothersite.com/v/another"></embed>
... <script src="http://www.youtube.com/example.js"></script>
... <script src="/something-else.js"></script>
... </div>'''
>>> print(tostring(fromstring(doc_embed)))
<div>
<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
<embed src="http://anothersite.com/v/another"></embed>
<script src="http://www.youtube.com/example.js"></script>
<script src="/something-else.js"></script>
</div>
>>> print(Cleaner().clean_html(doc_embed))
<div>
</div>
>>> print(Cleaner(host_whitelist=['www.youtube.com']).clean_html(doc_embed))
<div>
<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
</div>
>>> print(Cleaner(host_whitelist=['www.youtube.com'], whitelist_tags=None).clean_html(doc_embed))
<div>
<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
<script src="http://www.youtube.com/example.js"></script>
</div>

View file

@ -0,0 +1,14 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest, doctest
from lxml.html import diff
def test_suite():
suite = unittest.TestSuite()
if sys.version_info >= (2,4):
suite.addTests([make_doctest('test_diff.txt'),
doctest.DocTestSuite(diff)])
return suite
if __name__ == '__main__':
unittest.main()

@@ -0,0 +1,252 @@
lxml.html.diff does HTML comparisons. These are word-based comparisons.
First, a handy function for normalizing whitespace and doing word wrapping::
>>> import re, textwrap
>>> def pwrapped(text):
... text = re.sub(r'[ \n\t\r]+', ' ', text)
... text = textwrap.fill(text)
... print(text)
>>> def pdiff(text1, text2):
... pwrapped(htmldiff(text1, text2))
Example::
>>> from lxml.html.diff import htmldiff, html_annotate
>>> html1 = '<p>This is some test text with some changes and some same stuff</p>'
>>> html2 = '''<p>This is some test textual writing with some changed stuff
... and some same stuff</p>'''
>>> pdiff(html1, html2)
<p>This is some test <ins>textual writing with some changed stuff
</ins> <del>text with some changes</del> and some same stuff</p>
Style tags are largely ignored in terms of differences, though markup is not eliminated::
>>> html1 = '<p>Hi <i>you guys</i></p>'
>>> html2 = '<p>Hi <i>you</i> guys</p>'
>>> pdiff(html1, html2)
<p>Hi <i>you</i> guys</p>
>>> pdiff('text', '<p>text</p>')
<p>text</p>
>>> pdiff('<i>Hi guys</i> !!', '<i>Hi guy</i> !!')
<i>Hi <ins>guy</ins> <del>guys</del> </i> !!
>>> pdiff('H<i>i</i>', 'Hi')
<ins>Hi</ins> <del>H<i>i</i></del>
>>> pdiff('<i>A B</i> C', '<i>A</i> C')
<i>A <del>B</del> </i> C
>>> pdiff('<i>A B</i> C', '<i>B</i> C')
<i> <del>A</del> B</i> C
>>> pdiff('<p></p>', '<p></p>')
<p></p>
>>> pdiff('<p>Hi</p>', '<p>Bye</p>')
<p><ins>Bye</ins></p> <p><del>Hi</del></p>
>>> pdiff('<p>Hi Guy</p>', '<p>Bye Guy</p>')
<p> <ins>Bye</ins> <del>Hi</del> Guy</p>
>>> pdiff('<p>Hey there</p>', '')
<ins></ins> <p><del>Hey there</del></p>
Movement between paragraphs is ignored, as tag-based changes are generally ignored::
>>> pdiff('<p>Hello</p><p>World</p>', '<p>Hello World</p>')
<p>Hello World</p>
As a special case, changing the href of a link is displayed, and
images are treated like words:
>>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://google.com">search</a>')
<a href="http://google.com">search <ins> Link: http://google.com</ins>
<del> Link: http://yahoo.com</del> </a>
>>> pdiff('<p>Print this <img src="print.gif"></p>', '<p>Print this</p>')
<p>Print this <del><img src="print.gif"></del> </p>
>>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://yahoo.com">search</a>')
<a href="http://yahoo.com">search</a>
Images may sometimes not have 'src' attributes:
>>> pdiff('<img src="tease"> <img> test <img src="test">', '<img> test <img src="toast">')
<del><img src="tease"></del> <img> test <ins><img src="toast"></ins>
<del><img src="test"></del>
A test of empty elements:
>>> pdiff('some <br> text', 'some <br> test')
some <ins><br> test</ins> <del><br> text</del>
Whitespace is generally ignored when computing the diff but preserved in the output:
>>> print(htmldiff('<p> first\nsecond\nthird</p>', '<p> &#xA0; first\n second\nthird </p>'))
<p>first
second
third </p>
>>> print(htmldiff('<pre>first\nsecond\nthird</pre>', '<pre>first\nsecond\nthird</pre>'))
<pre>first
second
third</pre>
>>> print(htmldiff('<pre>first\nsecond</pre>', '<pre>first\nsecond\n third</pre>'))
<pre>first
second
<ins>third</ins> </pre>
The sixteen combinations::
First "insert start" (del start/middle/end/none):
>>> pdiff('<b>A B C</b>', '<b>D B C</b>')
<b> <ins>D</ins> <del>A</del> B C</b>
>>> pdiff('<b>A B C</b>', '<b>D A C</b>')
<b> <ins>D</ins> A <del>B</del> C</b>
>>> pdiff('<b>A B C</b>', '<b>D A B</b>')
<b> <ins>D</ins> A B <del>C</del> </b>
>>> pdiff('<b>A B C</b>', '<b>D A B C</b>')
<b> <ins>D</ins> A B C</b>
Next, "insert middle" (del start/middle/end/none):
>>> pdiff('<b>A B C</b>', '<b>D B C</b>')
<b> <ins>D</ins> <del>A</del> B C</b>
>>> pdiff('<b>A B C</b>', '<b>A D C</b>')
<b>A <ins>D</ins> <del>B</del> C</b>
>>> pdiff('<b>A B C</b>', '<b>A D B</b>')
<b>A <ins>D</ins> B <del>C</del> </b>
This one case hits the threshold of our insensitive matching:
>>> pdiff('<b>A B C</b>', '<b>A D B C</b>')
<b> <ins>A D</ins> <del>A</del> B C</b>
Then "insert end" (del start/middle/end/none):
>>> pdiff('<b>A B C</b>', '<b>B C D</b>')
<b> <del>A</del> B C <ins>D</ins> </b>
>>> pdiff('<b>A B C</b>', '<b>A C D</b>')
<b>A <del>B</del> C <ins>D</ins> </b>
>>> pdiff('<b>A B C</b>', '<b>A B D</b>')
<b>A B <ins>D</ins> <del>C</del> </b>
>>> pdiff('<b>A B C</b>', '<b>A B C D</b>')
<b>A B C <ins>D</ins> </b>
Then no insert (del start/middle/end):
>>> pdiff('<b>A B C</b>', '<b>B C</b>')
<b> <del>A</del> B C</b>
>>> pdiff('<b>A B C</b>', '<b>A C</b>')
<b>A <del>B</del> C</b>
>>> pdiff('<b>A B C</b>', '<b>A B</b>')
<b>A B <del>C</del> </b>
>>> pdiff('<b>A B</b> C', '<b>A B</b>')
<b>A B</b> <del>C</del>
>>> pdiff('<b>A B</b> <b>C</b>', '<b>A B</b>')
<b>A B</b> <del><b>C</b></del>
>>> pdiff('A <p><b>hey there</b> <i>how are you?</i></p>', 'A')
A <p><del><b>hey there</b> <i>how are you?</i></del></p>
Testing a larger document, to make sure no weird, unnecessary parallels
are found:
>>> pdiff('''
... <p>This is a test document with many words in it that goes on
... for a while and doesn't have anything to do with the next
... document that we match this against</p>''', '''
... <p>This is another document with few similarities to the preceding
... one, but enough that it may have overlap that could turn into
... a confusing series of deletes and inserts.
... </p>''')
<p><ins>This is another document with few similarities to the
preceding one, but enough that it may have overlap that could turn
into a confusing series of deletes and inserts. </ins></p>
<p><del>This is a test document with many words in it that goes on for
a while and doesn't have anything to do with the next document that we
match this against</del></p>
Annotation of content can also be done, where every bit of content is
marked up with information about where it came from.
First, some setup; note that html_annotate is called with a sequence
of documents, each paired with that document's annotation.  We'll
just use indexes, but you could use author or timestamp information.
>>> def markup(text, annotation):
... return '<span version="%s">%s</span>' % (annotation, text)
>>> def panno(*docs):
... pwrapped(html_annotate([(doc, index) for index, doc in enumerate(docs)],
... markup=markup))
Now, a sequence of documents:
>>> panno('Hello cruel world', 'Hi cruel world', 'Hi world')
<span version="1">Hi</span> <span version="0">world</span>
>>> panno('A similar document', 'A similar document',
... 'A similar document here')
<span version="0">A similar document</span> <span
version="2">here</span>
>>> panno('<p>P1 para</p><p>P2 para</p>', '<p>P1 para</p><p>P3 foo</p>')
<p><span version="0">P1 para</span></p><p><span version="1">P3
foo</span></p>
>>> panno('Hello<p>There World</p>','Hello<p>There Town</p>')
<span version="0">Hello</span><p><span version="0">There</span> <span
version="1">Town</span></p>
>>> panno('<p>Hello</p>There World','<p>Hello</p>There Town')
<p><span version="0">Hello</span></p><span version="0">There</span>
<span version="1">Town</span>
>>> panno('<p>Hello</p><p>There World</p>','<p>Hello</p><p>There Town</p>')
<p><span version="0">Hello</span></p><p><span version="0">There</span>
<span version="1">Town</span></p>
>>> panno('<p>Hi <img src="/foo"> You</p>',
... '<p>Hi You</p>',
... '<p>Hi You <img src="/bar"></p>')
<p><span version="0">Hi You</span> <span version="2"><img
src="/bar"></span></p>
>>> panno('<p><a href="/foo">Hey</a></p>',
... '<p><a href="/bar">Hey</a></p>')
<p><a href="/bar"><span version="0">Hey</span></a></p>
>>> panno('<p><a href="/foo">Hey You</a></p>',
... '<p><a href="/foo">Hey Guy</a></p>')
<p><a href="/foo"><span version="0">Hey</span> <span
version="1">Guy</span></a></p>
Internals
---------
Some utility functions::
>>> from lxml.html.diff import fixup_ins_del_tags, split_unbalanced, split_trailing_whitespace
>>> def pfixup(text):
... print(fixup_ins_del_tags(text).strip())
>>> pfixup('<ins><p>some text <b>and more text</b> and more</p></ins>')
<p><ins>some text <b>and more text</b> and more</ins></p>
>>> pfixup('<p><ins>Hi!</ins> you</p>')
<p><ins>Hi!</ins> you</p>
>>> pfixup('<div>Some text <ins>and <p>more text</p></ins> </div>')
<div>Some text <ins>and </ins><p><ins>more text</ins></p> </div>
>>> pfixup('''
... <ins><table><tr><td>One table</td><td>More stuff</td></tr></table></ins>''')
<table><tr><td><ins>One table</ins></td><td><ins>More stuff</ins></td></tr></table>
Testing split_unbalanced::
>>> split_unbalanced(['<a href="blah">', 'hey', '</a>'])
([], ['<a href="blah">', 'hey', '</a>'], [])
>>> split_unbalanced(['<a href="blah">', 'hey'])
(['<a href="blah">'], ['hey'], [])
>>> split_unbalanced(['Hey', '</i>', 'You', '</b>'])
([], ['Hey', 'You'], ['</i>', '</b>'])
>>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There', '</b>'])
([], ['So', 'Hi', '<b>', 'There', '</b>'], ['</i>'])
>>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There'])
(['<b>'], ['So', 'Hi', 'There'], ['</i>'])
Testing split_trailing_whitespace::
>>> split_trailing_whitespace('test\n\n')
('test', '\n\n')
>>> split_trailing_whitespace(' test\n ')
(' test', '\n ')
>>> split_trailing_whitespace('test')
('test', '')

View file

@ -0,0 +1,33 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest, HelperTestCase
try:
import BeautifulSoup
BS_INSTALLED = True
except ImportError:
BS_INSTALLED = False
if BS_INSTALLED:
class SoupParserTestCase(HelperTestCase):
from lxml.html import soupparser
def test_broken_attribute(self):
html = """\
<html><head></head><body>
<form><input type='text' disabled size='10'></form>
</body></html>
"""
root = self.soupparser.fromstring(html)
self.assertTrue(root.find('.//input').get('disabled') is not None)
def test_suite():
suite = unittest.TestSuite()
if BS_INSTALLED:
suite.addTests([unittest.makeSuite(SoupParserTestCase)])
if sys.version_info[0] < 3:
suite.addTests([make_doctest('../../../../doc/elementsoup.txt')])
return suite
if __name__ == '__main__':
unittest.main()

View file

@ -0,0 +1,98 @@
import sys
import os
import re
try:
from rfc822 import Message
except ImportError:
# Python 3
from email import message_from_file as Message
import unittest
from lxml.tests.common_imports import doctest
if sys.version_info >= (2,4):
from lxml.doctestcompare import LHTMLOutputChecker
from lxml.html.clean import clean, Cleaner
feed_dirs = [
os.path.join(os.path.dirname(__file__), 'feedparser-data'),
os.path.join(os.path.dirname(__file__), 'hackers-org-data'),
]
bar_re = re.compile(r"-----+")
class DummyInput:
def __init__(self, **kw):
for name, value in kw.items():
setattr(self, name, value)
class FeedTestCase(unittest.TestCase):
def __init__(self, filename):
self.filename = filename
unittest.TestCase.__init__(self)
def parse(self):
f = open(self.filename, 'r')
headers = Message(f)
c = f.read()
f.close()
if not c.strip():
c = headers.get_payload()
if not headers.keys():
raise Exception(
"File %s has no headers" % self.filename)
self.description = headers['Description']
self.expect = headers.get('Expect', '')
self.ignore = headers.get('Ignore')
self.options = [
o.strip() for o in headers.get('Options', '').split(',')
if o.strip()]
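# the '-----' bar separates the input document from the expected output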
parts = bar_re.split(c)
self.input = parts[0].rstrip() + '\n'
if parts[1:]:
self.expect = parts[1].rstrip() + '\n'
else:
self.expect = None
def runTest(self):
self.parse()
if self.ignore:
# We've marked this test to be ignored.
return
kw = {}
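# names from the 'Options' header become Cleaner() keywords;
# a leading '-' sets the flag to False instead of True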
for name in self.options:
if name.startswith('-'):
kw[name[1:]] = False
else:
kw[name] = True
if kw.get('clean', True):
transformed = Cleaner(**kw).clean_html(self.input)
else:
transformed = self.input
assert self.expect is not None, (
"No expected output in %s" % self.filename)
checker = LHTMLOutputChecker()
if not checker.check_output(self.expect, transformed, 0):
result = checker.output_difference(
DummyInput(want=self.expect), transformed, 0)
#result += '\noptions: %s %r' % (', '.join(self.options), kw)
#result += repr(transformed)
raise Exception("\n"+result)
def shortDescription(self):
return self.filename
def test_suite():
suite = unittest.TestSuite()
if sys.version_info >= (2,4):
for dir in feed_dirs:
for fn in os.listdir(dir):
fn = os.path.join(dir, fn)
if fn.endswith('.data'):
case = FeedTestCase(fn)
suite.addTests([case])
# This is my lazy way of stopping on first error:
try:
case.runTest()
except:
break
return suite

View file

@ -0,0 +1,8 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest
def test_suite():
suite = unittest.TestSuite()
if sys.version_info >= (2,4):
suite.addTests([make_doctest('test_formfill.txt')])
return suite

View file

@ -0,0 +1,112 @@
Some basic imports:
>>> from lxml.html import usedoctest
>>> from lxml.html.formfill import fill_form_html
The simplest kind of filling is just filling an input with a value:
>>> print(fill_form_html('''
... <form><input type="text" name="foo"></form>''', dict(foo='bar')))
<form><input type="text" name="foo" value="bar"></form>
You can also fill multiple inputs, like:
>>> print(fill_form_html('''
... <form>
... <input type="text" name="foo">
... <input type="text" name="foo">
... </form>''', dict(foo=['bar1', 'bar2'])))
<form>
<input type="text" name="foo" value="bar1">
<input type="text" name="foo" value="bar2">
</form>
Checkboxes can work either as boolean true/false, or be selected based
on their inclusion in a set of values::
>>> print(fill_form_html('''
... <form>
... Would you like to be spammed?
... <input type="checkbox" name="spam_me"> <br>
... Spam you'd like to receive:<br>
... Viagra spam:
... <input type="checkbox" name="type" value="viagra"><br>
... Stock spam:
... <input type="checkbox" name="type" value="stock"><br>
... Other spam:
... <input type="checkbox" name="type" value="other"><br>
... <input type="submit" value="Spam!">
... </form>''', dict(spam_me=True, type=['viagra', 'other'])))
<form>
Would you like to be spammed?
<input type="checkbox" name="spam_me" checked> <br>
Spam you'd like to receive:<br>
Viagra spam:
<input type="checkbox" name="type" value="viagra" checked><br>
Stock spam:
<input type="checkbox" name="type" value="stock"><br>
Other spam:
<input type="checkbox" name="type" value="other" checked><br>
<input type="submit" value="Spam!">
</form>
FIXME: I need to test more of this. But I'm lazy and want to use the
coverage report for some of this.
This module also allows you to add error messages to the form. The errors
add an "error" class to the input fields, and any labels if the field
has a label. It also inserts an error message into the form, using a
function you can provide (or the default function).
Example::
>>> from lxml.html.formfill import insert_errors_html
>>> print(insert_errors_html('''
... <form>
... <fieldset id="fieldset">
... <input name="v1"><br>
... <label for="v2">label</label>
... <input name="v2" id="v2"><br>
... </fieldset>
... <input name="v3" class="foo">
... <input name="v3" class="foo">
... <input name="v4">
... <input name="v4">
... </form>''', {
... 'v1': "err1",
... 'v2': "err2",
... 'v3': [None, "err3-2"],
... 'v4': "err4",
... None: 'general error',
... '#fieldset': 'area error',
... }))
<form>
<div class="error-message error-block">general error</div>
<fieldset id="fieldset" class="error">
<div class="error-message error-block">area error</div>
<div class="error-message">err1</div>
<input name="v1" class="error"><br>
<label for="v2" class="error">label</label>
<div class="error-message">err2</div>
<input name="v2" id="v2" class="error"><br>
</fieldset>
<input name="v3" class="foo">
<div class="error-message">err3-2</div>
<input name="v3" class="foo error">
<div class="error-message">err4</div>
<input name="v4" class="error">
<input name="v4">
</form>
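The message markup itself can be customized.  A minimal sketch, assuming
the ``error_creator`` keyword that ``insert_errors_html`` passes through
(the callback receives the element, a block/inline flag, and the message;
``strong_error_creator`` is our own illustrative helper)::

    from lxml.html.formfill import insert_errors_html

    def strong_error_creator(el, is_block, message):
        # insert a <strong> carrying the message just before the bad field
        err = el.makeelement('strong', {'class': 'error-message'})
        err.text = message
        el.addprevious(err)

    print(insert_errors_html(
        '<form><input name="v1"></form>', {'v1': 'required'},
        error_creator=strong_error_creator))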
REGRESSION: When filling textareas, the "name" attribute used to
be removed. The "name" attribute should be kept::
>>> print(fill_form_html('''
... <form>
... <textarea name="foo">Initial value</textarea>
... </form>''', dict(foo="Bar")))
<form>
<textarea name="foo">Bar</textarea>
</form>

View file

@ -0,0 +1,11 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest
def test_suite():
suite = unittest.TestSuite()
if sys.version_info >= (2,4):
suite.addTests([make_doctest('test_forms.txt')])
return suite
if __name__ == '__main__':
unittest.main()

View file

@ -0,0 +1,195 @@
>>> from lxml.html import usedoctest
>>> from lxml.html import fromstring, tostring
>>> h = fromstring('''<html><body>
... <form action="test">
... <input type="hidden" name="hidden_field" value="hidden_value">
... <input type="text" name="text_field" value="text_value">
... <input type="checkbox" name="single_checkbox">
... <input type="checkbox" name="single_checkbox2" value="good">
... <input type="checkbox" name="check_group" value="1">
... <input type="checkbox" name="check_group" value="2" checked>
... <input type="checkbox" name="check_group" value="3" checked>
... <input type="checkbox" name="check_group" value="4">
... <textarea name="textarea_field">some text</textarea>
... <label for="value1">value 1</label>
... <input type="radio" name="radios" value="value1" id="value1">
... <label for="value2">value 2</label>
... <input type="radio" name="radios" value="value2" id="value2">
... <label for="value3">value 3</label>
... <input type="radio" name="radios" value="value3" id="value3" checked>
... <select name="select1">
... <option> No value </option>
... <option value="">Empty</option>
... <option value="1">number 1</option>
... </select>
... <select name="select2" multiple>
... <option value="1">number 1</option>
... <option value="2">number 2</option>
... <option value="3">number 3</option>
... <option>number 4</option>
... </select>
... <input type="submit" name="submit1" value="submit">
... <input type="submit" name="submit2" value="submit">
... <input type="reset" name="reset1">linksys
... </form>
... </body></html>''', base_url='http://example.org/form.html')
>>> h.base_url
u'http://example.org/form.html'
>>> f = h.forms[0]
>>> f.action
u'http://example.org/test'
>>> f.method
'GET'
>>> f.inputs # doctest:+NOPARSE_MARKUP
<InputGetter for form 0>
>>> hidden = f.inputs['hidden_field']
>>> hidden.checkable
False
>>> hidden.value
'hidden_value'
>>> hidden.value = 'new value'
>>> tostring(hidden, with_tail=False)
b'<input type="hidden" name="hidden_field" value="new value">'
>>> checkbox = f.inputs['single_checkbox']
>>> checkbox.checkable
True
>>> checkbox.type
'checkbox'
>>> checkbox.checked
False
>>> print(checkbox.value)
None
>>> checkbox.checked = True
>>> checkbox.value
'on'
>>> tostring(checkbox, with_tail=False)
b'<input type="checkbox" name="single_checkbox" checked>'
>>> checkbox2 = f.inputs['single_checkbox2']
>>> checkbox2.checked = True
>>> checkbox2.value
'good'
>>> group = f.inputs['check_group']
>>> group.value # doctest:+NOPARSE_MARKUP
<CheckboxValues {'2', '3'} for checkboxes name='check_group'>
>>> group.value.add('1')
>>> group.value # doctest:+NOPARSE_MARKUP
<CheckboxValues {'1', '2', '3'} for checkboxes name='check_group'>
>>> tostring(group[0], with_tail=False)
b'<input type="checkbox" name="check_group" value="1" checked>'
>>> group.value_options
['1', '2', '3', '4']
>>> group.value.add('doesnotexist')
Traceback (most recent call last):
...
KeyError: "No checkbox with value 'doesnotexist'"
>>> textarea = f.inputs['textarea_field']
>>> textarea.value
'some text'
>>> radios = f.inputs['radios']
>>> radios[0].label.text
'value 1'
>>> radios.value
'value3'
>>> radios.value = 'value1'
>>> radios.value
'value1'
>>> tostring(radios[0], with_tail=False)
b'<input type="radio" name="radios" value="value1" id="value1" checked>'
>>> radios.value = None
>>> tostring(radios[0], with_tail=False)
b'<input type="radio" name="radios" value="value1" id="value1">'
>>> radios.value_options
['value1', 'value2', 'value3']
>>> select = f.inputs['select1']
>>> print(select.value)
None
>>> select.value = ""
>>> select.value
''
>>> select.value = 'asdf'
Traceback (most recent call last):
...
ValueError: There is no option with the value of 'asdf'
>>> select.value_options
['No value', '', '1']
>>> select.value = 'No value'
>>> select.value
'No value'
>>> select = f.inputs['select2']
>>> select.value # doctest:+NOPARSE_MARKUP
<MultipleSelectOptions {} for select name='select2'>
>>> select.value.update(['2', '3'])
>>> select.value # doctest:+NOPARSE_MARKUP
<MultipleSelectOptions {'2', '3'} for select name='select2'>
>>> select.value.remove('3')
>>> select.value.add('asdf')
Traceback (most recent call last):
...
ValueError: There is no option with the value 'asdf'
>>> select.value.add('number 4')
>>> select.value # doctest:+NOPARSE_MARKUP
<MultipleSelectOptions {'2', 'number 4'} for select name='select2'>
>>> select.value.remove('number 4')
>>> select.value_options
['1', '2', '3', 'number 4']
>>> try: from urllib import urlencode
... except ImportError: from urllib.parse import urlencode
>>> print(urlencode(f.form_values()))
hidden_field=new+value&text_field=text_value&single_checkbox=on&single_checkbox2=good&check_group=1&check_group=2&check_group=3&textarea_field=some+text&select1=No+value&select2=2
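Those pairs are exactly what a submission would send.  As a plain
(non-doctest) sketch, ``lxml.html.submit_form()`` accepts an
``open_http`` callback that receives the method, URL and value pairs,
so the request can be inspected instead of sent; ``fake_open_http`` is
just an illustrative stand-in::

    from lxml.html import submit_form

    def fake_open_http(method, url, values):
        # report what would have been submitted, without any network access
        print(method, url)
        return values

    result = submit_form(f, open_http=fake_open_http)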
>>> fields = f.fields
>>> fields # doctest:+NOPARSE_MARKUP
<FieldsDict for form 0>
>>> for name, value in sorted(fields.items()):
... print('%s: %r' % (name, value))
check_group: <CheckboxValues {'1', '2', '3'} for checkboxes name='check_group'>
hidden_field: 'new value'
radios: None
reset1: None
select1: 'No value'
select2: <MultipleSelectOptions {'2'} for select name='select2'>
single_checkbox: 'on'
single_checkbox2: 'good'
submit1: 'submit'
submit2: 'submit'
text_field: 'text_value'
textarea_field: 'some text'
>>> import lxml.html
>>> tree = lxml.html.fromstring('''
... <html><body>
... <form>
... <input name="foo" value="bar"/>
... <input type="submit" />
... </form>
... </body></html>
... ''')
>>> tree # doctest: +ELLIPSIS
<Element html at ...>
>>> tree.forms[0] # doctest: +ELLIPSIS
<Element form at ...>
>>> tree.forms[0].fields # doctest: +NOPARSE_MARKUP
<FieldsDict for form 0>
>>> list(tree.forms[0].fields.keys())
['foo']
>>> list(tree.forms[0].fields.items())
[('foo', 'bar')]
>>> list(tree.forms[0].fields.values())
['bar']
>>> tree = lxml.html.fromstring('''
... <html><body>
... <form>
... <textarea name="foo">some <b>text<br>content</b> with tags</textarea>
... </form>
... </body></html>
... ''')
>>> list(tree.forms[0].fields.keys())
['foo']
>>> ta = tree.forms[0].inputs['foo']
>>> print(ta.value)
some <b>text<br>content</b> with tags
>>> ta.value = 'abc<br>def'
>>> print(ta.value)
abc<br>def
>>> len(ta)
0

View file

@ -0,0 +1,36 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest, doctest
import lxml.html
from lxml.html import html_parser, XHTML_NAMESPACE
class FrameTest(unittest.TestCase):
def test_parse_fragments_fromstring(self):
parser = lxml.html.HTMLParser(encoding='utf-8', remove_comments=True)
html = """<frameset>
<frame src="main.php" name="srcpg" id="srcpg" frameborder="0" rolling="Auto" marginwidth="" marginheight="0">
</frameset>"""
etree_document = lxml.html.fragments_fromstring(html, parser=parser)
self.assertEqual(len(etree_document), 1)
root = etree_document[0]
self.assertEqual(root.tag, "frameset")
frame_element = root[0]
self.assertEqual(frame_element.tag, 'frame')
def test_parse_fromstring(self):
parser = lxml.html.HTMLParser(encoding='utf-8', remove_comments=True)
html = """<html><frameset>
<frame src="main.php" name="srcpg" id="srcpg" frameborder="0" rolling="Auto" marginwidth="" marginheight="0">
</frameset></html>"""
etree_document = lxml.html.fromstring(html, parser=parser)
self.assertEqual(etree_document.tag, 'html')
self.assertEqual(len(etree_document), 1)
frameset_element = etree_document[0]
self.assertEqual(len(frameset_element), 1)
frame_element = frameset_element[0]
self.assertEqual(frame_element.tag, 'frame')
def test_suite():
loader = unittest.TestLoader()
return loader.loadTestsFromModule(sys.modules[__name__])

View file

@ -0,0 +1,429 @@
import os
import imp
try:
from StringIO import StringIO
except ImportError: # python 3
from io import StringIO
import sys
import tempfile
import unittest
try:
from unittest import skipUnless
except ImportError:
# sys.version < (2, 7)
def skipUnless(condition, reason):
return lambda f: condition and f or None
if sys.version_info < (2,6):
class NamedTemporaryFile(object):
def __init__(self, delete=True, **kwargs):
self._tmpfile = tempfile.NamedTemporaryFile(**kwargs)
def close(self):
self._tmpfile.flush()
def __getattr__(self, name):
return getattr(self._tmpfile, name)
else:
NamedTemporaryFile = tempfile.NamedTemporaryFile
from lxml.builder import ElementMaker
from lxml.etree import Element, ElementTree, ParserError
from lxml.html import html_parser, XHTML_NAMESPACE
try:
import urlparse
except ImportError:
import urllib.parse as urlparse
try:
from urllib import pathname2url
except ImportError:
from urllib.request import pathname2url
def path2url(path):
return urlparse.urljoin(
'file:', pathname2url(path))
try:
import html5lib
except ImportError:
html5lib = None
class BogusModules(object):
# See PEP 302 for details on how this works
def __init__(self, mocks):
self.mocks = mocks
def find_module(self, fullname, path=None):
if fullname in self.mocks:
return self
return None
def load_module(self, fullname):
mod = sys.modules.setdefault(fullname, imp.new_module(fullname))
mod.__file__, mod.__loader__, mod.__path__ = "<dummy>", self, []
mod.__dict__.update(self.mocks[fullname])
return mod
# Fake just enough of html5lib so that html5parser.py is importable
# without errors.
sys.meta_path.append(BogusModules({
'html5lib': {
# A do-nothing HTMLParser class
'HTMLParser': type('HTMLParser', (object,), {
'__init__': lambda self, **kw: None,
}),
},
'html5lib.treebuilders': {
},
'html5lib.treebuilders.etree_lxml': {
'TreeBuilder': 'dummy treebuilder',
},
}))
class Test_HTMLParser(unittest.TestCase):
def make_one(self, **kwargs):
from lxml.html.html5parser import HTMLParser
return HTMLParser(**kwargs)
@skipUnless(html5lib, 'html5lib is not installed')
def test_integration(self):
parser = self.make_one(strict=True)
tree = parser.parse(XHTML_TEST_DOCUMENT)
root = tree.getroot()
self.assertEqual(root.tag, xhtml_tag('html'))
class Test_XHTMLParser(unittest.TestCase):
def make_one(self, **kwargs):
from lxml.html.html5parser import XHTMLParser
return XHTMLParser(**kwargs)
@skipUnless(hasattr(html5lib, 'XHTMLParser'),
'html5lib does not have XHTMLParser')
def test_integration(self):
# XXX: This test is untested. (html5lib no longer has an XHTMLParser)
parser = self.make_one(strict=True)
tree = parser.parse(XHTML_TEST_DOCUMENT)
root = tree.getroot()
self.assertEqual(root.tag, xhtml_tag('html'))
class Test_document_fromstring(unittest.TestCase):
def call_it(self, *args, **kwargs):
from lxml.html.html5parser import document_fromstring
return document_fromstring(*args, **kwargs)
def test_basic(self):
parser = DummyParser(doc=DummyElementTree(root='dummy root'))
elem = self.call_it('dummy input', parser=parser)
self.assertEqual(elem, 'dummy root')
self.assertEqual(parser.parse_args, ('dummy input',))
self.assertEqual(parser.parse_kwargs, {'useChardet': True})
def test_guess_charset_arg_gets_passed_to_parser(self):
parser = DummyParser()
elem = self.call_it('', guess_charset='gc_arg', parser=parser)
self.assertEqual(parser.parse_kwargs, {'useChardet': 'gc_arg'})
def test_raises_type_error_on_nonstring_input(self):
not_a_string = None
self.assertRaises(TypeError, self.call_it, not_a_string)
@skipUnless(html5lib, 'html5lib is not installed')
def test_integration(self):
elem = self.call_it(XHTML_TEST_DOCUMENT)
self.assertEqual(elem.tag, xhtml_tag('html'))
class Test_fragments_fromstring(unittest.TestCase):
def call_it(self, *args, **kwargs):
from lxml.html.html5parser import fragments_fromstring
return fragments_fromstring(*args, **kwargs)
def test_basic(self):
parser = DummyParser(fragments='fragments')
fragments = self.call_it('dummy input', parser=parser)
self.assertEqual(fragments, 'fragments')
def test_guess_charset_arg_gets_passed_to_parser(self):
parser = DummyParser()
elem = self.call_it('', guess_charset='gc_arg', parser=parser)
self.assertEqual(parser.parseFragment_kwargs, {'useChardet': 'gc_arg'})
def test_raises_type_error_on_nonstring_input(self):
not_a_string = None
self.assertRaises(TypeError, self.call_it, not_a_string)
def test_no_leading_text_strips_empty_leading_text(self):
parser = DummyParser(fragments=['', 'tail'])
fragments = self.call_it('', parser=parser, no_leading_text=True)
self.assertEqual(fragments, ['tail'])
def test_no_leading_text_raises_error_if_leading_text(self):
parser = DummyParser(fragments=['leading text', 'tail'])
self.assertRaises(ParserError, self.call_it,
'', parser=parser, no_leading_text=True)
@skipUnless(html5lib, 'html5lib is not installed')
def test_integration(self):
fragments = self.call_it('a<b>c</b>')
self.assertEqual(len(fragments), 2)
self.assertEqual(fragments[0], 'a')
self.assertEqual(fragments[1].tag, xhtml_tag('b'))
class Test_fragment_fromstring(unittest.TestCase):
def call_it(self, *args, **kwargs):
from lxml.html.html5parser import fragment_fromstring
return fragment_fromstring(*args, **kwargs)
def test_basic(self):
element = DummyElement()
parser = DummyParser(fragments=[element])
self.assertEqual(self.call_it('html', parser=parser), element)
def test_raises_type_error_on_nonstring_input(self):
not_a_string = None
self.assertRaises(TypeError, self.call_it, not_a_string)
def test_create_parent(self):
parser = DummyParser(fragments=['head', Element('child')])
elem = self.call_it('html', parser=parser, create_parent='parent')
self.assertEqual(elem.tag, 'parent')
self.assertEqual(elem.text, 'head')
self.assertEqual(elem[0].tag, 'child')
def test_create_parent_default_type_no_ns(self):
parser = DummyParser(fragments=[], namespaceHTMLElements=False)
elem = self.call_it('html', parser=parser, create_parent=True)
self.assertEqual(elem.tag, 'div')
def test_raises_error_on_leading_text(self):
parser = DummyParser(fragments=['leading text'])
self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
def test_raises_error_if_no_elements_found(self):
parser = DummyParser(fragments=[])
self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
def test_raises_error_if_multiple_elements_found(self):
parser = DummyParser(fragments=[DummyElement(), DummyElement()])
self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
def test_raises_error_if_tail(self):
parser = DummyParser(fragments=[DummyElement(tail='tail')])
self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
class Test_fromstring(unittest.TestCase):
def call_it(self, *args, **kwargs):
from lxml.html.html5parser import fromstring
return fromstring(*args, **kwargs)
def test_returns_whole_doc_if_input_contains_html_tag(self):
parser = DummyParser(root='the doc')
self.assertEqual(self.call_it('<html></html>', parser=parser),
'the doc')
def test_returns_whole_doc_if_input_contains_doctype(self):
parser = DummyParser(root='the doc')
self.assertEqual(self.call_it('<!DOCTYPE html>', parser=parser),
'the doc')
def test_returns_whole_doc_if_head_not_empty(self, use_ns=True):
E = HTMLElementMaker(namespaceHTMLElements=use_ns)
root = E.html(E.head(E.title()))
parser = DummyParser(root=root)
self.assertEqual(self.call_it('', parser=parser), root)
def test_returns_whole_doc_if_head_not_empty_no_ns(self):
self.test_returns_whole_doc_if_head_not_empty(use_ns=False)
def test_returns_unwraps_body_if_single_element(self):
E = HTMLElementMaker()
elem = E.p('test')
root = E.html(E.head(), E.body(elem))
parser = DummyParser(root=root)
self.assertEqual(self.call_it('', parser=parser), elem)
def test_returns_body_if_has_text(self):
E = HTMLElementMaker()
elem = E.p('test')
body = E.body('text', elem)
root = E.html(E.head(), body)
parser = DummyParser(root=root)
self.assertEqual(self.call_it('', parser=parser), body)
def test_returns_body_if_single_element_has_tail(self):
E = HTMLElementMaker()
elem = E.p('test')
elem.tail = 'tail'
body = E.body(elem)
root = E.html(E.head(), body)
parser = DummyParser(root=root)
self.assertEqual(self.call_it('', parser=parser), body)
def test_wraps_multiple_fragments_in_div_no_ns(self):
E = HTMLElementMaker(namespaceHTMLElements=False)
parser = DummyParser(root=E.html(E.head(), E.body(E.h1(), E.p())),
namespaceHTMLElements=False)
elem = self.call_it('', parser=parser)
self.assertEqual(elem.tag, 'div')
def test_wraps_multiple_fragments_in_span_no_ns(self):
E = HTMLElementMaker(namespaceHTMLElements=False)
parser = DummyParser(root=E.html(E.head(), E.body('foo', E.a('link'))),
namespaceHTMLElements=False)
elem = self.call_it('', parser=parser)
self.assertEqual(elem.tag, 'span')
def test_raises_type_error_on_nonstring_input(self):
not_a_string = None
self.assertRaises(TypeError, self.call_it, not_a_string)
@skipUnless(html5lib, 'html5lib is not installed')
def test_integration_whole_doc(self):
elem = self.call_it(XHTML_TEST_DOCUMENT)
self.assertEqual(elem.tag, xhtml_tag('html'))
@skipUnless(html5lib, 'html5lib is not installed')
def test_integration_single_fragment(self):
elem = self.call_it('<p></p>')
self.assertEqual(elem.tag, xhtml_tag('p'))
class Test_parse(unittest.TestCase):
def call_it(self, *args, **kwargs):
from lxml.html.html5parser import parse
return parse(*args, **kwargs)
def make_temp_file(self, contents=''):
tmpfile = NamedTemporaryFile(delete=False)
try:
tmpfile.write(contents.encode('utf8'))
tmpfile.flush()
tmpfile.seek(0)
return tmpfile
except Exception:
try:
tmpfile.close()
finally:
os.unlink(tmpfile.name)
raise
def test_with_file_object(self):
parser = DummyParser(doc='the doc')
fp = open(__file__)
try:
self.assertEqual(self.call_it(fp, parser=parser), 'the doc')
self.assertEqual(parser.parse_args, (fp,))
finally:
fp.close()
def test_with_file_name(self):
parser = DummyParser(doc='the doc')
tmpfile = self.make_temp_file('data')
try:
data = tmpfile.read()
finally:
tmpfile.close()
try:
self.assertEqual(self.call_it(tmpfile.name, parser=parser), 'the doc')
fp, = parser.parse_args
try:
self.assertEqual(fp.read(), data)
finally:
fp.close()
finally:
os.unlink(tmpfile.name)
def test_with_url(self):
parser = DummyParser(doc='the doc')
tmpfile = self.make_temp_file('content')
try:
data = tmpfile.read()
finally:
tmpfile.close()
try:
url = path2url(tmpfile.name)
self.assertEqual(self.call_it(url, parser=parser), 'the doc')
fp, = parser.parse_args
try:
self.assertEqual(fp.read(), data)
finally:
fp.close()
finally:
os.unlink(tmpfile.name)
@skipUnless(html5lib, 'html5lib is not installed')
def test_integration(self):
doc = self.call_it(StringIO(XHTML_TEST_DOCUMENT))
root = doc.getroot()
self.assertEqual(root.tag, xhtml_tag('html'))
def test_suite():
loader = unittest.TestLoader()
return loader.loadTestsFromModule(sys.modules[__name__])
class HTMLElementMaker(ElementMaker):
def __init__(self, namespaceHTMLElements=True):
initargs = dict(makeelement=html_parser.makeelement)
if namespaceHTMLElements:
initargs.update(namespace=XHTML_NAMESPACE,
nsmap={None: XHTML_NAMESPACE})
ElementMaker.__init__(self, **initargs)
class DummyParser(object):
def __init__(self, doc=None, root=None,
fragments=None, namespaceHTMLElements=True):
self.doc = doc or DummyElementTree(root=root)
self.fragments = fragments
self.tree = DummyTreeBuilder(namespaceHTMLElements)
def parse(self, *args, **kwargs):
self.parse_args = args
self.parse_kwargs = kwargs
return self.doc
def parseFragment(self, *args, **kwargs):
self.parseFragment_args = args
self.parseFragment_kwargs = kwargs
return self.fragments
class DummyTreeBuilder(object):
def __init__(self, namespaceHTMLElements=True):
self.namespaceHTMLElements = namespaceHTMLElements
class DummyElementTree(object):
def __init__(self, root):
self.root = root
def getroot(self):
return self.root
class DummyElement(object):
def __init__(self, tag='tag', tail=None):
self.tag = tag
self.tail = tail
def xhtml_tag(tag):
return '{%s}%s' % (XHTML_NAMESPACE, tag)
XHTML_TEST_DOCUMENT = '''
<!DOCTYPE html>
<html>
<head><title>TITLE</title></head>
<body></body>
</html>
'''

View file

@ -0,0 +1,11 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest
def test_suite():
suite = unittest.TestSuite()
if sys.version_info >= (2,4):
suite.addTests([make_doctest('test_rewritelinks.txt')])
return suite
if __name__ == '__main__':
unittest.main()

View file

@ -0,0 +1,245 @@
Setup::
>>> import lxml.html
We'll define a link translation function:
>>> base_href = 'http://old/base/path.html'
>>> try: import urlparse
... except ImportError: import urllib.parse as urlparse
>>> def relocate_href(link):
... link = urlparse.urljoin(base_href, link)
... if link.startswith('http://old'):
... return 'https://new' + link[len('http://old'):]
... else:
... return link
Now for content.  To make it easier on us, we let lxml's doctest
comparison normalize the HTML we get from these functions.
Some basics::
>>> from lxml.html import usedoctest, tostring
>>> from lxml.html import rewrite_links
>>> print(rewrite_links(
... '<a href="http://old/blah/blah.html">link</a>', relocate_href))
<a href="https://new/blah/blah.html">link</a>
>>> print(rewrite_links(
... '<script src="http://old/foo.js"></script>', relocate_href))
<script src="https://new/foo.js"></script>
>>> print(rewrite_links(
... '<link href="foo.css">', relocate_href))
<link href="https://new/base/foo.css">
>>> print(rewrite_links('''\
... <base href="http://blah/stuff/index.html">
... <link href="foo.css">
... <a href="http://old/bar.html">x</a>\
... ''', relocate_href))
<link href="http://blah/stuff/foo.css">
<a href="https://new/bar.html">x</a>
Links in CSS are also handled::
>>> print(rewrite_links('''
... <style>
... body {background-image: url(http://old/image.gif)};
... @import "http://old/other-style.css";
... </style>''', relocate_href))
<html><head><style>
body {background-image: url(https://new/image.gif)};
@import "https://new/other-style.css";
</style></head></html>
>>> print(rewrite_links('''
... <style>
... body {background-image: url("http://old/image.gif")};
... @import "http://old/other-style.css";
... </style>''', relocate_href))
<html><head><style>
body {background-image: url("https://new/image.gif")};
@import "https://new/other-style.css";
</style></head></html>
Those links in style attributes are also rewritten::
>>> print(rewrite_links('''
... <div style="background-image: url(http://old/image.gif)">text</div>
... ''', relocate_href))
<div style="background-image: url(https://new/image.gif)">text</div>
The ``<base href>`` tag is also respected (but removed in the process)::
>>> print(rewrite_links('''
... <html><head>
... <base href="http://old/">
... </head>
... <body>
... <a href="foo.html">link</a>
... </body></html>''', relocate_href))
<html>
<head></head>
<body>
<a href="https://new/foo.html">link</a>
</body>
</html>
The ``iterlinks`` method (and function) gives you all the links in
the document, along with the element and attribute the link comes
from. This makes it fairly easy to see what resources the document
references or embeds (an ``<a>`` tag is a reference, an ``<img>`` tag
is something embedded).  It returns a generator of ``(element, attrib,
link, pos)`` tuples, which is awkward to test here, so we'll make a printer::
>>> from lxml.html import iterlinks, document_fromstring, tostring
>>> def print_iter(seq):
... for element, attrib, link, pos in seq:
... if pos:
... extra = '@%s' % pos
... else:
... extra = ''
... print('%s %s="%s"%s' % (element.tag, attrib, link, extra))
>>> print_iter(iterlinks('''
... <html>
... <head>
... <link rel="stylesheet" href="style.css">
... <style type="text/css">
... body {
... background-image: url(/bg.gif);
... }
... @import "/other-styles.css";
... </style>
... <script src="/js-funcs.js"></script>
... </head>
... <body>
... <table>
... <tr><td><ul>
... <li><a href="/test.html">Test stuff</a></li>
... <li><a href="/other.html">Other stuff</a></li>
... </td></tr>
... <td style="background-image: url(/td-bg.png)">
... <img src="/logo.gif">
... Hi world!
... </td></tr>
... </table>
... </body></html>'''))
link href="style.css"
style None="/other-styles.css"@69
style None="/bg.gif"@40
script src="/js-funcs.js"
a href="/test.html"
a href="/other.html"
td style="/td-bg.png"@22
img src="/logo.gif"
An application of ``iterlinks()`` is ``make_links_absolute()``::
>>> from lxml.html import make_links_absolute
>>> print(make_links_absolute('''
... <html>
... <head>
... <link rel="stylesheet" href="style.css">
... <style type="text/css">
... body {
... background-image: url(/bg.gif);
... }
... @import "/other-styles.css";
... </style>
... <script src="/js-funcs.js"></script>
... </head>
... <body>
... <table>
... <tr><td><ul>
... <li><a href=" /test.html">Test stuff</a></li>
... <li><a href="/other.html ">Other stuff</a></li>
... </td></tr>
... <tr><td style="background-image: url( /td-bg.png )">
... <img src="logo.gif">
... Hi world!
... </td></tr>
... </table>
... </body></html>''',
... base_url="http://my.little.server/url/"))
<html>
<head>
<link rel="stylesheet" href="http://my.little.server/url/style.css">
<style type="text/css">
body {
background-image: url(http://my.little.server/bg.gif);
}
@import "http://my.little.server/other-styles.css";
</style>
<script src="http://my.little.server/js-funcs.js"></script>
</head>
<body>
<table>
<tr><td><ul>
<li><a href="http://my.little.server/test.html">Test stuff</a></li>
<li><a href="http://my.little.server/other.html">Other stuff</a></li>
</ul></td></tr>
<tr>
<td style="background-image: url(http://my.little.server/td-bg.png)">
<img src="http://my.little.server/url/logo.gif">
Hi world!
</td></tr>
</table>
</body>
</html>
### Test disabled to support Py2.6 and earlier
#If the document contains invalid links, you may choose to "discard" or "ignore"
#them by passing the respective option into the ``handle_failures`` argument::
#
# >>> html = lxml.html.fromstring ('''\
# ... <html><body><div>
# ... <a href="http://fancybase.com]Buy">test2</a>
# ... </div></body></html>''')
#
# >>> html.make_links_absolute(base_url="http://my.little.server/url/",
# ... handle_failures="discard")
#
# >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
# <html><body><div>
# <a>test2</a>
# </div></body></html>
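The disabled test above still describes current behaviour; as a plain
(non-doctest) sketch, ``handle_failures`` may be set to "ignore" or
"discard" in lxml versions that support it::

    import lxml.html

    html = lxml.html.fromstring(
        '<html><body><div>'
        '<a href="http://fancybase.com]Buy">test2</a>'
        '</div></body></html>')
    # the unparseable URL is dropped instead of raising an error
    html.make_links_absolute(base_url="http://my.little.server/url/",
                             handle_failures="discard")
    print(lxml.html.tostring(html, encoding='unicode'))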
Check that we can replace multiple links inside the same text string::
>>> html = lxml.html.fromstring("""\
... <html>
... <head>
... <title>Test</title>
... <style type='text/css'>
... .bg1 {
... background: url(images/bg1.png);
... }
... .bg2 {
... background: url(images/bg2.png);
... }
... </style>
... </head>
... <body>
... <p>Hi</p>
... </body>
... </html>
... """,
... base_url='http://www.example.com/')
>>> html.make_links_absolute()
>>> print(lxml.html.tostring(html, pretty_print=True, encoding='unicode'))
<html>
<head>
<title>Test</title>
<style type="text/css">
.bg1 {
background: url(http://www.example.com/images/bg1.png);
}
.bg2 {
background: url(http://www.example.com/images/bg2.png);
}
</style>
</head>
<body>
<p>Hi</p>
</body>
</html>

View file

@ -0,0 +1,11 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest
import lxml.html
def test_suite():
suite = unittest.TestSuite()
suite.addTests([make_doctest('test_xhtml.txt')])
return suite
if __name__ == '__main__':
unittest.main()

View file

@ -0,0 +1,30 @@
>>> from lxml.html import document_fromstring, fragment_fromstring, tostring
lxml.html has two parsers, one for HTML, one for XHTML:
>>> from lxml.html import HTMLParser, XHTMLParser
>>> html = "<html><body><p>Hi!</p></body></html>"
>>> root = document_fromstring(html, parser=HTMLParser())
>>> print(root.tag)
html
>>> root = document_fromstring(html, parser=XHTMLParser())
>>> print(root.tag)
html
There are two functions for converting between HTML and XHTML:
>>> from lxml.html import xhtml_to_html, html_to_xhtml
>>> doc = document_fromstring(html, parser=HTMLParser())
>>> tostring(doc)
b'<html><body><p>Hi!</p></body></html>'
>>> html_to_xhtml(doc)
>>> tostring(doc)
b'<html:html xmlns:html="http://www.w3.org/1999/xhtml"><html:body><html:p>Hi!</html:p></html:body></html:html>'
>>> xhtml_to_html(doc)
>>> tostring(doc)
b'<html xmlns:html="http://www.w3.org/1999/xhtml"><body><p>Hi!</p></body></html>'

View file

@ -0,0 +1,110 @@
"""
This takes the feedparser tests from here:
http://feedparser.org/tests/wellformed/sanitize/
and rewrites them to be easier to handle (not using the internal model
of feedparser). The input format is::
<!--
Description: {description}
Expect: {expression}
-->
...
<content ...>{content}</content>
...
The Expect expression is checked for
``entries[0]['content'][0]['value'] == {data}``.
The output format is::
Description: {description}
Expect: {expression} (if data couldn't be parsed)
Options:
{content, unescaped}
----------
{data, unescaped, if found}
"""
import re
import os
import traceback
_desc_re = re.compile(r'\s*Description:\s*(.*)')
_expect_re = re.compile(r'\s*Expect:\s*(.*)')
_data_expect_re = re.compile(r"entries\[0\]\['[^']+'\](?:\[0\]\['value'\])?\s*==\s*(.*)")
_feed_data_expect_re = re.compile(r"feed\['[^']+'\]\s*==\s*(.*)")
def parse_content(content):
match = _desc_re.search(content)
desc = match.group(1)
match = _expect_re.search(content)
expect = match.group(1)
data = None
for regex in [_data_expect_re, _feed_data_expect_re]:
match = regex.search(expect)
if match:
# Icky, but I'll trust it
data = eval(match.group(1).strip())
break
c = None
for tag in ['content', 'summary', 'title', 'copyright', 'tagline', 'info', 'subtitle', 'fullitem', 'body', 'description', 'content:encoded']:
regex = re.compile(r"<%s.*?>(.*)</%s>" % (tag, tag), re.S)
match = regex.search(content)
if match:
c = match.group(1)
break
assert c is not None
# Seems like body isn't quoted
if tag != 'body':
c = c.replace('&lt;', '<')
c = c.replace('&amp;', '&')
# FIXME: I should really do more unescaping...
return {
'Description': desc,
'Expect': expect,
'data': data,
'content': c}
def serialize_content(d):
s = '''\
Description: %(Description)s
Expect: %(Expect)s
Options:
%(content)s
''' % d
if d.get('data') is not None:
s += '----------\n%s' % d['data']
return s
def translate_file(filename):
f = open(filename, 'rb')
c = f.read()
f.close()
try:
output = serialize_content(parse_content(c))
except:
print('Bad data in %s:' % filename)
print(c)
traceback.print_exc()
print('-'*60)
return
new = os.path.splitext(filename)[0] + '.data'
f = open(new, 'wb')
f.write(output)
f.close()
def translate_all(dir):
for fn in os.listdir(dir):
fn = os.path.join(dir, fn)
if fn.endswith('.xml'):
translate_file(fn)
if __name__ == '__main__':
import sys
translate_all(os.path.join(os.path.dirname(__file__), 'feedparser-data'))

View file

@ -0,0 +1,13 @@
"""Doctest module for HTML comparison.
Usage::
>>> import lxml.html.usedoctest
>>> # now do your HTML doctests ...
See `lxml.doctestcompare`.
"""
from lxml import doctestcompare
doctestcompare.temp_install(html=True, del_module=__name__)

View file

View file

@ -0,0 +1,26 @@
from lxml.includes.tree cimport xmlDoc, xmlOutputBuffer, xmlChar
from lxml.includes.xpath cimport xmlNodeSet
cdef extern from "libxml/c14n.h":
cdef int xmlC14NDocDumpMemory(xmlDoc* doc,
xmlNodeSet* nodes,
int exclusive,
xmlChar** inclusive_ns_prefixes,
int with_comments,
xmlChar** doc_txt_ptr) nogil
cdef int xmlC14NDocSave(xmlDoc* doc,
xmlNodeSet* nodes,
int exclusive,
xmlChar** inclusive_ns_prefixes,
int with_comments,
char* filename,
int compression) nogil
cdef int xmlC14NDocSaveTo(xmlDoc* doc,
xmlNodeSet* nodes,
int exclusive,
xmlChar** inclusive_ns_prefixes,
int with_comments,
xmlOutputBuffer* buffer) nogil

View file

@ -0,0 +1,3 @@
cdef extern from "etree_defs.h":
cdef bint ENABLE_THREADING
cdef bint ENABLE_SCHEMATRON

View file

@ -0,0 +1,18 @@
from lxml.includes cimport tree
from lxml.includes.tree cimport xmlDoc, xmlDtd
cdef extern from "libxml/valid.h" nogil:
ctypedef void (*xmlValidityErrorFunc)(void * ctx, const char * msg, ...)
ctypedef void (*xmlValidityWarningFunc)(void * ctx, const char * msg, ...)
ctypedef struct xmlValidCtxt:
void *userData
xmlValidityErrorFunc error
xmlValidityWarningFunc warning
cdef xmlValidCtxt* xmlNewValidCtxt()
cdef void xmlFreeValidCtxt(xmlValidCtxt* cur)
cdef int xmlValidateDtd(xmlValidCtxt* ctxt, xmlDoc* doc, xmlDtd* dtd)
cdef tree.xmlElement* xmlGetDtdElementDesc(
xmlDtd* dtd, tree.const_xmlChar* name)

View file

@ -0,0 +1,328 @@
#ifndef HAS_ETREE_DEFS_H
#define HAS_ETREE_DEFS_H
/* quick check for Python/libxml2/libxslt devel setup */
#include "Python.h"
#ifndef PY_VERSION_HEX
# error the development package of Python (header files etc.) is not installed correctly
#else
# if PY_VERSION_HEX < 0x02060000 || PY_MAJOR_VERSION >= 3 && PY_VERSION_HEX < 0x03020000
# error this version of lxml requires Python 2.6, 2.7, 3.2 or later
# endif
#endif
#include "libxml/xmlversion.h"
#ifndef LIBXML_VERSION
# error the development package of libxml2 (header files etc.) is not installed correctly
#else
#if LIBXML_VERSION < 20700
# error minimum required version of libxml2 is 2.7.0
#endif
#endif
#include "libxslt/xsltconfig.h"
#ifndef LIBXSLT_VERSION
# error the development package of libxslt (header files etc.) is not installed correctly
#else
#if LIBXSLT_VERSION < 10123
# error minimum required version of libxslt is 1.1.23
#endif
#endif
/* v_arg functions */
#define va_int(ap) va_arg(ap, int)
#define va_charptr(ap) va_arg(ap, char *)
#ifdef PYPY_VERSION
# define IS_PYPY 1
#else
# define IS_PYPY 0
#endif
#if PY_MAJOR_VERSION >= 3
# define IS_PYTHON3 1
#else
# define IS_PYTHON3 0
#endif
#if IS_PYTHON3
#undef LXML_UNICODE_STRINGS
#define LXML_UNICODE_STRINGS 1
#else
#ifndef LXML_UNICODE_STRINGS
#define LXML_UNICODE_STRINGS 0
#endif
#endif
#if !IS_PYPY
# define PyWeakref_LockObject(obj) (NULL)
#endif
/* Threading is not currently supported by PyPy */
#if IS_PYPY
# ifndef WITHOUT_THREADING
# define WITHOUT_THREADING
# endif
#endif
/* Python 3 doesn't have PyFile_*() anymore */
#if PY_MAJOR_VERSION >= 3
# define PyFile_AsFile(o) (NULL)
#else
#if IS_PYPY
# undef PyFile_AsFile
# define PyFile_AsFile(o) (NULL)
# undef PyUnicode_FromFormat
# define PyUnicode_FromFormat(s, a, b) (NULL)
# undef PyByteArray_Check
# define PyByteArray_Check(o) (0)
#endif
#endif
#if PY_VERSION_HEX <= 0x03030000 && !(defined(CYTHON_PEP393_ENABLED) && CYTHON_PEP393_ENABLED)
#define PyUnicode_IS_READY(op) (0)
#define PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u)
#define PyUnicode_KIND(u) (sizeof(Py_UNICODE))
#define PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u))
#endif
/* PySlice_GetIndicesEx() has wrong signature in Py<=3.1 */
#if PY_VERSION_HEX >= 0x03020000
# define _lx_PySlice_GetIndicesEx(o, l, b, e, s, sl) PySlice_GetIndicesEx(o, l, b, e, s, sl)
#else
# define _lx_PySlice_GetIndicesEx(o, l, b, e, s, sl) PySlice_GetIndicesEx(((PySliceObject*)o), l, b, e, s, sl)
#endif
#ifdef WITHOUT_THREADING
# define PyEval_SaveThread() (NULL)
# define PyEval_RestoreThread(state)
# define PyGILState_Ensure() (PyGILState_UNLOCKED)
# define PyGILState_Release(state)
# undef Py_UNBLOCK_THREADS
# define Py_UNBLOCK_THREADS
# undef Py_BLOCK_THREADS
# define Py_BLOCK_THREADS
#endif
#ifdef WITHOUT_THREADING
# define ENABLE_THREADING 0
#else
# define ENABLE_THREADING 1
#endif
#if LIBXML_VERSION < 20704
/* FIXME: hack to make new error reporting compile in old libxml2 versions */
# define xmlStructuredErrorContext NULL
# define xmlXIncludeProcessTreeFlagsData(n,o,d) xmlXIncludeProcessTreeFlags(n,o)
#endif
/* schematron was added in libxml2 2.6.21 */
#ifdef LIBXML_SCHEMATRON_ENABLED
# define ENABLE_SCHEMATRON 1
#else
# define ENABLE_SCHEMATRON 0
# define XML_SCHEMATRON_OUT_QUIET 0
# define XML_SCHEMATRON_OUT_XML 0
# define XML_SCHEMATRON_OUT_ERROR 0
typedef void xmlSchematron;
typedef void xmlSchematronParserCtxt;
typedef void xmlSchematronValidCtxt;
# define xmlSchematronNewDocParserCtxt(doc) NULL
# define xmlSchematronNewParserCtxt(file) NULL
# define xmlSchematronParse(ctxt) NULL
# define xmlSchematronFreeParserCtxt(ctxt)
# define xmlSchematronFree(schema)
# define xmlSchematronNewValidCtxt(schema, options) NULL
# define xmlSchematronValidateDoc(ctxt, doc) 0
# define xmlSchematronFreeValidCtxt(ctxt)
# define xmlSchematronSetValidStructuredErrors(ctxt, errorfunc, data)
#endif
#if LIBXML_VERSION < 20900
# define XML_PARSE_BIG_LINES 4194304
#endif
#include "libxml/tree.h"
#ifndef LIBXML2_NEW_BUFFER
typedef xmlBuffer xmlBuf;
# define xmlBufContent(buf) xmlBufferContent(buf)
# define xmlBufUse(buf) xmlBufferLength(buf)
#endif
/* libexslt 1.1.25+ support EXSLT functions in XPath */
#if LIBXSLT_VERSION < 10125
#define exsltDateXpathCtxtRegister(ctxt, prefix)
#define exsltSetsXpathCtxtRegister(ctxt, prefix)
#define exsltMathXpathCtxtRegister(ctxt, prefix)
#define exsltStrXpathCtxtRegister(ctxt, prefix)
#endif
/* work around MSDEV 6.0 */
#if (_MSC_VER == 1200) && (WINVER < 0x0500)
long _ftol( double ); //defined by VC6 C libs
long _ftol2( double dblSource ) { return _ftol( dblSource ); }
#endif
#ifdef __GNUC__
/* Test for GCC > 2.95 */
#if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95))
#define unlikely_condition(x) __builtin_expect((x), 0)
#else /* __GNUC__ > 2 ... */
#define unlikely_condition(x) (x)
#endif /* __GNUC__ > 2 ... */
#else /* __GNUC__ */
#define unlikely_condition(x) (x)
#endif /* __GNUC__ */
#ifndef Py_TYPE
#define Py_TYPE(ob) (((PyObject*)(ob))->ob_type)
#endif
#define PY_NEW(T) \
(((PyTypeObject*)(T))->tp_new( \
(PyTypeObject*)(T), __pyx_empty_tuple, NULL))
#define _fqtypename(o) ((Py_TYPE(o))->tp_name)
#if PY_MAJOR_VERSION < 3
#define _isString(obj) (PyString_CheckExact(obj) || \
PyUnicode_CheckExact(obj) || \
PyType_IsSubtype(Py_TYPE(obj), &PyBaseString_Type))
#else
/* builtin subtype type checks are almost as fast as exact checks in Py2.7+
* and Unicode is more common in Py3 */
#define _isString(obj) (PyUnicode_Check(obj) || PyBytes_Check(obj))
#endif
#define _isElement(c_node) \
(((c_node)->type == XML_ELEMENT_NODE) || \
((c_node)->type == XML_COMMENT_NODE) || \
((c_node)->type == XML_ENTITY_REF_NODE) || \
((c_node)->type == XML_PI_NODE))
#define _isElementOrXInclude(c_node) \
(_isElement(c_node) || \
((c_node)->type == XML_XINCLUDE_START) || \
((c_node)->type == XML_XINCLUDE_END))
#define _getNs(c_node) \
(((c_node)->ns == 0) ? 0 : ((c_node)->ns->href))
/* Macro pair implementation of a depth first tree walker
*
* Calls the code block between the BEGIN and END macros for all elements
* below c_tree_top (exclusively), starting at c_node (inclusively iff
* 'inclusive' is 1). The _ELEMENT_ variants will only stop on nodes
* that match _isElement(), the normal variant will stop on every node
* except text nodes.
*
* To traverse the node and all of its children and siblings in Pyrex, call
* cdef xmlNode* some_node
* BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 1)
* # do something with some_node
* END_FOR_EACH_ELEMENT_FROM(some_node)
*
* To traverse only the children and siblings of a node, call
* cdef xmlNode* some_node
* BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 0)
* # do something with some_node
* END_FOR_EACH_ELEMENT_FROM(some_node)
*
* To traverse only the children, do:
* cdef xmlNode* some_node
* some_node = parent_node.children
* BEGIN_FOR_EACH_ELEMENT_FROM(parent_node, some_node, 1)
* # do something with some_node
* END_FOR_EACH_ELEMENT_FROM(some_node)
*
* NOTE: 'some_node' MUST be a plain 'xmlNode*' !
*
* NOTE: parent modification during the walk can divert the iterator, but
* should not segfault !
*/
#define _LX__ELEMENT_MATCH(c_node, only_elements) \
((only_elements) ? (_isElement(c_node)) : 1)
#define _LX__ADVANCE_TO_NEXT(c_node, only_elements) \
while ((c_node != 0) && (!_LX__ELEMENT_MATCH(c_node, only_elements))) \
c_node = c_node->next;
#define _LX__TRAVERSE_TO_NEXT(c_stop_node, c_node, only_elements) \
{ \
/* walk through children first */ \
xmlNode* _lx__next = c_node->children; \
if (_lx__next != 0) { \
if (c_node->type == XML_ENTITY_REF_NODE || c_node->type == XML_DTD_NODE) { \
_lx__next = 0; \
} else { \
_LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
} \
} \
if ((_lx__next == 0) && (c_node != c_stop_node)) { \
/* try siblings */ \
_lx__next = c_node->next; \
_LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
/* back off through parents */ \
while (_lx__next == 0) { \
c_node = c_node->parent; \
if (c_node == 0) \
break; \
if (c_node == c_stop_node) \
break; \
if ((only_elements) && !_isElement(c_node)) \
break; \
/* we already traversed the parents -> siblings */ \
_lx__next = c_node->next; \
_LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
} \
} \
c_node = _lx__next; \
}
#define _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, only_elements) \
{ \
if (c_node != 0) { \
const xmlNode* _lx__tree_top = (c_tree_top); \
const int _lx__only_elements = (only_elements); \
/* make sure we start at an element */ \
if (!_LX__ELEMENT_MATCH(c_node, _lx__only_elements)) { \
/* we skip the node, so 'inclusive' is irrelevant */ \
if (c_node == _lx__tree_top) \
c_node = 0; /* nothing to traverse */ \
else { \
c_node = c_node->next; \
_LX__ADVANCE_TO_NEXT(c_node, _lx__only_elements) \
} \
} else if (! (inclusive)) { \
/* skip the first node */ \
_LX__TRAVERSE_TO_NEXT(_lx__tree_top, c_node, _lx__only_elements) \
} \
\
/* now run the user code on the elements we find */ \
while (c_node != 0) { \
/* here goes the code to be run for each element */
#define _LX__END_FOR_EACH_FROM(c_node) \
_LX__TRAVERSE_TO_NEXT(_lx__tree_top, c_node, _lx__only_elements) \
} \
} \
}
#define BEGIN_FOR_EACH_ELEMENT_FROM(c_tree_top, c_node, inclusive) \
_LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, 1)
#define END_FOR_EACH_ELEMENT_FROM(c_node) \
_LX__END_FOR_EACH_FROM(c_node)
#define BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive) \
_LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, 0)
#define END_FOR_EACH_FROM(c_node) \
_LX__END_FOR_EACH_FROM(c_node)
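/* For illustration, a sketch counting the element children of a node
 * with the macro pair above (assumes a valid plain 'xmlNode* root'
 * from a parsed document):
 *
 *     int count = 0;
 *     xmlNode* c_node = root->children;
 *     BEGIN_FOR_EACH_ELEMENT_FROM(root, c_node, 1)
 *         count++;
 *     END_FOR_EACH_ELEMENT_FROM(c_node)
 */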
#endif /* HAS_ETREE_DEFS_H */

Some files were not shown because too many files have changed in this diff.