SickGear/lib/lxml/iterparse.pxi
2014-03-28 21:32:46 -07:00

357 lines
13 KiB
Cython

# iterparse -- event-driven parsing
# Compile-time constant: the size argument passed to ``source.read()`` each
# time ``iterparse`` needs more input to feed into its parser.
DEF __ITERPARSE_CHUNK_SIZE = 32768
cdef class iterparse:
    u"""iterparse(self, source, events=("end",), tag=None, \
                  attribute_defaults=False, dtd_validation=False, \
                  load_dtd=False, no_network=True, remove_blank_text=False, \
                  compact=True, resolve_entities=True, \
                  remove_comments=False, remove_pis=False, strip_cdata=True, \
                  encoding=None, html=False, recover=None, huge_tree=False, \
                  schema=None)

    Incremental parser.

    Parses XML into a tree and generates tuples (event, element) in a
    SAX-like fashion. ``event`` is any of 'start', 'end', 'start-ns',
    'end-ns'.

    For 'start' and 'end', ``element`` is the Element that the parser just
    found opening or closing.  For 'start-ns', it is a tuple (prefix, URI) of
    a new namespace declaration.  For 'end-ns', it is simply None.  Note that
    all start and end events are guaranteed to be properly nested.

    The keyword argument ``events`` specifies a sequence of event type names
    that should be generated.  By default, only 'end' events will be
    generated.

    The additional ``tag`` argument restricts the 'start' and 'end' events to
    those elements that match the given tag.  By default, events are generated
    for all elements.  Note that the 'start-ns' and 'end-ns' events are not
    impacted by this restriction.

    The other keyword arguments in the constructor are mainly based on the
    libxml2 parser configuration.  A DTD will also be loaded if validation or
    attribute default values are requested.

    Available boolean keyword arguments:
     - attribute_defaults: read default attributes from DTD
     - dtd_validation: validate (if DTD is available)
     - load_dtd: use DTD for parsing
     - no_network: prevent network access for related files
     - remove_blank_text: discard blank text nodes
     - remove_comments: discard comments
     - remove_pis: discard processing instructions
     - strip_cdata: replace CDATA sections by normal text content (default: True)
     - compact: save memory for short text content (default: True)
     - resolve_entities: replace entities by their text value (default: True)
     - huge_tree: disable security restrictions and support very deep trees
                  and very long text content (only affects libxml2 2.7+)
     - html: parse input as HTML (default: XML)
     - recover: try hard to parse through broken input (default: True for HTML,
                False otherwise)

    Other keyword arguments:
     - encoding: override the document encoding
     - schema: an XMLSchema to validate against
    """
    # Pull parser that receives the data chunks and produces the events.
    cdef _FeedParser _parser
    cdef object _tag
    # Event iterator obtained from the parser's ``read_events()``.
    cdef object _events
    # Result of closing the parser once the source is exhausted.
    cdef readonly object root
    # File-like object being read; reset to None once reading has finished.
    cdef object _source
    cdef object _filename
    # Exception raised while reading/feeding, re-raised after pending events.
    cdef object _error
    # True only if this object opened the source itself (got a filename).
    cdef bint _close_source_after_read

    def __init__(self, source, events=(u"end",), *, tag=None,
                 attribute_defaults=False, dtd_validation=False,
                 load_dtd=False, no_network=True, remove_blank_text=False,
                 compact=True, resolve_entities=True, remove_comments=False,
                 remove_pis=False, strip_cdata=True, encoding=None,
                 html=False, recover=None, huge_tree=False,
                 XMLSchema schema=None):
        if not hasattr(source, 'read'):
            # Not file-like: treat it as a file name and open it ourselves.
            self._filename = source
            if not python.IS_PYTHON3:
                source = _encodeFilename(source)
            source = open(source, 'rb')
            self._close_source_after_read = True
        else:
            self._filename = _getFilenameForFile(source)
            self._close_source_after_read = False

        if recover is None:
            # Recovery defaults to on for HTML and off for XML.
            recover = html

        if html:
            # make sure we're not looking for namespaces
            events = [event for event in events
                      if event not in ('start-ns', 'end-ns')]
            # Note: the DTD related options, ``huge_tree`` and
            # ``resolve_entities`` are not forwarded to the HTML parser.
            parser = HTMLPullParser(
                events,
                tag=tag,
                recover=recover,
                base_url=self._filename,
                encoding=encoding,
                remove_blank_text=remove_blank_text,
                remove_comments=remove_comments,
                remove_pis=remove_pis,
                strip_cdata=strip_cdata,
                no_network=no_network,
                target=None,  # TODO
                schema=schema,
                compact=compact)
        else:
            parser = XMLPullParser(
                events,
                tag=tag,
                recover=recover,
                base_url=self._filename,
                encoding=encoding,
                attribute_defaults=attribute_defaults,
                dtd_validation=dtd_validation,
                load_dtd=load_dtd,
                no_network=no_network,
                schema=schema,
                huge_tree=huge_tree,
                remove_blank_text=remove_blank_text,
                resolve_entities=resolve_entities,
                remove_comments=remove_comments,
                remove_pis=remove_pis,
                strip_cdata=strip_cdata,
                target=None,  # TODO
                compact=compact)

        self._events = parser.read_events()
        self._parser = parser
        self._source = source

    property error_log:
        u"""The error log of the last (or current) parser run.
        """
        def __get__(self):
            return self._parser.feed_error_log

    property resolvers:
        u"""The custom resolver registry of the last (or current) parser run.
        """
        def __get__(self):
            return self._parser.resolvers

    property version:
        u"""The version of the underlying XML parser."""
        def __get__(self):
            return self._parser.version

    def set_element_class_lookup(self, ElementClassLookup lookup = None):
        u"""set_element_class_lookup(self, lookup = None)

        Set a lookup scheme for element classes generated from this parser.

        Reset it by passing None or nothing.
        """
        self._parser.set_element_class_lookup(lookup)

    def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
        u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)

        Creates a new element associated with this parser.

        Returns the element built by the underlying parser.
        """
        # Forward the caller's attrib/nsmap (they used to be replaced by
        # None) and hand the new element back (it used to be dropped).
        return self._parser.makeelement(
            _tag, attrib=attrib, nsmap=nsmap, **_extra)

    @cython.final
    cdef _close_source(self):
        # Drop the reference to the source and close it, but only if this
        # object opened it.  Sources passed in by the caller stay open.
        if self._source is None:
            return
        if not self._close_source_after_read:
            self._source = None
            return
        try:
            close = self._source.close
        except AttributeError:
            close = None
        finally:
            # Forget the source even if looking up ``close`` failed.
            self._source = None
        if close is not None:
            close()

    def __iter__(self):
        return self

    def __next__(self):
        # Deliver events that are already waiting before reading more data.
        try:
            return next(self._events)
        except StopIteration:
            pass
        context = <_SaxParserContext>self._parser._getPushParserContext()
        if self._source is not None:
            done = False
            while not done:
                try:
                    done = self._read_more_events(context)
                    return next(self._events)
                except StopIteration:
                    pass  # no events yet
                except Exception as e:
                    # Remember the failure, but first hand out any events
                    # that were collected before it; it is raised below once
                    # no event is left.
                    self._error = e
                    self._close_source()
                    try:
                        return next(self._events)
                    except StopIteration:
                        break
        # nothing left to read or return
        if self._error is not None:
            error = self._error
            self._error = None
            raise error
        if (context._validator is not None
                and not context._validator.isvalid()):
            _raiseParseError(context._c_ctxt, self._filename,
                             context._error_log)
        # no errors => all done
        raise StopIteration

    @cython.final
    cdef bint _read_more_events(self, _SaxParserContext context) except -123:
        # Read one chunk from the source and feed it to the parser.
        # Returns True once the source is exhausted (the parser has then been
        # closed and ``self.root`` set), False if more data may follow.
        data = self._source.read(__ITERPARSE_CHUNK_SIZE)
        if not isinstance(data, bytes):
            self._close_source()
            raise TypeError("reading file objects must return bytes objects")
        if not data:
            try:
                self.root = self._parser.close()
            finally:
                self._close_source()
            return True
        self._parser.feed(data)
        return False
cdef class iterwalk:
    u"""iterwalk(self, element_or_tree, events=("end",), tag=None)

    A tree walker that generates events from an existing tree as if it
    was parsing XML data with ``iterparse()``.
    """
    # Tag matcher for 'start'/'end' events; None when every tag is accepted.
    cdef _MultiTagMatcher _matcher
    # Stack of (element, namespace count) pairs for the currently open path.
    cdef list _node_stack
    # Index of the top of ``_node_stack``; negative once the walk is over.
    cdef int _index
    # Events that were generated but not yet returned, oldest first.
    cdef list _events
    # Bound ``pop`` method of ``_events``.
    cdef object _pop_event
    # Bit mask built from the requested event names.
    cdef int _event_filter

    def __init__(self, element_or_tree, events=(u"end",), tag=None):
        cdef _Element root
        cdef int ns_count
        root = _rootNodeOrRaise(element_or_tree)
        self._event_filter = _buildParseEventFilter(events)
        if not (tag is None or tag == '*'):
            self._matcher = _MultiTagMatcher(tag)
        else:
            self._matcher = None
        self._events = []
        self._pop_event = self._events.pop
        self._node_stack = []

        if not self._event_filter:
            # No events requested: mark the walk as already finished.
            self._index = -1
        else:
            self._index = 0
            ns_count = self._start_node(root)
            self._node_stack.append( (root, ns_count) )

    def __iter__(self):
        return self

    def __next__(self):
        cdef xmlNode* c_first_child
        cdef _Element current
        cdef _Element upcoming
        cdef int ns_count = 0

        # Hand out events that were queued by an earlier step.
        if self._events:
            return self._pop_event(0)

        if self._index >= 0 and self._matcher is not None:
            current = self._node_stack[self._index][0]
            self._matcher.cacheTags(current._doc)

        # Advance through the tree until a step produces an event.
        while self._index >= 0:
            current = self._node_stack[self._index][0]

            c_first_child = _findChildForwards(current._c_node, 0)
            if c_first_child is NULL:
                # No children: close nodes going up the stack until one of
                # them has a following sibling or the stack is empty.
                upcoming = None
                while True:
                    self._index -= 1
                    current = self._end_node()
                    if self._index < 0:
                        break
                    upcoming = current.getnext()
                    if upcoming is not None:
                        break
            else:
                # Descend into the first child.
                upcoming = _elementFactory(current._doc, c_first_child)

            if upcoming is not None:
                if self._event_filter & (PARSE_EVENT_FILTER_START |
                                         PARSE_EVENT_FILTER_START_NS):
                    ns_count = self._start_node(upcoming)
                elif self._event_filter & PARSE_EVENT_FILTER_END_NS:
                    ns_count = _countNsDefs(upcoming._c_node)
                self._node_stack.append( (upcoming, ns_count) )
                self._index += 1

            if self._events:
                return self._pop_event(0)

        raise StopIteration

    cdef int _start_node(self, _Element node) except -1:
        # Queue the opening events for ``node`` as selected by the event
        # filter and return the namespace count to store with it on the stack.
        cdef int ns_count = 0
        if self._event_filter & PARSE_EVENT_FILTER_START_NS:
            ns_count = _appendStartNsEvents(node._c_node, self._events)
        elif self._event_filter & PARSE_EVENT_FILTER_END_NS:
            ns_count = _countNsDefs(node._c_node)
        if (self._event_filter & PARSE_EVENT_FILTER_START and
                (self._matcher is None or
                 self._matcher.matches(node._c_node))):
            self._events.append( (u"start", node) )
        return ns_count

    cdef _Element _end_node(self):
        # Pop the top stack entry, queue its closing events as selected by
        # the event filter and return the popped element.
        cdef _Element node
        cdef int ns_count
        node, ns_count = self._node_stack.pop()
        if (self._event_filter & PARSE_EVENT_FILTER_END and
                (self._matcher is None or
                 self._matcher.matches(node._c_node))):
            self._events.append( (u"end", node) )
        if self._event_filter & PARSE_EVENT_FILTER_END_NS:
            # One 'end-ns' event per namespace counted when the node opened.
            self._events.extend([(u"end-ns", None)] * ns_count)
        return node
cdef int _countNsDefs(xmlNode* c_node):
    u"""Return the number of entries in the ``nsDef`` list of ``c_node``.
    """
    cdef xmlNs* c_current = c_node.nsDef
    cdef int total = 0
    # Follow the ``next`` links until the end of the list.
    while c_current is not NULL:
        total += 1
        c_current = c_current.next
    return total
cdef int _appendStartNsEvents(xmlNode* c_node, list event_list) except -1:
    u"""Append one ``("start-ns", (prefix, href))`` event to ``event_list``
    for each entry in the ``nsDef`` list of ``c_node``.

    A NULL prefix is reported as the empty string.  Returns the number of
    events that were appended.
    """
    cdef xmlNs* c_current = c_node.nsDef
    cdef int appended = 0
    while c_current is not NULL:
        if c_current.prefix is NULL:
            prefix = ''
        else:
            prefix = funicode(c_current.prefix)
        event_list.append( (u"start-ns", (prefix, funicode(c_current.href))) )
        appended += 1
        c_current = c_current.next
    return appended