SickGear/lib/imdb/parser/http/bsouplxml/bsoupxpath.py

411 lines
14 KiB
Python
Raw Normal View History

"""
parser.http.bsoupxpath module (imdb.parser.http package).
This module provides XPath support for BeautifulSoup.
Copyright 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
__author__ = 'H. Turgut Uyar <uyar@tekir.org>'
__docformat__ = 'restructuredtext'
import re
import string
import _bsoup as BeautifulSoup
# XPath related enumerations and constants
AXIS_ANCESTOR = 'ancestor'
AXIS_ATTRIBUTE = 'attribute'
AXIS_CHILD = 'child'
AXIS_DESCENDANT = 'descendant'
AXIS_FOLLOWING = 'following'
AXIS_FOLLOWING_SIBLING = 'following-sibling'
AXIS_PRECEDING_SIBLING = 'preceding-sibling'
AXES = (AXIS_ANCESTOR, AXIS_ATTRIBUTE, AXIS_CHILD, AXIS_DESCENDANT,
AXIS_FOLLOWING, AXIS_FOLLOWING_SIBLING, AXIS_PRECEDING_SIBLING)
XPATH_FUNCTIONS = ('starts-with', 'string-length', 'contains')
def tokenize_path(path):
"""Tokenize a location path into location steps. Return the list of steps.
If two steps are separated by a double slash, the double slashes are part of
the second step. If they are separated by only one slash, the slash is not
included in any of the steps.
"""
# form a list of tuples that mark the start and end positions of steps
separators = []
last_position = 0
i = -1
in_string = False
while i < len(path) - 1:
i = i + 1
if path[i] == "'":
in_string = not in_string
if in_string:
# slashes within strings are not step separators
continue
if path[i] == '/':
if i > 0:
separators.append((last_position, i))
if (path[i+1] == '/'):
last_position = i
i = i + 1
else:
last_position = i + 1
separators.append((last_position, len(path)))
steps = []
for start, end in separators:
steps.append(path[start:end])
return steps
class Path:
"""A location path.
"""
def __init__(self, path, parse=True):
self.path = path
self.steps = []
if parse:
if (path[0] == '/') and (path[1] != '/'):
# if not on the descendant axis, remove the leading slash
path = path[1:]
steps = tokenize_path(path)
for step in steps:
self.steps.append(PathStep(step))
def apply(self, node):
"""Apply the path to a node. Return the resulting list of nodes.
Apply the steps in the path sequentially by sending the output of each
step as input to the next step.
"""
# FIXME: this should return a node SET, not a node LIST
# or at least a list with no duplicates
if self.path[0] == '/':
# for an absolute path, start from the root
if not isinstance(node, BeautifulSoup.Tag) \
or (node.name != '[document]'):
node = node.findParent('[document]')
nodes = [node]
for step in self.steps:
nodes = step.apply(nodes)
return nodes
class PathStep:
"""A location step in a location path.
"""
AXIS_PATTERN = r"""(%s)::|@""" % '|'.join(AXES)
NODE_TEST_PATTERN = r"""\w+(\(\))?"""
PREDICATE_PATTERN = r"""\[(.*?)\]"""
LOCATION_STEP_PATTERN = r"""(%s)?(%s)((%s)*)""" \
% (AXIS_PATTERN, NODE_TEST_PATTERN, PREDICATE_PATTERN)
_re_location_step = re.compile(LOCATION_STEP_PATTERN)
PREDICATE_NOT_PATTERN = r"""not\((.*?)\)"""
PREDICATE_AXIS_PATTERN = r"""(%s)?(%s)(='(.*?)')?""" \
% (AXIS_PATTERN, NODE_TEST_PATTERN)
PREDICATE_FUNCTION_PATTERN = r"""(%s)\(([^,]+(,\s*[^,]+)*)?\)(=(.*))?""" \
% '|'.join(XPATH_FUNCTIONS)
_re_predicate_not = re.compile(PREDICATE_NOT_PATTERN)
_re_predicate_axis = re.compile(PREDICATE_AXIS_PATTERN)
_re_predicate_function = re.compile(PREDICATE_FUNCTION_PATTERN)
def __init__(self, step):
self.step = step
if (step == '.') or (step == '..'):
return
if step[:2] == '//':
default_axis = AXIS_DESCENDANT
step = step[2:]
else:
default_axis = AXIS_CHILD
step_match = self._re_location_step.match(step)
# determine the axis
axis = step_match.group(1)
if axis is None:
self.axis = default_axis
elif axis == '@':
self.axis = AXIS_ATTRIBUTE
else:
self.axis = step_match.group(2)
self.soup_args = {}
self.index = None
self.node_test = step_match.group(3)
if self.node_test == 'text()':
self.soup_args['text'] = True
else:
self.soup_args['name'] = self.node_test
self.checkers = []
predicates = step_match.group(5)
if predicates is not None:
predicates = [p for p in predicates[1:-1].split('][') if p]
for predicate in predicates:
checker = self.__parse_predicate(predicate)
if checker is not None:
self.checkers.append(checker)
def __parse_predicate(self, predicate):
"""Parse the predicate. Return a callable that can be used to filter
nodes. Update `self.soup_args` to take advantage of BeautifulSoup search
features.
"""
try:
position = int(predicate)
if self.axis == AXIS_DESCENDANT:
return PredicateFilter('position', value=position)
else:
# use the search limit feature instead of a checker
self.soup_args['limit'] = position
self.index = position - 1
return None
except ValueError:
pass
if predicate == "last()":
self.index = -1
return None
negate = self._re_predicate_not.match(predicate)
if negate:
predicate = negate.group(1)
function_match = self._re_predicate_function.match(predicate)
if function_match:
name = function_match.group(1)
arguments = function_match.group(2)
value = function_match.group(4)
if value is not None:
value = function_match.group(5)
return PredicateFilter(name, arguments, value)
axis_match = self._re_predicate_axis.match(predicate)
if axis_match:
axis = axis_match.group(1)
if axis is None:
axis = AXIS_CHILD
elif axis == '@':
axis = AXIS_ATTRIBUTE
if axis == AXIS_ATTRIBUTE:
# use the attribute search feature instead of a checker
attribute_name = axis_match.group(3)
if axis_match.group(5) is not None:
attribute_value = axis_match.group(6)
elif not negate:
attribute_value = True
else:
attribute_value = None
if not self.soup_args.has_key('attrs'):
self.soup_args['attrs'] = {}
self.soup_args['attrs'][attribute_name] = attribute_value
return None
elif axis == AXIS_CHILD:
node_test = axis_match.group(3)
node_value = axis_match.group(6)
return PredicateFilter('axis', node_test, value=node_value,
negate=negate)
raise NotImplementedError("This predicate is not implemented")
def apply(self, nodes):
"""Apply the step to a list of nodes. Return the list of nodes for the
next step.
"""
if self.step == '.':
return nodes
elif self.step == '..':
return [node.parent for node in nodes]
result = []
for node in nodes:
if self.axis == AXIS_CHILD:
found = node.findAll(recursive=False, **self.soup_args)
elif self.axis == AXIS_DESCENDANT:
found = node.findAll(recursive=True, **self.soup_args)
elif self.axis == AXIS_ATTRIBUTE:
try:
found = [node[self.node_test]]
except KeyError:
found = []
elif self.axis == AXIS_FOLLOWING_SIBLING:
found = node.findNextSiblings(**self.soup_args)
elif self.axis == AXIS_PRECEDING_SIBLING:
# TODO: make sure that the result is reverse ordered
found = node.findPreviousSiblings(**self.soup_args)
elif self.axis == AXIS_FOLLOWING:
# find the last descendant of this node
last = node
while (not isinstance(last, BeautifulSoup.NavigableString)) \
and (len(last.contents) > 0):
last = last.contents[-1]
found = last.findAllNext(**self.soup_args)
elif self.axis == AXIS_ANCESTOR:
found = node.findParents(**self.soup_args)
# this should only be active if there is a position predicate
# and the axis is not 'descendant'
if self.index is not None:
if found:
if len(found) > self.index:
found = [found[self.index]]
else:
found = []
if found:
for checker in self.checkers:
found = filter(checker, found)
result.extend(found)
return result
class PredicateFilter:
"""A callable class for filtering nodes.
"""
def __init__(self, name, arguments=None, value=None, negate=False):
self.name = name
self.arguments = arguments
self.negate = negate
if name == 'position':
self.__filter = self.__position
self.value = value
elif name == 'axis':
self.__filter = self.__axis
self.node_test = arguments
self.value = value
elif name in ('starts-with', 'contains'):
if name == 'starts-with':
self.__filter = self.__starts_with
else:
self.__filter = self.__contains
args = map(string.strip, arguments.split(','))
if args[0][0] == '@':
self.arguments = (True, args[0][1:], args[1][1:-1])
else:
self.arguments = (False, args[0], args[1][1:-1])
elif name == 'string-length':
self.__filter = self.__string_length
args = map(string.strip, arguments.split(','))
if args[0][0] == '@':
self.arguments = (True, args[0][1:])
else:
self.arguments = (False, args[0])
self.value = int(value)
else:
raise NotImplementedError("This XPath function is not implemented")
def __call__(self, node):
if self.negate:
return not self.__filter(node)
else:
return self.__filter(node)
def __position(self, node):
if isinstance(node, BeautifulSoup.NavigableString):
actual_position = len(node.findPreviousSiblings(text=True)) + 1
else:
actual_position = len(node.findPreviousSiblings(node.name)) + 1
return actual_position == self.value
def __axis(self, node):
if self.node_test == 'text()':
return node.string == self.value
else:
children = node.findAll(self.node_test, recursive=False)
if len(children) > 0 and self.value is None:
return True
for child in children:
if child.string == self.value:
return True
return False
def __starts_with(self, node):
if self.arguments[0]:
# this is an attribute
attribute_name = self.arguments[1]
if node.has_key(attribute_name):
first = node[attribute_name]
return first.startswith(self.arguments[2])
elif self.arguments[1] == 'text()':
first = node.contents and node.contents[0]
if isinstance(first, BeautifulSoup.NavigableString):
return first.startswith(self.arguments[2])
return False
def __contains(self, node):
if self.arguments[0]:
# this is an attribute
attribute_name = self.arguments[1]
if node.has_key(attribute_name):
first = node[attribute_name]
return self.arguments[2] in first
elif self.arguments[1] == 'text()':
first = node.contents and node.contents[0]
if isinstance(first, BeautifulSoup.NavigableString):
return self.arguments[2] in first
return False
def __string_length(self, node):
if self.arguments[0]:
# this is an attribute
attribute_name = self.arguments[1]
if node.has_key(attribute_name):
value = node[attribute_name]
else:
value = None
elif self.arguments[1] == 'text()':
value = node.string
if value is not None:
return len(value) == self.value
return False
_paths = {}
_steps = {}
def get_path(path):
"""Utility for eliminating repeated parsings of the same paths and steps.
"""
if not _paths.has_key(path):
p = Path(path, parse=False)
steps = tokenize_path(path)
for step in steps:
if not _steps.has_key(step):
_steps[step] = PathStep(step)
p.steps.append(_steps[step])
_paths[path] = p
return _paths[path]