SickGear/lib/imdb/parser/http/bsouplxml/bsoupxpath.py
echel0n 0d9fbc1ad7 Welcome to our SickBeard-TVRage Edition ...
This version of SickBeard uses both TVDB and TVRage to search for and gather its series data, allowing you to access and download shows that you couldn't before because you were locked into only what TheTVDB had to offer.

Also, this edition is based on the code we used in our XEM edition, so it comes with scene numbering support as well as all the other features our XEM edition has to offer.

Before using this with your existing database (sickbeard.db), please make a backup copy of it and delete any other database files such as cache.db and failed.db if present. We HIGHLY recommend starting out with no database files at all to make this a fresh start, but the choice is at your own risk!

Enjoy!
2014-03-09 22:39:12 -07:00

410 lines
14 KiB
Python

"""
parser.http.bsoupxpath module (imdb.parser.http package).
This module provides XPath support for BeautifulSoup.
Copyright 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
__author__ = 'H. Turgut Uyar <uyar@tekir.org>'
__docformat__ = 'restructuredtext'
import re
import string
import _bsoup as BeautifulSoup
# Names of the XPath axes recognised by this module, spelled exactly as they
# appear in a location step (e.g. ``following-sibling::td``).
AXIS_ANCESTOR = 'ancestor'
AXIS_ATTRIBUTE = 'attribute'
AXIS_CHILD = 'child'
AXIS_DESCENDANT = 'descendant'
AXIS_FOLLOWING = 'following'
AXIS_FOLLOWING_SIBLING = 'following-sibling'
AXIS_PRECEDING_SIBLING = 'preceding-sibling'

# Every recognised axis; joined with '|' to build the axis regular expression.
AXES = (
    AXIS_ANCESTOR,
    AXIS_ATTRIBUTE,
    AXIS_CHILD,
    AXIS_DESCENDANT,
    AXIS_FOLLOWING,
    AXIS_FOLLOWING_SIBLING,
    AXIS_PRECEDING_SIBLING,
)

# XPath functions that may be used inside a predicate.
XPATH_FUNCTIONS = ('starts-with', 'string-length', 'contains')
def tokenize_path(path):
    """Tokenize a location path into location steps. Return the list of steps.

    If two steps are separated by a double slash, the double slashes are part
    of the second step. If they are separated by only one slash, the slash is
    not included in any of the steps. Slashes inside single-quoted strings
    are not treated as separators.

    A leading single slash does not produce a step, and neither does a
    trailing slash: ``"/"`` and ``""`` both yield an empty list, and ``"a/"``
    yields ``["a"]``.
    """
    steps = []
    length = len(path)
    step_start = 0
    in_string = False
    i = 0
    while i < length:
        char = path[i]
        if char == "'":
            in_string = not in_string
        elif char == '/' and not in_string:
            if i > 0:
                # close the step that ends right before this separator
                steps.append(path[step_start:i])
            # bounds check: the slash may be the last character of the path
            if i + 1 < length and path[i + 1] == '/':
                # a double slash belongs to the step it introduces
                step_start = i
                i += 1
            else:
                step_start = i + 1
        i += 1
    if step_start < length:
        # whatever follows the last separator is the final step
        steps.append(path[step_start:])
    return steps
class Path:
    """A location path.

    `path` is kept verbatim in ``self.path``; ``self.steps`` holds the parsed
    `PathStep` objects in application order.
    """

    def __init__(self, path, parse=True):
        """Store `path` and, unless `parse` is false, parse it into steps.

        With ``parse=False`` the step list is left empty so that the caller
        can fill it in (see `get_path`).
        """
        self.path = path
        self.steps = []
        if parse:
            # startswith() instead of indexing, so that '' and '/' do not
            # raise IndexError
            if path.startswith('/') and not path.startswith('//'):
                # if not on the descendant axis, remove the leading slash
                path = path[1:]
            # an empty step (e.g. what is left of the bare root path '/')
            # selects nothing further, so it is skipped
            self.steps = [PathStep(step)
                          for step in tokenize_path(path) if step]

    def apply(self, node):
        """Apply the path to a node. Return the resulting list of nodes.

        Apply the steps in the path sequentially by sending the output of each
        step as input to the next step. A path without steps returns a list
        holding only the starting node.
        """
        # FIXME: this should return a node SET, not a node LIST
        # or at least a list with no duplicates
        if self.path.startswith('/'):
            # for an absolute path, start from the root
            is_document = isinstance(node, BeautifulSoup.Tag) \
                and node.name == '[document]'
            if not is_document:
                node = node.findParent('[document]')
        nodes = [node]
        for step in self.steps:
            nodes = step.apply(nodes)
        return nodes
class PathStep:
    """A location step in a location path.

    A step has the form ``[axis::|@]node-test[predicate]...``. A step that
    starts with ``//`` defaults to the descendant axis, any other step to the
    child axis. The special steps ``.`` and ``..`` are stored as-is and are
    handled directly by `apply`.
    """
    # group 1: the whole axis prefix ('child::' or '@'), group 2: axis name
    AXIS_PATTERN = r"""(%s)::|@""" % '|'.join(AXES)
    NODE_TEST_PATTERN = r"""\w+(\(\))?"""
    PREDICATE_PATTERN = r"""\[(.*?)\]"""
    # groups: 1 axis prefix, 2 axis name, 3 node test, 5 all predicates
    LOCATION_STEP_PATTERN = r"""(%s)?(%s)((%s)*)""" \
        % (AXIS_PATTERN, NODE_TEST_PATTERN, PREDICATE_PATTERN)
    _re_location_step = re.compile(LOCATION_STEP_PATTERN)
    # greedy on purpose: the negated expression may itself contain
    # parentheses, as in not(text()='x') or not(contains(@a, 'b'))
    PREDICATE_NOT_PATTERN = r"""not\((.*)\)"""
    # groups: 1 axis prefix, 2 axis name, 3 node test, 5 ='value', 6 value
    PREDICATE_AXIS_PATTERN = r"""(%s)?(%s)(='(.*?)')?""" \
        % (AXIS_PATTERN, NODE_TEST_PATTERN)
    # groups: 1 function name, 2 argument list, 4 =value, 5 value
    PREDICATE_FUNCTION_PATTERN = r"""(%s)\(([^,]+(,\s*[^,]+)*)?\)(=(.*))?""" \
        % '|'.join(XPATH_FUNCTIONS)
    _re_predicate_not = re.compile(PREDICATE_NOT_PATTERN)
    _re_predicate_axis = re.compile(PREDICATE_AXIS_PATTERN)
    _re_predicate_function = re.compile(PREDICATE_FUNCTION_PATTERN)

    def __init__(self, step):
        """Parse a single location step.

        Raise `ValueError` if the step does not match the supported step
        syntax, and `NotImplementedError` if one of its predicates is not
        supported.
        """
        self.step = step
        if step in ('.', '..'):
            return
        if step.startswith('//'):
            default_axis = AXIS_DESCENDANT
            step = step[2:]
        else:
            default_axis = AXIS_CHILD
        step_match = self._re_location_step.match(step)
        if step_match is None:
            raise ValueError("Unsupported or malformed location step: %r"
                             % (self.step,))
        # determine the axis
        axis = step_match.group(1)
        if axis is None:
            self.axis = default_axis
        elif axis == '@':
            self.axis = AXIS_ATTRIBUTE
        else:
            self.axis = step_match.group(2)
        # keyword arguments handed to the BeautifulSoup search methods
        self.soup_args = {}
        # index of the single node to keep from each search, if any
        self.index = None
        self.node_test = step_match.group(3)
        if self.node_test == 'text()':
            self.soup_args['text'] = True
        else:
            self.soup_args['name'] = self.node_test
        # callables that every found node has to satisfy
        self.checkers = []
        predicates = step_match.group(5)
        if predicates is not None:
            # strip the outermost brackets, then split '[a][b]' into a, b
            for predicate in predicates[1:-1].split(']['):
                if not predicate:
                    continue
                checker = self.__parse_predicate(predicate)
                if checker is not None:
                    self.checkers.append(checker)

    def __parse_predicate(self, predicate):
        """Parse the predicate. Return a callable that can be used to filter
        nodes, or None when the predicate could be expressed through
        `self.soup_args` / `self.index` instead.

        Update `self.soup_args` to take advantage of BeautifulSoup search
        features. Raise `NotImplementedError` for unsupported predicates.
        """
        try:
            position = int(predicate)
        except ValueError:
            position = None
        if position is not None:
            if self.axis == AXIS_DESCENDANT:
                return PredicateFilter('position', value=position)
            # use the search limit feature instead of a checker
            self.soup_args['limit'] = position
            self.index = position - 1
            return None
        if predicate == "last()":
            self.index = -1
            return None
        negate_match = self._re_predicate_not.match(predicate)
        negate = negate_match is not None
        if negate:
            predicate = negate_match.group(1)
        function_match = self._re_predicate_function.match(predicate)
        if function_match:
            name = function_match.group(1)
            arguments = function_match.group(2)
            # None when the call is not followed by '=value'
            value = function_match.group(5)
            return PredicateFilter(name, arguments, value, negate)
        axis_match = self._re_predicate_axis.match(predicate)
        if axis_match:
            axis = axis_match.group(1)
            if axis is None:
                axis = AXIS_CHILD
            elif axis == '@':
                axis = AXIS_ATTRIBUTE
            else:
                # explicit 'name::' prefix: group 2 is the bare axis name
                axis = axis_match.group(2)
            if axis == AXIS_ATTRIBUTE:
                # use the attribute search feature instead of a checker
                attribute_name = axis_match.group(3)
                if axis_match.group(5) is not None:
                    # NOTE(review): a negated comparison such as
                    # not(@a='b') is searched as @a='b'; negation is only
                    # honoured for the bare existence test.
                    attribute_value = axis_match.group(6)
                elif not negate:
                    attribute_value = True
                else:
                    attribute_value = None
                attrs = self.soup_args.setdefault('attrs', {})
                attrs[attribute_name] = attribute_value
                return None
            elif axis == AXIS_CHILD:
                node_test = axis_match.group(3)
                node_value = axis_match.group(6)
                return PredicateFilter('axis', node_test, value=node_value,
                                       negate=negate)
        raise NotImplementedError("This predicate is not implemented")

    def apply(self, nodes):
        """Apply the step to a list of nodes. Return the list of nodes for the
        next step.
        """
        if self.step == '.':
            return nodes
        elif self.step == '..':
            return [node.parent for node in nodes]
        result = []
        for node in nodes:
            if self.axis == AXIS_CHILD:
                found = node.findAll(recursive=False, **self.soup_args)
            elif self.axis == AXIS_DESCENDANT:
                found = node.findAll(recursive=True, **self.soup_args)
            elif self.axis == AXIS_ATTRIBUTE:
                try:
                    found = [node[self.node_test]]
                except KeyError:
                    found = []
            elif self.axis == AXIS_FOLLOWING_SIBLING:
                found = node.findNextSiblings(**self.soup_args)
            elif self.axis == AXIS_PRECEDING_SIBLING:
                # TODO: make sure that the result is reverse ordered
                found = node.findPreviousSiblings(**self.soup_args)
            elif self.axis == AXIS_FOLLOWING:
                # find the last descendant of this node
                last = node
                while (not isinstance(last, BeautifulSoup.NavigableString)) \
                        and (len(last.contents) > 0):
                    last = last.contents[-1]
                found = last.findAllNext(**self.soup_args)
            elif self.axis == AXIS_ANCESTOR:
                found = node.findParents(**self.soup_args)
            # this should only be active if there is a position predicate
            # and the axis is not 'descendant'
            if self.index is not None and found:
                if len(found) > self.index:
                    found = [found[self.index]]
                else:
                    found = []
            if found:
                for checker in self.checkers:
                    # a real list on every Python version, not a lazy filter
                    found = [item for item in found if checker(item)]
                result.extend(found)
        return result
class PredicateFilter:
    """A callable class for filtering nodes.

    `name` selects the test: ``'position'``, ``'axis'``, or one of the XPath
    functions ``'starts-with'``, ``'contains'`` and ``'string-length'``.
    Calling the instance with a node returns the outcome of the test,
    inverted when `negate` is true.
    """

    def __init__(self, name, arguments=None, value=None, negate=False):
        """Set up the filter.

        For ``'position'``, `value` is the expected 1-based position. For
        ``'axis'``, `arguments` is the node test and `value` the expected
        string (or None). For the functions, `arguments` is the raw
        comma-separated argument string; ``'string-length'`` additionally
        needs `value`, the expected length.

        Raise `NotImplementedError` for an unknown `name` and `ValueError`
        when ``starts-with``/``contains`` get fewer than two arguments.
        """
        self.name = name
        self.arguments = arguments
        self.negate = negate
        if name == 'position':
            self.__filter = self.__position
            self.value = value
        elif name == 'axis':
            self.__filter = self.__axis
            self.node_test = arguments
            self.value = value
        elif name in ('starts-with', 'contains'):
            if name == 'starts-with':
                self.__filter = self.__starts_with
            else:
                self.__filter = self.__contains
            args = self._split_arguments(arguments)
            if len(args) < 2:
                raise ValueError("%s() requires two arguments" % name)
            # the second argument is a quoted literal: drop the quotes
            literal = args[1][1:-1]
            if args[0].startswith('@'):
                self.arguments = (True, args[0][1:], literal)
            else:
                self.arguments = (False, args[0], literal)
        elif name == 'string-length':
            self.__filter = self.__string_length
            args = self._split_arguments(arguments)
            if args[0].startswith('@'):
                self.arguments = (True, args[0][1:])
            else:
                self.arguments = (False, args[0])
            self.value = int(value)
        else:
            raise NotImplementedError("This XPath function is not implemented")

    @staticmethod
    def _split_arguments(arguments):
        """Split a raw argument string on commas and strip each piece."""
        return [piece.strip() for piece in (arguments or '').split(',')]

    @staticmethod
    def _attribute_value(node, attribute_name):
        """Return the node's attribute value, or None if it has none."""
        try:
            return node[attribute_name]
        except KeyError:
            return None

    def __call__(self, node):
        matched = self.__filter(node)
        if self.negate:
            return not matched
        return matched

    def __position(self, node):
        # position among the preceding siblings of the same kind, 1-based
        if isinstance(node, BeautifulSoup.NavigableString):
            actual_position = len(node.findPreviousSiblings(text=True)) + 1
        else:
            actual_position = len(node.findPreviousSiblings(node.name)) + 1
        return actual_position == self.value

    def __axis(self, node):
        if self.node_test == 'text()':
            return node.string == self.value
        children = node.findAll(self.node_test, recursive=False)
        if self.value is None:
            # bare existence test: any matching child will do
            return len(children) > 0
        for child in children:
            if child.string == self.value:
                return True
        return False

    def __starts_with(self, node):
        if self.arguments[0]:
            # this is an attribute
            first = self._attribute_value(node, self.arguments[1])
            if first is not None:
                return first.startswith(self.arguments[2])
        elif self.arguments[1] == 'text()':
            first = node.contents and node.contents[0]
            if isinstance(first, BeautifulSoup.NavigableString):
                return first.startswith(self.arguments[2])
        return False

    def __contains(self, node):
        if self.arguments[0]:
            # this is an attribute
            first = self._attribute_value(node, self.arguments[1])
            if first is not None:
                return self.arguments[2] in first
        elif self.arguments[1] == 'text()':
            first = node.contents and node.contents[0]
            if isinstance(first, BeautifulSoup.NavigableString):
                return self.arguments[2] in first
        return False

    def __string_length(self, node):
        # stays None for an argument that is neither @attr nor text()
        value = None
        if self.arguments[0]:
            # this is an attribute
            value = self._attribute_value(node, self.arguments[1])
        elif self.arguments[1] == 'text()':
            value = node.string
        if value is not None:
            return len(value) == self.value
        return False
# Caches of already parsed paths and steps, keyed by their source text.
_paths = {}
_steps = {}


def get_path(path):
    """Utility for eliminating repeated parsings of the same paths and steps.

    Return the `Path` for `path`, building it on first use. Parsed steps are
    shared between paths through a second cache keyed by the step text.
    """
    # 'in' instead of dict.has_key(), which no longer exists in Python 3
    if path not in _paths:
        p = Path(path, parse=False)
        for step in tokenize_path(path):
            if step not in _steps:
                _steps[step] = PathStep(step)
            p.steps.append(_steps[step])
        _paths[path] = p
    return _paths[path]