"""
parser.http.bsoupxpath module (imdb.parser.http package).

This module provides XPath support for BeautifulSoup.

Copyright 2008 H. Turgut Uyar

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

__author__ = 'H. Turgut Uyar '
__docformat__ = 'restructuredtext'


import re
import string
import _bsoup as BeautifulSoup


# XPath related enumerations and constants

AXIS_ANCESTOR = 'ancestor'
AXIS_ATTRIBUTE = 'attribute'
AXIS_CHILD = 'child'
AXIS_DESCENDANT = 'descendant'
AXIS_FOLLOWING = 'following'
AXIS_FOLLOWING_SIBLING = 'following-sibling'
AXIS_PRECEDING_SIBLING = 'preceding-sibling'

AXES = (AXIS_ANCESTOR, AXIS_ATTRIBUTE, AXIS_CHILD, AXIS_DESCENDANT,
        AXIS_FOLLOWING, AXIS_FOLLOWING_SIBLING, AXIS_PRECEDING_SIBLING)

XPATH_FUNCTIONS = ('starts-with', 'string-length', 'contains')


def tokenize_path(path):
    """Tokenize a location path into location steps. Return the list of steps.

    If two steps are separated by a double slash, the double slashes are part
    of the second step. If they are separated by only one slash, the slash is
    not included in any of the steps.

    Slashes inside single-quoted strings are not treated as separators.
    A leading single slash produces no step of its own. A path that ends
    with a single slash produces an empty final step (and an empty path
    produces one empty step); callers are expected to skip empty steps.
    """
    steps = []
    step_start = 0
    in_string = False
    length = len(path)
    i = 0
    while i < length:
        char = path[i]
        if char == "'":
            in_string = not in_string
        elif char == '/' and not in_string:
            if i > 0:
                steps.append(path[step_start:i])
            # slicing (instead of indexing) keeps a trailing slash from
            # raising an IndexError
            if path[i + 1:i + 2] == '/':
                # double slash: it belongs to the step that follows
                step_start = i
                i += 1
            else:
                step_start = i + 1
        i += 1
    steps.append(path[step_start:])
    return steps


class Path:
    """A location path.

    `path` is the textual XPath expression. `steps` is the list of
    `PathStep` instances that are applied in order by `apply`.
    """

    def __init__(self, path, parse=True):
        """Store the path and, unless `parse` is false, parse its steps."""
        self.path = path
        self.steps = []
        if parse:
            if path.startswith('/') and not path.startswith('//'):
                # if not on the descendant axis, remove the leading slash
                path = path[1:]
            for step in tokenize_path(path):
                # empty steps come from an empty path or a trailing slash
                if step:
                    self.steps.append(PathStep(step))

    def apply(self, node):
        """Apply the path to a node. Return the resulting list of nodes.

        Apply the steps in the path sequentially by sending the output of each
        step as input to the next step.
        """
        # FIXME: this should return a node SET, not a node LIST
        # or at least a list with no duplicates
        if self.path.startswith('/'):
            # for an absolute path, start from the root
            if not isinstance(node, BeautifulSoup.Tag) \
               or (node.name != '[document]'):
                node = node.findParent('[document]')
        nodes = [node]
        for step in self.steps:
            nodes = step.apply(nodes)
        return nodes


class PathStep:
    """A location step in a location path.

    A step consists of an optional axis, a node test and zero or more
    predicates. Parsing a step fills in:

    - `axis`: one of the names in `AXES`
    - `node_test`: the tag name, attribute name or ``text()``
    - `soup_args`: keyword arguments passed to the BeautifulSoup search calls
    - `index`: index to pick from the search result, or None
    - `checkers`: callables used to filter the nodes that were found

    The steps ``.`` and ``..`` are handled specially and set none of these.
    """

    AXIS_PATTERN = r"""(%s)::|@""" % '|'.join(AXES)
    NODE_TEST_PATTERN = r"""\w+(\(\))?"""
    PREDICATE_PATTERN = r"""\[(.*?)\]"""
    # groups: 1 = axis token ("name::" or "@"), 2 = axis name,
    #         3 = node test, 5 = all predicates including brackets
    LOCATION_STEP_PATTERN = r"""(%s)?(%s)((%s)*)""" \
        % (AXIS_PATTERN, NODE_TEST_PATTERN, PREDICATE_PATTERN)

    _re_location_step = re.compile(LOCATION_STEP_PATTERN)

    # The argument of not() is matched greedily so that it may itself
    # contain parentheses, e.g. not(contains(@class, 'x')) or
    # not(text()='x'). A lazy match would stop at the first ")".
    PREDICATE_NOT_PATTERN = r"""not\((.*)\)"""
    # groups: 1 = axis token, 2 = axis name, 3 = node test,
    #         5 = "='value'" part, 6 = value
    PREDICATE_AXIS_PATTERN = r"""(%s)?(%s)(='(.*?)')?""" \
        % (AXIS_PATTERN, NODE_TEST_PATTERN)
    # groups: 1 = function name, 2 = argument list,
    #         4 = "=value" part, 5 = value
    PREDICATE_FUNCTION_PATTERN = r"""(%s)\(([^,]+(,\s*[^,]+)*)?\)(=(.*))?""" \
        % '|'.join(XPATH_FUNCTIONS)

    _re_predicate_not = re.compile(PREDICATE_NOT_PATTERN)
    _re_predicate_axis = re.compile(PREDICATE_AXIS_PATTERN)
    _re_predicate_function = re.compile(PREDICATE_FUNCTION_PATTERN)

    def __init__(self, step):
        """Parse the textual location step.

        Raise ValueError if the step does not look like a location step and
        NotImplementedError if one of its predicates is not supported.
        """
        self.step = step
        if (step == '.') or (step == '..'):
            return

        if step[:2] == '//':
            default_axis = AXIS_DESCENDANT
            step = step[2:]
        else:
            default_axis = AXIS_CHILD

        step_match = self._re_location_step.match(step)
        if step_match is None:
            raise ValueError("Invalid location step: %r" % (self.step,))

        # determine the axis
        axis = step_match.group(1)
        if axis is None:
            self.axis = default_axis
        elif axis == '@':
            self.axis = AXIS_ATTRIBUTE
        else:
            self.axis = step_match.group(2)

        self.soup_args = {}
        self.index = None

        self.node_test = step_match.group(3)
        if self.node_test == 'text()':
            self.soup_args['text'] = True
        else:
            self.soup_args['name'] = self.node_test

        self.checkers = []
        predicates = step_match.group(5)
        if predicates is not None:
            # strip the outermost brackets and split "a][b" into "a", "b"
            predicates = [p for p in predicates[1:-1].split('][') if p]
            for predicate in predicates:
                checker = self.__parse_predicate(predicate)
                if checker is not None:
                    self.checkers.append(checker)

    def __parse_predicate(self, predicate):
        """Parse the predicate. Return a callable that can be used to filter
        nodes. Update `self.soup_args` to take advantage of BeautifulSoup
        search features.

        Return None when the predicate could be expressed completely through
        `self.soup_args` and `self.index`. Raise NotImplementedError for
        predicates that are not supported.
        """
        try:
            position = int(predicate)
        except ValueError:
            pass
        else:
            if self.axis == AXIS_DESCENDANT:
                return PredicateFilter('position', value=position)
            # use the search limit feature instead of a checker
            self.soup_args['limit'] = position
            self.index = position - 1
            return None

        if predicate == "last()":
            self.index = -1
            return None

        negate_match = self._re_predicate_not.match(predicate)
        negate = negate_match is not None
        if negate:
            predicate = negate_match.group(1)

        function_match = self._re_predicate_function.match(predicate)
        if function_match:
            name = function_match.group(1)
            arguments = function_match.group(2)
            # group 5 is nested in the optional group 4, so it is None
            # exactly when no "=value" part was given
            value = function_match.group(5)
            return PredicateFilter(name, arguments, value, negate=negate)

        axis_match = self._re_predicate_axis.match(predicate)
        if axis_match:
            axis_token = axis_match.group(1)
            if axis_token is None:
                axis = AXIS_CHILD
            elif axis_token == '@':
                axis = AXIS_ATTRIBUTE
            else:
                # explicit "name::" form; use the bare axis name
                axis = axis_match.group(2)

            if axis == AXIS_ATTRIBUTE:
                # use the attribute search feature instead of a checker
                attribute_name = axis_match.group(3)
                if axis_match.group(5) is not None:
                    # NOTE(review): the negation is ignored when a value
                    # is given, i.e. not(@a='x') behaves like @a='x'.
                    attribute_value = axis_match.group(6)
                elif not negate:
                    attribute_value = True
                else:
                    attribute_value = None
                attrs = self.soup_args.setdefault('attrs', {})
                attrs[attribute_name] = attribute_value
                return None
            elif axis == AXIS_CHILD:
                node_test = axis_match.group(3)
                node_value = axis_match.group(6)
                return PredicateFilter('axis', node_test, value=node_value,
                                       negate=negate)

        raise NotImplementedError("This predicate is not implemented")

    def apply(self, nodes):
        """Apply the step to a list of nodes. Return the list of nodes for the
        next step.
        """
        if self.step == '.':
            return nodes
        elif self.step == '..':
            return [node.parent for node in nodes]

        result = []
        for node in nodes:
            if self.axis == AXIS_CHILD:
                found = node.findAll(recursive=False, **self.soup_args)
            elif self.axis == AXIS_DESCENDANT:
                found = node.findAll(recursive=True, **self.soup_args)
            elif self.axis == AXIS_ATTRIBUTE:
                try:
                    found = [node[self.node_test]]
                except KeyError:
                    found = []
            elif self.axis == AXIS_FOLLOWING_SIBLING:
                found = node.findNextSiblings(**self.soup_args)
            elif self.axis == AXIS_PRECEDING_SIBLING:
                # TODO: make sure that the result is reverse ordered
                found = node.findPreviousSiblings(**self.soup_args)
            elif self.axis == AXIS_FOLLOWING:
                # find the last descendant of this node
                last = node
                while (not isinstance(last, BeautifulSoup.NavigableString)) \
                      and (len(last.contents) > 0):
                    last = last.contents[-1]
                found = last.findAllNext(**self.soup_args)
            elif self.axis == AXIS_ANCESTOR:
                found = node.findParents(**self.soup_args)

            # this should only be active if there is a position predicate
            # and the axis is not 'descendant'
            if self.index is not None:
                if found:
                    if len(found) > self.index:
                        found = [found[self.index]]
                    else:
                        found = []

            if found:
                for checker in self.checkers:
                    # a list (not a lazy filter object) is needed so that
                    # the result behaves the same on every Python version
                    found = [item for item in found if checker(item)]
                result.extend(found)

        return result


class PredicateFilter:
    """A callable class for filtering nodes.

    `name` selects the kind of check: ``position``, ``axis`` or one of the
    supported XPath functions. An instance is called with a node and returns
    whether the node passes the check; the result is inverted if `negate`
    is true.
    """

    def __init__(self, name, arguments=None, value=None, negate=False):
        """Select the check for `name` and prepare its arguments.

        Raise NotImplementedError if `name` is not a supported check.
        """
        self.name = name
        self.arguments = arguments
        self.negate = negate

        if name == 'position':
            self.__filter = self.__position
            self.value = value
        elif name == 'axis':
            self.__filter = self.__axis
            self.node_test = arguments
            self.value = value
        elif name in ('starts-with', 'contains'):
            if name == 'starts-with':
                self.__filter = self.__starts_with
            else:
                self.__filter = self.__contains
            args = [arg.strip() for arg in arguments.split(',')]
            # arguments become (is_attribute, subject, quoted text without
            # its surrounding quotes)
            if args[0][0] == '@':
                self.arguments = (True, args[0][1:], args[1][1:-1])
            else:
                self.arguments = (False, args[0], args[1][1:-1])
        elif name == 'string-length':
            self.__filter = self.__string_length
            args = [arg.strip() for arg in arguments.split(',')]
            # arguments become (is_attribute, subject)
            if args[0][0] == '@':
                self.arguments = (True, args[0][1:])
            else:
                self.arguments = (False, args[0])
            self.value = int(value)
        else:
            raise NotImplementedError("This XPath function is not implemented")

    def __call__(self, node):
        """Return whether `node` passes the check, honoring `negate`."""
        if self.negate:
            return not self.__filter(node)
        else:
            return self.__filter(node)

    def __position(self, node):
        """Check the 1-based position of the node among its same-kind
        preceding siblings."""
        if isinstance(node, BeautifulSoup.NavigableString):
            actual_position = len(node.findPreviousSiblings(text=True)) + 1
        else:
            actual_position = len(node.findPreviousSiblings(node.name)) + 1
        return actual_position == self.value

    def __axis(self, node):
        """Check the text of the node or the existence/text of a child."""
        if self.node_test == 'text()':
            return node.string == self.value
        else:
            children = node.findAll(self.node_test, recursive=False)
            if len(children) > 0 and self.value is None:
                return True
            for child in children:
                if child.string == self.value:
                    return True
            return False

    def __starts_with(self, node):
        """Check whether an attribute or the leading text starts with the
        expected text."""
        if self.arguments[0]:
            # this is an attribute
            # (Tag.has_key is kept: "in" on a Tag tests its children)
            attribute_name = self.arguments[1]
            if node.has_key(attribute_name):
                first = node[attribute_name]
                return first.startswith(self.arguments[2])
        elif self.arguments[1] == 'text()':
            first = node.contents and node.contents[0]
            if isinstance(first, BeautifulSoup.NavigableString):
                return first.startswith(self.arguments[2])
        return False

    def __contains(self, node):
        """Check whether an attribute or the leading text contains the
        expected text."""
        if self.arguments[0]:
            # this is an attribute
            # (Tag.has_key is kept: "in" on a Tag tests its children)
            attribute_name = self.arguments[1]
            if node.has_key(attribute_name):
                first = node[attribute_name]
                return self.arguments[2] in first
        elif self.arguments[1] == 'text()':
            first = node.contents and node.contents[0]
            if isinstance(first, BeautifulSoup.NavigableString):
                return self.arguments[2] in first
        return False

    def __string_length(self, node):
        """Check whether an attribute or the text has the expected length."""
        # stays None when the subject is neither an attribute nor text(),
        # in which case the node does not pass the check
        value = None
        if self.arguments[0]:
            # this is an attribute
            # (Tag.has_key is kept: "in" on a Tag tests its children)
            attribute_name = self.arguments[1]
            if node.has_key(attribute_name):
                value = node[attribute_name]
        elif self.arguments[1] == 'text()':
            value = node.string
        if value is not None:
            return len(value) == self.value
        return False


# caches of parsed paths and steps, keyed by their textual form
_paths = {}
_steps = {}


def get_path(path):
    """Utility for eliminating repeated parsings of the same paths and steps.

    Return the `Path` instance for the textual path, building it (and any
    of its steps that have not been seen before) on first use.
    """
    if path not in _paths:
        p = Path(path, parse=False)
        for step in tokenize_path(path):
            if not step:
                # empty steps come from an empty path or a trailing slash
                continue
            if step not in _steps:
                _steps[step] = PathStep(step)
            p.steps.append(_steps[step])
        _paths[path] = p
    return _paths[path]