diff --git a/CHANGES.md b/CHANGES.md index 201255d4..08ae9294 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,6 +5,7 @@ * Add search crawler exclusions * Fix saving default show list group on add new show options page * Remove legacy anime split home option from anime settings tab (new option located in general/interface tab) +* Update Beautiful Soup 4.3.2 to 4.4.0 (r390) ### 0.10.0 (2015-08-06 11:05:00 UTC) diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py index a53048d6..d35f765b 100644 --- a/lib/bs4/__init__.py +++ b/lib/bs4/__init__.py @@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.3.2" -__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" +__version__ = "4.4.0" +__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" __license__ = "MIT" __all__ = ['BeautifulSoup'] @@ -77,10 +77,11 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' - NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" def __init__(self, markup="", features=None, builder=None, - parse_only=None, from_encoding=None, **kwargs): + parse_only=None, from_encoding=None, exclude_encodings=None, + **kwargs): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" @@ -156,8 +157,13 @@ class BeautifulSoup(Tag): builder = builder_class() if not (original_features == builder.NAME or original_features in builder.ALTERNATE_NAMES): + if builder.is_xml: + markup_type = "XML" + else: + markup_type = "HTML" warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( - parser=builder.NAME)) + parser=builder.NAME, + markup_type=markup_type)) self.builder = builder self.is_xml = builder.is_xml @@ -202,7 +208,8 @@ class BeautifulSoup(Tag): for (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) in ( - self.builder.prepare_markup(markup, from_encoding)): + self.builder.prepare_markup( + markup, from_encoding, exclude_encodings=exclude_encodings)): self.reset() try: self._feed() @@ -215,6 +222,16 @@ class BeautifulSoup(Tag): self.markup = None self.builder.soup = None + def __copy__(self): + return type(self)(self.encode(), builder=self.builder) + + def __getstate__(self): + # Frequently a tree builder can't be pickled. + d = dict(self.__dict__) + if 'builder' in d and not self.builder.picklable: + del d['builder'] + return d + def _feed(self): # Convert the document to Unicode. 
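The constructor-level changes above (the %(markup_type)s placeholder in the "no parser specified" warning, the new exclude_encodings argument, and the __copy__/__getstate__ hooks on BeautifulSoup) are easiest to see in a short usage sketch. The byte string and the excluded encoding below are invented for illustration; only the keyword argument and the copy behaviour come from this diff.

    import copy
    from bs4 import BeautifulSoup

    data = b'<p>caf\xe9</p>'
    # Naming a parser avoids the (now more informative) warning, which tells
    # you whether an HTML or XML parser was picked automatically.
    # exclude_encodings rules candidate encodings *out* rather than forcing
    # one in; it is forwarded to the builder's prepare_markup().
    soup = BeautifulSoup(data, 'html.parser',
                         exclude_encodings=['iso-8859-7'])

    # __copy__ re-encodes the document and re-parses it with the same
    # builder, giving an independent tree; __getstate__ drops builders that
    # declare themselves non-picklable so pickling does not choke on them.
    soup2 = copy.copy(soup)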
self.builder.reset() @@ -241,9 +258,7 @@ class BeautifulSoup(Tag): def new_string(self, s, subclass=NavigableString): """Create a new NavigableString associated with this soup.""" - navigable = subclass(s) - navigable.setup() - return navigable + return subclass(s) def insert_before(self, successor): raise NotImplementedError("BeautifulSoup objects don't support insert_before().") @@ -302,14 +317,49 @@ class BeautifulSoup(Tag): def object_was_parsed(self, o, parent=None, most_recent_element=None): """Add an object to the parse tree.""" parent = parent or self.currentTag - most_recent_element = most_recent_element or self._most_recent_element - o.setup(parent, most_recent_element) + previous_element = most_recent_element or self._most_recent_element + + next_element = previous_sibling = next_sibling = None + if isinstance(o, Tag): + next_element = o.next_element + next_sibling = o.next_sibling + previous_sibling = o.previous_sibling + if not previous_element: + previous_element = o.previous_element + + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) - if most_recent_element is not None: - most_recent_element.next_element = o self._most_recent_element = o parent.contents.append(o) + if parent.next_sibling: + # This node is being inserted into an element that has + # already been parsed. Deal with any dangling references. + index = parent.contents.index(o) + if index == 0: + previous_element = parent + previous_sibling = None + else: + previous_element = previous_sibling = parent.contents[index-1] + if index == len(parent.contents)-1: + next_element = parent.next_sibling + next_sibling = None + else: + next_element = next_sibling = parent.contents[index+1] + + o.previous_element = previous_element + if previous_element: + previous_element.next_element = o + o.next_element = next_element + if next_element: + next_element.previous_element = o + o.next_sibling = next_sibling + if next_sibling: + next_sibling.previous_sibling = o + o.previous_sibling = previous_sibling + if previous_sibling: + previous_sibling.next_sibling = o + def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent instance of the given tag. If inclusivePop is false, pops the tag diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py index 820bc801..f8fce568 100644 --- a/lib/bs4/builder/__init__.py +++ b/lib/bs4/builder/__init__.py @@ -85,6 +85,7 @@ class TreeBuilder(object): features = [] is_xml = False + picklable = False preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. 
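The new picklable flag on TreeBuilder pairs with the __getstate__ hook added earlier: builders that declare themselves picklable are kept when a soup is pickled, the rest are silently dropped from the state dict. A minimal sketch, assuming the bundled html.parser builder (which sets picklable = True later in this diff) and a small document:

    import pickle
    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p>pickle me</p>', 'html.parser')
    # HTMLParserTreeBuilder is marked picklable, so the builder attribute
    # survives the round trip; a non-picklable builder would simply be
    # removed by BeautifulSoup.__getstate__() before pickling.
    restored = pickle.loads(pickle.dumps(soup))
    print(restored.p.string)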
diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py index 60135752..ab5793c1 100644 --- a/lib/bs4/builder/_html5lib.py +++ b/lib/bs4/builder/_html5lib.py @@ -2,6 +2,7 @@ __all__ = [ 'HTML5TreeBuilder', ] +from pdb import set_trace import warnings from bs4.builder import ( PERMISSIVE, @@ -9,7 +10,10 @@ from bs4.builder import ( HTML_5, HTMLTreeBuilder, ) -from bs4.element import NamespacedAttribute +from bs4.element import ( + NamespacedAttribute, + whitespace_re, +) import html5lib from html5lib.constants import namespaces from bs4.element import ( @@ -26,9 +30,16 @@ class HTML5TreeBuilder(HTMLTreeBuilder): features = [NAME, PERMISSIVE, HTML_5, HTML] - def prepare_markup(self, markup, user_specified_encoding): + def prepare_markup(self, markup, user_specified_encoding, + document_declared_encoding=None, exclude_encodings=None): # Store the user-specified encoding for use later on. self.user_specified_encoding = user_specified_encoding + + # document_declared_encoding and exclude_encodings aren't used + # ATM because the html5lib TreeBuilder doesn't use + # UnicodeDammit. + if exclude_encodings: + warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") yield (markup, None, None, False) # These methods are defined by Beautiful Soup. @@ -103,7 +114,13 @@ class AttrList(object): def __iter__(self): return list(self.attrs.items()).__iter__() def __setitem__(self, name, value): - "set attr", name, value + # If this attribute is a multi-valued attribute for this element, + # turn its value into a list. + list_attr = HTML5TreeBuilder.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + value = whitespace_re.split(value) self.element[name] = value def items(self): return list(self.attrs.items()) @@ -180,6 +197,7 @@ class Element(html5lib.treebuilders._base.Node): return AttrList(self.element) def setAttributes(self, attributes): + if attributes is not None and len(attributes) > 0: converted_attributes = [] @@ -226,6 +244,9 @@ class Element(html5lib.treebuilders._base.Node): def reparentChildren(self, new_parent): """Move all of this tag's children into another tag.""" + # print "MOVE", self.element.contents + # print "FROM", self.element + # print "TO", new_parent.element element = self.element new_parent_element = new_parent.element # Determine what this tag's next_element will be once all the children @@ -244,17 +265,28 @@ class Element(html5lib.treebuilders._base.Node): new_parents_last_descendant_next_element = new_parent_element.next_element to_append = element.contents - append_after = new_parent.element.contents + append_after = new_parent_element.contents if len(to_append) > 0: # Set the first child's previous_element and previous_sibling # to elements within the new parent first_child = to_append[0] - first_child.previous_element = new_parents_last_descendant + if new_parents_last_descendant: + first_child.previous_element = new_parents_last_descendant + else: + first_child.previous_element = new_parent_element first_child.previous_sibling = new_parents_last_child + if new_parents_last_descendant: + new_parents_last_descendant.next_element = first_child + else: + new_parent_element.next_element = first_child + if new_parents_last_child: + new_parents_last_child.next_sibling = first_child # Fix the last child's next_element and next_sibling last_child = to_append[-1] last_child.next_element = 
new_parents_last_descendant_next_element + if new_parents_last_descendant_next_element: + new_parents_last_descendant_next_element.previous_element = last_child last_child.next_sibling = None for child in to_append: @@ -265,6 +297,10 @@ class Element(html5lib.treebuilders._base.Node): element.contents = [] element.next_element = final_next_element + # print "DONE WITH MOVE" + # print "FROM", self.element + # print "TO", new_parent_element + def cloneNode(self): tag = self.soup.new_tag(self.element.name, self.namespace) node = Element(tag, self.soup, self.namespace) diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py index 7f3ae735..0101d647 100644 --- a/lib/bs4/builder/_htmlparser.py +++ b/lib/bs4/builder/_htmlparser.py @@ -4,10 +4,16 @@ __all__ = [ 'HTMLParserTreeBuilder', ] -from HTMLParser import ( - HTMLParser, - HTMLParseError, - ) +from HTMLParser import HTMLParser + +try: + from HTMLParser import HTMLParseError +except ImportError, e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): + pass + import sys import warnings @@ -20,8 +26,10 @@ import warnings # strict=True works well on Python 3.2.2. major, minor, release = sys.version_info[:3] CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 + from bs4.element import ( CData, Comment, @@ -119,18 +127,19 @@ class BeautifulSoupHTMLParser(HTMLParser): class HTMLParserTreeBuilder(HTMLTreeBuilder): is_xml = False + picklable = True NAME = HTMLPARSER features = [NAME, HTML, STRICT] def __init__(self, *args, **kwargs): - if CONSTRUCTOR_TAKES_STRICT: + if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: kwargs['strict'] = False if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: kwargs['convert_charrefs'] = False self.parser_args = (args, kwargs) def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): + document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be @@ -141,7 +150,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): return try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) + dammit = UnicodeDammit(markup, try_encodings, is_html=True, + exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters) diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py index b0bc8a03..9e8f88fb 100644 --- a/lib/bs4/builder/_lxml.py +++ b/lib/bs4/builder/_lxml.py @@ -31,6 +31,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): is_xml = True NAME = "lxml-xml" + ALTERNATE_NAMES = ["xml"] # Well, it's permissive by XML parser standards. features = [NAME, LXML, XML, FAST, PERMISSIVE] @@ -77,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): return (None, tag) def prepare_markup(self, markup, user_specified_encoding=None, + exclude_encodings=None, document_declared_encoding=None): """ :yield: A series of 4-tuples. @@ -102,7 +104,8 @@ class LXMLTreeBuilderForXML(TreeBuilder): # the document as each one in turn. 
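The _htmlparser.py hunk above keeps the builder importable on Python 3.5 (where HTMLParseError is gone) and stops passing the deprecated strict argument on 3.3, while the _lxml.py hunk registers "xml" explicitly as an ALTERNATE_NAME for the lxml-xml builder, so the short name resolves directly rather than only through the generic "xml" feature. A small sketch, assuming lxml is installed (the markup is invented):

    from bs4 import BeautifulSoup

    # Either feature name should select LXMLTreeBuilderForXML.
    xml_soup = BeautifulSoup('<a><b>text</b></a>', 'xml')
    print(xml_soup.b.string)    # -> text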
is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] - detector = EncodingDetector(markup, try_encodings, is_html) + detector = EncodingDetector( + markup, try_encodings, is_html, exclude_encodings) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False) diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py index 59640b7c..317ad6d7 100644 --- a/lib/bs4/dammit.py +++ b/lib/bs4/dammit.py @@ -3,10 +3,11 @@ This library converts a bytestream to Unicode through any means necessary. It is heavily based on code from Mark Pilgrim's Universal -Feed Parser. It works best on XML and XML, but it does not rewrite the +Feed Parser. It works best on XML and HTML, but it does not rewrite the XML or HTML to reflect a new encoding; that's the tree builder's job. """ +from pdb import set_trace import codecs from htmlentitydefs import codepoint2name import re @@ -212,8 +213,11 @@ class EncodingDetector: 5. Windows-1252. """ - def __init__(self, markup, override_encodings=None, is_html=False): + def __init__(self, markup, override_encodings=None, is_html=False, + exclude_encodings=None): self.override_encodings = override_encodings or [] + exclude_encodings = exclude_encodings or [] + self.exclude_encodings = set([x.lower() for x in exclude_encodings]) self.chardet_encoding = None self.is_html = is_html self.declared_encoding = None @@ -224,6 +228,8 @@ class EncodingDetector: def _usable(self, encoding, tried): if encoding is not None: encoding = encoding.lower() + if encoding in self.exclude_encodings: + return False if encoding not in tried: tried.add(encoding) return True @@ -266,6 +272,9 @@ class EncodingDetector: def strip_byte_order_mark(cls, data): """If a byte-order mark is present, strip it and return the encoding it implies.""" encoding = None + if isinstance(data, unicode): + # Unicode data cannot have a byte-order mark. + return data, encoding if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ and (data[2:4] != '\x00\x00'): encoding = 'utf-16be' @@ -306,7 +315,7 @@ class EncodingDetector: declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) if declared_encoding_match is not None: declared_encoding = declared_encoding_match.groups()[0].decode( - 'ascii') + 'ascii', 'replace') if declared_encoding: return declared_encoding.lower() return None @@ -331,13 +340,14 @@ class UnicodeDammit: ] def __init__(self, markup, override_encodings=[], - smart_quotes_to=None, is_html=False): + smart_quotes_to=None, is_html=False, exclude_encodings=[]): self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False self.is_html = is_html - self.detector = EncodingDetector(markup, override_encodings, is_html) + self.detector = EncodingDetector( + markup, override_encodings, is_html, exclude_encodings) # Short-circuit if the data is in Unicode to begin with. 
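In dammit.py, EncodingDetector and UnicodeDammit both grow an exclude_encodings parameter, the byte-order-mark check now short-circuits on Unicode input, and a non-ASCII declared charset no longer blows up (it is decoded with 'replace'). A usage sketch for the exclusion hook; the byte string and excluded encoding are invented, not from this diff:

    from bs4 import UnicodeDammit

    data = b'Sacr\xe9 bleu!'
    # If detection keeps guessing an encoding you know is wrong, exclude it
    # and the detector moves on to the next candidate instead.
    dammit = UnicodeDammit(data, exclude_encodings=['iso-8859-7'])
    print(dammit.original_encoding)
    print(dammit.unicode_markup)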
if isinstance(markup, unicode) or markup == '': diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py index 4d0b00af..1b719830 100644 --- a/lib/bs4/diagnose.py +++ b/lib/bs4/diagnose.py @@ -33,12 +33,21 @@ def diagnose(data): if 'lxml' in basic_parsers: basic_parsers.append(["lxml", "xml"]) - from lxml import etree - print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) + try: + from lxml import etree + print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) + except ImportError, e: + print ( + "lxml is not installed or couldn't be imported.") + if 'html5lib' in basic_parsers: - import html5lib - print "Found html5lib version %s" % html5lib.__version__ + try: + import html5lib + print "Found html5lib version %s" % html5lib.__version__ + except ImportError, e: + print ( + "html5lib is not installed or couldn't be imported.") if hasattr(data, 'read'): data = data.read() diff --git a/lib/bs4/element.py b/lib/bs4/element.py index ff716dfb..c70ad5a0 100644 --- a/lib/bs4/element.py +++ b/lib/bs4/element.py @@ -1,3 +1,4 @@ +from pdb import set_trace import collections import re import sys @@ -185,24 +186,40 @@ class PageElement(object): return self.HTML_FORMATTERS.get( name, HTMLAwareEntitySubstitution.substitute_xml) - def setup(self, parent=None, previous_element=None): + def setup(self, parent=None, previous_element=None, next_element=None, + previous_sibling=None, next_sibling=None): """Sets up the initial relations between this element and other elements.""" self.parent = parent + self.previous_element = previous_element if previous_element is not None: self.previous_element.next_element = self - self.next_element = None - self.previous_sibling = None - self.next_sibling = None - if self.parent is not None and self.parent.contents: - self.previous_sibling = self.parent.contents[-1] + + self.next_element = next_element + if self.next_element: + self.next_element.previous_element = self + + self.next_sibling = next_sibling + if self.next_sibling: + self.next_sibling.previous_sibling = self + + if (not previous_sibling + and self.parent is not None and self.parent.contents): + previous_sibling = self.parent.contents[-1] + + self.previous_sibling = previous_sibling + if previous_sibling: self.previous_sibling.next_sibling = self nextSibling = _alias("next_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3 def replace_with(self, replace_with): + if not self.parent: + raise ValueError( + "Cannot replace one element with another when the" + "element to be replaced is not part of a tree.") if replace_with is self: return if replace_with is self.parent: @@ -216,6 +233,10 @@ class PageElement(object): def unwrap(self): my_parent = self.parent + if not self.parent: + raise ValueError( + "Cannot replace an element with its contents when that" + "element is not part of a tree.") my_index = self.parent.index(self) self.extract() for child in reversed(self.contents[:]): @@ -240,17 +261,20 @@ class PageElement(object): last_child = self._last_descendant() next_element = last_child.next_element - if self.previous_element is not None: + if (self.previous_element is not None and + self.previous_element != next_element): self.previous_element.next_element = next_element - if next_element is not None: + if next_element is not None and next_element != self.previous_element: next_element.previous_element = self.previous_element self.previous_element = None last_child.next_element = None self.parent = None - if self.previous_sibling is not None: + if 
(self.previous_sibling is not None + and self.previous_sibling != self.next_sibling): self.previous_sibling.next_sibling = self.next_sibling - if self.next_sibling is not None: + if (self.next_sibling is not None + and self.next_sibling != self.previous_sibling): self.next_sibling.previous_sibling = self.previous_sibling self.previous_sibling = self.next_sibling = None return self @@ -478,6 +502,10 @@ class PageElement(object): def _find_all(self, name, attrs, text, limit, generator, **kwargs): "Iterates over a generator looking for things that match." + if text is None and 'string' in kwargs: + text = kwargs['string'] + del kwargs['string'] + if isinstance(name, SoupStrainer): strainer = name else: @@ -558,7 +586,7 @@ class PageElement(object): # | Attribute # Tag attribselect_re = re.compile( - r'^(?P[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P\w+)(?P[=~\|\^\$\*]?)' + + r'^(?P[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P[\w-]+)(?P[=~\|\^\$\*]?)' + r'=?"?(?P[^\]"]*)"?\]$' ) @@ -654,11 +682,17 @@ class NavigableString(unicode, PageElement): how to handle non-ASCII characters. """ if isinstance(value, unicode): - return unicode.__new__(cls, value) - return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u = unicode.__new__(cls, value) + else: + u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u.setup() + return u def __copy__(self): - return self + """A copy of a NavigableString has the same contents and class + as the original, but it is not connected to the parse tree. + """ + return type(self)(self) def __getnewargs__(self): return (unicode(self),) @@ -759,9 +793,12 @@ class Tag(PageElement): self.prefix = prefix if attrs is None: attrs = {} - elif attrs and builder.cdata_list_attributes: - attrs = builder._replace_cdata_list_attribute_values( - self.name, attrs) + elif attrs: + if builder is not None and builder.cdata_list_attributes: + attrs = builder._replace_cdata_list_attribute_values( + self.name, attrs) + else: + attrs = dict(attrs) else: attrs = dict(attrs) self.attrs = attrs @@ -778,6 +815,18 @@ class Tag(PageElement): parserClass = _alias("parser_class") # BS3 + def __copy__(self): + """A copy of a Tag is a new Tag, unconnected to the parse tree. + Its contents are a copy of the old Tag's contents. + """ + clone = type(self)(None, self.builder, self.name, self.namespace, + self.nsprefix, self.attrs) + for attr in ('can_be_empty_element', 'hidden'): + setattr(clone, attr, getattr(self, attr)) + for child in self.contents: + clone.append(child.__copy__()) + return clone + @property def is_empty_element(self): """Is this tag an empty-element tag? (aka a self-closing tag) @@ -971,15 +1020,25 @@ class Tag(PageElement): as defined in __eq__.""" return not self == other - def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): + def __repr__(self, encoding="unicode-escape"): """Renders this tag as a string.""" - return self.encode(encoding) + if PY3K: + # "The return value must be a string object", i.e. Unicode + return self.decode() + else: + # "The return value must be a string object", i.e. a bytestring. + # By convention, the return value of __repr__ should also be + # an ASCII string. + return self.encode(encoding) def __unicode__(self): return self.decode() def __str__(self): - return self.encode() + if PY3K: + return self.decode() + else: + return self.encode() if PY3K: __str__ = __repr__ = __unicode__ @@ -1103,12 +1162,18 @@ class Tag(PageElement): formatter="minimal"): """Renders the contents of this tag as a Unicode string. 
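The element.py changes above add several user-visible behaviours: find()/find_all() accept string= as an alias for the old text= argument, attribute CSS selectors accept hyphenated attribute names, replace_with()/unwrap() now raise ValueError on elements that are no longer part of a tree, and Tag/NavigableString gain real __copy__ implementations. A sketch with toy markup (invented for illustration):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<b>one</b> <i>two</i>', 'html.parser')

    # `string` is accepted as an alias for the older `text` argument:
    print(soup.find_all(string='two'))      # same result as text='two'

    # replace_with()/unwrap() now fail loudly on an element that has
    # already been detached from its tree:
    b = soup.b.extract()
    try:
        b.unwrap()
    except ValueError as err:
        print(err)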
+ :param indent_level: Each line of the rendering will be + indented this many spaces. + :param eventual_encoding: The tag is destined to be encoded into this encoding. This method is _not_ responsible for performing that encoding. This information is passed in so that it can be substituted in if the document contains a tag that mentions the document's encoding. + + :param formatter: The output formatter responsible for converting + entities to Unicode characters. """ # First off, turn a string formatter into a function. This # will stop the lookup from happening over and over again. @@ -1137,7 +1202,17 @@ class Tag(PageElement): def encode_contents( self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): - """Renders the contents of this tag as a bytestring.""" + """Renders the contents of this tag as a bytestring. + + :param indent_level: Each line of the rendering will be + indented this many spaces. + + :param eventual_encoding: The bytestring will be in this encoding. + + :param formatter: The output formatter responsible for converting + entities to Unicode characters. + """ + contents = self.decode_contents(indent_level, encoding, formatter) return contents.encode(encoding) @@ -1201,7 +1276,14 @@ class Tag(PageElement): _selector_combinators = ['>', '+', '~'] _select_debug = False - def select(self, selector, _candidate_generator=None): + def select_one(self, selector): + """Perform a CSS selection operation on the current element.""" + value = self.select(selector, limit=1) + if value: + return value[0] + return None + + def select(self, selector, _candidate_generator=None, limit=None): """Perform a CSS selection operation on the current element.""" # Remove whitespace directly after the grouping operator ',' @@ -1272,35 +1354,38 @@ class Tag(PageElement): "A pseudo-class must be prefixed with a tag name.") pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) found = [] - if pseudo_attributes is not None: + if pseudo_attributes is None: + pseudo_type = pseudo + pseudo_value = None + else: pseudo_type, pseudo_value = pseudo_attributes.groups() - if pseudo_type == 'nth-of-type': - try: - pseudo_value = int(pseudo_value) - except: - raise NotImplementedError( - 'Only numeric values are currently supported for the nth-of-type pseudo-class.') - if pseudo_value < 1: - raise ValueError( - 'nth-of-type pseudo-class value must be at least 1.') - class Counter(object): - def __init__(self, destination): - self.count = 0 - self.destination = destination - - def nth_child_of_type(self, tag): - self.count += 1 - if self.count == self.destination: - return True - if self.count > self.destination: - # Stop the generator that's sending us - # these things. - raise StopIteration() - return False - checker = Counter(pseudo_value).nth_child_of_type - else: + if pseudo_type == 'nth-of-type': + try: + pseudo_value = int(pseudo_value) + except: raise NotImplementedError( - 'Only the following pseudo-classes are implemented: nth-of-type.') + 'Only numeric values are currently supported for the nth-of-type pseudo-class.') + if pseudo_value < 1: + raise ValueError( + 'nth-of-type pseudo-class value must be at least 1.') + class Counter(object): + def __init__(self, destination): + self.count = 0 + self.destination = destination + + def nth_child_of_type(self, tag): + self.count += 1 + if self.count == self.destination: + return True + if self.count > self.destination: + # Stop the generator that's sending us + # these things. 
+ raise StopIteration() + return False + checker = Counter(pseudo_value).nth_child_of_type + else: + raise NotImplementedError( + 'Only the following pseudo-classes are implemented: nth-of-type.') elif token == '*': # Star selector -- matches everything @@ -1376,6 +1461,7 @@ class Tag(PageElement): else: _use_candidate_generator = _candidate_generator + count = 0 for tag in current_context: if self._select_debug: print " Running candidate generator on %s %s" % ( @@ -1400,6 +1486,8 @@ class Tag(PageElement): # don't include it in the context more than once. new_context.append(candidate) new_context_ids.add(id(candidate)) + if limit and len(new_context) >= limit: + break elif self._select_debug: print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
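The selector changes close out the upgrade: select() grows a limit argument and the new select_one() returns the first match or None. A usage sketch with invented markup:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        '<div><p class="a">1</p><p class="a">2</p><p class="a">3</p></div>',
        'html.parser')

    print(soup.select_one('p.a'))              # first match, or None if absent
    print(len(soup.select('p.a', limit=2)))    # stops collecting after 2 -> 2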