Merge pull request #482 from JackDandy/feature/UpdateBSoup

Update Beautiful Soup to 4.4.0 (r390).
2025-01-05 17:43:37 +00:00 · 2015-08-11 17:07:30 +01:00 · 2015-08-11 17:07:30 +01:00 · 8b42315bde
commit 8b42315bde
parent 8eee9d8699 b0525a0dd6
9 changed files with 292 additions and 84 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -5,6 +5,7 @@
 * Add search crawler exclusions
 * Fix saving default show list group on add new show options page
 * Remove legacy anime split home option from anime settings tab (new option located in general/interface tab)
+* Update Beautiful Soup 4.3.2 to 4.4.0 (r390)


 ### 0.10.0 (2015-08-06 11:05:00 UTC)
--- a/lib/bs4/__init__.py
+++ b/lib/bs4/__init__.py
@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.3.2"
-__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
+__version__ = "4.4.0"
+__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
 __license__ = "MIT"

 __all__ = ['BeautifulSoup']
@ -77,10 +77,11 @@ class BeautifulSoup(Tag):

    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

-    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"

    def __init__(self, markup="", features=None, builder=None,
-                 parse_only=None, from_encoding=None, **kwargs):
+                 parse_only=None, from_encoding=None, exclude_encodings=None,
+                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser."""
@ -156,8 +157,13 @@ class BeautifulSoup(Tag):
            builder = builder_class()
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
+                if builder.is_xml:
+                    markup_type = "XML"
+                else:
+                    markup_type = "HTML"
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
-                    parser=builder.NAME))
+                    parser=builder.NAME,
+                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
@ -202,7 +208,8 @@ class BeautifulSoup(Tag):

        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
-            self.builder.prepare_markup(markup, from_encoding)):
+             self.builder.prepare_markup(
+                 markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
@ -215,6 +222,16 @@ class BeautifulSoup(Tag):
        self.markup = None
        self.builder.soup = None

+    def __copy__(self):
+        return type(self)(self.encode(), builder=self.builder)
+
+    def __getstate__(self):
+        # Frequently a tree builder can't be pickled.
+        d = dict(self.__dict__)
+        if 'builder' in d and not self.builder.picklable:
+            del d['builder']
+        return d
+
    def _feed(self):
        # Convert the document to Unicode.
        self.builder.reset()
@ -241,9 +258,7 @@ class BeautifulSoup(Tag):

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
-        navigable = subclass(s)
-        navigable.setup()
-        return navigable
+        return subclass(s)

    def insert_before(self, successor):
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
@ -302,14 +317,49 @@ class BeautifulSoup(Tag):
    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree."""
        parent = parent or self.currentTag
-        most_recent_element = most_recent_element or self._most_recent_element
-        o.setup(parent, most_recent_element)
+        previous_element = most_recent_element or self._most_recent_element
+
+        next_element = previous_sibling = next_sibling = None
+        if isinstance(o, Tag):
+            next_element = o.next_element
+            next_sibling = o.next_sibling
+            previous_sibling = o.previous_sibling
+            if not previous_element:
+                previous_element = o.previous_element
+
+        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

-        if most_recent_element is not None:
-            most_recent_element.next_element = o
        self._most_recent_element = o
        parent.contents.append(o)

+        if parent.next_sibling:
+            # This node is being inserted into an element that has
+            # already been parsed. Deal with any dangling references.
+            index = parent.contents.index(o)
+            if index == 0:
+                previous_element = parent
+                previous_sibling = None
+            else:
+                previous_element = previous_sibling = parent.contents[index-1]
+            if index == len(parent.contents)-1:
+                next_element = parent.next_sibling
+                next_sibling = None
+            else:
+                next_element = next_sibling = parent.contents[index+1]
+
+            o.previous_element = previous_element
+            if previous_element:
+                previous_element.next_element = o
+            o.next_element = next_element
+            if next_element:
+                next_element.previous_element = o
+            o.next_sibling = next_sibling
+            if next_sibling:
+                next_sibling.previous_sibling = o
+            o.previous_sibling = previous_sibling
+            if previous_sibling:
+                previous_sibling.next_sibling = o
+
    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
--- a/lib/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@ -85,6 +85,7 @@ class TreeBuilder(object):
    features = []

    is_xml = False
+    picklable = False
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@ -2,6 +2,7 @@ __all__ = [
    'HTML5TreeBuilder',
    ]

+from pdb import set_trace
 import warnings
 from bs4.builder import (
    PERMISSIVE,
@ -9,7 +10,10 @@ from bs4.builder import (
    HTML_5,
    HTMLTreeBuilder,
    )
-from bs4.element import NamespacedAttribute
+from bs4.element import (
+    NamespacedAttribute,
+    whitespace_re,
+)
 import html5lib
 from html5lib.constants import namespaces
 from bs4.element import (
@ -26,9 +30,16 @@ class HTML5TreeBuilder(HTMLTreeBuilder):

    features = [NAME, PERMISSIVE, HTML_5, HTML]

-    def prepare_markup(self, markup, user_specified_encoding):
+    def prepare_markup(self, markup, user_specified_encoding,
+                       document_declared_encoding=None, exclude_encodings=None):
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
+
+        # document_declared_encoding and exclude_encodings aren't used
+        # ATM because the html5lib TreeBuilder doesn't use
+        # UnicodeDammit.
+        if exclude_encodings:
+            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
@ -103,7 +114,13 @@ class AttrList(object):
    def __iter__(self):
        return list(self.attrs.items()).__iter__()
    def __setitem__(self, name, value):
-        "set attr", name, value
+        # If this attribute is a multi-valued attribute for this element,
+        # turn its value into a list.
+        list_attr = HTML5TreeBuilder.cdata_list_attributes
+        if (name in list_attr['*']
+            or (self.element.name in list_attr
+                and name in list_attr[self.element.name])):
+            value = whitespace_re.split(value)
        self.element[name] = value
    def items(self):
        return list(self.attrs.items())
@ -180,6 +197,7 @@ class Element(html5lib.treebuilders._base.Node):
        return AttrList(self.element)

    def setAttributes(self, attributes):
+
        if attributes is not None and len(attributes) > 0:

            converted_attributes = []
@ -226,6 +244,9 @@ class Element(html5lib.treebuilders._base.Node):

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
+        # print "MOVE", self.element.contents
+        # print "FROM", self.element
+        # print "TO", new_parent.element
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
@ -244,17 +265,28 @@ class Element(html5lib.treebuilders._base.Node):
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
-        append_after = new_parent.element.contents
+        append_after = new_parent_element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
-            first_child.previous_element = new_parents_last_descendant
+            if new_parents_last_descendant:
+                first_child.previous_element = new_parents_last_descendant
+            else:
+                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
+            if new_parents_last_descendant:
+                new_parents_last_descendant.next_element = first_child
+            else:
+                new_parent_element.next_element = first_child
+            if new_parents_last_child:
+                new_parents_last_child.next_sibling = first_child

            # Fix the last child's next_element and next_sibling
            last_child = to_append[-1]
            last_child.next_element = new_parents_last_descendant_next_element
+            if new_parents_last_descendant_next_element:
+                new_parents_last_descendant_next_element.previous_element = last_child
            last_child.next_sibling = None

        for child in to_append:
@ -265,6 +297,10 @@ class Element(html5lib.treebuilders._base.Node):
        element.contents = []
        element.next_element = final_next_element

+        # print "DONE WITH MOVE"
+        # print "FROM", self.element
+        # print "TO", new_parent_element
+
    def cloneNode(self):
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@ -4,10 +4,16 @@ __all__ = [
    'HTMLParserTreeBuilder',
    ]

-from HTMLParser import (
-    HTMLParser,
-    HTMLParseError,
-    )
+from HTMLParser import HTMLParser
+
+try:
+    from HTMLParser import HTMLParseError
+except ImportError, e:
+    # HTMLParseError is removed in Python 3.5. Since it can never be
+    # thrown in 3.5, we can just define our own class as a placeholder.
+    class HTMLParseError(Exception):
+        pass
+
 import sys
 import warnings

@ -20,8 +26,10 @@ import warnings
 # strict=True works well on Python 3.2.2.
 major, minor, release = sys.version_info[:3]
 CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
+CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
 CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4

+
 from bs4.element import (
    CData,
    Comment,
@ -119,18 +127,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
 class HTMLParserTreeBuilder(HTMLTreeBuilder):

    is_xml = False
+    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    def __init__(self, *args, **kwargs):
-        if CONSTRUCTOR_TAKES_STRICT:
+        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            kwargs['convert_charrefs'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
+                       document_declared_encoding=None, exclude_encodings=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
@ -141,7 +150,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
-        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
+        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
+                               exclude_encodings=exclude_encodings)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)
--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@ -31,6 +31,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    is_xml = True

    NAME = "lxml-xml"
+    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
    features = [NAME, LXML, XML, FAST, PERMISSIVE]
@ -77,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
+                       exclude_encodings=None,
                       document_declared_encoding=None):
        """
        :yield: A series of 4-tuples.
@ -102,7 +104,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        # the document as each one in turn.
        is_html = not self.is_xml
        try_encodings = [user_specified_encoding, document_declared_encoding]
-        detector = EncodingDetector(markup, try_encodings, is_html)
+        detector = EncodingDetector(
+            markup, try_encodings, is_html, exclude_encodings)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

--- a/lib/bs4/dammit.py
+++ b/lib/bs4/dammit.py
@ -3,10 +3,11 @@

 This library converts a bytestream to Unicode through any means
 necessary. It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It works best on XML and XML, but it does not rewrite the
+Feed Parser. It works best on XML and HTML, but it does not rewrite the
 XML or HTML to reflect a new encoding; that's the tree builder's job.
 """

+from pdb import set_trace
 import codecs
 from htmlentitydefs import codepoint2name
 import re
@ -212,8 +213,11 @@ class EncodingDetector:

    5. Windows-1252.
    """
-    def __init__(self, markup, override_encodings=None, is_html=False):
+    def __init__(self, markup, override_encodings=None, is_html=False,
+                 exclude_encodings=None):
        self.override_encodings = override_encodings or []
+        exclude_encodings = exclude_encodings or []
+        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None
@ -224,6 +228,8 @@ class EncodingDetector:
    def _usable(self, encoding, tried):
        if encoding is not None:
            encoding = encoding.lower()
+            if encoding in self.exclude_encodings:
+                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
@ -266,6 +272,9 @@ class EncodingDetector:
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies."""
        encoding = None
+        if isinstance(data, unicode):
+            # Unicode data cannot have a byte-order mark.
+            return data, encoding
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
@ -306,7 +315,7 @@ class EncodingDetector:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0].decode(
-                'ascii')
+                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
@ -331,13 +340,14 @@ class UnicodeDammit:
        ]

    def __init__(self, markup, override_encodings=[],
-                 smart_quotes_to=None, is_html=False):
+                 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

-        self.detector = EncodingDetector(markup, override_encodings, is_html)
+        self.detector = EncodingDetector(
+            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, unicode) or markup == '':
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@ -33,12 +33,21 @@ def diagnose(data):

    if 'lxml' in basic_parsers:
        basic_parsers.append(["lxml", "xml"])
-        from lxml import etree
-        print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+        try:
+            from lxml import etree
+            print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+        except ImportError, e:
+            print (
+                "lxml is not installed or couldn't be imported.")
+

    if 'html5lib' in basic_parsers:
-        import html5lib
-        print "Found html5lib version %s" % html5lib.__version__
+        try:
+            import html5lib
+            print "Found html5lib version %s" % html5lib.__version__
+        except ImportError, e:
+            print (
+                "html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@ -1,3 +1,4 @@
+from pdb import set_trace
 import collections
 import re
 import sys
@ -185,24 +186,40 @@ class PageElement(object):
            return self.HTML_FORMATTERS.get(
                name, HTMLAwareEntitySubstitution.substitute_xml)

-    def setup(self, parent=None, previous_element=None):
+    def setup(self, parent=None, previous_element=None, next_element=None,
+              previous_sibling=None, next_sibling=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
+
        self.previous_element = previous_element
        if previous_element is not None:
            self.previous_element.next_element = self
-        self.next_element = None
-        self.previous_sibling = None
-        self.next_sibling = None
-        if self.parent is not None and self.parent.contents:
-            self.previous_sibling = self.parent.contents[-1]
+
+        self.next_element = next_element
+        if self.next_element:
+            self.next_element.previous_element = self
+
+        self.next_sibling = next_sibling
+        if self.next_sibling:
+            self.next_sibling.previous_sibling = self
+
+        if (not previous_sibling
+            and self.parent is not None and self.parent.contents):
+            previous_sibling = self.parent.contents[-1]
+
+        self.previous_sibling = previous_sibling
+        if previous_sibling:
            self.previous_sibling.next_sibling = self

    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3

    def replace_with(self, replace_with):
+        if not self.parent:
+            raise ValueError(
+                "Cannot replace one element with another when the"
+                "element to be replaced is not part of a tree.")
        if replace_with is self:
            return
        if replace_with is self.parent:
@ -216,6 +233,10 @@ class PageElement(object):

    def unwrap(self):
        my_parent = self.parent
+        if not self.parent:
+            raise ValueError(
+                "Cannot replace an element with its contents when that"
+                "element is not part of a tree.")
        my_index = self.parent.index(self)
        self.extract()
        for child in reversed(self.contents[:]):
@ -240,17 +261,20 @@ class PageElement(object):
        last_child = self._last_descendant()
        next_element = last_child.next_element

-        if self.previous_element is not None:
+        if (self.previous_element is not None and
+            self.previous_element != next_element):
            self.previous_element.next_element = next_element
-        if next_element is not None:
+        if next_element is not None and next_element != self.previous_element:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
-        if self.previous_sibling is not None:
+        if (self.previous_sibling is not None
+            and self.previous_sibling != self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
-        if self.next_sibling is not None:
+        if (self.next_sibling is not None
+            and self.next_sibling != self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
@ -478,6 +502,10 @@ class PageElement(object):
    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

+        if text is None and 'string' in kwargs:
+            text = kwargs['string']
+            del kwargs['string']
+
        if isinstance(name, SoupStrainer):
            strainer = name
        else:
@ -558,7 +586,7 @@ class PageElement(object):
    #     |                           Attribute
    #    Tag
    attribselect_re = re.compile(
-        r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
+        r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
        r'=?"?(?P<value>[^\]"]*)"?\]$'
        )

@ -654,11 +682,17 @@ class NavigableString(unicode, PageElement):
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
-            return unicode.__new__(cls, value)
-        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+            u = unicode.__new__(cls, value)
+        else:
+            u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+        u.setup()
+        return u

    def __copy__(self):
-        return self
+        """A copy of a NavigableString has the same contents and class
+        as the original, but it is not connected to the parse tree.
+        """
+        return type(self)(self)

    def __getnewargs__(self):
        return (unicode(self),)
@ -759,9 +793,12 @@ class Tag(PageElement):
        self.prefix = prefix
        if attrs is None:
            attrs = {}
-        elif attrs and builder.cdata_list_attributes:
-            attrs = builder._replace_cdata_list_attribute_values(
-                self.name, attrs)
+        elif attrs:
+            if builder is not None and builder.cdata_list_attributes:
+                attrs = builder._replace_cdata_list_attribute_values(
+                    self.name, attrs)
+            else:
+                attrs = dict(attrs)
        else:
            attrs = dict(attrs)
        self.attrs = attrs
@ -778,6 +815,18 @@ class Tag(PageElement):

    parserClass = _alias("parser_class")  # BS3

+    def __copy__(self):
+        """A copy of a Tag is a new Tag, unconnected to the parse tree.
+        Its contents are a copy of the old Tag's contents.
+        """
+        clone = type(self)(None, self.builder, self.name, self.namespace,
+                           self.nsprefix, self.attrs)
+        for attr in ('can_be_empty_element', 'hidden'):
+            setattr(clone, attr, getattr(self, attr))
+        for child in self.contents:
+            clone.append(child.__copy__())
+        return clone
+
    @property
    def is_empty_element(self):
        """Is this tag an empty-element tag? (aka a self-closing tag)
@ -971,15 +1020,25 @@ class Tag(PageElement):
        as defined in __eq__."""
        return not self == other

-    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+    def __repr__(self, encoding="unicode-escape"):
        """Renders this tag as a string."""
-        return self.encode(encoding)
+        if PY3K:
+            # "The return value must be a string object", i.e. Unicode
+            return self.decode()
+        else:
+            # "The return value must be a string object", i.e. a bytestring.
+            # By convention, the return value of __repr__ should also be
+            # an ASCII string.
+            return self.encode(encoding)

    def __unicode__(self):
        return self.decode()

    def __str__(self):
-        return self.encode()
+        if PY3K:
+            return self.decode()
+        else:
+            return self.encode()

    if PY3K:
        __str__ = __repr__ = __unicode__
@ -1103,12 +1162,18 @@ class Tag(PageElement):
                       formatter="minimal"):
        """Renders the contents of this tag as a Unicode string.

+        :param indent_level: Each line of the rendering will be
+           indented this many spaces.
+
        :param eventual_encoding: The tag is destined to be
           encoded into this encoding. This method is _not_
           responsible for performing that encoding. This information
           is passed in so that it can be substituted in if the
           document contains a <META> tag that mentions the document's
           encoding.
+
+        :param formatter: The output formatter responsible for converting
+           entities to Unicode characters.
        """
        # First off, turn a string formatter into a function. This
        # will stop the lookup from happening over and over again.
@ -1137,7 +1202,17 @@ class Tag(PageElement):
    def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
-        """Renders the contents of this tag as a bytestring."""
+        """Renders the contents of this tag as a bytestring.
+
+        :param indent_level: Each line of the rendering will be
+           indented this many spaces.
+
+        :param eventual_encoding: The bytestring will be in this encoding.
+
+        :param formatter: The output formatter responsible for converting
+           entities to Unicode characters.
+        """
+
        contents = self.decode_contents(indent_level, encoding, formatter)
        return contents.encode(encoding)

@ -1201,7 +1276,14 @@ class Tag(PageElement):

    _selector_combinators = ['>', '+', '~']
    _select_debug = False
-    def select(self, selector, _candidate_generator=None):
+    def select_one(self, selector):
+        """Perform a CSS selection operation on the current element."""
+        value = self.select(selector, limit=1)
+        if value:
+            return value[0]
+        return None
+
+    def select(self, selector, _candidate_generator=None, limit=None):
        """Perform a CSS selection operation on the current element."""

        # Remove whitespace directly after the grouping operator ','
@ -1272,35 +1354,38 @@ class Tag(PageElement):
                            "A pseudo-class must be prefixed with a tag name.")
                    pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
                    found = []
-                    if pseudo_attributes is not None:
+                    if pseudo_attributes is None:
+                        pseudo_type = pseudo
+                        pseudo_value = None
+                    else:
                        pseudo_type, pseudo_value = pseudo_attributes.groups()
-                        if pseudo_type == 'nth-of-type':
-                            try:
-                                pseudo_value = int(pseudo_value)
-                            except:
-                                raise NotImplementedError(
-                                    'Only numeric values are currently supported for the nth-of-type pseudo-class.')
-                            if pseudo_value < 1:
-                                raise ValueError(
-                                    'nth-of-type pseudo-class value must be at least 1.')
-                            class Counter(object):
-                                def __init__(self, destination):
-                                    self.count = 0
-                                    self.destination = destination
-
-                                def nth_child_of_type(self, tag):
-                                    self.count += 1
-                                    if self.count == self.destination:
-                                        return True
-                                    if self.count > self.destination:
-                                        # Stop the generator that's sending us
-                                        # these things.
-                                        raise StopIteration()
-                                    return False
-                            checker = Counter(pseudo_value).nth_child_of_type
-                        else:
+                    if pseudo_type == 'nth-of-type':
+                        try:
+                            pseudo_value = int(pseudo_value)
+                        except:
                            raise NotImplementedError(
-                                'Only the following pseudo-classes are implemented: nth-of-type.')
+                                'Only numeric values are currently supported for the nth-of-type pseudo-class.')
+                        if pseudo_value < 1:
+                            raise ValueError(
+                                'nth-of-type pseudo-class value must be at least 1.')
+                        class Counter(object):
+                            def __init__(self, destination):
+                                self.count = 0
+                                self.destination = destination
+
+                            def nth_child_of_type(self, tag):
+                                self.count += 1
+                                if self.count == self.destination:
+                                    return True
+                                if self.count > self.destination:
+                                    # Stop the generator that's sending us
+                                    # these things.
+                                    raise StopIteration()
+                                return False
+                        checker = Counter(pseudo_value).nth_child_of_type
+                    else:
+                        raise NotImplementedError(
+                            'Only the following pseudo-classes are implemented: nth-of-type.')

                elif token == '*':
                    # Star selector -- matches everything
@ -1376,6 +1461,7 @@ class Tag(PageElement):
                else:
                    _use_candidate_generator = _candidate_generator

+                count = 0
                for tag in current_context:
                    if self._select_debug:
                        print "    Running candidate generator on %s %s" % (
@ -1400,6 +1486,8 @@ class Tag(PageElement):
                                # don't include it in the context more than once.
                                new_context.append(candidate)
                                new_context_ids.add(id(candidate))
+                                if limit and len(new_context) >= limit:
+                                    break
                        elif self._select_debug:
                            print "     FAILURE %s %s" % (candidate.name, repr(candidate.attrs))