Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439).

2025-04-07 03:01:07 +00:00 · 2017-01-27 14:20:52 +00:00 · 2017-01-27 14:20:52 +00:00 · 447c7e0fa8
commit 447c7e0fa8
parent 9b497e6df2
9 changed files with 301 additions and 93 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -9,6 +9,7 @@
 * Change improve add show search results by comparing search term to an additional unidecoded result set
 * Change webserver startup to correctly use xheaders in reverse proxy or load balance set-ups
 * Update backports_abc 0.4 to 0.5
+* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439)


 [develop changelog]
--- a/lib/bs4/init.py
+++ b/lib/bs4/init.py
@ -5,26 +5,31 @@ http://www.crummy.com/software/BeautifulSoup/

 Beautiful Soup uses a pluggable XML or HTML parser to parse a
 (possibly invalid) document into a tree representation. Beautiful Soup
-provides provides methods and Pythonic idioms that make it easy to
-navigate, search, and modify the parse tree.
+provides methods and Pythonic idioms that make it easy to navigate,
+search, and modify the parse tree.

-Beautiful Soup works with Python 2.6 and up. It works better if lxml
+Beautiful Soup works with Python 2.7 and up. It works better if lxml
 and/or html5lib is installed.

 For more than you ever wanted to know about Beautiful Soup, see the
 documentation:
 http://www.crummy.com/software/BeautifulSoup/bs4/doc/
+
 """

+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.4.0"
-__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
+__version__ = "4.5.3"
+__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
 __license__ = "MIT"

 __all__ = ['BeautifulSoup']

 import os
 import re
+import traceback
 import warnings

 from .builder import builder_registry, ParserRejectedMarkup
@ -77,7 +82,7 @@ class BeautifulSoup(Tag):

    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

-    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
@ -137,6 +142,10 @@ class BeautifulSoup(Tag):
        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

+        if from_encoding and isinstance(markup, unicode):
+            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
+            from_encoding = None
+
        if len(kwargs) > 0:
            arg = kwargs.keys().pop()
            raise TypeError(
@ -161,19 +170,29 @@ class BeautifulSoup(Tag):
                    markup_type = "XML"
                else:
                    markup_type = "HTML"
+
+                caller = traceback.extract_stack()[0]
+                filename = caller[0]
+                line_number = caller[1]
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
+                    filename=filename,
+                    line_number=line_number,
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
+        self.known_xml = self.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
-        elif len(markup) <= 256:
+        elif len(markup) <= 256 and (
+                (isinstance(markup, bytes) and not b'<' in markup)
+                or (isinstance(markup, unicode) and not u'<' in markup)
+        ):
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
@ -195,16 +214,10 @@ class BeautifulSoup(Tag):
                if isinstance(markup, unicode):
                    markup = markup.encode("utf8")
                warnings.warn(
-                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
-            if markup[:5] == "http:" or markup[:6] == "https:":
-                # TODO: This is ugly but I couldn't get it to work in
-                # Python 3 otherwise.
-                if ((isinstance(markup, bytes) and not b' ' in markup)
-                    or (isinstance(markup, unicode) and not u' ' in markup)):
-                    if isinstance(markup, unicode):
-                        markup = markup.encode("utf8")
-                    warnings.warn(
-                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
+                    '"%s" looks like a filename, not markup. You should'
+                    'probably open this file and pass the filehandle into'
+                    'Beautiful Soup.' % markup)
+            self._check_markup_is_url(markup)

        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
@ -223,15 +236,52 @@ class BeautifulSoup(Tag):
        self.builder.soup = None

    def __copy__(self):
-        return type(self)(self.encode(), builder=self.builder)
+        copy = type(self)(
+            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+        )
+
+        # Although we encoded the tree to UTF-8, that may not have
+        # been the encoding of the original markup. Set the copy's
+        # .original_encoding to reflect the original object's
+        # .original_encoding.
+        copy.original_encoding = self.original_encoding
+        return copy

    def __getstate__(self):
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
-            del d['builder']
+            d['builder'] = None
        return d

+    @staticmethod
+    def _check_markup_is_url(markup):
+        """ 
+        Check if markup looks like it's actually a url and raise a warning 
+        if so. Markup can be unicode or str (py2) / bytes (py3).
+        """
+        if isinstance(markup, bytes):
+            space = b' '
+            cant_start_with = (b"http:", b"https:")
+        elif isinstance(markup, unicode):
+            space = u' '
+            cant_start_with = (u"http:", u"https:")
+        else:
+            return
+
+        if any(markup.startswith(prefix) for prefix in cant_start_with):
+            if not space in markup:
+                if isinstance(markup, bytes):
+                    decoded_markup = markup.decode('utf-8', 'replace')
+                else:
+                    decoded_markup = markup
+                warnings.warn(
+                    '"%s" looks like a URL. Beautiful Soup is not an'
+                    ' HTTP client. You should probably use an HTTP client like'
+                    ' requests to get the document behind the URL, and feed'
+                    ' that document to Beautiful Soup.' % decoded_markup
+                )
+
    def _feed(self):
        # Convert the document to Unicode.
        self.builder.reset()
@ -335,7 +385,18 @@ class BeautifulSoup(Tag):
        if parent.next_sibling:
            # This node is being inserted into an element that has
            # already been parsed. Deal with any dangling references.
-            index = parent.contents.index(o)
+            index = len(parent.contents)-1
+            while index >= 0:
+                if parent.contents[index] is o:
+                    break
+                index -= 1
+            else:
+                raise ValueError(
+                    "Error building tree: supposedly %r was inserted "
+                    "into %r after the fact, but I don't see it!" % (
+                        o, parent
+                    )
+                )
            if index == 0:
                previous_element = parent
                previous_sibling = None
@ -387,7 +448,7 @@ class BeautifulSoup(Tag):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
-        SoupStrainer. You should proceed as if the tag had not occured
+        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
--- a/lib/bs4/builder/init.py
+++ b/lib/bs4/builder/init.py
@ -1,9 +1,13 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 from collections import defaultdict
 import itertools
 import sys
 from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
+    HTMLAwareEntitySubstitution,
    whitespace_re
    )

@ -227,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder):
    Such as which tags are empty-element tags.
    """

-    preserve_whitespace_tags = set(['pre', 'textarea'])
+    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@ -1,9 +1,12 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __all__ = [
    'HTML5TreeBuilder',
    ]

-from pdb import set_trace
 import warnings
+import re
 from bs4.builder import (
    PERMISSIVE,
    HTML,
@ -15,7 +18,10 @@ from bs4.element import (
    whitespace_re,
 )
 import html5lib
-from html5lib.constants import namespaces
+from html5lib.constants import (
+    namespaces,
+    prefixes,
+    )
 from bs4.element import (
    Comment,
    Doctype,
@ -23,6 +29,15 @@ from bs4.element import (
    Tag,
    )

+try:
+    # Pre-0.99999999
+    from html5lib.treebuilders import _base as treebuilder_base
+    new_html5lib = False
+except ImportError, e:
+    # 0.99999999 and up
+    from html5lib.treebuilders import base as treebuilder_base
+    new_html5lib = True
+
 class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

@ -47,7 +62,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-        doc = parser.parse(markup, encoding=self.user_specified_encoding)
+
+        extra_kwargs = dict()
+        if not isinstance(markup, unicode):
+            if new_html5lib:
+                extra_kwargs['override_encoding'] = self.user_specified_encoding
+            else:
+                extra_kwargs['encoding'] = self.user_specified_encoding
+        doc = parser.parse(markup, **extra_kwargs)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
@ -55,11 +77,17 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
-            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
+            original_encoding = parser.tokenizer.stream.charEncoding[0]
+            if not isinstance(original_encoding, basestring):
+                # In 0.99999999 and up, the encoding is an html5lib
+                # Encoding object. We want to use a string for compatibility
+                # with other tree builders.
+                original_encoding = original_encoding.name
+            doc.original_encoding = original_encoding

    def create_treebuilder(self, namespaceHTMLElements):
        self.underlying_builder = TreeBuilderForHtml5lib(
-            self.soup, namespaceHTMLElements)
+            namespaceHTMLElements, self.soup)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
@ -67,10 +95,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
        return u'<html><head></head><body>%s</body></html>' % fragment


-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):

-    def __init__(self, soup, namespaceHTMLElements):
-        self.soup = soup
+    def __init__(self, namespaceHTMLElements, soup=None):
+        if soup:
+            self.soup = soup
+        else:
+            from bs4 import BeautifulSoup
+            self.soup = BeautifulSoup("", "html.parser")
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
@ -93,7 +125,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
-        self.soup = BeautifulSoup("")
+        from bs4 import BeautifulSoup
+        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

@ -105,7 +138,57 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
        return self.soup

    def getFragment(self):
-        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
+        return treebuilder_base.TreeBuilder.getFragment(self).element
+
+    def testSerializer(self, element):
+        from bs4 import BeautifulSoup
+        rv = []
+        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
+
+        def serializeElement(element, indent=0):
+            if isinstance(element, BeautifulSoup):
+                pass
+            if isinstance(element, Doctype):
+                m = doctype_re.match(element)
+                if m:
+                    name = m.group(1)
+                    if m.lastindex > 1:
+                        publicId = m.group(2) or ""
+                        systemId = m.group(3) or m.group(4) or ""
+                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (' ' * indent, name, publicId, systemId))
+                    else:
+                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
+                else:
+                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
+            elif isinstance(element, Comment):
+                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
+            elif isinstance(element, NavigableString):
+                rv.append("|%s\"%s\"" % (' ' * indent, element))
+            else:
+                if element.namespace:
+                    name = "%s %s" % (prefixes[element.namespace],
+                                      element.name)
+                else:
+                    name = element.name
+                rv.append("|%s<%s>" % (' ' * indent, name))
+                if element.attrs:
+                    attributes = []
+                    for name, value in element.attrs.items():
+                        if isinstance(name, NamespacedAttribute):
+                            name = "%s %s" % (prefixes[name.namespace], name.name)
+                        if isinstance(value, list):
+                            value = " ".join(value)
+                        attributes.append((name, value))
+
+                    for name, value in sorted(attributes):
+                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+                indent += 2
+                for child in element.children:
+                    serializeElement(child, indent)
+        serializeElement(element, 0)
+
+        return "\n".join(rv)

 class AttrList(object):
    def __init__(self, element):
@ -137,9 +220,9 @@ class AttrList(object):
        return name in list(self.attrs.keys())


-class Element(html5lib.treebuilders._base.Node):
+class Element(treebuilder_base.Node):
    def __init__(self, element, soup, namespace):
-        html5lib.treebuilders._base.Node.__init__(self, element.name)
+        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace
@ -158,8 +241,10 @@ class Element(html5lib.treebuilders._base.Node):
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
+            node.parent = self
        else:
            child = node.element
+            node.parent = self

        if not isinstance(child, basestring) and child.parent is not None:
            node.element.extract()
@ -197,6 +282,8 @@ class Element(html5lib.treebuilders._base.Node):
                most_recent_element=most_recent_element)

    def getAttributes(self):
+        if isinstance(self.element, Comment):
+            return {}
        return AttrList(self.element)

    def setAttributes(self, attributes):
@ -224,11 +311,11 @@ class Element(html5lib.treebuilders._base.Node):
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
+        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
-            text = TextNode(self.soup.new_string(data), self.soup)
-            self.insertBefore(data, insertBefore)
+            self.insertBefore(text, insertBefore)
        else:
-            self.appendChild(data)
+            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self.element.index(refNode.element)
@ -250,6 +337,7 @@ class Element(html5lib.treebuilders._base.Node):
        # print "MOVE", self.element.contents
        # print "FROM", self.element
        # print "TO", new_parent.element
+
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
@ -268,7 +356,6 @@ class Element(html5lib.treebuilders._base.Node):
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
-        append_after = new_parent_element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
@ -285,12 +372,19 @@ class Element(html5lib.treebuilders._base.Node):
            if new_parents_last_child:
                new_parents_last_child.next_sibling = first_child

-            # Fix the last child's next_element and next_sibling
-            last_child = to_append[-1]
-            last_child.next_element = new_parents_last_descendant_next_element
+            # Find the very last element being moved. It is now the
+            # parent's last descendant. It has no .next_sibling and
+            # its .next_element is whatever the previous last
+            # descendant had.
+            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
+
+            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
            if new_parents_last_descendant_next_element:
-                new_parents_last_descendant_next_element.previous_element = last_child
-            last_child.next_sibling = None
+                # TODO: This code has no test coverage and I'm not sure
+                # how to get html5lib to go through this path, but it's
+                # just the other side of the previous line.
+                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
+            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
@ -324,7 +418,7 @@ class Element(html5lib.treebuilders._base.Node):

 class TextNode(Element):
    def __init__(self, element, soup):
-        html5lib.treebuilders._base.Node.__init__(self, None)
+        treebuilder_base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@ -1,5 +1,8 @@
 """Use the HTMLParser library to parse HTML files that aren't too bad."""

+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __all__ = [
    'HTMLParserTreeBuilder',
    ]
--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@ -1,3 +1,5 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
@ -12,6 +14,7 @@ from bs4.element import (
    Doctype,
    NamespacedAttribute,
    ProcessingInstruction,
+    XMLProcessingInstruction,
 )
 from bs4.builder import (
    FAST,
@ -29,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True
+    processing_instruction_class = XMLProcessingInstruction

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]
@ -87,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):

        Each 4-tuple represents a strategy for parsing the document.
        """
+        # Instead of using UnicodeDammit to convert the bytestring to
+        # Unicode using different encodings, use EncodingDetector to
+        # iterate over the encodings, and tell lxml to try to parse
+        # the document as each one in turn.
+        is_html = not self.is_xml
+        if is_html:
+            self.processing_instruction_class = ProcessingInstruction
+        else:
+            self.processing_instruction_class = XMLProcessingInstruction
+
        if isinstance(markup, unicode):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
@ -98,11 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

-        # Instead of using UnicodeDammit to convert the bytestring to
-        # Unicode using different encodings, use EncodingDetector to
-        # iterate over the encodings, and tell lxml to try to parse
-        # the document as each one in turn.
-        is_html = not self.is_xml
        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(
            markup, try_encodings, is_html, exclude_encodings)
@ -201,7 +210,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    def pi(self, target, data):
        self.soup.endData()
        self.soup.handle_data(target + ' ' + data)
-        self.soup.endData(ProcessingInstruction)
+        self.soup.endData(self.processing_instruction_class)

    def data(self, content):
        self.soup.handle_data(content)
@ -229,6 +238,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
+    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        return etree.HTMLParser
--- a/lib/bs4/dammit.py
+++ b/lib/bs4/dammit.py
@ -6,9 +6,10 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
 Feed Parser. It works best on XML and HTML, but it does not rewrite the
 XML or HTML to reflect a new encoding; that's the tree builder's job.
 """
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __license__ = "MIT"

-from pdb import set_trace
 import codecs
 from htmlentitydefs import codepoint2name
 import re
@ -309,7 +310,7 @@ class EncodingDetector:
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))
-            
+
        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
@ -346,7 +347,7 @@ class UnicodeDammit:
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
-
+        self.log = logging.getLogger(__name__)
        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

@ -376,9 +377,10 @@ class UnicodeDammit:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
-                    logging.warning(
+                    self.log.warning(
                            "Some characters could not be decoded, and were "
-                            "replaced with REPLACEMENT CHARACTER.")
+                            "replaced with REPLACEMENT CHARACTER."
+                    )
                    self.contains_replacement_characters = True
                    break

@ -734,7 +736,7 @@ class UnicodeDammit:
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
-        0xe1 : b'\xa1',     # á
+        0xe1 : b'\xa1',         # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@ -1,5 +1,7 @@
 """Diagnostic functions, mainly for use when doing tech support."""

+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __license__ = "MIT"

 import cProfile
@ -56,7 +58,8 @@ def diagnose(data):
        data = data.read()
    elif os.path.exists(data):
        print '"%s" looks like a filename. Reading data from the file.' % data
-        data = open(data).read()
+        with open(data) as fp:
+            data = fp.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@ -1,8 +1,10 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __license__ = "MIT"

-from pdb import set_trace
 import collections
 import re
+import shlex
 import sys
 import warnings
 from bs4.dammit import EntitySubstitution
@ -99,6 +101,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):

    preformatted_tags = set(["pre"])

+    preserve_whitespace_tags = set(['pre', 'textarea'])
+
    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        if (isinstance(ns, NavigableString)
@ -169,11 +173,19 @@ class PageElement(object):

        This is used when mapping a formatter name ("minimal") to an
        appropriate function (one that performs entity-substitution on
-        the contents of <script> and <style> tags, or not). It's
+        the contents of <script> and <style> tags, or not). It can be
        inefficient, but it should be called very rarely.
        """
+        if self.known_xml is not None:
+            # Most of the time we will have determined this when the
+            # document is parsed.
+            return self.known_xml
+
+        # Otherwise, it's likely that this element was created by
+        # direct invocation of the constructor from within the user's
+        # Python code.
        if self.parent is None:
-            # This is the top-level object. It should have .is_xml set
+            # This is the top-level object. It should have .known_xml set
            # from tree creation. If not, take a guess--BS is usually
            # used on HTML markup.
            return getattr(self, 'is_xml', False)
@ -637,7 +649,7 @@ class PageElement(object):
            return lambda el: el._attr_value_as_string(
                attribute, '').startswith(value)
        elif operator == '$':
-            # string represenation of `attribute` ends with `value`
+            # string representation of `attribute` ends with `value`
            return lambda el: el._attr_value_as_string(
                attribute, '').endswith(value)
        elif operator == '*':
@ -677,6 +689,11 @@ class NavigableString(unicode, PageElement):
    PREFIX = ''
    SUFFIX = ''

+    # We can't tell just by looking at a string whether it's contained
+    # in an XML document or an HTML document.
+
+    known_xml = None
+
    def __new__(cls, value):
        """Create a new NavigableString.

@ -743,10 +760,16 @@ class CData(PreformattedString):
    SUFFIX = u']]>'

 class ProcessingInstruction(PreformattedString):
+    """A SGML processing instruction."""

    PREFIX = u'<?'
    SUFFIX = u'>'

+class XMLProcessingInstruction(ProcessingInstruction):
+    """An XML processing instruction."""
+    PREFIX = u'<?'
+    SUFFIX = u'?>'
+
 class Comment(PreformattedString):

    PREFIX = u'<!--'
@ -781,7 +804,8 @@ class Tag(PageElement):
    """Represents a found HTML tag with its attributes and contents."""

    def __init__(self, parser=None, builder=None, name=None, namespace=None,
-                 prefix=None, attrs=None, parent=None, previous=None):
+                 prefix=None, attrs=None, parent=None, previous=None,
+                 is_xml=None):
        "Basic constructor."

        if parser is None:
@ -795,6 +819,14 @@ class Tag(PageElement):
        self.name = name
        self.namespace = namespace
        self.prefix = prefix
+        if builder is not None:
+            preserve_whitespace_tags = builder.preserve_whitespace_tags
+        else:
+            if is_xml:
+                preserve_whitespace_tags = []
+            else:
+                preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
+        self.preserve_whitespace_tags = preserve_whitespace_tags
        if attrs is None:
            attrs = {}
        elif attrs:
@ -805,6 +837,13 @@ class Tag(PageElement):
                attrs = dict(attrs)
        else:
            attrs = dict(attrs)
+
+        # If possible, determine ahead of time whether this tag is an
+        # XML tag.
+        if builder:
+            self.known_xml = builder.is_xml
+        else:
+            self.known_xml = is_xml
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
@ -824,7 +863,7 @@ class Tag(PageElement):
        Its contents are a copy of the old Tag's contents.
        """
        clone = type(self)(None, self.builder, self.name, self.namespace,
-                           self.nsprefix, self.attrs)
+                           self.nsprefix, self.attrs, is_xml=self._is_xml)
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(clone, attr, getattr(self, attr))
        for child in self.contents:
@ -997,7 +1036,7 @@ class Tag(PageElement):
                    tag_name, tag_name))
            return self.find(tag_name)
        # We special case contents to avoid recursion.
-        elif not tag.startswith("__") and not tag=="contents":
+        elif not tag.startswith("__") and not tag == "contents":
            return self.find(tag)
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, tag))
@ -1057,10 +1096,11 @@ class Tag(PageElement):

    def _should_pretty_print(self, indent_level):
        """Should this tag be pretty-printed?"""
+
        return (
-            indent_level is not None and
-            (self.name not in HTMLAwareEntitySubstitution.preformatted_tags
-             or self._is_xml))
+            indent_level is not None
+            and self.name not in self.preserve_whitespace_tags
+        )

    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
@ -1280,6 +1320,7 @@ class Tag(PageElement):

    _selector_combinators = ['>', '+', '~']
    _select_debug = False
+    quoted_colon = re.compile('"[^"]*:[^"]*"')
    def select_one(self, selector):
        """Perform a CSS selection operation on the current element."""
        value = self.select(selector, limit=1)
@ -1305,8 +1346,7 @@ class Tag(PageElement):
                if limit and len(context) >= limit:
                    break
            return context
-
-        tokens = selector.split()
+        tokens = shlex.split(selector)
        current_context = [self]

        if tokens[-1] in self._selector_combinators:
@ -1358,7 +1398,7 @@ class Tag(PageElement):
                    return classes.issubset(candidate.get('class', []))
                checker = classes_match

-            elif ':' in token:
+            elif ':' in token and not self.quoted_colon.search(token):
                # Pseudo-class
                tag_name, pseudo = token.split(':', 1)
                if tag_name == '':
@ -1389,11 +1429,8 @@ class Tag(PageElement):
                            self.count += 1
                            if self.count == self.destination:
                                return True
-                            if self.count > self.destination:
-                                # Stop the generator that's sending us
-                                # these things.
-                                raise StopIteration()
-                            return False
+                            else:
+                                return False
                    checker = Counter(pseudo_value).nth_child_of_type
                else:
                    raise NotImplementedError(
@ -1498,13 +1535,12 @@ class Tag(PageElement):
                            # don't include it in the context more than once.
                            new_context.append(candidate)
                            new_context_ids.add(id(candidate))
-                            if limit and len(new_context) >= limit:
-                                break
                    elif self._select_debug:
                        print "     FAILURE %s %s" % (candidate.name, repr(candidate.attrs))

-
            current_context = new_context
+        if limit and len(current_context) >= limit:
+            current_context = current_context[:limit]

        if self._select_debug:
            print "Final verdict:"
@ -1668,21 +1704,15 @@ class SoupStrainer(object):
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
-            if (isinstance(match_against, unicode)
-                and ' ' in match_against):
-                # A bit of a special case. If they try to match "foo
-                # bar" on a multivalue attribute's value, only accept
-                # the literal value "foo bar"
-                #
-                # XXX This is going to be pretty slow because we keep
-                # splitting match_against. But it shouldn't come up
-                # too often.
-                return (whitespace_re.split(match_against) == markup)
-            else:
-                for item in markup:
-                    if self._matches(item, match_against):
-                        return True
-                return False
+            for item in markup:
+                if self._matches(item, match_against):
+                    return True
+            # We didn't match any particular value of the multivalue
+            # attribute, but maybe we match the attribute value when
+            # considered as a string.
+            if self._matches(' '.join(markup), match_against):
+                return True
+            return False

        if match_against is True:
            # True matches any non-None value.