Merge pull request #420 from JackDandy/feature/UpdateBSoup

Update Beautiful Soup to 4.3.2 (r353).
2025-01-07 10:33:38 +00:00 · 2015-06-15 01:08:25 +01:00 · 2015-06-15 01:08:25 +01:00 · 23da0ad9ef
commit 23da0ad9ef
parent 3741b3ce2e f83390e689
16 changed files with 240 additions and 3425 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -47,6 +47,7 @@
 * Update dateutil library 2.2 to 2.4.2 (a6b8925)
 * Change zoneinfo update/loader to be compatible with dateutil 2.4.2
 * Update ConfigObj library 4.6.0 to 5.1.0 (a68530a)
 * Update Beautiful Soup to 4.3.2 (r353)
 [develop changelog]
 * Update Requests library 2.7.0 (ab1f493) to 2.7.0 (8b5e457)
--- a/lib/bs4/init.py
+++ b/lib/bs4/init.py
@ -45,7 +45,7 @@ from .element import (
 # The very first thing we do is give a useful error if someone is
 # running this code under Python 3 without converting it.
-syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
 class BeautifulSoup(Tag):
    """
@ -77,6 +77,8 @@ class BeautifulSoup(Tag):
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
@ -114,9 +116,9 @@ class BeautifulSoup(Tag):
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
-                "BeautifulSoup constructor. You can pass in features='html' "
+                "BeautifulSoup constructor. Suggest you use "
-                "or features='xml' to get a builder capable of handling "
+                "features='lxml' for HTML and features='lxml-xml' for "
-                "one or the other.")
+                "XML.")
        def deprecated_argument(old_name, new_name):
            if old_name in kwargs:
@ -140,6 +142,7 @@ class BeautifulSoup(Tag):
                "__init__() got an unexpected keyword argument '%s'" % arg)
        if builder is None:
            original_features = features
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
@ -151,6 +154,11 @@ class BeautifulSoup(Tag):
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                    parser=builder.NAME))
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self
@ -178,6 +186,8 @@ class BeautifulSoup(Tag):
                # system. Just let it go.
                pass
            if is_file:
                if isinstance(markup, unicode):
                    markup = markup.encode("utf8")
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
            if markup[:5] == "http:" or markup[:6] == "https:":
@ -185,6 +195,8 @@ class BeautifulSoup(Tag):
                # Python 3 otherwise.
                if ((isinstance(markup, bytes) and not b' ' in markup)
                    or (isinstance(markup, unicode) and not u' ' in markup)):
                    if isinstance(markup, unicode):
                        markup = markup.encode("utf8")
                    warnings.warn(
                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
--- a/lib/bs4/builder/init.py
+++ b/lib/bs4/builder/init.py
@ -80,6 +80,8 @@ builder_registry = TreeBuilderRegistry()
 class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""
    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []
    is_xml = False
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@ -22,7 +22,9 @@ from bs4.element import (
 class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""
-    features = ['html5lib', PERMISSIVE, HTML_5, HTML]
+    NAME = "html5lib"
    features = [NAME, PERMISSIVE, HTML_5, HTML]
    def prepare_markup(self, markup, user_specified_encoding):
        # Store the user-specified encoding for use later on.
@ -161,6 +163,12 @@ class Element(html5lib.treebuilders._base.Node):
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@ -19,10 +19,8 @@ import warnings
 # At the end of this file, we monkeypatch HTMLParser so that
 # strict=True works well on Python 3.2.2.
 major, minor, release = sys.version_info[:3]
-CONSTRUCTOR_TAKES_STRICT = (
+CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
-    major > 3
+CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
    or (major == 3 and minor > 2)
    or (major == 3 and minor == 2 and release >= 3))
 from bs4.element import (
    CData,
@ -63,7 +61,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
    def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
-        # it's fixed.
+        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
@ -113,14 +112,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
    def handle_pi(self, data):
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
@ -128,11 +119,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
 class HTMLParserTreeBuilder(HTMLTreeBuilder):
    is_xml = False
-    features = [HTML, STRICT, HTMLPARSER]
+    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]
    def __init__(self, *args, **kwargs):
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            kwargs['convert_charrefs'] = False
        self.parser_args = (args, kwargs)
    def prepare_markup(self, markup, user_specified_encoding=None,
--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@ -7,7 +7,12 @@ from io import BytesIO
 from StringIO import StringIO
 import collections
 from lxml import etree
-from bs4.element import Comment, Doctype, NamespacedAttribute
+from bs4.element import (
    Comment,
    Doctype,
    NamespacedAttribute,
    ProcessingInstruction,
 )
 from bs4.builder import (
    FAST,
    HTML,
@ -25,8 +30,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    is_xml = True
    NAME = "lxml-xml"
    # Well, it's permissive by XML parser standards.
-    features = [LXML, XML, FAST, PERMISSIVE]
+    features = [NAME, LXML, XML, FAST, PERMISSIVE]
    CHUNK_SIZE = 512
@ -189,7 +196,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            self.nsmaps.pop()
    def pi(self, target, data):
-        pass
+        self.soup.endData()
        self.soup.handle_data(target + ' ' + data)
        self.soup.endData(ProcessingInstruction)
    def data(self, content):
        self.soup.handle_data(content)
@ -212,7 +221,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
-    features = [LXML, HTML, FAST, PERMISSIVE]
+    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]
    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
    def default_parser(self, encoding):
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@ -548,17 +548,17 @@ class PageElement(object):
    # Methods for supporting CSS selectors.
-    tag_name_re = re.compile('^[a-z0-9]+$')
+    tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
-    # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
+    # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
-    #   \---/  \---/\-------------/    \-------/
+    #   \---------------------------/  \---/\-------------/    \-------/
-    #     |      |         |               |
+    #     |                              |         |               |
-    #     |      |         |           The value
+    #     |                              |         |           The value
-    #     |      |    ~,|,^,$,* or =
+    #     |                              |    ~,|,^,$,* or =
-    #     |   Attribute
+    #     |                           Attribute
    #    Tag
    attribselect_re = re.compile(
-        r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
+        r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
        r'=?"?(?P<value>[^\]"]*)"?\]$'
        )
@ -707,7 +707,7 @@ class CData(PreformattedString):
 class ProcessingInstruction(PreformattedString):
    PREFIX = u'<?'
-    SUFFIX = u'?>'
+    SUFFIX = u'>'
 class Comment(PreformattedString):
@ -1203,192 +1203,206 @@ class Tag(PageElement):
    _select_debug = False
    def select(self, selector, _candidate_generator=None):
        """Perform a CSS selection operation on the current element."""
-        tokens = selector.split()
+
        # Remove whitespace directly after the grouping operator ','
        # then split into tokens.
        tokens = re.sub(',[\s]*',',', selector).split()
        current_context = [self]
        if tokens[-1] in self._selector_combinators:
            raise ValueError(
                'Final combinator "%s" is missing an argument.' % tokens[-1])
        if self._select_debug:
            print 'Running CSS selector "%s"' % selector
-        for index, token in enumerate(tokens):
+
-            if self._select_debug:
+        for index, token_group in enumerate(tokens):
-                print ' Considering token "%s"' % token
+            new_context = []
-            recursive_candidate_generator = None
+            new_context_ids = set([])
-            tag_name = None
+
            # Grouping selectors, ie: p,a
            grouped_tokens = token_group.split(',')
            if '' in grouped_tokens:
                raise ValueError('Invalid group selection syntax: %s' % token_group)
            if tokens[index-1] in self._selector_combinators:
                # This token was consumed by the previous combinator. Skip it.
                if self._select_debug:
                    print '  Token was consumed by the previous combinator.'
                continue
            # Each operation corresponds to a checker function, a rule
            # for determining whether a candidate matches the
            # selector. Candidates are generated by the active
            # iterator.
            checker = None
-            m = self.attribselect_re.match(token)
+            for token in grouped_tokens:
-            if m is not None:
+                if self._select_debug:
-                # Attribute selector
+                    print ' Considering token "%s"' % token
-                tag_name, attribute, operator, value = m.groups()
+                recursive_candidate_generator = None
-                checker = self._attribute_checker(operator, attribute, value)
+                tag_name = None
-            elif '#' in token:
+                # Each operation corresponds to a checker function, a rule
-                # ID selector
+                # for determining whether a candidate matches the
-                tag_name, tag_id = token.split('#', 1)
+                # selector. Candidates are generated by the active
-                def id_matches(tag):
+                # iterator.
-                    return tag.get('id', None) == tag_id
+                checker = None
                checker = id_matches
-            elif '.' in token:
+                m = self.attribselect_re.match(token)
-                # Class selector
+                if m is not None:
-                tag_name, klass = token.split('.', 1)
+                    # Attribute selector
-                classes = set(klass.split('.'))
+                    tag_name, attribute, operator, value = m.groups()
-                def classes_match(candidate):
+                    checker = self._attribute_checker(operator, attribute, value)
                    return classes.issubset(candidate.get('class', []))
                checker = classes_match
-            elif ':' in token:
+                elif '#' in token:
-                # Pseudo-class
+                    # ID selector
-                tag_name, pseudo = token.split(':', 1)
+                    tag_name, tag_id = token.split('#', 1)
-                if tag_name == '':
+                    def id_matches(tag):
-                    raise ValueError(
+                        return tag.get('id', None) == tag_id
-                        "A pseudo-class must be prefixed with a tag name.")
+                    checker = id_matches
-                pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+
-                found = []
+                elif '.' in token:
-                if pseudo_attributes is not None:
+                    # Class selector
-                    pseudo_type, pseudo_value = pseudo_attributes.groups()
+                    tag_name, klass = token.split('.', 1)
-                    if pseudo_type == 'nth-of-type':
+                    classes = set(klass.split('.'))
-                        try:
+                    def classes_match(candidate):
-                            pseudo_value = int(pseudo_value)
+                        return classes.issubset(candidate.get('class', []))
-                        except:
+                    checker = classes_match
                elif ':' in token:
                    # Pseudo-class
                    tag_name, pseudo = token.split(':', 1)
                    if tag_name == '':
                        raise ValueError(
                            "A pseudo-class must be prefixed with a tag name.")
                    pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
                    found = []
                    if pseudo_attributes is not None:
                        pseudo_type, pseudo_value = pseudo_attributes.groups()
                        if pseudo_type == 'nth-of-type':
                            try:
                                pseudo_value = int(pseudo_value)
                            except:
                                raise NotImplementedError(
                                    'Only numeric values are currently supported for the nth-of-type pseudo-class.')
                            if pseudo_value < 1:
                                raise ValueError(
                                    'nth-of-type pseudo-class value must be at least 1.')
                            class Counter(object):
                                def __init__(self, destination):
                                    self.count = 0
                                    self.destination = destination
                                def nth_child_of_type(self, tag):
                                    self.count += 1
                                    if self.count == self.destination:
                                        return True
                                    if self.count > self.destination:
                                        # Stop the generator that's sending us
                                        # these things.
                                        raise StopIteration()
                                    return False
                            checker = Counter(pseudo_value).nth_child_of_type
                        else:
                            raise NotImplementedError(
-                                'Only numeric values are currently supported for the nth-of-type pseudo-class.')
+                                'Only the following pseudo-classes are implemented: nth-of-type.')
                        if pseudo_value < 1:
                            raise ValueError(
                                'nth-of-type pseudo-class value must be at least 1.')
                        class Counter(object):
                            def __init__(self, destination):
                                self.count = 0
                                self.destination = destination
-                            def nth_child_of_type(self, tag):
+                elif token == '*':
-                                self.count += 1
+                    # Star selector -- matches everything
-                                if self.count == self.destination:
+                    pass
-                                    return True
+                elif token == '>':
-                                if self.count > self.destination:
+                    # Run the next token as a CSS selector against the
-                                    # Stop the generator that's sending us
+                    # direct children of each tag in the current context.
-                                    # these things.
+                    recursive_candidate_generator = lambda tag: tag.children
-                                    raise StopIteration()
+                elif token == '~':
-                                return False
+                    # Run the next token as a CSS selector against the
-                        checker = Counter(pseudo_value).nth_child_of_type
+                    # siblings of each tag in the current context.
-                    else:
+                    recursive_candidate_generator = lambda tag: tag.next_siblings
-                        raise NotImplementedError(
+                elif token == '+':
-                            'Only the following pseudo-classes are implemented: nth-of-type.')
+                    # For each tag in the current context, run the next
                    # token as a CSS selector against the tag's next
                    # sibling that's a tag.
                    def next_tag_sibling(tag):
                        yield tag.find_next_sibling(True)
                    recursive_candidate_generator = next_tag_sibling
-            elif token == '*':
+                elif self.tag_name_re.match(token):
-                # Star selector -- matches everything
+                    # Just a tag name.
-                pass
+                    tag_name = token
            elif token == '>':
                # Run the next token as a CSS selector against the
                # direct children of each tag in the current context.
                recursive_candidate_generator = lambda tag: tag.children
            elif token == '~':
                # Run the next token as a CSS selector against the
                # siblings of each tag in the current context.
                recursive_candidate_generator = lambda tag: tag.next_siblings
            elif token == '+':
                # For each tag in the current context, run the next
                # token as a CSS selector against the tag's next
                # sibling that's a tag.
                def next_tag_sibling(tag):
                    yield tag.find_next_sibling(True)
                recursive_candidate_generator = next_tag_sibling
            elif self.tag_name_re.match(token):
                # Just a tag name.
                tag_name = token
            else:
                raise ValueError(
                    'Unsupported or invalid CSS selector: "%s"' % token)
            if recursive_candidate_generator:
                # This happens when the selector looks like  "> foo".
                #
                # The generator calls select() recursively on every
                # member of the current context, passing in a different
                # candidate generator and a different selector.
                #
                # In the case of "> foo", the candidate generator is
                # one that yields a tag's direct children (">"), and
                # the selector is "foo".
                next_token = tokens[index+1]
                def recursive_select(tag):
                    if self._select_debug:
                        print '    Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
                        print '-' * 40
                    for i in tag.select(next_token, recursive_candidate_generator):
                        if self._select_debug:
                            print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
                        yield i
                    if self._select_debug:
                        print '-' * 40
                _use_candidate_generator = recursive_select
            elif _candidate_generator is None:
                # By default, a tag's candidates are all of its
                # children. If tag_name is defined, only yield tags
                # with that name.
                if self._select_debug:
                    if tag_name:
                        check = "[any]"
                    else:
                        check = tag_name
                    print '   Default candidate generator, tag name="%s"' % check
                if self._select_debug:
                    # This is redundant with later code, but it stops
                    # a bunch of bogus tags from cluttering up the
                    # debug log.
                    def default_candidate_generator(tag):
                        for child in tag.descendants:
                            if not isinstance(child, Tag):
                                continue
                            if tag_name and not child.name == tag_name:
                                continue
                            yield child
                    _use_candidate_generator = default_candidate_generator
                else:
-                    _use_candidate_generator = lambda tag: tag.descendants
+                    raise ValueError(
-            else:
+                        'Unsupported or invalid CSS selector: "%s"' % token)
-                _use_candidate_generator = _candidate_generator
+                if recursive_candidate_generator:
-
+                    # This happens when the selector looks like  "> foo".
-            new_context = []
+                    #
-            new_context_ids = set([])
+                    # The generator calls select() recursively on every
-            for tag in current_context:
+                    # member of the current context, passing in a different
-                if self._select_debug:
+                    # candidate generator and a different selector.
-                    print "    Running candidate generator on %s %s" % (
+                    #
-                        tag.name, repr(tag.attrs))
+                    # In the case of "> foo", the candidate generator is
-                for candidate in _use_candidate_generator(tag):
+                    # one that yields a tag's direct children (">"), and
-                    if not isinstance(candidate, Tag):
+                    # the selector is "foo".
-                        continue
+                    next_token = tokens[index+1]
-                    if tag_name and candidate.name != tag_name:
+                    def recursive_select(tag):
                        continue
                    if checker is not None:
                        try:
                            result = checker(candidate)
                        except StopIteration:
                            # The checker has decided we should no longer
                            # run the generator.
                            break
                    if checker is None or result:
                        if self._select_debug:
-                            print "     SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
+                            print '    Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
-                        if id(candidate) not in new_context_ids:
+                            print '-' * 40
-                            # If a tag matches a selector more than once,
+                        for i in tag.select(next_token, recursive_candidate_generator):
-                            # don't include it in the context more than once.
+                            if self._select_debug:
-                            new_context.append(candidate)
+                                print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
-                            new_context_ids.add(id(candidate))
+                            yield i
-                    elif self._select_debug:
+                        if self._select_debug:
-                        print "     FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
+                            print '-' * 40
                    _use_candidate_generator = recursive_select
                elif _candidate_generator is None:
                    # By default, a tag's candidates are all of its
                    # children. If tag_name is defined, only yield tags
                    # with that name.
                    if self._select_debug:
                        if tag_name:
                            check = "[any]"
                        else:
                            check = tag_name
                        print '   Default candidate generator, tag name="%s"' % check
                    if self._select_debug:
                        # This is redundant with later code, but it stops
                        # a bunch of bogus tags from cluttering up the
                        # debug log.
                        def default_candidate_generator(tag):
                            for child in tag.descendants:
                                if not isinstance(child, Tag):
                                    continue
                                if tag_name and not child.name == tag_name:
                                    continue
                                yield child
                        _use_candidate_generator = default_candidate_generator
                    else:
                        _use_candidate_generator = lambda tag: tag.descendants
                else:
                    _use_candidate_generator = _candidate_generator
                for tag in current_context:
                    if self._select_debug:
                        print "    Running candidate generator on %s %s" % (
                            tag.name, repr(tag.attrs))
                    for candidate in _use_candidate_generator(tag):
                        if not isinstance(candidate, Tag):
                            continue
                        if tag_name and candidate.name != tag_name:
                            continue
                        if checker is not None:
                            try:
                                result = checker(candidate)
                            except StopIteration:
                                # The checker has decided we should no longer
                                # run the generator.
                                break
                        if checker is None or result:
                            if self._select_debug:
                                print "     SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
                            if id(candidate) not in new_context_ids:
                                # If a tag matches a selector more than once,
                                # don't include it in the context more than once.
                                new_context.append(candidate)
                                new_context_ids.add(id(candidate))
                        elif self._select_debug:
                            print "     FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
            current_context = new_context
--- a/lib/bs4/testing.py
+++ b/lib/bs4/testing.py
@ -1,592 +0,0 @@
 """Helper classes for tests."""
 import copy
 import functools
 import unittest
 from unittest import TestCase
 from bs4 import BeautifulSoup
 from bs4.element import (
    CharsetMetaAttributeValue,
    Comment,
    ContentMetaAttributeValue,
    Doctype,
    SoupStrainer,
 )
 from bs4.builder import HTMLParserTreeBuilder
 default_builder = HTMLParserTreeBuilder
 class SoupTest(unittest.TestCase):
    @property
    def default_builder(self):
        return default_builder()
    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup."""
        builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)
    def document_for(self, markup):
        """Turn an HTML fragment into a document.
        The details depend on the builder.
        """
        return self.default_builder.test_fragment_to_document(markup)
    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse
        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
 class HTMLTreeBuilderSmokeTest(object):
    """A basic test of a treebuilder's competence.
    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.
    """
    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)
        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')
    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype."""
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup
    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())
    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)
    def test_system_doctype(self):
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
 <html xmlns="http://www.w3.org/1999/xhtml">
 <head><title>Hello.</title></head>
 <body>Goodbye.</body>
 </html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))
    def test_deepcopy(self):
        """Make sure you can copy the tree builder.
        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)
    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.
        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")
    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.
        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")
        self.assertSoupEquals("<br>", "<br/>")
    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.
        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")
    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")
    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)
        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)
        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)
    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre>   </pre>")
        self.assertSoupEquals("<textarea> woo  </textarea>")
    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)
        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)
        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(nested_b_tag)
    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')
    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')
        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')
        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])
    def test_angle_brackets_in_attribute_values_are_escaped(self):
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
    def test_entities_in_attributes_converted_to_unicode(self):
        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
    def test_entities_in_text_converted_to_unicode(self):
        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
    def test_quot_entity_converted_to_quotation_mark(self):
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')
    def test_out_of_range_entity(self):
        expect = u"\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)
    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)
    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""
        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        html = soup.html
        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
    def test_multivalued_attribute_value_becomes_list(self):
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])
    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #
    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the  declaration! The horror!
        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")
    def test_single_quote_attribute_values_become_double_quotes(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')
    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)
    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
    def test_ampersand_in_attribute_value_gets_escaped(self):
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')
        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)
    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
    def test_non_breaking_spaces_converted_on_the_way_in(self):
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
    def test_entities_converted_on_the_way_out(self):
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)
    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.
        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")
        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")
        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")
        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")
        # Ta-da!
        self.assertEqual(result, expected)
    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)
        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-9 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))
    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')
        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)
        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)
        # But that value is actually a ContentMetaAttributeValue object.
        self.assertTrue(isinstance(content, ContentMetaAttributeValue))
        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.
    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')
        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)
        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)
        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))
    def test_tag_with_no_attributes_can_have_attributes_added(self):
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
 class XMLTreeBuilderSmokeTest(object):
    def test_docstring_generated(self):
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
 <html xmlns="http://www.w3.org/1999/xhtml">
 <head><title>Hello.</title></head>
 <body>Goodbye.</body>
 </html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8"), markup)
    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
  <script type="text/javascript">
  </script>
 """
        soup = BeautifulSoup(doc, "xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
    def test_can_parse_unicode_document(self):
        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        soup = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
    def test_popping_namespaced_tag(self):
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
            unicode(soup.rss), markup)
    def test_docstring_includes_correct_encoding(self):
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode("latin1"),
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + b'0' * (2**12)
                  + b'</root>')
        soup = self.soup(markup)
        self.assertEqual(soup.encode("utf-8"), markup)
    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")
    def test_namespaces_are_preserved(self):
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        soup = self.soup(markup)
        root = soup.root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])
    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.p), markup)
    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)
    def test_namespaced_attributes_xml_namespace(self):
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)
 class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""
    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        # XHTML documents in any particular way.
        pass
    def test_html_tags_have_namespace(self):
        markup = "<a>"
        soup = self.soup(markup)
        self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
    def test_svg_tags_have_namespace(self):
        markup = '<svg><circle/></svg>'
        soup = self.soup(markup)
        namespace = "http://www.w3.org/2000/svg"
        self.assertEqual(namespace, soup.svg.namespace)
        self.assertEqual(namespace, soup.circle.namespace)
    def test_mathml_tags_have_namespace(self):
        markup = '<math><msqrt>5</msqrt></math>'
        soup = self.soup(markup)
        namespace = 'http://www.w3.org/1998/Math/MathML'
        self.assertEqual(namespace, soup.math.namespace)
        self.assertEqual(namespace, soup.msqrt.namespace)
    def test_xml_declaration_becomes_comment(self):
        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
        soup = self.soup(markup)
        self.assertTrue(isinstance(soup.contents[0], Comment))
        self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", soup.contents[0].next_element.name)
 def skipIf(condition, reason):
   def nothing(test, *args, **kwargs):
       return None
   def decorator(test_item):
       if condition:
           return nothing
       else:
           return test_item
   return decorator
--- a/lib/bs4/tests/init.py
+++ b/lib/bs4/tests/init.py
@ -1 +0,0 @@
 "The beautifulsoup tests."
--- a/lib/bs4/tests/test_builder_registry.py
+++ b/lib/bs4/tests/test_builder_registry.py
@ -1,141 +0,0 @@
 """Tests of the builder registry."""
 import unittest
 from bs4 import BeautifulSoup
 from bs4.builder import (
    builder_registry as registry,
    HTMLParserTreeBuilder,
    TreeBuilderRegistry,
 )
 try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
 except ImportError:
    HTML5LIB_PRESENT = False
 try:
    from bs4.builder import (
        LXMLTreeBuilderForXML,
        LXMLTreeBuilder,
        )
    LXML_PRESENT = True
 except ImportError:
    LXML_PRESENT = False
 class BuiltInRegistryTest(unittest.TestCase):
    """Test the built-in registry with the default builders registered."""
    def test_combination(self):
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('fast', 'html'),
                             LXMLTreeBuilder)
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('permissive', 'xml'),
                             LXMLTreeBuilderForXML)
        self.assertEqual(registry.lookup('strict', 'html'),
                          HTMLParserTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib', 'html'),
                              HTML5TreeBuilder)
    def test_lookup_by_markup_type(self):
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
            self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
        else:
            self.assertEqual(registry.lookup('xml'), None)
            if HTML5LIB_PRESENT:
                self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
            else:
                self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
    def test_named_library(self):
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('lxml', 'xml'),
                             LXMLTreeBuilderForXML)
            self.assertEqual(registry.lookup('lxml', 'html'),
                             LXMLTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib'),
                              HTML5TreeBuilder)
        self.assertEqual(registry.lookup('html.parser'),
                          HTMLParserTreeBuilder)
    def test_beautifulsoup_constructor_does_lookup(self):
        # You can pass in a string.
        BeautifulSoup("", features="html")
        # Or a list of strings.
        BeautifulSoup("", features=["html", "fast"])
        # You'll get an exception if BS can't find an appropriate
        # builder.
        self.assertRaises(ValueError, BeautifulSoup,
                          "", features="no-such-feature")
 class RegistryTest(unittest.TestCase):
    """Test the TreeBuilderRegistry class in general."""
    def setUp(self):
        self.registry = TreeBuilderRegistry()
    def builder_for_features(self, *feature_list):
        cls = type('Builder_' + '_'.join(feature_list),
                   (object,), {'features' : feature_list})
        self.registry.register(cls)
        return cls
    def test_register_with_no_features(self):
        builder = self.builder_for_features()
        # Since the builder advertises no features, you can't find it
        # by looking up features.
        self.assertEqual(self.registry.lookup('foo'), None)
        # But you can find it by doing a lookup with no features, if
        # this happens to be the only registered builder.
        self.assertEqual(self.registry.lookup(), builder)
    def test_register_with_features_makes_lookup_succeed(self):
        builder = self.builder_for_features('foo', 'bar')
        self.assertEqual(self.registry.lookup('foo'), builder)
        self.assertEqual(self.registry.lookup('bar'), builder)
    def test_lookup_fails_when_no_builder_implements_feature(self):
        builder = self.builder_for_features('foo', 'bar')
        self.assertEqual(self.registry.lookup('baz'), None)
    def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
        builder1 = self.builder_for_features('foo')
        builder2 = self.builder_for_features('bar')
        self.assertEqual(self.registry.lookup(), builder2)
    def test_lookup_fails_when_no_tree_builders_registered(self):
        self.assertEqual(self.registry.lookup(), None)
    def test_lookup_gets_most_recent_builder_supporting_all_features(self):
        has_one = self.builder_for_features('foo')
        has_the_other = self.builder_for_features('bar')
        has_both_early = self.builder_for_features('foo', 'bar', 'baz')
        has_both_late = self.builder_for_features('foo', 'bar', 'quux')
        lacks_one = self.builder_for_features('bar')
        has_the_other = self.builder_for_features('foo')
        # There are two builders featuring 'foo' and 'bar', but
        # the one that also features 'quux' was registered later.
        self.assertEqual(self.registry.lookup('foo', 'bar'),
                          has_both_late)
        # There is only one builder featuring 'foo', 'bar', and 'baz'.
        self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
                          has_both_early)
    def test_lookup_fails_when_cannot_reconcile_requested_features(self):
        builder1 = self.builder_for_features('foo', 'bar')
        builder2 = self.builder_for_features('foo', 'baz')
        self.assertEqual(self.registry.lookup('bar', 'baz'), None)
--- a/lib/bs4/tests/test_docs.py
+++ b/lib/bs4/tests/test_docs.py
@ -1,36 +0,0 @@
 "Test harness for doctests."
 # pylint: disable-msg=E0611,W0142
 __metaclass__ = type
 __all__ = [
    'additional_tests',
    ]
 import atexit
 import doctest
 import os
 #from pkg_resources import (
 #    resource_filename, resource_exists, resource_listdir, cleanup_resources)
 import unittest
 DOCTEST_FLAGS = (
    doctest.ELLIPSIS |
    doctest.NORMALIZE_WHITESPACE |
    doctest.REPORT_NDIFF)
 # def additional_tests():
 #     "Run the doc tests (README.txt and docs/*, if any exist)"
 #     doctest_files = [
 #         os.path.abspath(resource_filename('bs4', 'README.txt'))]
 #     if resource_exists('bs4', 'docs'):
 #         for name in resource_listdir('bs4', 'docs'):
 #             if name.endswith('.txt'):
 #                 doctest_files.append(
 #                     os.path.abspath(
 #                         resource_filename('bs4', 'docs/%s' % name)))
 #     kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
 #     atexit.register(cleanup_resources)
 #     return unittest.TestSuite((
 #         doctest.DocFileSuite(*doctest_files, **kwargs)))
--- a/lib/bs4/tests/test_html5lib.py
+++ b/lib/bs4/tests/test_html5lib.py
@ -1,85 +0,0 @@
 """Tests to ensure that the html5lib tree builder generates good trees."""
 import warnings
 try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
 except ImportError, e:
    HTML5LIB_PRESENT = False
 from bs4.element import SoupStrainer
 from bs4.testing import (
    HTML5TreeBuilderSmokeTest,
    SoupTest,
    skipIf,
 )
@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
 class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""
    @property
    def default_builder(self):
        return HTML5TreeBuilder()
    def test_soupstrainer(self):
        # The html5lib tree builder does not support SoupStrainers.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(
            soup.decode(), self.document_for(markup))
        self.assertTrue(
            "the html5lib tree builder doesn't support parse_only" in
            str(w[0].message))
    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')
        self.assertSoupEquals(
            markup,
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')
        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
    def test_xml_declaration_followed_by_doctype(self):
        markup = '''<?xml version="1.0" encoding="utf-8"?>
 <!DOCTYPE html>
 <html>
  <head>
  </head>
  <body>
   <p>foo</p>
  </body>
 </html>'''
        soup = self.soup(markup)
        # Verify that we can reach the <p> tag; this means the tree is connected.
        self.assertEqual(b"<p>foo</p>", soup.p.encode())
    def test_reparented_markup(self):
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
        soup = self.soup(markup)
        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))
    def test_reparented_markup_ends_with_whitespace(self):
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
        soup = self.soup(markup)
        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))
--- a/lib/bs4/tests/test_htmlparser.py
+++ b/lib/bs4/tests/test_htmlparser.py
@ -1,19 +0,0 @@
 """Tests to ensure that the html.parser tree builder generates good
 trees."""
 from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
 from bs4.builder import HTMLParserTreeBuilder
 class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    @property
    def default_builder(self):
        return HTMLParserTreeBuilder()
    def test_namespaced_system_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass
    def test_namespaced_public_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass
--- a/lib/bs4/tests/test_lxml.py
+++ b/lib/bs4/tests/test_lxml.py
@ -1,91 +0,0 @@
 """Tests to ensure that the lxml tree builder generates good trees."""
 import re
 import warnings
 try:
    import lxml.etree
    LXML_PRESENT = True
    LXML_VERSION = lxml.etree.LXML_VERSION
 except ImportError, e:
    LXML_PRESENT = False
    LXML_VERSION = (0,)
 if LXML_PRESENT:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
 from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
    )
 from bs4.element import Comment, Doctype, SoupStrainer
 from bs4.testing import skipIf
 from bs4.tests import test_htmlparser
 from bs4.testing import (
    HTMLTreeBuilderSmokeTest,
    XMLTreeBuilderSmokeTest,
    SoupTest,
    skipIf,
 )
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
 class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""
    @property
    def default_builder(self):
        return LXMLTreeBuilder()
    def test_out_of_range_entity(self):
        self.assertSoupEquals(
            "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
    # test if an old version of lxml is installed.
    @skipIf(
        not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
        "Skipping doctype test for old version of lxml to avoid segfault.")
    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())
    def test_beautifulstonesoup_is_xml_parser(self):
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulStoneSoup("<b />")
        self.assertEqual(u"<b/>", unicode(soup.b))
        self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
    def test_real_xhtml_document(self):
        """lxml strips the XML definition from an XHTML doc, which is fine."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
 <html xmlns="http://www.w3.org/1999/xhtml">
 <head><title>Hello.</title></head>
 <body>Goodbye.</body>
 </html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b''),
            markup.replace(b'\n', b'').replace(
                b'<?xml version="1.0" encoding="utf-8"?>', b''))
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
 class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""
    @property
    def default_builder(self):
        return LXMLTreeBuilderForXML()
--- a/lib/bs4/tests/test_soup.py
+++ b/lib/bs4/tests/test_soup.py
@ -1,434 +0,0 @@
 # -*- coding: utf-8 -*-
 """Tests of Beautiful Soup as a whole."""
 import logging
 import unittest
 import sys
 import tempfile
 from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
 )
 from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    SoupStrainer,
    NamespacedAttribute,
    )
 import bs4.dammit
 from bs4.dammit import (
    EntitySubstitution,
    UnicodeDammit,
 )
 from bs4.testing import (
    SoupTest,
    skipIf,
 )
 import warnings
 try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
 except ImportError, e:
    LXML_PRESENT = False
 PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
 PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
 class TestConstructor(SoupTest):
    def test_short_unicode_input(self):
        data = u"<h1>éé</h1>"
        soup = self.soup(data)
        self.assertEqual(u"éé", soup.h1.string)
    def test_embedded_null(self):
        data = u"<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual(u"foo\0bar", soup.h1.string)
 class TestDeprecatedConstructorArguments(SoupTest):
    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"<b></b>", soup.encode())
    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)
    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)
 class TestWarnings(SoupTest):
    def test_disk_file_warning(self):
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
            msg = str(w[0].message)
            self.assertTrue("looks like a filename" in msg)
        finally:
            filehandle.close()
        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
        self.assertEqual(0, len(w))
    def test_url_warning(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("http://www.crummy.com/")
        msg = str(w[0].message)
        self.assertTrue("looks like a URL" in msg)
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("http://www.crummy.com/ is great")
        self.assertEqual(0, len(w))
 class TestSelectiveParsing(SoupTest):
    def test_parse_with_soupstrainer(self):
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
 class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""
    def setUp(self):
        self.sub = EntitySubstitution
    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entites
        # are substituted, and no others.
        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                          u"foo&forall;\N{SNOWMAN}&otilde;bar")
    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                          "&lsquo;&rsquo;foo&ldquo;&rdquo;")
    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)
    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                          '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                          '"Bob\'s Bar"')
    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                          "'Welcome to \"my bar\"'")
    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')
    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)
    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")
    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&amp;Aacute;T&amp;T")
    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")
    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)
 class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.
    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = noop
            ascii = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii)
            unicode_output = soup_from_ascii.decode()
            self.assertTrue(isinstance(unicode_output, unicode))
            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet
    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)
    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
 class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""
    def test_unicode_input(self):
        markup = u"I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)
    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")
    def test_detect_utf8(self):
        utf8 = b"\xc3\xa9"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.unicode_markup, u'\xe9')
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
    def test_ignore_inappropriate_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
    def test_ignore_invalid_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
    def test_detect_html5_style_meta_tag(self):
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)
    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
 <html><b>\330\250\330\252\330\261</b>
 <i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue(u"\ufffd" in dammit.unicode_markup)
            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet
    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)
    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
        # Here's a Windows-1252 document.
        windows_1252 = (
            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8
        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "â˜ƒâ˜ƒâ˜ƒ“Hi, I like Windows!”â˜ƒâ˜ƒâ˜ƒ"
        # But if we run it through fix_embedded_windows_1252, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)
 class TestNamedspacedAttribute(SoupTest):
    def test_name_may_be_none(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")
    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)
    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        self.assertEqual(a, b)
        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        self.assertEqual(a, c)
        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(a, d)
        e = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(a, e)
 class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    def test_content_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))
    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
--- a/lib/bs4/tests/test_tree.py
+++ b/lib/bs4/tests/test_tree.py