Merge pull request #612 from JackDandy/feature/UpdateBSoup

Update Beautiful Soup 4.4.0 (r390) to 4.4.0 (r397).
2025-04-17 16:01:29 +00:00 · 2016-01-12 02:58:37 +00:00 · 2016-01-12 02:58:37 +00:00 · 9ecc98fff0
commit 9ecc98fff0
parent a4b5812cad 7c0c75fc99
5 changed files with 204 additions and 184 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -6,6 +6,7 @@
 * Update change to suppress reporting of Tornado exception error 1 to updated package (ref:hacks.txt)
 * Change API response header for JSON content type and the return of JSONP data
 * Remove redundant MultipartPostHandler
+* Update Beautiful Soup 4.4.0 (r390) to 4.4.0 (r397)


 ### 0.11.0 (2016-01-10 22:30:00 UTC)
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@@ -120,7 +120,10 @@ class AttrList(object):
        if (name in list_attr['*']
            or (self.element.name in list_attr
                and name in list_attr[self.element.name])):
-            value = whitespace_re.split(value)
+            # A node that is being cloned may have already undergone
+            # this procedure.
+            if not isinstance(value, list):
+                value = whitespace_re.split(value)
        self.element[name] = value
    def items(self):
        return list(self.attrs.items())
--- a/lib/bs4/dammit.py
+++ b/lib/bs4/dammit.py
@@ -6,6 +6,7 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
 Feed Parser. It works best on XML and HTML, but it does not rewrite the
 XML or HTML to reflect a new encoding; that's the tree builder's job.
 """
+__license__ = "MIT"

 from pdb import set_trace
 import codecs
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@@ -1,4 +1,7 @@
 """Diagnostic functions, mainly for use when doing tech support."""
+
+__license__ = "MIT"
+
 import cProfile
 from StringIO import StringIO
 from HTMLParser import HTMLParser
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@@ -1,3 +1,5 @@
+__license__ = "MIT"
+
 from pdb import set_trace
 import collections
 import re
@@ -262,19 +264,19 @@ class PageElement(object):
        next_element = last_child.next_element

        if (self.previous_element is not None and
-            self.previous_element != next_element):
+            self.previous_element is not next_element):
            self.previous_element.next_element = next_element
-        if next_element is not None and next_element != self.previous_element:
+        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        if (self.previous_sibling is not None
-            and self.previous_sibling != self.next_sibling):
+            and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
-            and self.next_sibling != self.previous_sibling):
+            and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
@@ -287,13 +289,15 @@ class PageElement(object):
            last_child = self
            while isinstance(last_child, Tag) and last_child.contents:
                last_child = last_child.contents[-1]
-        if not accept_self and last_child == self:
+        if not accept_self and last_child is self:
            last_child = None
        return last_child
    # BS3: Not part of the API!
    _lastRecursiveChild = _last_descendant

    def insert(self, position, new_child):
+        if new_child is None:
+            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, basestring)
@@ -750,8 +754,8 @@ class Comment(PreformattedString):


 class Declaration(PreformattedString):
-    PREFIX = u'<!'
-    SUFFIX = u'!>'
+    PREFIX = u'<?'
+    SUFFIX = u'?>'


 class Doctype(PreformattedString):
@@ -1286,9 +1290,23 @@ class Tag(PageElement):
    def select(self, selector, _candidate_generator=None, limit=None):
        """Perform a CSS selection operation on the current element."""

-        # Remove whitespace directly after the grouping operator ','
-        # then split into tokens.
-        tokens = re.sub(',[\s]*',',', selector).split()
+        # Handle grouping selectors if ',' exists, ie: p,a
+        if ',' in selector:
+            context = []
+            for partial_selector in selector.split(','):
+                partial_selector = partial_selector.strip()
+                if partial_selector == '':
+                    raise ValueError('Invalid group selection syntax: %s' % selector)
+                candidates = self.select(partial_selector, limit=limit)
+                for candidate in candidates:
+                    if candidate not in context:
+                        context.append(candidate)
+
+                if limit and len(context) >= limit:
+                    break
+            return context
+
+        tokens = selector.split()
        current_context = [self]

        if tokens[-1] in self._selector_combinators:
@@ -1298,198 +1316,192 @@ class Tag(PageElement):
        if self._select_debug:
            print 'Running CSS selector "%s"' % selector

-        for index, token_group in enumerate(tokens):
+        for index, token in enumerate(tokens):
            new_context = []
            new_context_ids = set([])

-            # Grouping selectors, ie: p,a
-            grouped_tokens = token_group.split(',')
-            if '' in grouped_tokens:
-                raise ValueError('Invalid group selection syntax: %s' % token_group)
-
            if tokens[index-1] in self._selector_combinators:
                # This token was consumed by the previous combinator. Skip it.
                if self._select_debug:
                    print '  Token was consumed by the previous combinator.'
                continue

-            for token in grouped_tokens:
-                if self._select_debug:
-                    print ' Considering token "%s"' % token
-                recursive_candidate_generator = None
-                tag_name = None
+            if self._select_debug:
+                print ' Considering token "%s"' % token
+            recursive_candidate_generator = None
+            tag_name = None

-                # Each operation corresponds to a checker function, a rule
-                # for determining whether a candidate matches the
-                # selector. Candidates are generated by the active
-                # iterator.
-                checker = None
+            # Each operation corresponds to a checker function, a rule
+            # for determining whether a candidate matches the
+            # selector. Candidates are generated by the active
+            # iterator.
+            checker = None

-                m = self.attribselect_re.match(token)
-                if m is not None:
-                    # Attribute selector
-                    tag_name, attribute, operator, value = m.groups()
-                    checker = self._attribute_checker(operator, attribute, value)
+            m = self.attribselect_re.match(token)
+            if m is not None:
+                # Attribute selector
+                tag_name, attribute, operator, value = m.groups()
+                checker = self._attribute_checker(operator, attribute, value)

-                elif '#' in token:
-                    # ID selector
-                    tag_name, tag_id = token.split('#', 1)
-                    def id_matches(tag):
-                        return tag.get('id', None) == tag_id
-                    checker = id_matches
+            elif '#' in token:
+                # ID selector
+                tag_name, tag_id = token.split('#', 1)
+                def id_matches(tag):
+                    return tag.get('id', None) == tag_id
+                checker = id_matches

-                elif '.' in token:
-                    # Class selector
-                    tag_name, klass = token.split('.', 1)
-                    classes = set(klass.split('.'))
-                    def classes_match(candidate):
-                        return classes.issubset(candidate.get('class', []))
-                    checker = classes_match
+            elif '.' in token:
+                # Class selector
+                tag_name, klass = token.split('.', 1)
+                classes = set(klass.split('.'))
+                def classes_match(candidate):
+                    return classes.issubset(candidate.get('class', []))
+                checker = classes_match

-                elif ':' in token:
-                    # Pseudo-class
-                    tag_name, pseudo = token.split(':', 1)
-                    if tag_name == '':
-                        raise ValueError(
-                            "A pseudo-class must be prefixed with a tag name.")
-                    pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
-                    found = []
-                    if pseudo_attributes is None:
-                        pseudo_type = pseudo
-                        pseudo_value = None
-                    else:
-                        pseudo_type, pseudo_value = pseudo_attributes.groups()
-                    if pseudo_type == 'nth-of-type':
-                        try:
-                            pseudo_value = int(pseudo_value)
-                        except:
-                            raise NotImplementedError(
-                                'Only numeric values are currently supported for the nth-of-type pseudo-class.')
-                        if pseudo_value < 1:
-                            raise ValueError(
-                                'nth-of-type pseudo-class value must be at least 1.')
-                        class Counter(object):
-                            def __init__(self, destination):
-                                self.count = 0
-                                self.destination = destination
-
-                            def nth_child_of_type(self, tag):
-                                self.count += 1
-                                if self.count == self.destination:
-                                    return True
-                                if self.count > self.destination:
-                                    # Stop the generator that's sending us
-                                    # these things.
-                                    raise StopIteration()
-                                return False
-                        checker = Counter(pseudo_value).nth_child_of_type
-                    else:
-                        raise NotImplementedError(
-                            'Only the following pseudo-classes are implemented: nth-of-type.')
-
-                elif token == '*':
-                    # Star selector -- matches everything
-                    pass
-                elif token == '>':
-                    # Run the next token as a CSS selector against the
-                    # direct children of each tag in the current context.
-                    recursive_candidate_generator = lambda tag: tag.children
-                elif token == '~':
-                    # Run the next token as a CSS selector against the
-                    # siblings of each tag in the current context.
-                    recursive_candidate_generator = lambda tag: tag.next_siblings
-                elif token == '+':
-                    # For each tag in the current context, run the next
-                    # token as a CSS selector against the tag's next
-                    # sibling that's a tag.
-                    def next_tag_sibling(tag):
-                        yield tag.find_next_sibling(True)
-                    recursive_candidate_generator = next_tag_sibling
-
-                elif self.tag_name_re.match(token):
-                    # Just a tag name.
-                    tag_name = token
-                else:
+            elif ':' in token:
+                # Pseudo-class
+                tag_name, pseudo = token.split(':', 1)
+                if tag_name == '':
                    raise ValueError(
-                        'Unsupported or invalid CSS selector: "%s"' % token)
-                if recursive_candidate_generator:
-                    # This happens when the selector looks like  "> foo".
-                    #
-                    # The generator calls select() recursively on every
-                    # member of the current context, passing in a different
-                    # candidate generator and a different selector.
-                    #
-                    # In the case of "> foo", the candidate generator is
-                    # one that yields a tag's direct children (">"), and
-                    # the selector is "foo".
-                    next_token = tokens[index+1]
-                    def recursive_select(tag):
-                        if self._select_debug:
-                            print '    Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
-                            print '-' * 40
-                        for i in tag.select(next_token, recursive_candidate_generator):
-                            if self._select_debug:
-                                print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
-                            yield i
-                        if self._select_debug:
-                            print '-' * 40
-                    _use_candidate_generator = recursive_select
-                elif _candidate_generator is None:
-                    # By default, a tag's candidates are all of its
-                    # children. If tag_name is defined, only yield tags
-                    # with that name.
-                    if self._select_debug:
-                        if tag_name:
-                            check = "[any]"
-                        else:
-                            check = tag_name
-                        print '   Default candidate generator, tag name="%s"' % check
-                    if self._select_debug:
-                        # This is redundant with later code, but it stops
-                        # a bunch of bogus tags from cluttering up the
-                        # debug log.
-                        def default_candidate_generator(tag):
-                            for child in tag.descendants:
-                                if not isinstance(child, Tag):
-                                    continue
-                                if tag_name and not child.name == tag_name:
-                                    continue
-                                yield child
-                        _use_candidate_generator = default_candidate_generator
-                    else:
-                        _use_candidate_generator = lambda tag: tag.descendants
+                        "A pseudo-class must be prefixed with a tag name.")
+                pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+                found = []
+                if pseudo_attributes is None:
+                    pseudo_type = pseudo
+                    pseudo_value = None
                else:
-                    _use_candidate_generator = _candidate_generator
+                    pseudo_type, pseudo_value = pseudo_attributes.groups()
+                if pseudo_type == 'nth-of-type':
+                    try:
+                        pseudo_value = int(pseudo_value)
+                    except:
+                        raise NotImplementedError(
+                            'Only numeric values are currently supported for the nth-of-type pseudo-class.')
+                    if pseudo_value < 1:
+                        raise ValueError(
+                            'nth-of-type pseudo-class value must be at least 1.')
+                    class Counter(object):
+                        def __init__(self, destination):
+                            self.count = 0
+                            self.destination = destination

-                count = 0
-                for tag in current_context:
+                        def nth_child_of_type(self, tag):
+                            self.count += 1
+                            if self.count == self.destination:
+                                return True
+                            if self.count > self.destination:
+                                # Stop the generator that's sending us
+                                # these things.
+                                raise StopIteration()
+                            return False
+                    checker = Counter(pseudo_value).nth_child_of_type
+                else:
+                    raise NotImplementedError(
+                        'Only the following pseudo-classes are implemented: nth-of-type.')
+
+            elif token == '*':
+                # Star selector -- matches everything
+                pass
+            elif token == '>':
+                # Run the next token as a CSS selector against the
+                # direct children of each tag in the current context.
+                recursive_candidate_generator = lambda tag: tag.children
+            elif token == '~':
+                # Run the next token as a CSS selector against the
+                # siblings of each tag in the current context.
+                recursive_candidate_generator = lambda tag: tag.next_siblings
+            elif token == '+':
+                # For each tag in the current context, run the next
+                # token as a CSS selector against the tag's next
+                # sibling that's a tag.
+                def next_tag_sibling(tag):
+                    yield tag.find_next_sibling(True)
+                recursive_candidate_generator = next_tag_sibling
+
+            elif self.tag_name_re.match(token):
+                # Just a tag name.
+                tag_name = token
+            else:
+                raise ValueError(
+                    'Unsupported or invalid CSS selector: "%s"' % token)
+            if recursive_candidate_generator:
+                # This happens when the selector looks like  "> foo".
+                #
+                # The generator calls select() recursively on every
+                # member of the current context, passing in a different
+                # candidate generator and a different selector.
+                #
+                # In the case of "> foo", the candidate generator is
+                # one that yields a tag's direct children (">"), and
+                # the selector is "foo".
+                next_token = tokens[index+1]
+                def recursive_select(tag):
                    if self._select_debug:
-                        print "    Running candidate generator on %s %s" % (
-                            tag.name, repr(tag.attrs))
-                    for candidate in _use_candidate_generator(tag):
-                        if not isinstance(candidate, Tag):
-                            continue
-                        if tag_name and candidate.name != tag_name:
-                            continue
-                        if checker is not None:
-                            try:
-                                result = checker(candidate)
-                            except StopIteration:
-                                # The checker has decided we should no longer
-                                # run the generator.
+                        print '    Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
+                        print '-' * 40
+                    for i in tag.select(next_token, recursive_candidate_generator):
+                        if self._select_debug:
+                            print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
+                        yield i
+                    if self._select_debug:
+                        print '-' * 40
+                _use_candidate_generator = recursive_select
+            elif _candidate_generator is None:
+                # By default, a tag's candidates are all of its
+                # children. If tag_name is defined, only yield tags
+                # with that name.
+                if self._select_debug:
+                    if tag_name:
+                        check = "[any]"
+                    else:
+                        check = tag_name
+                    print '   Default candidate generator, tag name="%s"' % check
+                if self._select_debug:
+                    # This is redundant with later code, but it stops
+                    # a bunch of bogus tags from cluttering up the
+                    # debug log.
+                    def default_candidate_generator(tag):
+                        for child in tag.descendants:
+                            if not isinstance(child, Tag):
+                                continue
+                            if tag_name and not child.name == tag_name:
+                                continue
+                            yield child
+                    _use_candidate_generator = default_candidate_generator
+                else:
+                    _use_candidate_generator = lambda tag: tag.descendants
+            else:
+                _use_candidate_generator = _candidate_generator
+
+            count = 0
+            for tag in current_context:
+                if self._select_debug:
+                    print "    Running candidate generator on %s %s" % (
+                        tag.name, repr(tag.attrs))
+                for candidate in _use_candidate_generator(tag):
+                    if not isinstance(candidate, Tag):
+                        continue
+                    if tag_name and candidate.name != tag_name:
+                        continue
+                    if checker is not None:
+                        try:
+                            result = checker(candidate)
+                        except StopIteration:
+                            # The checker has decided we should no longer
+                            # run the generator.
+                            break
+                    if checker is None or result:
+                        if self._select_debug:
+                            print "     SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
+                        if id(candidate) not in new_context_ids:
+                            # If a tag matches a selector more than once,
+                            # don't include it in the context more than once.
+                            new_context.append(candidate)
+                            new_context_ids.add(id(candidate))
+                            if limit and len(new_context) >= limit:
                                break
-                        if checker is None or result:
-                            if self._select_debug:
-                                print "     SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
-                            if id(candidate) not in new_context_ids:
-                                # If a tag matches a selector more than once,
-                                # don't include it in the context more than once.
-                                new_context.append(candidate)
-                                new_context_ids.add(id(candidate))
-                                if limit and len(new_context) >= limit:
-                                    break
-                        elif self._select_debug:
-                            print "     FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
+                    elif self._select_debug:
+                        print "     FAILURE %s %s" % (candidate.name, repr(candidate.attrs))


            current_context = new_context