Merge pull request #612 from JackDandy/feature/UpdateBSoup

Update Beautiful Soup 4.4.0 (r390) to 4.4.0 (r397).
2025-03-17 10:07:42 +00:00 · 2016-01-12 02:58:37 +00:00 · 2016-01-12 02:58:37 +00:00 · 9ecc98fff0
commit 9ecc98fff0
parent a4b5812cad 7c0c75fc99
5 changed files with 204 additions and 184 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -6,6 +6,7 @@
 * Update change to suppress reporting of Tornado exception error 1 to updated package (ref:hacks.txt)
 * Change API response header for JSON content type and the return of JSONP data
 * Remove redundant MultipartPostHandler
+* Update Beautiful Soup 4.4.0 (r390) to 4.4.0 (r397)


 ### 0.11.0 (2016-01-10 22:30:00 UTC)
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@@ -120,6 +120,9 @@ class AttrList(object):
        if (name in list_attr['*']
            or (self.element.name in list_attr
                and name in list_attr[self.element.name])):
+            # A node that is being cloned may have already undergone
+            # this procedure.
+            if not isinstance(value, list):
                value = whitespace_re.split(value)
        self.element[name] = value
    def items(self):
--- a/lib/bs4/dammit.py
+++ b/lib/bs4/dammit.py
@@ -6,6 +6,7 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
 Feed Parser. It works best on XML and HTML, but it does not rewrite the
 XML or HTML to reflect a new encoding; that's the tree builder's job.
 """
+__license__ = "MIT"

 from pdb import set_trace
 import codecs
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@@ -1,4 +1,7 @@
 """Diagnostic functions, mainly for use when doing tech support."""
+
+__license__ = "MIT"
+
 import cProfile
 from StringIO import StringIO
 from HTMLParser import HTMLParser
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@@ -1,3 +1,5 @@
+__license__ = "MIT"
+
 from pdb import set_trace
 import collections
 import re
@@ -262,19 +264,19 @@ class PageElement(object):
        next_element = last_child.next_element

        if (self.previous_element is not None and
-            self.previous_element != next_element):
+            self.previous_element is not next_element):
            self.previous_element.next_element = next_element
-        if next_element is not None and next_element != self.previous_element:
+        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        if (self.previous_sibling is not None
-            and self.previous_sibling != self.next_sibling):
+            and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
-            and self.next_sibling != self.previous_sibling):
+            and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
@@ -287,13 +289,15 @@ class PageElement(object):
            last_child = self
            while isinstance(last_child, Tag) and last_child.contents:
                last_child = last_child.contents[-1]
-        if not accept_self and last_child == self:
+        if not accept_self and last_child is self:
            last_child = None
        return last_child
    # BS3: Not part of the API!
    _lastRecursiveChild = _last_descendant

    def insert(self, position, new_child):
+        if new_child is None:
+            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, basestring)
@@ -750,8 +754,8 @@ class Comment(PreformattedString):


 class Declaration(PreformattedString):
-    PREFIX = u'<!'
-    SUFFIX = u'!>'
+    PREFIX = u'<?'
+    SUFFIX = u'?>'


 class Doctype(PreformattedString):
@@ -1286,9 +1290,23 @@ class Tag(PageElement):
    def select(self, selector, _candidate_generator=None, limit=None):
        """Perform a CSS selection operation on the current element."""

-        # Remove whitespace directly after the grouping operator ','
-        # then split into tokens.
-        tokens = re.sub(',[\s]*',',', selector).split()
+        # Handle grouping selectors if ',' exists, ie: p,a
+        if ',' in selector:
+            context = []
+            for partial_selector in selector.split(','):
+                partial_selector = partial_selector.strip()
+                if partial_selector == '':
+                    raise ValueError('Invalid group selection syntax: %s' % selector)
+                candidates = self.select(partial_selector, limit=limit)
+                for candidate in candidates:
+                    if candidate not in context:
+                        context.append(candidate)
+
+                if limit and len(context) >= limit:
+                    break
+            return context
+
+        tokens = selector.split()
        current_context = [self]

        if tokens[-1] in self._selector_combinators:
@@ -1298,22 +1316,16 @@ class Tag(PageElement):
        if self._select_debug:
            print 'Running CSS selector "%s"' % selector

-        for index, token_group in enumerate(tokens):
+        for index, token in enumerate(tokens):
            new_context = []
            new_context_ids = set([])

-            # Grouping selectors, ie: p,a
-            grouped_tokens = token_group.split(',')
-            if '' in grouped_tokens:
-                raise ValueError('Invalid group selection syntax: %s' % token_group)
-
            if tokens[index-1] in self._selector_combinators:
                # This token was consumed by the previous combinator. Skip it.
                if self._select_debug:
                    print '  Token was consumed by the previous combinator.'
                continue

-            for token in grouped_tokens:
            if self._select_debug:
                print ' Considering token "%s"' % token
            recursive_candidate_generator = None