Merge pull request #978 from JackDandy/feature/UpdateBS

Update Beautiful Soup 4.4.0 (r397) to 4.6.0 (r449).
2024-12-03 18:03:37 +00:00 · 2017-08-26 01:23:03 +01:00 · 2017-08-26 01:23:03 +01:00 · e69960ba2c
commit e69960ba2c
parent cf383de226 1ff79cee4d
5 changed files with 131 additions and 23 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -10,7 +10,7 @@
 * Change improve add show search results by comparing search term to an additional unidecoded result set
 * Change webserver startup to correctly use xheaders in reverse proxy or load balance set-ups
 * Update backports_abc 0.4 to 0.5
-* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439)
+* Update Beautiful Soup 4.4.0 (r397) to 4.6.0 (r449)
 * Update cachecontrol library 0.11.5 to 0.12.3 (db54c40)
 * Update Certifi 2015.11.20.1 (385476b) to 2017.07.27 (f808089)
 * Update chardet packages 2.3.0 (d7fae98) to 3.0.4 (9b8c5c2)
@ -93,6 +93,7 @@
 * Fix "too many redirects" or "no CSS/JS content" delivered
 * Change restart/shutdown to use updated jQuery
 * Remove AlphaReign torrent provider
+* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439)
 * Update cachecontrol library 0.11.5 to 0.11.7 (3b3b776)
 * Update Certifi 2015.11.20.1 (385476b) to 2017.01.23 (9f9dc30)
 * Update Tornado Web Server 4.5.dev1 (92f29b8) to 4.5.dev1 (38e493e)
--- a/lib/bs4/init.py
+++ b/lib/bs4/init.py
@ -82,7 +82,7 @@ class BeautifulSoup(Tag):

    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

-    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP)\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
@ -215,8 +215,8 @@ class BeautifulSoup(Tag):
                    markup = markup.encode("utf8")
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should'
-                    'probably open this file and pass the filehandle into'
-                    'Beautiful Soup.' % markup)
+                    ' probably open this file and pass the filehandle into'
+                    ' Beautiful Soup.' % markup)
            self._check_markup_is_url(markup)

        for (self.markup, self.original_encoding, self.declared_html_encoding,
--- a/lib/bs4/builder/init.py
+++ b/lib/bs4/builder/init.py
@ -232,8 +232,13 @@ class HTMLTreeBuilder(TreeBuilder):
    """

    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
-    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
-                              'spacer', 'link', 'frame', 'base'])
+    empty_element_tags = set([
+        # These are from HTML5.
+        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+
+        # These are from HTML4, removed in HTML5.
+        'spacer', 'frame'
+    ])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@ -52,7 +52,31 @@ from bs4.builder import (
 HTMLPARSER = 'html.parser'

 class BeautifulSoupHTMLParser(HTMLParser):
-    def handle_starttag(self, name, attrs):
+
+    def __init__(self, *args, **kwargs):
+        HTMLParser.__init__(self, *args, **kwargs)
+
+        # Names of empty-element tags that handle_starttag() has
+        # already closed on its own, without waiting for a closing
+        # tag. If a closing tag with one of these names shows up
+        # later, handle_endtag() crosses off one entry and ignores
+        # that closing tag instead of forwarding it.
+        #
+        # Order is irrelevant, so this is a plain list, not a stack.
+        self.already_closed_empty_element = []
+    
+    def handle_startendtag(self, name, attrs):
+        # This is only called when the markup looks like
+        # <tag/>.
+
+        # handle_empty_element=False tells handle_starttag not to close
+        # the tag just because its name matches a known empty-element
+        # tag. We know that this is an empty-element tag and we want to
+        # call handle_endtag ourselves. (The local `tag` is not used.)
+        tag = self.handle_starttag(name, attrs, handle_empty_element=False)
+        self.handle_endtag(name)
+        
+    def handle_starttag(self, name, attrs, handle_empty_element=True):
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
@ -62,10 +86,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
                value = ''
            attr_dict[key] = value
            attrvalue = '""'
-        self.soup.handle_starttag(name, None, None, attr_dict)
+        #print "START", name
+        tag = self.soup.handle_starttag(name, None, None, attr_dict)
+        if tag and tag.is_empty_element and handle_empty_element:
+            # Unlike other parsers, html.parser doesn't send separate end tag
+            # events for empty-element tags. (It's handled in
+            # handle_startendtag, but only if the original markup looked like
+            # <tag/>.)
+            #
+            # So we need to call handle_endtag() ourselves. Since we
+            # know the start event is identical to the end event, we
+            # don't want handle_endtag() to cross off any previous end
+            # events for tags of this name.
+            self.handle_endtag(name, check_already_closed=False)

-    def handle_endtag(self, name):
-        self.soup.handle_endtag(name)
+            # But we might encounter an explicit closing tag for this tag
+            # later on. If so, we want to ignore it.
+            self.already_closed_empty_element.append(name)
+            
+    def handle_endtag(self, name, check_already_closed=True):
+        # Pass check_already_closed=False to always forward the end tag.
+        if check_already_closed and name in self.already_closed_empty_element:
+            # This is a redundant end tag for an empty-element tag.
+            # We've already called handle_endtag() for it, so just
+            # check it off the list.
+            # remove() drops one entry, so one end tag cancels one entry.
+            self.already_closed_empty_element.remove(name)
+        else:
+            self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)
@ -169,6 +217,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e
+        parser.already_closed_empty_element = []

 # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
 # 3.2.3 code. This ensures they don't treat markup like <p></p> as a
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@ -132,7 +132,7 @@ class PageElement(object):
    #
    # "html" - All Unicode characters with corresponding HTML entities
    #   are converted to those entities on output. 
-    # "minimal" - Bare ampersands and angle brackets are converted to
+   # "minimal" - Bare ampersands and angle brackets are converted to
    #   XML entities: &amp; &lt; &gt;
    # None - The null formatter. Unicode characters are never
    #   converted to entities.  This is not recommended, but it's
@ -535,9 +535,16 @@ class PageElement(object):
                return ResultSet(strainer, result)
            elif isinstance(name, basestring):
                # Optimization to find all tags with a given name.
+                if name.count(':') == 1:
+                    # This is a name with a prefix.
+                    prefix, name = name.split(':', 1)
+                else:
+                    prefix = None
                result = (element for element in generator
                          if isinstance(element, Tag)
-                            and element.name == name)
+                            and element.name == name
+                          and (prefix is None or element.prefix == prefix)
+                )
                return ResultSet(strainer, result)
        results = ResultSet(strainer)
        while True:
@ -863,7 +870,7 @@ class Tag(PageElement):
        Its contents are a copy of the old Tag's contents.
        """
        clone = type(self)(None, self.builder, self.name, self.namespace,
-                           self.nsprefix, self.attrs, is_xml=self._is_xml)
+                           self.prefix, self.attrs, is_xml=self._is_xml)
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(clone, attr, getattr(self, attr))
        for child in self.contents:
@ -985,6 +992,13 @@ class Tag(PageElement):
        attribute."""
        return self.attrs.get(key, default)

+    def get_attribute_list(self, key, default=None):
+        """Like get(), but wrap any non-list result (the default included) in a list."""
+        value = self.get(key, default)
+        if not isinstance(value, list):
+            value = [value]
+        return value
+    
    def has_attr(self, key):
        return key in self.attrs

@ -1698,7 +1712,7 @@ class SoupStrainer(object):
                "I don't know how to match against a %s" % markup.__class__)
        return found

-    def _matches(self, markup, match_against):
+    def _matches(self, markup, match_against, already_tried=None):
        # print u"Matching %s against %s" % (markup, match_against)
        result = False
        if isinstance(markup, list) or isinstance(markup, tuple):
@ -1723,6 +1737,7 @@ class SoupStrainer(object):

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
+        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

@ -1733,18 +1748,51 @@ class SoupStrainer(object):
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

-        if isinstance(match_against, unicode):
-            # Exact string match
-            return markup == match_against
+        if (hasattr(match_against, '__iter__')
+            and not isinstance(match_against, basestring)):
+            # We're asked to match against an iterable of items.
+            # The markup must match at least one item in the
+            # iterable. We'll try each one in turn.
+            #
+            # To avoid infinite recursion we need to keep track of
+            # items we've already seen.
+            if not already_tried:
+                already_tried = set()
+            for item in match_against:
+                if item.__hash__:
+                    key = item
+                else:
+                    key = id(item)
+                if key in already_tried:
+                    continue
+                else:
+                    already_tried.add(key)
+                    if self._matches(original_markup, item, already_tried):
+                        return True
+            else:
+                return False
        
-        if hasattr(match_against, 'match'):
+        # Beyond this point we might need to run the test twice: once against
+        # the tag's name and once against its prefixed name.
+        match = False
+        
+        if not match and isinstance(match_against, unicode):
+            # Exact string match
+            match = markup == match_against
+
+        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

-        if hasattr(match_against, '__iter__'):
-            # The markup must be an exact match against something
-            # in the iterable.
-            return markup in match_against
+        if (not match
+            and isinstance(original_markup, Tag)
+            and original_markup.prefix):
+            # Try the whole thing again with the prefixed tag name.
+            return self._matches(
+                original_markup.prefix + ':' + original_markup.name, match_against
+            )
+
+        return match


 class ResultSet(list):
@ -1753,3 +1801,8 @@ class ResultSet(list):
    def __init__(self, source, result=()):
        super(ResultSet, self).__init__(result)
        self.source = source
+
+    def __getattr__(self, key):  # only reached when normal attribute lookup fails
+        raise AttributeError(
+            "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
+        )