From 1ff79cee4d0091fd25730e59d32b72b7f60d0ae6 Mon Sep 17 00:00:00 2001 From: JackDandy Date: Sat, 26 Aug 2017 01:09:14 +0100 Subject: [PATCH] Update Beautiful Soup 4.4.0 (r397) to 4.6.0 (r449). --- CHANGES.md | 3 +- lib/bs4/__init__.py | 6 +-- lib/bs4/builder/__init__.py | 9 +++- lib/bs4/builder/_htmlparser.py | 57 ++++++++++++++++++++++-- lib/bs4/element.py | 79 ++++++++++++++++++++++++++++------ 5 files changed, 131 insertions(+), 23 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 0af4d142..0ba6b6c9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -10,7 +10,7 @@ * Change improve add show search results by comparing search term to an additional unidecoded result set * Change webserver startup to correctly use xheaders in reverse proxy or load balance set-ups * Update backports_abc 0.4 to 0.5 -* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439) +* Update Beautiful Soup 4.4.0 (r397) to 4.6.0 (r449) * Update cachecontrol library 0.11.5 to 0.12.3 (db54c40) * Update Certifi 2015.11.20.1 (385476b) to 2017.07.27 (f808089) * Update chardet packages 2.3.0 (d7fae98) to 3.0.4 (9b8c5c2) @@ -93,6 +93,7 @@ * Fix "too many redirects" or "no CSS/JS content" delivered * Change restart/shutdown to use updated jQuery * Remove AlphaReign torrent provider +* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439) * Update cachecontrol library 0.11.5 to 0.11.7 (3b3b776) * Update Certifi 2015.11.20.1 (385476b) to 2017.01.23 (9f9dc30) * Update Tornado Web Server 4.5.dev1 (92f29b8) to 4.5.dev1 (38e493e) diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py index 46caac04..c984ef6e 100644 --- a/lib/bs4/__init__.py +++ b/lib/bs4/__init__.py @@ -82,7 +82,7 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' - NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n" def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, @@ -215,8 +215,8 @@ class BeautifulSoup(Tag): markup = markup.encode("utf8") warnings.warn( '"%s" looks like a filename, not markup. You should' - 'probably open this file and pass the filehandle into' - 'Beautiful Soup.' % markup) + ' probably open this file and pass the filehandle into' + ' Beautiful Soup.' % markup) self._check_markup_is_url(markup) for (self.markup, self.original_encoding, self.declared_html_encoding, diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py index 601979bf..fdb3362f 100644 --- a/lib/bs4/builder/__init__.py +++ b/lib/bs4/builder/__init__.py @@ -232,8 +232,13 @@ class HTMLTreeBuilder(TreeBuilder): """ preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags - empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base']) + empty_element_tags = set([ + # These are from HTML5. + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', + + # These are from HTML4, removed in HTML5. + 'spacer', 'frame' + ]) # The HTML standard defines these attributes as containing a # space-separated list of values, not a single value. That is, diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py index 823ca15a..67890b3a 100644 --- a/lib/bs4/builder/_htmlparser.py +++ b/lib/bs4/builder/_htmlparser.py @@ -52,7 +52,31 @@ from bs4.builder import ( HTMLPARSER = 'html.parser' class BeautifulSoupHTMLParser(HTMLParser): - def handle_starttag(self, name, attrs): + + def __init__(self, *args, **kwargs): + HTMLParser.__init__(self, *args, **kwargs) + + # Keep a list of empty-element tags that were encountered + # without an explicit closing tag. If we encounter a closing tag + # of this type, we'll associate it with one of those entries. + # + # This isn't a stack because we don't care about the + # order. It's a list of closing tags we've already handled and + # will ignore, assuming they ever show up. + self.already_closed_empty_element = [] + + def handle_startendtag(self, name, attrs): + # This is only called when the markup looks like + # . + + # is_startend() tells handle_starttag not to close the tag + # just because its name matches a known empty-element tag. We + # know that this is an empty-element tag and we want to call + # handle_endtag ourselves. + tag = self.handle_starttag(name, attrs, handle_empty_element=False) + self.handle_endtag(name) + + def handle_starttag(self, name, attrs, handle_empty_element=True): # XXX namespace attr_dict = {} for key, value in attrs: @@ -62,10 +86,34 @@ class BeautifulSoupHTMLParser(HTMLParser): value = '' attr_dict[key] = value attrvalue = '""' - self.soup.handle_starttag(name, None, None, attr_dict) + #print "START", name + tag = self.soup.handle_starttag(name, None, None, attr_dict) + if tag and tag.is_empty_element and handle_empty_element: + # Unlike other parsers, html.parser doesn't send separate end tag + # events for empty-element tags. (It's handled in + # handle_startendtag, but only if the original markup looked like + # .) + # + # So we need to call handle_endtag() ourselves. Since we + # know the start event is identical to the end event, we + # don't want handle_endtag() to cross off any previous end + # events for tags of this name. + self.handle_endtag(name, check_already_closed=False) - def handle_endtag(self, name): - self.soup.handle_endtag(name) + # But we might encounter an explicit closing tag for this tag + # later on. If so, we want to ignore it. + self.already_closed_empty_element.append(name) + + def handle_endtag(self, name, check_already_closed=True): + #print "END", name + if check_already_closed and name in self.already_closed_empty_element: + # This is a redundant end tag for an empty-element tag. + # We've already called handle_endtag() for it, so just + # check it off the list. + # print "ALREADY CLOSED", name + self.already_closed_empty_element.remove(name) + else: + self.soup.handle_endtag(name) def handle_data(self, data): self.soup.handle_data(data) @@ -169,6 +217,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e + parser.already_closed_empty_element = [] # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some # 3.2.3 code. This ensures they don't treat markup like

as a diff --git a/lib/bs4/element.py b/lib/bs4/element.py index b100d18b..9ef75f81 100644 --- a/lib/bs4/element.py +++ b/lib/bs4/element.py @@ -131,8 +131,8 @@ class PageElement(object): # to methods like encode() and prettify(): # # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. - # "minimal" - Bare ampersands and angle brackets are converted to + # are converted to those entities on output. + # "minimal" - Bare ampersands and angle brackets are converted to # XML entities: & < > # None - The null formatter. Unicode characters are never # converted to entities. This is not recommended, but it's @@ -535,9 +535,16 @@ class PageElement(object): return ResultSet(strainer, result) elif isinstance(name, basestring): # Optimization to find all tags with a given name. + if name.count(':') == 1: + # This is a name with a prefix. + prefix, name = name.split(':', 1) + else: + prefix = None result = (element for element in generator if isinstance(element, Tag) - and element.name == name) + and element.name == name + and (prefix is None or element.prefix == prefix) + ) return ResultSet(strainer, result) results = ResultSet(strainer) while True: @@ -863,7 +870,7 @@ class Tag(PageElement): Its contents are a copy of the old Tag's contents. """ clone = type(self)(None, self.builder, self.name, self.namespace, - self.nsprefix, self.attrs, is_xml=self._is_xml) + self.prefix, self.attrs, is_xml=self._is_xml) for attr in ('can_be_empty_element', 'hidden'): setattr(clone, attr, getattr(self, attr)) for child in self.contents: @@ -985,6 +992,13 @@ class Tag(PageElement): attribute.""" return self.attrs.get(key, default) + def get_attribute_list(self, key, default=None): + """The same as get(), but always returns a list.""" + value = self.get(key, default) + if not isinstance(value, list): + value = [value] + return value + def has_attr(self, key): return key in self.attrs @@ -1698,7 +1712,7 @@ class SoupStrainer(object): "I don't know how to match against a %s" % markup.__class__) return found - def _matches(self, markup, match_against): + def _matches(self, markup, match_against, already_tried=None): # print u"Matching %s against %s" % (markup, match_against) result = False if isinstance(markup, list) or isinstance(markup, tuple): @@ -1713,7 +1727,7 @@ class SoupStrainer(object): if self._matches(' '.join(markup), match_against): return True return False - + if match_against is True: # True matches any non-None value. return markup is not None @@ -1723,6 +1737,7 @@ class SoupStrainer(object): # Custom callables take the tag as an argument, but all # other ways of matching match the tag name as a string. + original_markup = markup if isinstance(markup, Tag): markup = markup.name @@ -1733,18 +1748,51 @@ class SoupStrainer(object): # None matches None, False, an empty string, an empty list, and so on. return not match_against - if isinstance(match_against, unicode): + if (hasattr(match_against, '__iter__') + and not isinstance(match_against, basestring)): + # We're asked to match against an iterable of items. + # The markup must be match at least one item in the + # iterable. We'll try each one in turn. + # + # To avoid infinite recursion we need to keep track of + # items we've already seen. + if not already_tried: + already_tried = set() + for item in match_against: + if item.__hash__: + key = item + else: + key = id(item) + if key in already_tried: + continue + else: + already_tried.add(key) + if self._matches(original_markup, item, already_tried): + return True + else: + return False + + # Beyond this point we might need to run the test twice: once against + # the tag's name and once against its prefixed name. + match = False + + if not match and isinstance(match_against, unicode): # Exact string match - return markup == match_against + match = markup == match_against - if hasattr(match_against, 'match'): + if not match and hasattr(match_against, 'search'): # Regexp match return match_against.search(markup) - if hasattr(match_against, '__iter__'): - # The markup must be an exact match against something - # in the iterable. - return markup in match_against + if (not match + and isinstance(original_markup, Tag) + and original_markup.prefix): + # Try the whole thing again with the prefixed tag name. + return self._matches( + original_markup.prefix + ':' + original_markup.name, match_against + ) + + return match class ResultSet(list): @@ -1753,3 +1801,8 @@ class ResultSet(list): def __init__(self, source, result=()): super(ResultSet, self).__init__(result) self.source = source + + def __getattr__(self, key): + raise AttributeError( + "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key + )