From 682c6dae7a2fb41c252685fa5fb1305ac98d2a52 Mon Sep 17 00:00:00 2001 From: JackDandy Date: Sat, 14 Jan 2023 01:03:30 +0000 Subject: [PATCH] =?UTF-8?q?Update=20Beautiful=20Soup=204.9.3=20(r593)=20?= =?UTF-8?q?=E2=86=92=204.11.1=20(r642).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGES.md | 1 + lib/bs4/__init__.py | 133 +++++++----- lib/bs4/builder/__init__.py | 128 ++++++++++- lib/bs4/builder/_html5lib.py | 12 +- lib/bs4/builder/_htmlparser.py | 32 ++- lib/bs4/builder/_lxml.py | 70 +++++- lib/bs4/check_block.py | 4 - lib/bs4/dammit.py | 300 +++++++++++++++++++------- lib/bs4/diagnose.py | 10 +- lib/bs4/element.py | 382 +++++++++++++++++++++------------ lib/bs4/formatter.py | 43 +++- 11 files changed, 819 insertions(+), 296 deletions(-) delete mode 100644 lib/bs4/check_block.py diff --git a/CHANGES.md b/CHANGES.md index 7913370e..15e18ccb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,7 @@ ### 3.27.0 (202x-xx-xx xx:xx:00 UTC) * Update attr 20.3.0 (f3762ba) to 22.2.0 (a9960de) +* Update Beautiful Soup 4.9.3 (r593) to 4.11.1 (r642) * Update cachecontrol 0.12.6 (167a605) to 0.12.11 (c05ef9e) * Add filelock 3.9.0 (ce3e891) * Remove lockfile no longer used by cachecontrol diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py index 67b21af5..4d8ee829 100644 --- a/lib/bs4/__init__.py +++ b/lib/bs4/__init__.py @@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a provides methods and Pythonic idioms that make it easy to navigate, search, and modify the parse tree. -Beautiful Soup works with Python 2.7 and up. It works better if lxml +Beautiful Soup works with Python 3.5 and up. It works better if lxml and/or html5lib is installed. 
For more than you ever wanted to know about Beautiful Soup, see the @@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.9.3" -__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson" +__version__ = "4.11.1" +__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson" # Use of this source code is governed by the MIT license. __license__ = "MIT" @@ -29,7 +29,16 @@ import sys import traceback import warnings -from .builder import builder_registry, ParserRejectedMarkup +# The very first thing we do is give a useful error if someone is +# running this code under Python 2. +if sys.version_info.major < 3: + raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.') + +from .builder import ( + builder_registry, + ParserRejectedMarkup, + XMLParsedAsHTMLWarning, +) from .dammit import UnicodeDammit from .element import ( CData, @@ -49,10 +58,6 @@ from .element import ( TemplateString, ) -# The very first thing we do is give a useful error if someone is -# running this code under Python 3 without converting it. -'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' - # Define some custom warnings. class GuessedAtParserWarning(UserWarning): """The warning issued when BeautifulSoup has to guess what parser to @@ -65,7 +70,7 @@ class MarkupResemblesLocatorWarning(UserWarning): on disk. """ - + class BeautifulSoup(Tag): """A data structure representing a parsed HTML or XML document. 
@@ -205,10 +210,10 @@ class BeautifulSoup(Tag): if old_name in kwargs: warnings.warn( 'The "%s" argument to the BeautifulSoup constructor ' - 'has been renamed to "%s."' % (old_name, new_name)) - value = kwargs[old_name] - del kwargs[old_name] - return value + 'has been renamed to "%s."' % (old_name, new_name), + DeprecationWarning + ) + return kwargs.pop(old_name) return None parse_only = parse_only or deprecated_argument( @@ -303,39 +308,18 @@ class BeautifulSoup(Tag): self._namespaces = dict() self.parse_only = parse_only - self.builder.initialize_soup(self) - if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() elif len(markup) <= 256 and ( (isinstance(markup, bytes) and not b'<' in markup) or (isinstance(markup, str) and not '<' in markup) ): - # Print out warnings for a couple beginner problems + # Issue warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, - # just in case that's what the user really wants. - if (isinstance(markup, str) - and not os.path.supports_unicode_filenames): - possible_filename = markup.encode("utf8") - else: - possible_filename = markup - is_file = False - try: - is_file = os.path.exists(possible_filename) - except Exception as e: - # This is almost certainly a problem involving - # characters not valid in filenames on this - # system. Just let it go. - pass - if is_file: - warnings.warn( - '"%s" looks like a filename, not markup. You should' - ' probably open this file and pass the filehandle into' - ' Beautiful Soup.' % self._decode_markup(markup), - MarkupResemblesLocatorWarning - ) - self._check_markup_is_url(markup) + # since that is sometimes the intended behavior. 
+ if not self._markup_is_url(markup): + self._markup_resembles_filename(markup) rejections = [] success = False @@ -344,6 +328,7 @@ class BeautifulSoup(Tag): self.builder.prepare_markup( markup, from_encoding, exclude_encodings=exclude_encodings)): self.reset() + self.builder.initialize_soup(self) try: self._feed() success = True @@ -379,10 +364,10 @@ class BeautifulSoup(Tag): def __getstate__(self): # Frequently a tree builder can't be pickled. d = dict(self.__dict__) - if 'builder' in d and not self.builder.picklable: + if 'builder' in d and d['builder'] is not None and not self.builder.picklable: d['builder'] = None return d - + @classmethod def _decode_markup(cls, markup): """Ensure `markup` is bytes so it's safe to send into warnings.warn. @@ -397,11 +382,13 @@ class BeautifulSoup(Tag): return decoded @classmethod - def _check_markup_is_url(cls, markup): + def _markup_is_url(cls, markup): """Error-handling method to raise a warning if incoming markup looks like a URL. :param markup: A string. + :return: Whether or not the markup resembles a URL + closely enough to justify a warning. """ if isinstance(markup, bytes): space = b' ' @@ -410,20 +397,50 @@ class BeautifulSoup(Tag): space = ' ' cant_start_with = ("http:", "https:") else: - return + return False if any(markup.startswith(prefix) for prefix in cant_start_with): if not space in markup: warnings.warn( - '"%s" looks like a URL. Beautiful Soup is not an' - ' HTTP client. You should probably use an HTTP client like' - ' requests to get the document behind the URL, and feed' - ' that document to Beautiful Soup.' % cls._decode_markup( - markup - ), + 'The input looks more like a URL than markup. 
You may want to use' + ' an HTTP client like requests to get the document behind' + ' the URL, and feed that document to Beautiful Soup.', MarkupResemblesLocatorWarning ) + return True + return False + @classmethod + def _markup_resembles_filename(cls, markup): + """Error-handling method to raise a warning if incoming markup + resembles a filename. + + :param markup: A bytestring or string. + :return: Whether or not the markup resembles a filename + closely enough to justify a warning. + """ + path_characters = '/\\' + extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt'] + if isinstance(markup, bytes): + path_characters = path_characters.encode("utf8") + extensions = [x.encode('utf8') for x in extensions] + filelike = False + if any(x in markup for x in path_characters): + filelike = True + else: + lower = markup.lower() + if any(lower.endswith(ext) for ext in extensions): + filelike = True + if filelike: + warnings.warn( + 'The input looks more like a filename than markup. You may' + ' want to open this file and pass the filehandle into' + ' Beautiful Soup.', + MarkupResemblesLocatorWarning + ) + return True + return False + def _feed(self): """Internal method that parses previously set markup, creating a large number of Tag and NavigableString objects. @@ -485,7 +502,7 @@ class BeautifulSoup(Tag): # On top of that, we may be inside a tag that needs a special # container class. - if self.string_container_stack: + if self.string_container_stack and container is NavigableString: container = self.builder.string_containers.get( self.string_container_stack[-1].name, container ) @@ -541,9 +558,7 @@ class BeautifulSoup(Tag): def endData(self, containerClass=None): """Method called by the TreeBuilder when the end of a data segment occurs. 
- """ - containerClass = self.string_container(containerClass) - + """ if self.current_data: current_data = ''.join(self.current_data) # If whitespace is not preserved, and this string contains @@ -570,6 +585,7 @@ class BeautifulSoup(Tag): not self.parse_only.search(current_data)): return + containerClass = self.string_container(containerClass) o = containerClass(current_data) self.object_was_parsed(o) @@ -676,7 +692,7 @@ class BeautifulSoup(Tag): return most_recently_popped def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, - sourcepos=None): + sourcepos=None, namespaces=None): """Called by the tree builder when a new tag is encountered. :param name: Name of the tag. @@ -686,6 +702,8 @@ class BeautifulSoup(Tag): source document. :param sourcepos: The character position within `sourceline` where this tag was found. + :param namespaces: A dictionary of all namespace prefix mappings + currently in scope in the document. If this method returns None, the tag was rejected by an active SoupStrainer. You should proceed as if the tag had not occurred @@ -703,7 +721,8 @@ class BeautifulSoup(Tag): tag = self.element_classes.get(Tag, Tag)( self, self.builder, name, namespace, nsprefix, attrs, self.currentTag, self._most_recent_element, - sourceline=sourceline, sourcepos=sourcepos + sourceline=sourceline, sourcepos=sourcepos, + namespaces=namespaces ) if tag is None: return tag @@ -722,7 +741,7 @@ class BeautifulSoup(Tag): #print("End tag: " + name) self.endData() self._popToTag(name, nsprefix) - + def handle_data(self, data): """Called by the tree builder when a chunk of textual data is encountered.""" self.current_data.append(data) @@ -769,7 +788,9 @@ class BeautifulStoneSoup(BeautifulSoup): kwargs['features'] = 'xml' warnings.warn( 'The BeautifulStoneSoup class is deprecated. 
Instead of using ' - 'it, pass features="xml" into the BeautifulSoup constructor.') + 'it, pass features="xml" into the BeautifulSoup constructor.', + DeprecationWarning + ) super(BeautifulStoneSoup, self).__init__(*args, **kwargs) diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py index 0335ce6d..fa5017f5 100644 --- a/lib/bs4/builder/__init__.py +++ b/lib/bs4/builder/__init__.py @@ -3,10 +3,14 @@ __license__ = "MIT" from collections import defaultdict import itertools +import re +import warnings import sys from ..element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, + RubyParenthesisString, + RubyTextString, Stylesheet, Script, TemplateString, @@ -28,6 +32,12 @@ XML = 'xml' HTML = 'html' HTML_5 = 'html5' +class XMLParsedAsHTMLWarning(UserWarning): + """The warning issued when an HTML parser is used to parse + XML that is not XHTML. + """ + MESSAGE = """It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.""" + class TreeBuilderRegistry(object): """A way of looking up TreeBuilder subclasses by their name or by desired @@ -112,7 +122,7 @@ class TreeBuilder(object): # A value for these tag/attribute combinations is a space- or # comma-separated list of CDATA, rather than a single CDATA. - DEFAULT_CDATA_LIST_ATTRIBUTES = {} + DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list) # Whitespace should be preserved inside these tags. DEFAULT_PRESERVE_WHITESPACE_TAGS = set() @@ -234,7 +244,8 @@ class TreeBuilder(object): :param markup: Some markup -- probably a bytestring. :param user_specified_encoding: The user asked to try this encoding. 
:param document_declared_encoding: The markup itself claims to be - in this encoding. + in this encoding. NOTE: This argument is not used by the + calling code and can probably be removed. :param exclude_encodings: The user asked _not_ to try any of these encodings. @@ -318,7 +329,7 @@ class TreeBuilder(object): values = value attrs[attr] = values return attrs - + class SAXTreeBuilder(TreeBuilder): """A Beautiful Soup treebuilder that listens for SAX events. @@ -389,17 +400,25 @@ class HTMLTreeBuilder(TreeBuilder): # you need to use it. block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) - # The HTML standard defines an unusual content model for these tags. - # We represent this by using a string class other than NavigableString - # inside these tags. + # These HTML tags need special treatment so they can be + # represented by a string class other than NavigableString. # - # I made this list by going through the HTML spec + # For some of these tags, it's because the HTML standard defines + # an unusual content model for them. I made this list by going + # through the HTML spec # (https://html.spec.whatwg.org/#metadata-content) and looking for # "metadata content" elements that can contain strings. # + # The Ruby tags (<rt> and <rp>) are here despite being normal + # "phrasing content" tags, because the content they contain is + # qualitatively different from other text in the document, and it + # can be useful to be able to distinguish it. + # # TODO: Arguably