From 997e6955b20a17de231b979f75d72825923f07b1 Mon Sep 17 00:00:00 2001 From: JackDandy Date: Sun, 28 May 2023 13:58:26 +0100 Subject: [PATCH 01/29] =?UTF-8?q?Update=20Beautiful=20Soup=204.11.1=20(r64?= =?UTF-8?q?2)=20=E2=86=92=204.12.2=20and=20soupsieve=202.3.2.post1=20(792d?= =?UTF-8?q?566)=20=E2=86=92=202.4.1=20(2e66beb).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGES.md | 8 +- lib/bs4/__init__.py | 72 +++-- lib/bs4/builder/_html5lib.py | 10 +- lib/bs4/builder/_htmlparser.py | 166 ++-------- lib/bs4/css.py | 280 +++++++++++++++++ lib/bs4/diagnose.py | 15 - lib/bs4/element.py | 533 +++++++++++++++++++++------------ lib/bs4/formatter.py | 6 +- lib/soupsieve/__init__.py | 40 +-- lib/soupsieve/__meta__.py | 2 +- lib/soupsieve/css_match.py | 54 ++-- lib/soupsieve/css_parser.py | 53 ++-- lib/soupsieve/css_types.py | 24 +- lib/soupsieve/util.py | 6 +- 14 files changed, 794 insertions(+), 475 deletions(-) create mode 100644 lib/bs4/css.py diff --git a/CHANGES.md b/CHANGES.md index fc99521f..b63eabbb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,4 +1,10 @@ -### 3.29.2 (2023-05-28 07:45:00 UTC) +### 3.30.0 (2023-0x-xx xx:xx:00 UTC) + +* Update Beautiful Soup 4.11.1 (r642) to 4.12.2 +* Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb) + + +### 3.29.2 (2023-05-28 07:45:00 UTC) * Fix find show results returned as newest/oldest that are then sorted z to a * Fix add show "TheTVDB via Trakt" diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py index 4d8ee829..98092923 100644 --- a/lib/bs4/__init__.py +++ b/lib/bs4/__init__.py @@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a provides methods and Pythonic idioms that make it easy to navigate, search, and modify the parse tree. -Beautiful Soup works with Python 3.5 and up. It works better if lxml +Beautiful Soup works with Python 3.6 and up. It works better if lxml and/or html5lib is installed. For more than you ever wanted to know about Beautiful Soup, see the @@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.11.1" -__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson" +__version__ = "4.12.2" +__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson" # Use of this source code is governed by the MIT license. __license__ = "MIT" @@ -38,11 +38,13 @@ from .builder import ( builder_registry, ParserRejectedMarkup, XMLParsedAsHTMLWarning, + HTMLParserTreeBuilder ) from .dammit import UnicodeDammit from .element import ( CData, Comment, + CSS, DEFAULT_OUTPUT_ENCODING, Declaration, Doctype, @@ -116,7 +118,7 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. 
To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" - + def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs): @@ -211,7 +213,7 @@ class BeautifulSoup(Tag): warnings.warn( 'The "%s" argument to the BeautifulSoup constructor ' 'has been renamed to "%s."' % (old_name, new_name), - DeprecationWarning + DeprecationWarning, stacklevel=3 ) return kwargs.pop(old_name) return None @@ -348,25 +350,49 @@ class BeautifulSoup(Tag): self.markup = None self.builder.soup = None - def __copy__(self): - """Copy a BeautifulSoup object by converting the document to a string and parsing it again.""" - copy = type(self)( - self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' - ) + def _clone(self): + """Create a new BeautifulSoup object with the same TreeBuilder, + but not associated with any markup. - # Although we encoded the tree to UTF-8, that may not have - # been the encoding of the original markup. Set the copy's - # .original_encoding to reflect the original object's - # .original_encoding. - copy.original_encoding = self.original_encoding - return copy + This is the first step of the deepcopy process. + """ + clone = type(self)("", None, self.builder) + # Keep track of the encoding of the original document, + # since we won't be parsing it again. + clone.original_encoding = self.original_encoding + return clone + def __getstate__(self): # Frequently a tree builder can't be pickled. d = dict(self.__dict__) if 'builder' in d and d['builder'] is not None and not self.builder.picklable: - d['builder'] = None + d['builder'] = type(self.builder) + # Store the contents as a Unicode string. + d['contents'] = [] + d['markup'] = self.decode() + + # If _most_recent_element is present, it's a Tag object left + # over from initial parse. It might not be picklable and we + # don't need it. + if '_most_recent_element' in d: + del d['_most_recent_element'] return d + + def __setstate__(self, state): + # If necessary, restore the TreeBuilder by looking it up. + self.__dict__ = state + if isinstance(self.builder, type): + self.builder = self.builder() + elif not self.builder: + # We don't know which builder was used to build this + # parse tree, so use a default we know is always available. + self.builder = HTMLParserTreeBuilder() + self.builder.soup = self + self.reset() + self._feed() + return state + @classmethod def _decode_markup(cls, markup): @@ -405,7 +431,8 @@ class BeautifulSoup(Tag): 'The input looks more like a URL than markup. You may want to use' ' an HTTP client like requests to get the document behind' ' the URL, and feed that document to Beautiful Soup.', - MarkupResemblesLocatorWarning + MarkupResemblesLocatorWarning, + stacklevel=3 ) return True return False @@ -436,7 +463,7 @@ class BeautifulSoup(Tag): 'The input looks more like a filename than markup. 
You may' ' want to open this file and pass the filehandle into' ' Beautiful Soup.', - MarkupResemblesLocatorWarning + MarkupResemblesLocatorWarning, stacklevel=3 ) return True return False @@ -467,6 +494,7 @@ class BeautifulSoup(Tag): self.open_tag_counter = Counter() self.preserve_whitespace_tag_stack = [] self.string_container_stack = [] + self._most_recent_element = None self.pushTag(self) def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, @@ -748,7 +776,7 @@ class BeautifulSoup(Tag): def decode(self, pretty_print=False, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): + formatter="minimal", iterator=None): """Returns a string or Unicode representation of the parse tree as an HTML or XML document. @@ -775,7 +803,7 @@ class BeautifulSoup(Tag): else: indent_level = 0 return prefix + super(BeautifulSoup, self).decode( - indent_level, eventual_encoding, formatter) + indent_level, eventual_encoding, formatter, iterator) # Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup' _s = BeautifulSoup @@ -789,7 +817,7 @@ class BeautifulStoneSoup(BeautifulSoup): warnings.warn( 'The BeautifulStoneSoup class is deprecated. Instead of using ' 'it, pass features="xml" into the BeautifulSoup constructor.', - DeprecationWarning + DeprecationWarning, stacklevel=2 ) super(BeautifulStoneSoup, self).__init__(*args, **kwargs) diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py index 463613a8..a5711d5d 100644 --- a/lib/bs4/builder/_html5lib.py +++ b/lib/bs4/builder/_html5lib.py @@ -70,7 +70,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder): # ATM because the html5lib TreeBuilder doesn't use # UnicodeDammit. if exclude_encodings: - warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") + warnings.warn( + "You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.", + stacklevel=3 + ) # html5lib only parses HTML, so if it's given XML that's worth # noting. @@ -81,7 +84,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder): # These methods are defined by Beautiful Soup. def feed(self, markup): if self.soup.parse_only is not None: - warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") + warnings.warn( + "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.", + stacklevel=4 + ) parser = html5lib.HTMLParser(tree=self.create_treebuilder) self.underlying_builder.parser = parser extra_kwargs = dict() diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py index e37cdcde..4c5ced93 100644 --- a/lib/bs4/builder/_htmlparser.py +++ b/lib/bs4/builder/_htmlparser.py @@ -10,30 +10,9 @@ __all__ = [ from html.parser import HTMLParser -try: - from html.parser import HTMLParseError -except ImportError as e: - # HTMLParseError is removed in Python 3.5. Since it can never be - # thrown in 3.5, we can just define our own class as a placeholder. - class HTMLParseError(Exception): - pass - import sys import warnings -# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' -# argument, which we'd like to set to False. Unfortunately, -# http://bugs.python.org/issue13273 makes strict=True a better bet -# before Python 3.2.3. -# -# At the end of this file, we monkeypatch HTMLParser so that -# strict=True works well on Python 3.2.2. 
-major, minor, release = sys.version_info[:3] -CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 -CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 -CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 - - from ..element import ( CData, Comment, @@ -45,6 +24,7 @@ from ..dammit import EntitySubstitution, UnicodeDammit from ..builder import ( DetectsXMLParsedAsHTML, + ParserRejectedMarkup, HTML, HTMLTreeBuilder, STRICT, @@ -90,20 +70,23 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): self.already_closed_empty_element = [] self._initialize_xml_detector() - - def error(self, msg): - """In Python 3, HTMLParser subclasses must implement error(), although - this requirement doesn't appear to be documented. - In Python 2, HTMLParser implements error() by raising an exception, - which we don't want to do. + def error(self, message): + # NOTE: This method is required so long as Python 3.9 is + # supported. The corresponding code is removed from HTMLParser + # in 3.5, but not removed from ParserBase until 3.10. + # https://github.com/python/cpython/issues/76025 + # + # The original implementation turned the error into a warning, + # but in every case I discovered, this made HTMLParser + # immediately crash with an error message that was less + # helpful than the warning. The new implementation makes it + # more clear that html.parser just can't parse this + # markup. The 3.10 implementation does the same, though it + # raises AssertionError rather than calling a method. (We + # catch this error and wrap it in a ParserRejectedMarkup.) + raise ParserRejectedMarkup(message) - In any event, this method is called only on very strange - markup and our best strategy is to pretend it didn't happen - and keep going. - """ - warnings.warn(msg) - def handle_startendtag(self, name, attrs): """Handle an incoming empty-element tag. @@ -203,9 +186,10 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): :param name: Character number, possibly in hexadecimal. """ - # XXX workaround for a bug in HTMLParser. Remove this once - # it's fixed in all supported versions. - # http://bugs.python.org/issue13633 + # TODO: This was originally a workaround for a bug in + # HTMLParser. (http://bugs.python.org/issue13633) The bug has + # been fixed, but removing this code still makes some + # Beautiful Soup tests fail. This needs investigation. if name.startswith('x'): real_name = int(name.lstrip('x'), 16) elif name.startswith('X'): @@ -333,10 +317,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser_args = parser_args or [] parser_kwargs = parser_kwargs or {} parser_kwargs.update(extra_parser_kwargs) - if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: - parser_kwargs['strict'] = False - if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: - parser_kwargs['convert_charrefs'] = False + parser_kwargs['convert_charrefs'] = False self.parser_args = (parser_args, parser_kwargs) def prepare_markup(self, markup, user_specified_encoding=None, @@ -397,103 +378,10 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser.soup = self.soup try: parser.feed(markup) - parser.close() - except HTMLParseError as e: - warnings.warn(RuntimeWarning( - "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. 
See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) - raise e + except AssertionError as e: + # html.parser raises AssertionError in rare cases to + # indicate a fatal problem with the markup, especially + # when there's an error in the doctype declaration. + raise ParserRejectedMarkup(e) + parser.close() parser.already_closed_empty_element = [] - -# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some -# 3.2.3 code. This ensures they don't treat markup like
<p></p>
as a -# string. -# -# XXX This code can be removed once most Python 3 users are on 3.2.3. -if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: - import re - attrfind_tolerant = re.compile( - r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') - HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant - - locatestarttagend = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value - ) - )? - ) - )* - \s* # trailing whitespace -""", re.VERBOSE) - BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend - - from html.parser import tagfind, attrfind - - def parse_starttag(self, i): - self.__starttag_text = None - endpos = self.check_for_whole_start_tag(i) - if endpos < 0: - return endpos - rawdata = self.rawdata - self.__starttag_text = rawdata[i:endpos] - - # Now parse the data between i+1 and j into a tag and attrs - attrs = [] - match = tagfind.match(rawdata, i+1) - assert match, 'unexpected call to parse_starttag()' - k = match.end() - self.lasttag = tag = rawdata[i+1:k].lower() - while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) - if not m: - break - attrname, rest, attrvalue = m.group(1, 2, 3) - if not rest: - attrvalue = None - elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ - attrvalue[:1] == '"' == attrvalue[-1:]: - attrvalue = attrvalue[1:-1] - if attrvalue: - attrvalue = self.unescape(attrvalue) - attrs.append((attrname.lower(), attrvalue)) - k = m.end() - - end = rawdata[k:endpos].strip() - if end not in (">", "/>"): - lineno, offset = self.getpos() - if "\n" in self.__starttag_text: - lineno = lineno + self.__starttag_text.count("\n") - offset = len(self.__starttag_text) \ - - self.__starttag_text.rfind("\n") - else: - offset = offset + len(self.__starttag_text) - if self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) - self.handle_data(rawdata[i:endpos]) - return endpos - if end.endswith('/>'): - # XHTML-style empty tag: - self.handle_startendtag(tag, attrs) - else: - self.handle_starttag(tag, attrs) - if tag in self.CDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag) - return endpos - - def set_cdata_mode(self, elem): - self.cdata_elem = elem.lower() - self.interesting = re.compile(r'' % self.cdata_elem, re.I) - - BeautifulSoupHTMLParser.parse_starttag = parse_starttag - BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode - - CONSTRUCTOR_TAKES_STRICT = True diff --git a/lib/bs4/css.py b/lib/bs4/css.py new file mode 100644 index 00000000..572014b1 --- /dev/null +++ b/lib/bs4/css.py @@ -0,0 +1,280 @@ +"""Integration code for CSS selectors using Soup Sieve (pypi: soupsieve).""" + +import warnings +try: + import soupsieve +except ImportError as e: + soupsieve = None + warnings.warn( + 'The soupsieve package is not installed. CSS selectors cannot be used.' + ) + + +class CSS(object): + """A proxy object against the soupsieve library, to simplify its + CSS selector API. + + Acquire this object through the .css attribute on the + BeautifulSoup object, or on the Tag you want to use as the + starting point for a CSS selector. + + The main advantage of doing this is that the tag to be selected + against doesn't need to be explicitly specified in the function + calls, since it's already scoped to a tag. 
+ """ + + def __init__(self, tag, api=soupsieve): + """Constructor. + + You don't need to instantiate this class yourself; instead, + access the .css attribute on the BeautifulSoup object, or on + the Tag you want to use as the starting point for your CSS + selector. + + :param tag: All CSS selectors will use this as their starting + point. + + :param api: A plug-in replacement for the soupsieve module, + designed mainly for use in tests. + """ + if api is None: + raise NotImplementedError( + "Cannot execute CSS selectors because the soupsieve package is not installed." + ) + self.api = api + self.tag = tag + + def escape(self, ident): + """Escape a CSS identifier. + + This is a simple wrapper around soupselect.escape(). See the + documentation for that function for more information. + """ + if soupsieve is None: + raise NotImplementedError( + "Cannot escape CSS identifiers because the soupsieve package is not installed." + ) + return self.api.escape(ident) + + def _ns(self, ns, select): + """Normalize a dictionary of namespaces.""" + if not isinstance(select, self.api.SoupSieve) and ns is None: + # If the selector is a precompiled pattern, it already has + # a namespace context compiled in, which cannot be + # replaced. + ns = self.tag._namespaces + return ns + + def _rs(self, results): + """Normalize a list of results to a Resultset. + + A ResultSet is more consistent with the rest of Beautiful + Soup's API, and ResultSet.__getattr__ has a helpful error + message if you try to treat a list of results as a single + result (a common mistake). + """ + # Import here to avoid circular import + from .element import ResultSet + return ResultSet(None, results) + + def compile(self, select, namespaces=None, flags=0, **kwargs): + """Pre-compile a selector and return the compiled object. + + :param selector: A CSS selector. + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. + + :param flags: Flags to be passed into Soup Sieve's + soupsieve.compile() method. + + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.compile() method. + + :return: A precompiled selector object. + :rtype: soupsieve.SoupSieve + """ + return self.api.compile( + select, self._ns(namespaces, select), flags, **kwargs + ) + + def select_one(self, select, namespaces=None, flags=0, **kwargs): + """Perform a CSS selection operation on the current Tag and return the + first result. + + This uses the Soup Sieve library. For more information, see + that library's documentation for the soupsieve.select_one() + method. + + :param selector: A CSS selector. + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. + + :param flags: Flags to be passed into Soup Sieve's + soupsieve.select_one() method. + + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.select_one() method. + + :return: A Tag, or None if the selector has no match. + :rtype: bs4.element.Tag + + """ + return self.api.select_one( + select, self.tag, self._ns(namespaces, select), flags, **kwargs + ) + + def select(self, select, namespaces=None, limit=0, flags=0, **kwargs): + """Perform a CSS selection operation on the current Tag. + + This uses the Soup Sieve library. 
For more information, see + that library's documentation for the soupsieve.select() + method. + + :param selector: A string containing a CSS selector. + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will pass in the prefixes it encountered while + parsing the document. + + :param limit: After finding this number of results, stop looking. + + :param flags: Flags to be passed into Soup Sieve's + soupsieve.select() method. + + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.select() method. + + :return: A ResultSet of Tag objects. + :rtype: bs4.element.ResultSet + + """ + if limit is None: + limit = 0 + + return self._rs( + self.api.select( + select, self.tag, self._ns(namespaces, select), limit, flags, + **kwargs + ) + ) + + def iselect(self, select, namespaces=None, limit=0, flags=0, **kwargs): + """Perform a CSS selection operation on the current Tag. + + This uses the Soup Sieve library. For more information, see + that library's documentation for the soupsieve.iselect() + method. It is the same as select(), but it returns a generator + instead of a list. + + :param selector: A string containing a CSS selector. + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will pass in the prefixes it encountered while + parsing the document. + + :param limit: After finding this number of results, stop looking. + + :param flags: Flags to be passed into Soup Sieve's + soupsieve.iselect() method. + + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.iselect() method. + + :return: A generator + :rtype: types.GeneratorType + """ + return self.api.iselect( + select, self.tag, self._ns(namespaces, select), limit, flags, **kwargs + ) + + def closest(self, select, namespaces=None, flags=0, **kwargs): + """Find the Tag closest to this one that matches the given selector. + + This uses the Soup Sieve library. For more information, see + that library's documentation for the soupsieve.closest() + method. + + :param selector: A string containing a CSS selector. + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will pass in the prefixes it encountered while + parsing the document. + + :param flags: Flags to be passed into Soup Sieve's + soupsieve.closest() method. + + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.closest() method. + + :return: A Tag, or None if there is no match. + :rtype: bs4.Tag + + """ + return self.api.closest( + select, self.tag, self._ns(namespaces, select), flags, **kwargs + ) + + def match(self, select, namespaces=None, flags=0, **kwargs): + """Check whether this Tag matches the given CSS selector. + + This uses the Soup Sieve library. For more information, see + that library's documentation for the soupsieve.match() + method. + + :param: a CSS selector. + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will pass in the prefixes it encountered while + parsing the document. + + :param flags: Flags to be passed into Soup Sieve's + soupsieve.match() method. + + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.match() method. + + :return: True if this Tag matches the selector; False otherwise. 
+ :rtype: bool + """ + return self.api.match( + select, self.tag, self._ns(namespaces, select), flags, **kwargs + ) + + def filter(self, select, namespaces=None, flags=0, **kwargs): + """Filter this Tag's direct children based on the given CSS selector. + + This uses the Soup Sieve library. It works the same way as + passing this Tag into that library's soupsieve.filter() + method. More information, for more information see the + documentation for soupsieve.filter(). + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will pass in the prefixes it encountered while + parsing the document. + + :param flags: Flags to be passed into Soup Sieve's + soupsieve.filter() method. + + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.filter() method. + + :return: A ResultSet of Tag objects. + :rtype: bs4.element.ResultSet + + """ + return self._rs( + self.api.filter( + select, self.tag, self._ns(namespaces, select), flags, **kwargs + ) + ) diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py index e458729b..0b1c1e53 100644 --- a/lib/bs4/diagnose.py +++ b/lib/bs4/diagnose.py @@ -59,21 +59,6 @@ def diagnose(data): if hasattr(data, 'read'): data = data.read() - elif data.startswith("http:") or data.startswith("https:"): - print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)) - print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") - return - else: - try: - if os.path.exists(data): - print(('"%s" looks like a filename. Reading data from the file.' % data)) - with open(data) as fp: - data = fp.read() - except ValueError: - # This can happen on some platforms when the 'filename' is - # too long. Assume it's data and not a filename. - pass - print("") for parser in basic_parsers: print(("Trying to parse your markup with %s" % parser)) diff --git a/lib/bs4/element.py b/lib/bs4/element.py index 0eea8733..99fc8137 100644 --- a/lib/bs4/element.py +++ b/lib/bs4/element.py @@ -8,14 +8,8 @@ except ImportError as e: import re import sys import warnings -try: - import soupsieve -except ImportError as e: - soupsieve = None - warnings.warn( - 'The soupsieve package is not installed. CSS selectors cannot be used.' - ) +from .css import CSS from .formatter import ( Formatter, HTMLFormatter, @@ -69,13 +63,13 @@ PYTHON_SPECIFIC_ENCODINGS = set([ "string-escape", "string_escape", ]) - + class NamespacedAttribute(str): """A namespaced string (e.g. 'xml:lang') that remembers the namespace ('xml') and the name ('lang') that were used to create it. """ - + def __new__(cls, prefix, name=None, namespace=None): if not name: # This is the default namespace. Its name "has no value" @@ -146,14 +140,19 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) - + class PageElement(object): """Contains the navigational information for some part of the page: that is, its current location in the parse tree. NavigableString, Tag, etc. are all subclasses of PageElement. """ - + + # In general, we can't tell just by looking at an element whether + # it's contained in an XML document or an HTML document. But for + # Tags (q.v.) we can store this information at parse time. 
+ known_xml = None + def setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None): """Sets up the initial relations between this element and @@ -163,7 +162,7 @@ class PageElement(object): :param previous_element: The element parsed immediately before this one. - + :param next_element: The element parsed immediately before this one. @@ -257,11 +256,11 @@ class PageElement(object): default = object() def _all_strings(self, strip=False, types=default): """Yield all strings of certain classes, possibly stripping them. - + This is implemented differently in Tag and NavigableString. """ raise NotImplementedError() - + @property def stripped_strings(self): """Yield all strings in this PageElement, stripping them first. @@ -294,11 +293,11 @@ class PageElement(object): strip, types=types)]) getText = get_text text = property(get_text) - + def replace_with(self, *args): - """Replace this PageElement with one or more PageElements, keeping the + """Replace this PageElement with one or more PageElements, keeping the rest of the tree the same. - + :param args: One or more PageElements. :return: `self`, no longer part of the tree. """ @@ -410,7 +409,7 @@ class PageElement(object): This works the same way as `list.insert`. :param position: The numeric position that should be occupied - in `self.children` by the new PageElement. + in `self.children` by the new PageElement. :param new_child: A PageElement. """ if new_child is None: @@ -496,13 +495,16 @@ class PageElement(object): def extend(self, tags): """Appends the given PageElements to this one's contents. - :param tags: A list of PageElements. + :param tags: A list of PageElements. If a single Tag is + provided instead, this PageElement's contents will be extended + with that Tag's contents. """ if isinstance(tags, Tag): - # Calling self.append() on another tag's contents will change - # the list we're iterating over. Make a list that won't - # change. - tags = list(tags.contents) + tags = tags.contents + if isinstance(tags, list): + # Moving items around the tree may change their position in + # the original list. Make a list that won't change. + tags = list(tags) for tag in tags: self.append(tag) @@ -543,7 +545,7 @@ class PageElement(object): "Element has no parent, so 'after' has no meaning.") if any(x is self for x in args): raise ValueError("Can't insert an element after itself.") - + offset = 0 for successor in args: # Extract first so that the index won't be screwed up if they @@ -586,8 +588,9 @@ class PageElement(object): :kwargs: A dictionary of filters on attribute values. :return: A ResultSet containing PageElements. """ + _stacklevel = kwargs.pop('_stacklevel', 2) return self._find_all(name, attrs, string, limit, self.next_elements, - **kwargs) + _stacklevel=_stacklevel+1, **kwargs) findAllNext = find_all_next # BS3 def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs): @@ -624,8 +627,11 @@ class PageElement(object): :return: A ResultSet of PageElements. :rtype: bs4.element.ResultSet """ - return self._find_all(name, attrs, string, limit, - self.next_siblings, **kwargs) + _stacklevel = kwargs.pop('_stacklevel', 2) + return self._find_all( + name, attrs, string, limit, + self.next_siblings, _stacklevel=_stacklevel+1, **kwargs + ) findNextSiblings = find_next_siblings # BS3 fetchNextSiblings = find_next_siblings # BS2 @@ -663,8 +669,11 @@ class PageElement(object): :return: A ResultSet of PageElements. 
:rtype: bs4.element.ResultSet """ - return self._find_all(name, attrs, string, limit, self.previous_elements, - **kwargs) + _stacklevel = kwargs.pop('_stacklevel', 2) + return self._find_all( + name, attrs, string, limit, self.previous_elements, + _stacklevel=_stacklevel+1, **kwargs + ) findAllPrevious = find_all_previous # BS3 fetchPrevious = find_all_previous # BS2 @@ -702,8 +711,11 @@ class PageElement(object): :return: A ResultSet of PageElements. :rtype: bs4.element.ResultSet """ - return self._find_all(name, attrs, string, limit, - self.previous_siblings, **kwargs) + _stacklevel = kwargs.pop('_stacklevel', 2) + return self._find_all( + name, attrs, string, limit, + self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs + ) findPreviousSiblings = find_previous_siblings # BS3 fetchPreviousSiblings = find_previous_siblings # BS2 @@ -724,7 +736,7 @@ class PageElement(object): # NOTE: We can't use _find_one because findParents takes a different # set of arguments. r = None - l = self.find_parents(name, attrs, 1, **kwargs) + l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs) if l: r = l[0] return r @@ -744,8 +756,9 @@ class PageElement(object): :return: A PageElement. :rtype: bs4.element.Tag | bs4.element.NavigableString """ + _stacklevel = kwargs.pop('_stacklevel', 2) return self._find_all(name, attrs, None, limit, self.parents, - **kwargs) + _stacklevel=_stacklevel+1, **kwargs) findParents = find_parents # BS3 fetchParents = find_parents # BS2 @@ -771,19 +784,20 @@ class PageElement(object): def _find_one(self, method, name, attrs, string, **kwargs): r = None - l = method(name, attrs, string, 1, **kwargs) + l = method(name, attrs, string, 1, _stacklevel=4, **kwargs) if l: r = l[0] return r def _find_all(self, name, attrs, string, limit, generator, **kwargs): "Iterates over a generator looking for things that match." + _stacklevel = kwargs.pop('_stacklevel', 3) if string is None and 'text' in kwargs: string = kwargs.pop('text') warnings.warn( "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.", - DeprecationWarning + DeprecationWarning, stacklevel=_stacklevel ) if isinstance(name, SoupStrainer): @@ -897,7 +911,7 @@ class PageElement(object): :rtype: bool """ return getattr(self, '_decomposed', False) or False - + # Old non-property versions of the generators, for backwards # compatibility with BS3. def nextGenerator(self): @@ -921,16 +935,11 @@ class NavigableString(str, PageElement): When Beautiful Soup parses the markup penguin, it will create a NavigableString for the string "penguin". - """ + """ PREFIX = '' SUFFIX = '' - # We can't tell just by looking at a string whether it's contained - # in an XML document or an HTML document. - - known_xml = None - def __new__(cls, value): """Create a new NavigableString. @@ -946,12 +955,22 @@ class NavigableString(str, PageElement): u.setup() return u - def __copy__(self): + def __deepcopy__(self, memo, recursive=False): """A copy of a NavigableString has the same contents and class as the original, but it is not connected to the parse tree. + + :param recursive: This parameter is ignored; it's only defined + so that NavigableString.__deepcopy__ implements the same + signature as Tag.__deepcopy__. """ return type(self)(self) + def __copy__(self): + """A copy of a NavigableString can only be a deep copy, because + only one PageElement can occupy a given place in a parse tree. 
+ """ + return self.__deepcopy__({}) + def __getnewargs__(self): return (str(self),) @@ -1044,10 +1063,10 @@ class PreformattedString(NavigableString): as comments (the Comment class) and CDATA blocks (the CData class). """ - + PREFIX = '' SUFFIX = '' - + def output_ready(self, formatter=None): """Make this string ready for output by adding any subclass-specific prefix or suffix. @@ -1129,7 +1148,7 @@ class Stylesheet(NavigableString): """ pass - + class Script(NavigableString): """A NavigableString representing an executable script (probably Javascript). @@ -1235,7 +1254,7 @@ class Tag(PageElement): if ((not builder or builder.store_line_numbers) and (sourceline is not None or sourcepos is not None)): self.sourceline = sourceline - self.sourcepos = sourcepos + self.sourcepos = sourcepos if attrs is None: attrs = {} elif attrs: @@ -1293,25 +1312,60 @@ class Tag(PageElement): self.interesting_string_types = builder.string_containers[self.name] else: self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES - + parserClass = _alias("parser_class") # BS3 - def __copy__(self): - """A copy of a Tag is a new Tag, unconnected to the parse tree. + def __deepcopy__(self, memo, recursive=True): + """A deepcopy of a Tag is a new Tag, unconnected to the parse tree. Its contents are a copy of the old Tag's contents. """ + clone = self._clone() + + if recursive: + # Clone this tag's descendants recursively, but without + # making any recursive function calls. + tag_stack = [clone] + for event, element in self._event_stream(self.descendants): + if event is Tag.END_ELEMENT_EVENT: + # Stop appending incoming Tags to the Tag that was + # just closed. + tag_stack.pop() + else: + descendant_clone = element.__deepcopy__( + memo, recursive=False + ) + # Add to its parent's .contents + tag_stack[-1].append(descendant_clone) + + if event is Tag.START_ELEMENT_EVENT: + # Add the Tag itself to the stack so that its + # children will be .appended to it. + tag_stack.append(descendant_clone) + return clone + + def __copy__(self): + """A copy of a Tag must always be a deep copy, because a Tag's + children can only have one parent at a time. + """ + return self.__deepcopy__({}) + + def _clone(self): + """Create a new Tag just like this one, but with no + contents and unattached to any parse tree. + + This is the first step in the deepcopy process. + """ clone = type(self)( None, self.builder, self.name, self.namespace, self.prefix, self.attrs, is_xml=self._is_xml, sourceline=self.sourceline, sourcepos=self.sourcepos, can_be_empty_element=self.can_be_empty_element, cdata_list_attributes=self.cdata_list_attributes, - preserve_whitespace_tags=self.preserve_whitespace_tags + preserve_whitespace_tags=self.preserve_whitespace_tags, + interesting_string_types=self.interesting_string_types ) for attr in ('can_be_empty_element', 'hidden'): setattr(clone, attr, getattr(self, attr)) - for child in self.contents: - clone.append(child.__copy__()) return clone @property @@ -1417,7 +1471,7 @@ class Tag(PageElement): i.contents = [] i._decomposed = True i = n - + def clear(self, decompose=False): """Wipe out all children of this PageElement by calling extract() on them. @@ -1505,7 +1559,7 @@ class Tag(PageElement): if not isinstance(value, list): value = [value] return value - + def has_attr(self, key): """Does this PageElement have an attribute with the given name?""" return key in self.attrs @@ -1558,7 +1612,7 @@ class Tag(PageElement): '.%(name)sTag is deprecated, use .find("%(name)s") instead. 
If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( name=tag_name ), - DeprecationWarning + DeprecationWarning, stacklevel=2 ) return self.find(tag_name) # We special case contents to avoid recursion. @@ -1592,7 +1646,7 @@ class Tag(PageElement): def __repr__(self, encoding="unicode-escape"): """Renders this PageElement as a string. - :param encoding: The encoding to use (Python 2 only). + :param encoding: The encoding to use (Python 2 only). TODO: This is now ignored and a warning should be issued if a value is provided. :return: A (Unicode) string. @@ -1634,106 +1688,212 @@ class Tag(PageElement): def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Render a Unicode representation of this PageElement and its - contents. - - :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. - :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a tag that mentions the document's - encoding. - :param formatter: A Formatter object, or a string naming one of - the standard formatters. - """ - + formatter="minimal", + iterator=None): + pieces = [] # First off, turn a non-Formatter `formatter` into a Formatter # object. This will stop the lookup from happening over and # over again. if not isinstance(formatter, Formatter): formatter = self.formatter_for_name(formatter) - attributes = formatter.attributes(self) - attrs = [] - for key, val in attributes: - if val is None: - decoded = key + + if indent_level is True: + indent_level = 0 + + # The currently active tag that put us into string literal + # mode. Until this element is closed, children will be treated + # as string literals and not pretty-printed. String literal + # mode is turned on immediately after this tag begins, and + # turned off immediately before it's closed. This means there + # will be whitespace before and after the tag itself. + string_literal_tag = None + + for event, element in self._event_stream(iterator): + if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT): + piece = element._format_tag( + eventual_encoding, formatter, opening=True + ) + elif event is Tag.END_ELEMENT_EVENT: + piece = element._format_tag( + eventual_encoding, formatter, opening=False + ) + if indent_level is not None: + indent_level -= 1 else: - if isinstance(val, list) or isinstance(val, tuple): - val = ' '.join(val) - elif not isinstance(val, str): - val = str(val) - elif ( - isinstance(val, AttributeValueWithCharsetSubstitution) - and eventual_encoding is not None - ): - val = val.encode(eventual_encoding) + piece = element.output_ready(formatter) - text = formatter.attribute_value(val) - decoded = ( - str(key) + '=' - + formatter.quoted_attribute_value(text)) - attrs.append(decoded) - close = '' - closeTag = '' + # Now we need to apply the 'prettiness' -- extra + # whitespace before and/or after this tag. This can get + # complicated because certain tags, like
<pre> and <textarea>
+            #