Update Beautiful Soup 4.11.1 (r642) → 4.12.2 and soupsieve 2.3.2.post1 (792d566) → 2.4.1 (2e66beb).

This commit is contained in:
JackDandy 2023-05-28 13:58:26 +01:00
parent 18370cebab
commit 997e6955b2
14 changed files with 794 additions and 475 deletions

View file

@ -1,4 +1,10 @@
### 3.29.2 (2023-05-28 07:45:00 UTC) ### 3.30.0 (2023-0x-xx xx:xx:00 UTC)
* Update Beautiful Soup 4.11.1 (r642) to 4.12.2
* Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb)
### 3.29.2 (2023-05-28 07:45:00 UTC)
* Fix find show results returned as newest/oldest that are then sorted z to a * Fix find show results returned as newest/oldest that are then sorted z to a
* Fix add show "TheTVDB via Trakt" * Fix add show "TheTVDB via Trakt"

View file

@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
provides methods and Pythonic idioms that make it easy to navigate, provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree. search, and modify the parse tree.
Beautiful Soup works with Python 3.5 and up. It works better if lxml Beautiful Soup works with Python 3.6 and up. It works better if lxml
and/or html5lib is installed. and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the For more than you ever wanted to know about Beautiful Soup, see the
@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
""" """
__author__ = "Leonard Richardson (leonardr@segfault.org)" __author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.11.1" __version__ = "4.12.2"
__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson" __copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
# Use of this source code is governed by the MIT license. # Use of this source code is governed by the MIT license.
__license__ = "MIT" __license__ = "MIT"
@ -38,11 +38,13 @@ from .builder import (
builder_registry, builder_registry,
ParserRejectedMarkup, ParserRejectedMarkup,
XMLParsedAsHTMLWarning, XMLParsedAsHTMLWarning,
HTMLParserTreeBuilder
) )
from .dammit import UnicodeDammit from .dammit import UnicodeDammit
from .element import ( from .element import (
CData, CData,
Comment, Comment,
CSS,
DEFAULT_OUTPUT_ENCODING, DEFAULT_OUTPUT_ENCODING,
Declaration, Declaration,
Doctype, Doctype,
@ -116,7 +118,7 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
def __init__(self, markup="", features=None, builder=None, def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None, parse_only=None, from_encoding=None, exclude_encodings=None,
element_classes=None, **kwargs): element_classes=None, **kwargs):
@ -211,7 +213,7 @@ class BeautifulSoup(Tag):
warnings.warn( warnings.warn(
'The "%s" argument to the BeautifulSoup constructor ' 'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name), 'has been renamed to "%s."' % (old_name, new_name),
DeprecationWarning DeprecationWarning, stacklevel=3
) )
return kwargs.pop(old_name) return kwargs.pop(old_name)
return None return None
@ -348,25 +350,49 @@ class BeautifulSoup(Tag):
self.markup = None self.markup = None
self.builder.soup = None self.builder.soup = None
def __copy__(self): def _clone(self):
"""Copy a BeautifulSoup object by converting the document to a string and parsing it again.""" """Create a new BeautifulSoup object with the same TreeBuilder,
copy = type(self)( but not associated with any markup.
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
)
# Although we encoded the tree to UTF-8, that may not have This is the first step of the deepcopy process.
# been the encoding of the original markup. Set the copy's """
# .original_encoding to reflect the original object's clone = type(self)("", None, self.builder)
# .original_encoding.
copy.original_encoding = self.original_encoding
return copy
# Keep track of the encoding of the original document,
# since we won't be parsing it again.
clone.original_encoding = self.original_encoding
return clone
def __getstate__(self): def __getstate__(self):
# Frequently a tree builder can't be pickled. # Frequently a tree builder can't be pickled.
d = dict(self.__dict__) d = dict(self.__dict__)
if 'builder' in d and d['builder'] is not None and not self.builder.picklable: if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
d['builder'] = None d['builder'] = type(self.builder)
# Store the contents as a Unicode string.
d['contents'] = []
d['markup'] = self.decode()
# If _most_recent_element is present, it's a Tag object left
# over from initial parse. It might not be picklable and we
# don't need it.
if '_most_recent_element' in d:
del d['_most_recent_element']
return d return d
def __setstate__(self, state):
    """Restore this object from a pickled attribute dictionary.

    The ``builder`` entry of `state` may be a TreeBuilder class
    rather than an instance, or may be empty. A usable builder
    instance is put in place before the object is reset and fed
    again.

    :param state: The attribute dictionary to adopt as ``__dict__``.
    :return: The same `state` object that was passed in.
    """
    self.__dict__ = state
    if not isinstance(self.builder, type):
        if not self.builder:
            # Nothing tells us which builder produced this parse
            # tree, so fall back to one that is always available.
            self.builder = HTMLParserTreeBuilder()
    else:
        # Only the builder's class was stored; instantiate it.
        self.builder = self.builder()
    # Link the builder back to this object before starting over.
    self.builder.soup = self
    self.reset()
    self._feed()
    return state
@classmethod @classmethod
def _decode_markup(cls, markup): def _decode_markup(cls, markup):
@ -405,7 +431,8 @@ class BeautifulSoup(Tag):
'The input looks more like a URL than markup. You may want to use' 'The input looks more like a URL than markup. You may want to use'
' an HTTP client like requests to get the document behind' ' an HTTP client like requests to get the document behind'
' the URL, and feed that document to Beautiful Soup.', ' the URL, and feed that document to Beautiful Soup.',
MarkupResemblesLocatorWarning MarkupResemblesLocatorWarning,
stacklevel=3
) )
return True return True
return False return False
@ -436,7 +463,7 @@ class BeautifulSoup(Tag):
'The input looks more like a filename than markup. You may' 'The input looks more like a filename than markup. You may'
' want to open this file and pass the filehandle into' ' want to open this file and pass the filehandle into'
' Beautiful Soup.', ' Beautiful Soup.',
MarkupResemblesLocatorWarning MarkupResemblesLocatorWarning, stacklevel=3
) )
return True return True
return False return False
@ -467,6 +494,7 @@ class BeautifulSoup(Tag):
self.open_tag_counter = Counter() self.open_tag_counter = Counter()
self.preserve_whitespace_tag_stack = [] self.preserve_whitespace_tag_stack = []
self.string_container_stack = [] self.string_container_stack = []
self._most_recent_element = None
self.pushTag(self) self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
@ -748,7 +776,7 @@ class BeautifulSoup(Tag):
def decode(self, pretty_print=False, def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING, eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"): formatter="minimal", iterator=None):
"""Returns a string or Unicode representation of the parse tree """Returns a string or Unicode representation of the parse tree
as an HTML or XML document. as an HTML or XML document.
@ -775,7 +803,7 @@ class BeautifulSoup(Tag):
else: else:
indent_level = 0 indent_level = 0
return prefix + super(BeautifulSoup, self).decode( return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter) indent_level, eventual_encoding, formatter, iterator)
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup' # Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
_s = BeautifulSoup _s = BeautifulSoup
@ -789,7 +817,7 @@ class BeautifulStoneSoup(BeautifulSoup):
warnings.warn( warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using ' 'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.', 'it, pass features="xml" into the BeautifulSoup constructor.',
DeprecationWarning DeprecationWarning, stacklevel=2
) )
super(BeautifulStoneSoup, self).__init__(*args, **kwargs) super(BeautifulStoneSoup, self).__init__(*args, **kwargs)

View file

@ -70,7 +70,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# ATM because the html5lib TreeBuilder doesn't use # ATM because the html5lib TreeBuilder doesn't use
# UnicodeDammit. # UnicodeDammit.
if exclude_encodings: if exclude_encodings:
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") warnings.warn(
"You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
stacklevel=3
)
# html5lib only parses HTML, so if it's given XML that's worth # html5lib only parses HTML, so if it's given XML that's worth
# noting. # noting.
@ -81,7 +84,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# These methods are defined by Beautiful Soup. # These methods are defined by Beautiful Soup.
def feed(self, markup): def feed(self, markup):
if self.soup.parse_only is not None: if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") warnings.warn(
"You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
stacklevel=4
)
parser = html5lib.HTMLParser(tree=self.create_treebuilder) parser = html5lib.HTMLParser(tree=self.create_treebuilder)
self.underlying_builder.parser = parser self.underlying_builder.parser = parser
extra_kwargs = dict() extra_kwargs = dict()

View file

@ -10,30 +10,9 @@ __all__ = [
from html.parser import HTMLParser from html.parser import HTMLParser
try:
from html.parser import HTMLParseError
except ImportError as e:
# HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception):
pass
import sys import sys
import warnings import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
from ..element import ( from ..element import (
CData, CData,
Comment, Comment,
@ -45,6 +24,7 @@ from ..dammit import EntitySubstitution, UnicodeDammit
from ..builder import ( from ..builder import (
DetectsXMLParsedAsHTML, DetectsXMLParsedAsHTML,
ParserRejectedMarkup,
HTML, HTML,
HTMLTreeBuilder, HTMLTreeBuilder,
STRICT, STRICT,
@ -90,20 +70,23 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
self.already_closed_empty_element = [] self.already_closed_empty_element = []
self._initialize_xml_detector() self._initialize_xml_detector()
def error(self, msg):
"""In Python 3, HTMLParser subclasses must implement error(), although
this requirement doesn't appear to be documented.
In Python 2, HTMLParser implements error() by raising an exception, def error(self, message):
which we don't want to do. # NOTE: This method is required so long as Python 3.9 is
# supported. The corresponding code is removed from HTMLParser
# in 3.5, but not removed from ParserBase until 3.10.
# https://github.com/python/cpython/issues/76025
#
# The original implementation turned the error into a warning,
# but in every case I discovered, this made HTMLParser
# immediately crash with an error message that was less
# helpful than the warning. The new implementation makes it
# more clear that html.parser just can't parse this
# markup. The 3.10 implementation does the same, though it
# raises AssertionError rather than calling a method. (We
# catch this error and wrap it in a ParserRejectedMarkup.)
raise ParserRejectedMarkup(message)
In any event, this method is called only on very strange
markup and our best strategy is to pretend it didn't happen
and keep going.
"""
warnings.warn(msg)
def handle_startendtag(self, name, attrs): def handle_startendtag(self, name, attrs):
"""Handle an incoming empty-element tag. """Handle an incoming empty-element tag.
@ -203,9 +186,10 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
:param name: Character number, possibly in hexadecimal. :param name: Character number, possibly in hexadecimal.
""" """
# XXX workaround for a bug in HTMLParser. Remove this once # TODO: This was originally a workaround for a bug in
# it's fixed in all supported versions. # HTMLParser. (http://bugs.python.org/issue13633) The bug has
# http://bugs.python.org/issue13633 # been fixed, but removing this code still makes some
# Beautiful Soup tests fail. This needs investigation.
if name.startswith('x'): if name.startswith('x'):
real_name = int(name.lstrip('x'), 16) real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'): elif name.startswith('X'):
@ -333,10 +317,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser_args = parser_args or [] parser_args = parser_args or []
parser_kwargs = parser_kwargs or {} parser_kwargs = parser_kwargs or {}
parser_kwargs.update(extra_parser_kwargs) parser_kwargs.update(extra_parser_kwargs)
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: parser_kwargs['convert_charrefs'] = False
parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs) self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None, def prepare_markup(self, markup, user_specified_encoding=None,
@ -397,103 +378,10 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup parser.soup = self.soup
try: try:
parser.feed(markup) parser.feed(markup)
parser.close() except AssertionError as e:
except HTMLParseError as e: # html.parser raises AssertionError in rare cases to
warnings.warn(RuntimeWarning( # indicate a fatal problem with the markup, especially
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) # when there's an error in the doctype declaration.
raise e raise ParserRejectedMarkup(e)
parser.close()
parser.already_closed_empty_element = [] parser.already_closed_empty_element = []
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
import re
attrfind_tolerant = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest:
attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True

280
lib/bs4/css.py Normal file
View file

@ -0,0 +1,280 @@
"""Integration code for CSS selectors using Soup Sieve (pypi: soupsieve)."""
import warnings
try:
    import soupsieve
except ImportError:
    # soupsieve is optional. Record its absence so that CSS() can fail
    # with a clear message when a selector is actually used.
    soupsieve = None
    warnings.warn(
        'The soupsieve package is not installed. CSS selectors cannot be used.'
    )


class CSS(object):
    """A proxy object against the soupsieve library, to simplify its
    CSS selector API.

    Acquire this object through the .css attribute on the
    BeautifulSoup object, or on the Tag you want to use as the
    starting point for a CSS selector.

    The main advantage of doing this is that the tag to be selected
    against doesn't need to be explicitly specified in the function
    calls, since it's already scoped to a tag.
    """

    def __init__(self, tag, api=soupsieve):
        """Constructor.

        You don't need to instantiate this class yourself; instead,
        access the .css attribute on the BeautifulSoup object, or on
        the Tag you want to use as the starting point for your CSS
        selector.

        :param tag: All CSS selectors will use this as their starting
         point.

        :param api: A plug-in replacement for the soupsieve module,
         designed mainly for use in tests.

        :raise NotImplementedError: If `api` is None, which is the
         default when the soupsieve package could not be imported.
        """
        if api is None:
            raise NotImplementedError(
                "Cannot execute CSS selectors because the soupsieve package is not installed."
            )
        self.api = api
        self.tag = tag

    def escape(self, ident):
        """Escape a CSS identifier.

        This is a simple wrapper around soupsieve.escape(). See the
        documentation for that function for more information.

        :param ident: The identifier to escape.

        :return: Whatever the `escape` function of the configured API
         returns for `ident`.

        :raise NotImplementedError: If this object has no API to
         delegate to.
        """
        # Check the API this object was actually configured with, not
        # the module-level import: a replacement API passed to the
        # constructor must keep working when soupsieve is absent.
        if self.api is None:
            raise NotImplementedError(
                "Cannot escape CSS identifiers because the soupsieve package is not installed."
            )
        return self.api.escape(ident)

    def _ns(self, ns, select):
        """Normalize a dictionary of namespaces.

        :param ns: The namespace dictionary supplied by the caller,
         or None.

        :param select: The selector the namespaces will be used with.

        :return: `ns` if it was provided or if `select` is a
         precompiled pattern; otherwise the namespaces stored on the
         tag.
        """
        # A precompiled pattern already has a namespace context
        # compiled in, which cannot be replaced, so the tag's
        # namespaces are only substituted for plain selectors.
        if not isinstance(select, self.api.SoupSieve) and ns is None:
            ns = self.tag._namespaces
        return ns

    def _rs(self, results):
        """Normalize a list of results to a ResultSet.

        A ResultSet is more consistent with the rest of Beautiful
        Soup's API, and ResultSet.__getattr__ has a helpful error
        message if you try to treat a list of results as a single
        result (a common mistake).

        :param results: The results to wrap.

        :return: A ResultSet containing `results`.
        """
        # Import here to avoid circular import
        from .element import ResultSet
        return ResultSet(None, results)

    def compile(self, select, namespaces=None, flags=0, **kwargs):
        """Pre-compile a selector and return the compiled object.

        :param select: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
           used in the CSS selector to namespace URIs. By default,
           Beautiful Soup will use the prefixes it encountered while
           parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.compile() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
           soupsieve.compile() method.

        :return: A precompiled selector object.
        :rtype: soupsieve.SoupSieve
        """
        return self.api.compile(
            select, self._ns(namespaces, select), flags, **kwargs
        )

    def select_one(self, select, namespaces=None, flags=0, **kwargs):
        """Perform a CSS selection operation on the current Tag and return the
        first result.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.select_one()
        method.

        :param select: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
           used in the CSS selector to namespace URIs. By default,
           Beautiful Soup will use the prefixes it encountered while
           parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.select_one() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
           soupsieve.select_one() method.

        :return: A Tag, or None if the selector has no match.
        :rtype: bs4.element.Tag
        """
        return self.api.select_one(
            select, self.tag, self._ns(namespaces, select), flags, **kwargs
        )

    def select(self, select, namespaces=None, limit=0, flags=0, **kwargs):
        """Perform a CSS selection operation on the current Tag.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.select()
        method.

        :param select: A string containing a CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param limit: After finding this number of results, stop looking.
            None is treated the same as 0.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.select() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
            soupsieve.select() method.

        :return: A ResultSet of Tag objects.
        :rtype: bs4.element.ResultSet
        """
        if limit is None:
            limit = 0

        return self._rs(
            self.api.select(
                select, self.tag, self._ns(namespaces, select), limit, flags,
                **kwargs
            )
        )

    def iselect(self, select, namespaces=None, limit=0, flags=0, **kwargs):
        """Perform a CSS selection operation on the current Tag.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.iselect()
        method. It is the same as select(), but it returns a generator
        instead of a list.

        :param select: A string containing a CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param limit: After finding this number of results, stop looking.
            None is treated the same as 0.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.iselect() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
            soupsieve.iselect() method.

        :return: A generator
        :rtype: types.GeneratorType
        """
        # Accept limit=None exactly as select() does, so the two
        # methods can be called interchangeably.
        if limit is None:
            limit = 0

        return self.api.iselect(
            select, self.tag, self._ns(namespaces, select), limit, flags, **kwargs
        )

    def closest(self, select, namespaces=None, flags=0, **kwargs):
        """Find the Tag closest to this one that matches the given selector.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.closest()
        method.

        :param select: A string containing a CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.closest() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
            soupsieve.closest() method.

        :return: A Tag, or None if there is no match.
        :rtype: bs4.Tag
        """
        return self.api.closest(
            select, self.tag, self._ns(namespaces, select), flags, **kwargs
        )

    def match(self, select, namespaces=None, flags=0, **kwargs):
        """Check whether this Tag matches the given CSS selector.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.match()
        method.

        :param select: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.match() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
            soupsieve.match() method.

        :return: True if this Tag matches the selector; False otherwise.
        :rtype: bool
        """
        return self.api.match(
            select, self.tag, self._ns(namespaces, select), flags, **kwargs
        )

    def filter(self, select, namespaces=None, flags=0, **kwargs):
        """Filter this Tag's direct children based on the given CSS selector.

        This uses the Soup Sieve library. It works the same way as
        passing this Tag into that library's soupsieve.filter()
        method. For more information, see the documentation for
        soupsieve.filter().

        :param select: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.filter() method.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
            soupsieve.filter() method.

        :return: A ResultSet of Tag objects.
        :rtype: bs4.element.ResultSet
        """
        return self._rs(
            self.api.filter(
                select, self.tag, self._ns(namespaces, select), flags, **kwargs
            )
        )

View file

@ -59,21 +59,6 @@ def diagnose(data):
if hasattr(data, 'read'): if hasattr(data, 'read'):
data = data.read() data = data.read()
elif data.startswith("http:") or data.startswith("https:"):
print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
else:
try:
if os.path.exists(data):
print(('"%s" looks like a filename. Reading data from the file.' % data))
with open(data) as fp:
data = fp.read()
except ValueError:
# This can happen on some platforms when the 'filename' is
# too long. Assume it's data and not a filename.
pass
print("")
for parser in basic_parsers: for parser in basic_parsers:
print(("Trying to parse your markup with %s" % parser)) print(("Trying to parse your markup with %s" % parser))

View file

@ -8,14 +8,8 @@ except ImportError as e:
import re import re
import sys import sys
import warnings import warnings
try:
import soupsieve
except ImportError as e:
soupsieve = None
warnings.warn(
'The soupsieve package is not installed. CSS selectors cannot be used.'
)
from .css import CSS
from .formatter import ( from .formatter import (
Formatter, Formatter,
HTMLFormatter, HTMLFormatter,
@ -69,13 +63,13 @@ PYTHON_SPECIFIC_ENCODINGS = set([
"string-escape", "string-escape",
"string_escape", "string_escape",
]) ])
class NamespacedAttribute(str): class NamespacedAttribute(str):
"""A namespaced string (e.g. 'xml:lang') that remembers the namespace """A namespaced string (e.g. 'xml:lang') that remembers the namespace
('xml') and the name ('lang') that were used to create it. ('xml') and the name ('lang') that were used to create it.
""" """
def __new__(cls, prefix, name=None, namespace=None): def __new__(cls, prefix, name=None, namespace=None):
if not name: if not name:
# This is the default namespace. Its name "has no value" # This is the default namespace. Its name "has no value"
@ -146,14 +140,19 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value) return self.CHARSET_RE.sub(rewrite, self.original_value)
class PageElement(object): class PageElement(object):
"""Contains the navigational information for some part of the page: """Contains the navigational information for some part of the page:
that is, its current location in the parse tree. that is, its current location in the parse tree.
NavigableString, Tag, etc. are all subclasses of PageElement. NavigableString, Tag, etc. are all subclasses of PageElement.
""" """
# In general, we can't tell just by looking at an element whether
# it's contained in an XML document or an HTML document. But for
# Tags (q.v.) we can store this information at parse time.
known_xml = None
def setup(self, parent=None, previous_element=None, next_element=None, def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None): previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and """Sets up the initial relations between this element and
@ -163,7 +162,7 @@ class PageElement(object):
:param previous_element: The element parsed immediately before :param previous_element: The element parsed immediately before
this one. this one.
:param next_element: The element parsed immediately before :param next_element: The element parsed immediately before
this one. this one.
@ -257,11 +256,11 @@ class PageElement(object):
default = object() default = object()
def _all_strings(self, strip=False, types=default): def _all_strings(self, strip=False, types=default):
"""Yield all strings of certain classes, possibly stripping them. """Yield all strings of certain classes, possibly stripping them.
This is implemented differently in Tag and NavigableString. This is implemented differently in Tag and NavigableString.
""" """
raise NotImplementedError() raise NotImplementedError()
@property @property
def stripped_strings(self): def stripped_strings(self):
"""Yield all strings in this PageElement, stripping them first. """Yield all strings in this PageElement, stripping them first.
@ -294,11 +293,11 @@ class PageElement(object):
strip, types=types)]) strip, types=types)])
getText = get_text getText = get_text
text = property(get_text) text = property(get_text)
def replace_with(self, *args): def replace_with(self, *args):
"""Replace this PageElement with one or more PageElements, keeping the """Replace this PageElement with one or more PageElements, keeping the
rest of the tree the same. rest of the tree the same.
:param args: One or more PageElements. :param args: One or more PageElements.
:return: `self`, no longer part of the tree. :return: `self`, no longer part of the tree.
""" """
@ -410,7 +409,7 @@ class PageElement(object):
This works the same way as `list.insert`. This works the same way as `list.insert`.
:param position: The numeric position that should be occupied :param position: The numeric position that should be occupied
in `self.children` by the new PageElement. in `self.children` by the new PageElement.
:param new_child: A PageElement. :param new_child: A PageElement.
""" """
if new_child is None: if new_child is None:
@ -496,13 +495,16 @@ class PageElement(object):
def extend(self, tags): def extend(self, tags):
"""Appends the given PageElements to this one's contents. """Appends the given PageElements to this one's contents.
:param tags: A list of PageElements. :param tags: A list of PageElements. If a single Tag is
provided instead, this PageElement's contents will be extended
with that Tag's contents.
""" """
if isinstance(tags, Tag): if isinstance(tags, Tag):
# Calling self.append() on another tag's contents will change tags = tags.contents
# the list we're iterating over. Make a list that won't if isinstance(tags, list):
# change. # Moving items around the tree may change their position in
tags = list(tags.contents) # the original list. Make a list that won't change.
tags = list(tags)
for tag in tags: for tag in tags:
self.append(tag) self.append(tag)
@ -543,7 +545,7 @@ class PageElement(object):
"Element has no parent, so 'after' has no meaning.") "Element has no parent, so 'after' has no meaning.")
if any(x is self for x in args): if any(x is self for x in args):
raise ValueError("Can't insert an element after itself.") raise ValueError("Can't insert an element after itself.")
offset = 0 offset = 0
for successor in args: for successor in args:
# Extract first so that the index won't be screwed up if they # Extract first so that the index won't be screwed up if they
@ -586,8 +588,9 @@ class PageElement(object):
:kwargs: A dictionary of filters on attribute values. :kwargs: A dictionary of filters on attribute values.
:return: A ResultSet containing PageElements. :return: A ResultSet containing PageElements.
""" """
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, string, limit, self.next_elements, return self._find_all(name, attrs, string, limit, self.next_elements,
**kwargs) _stacklevel=_stacklevel+1, **kwargs)
findAllNext = find_all_next # BS3 findAllNext = find_all_next # BS3
def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs): def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
@ -624,8 +627,11 @@ class PageElement(object):
:return: A ResultSet of PageElements. :return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet :rtype: bs4.element.ResultSet
""" """
return self._find_all(name, attrs, string, limit, _stacklevel = kwargs.pop('_stacklevel', 2)
self.next_siblings, **kwargs) return self._find_all(
name, attrs, string, limit,
self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
)
findNextSiblings = find_next_siblings # BS3 findNextSiblings = find_next_siblings # BS3
fetchNextSiblings = find_next_siblings # BS2 fetchNextSiblings = find_next_siblings # BS2
@ -663,8 +669,11 @@ class PageElement(object):
:return: A ResultSet of PageElements. :return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet :rtype: bs4.element.ResultSet
""" """
return self._find_all(name, attrs, string, limit, self.previous_elements, _stacklevel = kwargs.pop('_stacklevel', 2)
**kwargs) return self._find_all(
name, attrs, string, limit, self.previous_elements,
_stacklevel=_stacklevel+1, **kwargs
)
findAllPrevious = find_all_previous # BS3 findAllPrevious = find_all_previous # BS3
fetchPrevious = find_all_previous # BS2 fetchPrevious = find_all_previous # BS2
@ -702,8 +711,11 @@ class PageElement(object):
:return: A ResultSet of PageElements. :return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet :rtype: bs4.element.ResultSet
""" """
return self._find_all(name, attrs, string, limit, _stacklevel = kwargs.pop('_stacklevel', 2)
self.previous_siblings, **kwargs) return self._find_all(
name, attrs, string, limit,
self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
)
findPreviousSiblings = find_previous_siblings # BS3 findPreviousSiblings = find_previous_siblings # BS3
fetchPreviousSiblings = find_previous_siblings # BS2 fetchPreviousSiblings = find_previous_siblings # BS2
@ -724,7 +736,7 @@ class PageElement(object):
# NOTE: We can't use _find_one because findParents takes a different # NOTE: We can't use _find_one because findParents takes a different
# set of arguments. # set of arguments.
r = None r = None
l = self.find_parents(name, attrs, 1, **kwargs) l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
if l: if l:
r = l[0] r = l[0]
return r return r
@ -744,8 +756,9 @@ class PageElement(object):
:return: A PageElement. :return: A PageElement.
:rtype: bs4.element.Tag | bs4.element.NavigableString :rtype: bs4.element.Tag | bs4.element.NavigableString
""" """
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, None, limit, self.parents, return self._find_all(name, attrs, None, limit, self.parents,
**kwargs) _stacklevel=_stacklevel+1, **kwargs)
findParents = find_parents # BS3 findParents = find_parents # BS3
fetchParents = find_parents # BS2 fetchParents = find_parents # BS2
@ -771,19 +784,20 @@ class PageElement(object):
def _find_one(self, method, name, attrs, string, **kwargs): def _find_one(self, method, name, attrs, string, **kwargs):
r = None r = None
l = method(name, attrs, string, 1, **kwargs) l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
if l: if l:
r = l[0] r = l[0]
return r return r
def _find_all(self, name, attrs, string, limit, generator, **kwargs): def _find_all(self, name, attrs, string, limit, generator, **kwargs):
"Iterates over a generator looking for things that match." "Iterates over a generator looking for things that match."
_stacklevel = kwargs.pop('_stacklevel', 3)
if string is None and 'text' in kwargs: if string is None and 'text' in kwargs:
string = kwargs.pop('text') string = kwargs.pop('text')
warnings.warn( warnings.warn(
"The 'text' argument to find()-type methods is deprecated. Use 'string' instead.", "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
DeprecationWarning DeprecationWarning, stacklevel=_stacklevel
) )
if isinstance(name, SoupStrainer): if isinstance(name, SoupStrainer):
@ -897,7 +911,7 @@ class PageElement(object):
:rtype: bool :rtype: bool
""" """
return getattr(self, '_decomposed', False) or False return getattr(self, '_decomposed', False) or False
# Old non-property versions of the generators, for backwards # Old non-property versions of the generators, for backwards
# compatibility with BS3. # compatibility with BS3.
def nextGenerator(self): def nextGenerator(self):
@ -921,16 +935,11 @@ class NavigableString(str, PageElement):
When Beautiful Soup parses the markup <b>penguin</b>, it will When Beautiful Soup parses the markup <b>penguin</b>, it will
create a NavigableString for the string "penguin". create a NavigableString for the string "penguin".
""" """
PREFIX = '' PREFIX = ''
SUFFIX = '' SUFFIX = ''
# We can't tell just by looking at a string whether it's contained
# in an XML document or an HTML document.
known_xml = None
def __new__(cls, value): def __new__(cls, value):
"""Create a new NavigableString. """Create a new NavigableString.
@ -946,12 +955,22 @@ class NavigableString(str, PageElement):
u.setup() u.setup()
return u return u
def __copy__(self): def __deepcopy__(self, memo, recursive=False):
"""A copy of a NavigableString has the same contents and class """A copy of a NavigableString has the same contents and class
as the original, but it is not connected to the parse tree. as the original, but it is not connected to the parse tree.
:param recursive: This parameter is ignored; it's only defined
so that NavigableString.__deepcopy__ implements the same
signature as Tag.__deepcopy__.
""" """
return type(self)(self) return type(self)(self)
def __copy__(self):
"""A copy of a NavigableString can only be a deep copy, because
only one PageElement can occupy a given place in a parse tree.
"""
return self.__deepcopy__({})
def __getnewargs__(self): def __getnewargs__(self):
return (str(self),) return (str(self),)
@ -1044,10 +1063,10 @@ class PreformattedString(NavigableString):
as comments (the Comment class) and CDATA blocks (the CData as comments (the Comment class) and CDATA blocks (the CData
class). class).
""" """
PREFIX = '' PREFIX = ''
SUFFIX = '' SUFFIX = ''
def output_ready(self, formatter=None): def output_ready(self, formatter=None):
"""Make this string ready for output by adding any subclass-specific """Make this string ready for output by adding any subclass-specific
prefix or suffix. prefix or suffix.
@ -1129,7 +1148,7 @@ class Stylesheet(NavigableString):
""" """
pass pass
class Script(NavigableString): class Script(NavigableString):
"""A NavigableString representing an executable script (probably """A NavigableString representing an executable script (probably
Javascript). Javascript).
@ -1235,7 +1254,7 @@ class Tag(PageElement):
if ((not builder or builder.store_line_numbers) if ((not builder or builder.store_line_numbers)
and (sourceline is not None or sourcepos is not None)): and (sourceline is not None or sourcepos is not None)):
self.sourceline = sourceline self.sourceline = sourceline
self.sourcepos = sourcepos self.sourcepos = sourcepos
if attrs is None: if attrs is None:
attrs = {} attrs = {}
elif attrs: elif attrs:
@ -1293,25 +1312,60 @@ class Tag(PageElement):
self.interesting_string_types = builder.string_containers[self.name] self.interesting_string_types = builder.string_containers[self.name]
else: else:
self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
parserClass = _alias("parser_class") # BS3 parserClass = _alias("parser_class") # BS3
def __copy__(self): def __deepcopy__(self, memo, recursive=True):
"""A copy of a Tag is a new Tag, unconnected to the parse tree. """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
Its contents are a copy of the old Tag's contents. Its contents are a copy of the old Tag's contents.
""" """
clone = self._clone()
if recursive:
# Clone this tag's descendants recursively, but without
# making any recursive function calls.
tag_stack = [clone]
for event, element in self._event_stream(self.descendants):
if event is Tag.END_ELEMENT_EVENT:
# Stop appending incoming Tags to the Tag that was
# just closed.
tag_stack.pop()
else:
descendant_clone = element.__deepcopy__(
memo, recursive=False
)
# Add to its parent's .contents
tag_stack[-1].append(descendant_clone)
if event is Tag.START_ELEMENT_EVENT:
# Add the Tag itself to the stack so that its
# children will be .appended to it.
tag_stack.append(descendant_clone)
return clone
def __copy__(self):
"""A copy of a Tag must always be a deep copy, because a Tag's
children can only have one parent at a time.
"""
return self.__deepcopy__({})
def _clone(self):
"""Create a new Tag just like this one, but with no
contents and unattached to any parse tree.
This is the first step in the deepcopy process.
"""
clone = type(self)( clone = type(self)(
None, self.builder, self.name, self.namespace, None, self.builder, self.name, self.namespace,
self.prefix, self.attrs, is_xml=self._is_xml, self.prefix, self.attrs, is_xml=self._is_xml,
sourceline=self.sourceline, sourcepos=self.sourcepos, sourceline=self.sourceline, sourcepos=self.sourcepos,
can_be_empty_element=self.can_be_empty_element, can_be_empty_element=self.can_be_empty_element,
cdata_list_attributes=self.cdata_list_attributes, cdata_list_attributes=self.cdata_list_attributes,
preserve_whitespace_tags=self.preserve_whitespace_tags preserve_whitespace_tags=self.preserve_whitespace_tags,
interesting_string_types=self.interesting_string_types
) )
for attr in ('can_be_empty_element', 'hidden'): for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr)) setattr(clone, attr, getattr(self, attr))
for child in self.contents:
clone.append(child.__copy__())
return clone return clone
@property @property
@ -1417,7 +1471,7 @@ class Tag(PageElement):
i.contents = [] i.contents = []
i._decomposed = True i._decomposed = True
i = n i = n
def clear(self, decompose=False): def clear(self, decompose=False):
"""Wipe out all children of this PageElement by calling extract() """Wipe out all children of this PageElement by calling extract()
on them. on them.
@ -1505,7 +1559,7 @@ class Tag(PageElement):
if not isinstance(value, list): if not isinstance(value, list):
value = [value] value = [value]
return value return value
def has_attr(self, key): def has_attr(self, key):
"""Does this PageElement have an attribute with the given name?""" """Does this PageElement have an attribute with the given name?"""
return key in self.attrs return key in self.attrs
@ -1558,7 +1612,7 @@ class Tag(PageElement):
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
name=tag_name name=tag_name
), ),
DeprecationWarning DeprecationWarning, stacklevel=2
) )
return self.find(tag_name) return self.find(tag_name)
# We special case contents to avoid recursion. # We special case contents to avoid recursion.
@ -1592,7 +1646,7 @@ class Tag(PageElement):
def __repr__(self, encoding="unicode-escape"): def __repr__(self, encoding="unicode-escape"):
"""Renders this PageElement as a string. """Renders this PageElement as a string.
:param encoding: The encoding to use (Python 2 only). :param encoding: The encoding to use (Python 2 only).
TODO: This is now ignored and a warning should be issued TODO: This is now ignored and a warning should be issued
if a value is provided. if a value is provided.
:return: A (Unicode) string. :return: A (Unicode) string.
@ -1634,106 +1688,212 @@ class Tag(PageElement):
def decode(self, indent_level=None, def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING, eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"): formatter="minimal",
"""Render a Unicode representation of this PageElement and its iterator=None):
contents. pieces = []
:param indent_level: Each line of the rendering will be
indented this many spaces. Used internally in
recursive calls while pretty-printing.
:param eventual_encoding: The tag is destined to be
encoded into this encoding. This method is _not_
responsible for performing that encoding. This information
is passed in so that it can be substituted in if the
document contains a <META> tag that mentions the document's
encoding.
:param formatter: A Formatter object, or a string naming one of
the standard formatters.
"""
# First off, turn a non-Formatter `formatter` into a Formatter # First off, turn a non-Formatter `formatter` into a Formatter
# object. This will stop the lookup from happening over and # object. This will stop the lookup from happening over and
# over again. # over again.
if not isinstance(formatter, Formatter): if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter) formatter = self.formatter_for_name(formatter)
attributes = formatter.attributes(self)
attrs = [] if indent_level is True:
for key, val in attributes: indent_level = 0
if val is None:
decoded = key # The currently active tag that put us into string literal
# mode. Until this element is closed, children will be treated
# as string literals and not pretty-printed. String literal
# mode is turned on immediately after this tag begins, and
# turned off immediately before it's closed. This means there
# will be whitespace before and after the tag itself.
string_literal_tag = None
for event, element in self._event_stream(iterator):
if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
piece = element._format_tag(
eventual_encoding, formatter, opening=True
)
elif event is Tag.END_ELEMENT_EVENT:
piece = element._format_tag(
eventual_encoding, formatter, opening=False
)
if indent_level is not None:
indent_level -= 1
else: else:
if isinstance(val, list) or isinstance(val, tuple): piece = element.output_ready(formatter)
val = ' '.join(val)
elif not isinstance(val, str):
val = str(val)
elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None
):
val = val.encode(eventual_encoding)
text = formatter.attribute_value(val) # Now we need to apply the 'prettiness' -- extra
decoded = ( # whitespace before and/or after this tag. This can get
str(key) + '=' # complicated because certain tags, like <pre> and
+ formatter.quoted_attribute_value(text)) # <script>, can't be prettified, since adding whitespace would
attrs.append(decoded) # change the meaning of the content.
close = ''
closeTag = ''
# The default behavior is to add whitespace before and
# after an element when string literal mode is off, and to
# leave things as they are when string literal mode is on.
if string_literal_tag:
indent_before = indent_after = False
else:
indent_before = indent_after = True
# The only time the behavior is more complex than that is
# when we encounter an opening or closing tag that might
# put us into or out of string literal mode.
if (event is Tag.START_ELEMENT_EVENT
and not string_literal_tag
and not element._should_pretty_print()):
# We are about to enter string literal mode. Add
# whitespace before this tag, but not after. We
# will stay in string literal mode until this tag
# is closed.
indent_before = True
indent_after = False
string_literal_tag = element
elif (event is Tag.END_ELEMENT_EVENT
and element is string_literal_tag):
# We are about to exit string literal mode by closing
# the tag that sent us into that mode. Add whitespace
# after this tag, but not before.
indent_before = False
indent_after = True
string_literal_tag = None
# Now we know whether to add whitespace before and/or
# after this element.
if indent_level is not None:
if (indent_before or indent_after):
if isinstance(element, NavigableString):
piece = piece.strip()
if piece:
piece = self._indent_string(
piece, indent_level, formatter,
indent_before, indent_after
)
if event == Tag.START_ELEMENT_EVENT:
indent_level += 1
pieces.append(piece)
return "".join(pieces)
# Names for the different events yielded by _event_stream
START_ELEMENT_EVENT = object()
END_ELEMENT_EVENT = object()
EMPTY_ELEMENT_EVENT = object()
STRING_ELEMENT_EVENT = object()
def _event_stream(self, iterator=None):
"""Yield a sequence of events that can be used to reconstruct the DOM
for this element.
This lets us recreate the nested structure of this element
(e.g. when formatting it as a string) without using recursive
method calls.
This is similar in concept to the SAX API, but it's a simpler
interface designed for internal use. The events are different
from SAX and the arguments associated with the events are Tags
and other Beautiful Soup objects.
:param iterator: An alternate iterator to use when traversing
the tree.
"""
tag_stack = []
iterator = iterator or self.self_and_descendants
for c in iterator:
# If the parent of the element we're about to yield is not
# the tag currently on the stack, it means that the tag on
# the stack closed before this element appeared.
while tag_stack and c.parent != tag_stack[-1]:
now_closed_tag = tag_stack.pop()
yield Tag.END_ELEMENT_EVENT, now_closed_tag
if isinstance(c, Tag):
if c.is_empty_element:
yield Tag.EMPTY_ELEMENT_EVENT, c
else:
yield Tag.START_ELEMENT_EVENT, c
tag_stack.append(c)
continue
else:
yield Tag.STRING_ELEMENT_EVENT, c
while tag_stack:
now_closed_tag = tag_stack.pop()
yield Tag.END_ELEMENT_EVENT, now_closed_tag
def _indent_string(self, s, indent_level, formatter,
indent_before, indent_after):
"""Add indentation whitespace before and/or after a string.
:param s: The string to amend with whitespace.
:param indent_level: The indentation level; affects how much
whitespace goes before the string.
:param indent_before: Whether or not to add whitespace
before the string.
:param indent_after: Whether or not to add whitespace
(a newline) after the string.
"""
space_before = ''
if indent_before and indent_level:
space_before = (formatter.indent * indent_level)
space_after = ''
if indent_after:
space_after = "\n"
return space_before + s + space_after
def _format_tag(self, eventual_encoding, formatter, opening):
# A tag starts with the < character (see below).
# Then the / character, if this is a closing tag.
closing_slash = ''
if not opening:
closing_slash = '/'
# Then an optional namespace prefix.
prefix = '' prefix = ''
if self.prefix: if self.prefix:
prefix = self.prefix + ":" prefix = self.prefix + ":"
if self.is_empty_element: # Then a list of attribute values, if this is an opening tag.
close = formatter.void_element_close_prefix or '' attribute_string = ''
else: if opening:
closeTag = '</%s%s>' % (prefix, self.name) attributes = formatter.attributes(self)
attrs = []
for key, val in attributes:
if val is None:
decoded = key
else:
if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val)
elif not isinstance(val, str):
val = str(val)
elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None
):
val = val.encode(eventual_encoding)
pretty_print = self._should_pretty_print(indent_level) text = formatter.attribute_value(val)
space = '' decoded = (
indent_space = '' str(key) + '='
if indent_level is not None: + formatter.quoted_attribute_value(text))
indent_space = (formatter.indent * (indent_level - 1)) attrs.append(decoded)
if pretty_print:
space = indent_space
indent_contents = indent_level + 1
else:
indent_contents = None
contents = self.decode_contents(
indent_contents, eventual_encoding, formatter
)
if self.hidden:
# This is the 'document root' object.
s = contents
else:
s = []
attribute_string = ''
if attrs: if attrs:
attribute_string = ' ' + ' '.join(attrs) attribute_string = ' ' + ' '.join(attrs)
if indent_level is not None:
# Even if this particular tag is not pretty-printed,
# we should indent up to the start of the tag.
s.append(indent_space)
s.append('<%s%s%s%s>' % (
prefix, self.name, attribute_string, close))
if pretty_print:
s.append("\n")
s.append(contents)
if pretty_print and contents and contents[-1] != "\n":
s.append("\n")
if pretty_print and closeTag:
s.append(space)
s.append(closeTag)
if indent_level is not None and closeTag and self.next_sibling:
# Even if this particular tag is not pretty-printed,
# we're now done with the tag, and we should add a
# newline if appropriate.
s.append("\n")
s = ''.join(s)
return s
def _should_pretty_print(self, indent_level): # Then an optional closing slash (for a void element in an
# XML document).
void_element_closing_slash = ''
if self.is_empty_element:
void_element_closing_slash = formatter.void_element_close_prefix or ''
# Put it all together.
return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
def _should_pretty_print(self, indent_level=1):
"""Should this tag be pretty-printed? """Should this tag be pretty-printed?
Most of them should, but some (such as <pre> in HTML Most of them should, but some (such as <pre> in HTML
@ -1754,7 +1914,7 @@ class Tag(PageElement):
a Unicode string will be returned. a Unicode string will be returned.
:param formatter: A Formatter object, or a string naming one of :param formatter: A Formatter object, or a string naming one of
the standard formatters. the standard formatters.
:return: A Unicode string (if encoding==None) or a bytestring :return: A Unicode string (if encoding==None) or a bytestring
(otherwise). (otherwise).
""" """
if encoding is None: if encoding is None:
@ -1784,33 +1944,9 @@ class Tag(PageElement):
the standard Formatters. the standard Formatters.
""" """
# First off, turn a string formatter into a Formatter object. This return self.decode(indent_level, eventual_encoding, formatter,
# will stop the lookup from happening over and over again. iterator=self.descendants)
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
pretty_print = (indent_level is not None)
s = []
for c in self:
text = None
if isinstance(c, NavigableString):
text = c.output_ready(formatter)
elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding,
formatter))
preserve_whitespace = (
self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
)
if text and indent_level and not preserve_whitespace:
text = text.strip()
if text:
if pretty_print and not preserve_whitespace:
s.append(formatter.indent * (indent_level - 1))
s.append(text)
if pretty_print and not preserve_whitespace:
s.append("\n")
return ''.join(s)
def encode_contents( def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"): formatter="minimal"):
@ -1862,7 +1998,8 @@ class Tag(PageElement):
:rtype: bs4.element.Tag | bs4.element.NavigableString :rtype: bs4.element.Tag | bs4.element.NavigableString
""" """
r = None r = None
l = self.find_all(name, attrs, recursive, string, 1, **kwargs) l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
**kwargs)
if l: if l:
r = l[0] r = l[0]
return r return r
@ -1889,7 +2026,9 @@ class Tag(PageElement):
generator = self.descendants generator = self.descendants
if not recursive: if not recursive:
generator = self.children generator = self.children
return self._find_all(name, attrs, string, limit, generator, **kwargs) _stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, string, limit, generator,
_stacklevel=_stacklevel+1, **kwargs)
findAll = find_all # BS3 findAll = find_all # BS3
findChildren = find_all # BS2 findChildren = find_all # BS2
@ -1903,6 +2042,18 @@ class Tag(PageElement):
# return iter() to make the purpose of the method clear # return iter() to make the purpose of the method clear
return iter(self.contents) # XXX This seems to be untested. return iter(self.contents) # XXX This seems to be untested.
@property
def self_and_descendants(self):
"""Iterate over this PageElement and its children in a
breadth-first sequence.
:yield: A sequence of PageElements.
"""
if not self.hidden:
yield self
for i in self.descendants:
yield i
@property @property
def descendants(self): def descendants(self):
"""Iterate over all children of this PageElement in a """Iterate over all children of this PageElement in a
@ -1929,16 +2080,13 @@ class Tag(PageElement):
Beautiful Soup will use the prefixes it encountered while Beautiful Soup will use the prefixes it encountered while
parsing the document. parsing the document.
:param kwargs: Keyword arguments to be passed into SoupSieve's :param kwargs: Keyword arguments to be passed into Soup Sieve's
soupsieve.select() method. soupsieve.select() method.
:return: A Tag. :return: A Tag.
:rtype: bs4.element.Tag :rtype: bs4.element.Tag
""" """
value = self.select(selector, namespaces, 1, **kwargs) return self.css.select_one(selector, namespaces, **kwargs)
if value:
return value[0]
return None
def select(self, selector, namespaces=None, limit=None, **kwargs): def select(self, selector, namespaces=None, limit=None, **kwargs):
"""Perform a CSS selection operation on the current element. """Perform a CSS selection operation on the current element.
@ -1954,27 +2102,18 @@ class Tag(PageElement):
:param limit: After finding this number of results, stop looking. :param limit: After finding this number of results, stop looking.
:param kwargs: Keyword arguments to be passed into SoupSieve's :param kwargs: Keyword arguments to be passed into SoupSieve's
soupsieve.select() method. soupsieve.select() method.
:return: A ResultSet of Tags. :return: A ResultSet of Tags.
:rtype: bs4.element.ResultSet :rtype: bs4.element.ResultSet
""" """
if namespaces is None: return self.css.select(selector, namespaces, limit, **kwargs)
namespaces = self._namespaces
if limit is None:
limit = 0
if soupsieve is None:
raise NotImplementedError(
"Cannot execute CSS selectors because the soupsieve package is not installed."
)
results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
# We do this because it's more consistent and because @property
# ResultSet.__getattr__ has a helpful error message. def css(self):
return ResultSet(None, results) """Return an interface to the CSS selector API."""
return CSS(self)
# Old names for backwards compatibility # Old names for backwards compatibility
def childGenerator(self): def childGenerator(self):
@ -1993,7 +2132,7 @@ class Tag(PageElement):
""" """
warnings.warn( warnings.warn(
'has_key is deprecated. Use has_attr(key) instead.', 'has_key is deprecated. Use has_attr(key) instead.',
DeprecationWarning DeprecationWarning, stacklevel=2
) )
return self.has_attr(key) return self.has_attr(key)
@ -2019,12 +2158,12 @@ class SoupStrainer(object):
:param attrs: A dictionary of filters on attribute values. :param attrs: A dictionary of filters on attribute values.
:param string: A filter for a NavigableString with specific text. :param string: A filter for a NavigableString with specific text.
:kwargs: A dictionary of filters on attribute values. :kwargs: A dictionary of filters on attribute values.
""" """
if string is None and 'text' in kwargs: if string is None and 'text' in kwargs:
string = kwargs.pop('text') string = kwargs.pop('text')
warnings.warn( warnings.warn(
"The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.", "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
DeprecationWarning DeprecationWarning, stacklevel=2
) )
self.name = self._normalize_search_value(name) self.name = self._normalize_search_value(name)
@ -2118,7 +2257,7 @@ class SoupStrainer(object):
# looking at a tag with a different name. # looking at a tag with a different name.
if markup and not markup.prefix and self.name != markup.name: if markup and not markup.prefix and self.name != markup.name:
return False return False
call_function_with_tag_data = ( call_function_with_tag_data = (
isinstance(self.name, Callable) isinstance(self.name, Callable)
and not isinstance(markup_name, Tag)) and not isinstance(markup_name, Tag))
@ -2204,7 +2343,7 @@ class SoupStrainer(object):
if self._matches(' '.join(markup), match_against): if self._matches(' '.join(markup), match_against):
return True return True
return False return False
if match_against is True: if match_against is True:
# True matches any non-None value. # True matches any non-None value.
return markup is not None return markup is not None
@ -2248,11 +2387,11 @@ class SoupStrainer(object):
return True return True
else: else:
return False return False
# Beyond this point we might need to run the test twice: once against # Beyond this point we might need to run the test twice: once against
# the tag's name and once against its prefixed name. # the tag's name and once against its prefixed name.
match = False match = False
if not match and isinstance(match_against, str): if not match and isinstance(match_against, str):
# Exact string match # Exact string match
match = markup == match_against match = markup == match_against

View file

@ -97,7 +97,7 @@ class Formatter(EntitySubstitution):
else: else:
indent = ' ' indent = ' '
self.indent = indent self.indent = indent
def substitute(self, ns): def substitute(self, ns):
"""Process a string that needs to undergo entity substitution. """Process a string that needs to undergo entity substitution.
This may be a string encountered in an attribute value or as This may be a string encountered in an attribute value or as
@ -149,14 +149,14 @@ class HTMLFormatter(Formatter):
"""A generic Formatter for HTML.""" """A generic Formatter for HTML."""
REGISTRY = {} REGISTRY = {}
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs) super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter): class XMLFormatter(Formatter):
"""A generic Formatter for XML.""" """A generic Formatter for XML."""
REGISTRY = {} REGISTRY = {}
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs) super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
# Set up aliases for the default formatters. # Set up aliases for the default formatters.

View file

@ -32,7 +32,7 @@ from . import css_match as cm
from . import css_types as ct from . import css_types as ct
from .util import DEBUG, SelectorSyntaxError # noqa: F401 from .util import DEBUG, SelectorSyntaxError # noqa: F401
import bs4 # type: ignore[import] import bs4 # type: ignore[import]
from typing import Optional, Any, Iterator, Iterable from typing import Any, Iterator, Iterable
__all__ = ( __all__ = (
'DEBUG', 'SelectorSyntaxError', 'SoupSieve', 'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
@ -45,17 +45,14 @@ SoupSieve = cm.SoupSieve
def compile( # noqa: A001 def compile( # noqa: A001
pattern: str, pattern: str,
namespaces: Optional[dict[str, str]] = None, namespaces: dict[str, str] | None = None,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[dict[str, str]] = None, custom: dict[str, str] | None = None,
**kwargs: Any **kwargs: Any
) -> cm.SoupSieve: ) -> cm.SoupSieve:
"""Compile CSS pattern.""" """Compile CSS pattern."""
ns = ct.Namespaces(namespaces) if namespaces is not None else namespaces # type: Optional[ct.Namespaces]
cs = ct.CustomSelectors(custom) if custom is not None else custom # type: Optional[ct.CustomSelectors]
if isinstance(pattern, SoupSieve): if isinstance(pattern, SoupSieve):
if flags: if flags:
raise ValueError("Cannot process 'flags' argument on a compiled selector list") raise ValueError("Cannot process 'flags' argument on a compiled selector list")
@ -65,7 +62,12 @@ def compile( # noqa: A001
raise ValueError("Cannot process 'custom' argument on a compiled selector list") raise ValueError("Cannot process 'custom' argument on a compiled selector list")
return pattern return pattern
return cp._cached_css_compile(pattern, ns, cs, flags) return cp._cached_css_compile(
pattern,
ct.Namespaces(namespaces) if namespaces is not None else namespaces,
ct.CustomSelectors(custom) if custom is not None else custom,
flags
)
def purge() -> None: def purge() -> None:
@ -77,10 +79,10 @@ def purge() -> None:
def closest( def closest(
select: str, select: str,
tag: 'bs4.Tag', tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None, namespaces: dict[str, str] | None = None,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[dict[str, str]] = None, custom: dict[str, str] | None = None,
**kwargs: Any **kwargs: Any
) -> 'bs4.Tag': ) -> 'bs4.Tag':
"""Match closest ancestor.""" """Match closest ancestor."""
@ -91,10 +93,10 @@ def closest(
def match( def match(
select: str, select: str,
tag: 'bs4.Tag', tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None, namespaces: dict[str, str] | None = None,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[dict[str, str]] = None, custom: dict[str, str] | None = None,
**kwargs: Any **kwargs: Any
) -> bool: ) -> bool:
"""Match node.""" """Match node."""
@ -105,10 +107,10 @@ def match(
def filter( # noqa: A001 def filter( # noqa: A001
select: str, select: str,
iterable: Iterable['bs4.Tag'], iterable: Iterable['bs4.Tag'],
namespaces: Optional[dict[str, str]] = None, namespaces: dict[str, str] | None = None,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[dict[str, str]] = None, custom: dict[str, str] | None = None,
**kwargs: Any **kwargs: Any
) -> list['bs4.Tag']: ) -> list['bs4.Tag']:
"""Filter list of nodes.""" """Filter list of nodes."""
@ -119,10 +121,10 @@ def filter( # noqa: A001
def select_one( def select_one(
select: str, select: str,
tag: 'bs4.Tag', tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None, namespaces: dict[str, str] | None = None,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[dict[str, str]] = None, custom: dict[str, str] | None = None,
**kwargs: Any **kwargs: Any
) -> 'bs4.Tag': ) -> 'bs4.Tag':
"""Select a single tag.""" """Select a single tag."""
@ -133,11 +135,11 @@ def select_one(
def select( def select(
select: str, select: str,
tag: 'bs4.Tag', tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None, namespaces: dict[str, str] | None = None,
limit: int = 0, limit: int = 0,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[dict[str, str]] = None, custom: dict[str, str] | None = None,
**kwargs: Any **kwargs: Any
) -> list['bs4.Tag']: ) -> list['bs4.Tag']:
"""Select the specified tags.""" """Select the specified tags."""
@ -148,11 +150,11 @@ def select(
def iselect( def iselect(
select: str, select: str,
tag: 'bs4.Tag', tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None, namespaces: dict[str, str] | None = None,
limit: int = 0, limit: int = 0,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[dict[str, str]] = None, custom: dict[str, str] | None = None,
**kwargs: Any **kwargs: Any
) -> Iterator['bs4.Tag']: ) -> Iterator['bs4.Tag']:
"""Iterate the specified tags.""" """Iterate the specified tags."""

View file

@ -193,5 +193,5 @@ def parse_version(ver: str) -> Version:
return Version(major, minor, micro, release, pre, post, dev) return Version(major, minor, micro, release, pre, post, dev)
__version_info__ = Version(2, 3, 2, "final", post=1) __version_info__ = Version(2, 4, 1, "final")
__version__ = __version_info__._get_canonical() __version__ = __version_info__._get_canonical()

View file

@ -6,7 +6,7 @@ import re
from . import css_types as ct from . import css_types as ct
import unicodedata import unicodedata
import bs4 # type: ignore[import] import bs4 # type: ignore[import]
from typing import Iterator, Iterable, Any, Optional, Callable, Sequence, cast # noqa: F401 from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401
# Empty tag pattern (whitespace okay) # Empty tag pattern (whitespace okay)
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
@ -171,7 +171,7 @@ class _DocumentNav:
def get_children( def get_children(
self, self,
el: bs4.Tag, el: bs4.Tag,
start: Optional[int] = None, start: int | None = None,
reverse: bool = False, reverse: bool = False,
tags: bool = True, tags: bool = True,
no_iframe: bool = False no_iframe: bool = False
@ -239,22 +239,22 @@ class _DocumentNav:
return parent return parent
@staticmethod @staticmethod
def get_tag_name(el: bs4.Tag) -> Optional[str]: def get_tag_name(el: bs4.Tag) -> str | None:
"""Get tag.""" """Get tag."""
return cast(Optional[str], el.name) return cast('str | None', el.name)
@staticmethod @staticmethod
def get_prefix_name(el: bs4.Tag) -> Optional[str]: def get_prefix_name(el: bs4.Tag) -> str | None:
"""Get prefix.""" """Get prefix."""
return cast(Optional[str], el.prefix) return cast('str | None', el.prefix)
@staticmethod @staticmethod
def get_uri(el: bs4.Tag) -> Optional[str]: def get_uri(el: bs4.Tag) -> str | None:
"""Get namespace `URI`.""" """Get namespace `URI`."""
return cast(Optional[str], el.namespace) return cast('str | None', el.namespace)
@classmethod @classmethod
def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement: def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
@ -287,7 +287,7 @@ class _DocumentNav:
return bool(ns and ns == NS_XHTML) return bool(ns and ns == NS_XHTML)
@staticmethod @staticmethod
def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[Optional[str], Optional[str]]: def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]:
"""Return namespace and attribute name without the prefix.""" """Return namespace and attribute name without the prefix."""
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None) return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
@ -330,8 +330,8 @@ class _DocumentNav:
cls, cls,
el: bs4.Tag, el: bs4.Tag,
name: str, name: str,
default: Optional[str | Sequence[str]] = None default: str | Sequence[str] | None = None
) -> Optional[str | Sequence[str]]: ) -> str | Sequence[str] | None:
"""Get attribute by name.""" """Get attribute by name."""
value = default value = default
@ -348,7 +348,7 @@ class _DocumentNav:
return value return value
@classmethod @classmethod
def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, Optional[str | Sequence[str]]]]: def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]:
"""Iterate attributes.""" """Iterate attributes."""
for k, v in el.attrs.items(): for k, v in el.attrs.items():
@ -424,10 +424,10 @@ class Inputs:
return 0 <= minutes <= 59 return 0 <= minutes <= 59
@classmethod @classmethod
def parse_value(cls, itype: str, value: Optional[str]) -> Optional[tuple[float, ...]]: def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
"""Parse the input value.""" """Parse the input value."""
parsed = None # type: Optional[tuple[float, ...]] parsed = None # type: tuple[float, ...] | None
if value is None: if value is None:
return value return value
if itype == "date": if itype == "date":
@ -486,7 +486,7 @@ class CSSMatch(_DocumentNav):
self, self,
selectors: ct.SelectorList, selectors: ct.SelectorList,
scope: bs4.Tag, scope: bs4.Tag,
namespaces: Optional[ct.Namespaces], namespaces: ct.Namespaces | None,
flags: int flags: int
) -> None: ) -> None:
"""Initialize.""" """Initialize."""
@ -545,19 +545,19 @@ class CSSMatch(_DocumentNav):
return self.get_tag_ns(el) == NS_XHTML return self.get_tag_ns(el) == NS_XHTML
def get_tag(self, el: bs4.Tag) -> Optional[str]: def get_tag(self, el: bs4.Tag) -> str | None:
"""Get tag.""" """Get tag."""
name = self.get_tag_name(el) name = self.get_tag_name(el)
return util.lower(name) if name is not None and not self.is_xml else name return util.lower(name) if name is not None and not self.is_xml else name
def get_prefix(self, el: bs4.Tag) -> Optional[str]: def get_prefix(self, el: bs4.Tag) -> str | None:
"""Get prefix.""" """Get prefix."""
prefix = self.get_prefix_name(el) prefix = self.get_prefix_name(el)
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
def find_bidi(self, el: bs4.Tag) -> Optional[int]: def find_bidi(self, el: bs4.Tag) -> int | None:
"""Get directionality from element text.""" """Get directionality from element text."""
for node in self.get_children(el, tags=False): for node in self.get_children(el, tags=False):
@ -653,8 +653,8 @@ class CSSMatch(_DocumentNav):
self, self,
el: bs4.Tag, el: bs4.Tag,
attr: str, attr: str,
prefix: Optional[str] prefix: str | None
) -> Optional[str | Sequence[str]]: ) -> str | Sequence[str] | None:
"""Match attribute name and return value if it exists.""" """Match attribute name and return value if it exists."""
value = None value = None
@ -751,7 +751,7 @@ class CSSMatch(_DocumentNav):
name not in (self.get_tag(el), '*') name not in (self.get_tag(el), '*')
) )
def match_tag(self, el: bs4.Tag, tag: Optional[ct.SelectorTag]) -> bool: def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
"""Match the tag.""" """Match the tag."""
match = True match = True
@ -1030,7 +1030,7 @@ class CSSMatch(_DocumentNav):
"""Match element if it contains text.""" """Match element if it contains text."""
match = True match = True
content = None # type: Optional[str | Sequence[str]] content = None # type: str | Sequence[str] | None
for contain_list in contains: for contain_list in contains:
if content is None: if content is None:
if contain_list.own: if contain_list.own:
@ -1099,7 +1099,7 @@ class CSSMatch(_DocumentNav):
match = False match = False
name = cast(str, self.get_attribute_by_name(el, 'name')) name = cast(str, self.get_attribute_by_name(el, 'name'))
def get_parent_form(el: bs4.Tag) -> Optional[bs4.Tag]: def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
"""Find this input's form.""" """Find this input's form."""
form = None form = None
parent = self.get_parent(el, no_iframe=True) parent = self.get_parent(el, no_iframe=True)
@ -1478,7 +1478,7 @@ class CSSMatch(_DocumentNav):
if lim < 1: if lim < 1:
break break
def closest(self) -> Optional[bs4.Tag]: def closest(self) -> bs4.Tag | None:
"""Match closest ancestor.""" """Match closest ancestor."""
current = self.tag current = self.tag
@ -1506,7 +1506,7 @@ class SoupSieve(ct.Immutable):
pattern: str pattern: str
selectors: ct.SelectorList selectors: ct.SelectorList
namespaces: Optional[ct.Namespaces] namespaces: ct.Namespaces | None
custom: dict[str, str] custom: dict[str, str]
flags: int flags: int
@ -1516,8 +1516,8 @@ class SoupSieve(ct.Immutable):
self, self,
pattern: str, pattern: str,
selectors: ct.SelectorList, selectors: ct.SelectorList,
namespaces: Optional[ct.Namespaces], namespaces: ct.Namespaces | None,
custom: Optional[ct.CustomSelectors], custom: ct.CustomSelectors | None,
flags: int flags: int
): ):
"""Initialize.""" """Initialize."""

View file

@ -7,7 +7,7 @@ from . import css_match as cm
from . import css_types as ct from . import css_types as ct
from .util import SelectorSyntaxError from .util import SelectorSyntaxError
import warnings import warnings
from typing import Optional, Match, Any, Iterator, cast from typing import Match, Any, Iterator, cast
UNICODE_REPLACEMENT_CHAR = 0xFFFD UNICODE_REPLACEMENT_CHAR = 0xFFFD
@ -113,7 +113,7 @@ VALUE = r'''
'''.format(nl=NEWLINE, ident=IDENTIFIER) '''.format(nl=NEWLINE, ident=IDENTIFIER)
# Attribute value comparison. `!=` is handled special as it is non-standard. # Attribute value comparison. `!=` is handled special as it is non-standard.
ATTR = r''' ATTR = r'''
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\] (?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}*(?P<case>[is]))?)?{ws}*\]
'''.format(ws=WSC, value=VALUE) '''.format(ws=WSC, value=VALUE)
# Selector patterns # Selector patterns
@ -207,8 +207,8 @@ _MAXCACHE = 500
@lru_cache(maxsize=_MAXCACHE) @lru_cache(maxsize=_MAXCACHE)
def _cached_css_compile( def _cached_css_compile(
pattern: str, pattern: str,
namespaces: Optional[ct.Namespaces], namespaces: ct.Namespaces | None,
custom: Optional[ct.CustomSelectors], custom: ct.CustomSelectors | None,
flags: int flags: int
) -> cm.SoupSieve: ) -> cm.SoupSieve:
"""Cached CSS compile.""" """Cached CSS compile."""
@ -233,7 +233,7 @@ def _purge_cache() -> None:
_cached_css_compile.cache_clear() _cached_css_compile.cache_clear()
def process_custom(custom: Optional[ct.CustomSelectors]) -> dict[str, str | ct.SelectorList]: def process_custom(custom: ct.CustomSelectors | None) -> dict[str, str | ct.SelectorList]:
"""Process custom.""" """Process custom."""
custom_selectors = {} custom_selectors = {}
@ -317,7 +317,7 @@ class SelectorPattern:
return self.name return self.name
def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]: def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
"""Match the selector.""" """Match the selector."""
return self.re_pattern.match(selector, index) return self.re_pattern.match(selector, index)
@ -336,7 +336,7 @@ class SpecialPseudoPattern(SelectorPattern):
for pseudo in p[1]: for pseudo in p[1]:
self.patterns[pseudo] = pattern self.patterns[pseudo] = pattern
self.matched_name = None # type: Optional[SelectorPattern] self.matched_name = None # type: SelectorPattern | None
self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U) self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)
def get_name(self) -> str: def get_name(self) -> str:
@ -344,7 +344,7 @@ class SpecialPseudoPattern(SelectorPattern):
return '' if self.matched_name is None else self.matched_name.get_name() return '' if self.matched_name is None else self.matched_name.get_name()
def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]: def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
"""Match the selector.""" """Match the selector."""
pseudo = None pseudo = None
@ -372,14 +372,14 @@ class _Selector:
def __init__(self, **kwargs: Any) -> None: def __init__(self, **kwargs: Any) -> None:
"""Initialize.""" """Initialize."""
self.tag = kwargs.get('tag', None) # type: Optional[ct.SelectorTag] self.tag = kwargs.get('tag', None) # type: ct.SelectorTag | None
self.ids = kwargs.get('ids', []) # type: list[str] self.ids = kwargs.get('ids', []) # type: list[str]
self.classes = kwargs.get('classes', []) # type: list[str] self.classes = kwargs.get('classes', []) # type: list[str]
self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute] self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute]
self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth] self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth]
self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList] self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList]
self.relations = kwargs.get('relations', []) # type: list[_Selector] self.relations = kwargs.get('relations', []) # type: list[_Selector]
self.rel_type = kwargs.get('rel_type', None) # type: Optional[str] self.rel_type = kwargs.get('rel_type', None) # type: str | None
self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains] self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains]
self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang] self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang]
self.flags = kwargs.get('flags', 0) # type: int self.flags = kwargs.get('flags', 0) # type: int
@ -462,7 +462,7 @@ class CSSParser:
def __init__( def __init__(
self, self,
selector: str, selector: str,
custom: Optional[dict[str, str | ct.SelectorList]] = None, custom: dict[str, str | ct.SelectorList] | None = None,
flags: int = 0 flags: int = 0
) -> None: ) -> None:
"""Initialize.""" """Initialize."""
@ -723,7 +723,7 @@ class CSSParser:
if postfix == '_child': if postfix == '_child':
if m.group('of'): if m.group('of'):
# Parse the rest of `of S`. # Parse the rest of `of S`.
nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN | FLG_FORGIVE) nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
else: else:
# Use default `*|*` for `of S`. # Use default `*|*` for `of S`.
nth_sel = CSS_NTH_OF_S_DEFAULT nth_sel = CSS_NTH_OF_S_DEFAULT
@ -753,7 +753,7 @@ class CSSParser:
if name == ':not': if name == ':not':
flags |= FLG_NOT flags |= FLG_NOT
elif name == ':has': elif name == ':has':
flags |= FLG_RELATIVE | FLG_FORGIVE flags |= FLG_RELATIVE
elif name in (':where', ':is'): elif name in (':where', ':is'):
flags |= FLG_FORGIVE flags |= FLG_FORGIVE
@ -777,11 +777,6 @@ class CSSParser:
if not combinator: if not combinator:
combinator = WS_COMBINATOR combinator = WS_COMBINATOR
if combinator == COMMA_COMBINATOR: if combinator == COMMA_COMBINATOR:
if not has_selector:
# If we've not captured any selector parts, the comma is either at the beginning of the pattern
# or following another comma, both of which are unexpected. But shouldn't fail the pseudo-class.
sel.no_match = True
sel.rel_type = rel_type sel.rel_type = rel_type
selectors[-1].relations.append(sel) selectors[-1].relations.append(sel)
rel_type = ":" + WS_COMBINATOR rel_type = ":" + WS_COMBINATOR
@ -1070,22 +1065,12 @@ class CSSParser:
selectors.append(sel) selectors.append(sel)
# Forgive empty slots in pseudo-classes that have lists (and are forgiving) # Forgive empty slots in pseudo-classes that have lists (and are forgiving)
elif is_forgive: elif is_forgive and (not selectors or not relations):
if is_relative: # Handle normal pseudo-classes with empty slots like `:is()` etc.
# Handle relative selectors pseudo-classes with empty slots like `:has()` sel.no_match = True
if selectors and selectors[-1].rel_type is None and rel_type == ': ': del relations[:]
sel.rel_type = rel_type selectors.append(sel)
sel.no_match = True has_selector = True
selectors[-1].relations.append(sel)
has_selector = True
else:
# Handle normal pseudo-classes with empty slots
if not selectors or not relations:
# Others like `:is()` etc.
sel.no_match = True
del relations[:]
selectors.append(sel)
has_selector = True
if not has_selector: if not has_selector:
# We will always need to finish a selector when `:has()` is used as it leads with combining. # We will always need to finish a selector when `:has()` is used as it leads with combining.

View file

@ -2,7 +2,7 @@
from __future__ import annotations from __future__ import annotations
import copyreg import copyreg
from .pretty import pretty from .pretty import pretty
from typing import Any, Iterator, Hashable, Optional, Pattern, Iterable, Mapping from typing import Any, Iterator, Hashable, Pattern, Iterable, Mapping
__all__ = ( __all__ = (
'Selector', 'Selector',
@ -189,28 +189,28 @@ class Selector(Immutable):
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash' 'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
) )
tag: Optional[SelectorTag] tag: SelectorTag | None
ids: tuple[str, ...] ids: tuple[str, ...]
classes: tuple[str, ...] classes: tuple[str, ...]
attributes: tuple[SelectorAttribute, ...] attributes: tuple[SelectorAttribute, ...]
nth: tuple[SelectorNth, ...] nth: tuple[SelectorNth, ...]
selectors: tuple[SelectorList, ...] selectors: tuple[SelectorList, ...]
relation: SelectorList relation: SelectorList
rel_type: Optional[str] rel_type: str | None
contains: tuple[SelectorContains, ...] contains: tuple[SelectorContains, ...]
lang: tuple[SelectorLang, ...] lang: tuple[SelectorLang, ...]
flags: int flags: int
def __init__( def __init__(
self, self,
tag: Optional[SelectorTag], tag: SelectorTag | None,
ids: tuple[str, ...], ids: tuple[str, ...],
classes: tuple[str, ...], classes: tuple[str, ...],
attributes: tuple[SelectorAttribute, ...], attributes: tuple[SelectorAttribute, ...],
nth: tuple[SelectorNth, ...], nth: tuple[SelectorNth, ...],
selectors: tuple[SelectorList, ...], selectors: tuple[SelectorList, ...],
relation: SelectorList, relation: SelectorList,
rel_type: Optional[str], rel_type: str | None,
contains: tuple[SelectorContains, ...], contains: tuple[SelectorContains, ...],
lang: tuple[SelectorLang, ...], lang: tuple[SelectorLang, ...],
flags: int flags: int
@ -247,9 +247,9 @@ class SelectorTag(Immutable):
__slots__ = ("name", "prefix", "_hash") __slots__ = ("name", "prefix", "_hash")
name: str name: str
prefix: Optional[str] prefix: str | None
def __init__(self, name: str, prefix: Optional[str]) -> None: def __init__(self, name: str, prefix: str | None) -> None:
"""Initialize.""" """Initialize."""
super().__init__(name=name, prefix=prefix) super().__init__(name=name, prefix=prefix)
@ -262,15 +262,15 @@ class SelectorAttribute(Immutable):
attribute: str attribute: str
prefix: str prefix: str
pattern: Optional[Pattern[str]] pattern: Pattern[str] | None
xml_type_pattern: Optional[Pattern[str]] xml_type_pattern: Pattern[str] | None
def __init__( def __init__(
self, self,
attribute: str, attribute: str,
prefix: str, prefix: str,
pattern: Optional[Pattern[str]], pattern: Pattern[str] | None,
xml_type_pattern: Optional[Pattern[str]] xml_type_pattern: Pattern[str] | None
) -> None: ) -> None:
"""Initialize.""" """Initialize."""
@ -360,7 +360,7 @@ class SelectorList(Immutable):
def __init__( def __init__(
self, self,
selectors: Optional[Iterable[Selector | SelectorNull]] = None, selectors: Iterable[Selector | SelectorNull] | None = None,
is_not: bool = False, is_not: bool = False,
is_html: bool = False is_html: bool = False
) -> None: ) -> None:

View file

@ -3,7 +3,7 @@ from __future__ import annotations
from functools import wraps, lru_cache from functools import wraps, lru_cache
import warnings import warnings
import re import re
from typing import Callable, Any, Optional from typing import Callable, Any
DEBUG = 0x00001 DEBUG = 0x00001
@ -27,7 +27,7 @@ def lower(string: str) -> str:
class SelectorSyntaxError(Exception): class SelectorSyntaxError(Exception):
"""Syntax error in a CSS selector.""" """Syntax error in a CSS selector."""
def __init__(self, msg: str, pattern: Optional[str] = None, index: Optional[int] = None) -> None: def __init__(self, msg: str, pattern: str | None = None, index: int | None = None) -> None:
"""Initialize.""" """Initialize."""
self.line = None self.line = None
@ -84,7 +84,7 @@ def get_pattern_context(pattern: str, index: int) -> tuple[str, int, int]:
col = 1 col = 1
text = [] # type: list[str] text = [] # type: list[str]
line = 1 line = 1
offset = None # type: Optional[int] offset = None # type: int | None
# Split pattern by newline and handle the text before the newline # Split pattern by newline and handle the text before the newline
for m in RE_PATTERN_LINE_SPLIT.finditer(pattern): for m in RE_PATTERN_LINE_SPLIT.finditer(pattern):