Merge branch 'feature/UpdateBSoup' into dev

JackDandy 2023-05-28 14:38:31 +01:00
commit b4c3af19b8
14 changed files with 794 additions and 475 deletions

View file

@ -1,4 +1,10 @@
### 3.29.2 (2023-05-28 07:45:00 UTC)
### 3.30.0 (2023-0x-xx xx:xx:00 UTC)
* Update Beautiful Soup 4.11.1 (r642) to 4.12.2
* Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb)
### 3.29.2 (2023-05-28 07:45:00 UTC)
* Fix find show results returned as newest/oldest that are then sorted z to a
* Fix add show "TheTVDB via Trakt"

View file

@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.
Beautiful Soup works with Python 3.5 and up. It works better if lxml
Beautiful Soup works with Python 3.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.11.1"
__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
__version__ = "4.12.2"
__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
@ -38,11 +38,13 @@ from .builder import (
builder_registry,
ParserRejectedMarkup,
XMLParsedAsHTMLWarning,
HTMLParserTreeBuilder
)
from .dammit import UnicodeDammit
from .element import (
CData,
Comment,
CSS,
DEFAULT_OUTPUT_ENCODING,
Declaration,
Doctype,
@ -116,7 +118,7 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
element_classes=None, **kwargs):
@ -211,7 +213,7 @@ class BeautifulSoup(Tag):
warnings.warn(
'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name),
DeprecationWarning
DeprecationWarning, stacklevel=3
)
return kwargs.pop(old_name)
return None
@ -348,25 +350,49 @@ class BeautifulSoup(Tag):
self.markup = None
self.builder.soup = None
def __copy__(self):
"""Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
copy = type(self)(
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
)
def _clone(self):
"""Create a new BeautifulSoup object with the same TreeBuilder,
but not associated with any markup.
# Although we encoded the tree to UTF-8, that may not have
# been the encoding of the original markup. Set the copy's
# .original_encoding to reflect the original object's
# .original_encoding.
copy.original_encoding = self.original_encoding
return copy
This is the first step of the deepcopy process.
"""
clone = type(self)("", None, self.builder)
# Keep track of the encoding of the original document,
# since we won't be parsing it again.
clone.original_encoding = self.original_encoding
return clone
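
A minimal sketch (assuming the 4.12 behaviour shown above, with the stock html.parser) of what copying a soup now does:

import copy
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>One</p><p>Two</p>", "html.parser")
# __copy__ is routed through __deepcopy__, which starts from _clone(),
# so the copy is a detached tree rather than a freshly re-parsed document.
dupe = copy.copy(soup)
print(dupe == soup)                                       # True: same structure
print(dupe is soup, dupe.p is soup.p)                     # False False: detached
print(dupe.original_encoding == soup.original_encoding)   # True
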
def __getstate__(self):
# Frequently a tree builder can't be pickled.
d = dict(self.__dict__)
if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
d['builder'] = None
d['builder'] = type(self.builder)
# Store the contents as a Unicode string.
d['contents'] = []
d['markup'] = self.decode()
# If _most_recent_element is present, it's a Tag object left
# over from initial parse. It might not be picklable and we
# don't need it.
if '_most_recent_element' in d:
del d['_most_recent_element']
return d
def __setstate__(self, state):
# If necessary, restore the TreeBuilder by looking it up.
self.__dict__ = state
if isinstance(self.builder, type):
self.builder = self.builder()
elif not self.builder:
# We don't know which builder was used to build this
# parse tree, so use a default we know is always available.
self.builder = HTMLParserTreeBuilder()
self.builder.soup = self
self.reset()
self._feed()
return state
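
Together with the reworked __getstate__/__setstate__, a soup now survives a pickle round-trip even when its tree builder is not picklable; a short sketch using html.parser:

import pickle
from bs4 import BeautifulSoup

soup = BeautifulSoup("<a href='x.html'>link</a>", "html.parser")
# The builder is stored as its class and the tree as a markup string;
# unpickling looks the builder up again and re-parses the markup.
restored = pickle.loads(pickle.dumps(soup))
print(restored.a["href"])   # x.html
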
@classmethod
def _decode_markup(cls, markup):
@ -405,7 +431,8 @@ class BeautifulSoup(Tag):
'The input looks more like a URL than markup. You may want to use'
' an HTTP client like requests to get the document behind'
' the URL, and feed that document to Beautiful Soup.',
MarkupResemblesLocatorWarning
MarkupResemblesLocatorWarning,
stacklevel=3
)
return True
return False
@ -436,7 +463,7 @@ class BeautifulSoup(Tag):
'The input looks more like a filename than markup. You may'
' want to open this file and pass the filehandle into'
' Beautiful Soup.',
MarkupResemblesLocatorWarning
MarkupResemblesLocatorWarning, stacklevel=3
)
return True
return False
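
The added stacklevel arguments attribute these warnings to the caller rather than to bs4 internals. A hedged sketch of triggering the URL warning:

import warnings
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    BeautifulSoup("http://example.com/page.html", "html.parser")

# Parsing still happens; the warning is just attributed (via stacklevel=3)
# to the line that called the BeautifulSoup constructor.
print(caught[0].category is MarkupResemblesLocatorWarning)   # True
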
@ -467,6 +494,7 @@ class BeautifulSoup(Tag):
self.open_tag_counter = Counter()
self.preserve_whitespace_tag_stack = []
self.string_container_stack = []
self._most_recent_element = None
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
@ -748,7 +776,7 @@ class BeautifulSoup(Tag):
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
formatter="minimal", iterator=None):
"""Returns a string or Unicode representation of the parse tree
as an HTML or XML document.
@ -775,7 +803,7 @@ class BeautifulSoup(Tag):
else:
indent_level = 0
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
indent_level, eventual_encoding, formatter, iterator)
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
_s = BeautifulSoup
@ -789,7 +817,7 @@ class BeautifulStoneSoup(BeautifulSoup):
warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.',
DeprecationWarning
DeprecationWarning, stacklevel=2
)
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)

View file

@ -70,7 +70,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# ATM because the html5lib TreeBuilder doesn't use
# UnicodeDammit.
if exclude_encodings:
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
warnings.warn(
"You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
stacklevel=3
)
# html5lib only parses HTML, so if it's given XML that's worth
# noting.
@ -81,7 +84,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# These methods are defined by Beautiful Soup.
def feed(self, markup):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
warnings.warn(
"You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
stacklevel=4
)
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
self.underlying_builder.parser = parser
extra_kwargs = dict()
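
A hedged sketch of the parse_only warning above (it assumes the optional html5lib package is installed):

import warnings
from bs4 import BeautifulSoup, SoupStrainer

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # html5lib always parses the whole document, so parse_only is ignored.
    BeautifulSoup("<p>hi</p>", "html5lib", parse_only=SoupStrainer("p"))

print(any("parse_only" in str(w.message) for w in caught))   # True
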

View file

@ -10,30 +10,9 @@ __all__ = [
from html.parser import HTMLParser
try:
from html.parser import HTMLParseError
except ImportError as e:
# HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception):
pass
import sys
import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
from ..element import (
CData,
Comment,
@ -45,6 +24,7 @@ from ..dammit import EntitySubstitution, UnicodeDammit
from ..builder import (
DetectsXMLParsedAsHTML,
ParserRejectedMarkup,
HTML,
HTMLTreeBuilder,
STRICT,
@ -90,20 +70,23 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
self.already_closed_empty_element = []
self._initialize_xml_detector()
def error(self, msg):
"""In Python 3, HTMLParser subclasses must implement error(), although
this requirement doesn't appear to be documented.
In Python 2, HTMLParser implements error() by raising an exception,
which we don't want to do.
def error(self, message):
# NOTE: This method is required so long as Python 3.9 is
# supported. The corresponding code is removed from HTMLParser
# in 3.5, but not removed from ParserBase until 3.10.
# https://github.com/python/cpython/issues/76025
#
# The original implementation turned the error into a warning,
# but in every case I discovered, this made HTMLParser
# immediately crash with an error message that was less
# helpful than the warning. The new implementation makes it
# more clear that html.parser just can't parse this
# markup. The 3.10 implementation does the same, though it
# raises AssertionError rather than calling a method. (We
# catch this error and wrap it in a ParserRejectedMarkup.)
raise ParserRejectedMarkup(message)
In any event, this method is called only on very strange
markup and our best strategy is to pretend it didn't happen
and keep going.
"""
warnings.warn(msg)
def handle_startendtag(self, name, attrs):
"""Handle an incoming empty-element tag.
@ -203,9 +186,10 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
:param name: Character number, possibly in hexadecimal.
"""
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed in all supported versions.
# http://bugs.python.org/issue13633
# TODO: This was originally a workaround for a bug in
# HTMLParser. (http://bugs.python.org/issue13633) The bug has
# been fixed, but removing this code still makes some
# Beautiful Soup tests fail. This needs investigation.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
@ -333,10 +317,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
parser_kwargs.update(extra_parser_kwargs)
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
parser_kwargs['convert_charrefs'] = False
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
@ -397,103 +378,10 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup
try:
parser.feed(markup)
parser.close()
except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
except AssertionError as e:
# html.parser raises AssertionError in rare cases to
# indicate a fatal problem with the markup, especially
# when there's an error in the doctype declaration.
raise ParserRejectedMarkup(e)
parser.close()
parser.already_closed_empty_element = []
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
import re
attrfind_tolerant = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest:
attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True
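
Because error() now raises ParserRejectedMarkup rather than issuing a warning, callers that fed html.parser questionable documents may want to catch that exception. The helper name below (parse_or_none) is made up for illustration:

from bs4 import BeautifulSoup
from bs4.builder import ParserRejectedMarkup

def parse_or_none(markup):
    try:
        return BeautifulSoup(markup, "html.parser")
    except ParserRejectedMarkup:
        # Raised only for markup html.parser fundamentally cannot handle
        # (e.g. certain badly malformed doctype declarations).
        return None
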

lib/bs4/css.py (new file, 280 additions)
View file

@ -0,0 +1,280 @@
"""Integration code for CSS selectors using Soup Sieve (pypi: soupsieve)."""
import warnings
try:
import soupsieve
except ImportError as e:
soupsieve = None
warnings.warn(
'The soupsieve package is not installed. CSS selectors cannot be used.'
)
class CSS(object):
"""A proxy object against the soupsieve library, to simplify its
CSS selector API.
Acquire this object through the .css attribute on the
BeautifulSoup object, or on the Tag you want to use as the
starting point for a CSS selector.
The main advantage of doing this is that the tag to be selected
against doesn't need to be explicitly specified in the function
calls, since it's already scoped to a tag.
"""
def __init__(self, tag, api=soupsieve):
"""Constructor.
You don't need to instantiate this class yourself; instead,
access the .css attribute on the BeautifulSoup object, or on
the Tag you want to use as the starting point for your CSS
selector.
:param tag: All CSS selectors will use this as their starting
point.
:param api: A plug-in replacement for the soupsieve module,
designed mainly for use in tests.
"""
if api is None:
raise NotImplementedError(
"Cannot execute CSS selectors because the soupsieve package is not installed."
)
self.api = api
self.tag = tag
def escape(self, ident):
"""Escape a CSS identifier.
This is a simple wrapper around soupsieve.escape(). See the
documentation for that function for more information.
"""
if soupsieve is None:
raise NotImplementedError(
"Cannot escape CSS identifiers because the soupsieve package is not installed."
)
return self.api.escape(ident)
def _ns(self, ns, select):
"""Normalize a dictionary of namespaces."""
if not isinstance(select, self.api.SoupSieve) and ns is None:
# If the selector is a precompiled pattern, it already has
# a namespace context compiled in, which cannot be
# replaced.
ns = self.tag._namespaces
return ns
def _rs(self, results):
"""Normalize a list of results to a Resultset.
A ResultSet is more consistent with the rest of Beautiful
Soup's API, and ResultSet.__getattr__ has a helpful error
message if you try to treat a list of results as a single
result (a common mistake).
"""
# Import here to avoid circular import
from .element import ResultSet
return ResultSet(None, results)
def compile(self, select, namespaces=None, flags=0, **kwargs):
"""Pre-compile a selector and return the compiled object.
:param selector: A CSS selector.
:param namespaces: A dictionary mapping namespace prefixes
used in the CSS selector to namespace URIs. By default,
Beautiful Soup will use the prefixes it encountered while
parsing the document.
:param flags: Flags to be passed into Soup Sieve's
soupsieve.compile() method.
:param kwargs: Keyword arguments to be passed into SoupSieve's
soupsieve.compile() method.
:return: A precompiled selector object.
:rtype: soupsieve.SoupSieve
"""
return self.api.compile(
select, self._ns(namespaces, select), flags, **kwargs
)
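
A short sketch of reusing a pre-compiled selector (assumes the bundled soupsieve is available):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><a id='x'>a</a><a>b</a></div>", "html.parser")
links = soup.css.compile("div > a")     # a soupsieve.SoupSieve object
print(links.select(soup))               # reuse without re-parsing the selector
print(links.match(soup.a))              # True
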
def select_one(self, select, namespaces=None, flags=0, **kwargs):
"""Perform a CSS selection operation on the current Tag and return the
first result.
This uses the Soup Sieve library. For more information, see
that library's documentation for the soupsieve.select_one()
method.
:param selector: A CSS selector.
:param namespaces: A dictionary mapping namespace prefixes
used in the CSS selector to namespace URIs. By default,
Beautiful Soup will use the prefixes it encountered while
parsing the document.
:param flags: Flags to be passed into Soup Sieve's
soupsieve.select_one() method.
:param kwargs: Keyword arguments to be passed into SoupSieve's
soupsieve.select_one() method.
:return: A Tag, or None if the selector has no match.
:rtype: bs4.element.Tag
"""
return self.api.select_one(
select, self.tag, self._ns(namespaces, select), flags, **kwargs
)
def select(self, select, namespaces=None, limit=0, flags=0, **kwargs):
"""Perform a CSS selection operation on the current Tag.
This uses the Soup Sieve library. For more information, see
that library's documentation for the soupsieve.select()
method.
:param selector: A string containing a CSS selector.
:param namespaces: A dictionary mapping namespace prefixes
used in the CSS selector to namespace URIs. By default,
Beautiful Soup will pass in the prefixes it encountered while
parsing the document.
:param limit: After finding this number of results, stop looking.
:param flags: Flags to be passed into Soup Sieve's
soupsieve.select() method.
:param kwargs: Keyword arguments to be passed into SoupSieve's
soupsieve.select() method.
:return: A ResultSet of Tag objects.
:rtype: bs4.element.ResultSet
"""
if limit is None:
limit = 0
return self._rs(
self.api.select(
select, self.tag, self._ns(namespaces, select), limit, flags,
**kwargs
)
)
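
Usage mirrors the familiar Tag.select()/select_one(), just reached through the .css attribute; for example:

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    "<ul><li class='a'>1</li><li class='a'>2</li><li>3</li></ul>",
    "html.parser")
print(soup.css.select("li.a"))             # ResultSet with both matching <li>
print(soup.css.select("li.a", limit=1))    # stop after the first match
print(soup.css.select_one("li.missing"))   # None when nothing matches
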
def iselect(self, select, namespaces=None, limit=0, flags=0, **kwargs):
"""Perform a CSS selection operation on the current Tag.
This uses the Soup Sieve library. For more information, see
that library's documentation for the soupsieve.iselect()
method. It is the same as select(), but it returns a generator
instead of a list.
:param selector: A string containing a CSS selector.
:param namespaces: A dictionary mapping namespace prefixes
used in the CSS selector to namespace URIs. By default,
Beautiful Soup will pass in the prefixes it encountered while
parsing the document.
:param limit: After finding this number of results, stop looking.
:param flags: Flags to be passed into Soup Sieve's
soupsieve.iselect() method.
:param kwargs: Keyword arguments to be passed into SoupSieve's
soupsieve.iselect() method.
:return: A generator
:rtype: types.GeneratorType
"""
return self.api.iselect(
select, self.tag, self._ns(namespaces, select), limit, flags, **kwargs
)
def closest(self, select, namespaces=None, flags=0, **kwargs):
"""Find the Tag closest to this one that matches the given selector.
This uses the Soup Sieve library. For more information, see
that library's documentation for the soupsieve.closest()
method.
:param selector: A string containing a CSS selector.
:param namespaces: A dictionary mapping namespace prefixes
used in the CSS selector to namespace URIs. By default,
Beautiful Soup will pass in the prefixes it encountered while
parsing the document.
:param flags: Flags to be passed into Soup Sieve's
soupsieve.closest() method.
:param kwargs: Keyword arguments to be passed into SoupSieve's
soupsieve.closest() method.
:return: A Tag, or None if there is no match.
:rtype: bs4.Tag
"""
return self.api.closest(
select, self.tag, self._ns(namespaces, select), flags, **kwargs
)
def match(self, select, namespaces=None, flags=0, **kwargs):
"""Check whether this Tag matches the given CSS selector.
This uses the Soup Sieve library. For more information, see
that library's documentation for the soupsieve.match()
method.
:param selector: A string containing a CSS selector.
:param namespaces: A dictionary mapping namespace prefixes
used in the CSS selector to namespace URIs. By default,
Beautiful Soup will pass in the prefixes it encountered while
parsing the document.
:param flags: Flags to be passed into Soup Sieve's
soupsieve.match() method.
:param kwargs: Keyword arguments to be passed into SoupSieve's
soupsieve.match() method.
:return: True if this Tag matches the selector; False otherwise.
:rtype: bool
"""
return self.api.match(
select, self.tag, self._ns(namespaces, select), flags, **kwargs
)
def filter(self, select, namespaces=None, flags=0, **kwargs):
"""Filter this Tag's direct children based on the given CSS selector.
This uses the Soup Sieve library. It works the same way as
passing this Tag into that library's soupsieve.filter()
method. For more information, see the
documentation for soupsieve.filter().
:param namespaces: A dictionary mapping namespace prefixes
used in the CSS selector to namespace URIs. By default,
Beautiful Soup will pass in the prefixes it encountered while
parsing the document.
:param flags: Flags to be passed into Soup Sieve's
soupsieve.filter() method.
:param kwargs: Keyword arguments to be passed into SoupSieve's
soupsieve.filter() method.
:return: A ResultSet of Tag objects.
:rtype: bs4.element.ResultSet
"""
return self._rs(
self.api.filter(
select, self.tag, self._ns(namespaces, select), flags, **kwargs
)
)
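
A combined sketch of the remaining helpers, each scoped to whichever Tag owns the .css attribute (soupsieve must be installed for any of them to work):

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    "<table><tr><td><b>x</b></td></tr></table>", "html.parser")
b = soup.css.select_one("b")
print(b.css.closest("tr"))          # nearest enclosing <tr>
print(b.css.match("td > b"))        # True
print(soup.css.filter("table"))     # only direct children are considered
print(soup.css.escape("#odd.id"))   # identifier escaped for use in a selector
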

View file

@ -59,21 +59,6 @@ def diagnose(data):
if hasattr(data, 'read'):
data = data.read()
elif data.startswith("http:") or data.startswith("https:"):
print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
else:
try:
if os.path.exists(data):
print(('"%s" looks like a filename. Reading data from the file.' % data))
with open(data) as fp:
data = fp.read()
except ValueError:
# This can happen on some platforms when the 'filename' is
# too long. Assume it's data and not a filename.
pass
print("")
for parser in basic_parsers:
print(("Trying to parse your markup with %s" % parser))

View file

@ -8,14 +8,8 @@ except ImportError as e:
import re
import sys
import warnings
try:
import soupsieve
except ImportError as e:
soupsieve = None
warnings.warn(
'The soupsieve package is not installed. CSS selectors cannot be used.'
)
from .css import CSS
from .formatter import (
Formatter,
HTMLFormatter,
@ -69,13 +63,13 @@ PYTHON_SPECIFIC_ENCODINGS = set([
"string-escape",
"string_escape",
])
class NamespacedAttribute(str):
"""A namespaced string (e.g. 'xml:lang') that remembers the namespace
('xml') and the name ('lang') that were used to create it.
"""
def __new__(cls, prefix, name=None, namespace=None):
if not name:
# This is the default namespace. Its name "has no value"
@ -146,14 +140,19 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
class PageElement(object):
"""Contains the navigational information for some part of the page:
that is, its current location in the parse tree.
NavigableString, Tag, etc. are all subclasses of PageElement.
"""
# In general, we can't tell just by looking at an element whether
# it's contained in an XML document or an HTML document. But for
# Tags (q.v.) we can store this information at parse time.
known_xml = None
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and
@ -163,7 +162,7 @@ class PageElement(object):
:param previous_element: The element parsed immediately before
this one.
:param next_element: The element parsed immediately after
this one.
@ -257,11 +256,11 @@ class PageElement(object):
default = object()
def _all_strings(self, strip=False, types=default):
"""Yield all strings of certain classes, possibly stripping them.
This is implemented differently in Tag and NavigableString.
"""
raise NotImplementedError()
@property
def stripped_strings(self):
"""Yield all strings in this PageElement, stripping them first.
@ -294,11 +293,11 @@ class PageElement(object):
strip, types=types)])
getText = get_text
text = property(get_text)
def replace_with(self, *args):
"""Replace this PageElement with one or more PageElements, keeping the
"""Replace this PageElement with one or more PageElements, keeping the
rest of the tree the same.
:param args: One or more PageElements.
:return: `self`, no longer part of the tree.
"""
@ -410,7 +409,7 @@ class PageElement(object):
This works the same way as `list.insert`.
:param position: The numeric position that should be occupied
in `self.children` by the new PageElement.
in `self.children` by the new PageElement.
:param new_child: A PageElement.
"""
if new_child is None:
@ -496,13 +495,16 @@ class PageElement(object):
def extend(self, tags):
"""Appends the given PageElements to this one's contents.
:param tags: A list of PageElements.
:param tags: A list of PageElements. If a single Tag is
provided instead, this PageElement's contents will be extended
with that Tag's contents.
"""
if isinstance(tags, Tag):
# Calling self.append() on another tag's contents will change
# the list we're iterating over. Make a list that won't
# change.
tags = list(tags.contents)
tags = tags.contents
if isinstance(tags, list):
# Moving items around the tree may change their position in
# the original list. Make a list that won't change.
tags = list(tags)
for tag in tags:
self.append(tag)
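
So extend() now also accepts a whole Tag, in which case that tag's children are moved across; a small example:

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    "<div id='a'><b>1</b></div><div id='b'><i>2</i></div>", "html.parser")
a, b = soup.find(id="a"), soup.find(id="b")
a.extend(b)        # moves b's children into a
print(a)           # <div id="a"><b>1</b><i>2</i></div>
print(b)           # <div id="b"></div>
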
@ -543,7 +545,7 @@ class PageElement(object):
"Element has no parent, so 'after' has no meaning.")
if any(x is self for x in args):
raise ValueError("Can't insert an element after itself.")
offset = 0
for successor in args:
# Extract first so that the index won't be screwed up if they
@ -586,8 +588,9 @@ class PageElement(object):
:kwargs: A dictionary of filters on attribute values.
:return: A ResultSet containing PageElements.
"""
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, string, limit, self.next_elements,
**kwargs)
_stacklevel=_stacklevel+1, **kwargs)
findAllNext = find_all_next # BS3
def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
@ -624,8 +627,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
return self._find_all(name, attrs, string, limit,
self.next_siblings, **kwargs)
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(
name, attrs, string, limit,
self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
)
findNextSiblings = find_next_siblings # BS3
fetchNextSiblings = find_next_siblings # BS2
@ -663,8 +669,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
return self._find_all(name, attrs, string, limit, self.previous_elements,
**kwargs)
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(
name, attrs, string, limit, self.previous_elements,
_stacklevel=_stacklevel+1, **kwargs
)
findAllPrevious = find_all_previous # BS3
fetchPrevious = find_all_previous # BS2
@ -702,8 +711,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
return self._find_all(name, attrs, string, limit,
self.previous_siblings, **kwargs)
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(
name, attrs, string, limit,
self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
)
findPreviousSiblings = find_previous_siblings # BS3
fetchPreviousSiblings = find_previous_siblings # BS2
@ -724,7 +736,7 @@ class PageElement(object):
# NOTE: We can't use _find_one because findParents takes a different
# set of arguments.
r = None
l = self.find_parents(name, attrs, 1, **kwargs)
l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
if l:
r = l[0]
return r
@ -744,8 +756,9 @@ class PageElement(object):
:return: A PageElement.
:rtype: bs4.element.Tag | bs4.element.NavigableString
"""
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, None, limit, self.parents,
**kwargs)
_stacklevel=_stacklevel+1, **kwargs)
findParents = find_parents # BS3
fetchParents = find_parents # BS2
@ -771,19 +784,20 @@ class PageElement(object):
def _find_one(self, method, name, attrs, string, **kwargs):
r = None
l = method(name, attrs, string, 1, **kwargs)
l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
if l:
r = l[0]
return r
def _find_all(self, name, attrs, string, limit, generator, **kwargs):
"Iterates over a generator looking for things that match."
_stacklevel = kwargs.pop('_stacklevel', 3)
if string is None and 'text' in kwargs:
string = kwargs.pop('text')
warnings.warn(
"The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
DeprecationWarning
DeprecationWarning, stacklevel=_stacklevel
)
if isinstance(name, SoupStrainer):
@ -897,7 +911,7 @@ class PageElement(object):
:rtype: bool
"""
return getattr(self, '_decomposed', False) or False
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
@ -921,16 +935,11 @@ class NavigableString(str, PageElement):
When Beautiful Soup parses the markup <b>penguin</b>, it will
create a NavigableString for the string "penguin".
"""
"""
PREFIX = ''
SUFFIX = ''
# We can't tell just by looking at a string whether it's contained
# in an XML document or an HTML document.
known_xml = None
def __new__(cls, value):
"""Create a new NavigableString.
@ -946,12 +955,22 @@ class NavigableString(str, PageElement):
u.setup()
return u
def __copy__(self):
def __deepcopy__(self, memo, recursive=False):
"""A copy of a NavigableString has the same contents and class
as the original, but it is not connected to the parse tree.
:param recursive: This parameter is ignored; it's only defined
so that NavigableString.__deepcopy__ implements the same
signature as Tag.__deepcopy__.
"""
return type(self)(self)
def __copy__(self):
"""A copy of a NavigableString can only be a deep copy, because
only one PageElement can occupy a given place in a parse tree.
"""
return self.__deepcopy__({})
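
In practice both copy forms now give a detached string node:

import copy
from bs4 import BeautifulSoup

soup = BeautifulSoup("<b>penguin</b>", "html.parser")
s = soup.b.string                 # a NavigableString
dupe = copy.copy(s)               # same as copy.deepcopy(s)
print(dupe == s, dupe.parent)     # True None
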
def __getnewargs__(self):
return (str(self),)
@ -1044,10 +1063,10 @@ class PreformattedString(NavigableString):
as comments (the Comment class) and CDATA blocks (the CData
class).
"""
PREFIX = ''
SUFFIX = ''
def output_ready(self, formatter=None):
"""Make this string ready for output by adding any subclass-specific
prefix or suffix.
@ -1129,7 +1148,7 @@ class Stylesheet(NavigableString):
"""
pass
class Script(NavigableString):
"""A NavigableString representing an executable script (probably
Javascript).
@ -1235,7 +1254,7 @@ class Tag(PageElement):
if ((not builder or builder.store_line_numbers)
and (sourceline is not None or sourcepos is not None)):
self.sourceline = sourceline
self.sourcepos = sourcepos
self.sourcepos = sourcepos
if attrs is None:
attrs = {}
elif attrs:
@ -1293,25 +1312,60 @@ class Tag(PageElement):
self.interesting_string_types = builder.string_containers[self.name]
else:
self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
parserClass = _alias("parser_class") # BS3
def __copy__(self):
"""A copy of a Tag is a new Tag, unconnected to the parse tree.
def __deepcopy__(self, memo, recursive=True):
"""A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
Its contents are a copy of the old Tag's contents.
"""
clone = self._clone()
if recursive:
# Clone this tag's descendants recursively, but without
# making any recursive function calls.
tag_stack = [clone]
for event, element in self._event_stream(self.descendants):
if event is Tag.END_ELEMENT_EVENT:
# Stop appending incoming Tags to the Tag that was
# just closed.
tag_stack.pop()
else:
descendant_clone = element.__deepcopy__(
memo, recursive=False
)
# Add to its parent's .contents
tag_stack[-1].append(descendant_clone)
if event is Tag.START_ELEMENT_EVENT:
# Add the Tag itself to the stack so that its
# children will be .appended to it.
tag_stack.append(descendant_clone)
return clone
def __copy__(self):
"""A copy of a Tag must always be a deep copy, because a Tag's
children can only have one parent at a time.
"""
return self.__deepcopy__({})
def _clone(self):
"""Create a new Tag just like this one, but with no
contents and unattached to any parse tree.
This is the first step in the deepcopy process.
"""
clone = type(self)(
None, self.builder, self.name, self.namespace,
self.prefix, self.attrs, is_xml=self._is_xml,
sourceline=self.sourceline, sourcepos=self.sourcepos,
can_be_empty_element=self.can_be_empty_element,
cdata_list_attributes=self.cdata_list_attributes,
preserve_whitespace_tags=self.preserve_whitespace_tags
preserve_whitespace_tags=self.preserve_whitespace_tags,
interesting_string_types=self.interesting_string_types
)
for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr))
for child in self.contents:
clone.append(child.__copy__())
return clone
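
A quick check of the new copy semantics for Tags:

import copy
from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><p>one</p><p>two</p></div>", "html.parser")
clone = copy.deepcopy(soup.div)   # copy.copy() behaves the same way
print(clone == soup.div)          # True: same name, attributes and contents
print(clone.parent)               # None: detached from the original tree
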
@property
@ -1417,7 +1471,7 @@ class Tag(PageElement):
i.contents = []
i._decomposed = True
i = n
def clear(self, decompose=False):
"""Wipe out all children of this PageElement by calling extract()
on them.
@ -1505,7 +1559,7 @@ class Tag(PageElement):
if not isinstance(value, list):
value = [value]
return value
def has_attr(self, key):
"""Does this PageElement have an attribute with the given name?"""
return key in self.attrs
@ -1558,7 +1612,7 @@ class Tag(PageElement):
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
name=tag_name
),
DeprecationWarning
DeprecationWarning, stacklevel=2
)
return self.find(tag_name)
# We special case contents to avoid recursion.
@ -1592,7 +1646,7 @@ class Tag(PageElement):
def __repr__(self, encoding="unicode-escape"):
"""Renders this PageElement as a string.
:param encoding: The encoding to use (Python 2 only).
:param encoding: The encoding to use (Python 2 only).
TODO: This is now ignored and a warning should be issued
if a value is provided.
:return: A (Unicode) string.
@ -1634,106 +1688,212 @@ class Tag(PageElement):
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Render a Unicode representation of this PageElement and its
contents.
:param indent_level: Each line of the rendering will be
indented this many spaces. Used internally in
recursive calls while pretty-printing.
:param eventual_encoding: The tag is destined to be
encoded into this encoding. This method is _not_
responsible for performing that encoding. This information
is passed in so that it can be substituted in if the
document contains a <META> tag that mentions the document's
encoding.
:param formatter: A Formatter object, or a string naming one of
the standard formatters.
"""
formatter="minimal",
iterator=None):
pieces = []
# First off, turn a non-Formatter `formatter` into a Formatter
# object. This will stop the lookup from happening over and
# over again.
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
attributes = formatter.attributes(self)
attrs = []
for key, val in attributes:
if val is None:
decoded = key
if indent_level is True:
indent_level = 0
# The currently active tag that put us into string literal
# mode. Until this element is closed, children will be treated
# as string literals and not pretty-printed. String literal
# mode is turned on immediately after this tag begins, and
# turned off immediately before it's closed. This means there
# will be whitespace before and after the tag itself.
string_literal_tag = None
for event, element in self._event_stream(iterator):
if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
piece = element._format_tag(
eventual_encoding, formatter, opening=True
)
elif event is Tag.END_ELEMENT_EVENT:
piece = element._format_tag(
eventual_encoding, formatter, opening=False
)
if indent_level is not None:
indent_level -= 1
else:
if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val)
elif not isinstance(val, str):
val = str(val)
elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None
):
val = val.encode(eventual_encoding)
piece = element.output_ready(formatter)
text = formatter.attribute_value(val)
decoded = (
str(key) + '='
+ formatter.quoted_attribute_value(text))
attrs.append(decoded)
close = ''
closeTag = ''
# Now we need to apply the 'prettiness' -- extra
# whitespace before and/or after this tag. This can get
# complicated because certain tags, like <pre> and
# <script>, can't be prettified, since adding whitespace would
# change the meaning of the content.
# The default behavior is to add whitespace before and
# after an element when string literal mode is off, and to
# leave things as they are when string literal mode is on.
if string_literal_tag:
indent_before = indent_after = False
else:
indent_before = indent_after = True
# The only time the behavior is more complex than that is
# when we encounter an opening or closing tag that might
# put us into or out of string literal mode.
if (event is Tag.START_ELEMENT_EVENT
and not string_literal_tag
and not element._should_pretty_print()):
# We are about to enter string literal mode. Add
# whitespace before this tag, but not after. We
# will stay in string literal mode until this tag
# is closed.
indent_before = True
indent_after = False
string_literal_tag = element
elif (event is Tag.END_ELEMENT_EVENT
and element is string_literal_tag):
# We are about to exit string literal mode by closing
# the tag that sent us into that mode. Add whitespace
# after this tag, but not before.
indent_before = False
indent_after = True
string_literal_tag = None
# Now we know whether to add whitespace before and/or
# after this element.
if indent_level is not None:
if (indent_before or indent_after):
if isinstance(element, NavigableString):
piece = piece.strip()
if piece:
piece = self._indent_string(
piece, indent_level, formatter,
indent_before, indent_after
)
if event == Tag.START_ELEMENT_EVENT:
indent_level += 1
pieces.append(piece)
return "".join(pieces)
# Names for the different events yielded by _event_stream
START_ELEMENT_EVENT = object()
END_ELEMENT_EVENT = object()
EMPTY_ELEMENT_EVENT = object()
STRING_ELEMENT_EVENT = object()
def _event_stream(self, iterator=None):
"""Yield a sequence of events that can be used to reconstruct the DOM
for this element.
This lets us recreate the nested structure of this element
(e.g. when formatting it as a string) without using recursive
method calls.
This is similar in concept to the SAX API, but it's a simpler
interface designed for internal use. The events are different
from SAX and the arguments associated with the events are Tags
and other Beautiful Soup objects.
:param iterator: An alternate iterator to use when traversing
the tree.
"""
tag_stack = []
iterator = iterator or self.self_and_descendants
for c in iterator:
# If the parent of the element we're about to yield is not
# the tag currently on the stack, it means that the tag on
# the stack closed before this element appeared.
while tag_stack and c.parent != tag_stack[-1]:
now_closed_tag = tag_stack.pop()
yield Tag.END_ELEMENT_EVENT, now_closed_tag
if isinstance(c, Tag):
if c.is_empty_element:
yield Tag.EMPTY_ELEMENT_EVENT, c
else:
yield Tag.START_ELEMENT_EVENT, c
tag_stack.append(c)
continue
else:
yield Tag.STRING_ELEMENT_EVENT, c
while tag_stack:
now_closed_tag = tag_stack.pop()
yield Tag.END_ELEMENT_EVENT, now_closed_tag
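
A hedged illustration of the event stream; _event_stream is a private helper, but the events are exactly the four defined above:

from bs4 import BeautifulSoup
from bs4.element import Tag

soup = BeautifulSoup("<div><p>hi</p><br/></div>", "html.parser")
names = {
    Tag.START_ELEMENT_EVENT: "start",
    Tag.END_ELEMENT_EVENT: "end",
    Tag.EMPTY_ELEMENT_EVENT: "empty",
    Tag.STRING_ELEMENT_EVENT: "string",
}
for event, element in soup.div._event_stream():
    label = element.name if isinstance(element, Tag) else repr(str(element))
    print(names[event], label)
# start div, start p, string 'hi', end p, empty br, end div
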
def _indent_string(self, s, indent_level, formatter,
indent_before, indent_after):
"""Add indentation whitespace before and/or after a string.
:param s: The string to amend with whitespace.
:param indent_level: The indentation level; affects how much
whitespace goes before the string.
:param indent_before: Whether or not to add whitespace
before the string.
:param indent_after: Whether or not to add whitespace
(a newline) after the string.
"""
space_before = ''
if indent_before and indent_level:
space_before = (formatter.indent * indent_level)
space_after = ''
if indent_after:
space_after = "\n"
return space_before + s + space_after
def _format_tag(self, eventual_encoding, formatter, opening):
# A tag starts with the < character (see below).
# Then the / character, if this is a closing tag.
closing_slash = ''
if not opening:
closing_slash = '/'
# Then an optional namespace prefix.
prefix = ''
if self.prefix:
prefix = self.prefix + ":"
if self.is_empty_element:
close = formatter.void_element_close_prefix or ''
else:
closeTag = '</%s%s>' % (prefix, self.name)
# Then a list of attribute values, if this is an opening tag.
attribute_string = ''
if opening:
attributes = formatter.attributes(self)
attrs = []
for key, val in attributes:
if val is None:
decoded = key
else:
if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val)
elif not isinstance(val, str):
val = str(val)
elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None
):
val = val.encode(eventual_encoding)
pretty_print = self._should_pretty_print(indent_level)
space = ''
indent_space = ''
if indent_level is not None:
indent_space = (formatter.indent * (indent_level - 1))
if pretty_print:
space = indent_space
indent_contents = indent_level + 1
else:
indent_contents = None
contents = self.decode_contents(
indent_contents, eventual_encoding, formatter
)
if self.hidden:
# This is the 'document root' object.
s = contents
else:
s = []
attribute_string = ''
text = formatter.attribute_value(val)
decoded = (
str(key) + '='
+ formatter.quoted_attribute_value(text))
attrs.append(decoded)
if attrs:
attribute_string = ' ' + ' '.join(attrs)
if indent_level is not None:
# Even if this particular tag is not pretty-printed,
# we should indent up to the start of the tag.
s.append(indent_space)
s.append('<%s%s%s%s>' % (
prefix, self.name, attribute_string, close))
if pretty_print:
s.append("\n")
s.append(contents)
if pretty_print and contents and contents[-1] != "\n":
s.append("\n")
if pretty_print and closeTag:
s.append(space)
s.append(closeTag)
if indent_level is not None and closeTag and self.next_sibling:
# Even if this particular tag is not pretty-printed,
# we're now done with the tag, and we should add a
# newline if appropriate.
s.append("\n")
s = ''.join(s)
return s
def _should_pretty_print(self, indent_level):
# Then an optional closing slash (for a void element in an
# XML document).
void_element_closing_slash = ''
if self.is_empty_element:
void_element_closing_slash = formatter.void_element_close_prefix or ''
# Put it all together.
return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
def _should_pretty_print(self, indent_level=1):
"""Should this tag be pretty-printed?
Most of them should, but some (such as <pre> in HTML
@ -1754,7 +1914,7 @@ class Tag(PageElement):
a Unicode string will be returned.
:param formatter: A Formatter object, or a string naming one of
the standard formatters.
:return: A Unicode string (if encoding==None) or a bytestring
:return: A Unicode string (if encoding==None) or a bytestring
(otherwise).
"""
if encoding is None:
@ -1784,33 +1944,9 @@ class Tag(PageElement):
the standard Formatters.
"""
# First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
return self.decode(indent_level, eventual_encoding, formatter,
iterator=self.descendants)
pretty_print = (indent_level is not None)
s = []
for c in self:
text = None
if isinstance(c, NavigableString):
text = c.output_ready(formatter)
elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding,
formatter))
preserve_whitespace = (
self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
)
if text and indent_level and not preserve_whitespace:
text = text.strip()
if text:
if pretty_print and not preserve_whitespace:
s.append(formatter.indent * (indent_level - 1))
s.append(text)
if pretty_print and not preserve_whitespace:
s.append("\n")
return ''.join(s)
def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
@ -1862,7 +1998,8 @@ class Tag(PageElement):
:rtype: bs4.element.Tag | bs4.element.NavigableString
"""
r = None
l = self.find_all(name, attrs, recursive, string, 1, **kwargs)
l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
**kwargs)
if l:
r = l[0]
return r
@ -1889,7 +2026,9 @@ class Tag(PageElement):
generator = self.descendants
if not recursive:
generator = self.children
return self._find_all(name, attrs, string, limit, generator, **kwargs)
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, string, limit, generator,
_stacklevel=_stacklevel+1, **kwargs)
findAll = find_all # BS3
findChildren = find_all # BS2
@ -1903,6 +2042,18 @@ class Tag(PageElement):
# return iter() to make the purpose of the method clear
return iter(self.contents) # XXX This seems to be untested.
@property
def self_and_descendants(self):
"""Iterate over this PageElement and its children in a
breadth-first sequence.
:yield: A sequence of PageElements.
"""
if not self.hidden:
yield self
for i in self.descendants:
yield i
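
Unlike .descendants, this property yields the element itself first:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><p>hi</p></div>", "html.parser")
elements = list(soup.div.self_and_descendants)
print(elements[0] is soup.div)   # True: the element itself comes first
print(len(elements))             # 3: the <div>, the <p>, and the string "hi"
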
@property
def descendants(self):
"""Iterate over all children of this PageElement in a
@ -1929,16 +2080,13 @@ class Tag(PageElement):
Beautiful Soup will use the prefixes it encountered while
parsing the document.
:param kwargs: Keyword arguments to be passed into SoupSieve's
:param kwargs: Keyword arguments to be passed into Soup Sieve's
soupsieve.select() method.
:return: A Tag.
:rtype: bs4.element.Tag
"""
value = self.select(selector, namespaces, 1, **kwargs)
if value:
return value[0]
return None
return self.css.select_one(selector, namespaces, **kwargs)
def select(self, selector, namespaces=None, limit=None, **kwargs):
"""Perform a CSS selection operation on the current element.
@ -1954,27 +2102,18 @@ class Tag(PageElement):
:param limit: After finding this number of results, stop looking.
:param kwargs: Keyword arguments to be passed into SoupSieve's
:param kwargs: Keyword arguments to be passed into SoupSieve's
soupsieve.select() method.
:return: A ResultSet of Tags.
:rtype: bs4.element.ResultSet
"""
if namespaces is None:
namespaces = self._namespaces
if limit is None:
limit = 0
if soupsieve is None:
raise NotImplementedError(
"Cannot execute CSS selectors because the soupsieve package is not installed."
)
results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
return self.css.select(selector, namespaces, limit, **kwargs)
# We do this because it's more consistent and because
# ResultSet.__getattr__ has a helpful error message.
return ResultSet(None, results)
@property
def css(self):
"""Return an interface to the CSS selector API."""
return CSS(self)
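
Since select() and select_one() now delegate to this proxy, the two spellings are interchangeable:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p class='x'>a</p><p>b</p>", "html.parser")
print(soup.select("p.x") == soup.css.select("p.x"))   # True
print(soup.select_one("p.x").get_text())              # a
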
# Old names for backwards compatibility
def childGenerator(self):
@ -1993,7 +2132,7 @@ class Tag(PageElement):
"""
warnings.warn(
'has_key is deprecated. Use has_attr(key) instead.',
DeprecationWarning
DeprecationWarning, stacklevel=2
)
return self.has_attr(key)
@ -2019,12 +2158,12 @@ class SoupStrainer(object):
:param attrs: A dictionary of filters on attribute values.
:param string: A filter for a NavigableString with specific text.
:kwargs: A dictionary of filters on attribute values.
"""
"""
if string is None and 'text' in kwargs:
string = kwargs.pop('text')
warnings.warn(
"The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
DeprecationWarning
DeprecationWarning, stacklevel=2
)
self.name = self._normalize_search_value(name)
@ -2118,7 +2257,7 @@ class SoupStrainer(object):
# looking at a tag with a different name.
if markup and not markup.prefix and self.name != markup.name:
return False
call_function_with_tag_data = (
isinstance(self.name, Callable)
and not isinstance(markup_name, Tag))
@ -2204,7 +2343,7 @@ class SoupStrainer(object):
if self._matches(' '.join(markup), match_against):
return True
return False
if match_against is True:
# True matches any non-None value.
return markup is not None
@ -2248,11 +2387,11 @@ class SoupStrainer(object):
return True
else:
return False
# Beyond this point we might need to run the test twice: once against
# the tag's name and once against its prefixed name.
match = False
if not match and isinstance(match_against, str):
# Exact string match
match = markup == match_against

View file

@ -97,7 +97,7 @@ class Formatter(EntitySubstitution):
else:
indent = ' '
self.indent = indent
def substitute(self, ns):
"""Process a string that needs to undergo entity substitution.
This may be a string encountered in an attribute value or as
@ -149,14 +149,14 @@ class HTMLFormatter(Formatter):
"""A generic Formatter for HTML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter):
"""A generic Formatter for XML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
# Set up aliases for the default formatters.

View file

@ -32,7 +32,7 @@ from . import css_match as cm
from . import css_types as ct
from .util import DEBUG, SelectorSyntaxError # noqa: F401
import bs4 # type: ignore[import]
from typing import Optional, Any, Iterator, Iterable
from typing import Any, Iterator, Iterable
__all__ = (
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
@ -45,17 +45,14 @@ SoupSieve = cm.SoupSieve
def compile( # noqa: A001
pattern: str,
namespaces: Optional[dict[str, str]] = None,
namespaces: dict[str, str] | None = None,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
custom: dict[str, str] | None = None,
**kwargs: Any
) -> cm.SoupSieve:
"""Compile CSS pattern."""
ns = ct.Namespaces(namespaces) if namespaces is not None else namespaces # type: Optional[ct.Namespaces]
cs = ct.CustomSelectors(custom) if custom is not None else custom # type: Optional[ct.CustomSelectors]
if isinstance(pattern, SoupSieve):
if flags:
raise ValueError("Cannot process 'flags' argument on a compiled selector list")
@ -65,7 +62,12 @@ def compile( # noqa: A001
raise ValueError("Cannot process 'custom' argument on a compiled selector list")
return pattern
return cp._cached_css_compile(pattern, ns, cs, flags)
return cp._cached_css_compile(
pattern,
ct.Namespaces(namespaces) if namespaces is not None else namespaces,
ct.CustomSelectors(custom) if custom is not None else custom,
flags
)
def purge() -> None:
@ -77,10 +79,10 @@ def purge() -> None:
def closest(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None,
namespaces: dict[str, str] | None = None,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
custom: dict[str, str] | None = None,
**kwargs: Any
) -> 'bs4.Tag':
"""Match closest ancestor."""
@ -91,10 +93,10 @@ def closest(
def match(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None,
namespaces: dict[str, str] | None = None,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
custom: dict[str, str] | None = None,
**kwargs: Any
) -> bool:
"""Match node."""
@ -105,10 +107,10 @@ def match(
def filter( # noqa: A001
select: str,
iterable: Iterable['bs4.Tag'],
namespaces: Optional[dict[str, str]] = None,
namespaces: dict[str, str] | None = None,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
custom: dict[str, str] | None = None,
**kwargs: Any
) -> list['bs4.Tag']:
"""Filter list of nodes."""
@ -119,10 +121,10 @@ def filter( # noqa: A001
def select_one(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None,
namespaces: dict[str, str] | None = None,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
custom: dict[str, str] | None = None,
**kwargs: Any
) -> 'bs4.Tag':
"""Select a single tag."""
@ -133,11 +135,11 @@ def select_one(
def select(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None,
namespaces: dict[str, str] | None = None,
limit: int = 0,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
custom: dict[str, str] | None = None,
**kwargs: Any
) -> list['bs4.Tag']:
"""Select the specified tags."""
@ -148,11 +150,11 @@ def select(
def iselect(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None,
namespaces: dict[str, str] | None = None,
limit: int = 0,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
custom: dict[str, str] | None = None,
**kwargs: Any
) -> Iterator['bs4.Tag']:
"""Iterate the specified tags."""

View file

@ -193,5 +193,5 @@ def parse_version(ver: str) -> Version:
return Version(major, minor, micro, release, pre, post, dev)
__version_info__ = Version(2, 5, 0, "final", post=1)
__version_info__ = Version(2, 4, 1, "final")
__version__ = __version_info__._get_canonical()

View file

@ -6,7 +6,7 @@ import re
from . import css_types as ct
import unicodedata
import bs4 # type: ignore[import]
from typing import Iterator, Iterable, Any, Optional, Callable, Sequence, cast # noqa: F401
from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401
# Empty tag pattern (whitespace okay)
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
@ -171,7 +171,7 @@ class _DocumentNav:
def get_children(
self,
el: bs4.Tag,
start: Optional[int] = None,
start: int | None = None,
reverse: bool = False,
tags: bool = True,
no_iframe: bool = False
@ -239,22 +239,22 @@ class _DocumentNav:
return parent
@staticmethod
def get_tag_name(el: bs4.Tag) -> Optional[str]:
def get_tag_name(el: bs4.Tag) -> str | None:
"""Get tag."""
return cast(Optional[str], el.name)
return cast('str | None', el.name)
@staticmethod
def get_prefix_name(el: bs4.Tag) -> Optional[str]:
def get_prefix_name(el: bs4.Tag) -> str | None:
"""Get prefix."""
return cast(Optional[str], el.prefix)
return cast('str | None', el.prefix)
@staticmethod
def get_uri(el: bs4.Tag) -> Optional[str]:
def get_uri(el: bs4.Tag) -> str | None:
"""Get namespace `URI`."""
return cast(Optional[str], el.namespace)
return cast('str | None', el.namespace)
@classmethod
def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
@ -287,7 +287,7 @@ class _DocumentNav:
return bool(ns and ns == NS_XHTML)
@staticmethod
def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[Optional[str], Optional[str]]:
def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]:
"""Return namespace and attribute name without the prefix."""
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
@ -330,8 +330,8 @@ class _DocumentNav:
cls,
el: bs4.Tag,
name: str,
default: Optional[str | Sequence[str]] = None
) -> Optional[str | Sequence[str]]:
default: str | Sequence[str] | None = None
) -> str | Sequence[str] | None:
"""Get attribute by name."""
value = default
@ -348,7 +348,7 @@ class _DocumentNav:
return value
@classmethod
def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, Optional[str | Sequence[str]]]]:
def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]:
"""Iterate attributes."""
for k, v in el.attrs.items():
@ -424,10 +424,10 @@ class Inputs:
return 0 <= minutes <= 59
@classmethod
def parse_value(cls, itype: str, value: Optional[str]) -> Optional[tuple[float, ...]]:
def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
"""Parse the input value."""
parsed = None # type: Optional[tuple[float, ...]]
parsed = None # type: tuple[float, ...] | None
if value is None:
return value
if itype == "date":
@@ -486,7 +486,7 @@ class CSSMatch(_DocumentNav):
self,
selectors: ct.SelectorList,
scope: bs4.Tag,
namespaces: Optional[ct.Namespaces],
namespaces: ct.Namespaces | None,
flags: int
) -> None:
"""Initialize."""
@@ -545,19 +545,19 @@ class CSSMatch(_DocumentNav):
return self.get_tag_ns(el) == NS_XHTML
def get_tag(self, el: bs4.Tag) -> Optional[str]:
def get_tag(self, el: bs4.Tag) -> str | None:
"""Get tag."""
name = self.get_tag_name(el)
return util.lower(name) if name is not None and not self.is_xml else name
def get_prefix(self, el: bs4.Tag) -> Optional[str]:
def get_prefix(self, el: bs4.Tag) -> str | None:
"""Get prefix."""
prefix = self.get_prefix_name(el)
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
def find_bidi(self, el: bs4.Tag) -> Optional[int]:
def find_bidi(self, el: bs4.Tag) -> int | None:
"""Get directionality from element text."""
for node in self.get_children(el, tags=False):
@@ -653,8 +653,8 @@ class CSSMatch(_DocumentNav):
self,
el: bs4.Tag,
attr: str,
prefix: Optional[str]
) -> Optional[str | Sequence[str]]:
prefix: str | None
) -> str | Sequence[str] | None:
"""Match attribute name and return value if it exists."""
value = None
@@ -751,7 +751,7 @@ class CSSMatch(_DocumentNav):
name not in (self.get_tag(el), '*')
)
def match_tag(self, el: bs4.Tag, tag: Optional[ct.SelectorTag]) -> bool:
def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
"""Match the tag."""
match = True
@@ -1030,7 +1030,7 @@ class CSSMatch(_DocumentNav):
"""Match element if it contains text."""
match = True
content = None # type: Optional[str | Sequence[str]]
content = None # type: str | Sequence[str] | None
for contain_list in contains:
if content is None:
if contain_list.own:
@@ -1099,7 +1099,7 @@ class CSSMatch(_DocumentNav):
match = False
name = cast(str, self.get_attribute_by_name(el, 'name'))
def get_parent_form(el: bs4.Tag) -> Optional[bs4.Tag]:
def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
"""Find this input's form."""
form = None
parent = self.get_parent(el, no_iframe=True)
@@ -1478,7 +1478,7 @@ class CSSMatch(_DocumentNav):
if lim < 1:
break
def closest(self) -> Optional[bs4.Tag]:
def closest(self) -> bs4.Tag | None:
"""Match closest ancestor."""
current = self.tag
@@ -1506,7 +1506,7 @@ class SoupSieve(ct.Immutable):
pattern: str
selectors: ct.SelectorList
namespaces: Optional[ct.Namespaces]
namespaces: ct.Namespaces | None
custom: dict[str, str]
flags: int
@@ -1516,8 +1516,8 @@ class SoupSieve(ct.Immutable):
self,
pattern: str,
selectors: ct.SelectorList,
namespaces: Optional[ct.Namespaces],
custom: Optional[ct.CustomSelectors],
namespaces: ct.Namespaces | None,
custom: ct.CustomSelectors | None,
flags: int
):
"""Initialize."""

View file

@@ -7,7 +7,7 @@ from . import css_match as cm
from . import css_types as ct
from .util import SelectorSyntaxError
import warnings
from typing import Optional, Match, Any, Iterator, cast
from typing import Match, Any, Iterator, cast
UNICODE_REPLACEMENT_CHAR = 0xFFFD
@@ -113,7 +113,7 @@ VALUE = r'''
'''.format(nl=NEWLINE, ident=IDENTIFIER)
# Attribute value comparison. `!=` is handled special as it is non-standard.
ATTR = r'''
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}*(?P<case>[is]))?)?{ws}*\]
'''.format(ws=WSC, value=VALUE)
# Selector patterns
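
The ATTR change above relaxes `{ws}+` to `{ws}*` ahead of the case-sensitivity flag, so a quoted attribute value no longer needs a space before the `i`/`s` modifier. A quick check against the public API (upstream package name assumed; the selector strings are illustrative):

    import soupsieve as sv

    sv.compile('[title="books" i]')  # accepted before and after this change
    sv.compile('[title="books"i]')   # assumption: now also accepted via the {ws}* relaxation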
@@ -207,8 +207,8 @@ _MAXCACHE = 500
@lru_cache(maxsize=_MAXCACHE)
def _cached_css_compile(
pattern: str,
namespaces: Optional[ct.Namespaces],
custom: Optional[ct.CustomSelectors],
namespaces: ct.Namespaces | None,
custom: ct.CustomSelectors | None,
flags: int
) -> cm.SoupSieve:
"""Cached CSS compile."""
@@ -233,7 +233,7 @@ def _purge_cache() -> None:
_cached_css_compile.cache_clear()
def process_custom(custom: Optional[ct.CustomSelectors]) -> dict[str, str | ct.SelectorList]:
def process_custom(custom: ct.CustomSelectors | None) -> dict[str, str | ct.SelectorList]:
"""Process custom."""
custom_selectors = {}
@@ -317,7 +317,7 @@ class SelectorPattern:
return self.name
def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
"""Match the selector."""
return self.re_pattern.match(selector, index)
@@ -336,7 +336,7 @@ class SpecialPseudoPattern(SelectorPattern):
for pseudo in p[1]:
self.patterns[pseudo] = pattern
self.matched_name = None # type: Optional[SelectorPattern]
self.matched_name = None # type: SelectorPattern | None
self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)
def get_name(self) -> str:
@@ -344,7 +344,7 @@ class SpecialPseudoPattern(SelectorPattern):
return '' if self.matched_name is None else self.matched_name.get_name()
def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
"""Match the selector."""
pseudo = None
@@ -372,14 +372,14 @@ class _Selector:
def __init__(self, **kwargs: Any) -> None:
"""Initialize."""
self.tag = kwargs.get('tag', None) # type: Optional[ct.SelectorTag]
self.tag = kwargs.get('tag', None) # type: ct.SelectorTag | None
self.ids = kwargs.get('ids', []) # type: list[str]
self.classes = kwargs.get('classes', []) # type: list[str]
self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute]
self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth]
self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList]
self.relations = kwargs.get('relations', []) # type: list[_Selector]
self.rel_type = kwargs.get('rel_type', None) # type: Optional[str]
self.rel_type = kwargs.get('rel_type', None) # type: str | None
self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains]
self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang]
self.flags = kwargs.get('flags', 0) # type: int
@@ -462,7 +462,7 @@ class CSSParser:
def __init__(
self,
selector: str,
custom: Optional[dict[str, str | ct.SelectorList]] = None,
custom: dict[str, str | ct.SelectorList] | None = None,
flags: int = 0
) -> None:
"""Initialize."""
@@ -723,7 +723,7 @@ class CSSParser:
if postfix == '_child':
if m.group('of'):
# Parse the rest of `of S`.
nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN | FLG_FORGIVE)
nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
else:
# Use default `*|*` for `of S`.
nth_sel = CSS_NTH_OF_S_DEFAULT
@@ -753,7 +753,7 @@ class CSSParser:
if name == ':not':
flags |= FLG_NOT
elif name == ':has':
flags |= FLG_RELATIVE | FLG_FORGIVE
flags |= FLG_RELATIVE
elif name in (':where', ':is'):
flags |= FLG_FORGIVE
@@ -777,11 +777,6 @@ class CSSParser:
if not combinator:
combinator = WS_COMBINATOR
if combinator == COMMA_COMBINATOR:
if not has_selector:
# If we've not captured any selector parts, the comma is either at the beginning of the pattern
# or following another comma, both of which are unexpected. But shouldn't fail the pseudo-class.
sel.no_match = True
sel.rel_type = rel_type
selectors[-1].relations.append(sel)
rel_type = ":" + WS_COMBINATOR
@@ -1070,22 +1065,12 @@ class CSSParser:
selectors.append(sel)
# Forgive empty slots in pseudo-classes that have lists (and are forgiving)
elif is_forgive:
if is_relative:
# Handle relative selectors pseudo-classes with empty slots like `:has()`
if selectors and selectors[-1].rel_type is None and rel_type == ': ':
sel.rel_type = rel_type
sel.no_match = True
selectors[-1].relations.append(sel)
has_selector = True
else:
# Handle normal pseudo-classes with empty slots
if not selectors or not relations:
# Others like `:is()` etc.
sel.no_match = True
del relations[:]
selectors.append(sel)
has_selector = True
elif is_forgive and (not selectors or not relations):
# Handle normal pseudo-classes with empty slots like `:is()` etc.
sel.no_match = True
del relations[:]
selectors.append(sel)
has_selector = True
if not has_selector:
# We will always need to finish a selector when `:has()` is used as it leads with combining.
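
Net effect of the FLG_FORGIVE removals above: in 2.4.1 the argument of `:has()` (and the `of S` part of `:nth-child()`) is parsed as a normal, non-forgiving selector list, following the updated CSS specification, while `:is()` and `:where()` stay forgiving. A hedged sketch of the observable difference (upstream package name and its exported exception assumed):

    import soupsieve as sv

    sv.compile('p:is()')       # :is()/:where() remain forgiving: compiles and simply matches nothing
    try:
        sv.compile('p:has()')  # assumption: rejected now that :has() no longer sets FLG_FORGIVE
    except sv.SelectorSyntaxError as exc:
        print('non-forgiving :has():', exc)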

View file

@@ -2,7 +2,7 @@
from __future__ import annotations
import copyreg
from .pretty import pretty
from typing import Any, Iterator, Hashable, Optional, Pattern, Iterable, Mapping
from typing import Any, Iterator, Hashable, Pattern, Iterable, Mapping
__all__ = (
'Selector',
@@ -189,28 +189,28 @@ class Selector(Immutable):
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
)
tag: Optional[SelectorTag]
tag: SelectorTag | None
ids: tuple[str, ...]
classes: tuple[str, ...]
attributes: tuple[SelectorAttribute, ...]
nth: tuple[SelectorNth, ...]
selectors: tuple[SelectorList, ...]
relation: SelectorList
rel_type: Optional[str]
rel_type: str | None
contains: tuple[SelectorContains, ...]
lang: tuple[SelectorLang, ...]
flags: int
def __init__(
self,
tag: Optional[SelectorTag],
tag: SelectorTag | None,
ids: tuple[str, ...],
classes: tuple[str, ...],
attributes: tuple[SelectorAttribute, ...],
nth: tuple[SelectorNth, ...],
selectors: tuple[SelectorList, ...],
relation: SelectorList,
rel_type: Optional[str],
rel_type: str | None,
contains: tuple[SelectorContains, ...],
lang: tuple[SelectorLang, ...],
flags: int
@@ -247,9 +247,9 @@ class SelectorTag(Immutable):
__slots__ = ("name", "prefix", "_hash")
name: str
prefix: Optional[str]
prefix: str | None
def __init__(self, name: str, prefix: Optional[str]) -> None:
def __init__(self, name: str, prefix: str | None) -> None:
"""Initialize."""
super().__init__(name=name, prefix=prefix)
@@ -262,15 +262,15 @@ class SelectorAttribute(Immutable):
attribute: str
prefix: str
pattern: Optional[Pattern[str]]
xml_type_pattern: Optional[Pattern[str]]
pattern: Pattern[str] | None
xml_type_pattern: Pattern[str] | None
def __init__(
self,
attribute: str,
prefix: str,
pattern: Optional[Pattern[str]],
xml_type_pattern: Optional[Pattern[str]]
pattern: Pattern[str] | None,
xml_type_pattern: Pattern[str] | None
) -> None:
"""Initialize."""
@@ -360,7 +360,7 @@ class SelectorList(Immutable):
def __init__(
self,
selectors: Optional[Iterable[Selector | SelectorNull]] = None,
selectors: Iterable[Selector | SelectorNull] | None = None,
is_not: bool = False,
is_html: bool = False
) -> None:

View file

@@ -3,7 +3,7 @@ from __future__ import annotations
from functools import wraps, lru_cache
import warnings
import re
from typing import Callable, Any, Optional
from typing import Callable, Any
DEBUG = 0x00001
@@ -27,7 +27,7 @@ def lower(string: str) -> str:
class SelectorSyntaxError(Exception):
"""Syntax error in a CSS selector."""
def __init__(self, msg: str, pattern: Optional[str] = None, index: Optional[int] = None) -> None:
def __init__(self, msg: str, pattern: str | None = None, index: int | None = None) -> None:
"""Initialize."""
self.line = None
@@ -84,7 +84,7 @@ def get_pattern_context(pattern: str, index: int) -> tuple[str, int, int]:
col = 1
text = [] # type: list[str]
line = 1
offset = None # type: Optional[int]
offset = None # type: int | None
# Split pattern by newline and handle the text before the newline
for m in RE_PATTERN_LINE_SPLIT.finditer(pattern):