Update Beautiful Soup 4.13.0b3 (55e006b) → 4.13.4 (9752e85).

This commit is contained in:
JackDandy 2025-06-26 18:09:22 +01:00
parent 2b039695b8
commit 4a7ea7746f
11 changed files with 169 additions and 66 deletions

View file

@ -2,7 +2,7 @@
* Update apprise 1.8.0 (81caf92) to 1.9.2 (a2a2216)
* Update attr 23.2.0 (b393d79) to 24.3.0 (598494a)
* Update Beautiful Soup 4.12.3 (7fb5175) to 4.13.0b3 (55e006b)
* Update Beautiful Soup 4.12.3 (7fb5175) to 4.13.4 (9752e85)
* Update CacheControl 0.14.0 (e2be0c2) to 0.14.3 (116113c)
* Update certifi 2024.08.30 to 2024.12.14
* Update chardet packages 5.1.0 (8087f00) to 5.3.0dev0 (8e8dfcd)
@ -34,6 +34,7 @@
[develop changelog]
* Update Beautiful Soup 4.12.3 (7fb5175) to 4.13.0b3 (55e006b)
* Update CacheControl 0.14.0 (e2be0c2) to 0.14.2 (928422d)
* Update Msgpack 1.0.6 (e1d3d5d) to 1.1.0 (0eeabfb)
* Update soupsieve 2.5.0 (dc71495) to 2.6.0 (a8080d9)

View file

@ -15,7 +15,7 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.13.0b3"
__version__ = "4.13.4"
__copyright__ = "Copyright (c) 2004-2025 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

View file

@ -25,7 +25,7 @@ def _deprecated_alias(old_name: str, new_name: str, version: str):
:meta private:
"""
@property
@property # type:ignore
def alias(self) -> Any:
":meta private:"
warnings.warn(

View file

@ -192,17 +192,11 @@ class TreeBuilder(object):
doesn't keep track of this information, then store_line_numbers
is irrelevant.
:param attribute_dict_class: A Tag's attribute values (available
as tag.attrs) will be stored in an instance of this class.
The default is Beautiful Soup's built-in `AttributeDict` class and
you will probably never need to change it.
:param attribute_dict_class: The value of a multi-valued attribute
(such as HTML's 'class') will be stored in an instance of this
class. The default is Beautiful Soup's built-in
`AttributeValueList`, which is a normal Python list, and you
will probably never need to change it.
"""
USE_DEFAULT: Any = object() #: :meta private:
@ -266,7 +260,7 @@ class TreeBuilder(object):
#: The textual contents of tags with these names should be
#: instantiated with some class other than `bs4.element.NavigableString`.
DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {}
DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {} # type:ignore
#: By default, tags are treated as empty-element tags if they have
#: no contents--that is, using XML rules. HTMLTreeBuilder
@ -605,7 +599,7 @@ class HTMLTreeBuilder(TreeBuilder):
#:
#: TODO: Arguably <noscript> could go here but it seems
#: qualitatively different from the other tags.
DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {
DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = { # type:ignore
"rt": RubyTextString,
"rp": RubyParenthesisString,
"style": Stylesheet,

View file

@ -136,7 +136,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# HTMLBinaryInputStream.__init__.
extra_kwargs["override_encoding"] = self.user_specified_encoding
doc = parser.parse(markup, **extra_kwargs)
doc = parser.parse(markup, **extra_kwargs) # type:ignore
# Set the character encoding detected by the tokenizer.
if isinstance(markup, str):
@ -144,7 +144,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
original_encoding = parser.tokenizer.stream.charEncoding[0]
original_encoding = parser.tokenizer.stream.charEncoding[0] # type:ignore
# The encoding is an html5lib Encoding object. We want to
# use a string for compatibility with other tree builders.
original_encoding = original_encoding.name
@ -227,7 +227,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
# This represents the point immediately after the end of the
# tag. We don't know when the tag started, but we do know
# where it ended -- the character just before this one.
sourceline, sourcepos = self.parser.tokenizer.stream.position()
sourceline, sourcepos = self.parser.tokenizer.stream.position() # type:ignore
assert sourcepos is not None
sourcepos = sourcepos - 1
tag = self.soup.new_tag(
@ -266,7 +266,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def getDocument(self) -> "BeautifulSoup":
    """Return the object stored in ``self.soup``."""
    return self.soup
def testSerializer(self, element: "Element") -> str:
def testSerializer(self, element: "Element") -> None:
"""This is only used by the html5lib unit tests. Since we
don't currently hook into those tests, the implementation is
left blank.
@ -337,7 +337,7 @@ class BeautifulSoupNode(treebuilder_base.Node):
# TODO-TYPING: typeshed stubs are incorrect about this;
# cloneNode returns a new Node, not None.
def cloneNode(self) -> treebuilder_base.Node:
def cloneNode(self) -> treebuilder_base.Node: # type:ignore
raise NotImplementedError()
@ -564,12 +564,12 @@ class Element(BeautifulSoupNode):
# TODO-TYPING: typeshed stubs are incorrect about this;
# hasContent returns a boolean, not None.
def hasContent(self) -> bool:
def hasContent(self) -> bool: # type:ignore
return len(self.element.contents) > 0
# TODO-TYPING: typeshed stubs are incorrect about this;
# cloneNode returns a new Node, not None.
def cloneNode(self) -> treebuilder_base.Node:
def cloneNode(self) -> treebuilder_base.Node: # type:ignore
tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
for key, value in self.attributes:

View file

@ -22,11 +22,13 @@ from typing import (
TYPE_CHECKING,
Union,
)
from typing_extensions import TypeAlias
from io import BytesIO
from io import StringIO
from lxml import etree
from typing_extensions import TypeAlias
from lxml import etree # type:ignore
from ..element import (
AttributeDict,
XMLAttributeDict,
@ -180,6 +182,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
if self.is_xml:
self.processing_instruction_class = XMLProcessingInstruction
else:
self.processing_instruction_class = ProcessingInstruction
if "attribute_dict_class" not in kwargs:
kwargs["attribute_dict_class"] = XMLAttributeDict
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
@ -226,14 +233,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
document to Unicode and parsing it. Each strategy will be tried
in turn.
"""
is_html = not self.is_xml
if is_html:
self.processing_instruction_class = ProcessingInstruction
if not self.is_xml:
# We're in HTML mode, so if we're given XML, that's worth
# noting.
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)
else:
self.processing_instruction_class = XMLProcessingInstruction
if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on
@ -274,7 +277,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
markup,
known_definite_encodings=known_definite_encodings,
user_encodings=user_encodings,
is_html=is_html,
is_html=not self.is_xml,
exclude_encodings=exclude_encodings,
)
for encoding in detector.encodings:

View file

@ -47,7 +47,7 @@ import warnings
chardet_module: Optional[ModuleType] = None
try:
# PyPI package: cchardet
import cchardet
import cchardet # type:ignore
chardet_module = cchardet
except ImportError:
@ -60,7 +60,7 @@ except ImportError:
except ImportError:
try:
# PyPI package: charset-normalizer
import charset_normalizer
import charset_normalizer # type:ignore
chardet_module = charset_normalizer
except ImportError:
@ -797,9 +797,9 @@ class UnicodeDammit:
)
# Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, str) or markup == b"":
self.markup = markup
self.unicode_markup = str(markup)
if isinstance(markup, str):
self.markup = markup.encode("utf8")
self.unicode_markup = markup
self.original_encoding = None
return

View file

@ -52,7 +52,7 @@ def diagnose(data: "_IncomingMarkup") -> None:
if "lxml" in basic_parsers:
basic_parsers.append("lxml-xml")
try:
from lxml import etree
from lxml import etree # type:ignore
print(("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION))))
except ImportError:

View file

@ -37,6 +37,7 @@ from typing import (
TypeVar,
Union,
cast,
overload,
)
from typing_extensions import (
Self,
@ -223,7 +224,7 @@ class AttributeValueList(List[str]):
"""
class AttributeDict(dict):
class AttributeDict(Dict[Any,Any]):
"""Superclass for the dictionary used to hold a tag's
attributes. You can use this, but it's just a regular dict with no
special logic.
@ -235,7 +236,7 @@ class XMLAttributeDict(AttributeDict):
incoming values for consistency with the HTML spec.
"""
def __setitem__(self, key: str, value: Any):
def __setitem__(self, key: str, value: Any) -> None:
"""Set an attribute value, possibly modifying it to comply with
the XML spec.
@ -273,7 +274,7 @@ class HTMLAttributeDict(AttributeDict):
around boolean attributes that XML doesn't have.
"""
def __setitem__(self, key: str, value: Any):
def __setitem__(self, key: str, value: Any) -> None:
"""Set an attribute value, possibly modifying it to comply
with the HTML spec.
"""
@ -389,7 +390,7 @@ class PageElement(object):
:param previous_element: The element parsed immediately before
this one.
:param next_element: The element parsed immediately before
:param next_element: The element parsed immediately after
this one.
:param previous_sibling: The most recently encountered element
@ -1231,7 +1232,7 @@ class PageElement(object):
"""
return self._self_and(self.parents)
def _self_and(self, other_generator):
def _self_and(self, other_generator:Iterator[PageElement]) -> Iterator[PageElement]:
"""Modify a generator by yielding this element, then everything
yielded by the other generator.
"""
@ -1317,6 +1318,14 @@ class NavigableString(str, PageElement):
def __getnewargs__(self) -> Tuple[str]:
return (str(self),)
# TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
# is introduced in 3.8. This can be changed once 3.7 support is dropped.
def __getitem__(self, key: Union[int, slice]) -> str:  # type:ignore
    """Return the character or substring selected by ``key``.

    Non-string keys are handed to the superclass ``__getitem__``
    unchanged.

    :param key: An integer index or a slice.
    :raises TypeError: If ``key`` is a string. The message asks whether
        the caller is treating a NavigableString like a Tag.
    """
    if isinstance(key, str):
        # Fail with a message that points at the likely mistake instead
        # of passing the string key on to the superclass.
        raise TypeError("string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(key.__class__.__name__))
    return super(NavigableString, self).__getitem__(key)
@property
def string(self) -> str:
"""Convenience property defined to match `Tag.string`.
@ -2188,7 +2197,8 @@ class Tag(PageElement):
elif isinstance(value, list):
list_value = value
else:
value = cast(str, value)
if not isinstance(value, str):
value = cast(str, value)
list_value = self.attribute_value_list_class([value])
return list_value
@ -2597,6 +2607,22 @@ class Tag(PageElement):
or self.name not in self.preserve_whitespace_tags
)
@overload
def prettify(
self,
encoding: None = None,
formatter: _FormatterOrName = "minimal",
) -> str:
...
@overload
def prettify(
self,
encoding: _Encoding,
formatter: _FormatterOrName = "minimal",
) -> bytes:
...
def prettify(
self,
encoding: Optional[_Encoding] = None,

View file

@ -78,6 +78,21 @@ class ElementFilter(object):
"""
self.match_function = match_function
@property
def includes_everything(self) -> bool:
    """Report whether this `ElementFilter` trivially lets everything through.

    When this is true, callers can skip the per-element filtering
    work entirely.

    A `False` result is not a guarantee that something will be
    rejected: the filter may still end up accepting every element,
    just not in a way that can be seen without running it.

    In the base `ElementFilter` all decisions are made by the match
    function, so the answer is true only when no match function
    has been set.
    """
    if self.match_function:
        return False
    return True
@property
def excludes_everything(self) -> bool:
"""Does this `ElementFilter` obviously exclude everything? If
@ -88,19 +103,25 @@ class ElementFilter(object):
if this returns `False`, but it won't exclude everything in an
obvious way.
The base `ElementFilter` implementation excludes *nothing*, so
the base implementation of `excludes_everything` always
returns `False`.
The base `ElementFilter` implementation excludes things based
on a match function we can't inspect, so excludes_everything
is always false.
"""
return False
def match(self, element: PageElement) -> bool:
def match(self, element: PageElement, _known_rules:bool=False) -> bool:
"""Does the given PageElement match the rules set down by this
ElementFilter?
The base implementation delegates to the function passed in to
the constructor.
:param _known_rules: Defined for compatibility with
SoupStrainer._match(). Used more for consistency than because
we need the performance optimization.
"""
if not _known_rules and self.includes_everything:
return True
if not self.match_function:
return True
return self.match_function(element)
@ -111,13 +132,18 @@ class ElementFilter(object):
Acts like Python's built-in `filter`, using
`ElementFilter.match` as the filtering function.
"""
# If there are no rules at all, don't bother filtering. Let
# anything through.
if self.includes_everything:
for i in generator:
yield i
while True:
try:
i = next(generator)
except StopIteration:
break
if i:
if self.match(i):
if self.match(i, _known_rules=True):
yield cast("_OneElement", i)
def find(self, generator: Iterator[PageElement]) -> _AtMostOneElement:
@ -190,6 +216,7 @@ class MatchRule(object):
string: Optional[str]
pattern: Optional[_RegularExpressionProtocol]
present: Optional[bool]
exclude_everything: Optional[bool]
# TODO-TYPING: All MatchRule objects also have an attribute
# ``function``, but the type of the function depends on the
# subclass.
@ -200,6 +227,7 @@ class MatchRule(object):
pattern: Optional[_RegularExpressionProtocol] = None,
function: Optional[Callable] = None,
present: Optional[bool] = None,
exclude_everything: Optional[bool] = None
):
if isinstance(string, bytes):
string = string.decode("utf8")
@ -212,19 +240,20 @@ class MatchRule(object):
self.pattern = pattern
self.function = function
self.present = present
self.exclude_everything = exclude_everything
values = [
x
for x in (self.string, self.pattern, self.function, self.present)
for x in (self.string, self.pattern, self.function, self.present, self.exclude_everything)
if x is not None
]
if len(values) == 0:
raise ValueError(
"Either string, pattern, function or present must be provided."
"Either string, pattern, function, present, or exclude_everything must be provided."
)
if len(values) > 1:
raise ValueError(
"At most one of string, pattern, function and present must be provided."
"At most one of string, pattern, function, present, and exclude_everything must be provided."
)
def _base_match(self, string: Optional[str]) -> Optional[bool]:
@ -234,6 +263,10 @@ class MatchRule(object):
:return: True or False if we have a (positive or negative)
match; None if we need to keep trying.
"""
# self.exclude_everything matches nothing.
if self.exclude_everything:
return False
# self.present==True matches everything except None.
if self.present is True:
return string is not None
@ -357,9 +390,15 @@ class SoupStrainer(ElementFilter):
stacklevel=2,
)
self.name_rules = cast(
List[TagNameMatchRule], list(self._make_match_rules(name, TagNameMatchRule))
)
if name is None and not attrs and not string and not kwargs:
# Special case for backwards compatibility. Instantiating
# a SoupStrainer with no arguments whatsoever gets you one
# that matches all Tags, and only Tags.
self.name_rules = [TagNameMatchRule(present=True)]
else:
self.name_rules = cast(
List[TagNameMatchRule], list(self._make_match_rules(name, TagNameMatchRule))
)
self.attribute_rules = defaultdict(list)
if not isinstance(attrs, dict):
@ -395,17 +434,35 @@ class SoupStrainer(ElementFilter):
#: variable might have. Look at the .string_rules list instead.
self.__string = string
@property
def includes_everything(self) -> bool:
    """Report whether the configured rules trivially match everything.

    This is true only when there are no name rules, no string rules
    and no attribute rules. A `False` result does not prove that
    something will be excluded; it only means inclusion is not
    obvious from the rule sets alone.
    """
    return not (self.name_rules or self.string_rules or self.attribute_rules)
@property
def excludes_everything(self) -> bool:
"""Check whether the provided rules will obviously exclude
everything. (They might exclude everything even if this returns `False`,
but not in an obvious way.)
"""
return (
True
if (self.string_rules and (self.name_rules or self.attribute_rules))
else False
)
if (self.string_rules and (self.name_rules or self.attribute_rules)):
# This is self-contradictory, so the rules exclude everything.
return True
# If there's a rule that ended up treated as an "exclude everything"
# rule due to creating a logical inconsistency, then the rules
# exclude everything.
if any(x.exclude_everything for x in self.string_rules):
return True
if any(x.exclude_everything for x in self.name_rules):
return True
for ruleset in self.attribute_rules.values():
if any(x.exclude_everything for x in ruleset):
return True
return False
@property
def string(self) -> Optional[_StrainableString]:
@ -454,18 +511,24 @@ class SoupStrainer(ElementFilter):
elif isinstance(obj, _RegularExpressionProtocol):
yield rule_class(pattern=obj)
elif hasattr(obj, "__iter__"):
if not obj:
# The attribute is being matched against the null set,
# which means it should exclude everything.
yield rule_class(exclude_everything=True)
for o in obj:
if not isinstance(o, (bytes, str)) and hasattr(o, "__iter__"):
# This is almost certainly the user's
# mistake. This list contains another list, which
# opens up the possibility of infinite
# self-reference. In the interests of avoiding
# infinite recursion, we'll ignore this item
# rather than looking inside.
# infinite recursion, we'll treat this as an
# impossible match and issue a rule that excludes
# everything, rather than looking inside.
warnings.warn(
f"Ignoring nested list {o} to avoid the possibility of infinite recursion.",
stacklevel=5,
)
yield rule_class(exclude_everything=True)
continue
for x in cls._make_match_rules(o, rule_class):
yield x
@ -487,6 +550,10 @@ class SoupStrainer(ElementFilter):
but a `SoupStrainer` that *only* contains `StringMatchRule`
cannot match a `Tag`, only a `NavigableString`.
"""
# If there are no rules at all, let anything through.
#if self.includes_everything:
# return True
# String rules cannot match a Tag on their own.
if not self.name_rules and not self.attribute_rules:
return False
@ -515,8 +582,12 @@ class SoupStrainer(ElementFilter):
# [f"{k}={v}" for k, v in sorted(tag.attrs.items())]
# )
# print(f"Testing <{tag.name} {attrs}>{tag.string}</{tag.name}> against {rule}")
# If the rule contains a function, the function will be called
# with `tag`. It will not be called a second time with
# `prefixed_name`.
if rule.matches_tag(tag) or (
prefixed_name is not None and rule.matches_string(prefixed_name)
not rule.function and prefixed_name is not None and rule.matches_string(prefixed_name)
):
name_matches = True
break
@ -647,24 +718,30 @@ class SoupStrainer(ElementFilter):
return True
return False
def match(self, element: PageElement) -> bool:
def match(self, element: PageElement, _known_rules: bool=False) -> bool:
"""Does the given `PageElement` match the rules set down by this
`SoupStrainer`?
The find_* methods rely heavily on this method to find matches.
:param element: A `PageElement`.
:param _known_rules: Set to true in the common case where
we already checked and found at least one rule in this SoupStrainer
that might exclude a PageElement. Without this, we need
to check .includes_everything every time, just to be safe.
:return: `True` if the element matches this `SoupStrainer`'s rules; `False` otherwise.
"""
# If there are no rules at all, let anything through.
if not _known_rules and self.includes_everything:
return True
if isinstance(element, Tag):
return self.matches_tag(element)
assert isinstance(element, NavigableString)
if not (self.name_rules or self.attribute_rules):
# A NavigableString can only match a SoupStrainer that
# does not define any name or attribute restrictions.
for rule in self.string_rules:
if rule.matches_string(element):
return True
# does not define any name or attribute rules.
# Then it comes down to the string rules.
return self.matches_any_string_rule(element)
return False
@_deprecated("allow_tag_creation", "4.13.0")

View file

@ -83,7 +83,7 @@ class Formatter(EntitySubstitution):
void_element_close_prefix: str = "/",
cdata_containing_tags: Optional[Set[str]] = None,
empty_attributes_are_booleans: bool = False,
indent: int = 1,
indent: Union[int,str] = 1,
):
r"""Constructor.
@ -168,7 +168,7 @@ class Formatter(EntitySubstitution):
return self.substitute(value)
def attributes(
self, tag: bs4.element.Tag
self, tag: bs4.element.Tag # type:ignore
) -> Iterable[Tuple[str, Optional[_AttributeValue]]]:
"""Reorder a tag's attributes however you want.
@ -201,7 +201,7 @@ class HTMLFormatter(Formatter):
void_element_close_prefix: str = "/",
cdata_containing_tags: Optional[Set[str]] = None,
empty_attributes_are_booleans: bool = False,
indent: int = 1,
indent: Union[int,str] = 1,
):
super(HTMLFormatter, self).__init__(
self.HTML,
@ -209,6 +209,7 @@ class HTMLFormatter(Formatter):
void_element_close_prefix,
cdata_containing_tags,
empty_attributes_are_booleans,
indent=indent
)
@ -223,7 +224,7 @@ class XMLFormatter(Formatter):
void_element_close_prefix: str = "/",
cdata_containing_tags: Optional[Set[str]] = None,
empty_attributes_are_booleans: bool = False,
indent: int = 1,
indent: Union[int,str] = 1,
):
super(XMLFormatter, self).__init__(
self.XML,
@ -231,6 +232,7 @@ class XMLFormatter(Formatter):
void_element_close_prefix,
cdata_containing_tags,
empty_attributes_are_booleans,
indent=indent,
)