mirror of https://github.com/SickGear/SickGear.git
Update Beautiful Soup 4.13.0b3 (55e006b) → 4.13.4 (9752e85).
This commit is contained in:
parent 2b039695b8
commit 4a7ea7746f
11 changed files with 169 additions and 66 deletions

@@ -2,7 +2,7 @@

* Update apprise 1.8.0 (81caf92) to 1.9.2 (a2a2216)
* Update attr 23.2.0 (b393d79) to 24.3.0 (598494a)
-* Update Beautiful Soup 4.12.3 (7fb5175) to 4.13.0b3 (55e006b)
+* Update Beautiful Soup 4.12.3 (7fb5175) to 4.13.4 (9752e85)
* Update CacheControl 0.14.0 (e2be0c2) to 0.14.3 (116113c)
* Update certifi 2024.08.30 to 2024.12.14
* Update chardet packages 5.1.0 (8087f00) to 5.3.0dev0 (8e8dfcd)

@@ -34,6 +34,7 @@

[develop changelog]

+* Update Beautiful Soup 4.12.3 (7fb5175) to 4.13.0b3 (55e006b)
* Update CacheControl 0.14.0 (e2be0c2) to 0.14.2 (928422d)
* Update Msgpack 1.0.6 (e1d3d5d) to 1.1.0 (0eeabfb)
* Update soupsieve 2.5.0 (dc71495) to 2.6.0 (a8080d9)

@@ -15,7 +15,7 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.13.0b3"
+__version__ = "4.13.4"
__copyright__ = "Copyright (c) 2004-2025 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

@@ -25,7 +25,7 @@ def _deprecated_alias(old_name: str, new_name: str, version: str):
:meta private:
"""

-@property
+@property # type:ignore
def alias(self) -> Any:
":meta private:"
warnings.warn(

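As a rough sketch of the pattern this hunk touches only (the helper and class names below are illustrative, not bs4's actual implementation), such a factory returns a property that forwards the old attribute name to the new one and warns on access:

import warnings
from typing import Any

def deprecated_alias(old_name: str, new_name: str, version: str) -> Any:
    """Build a property that forwards old_name to new_name and warns on use.
    A simplified sketch of the pattern, not the exact bs4 helper."""

    @property  # type:ignore
    def alias(self) -> Any:
        warnings.warn(
            f"{old_name} is deprecated since {version}; use {new_name} instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return getattr(self, new_name)

    return alias

class Example:
    new_attribute = "value"
    # Accessing Example().old_attribute warns, then returns "value".
    old_attribute = deprecated_alias("old_attribute", "new_attribute", "4.13.0")
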
@@ -192,17 +192,11 @@ class TreeBuilder(object):
doesn't keep track of this information, then store_line_numbers
is irrelevant.

:param attribute_dict_class: A Tag's attribute values (available
as tag.attrs) willl be stored in an instance of this class.
The default is Beautiful Soup's built-in `AttributeDict` class and
you will probably never need to change it.

:param attribute_dict_class: The value of a multi-valued attribute
(such as HTML's 'class') willl be stored in an instance of this
class. The default is Beautiful Soup's built-in
`AttributeValueList`, which is a normal Python list, and you
will probably never need to change it.

"""

USE_DEFAULT: Any = object() #: :meta private:

@@ -266,7 +260,7 @@

#: The textual contents of tags with these names should be
#: instantiated with some class other than `bs4.element.NavigableString`.
-DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {}
+DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {} # type:ignore

#: By default, tags are treated as empty-element tags if they have
#: no contents--that is, using XML rules. HTMLTreeBuilder

@@ -605,7 +599,7 @@ class HTMLTreeBuilder(TreeBuilder):
#:
#: TODO: Arguably <noscript> could go here but it seems
#: qualitatively different from the other tags.
-DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {
+DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = { # type:ignore
"rt": RubyTextString,
"rp": RubyParenthesisString,
"style": Stylesheet,

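DEFAULT_STRING_CONTAINERS maps tag names to the NavigableString subclass used for their text, which is why the contents of a <style> tag come back as Stylesheet objects. A quick check of that documented behaviour, independent of the type-annotation change above:

from bs4 import BeautifulSoup
from bs4.element import Script, Stylesheet

soup = BeautifulSoup(
    "<style>p { color: red }</style><script>var x = 1;</script>",
    "html.parser",
)

# Text inside <style> and <script> is wrapped in dedicated string classes.
assert isinstance(soup.style.string, Stylesheet)
assert isinstance(soup.script.string, Script)
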
@@ -136,7 +136,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# HTMLBinaryInputStream.__init__.
extra_kwargs["override_encoding"] = self.user_specified_encoding

-doc = parser.parse(markup, **extra_kwargs)
+doc = parser.parse(markup, **extra_kwargs) # type:ignore

# Set the character encoding detected by the tokenizer.
if isinstance(markup, str):

@@ -144,7 +144,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
-original_encoding = parser.tokenizer.stream.charEncoding[0]
+original_encoding = parser.tokenizer.stream.charEncoding[0] # type:ignore
# The encoding is an html5lib Encoding object. We want to
# use a string for compatibility with other tree builders.
original_encoding = original_encoding.name

@@ -227,7 +227,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
# This represents the point immediately after the end of the
# tag. We don't know when the tag started, but we do know
# where it ended -- the character just before this one.
-sourceline, sourcepos = self.parser.tokenizer.stream.position()
+sourceline, sourcepos = self.parser.tokenizer.stream.position() # type:ignore
assert sourcepos is not None
sourcepos = sourcepos - 1
tag = self.soup.new_tag(

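The position() call above is how the html5lib builder recovers a source position for an end tag; the resulting sourceline/sourcepos attributes are the same ones other builders fill in. A small check using the built-in html.parser, chosen here only because it needs no extra install:

from bs4 import BeautifulSoup

markup = "<html>\n<body>\n<p>hi</p>\n</body>\n</html>"
soup = BeautifulSoup(markup, "html.parser")

# Line numbers are 1-based, positions are 0-based offsets within the line.
print(soup.p.sourceline, soup.p.sourcepos)  # 3 0
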
@@ -266,7 +266,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def getDocument(self) -> "BeautifulSoup":
return self.soup

-def testSerializer(self, element: "Element") -> str:
+def testSerializer(self, element: "Element") -> None:
"""This is only used by the html5lib unit tests. Since we
don't currently hook into those tests, the implementation is
left blank.

@@ -337,7 +337,7 @@ class BeautifulSoupNode(treebuilder_base.Node):

# TODO-TYPING: typeshed stubs are incorrect about this;
# cloneNode returns a new Node, not None.
-def cloneNode(self) -> treebuilder_base.Node:
+def cloneNode(self) -> treebuilder_base.Node: # type:ignore
raise NotImplementedError()

@@ -564,12 +564,12 @@ class Element(BeautifulSoupNode):

# TODO-TYPING: typeshed stubs are incorrect about this;
# hasContent returns a boolean, not None.
-def hasContent(self) -> bool:
+def hasContent(self) -> bool: # type:ignore
return len(self.element.contents) > 0

# TODO-TYPING: typeshed stubs are incorrect about this;
# cloneNode returns a new Node, not None.
-def cloneNode(self) -> treebuilder_base.Node:
+def cloneNode(self) -> treebuilder_base.Node: # type:ignore
tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
for key, value in self.attributes:

@@ -22,11 +22,13 @@ from typing import (
TYPE_CHECKING,
Union,
)
-from typing_extensions import TypeAlias

from io import BytesIO
from io import StringIO
-from lxml import etree
+
+from typing_extensions import TypeAlias
+
+from lxml import etree # type:ignore
from ..element import (
AttributeDict,
XMLAttributeDict,

@@ -180,6 +182,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
+if self.is_xml:
+self.processing_instruction_class = XMLProcessingInstruction
+else:
+self.processing_instruction_class = ProcessingInstruction
+
if "attribute_dict_class" not in kwargs:
kwargs["attribute_dict_class"] = XMLAttributeDict
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

@@ -226,14 +233,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
document to Unicode and parsing it. Each strategy will be tried
in turn.
"""
-is_html = not self.is_xml
-if is_html:
-self.processing_instruction_class = ProcessingInstruction
+if not self.is_xml:
# We're in HTML mode, so if we're given XML, that's worth
# noting.
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)
-else:
-self.processing_instruction_class = XMLProcessingInstruction

if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on

@@ -274,7 +277,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
markup,
known_definite_encodings=known_definite_encodings,
user_encodings=user_encodings,
-is_html=is_html,
+is_html=not self.is_xml,
exclude_encodings=exclude_encodings,
)
for encoding in detector.encodings:

@@ -47,7 +47,7 @@ import warnings
chardet_module: Optional[ModuleType] = None
try:
# PyPI package: cchardet
-import cchardet
+import cchardet # type:ignore

chardet_module = cchardet
except ImportError:

@@ -60,7 +60,7 @@ except ImportError:
except ImportError:
try:
# PyPI package: charset-normalizer
-import charset_normalizer
+import charset_normalizer # type:ignore

chardet_module = charset_normalizer
except ImportError:

@@ -797,9 +797,9 @@ class UnicodeDammit:
)

# Short-circuit if the data is in Unicode to begin with.
-if isinstance(markup, str) or markup == b"":
-self.markup = markup
-self.unicode_markup = str(markup)
+if isinstance(markup, str):
+self.markup = markup.encode("utf8")
+self.unicode_markup = markup
self.original_encoding = None
return

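With the change above, string input to UnicodeDammit is stored on .markup as UTF-8 bytes, while .unicode_markup keeps the original string and .original_encoding stays None. A minimal sketch of both paths (the exact .markup bytes follow the new branch shown in this hunk):

from bs4.dammit import UnicodeDammit

# Already-decoded input: nothing to detect.
dammit = UnicodeDammit("<p>Caf\u00e9</p>")
print(dammit.unicode_markup)     # <p>Café</p>
print(dammit.original_encoding)  # None
print(dammit.markup)             # b'<p>Caf\xc3\xa9</p>' under the new code path

# Byte input: encoding detection runs as before.
dammit = UnicodeDammit(b"<p>Caf\xc3\xa9</p>")
print(dammit.unicode_markup)     # <p>Café</p>
print(dammit.original_encoding)  # e.g. utf-8
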
@@ -52,7 +52,7 @@ def diagnose(data: "_IncomingMarkup") -> None:
if "lxml" in basic_parsers:
basic_parsers.append("lxml-xml")
try:
-from lxml import etree
+from lxml import etree # type:ignore

print(("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION))))
except ImportError:

@@ -37,6 +37,7 @@ from typing import (
TypeVar,
Union,
cast,
+overload,
)
from typing_extensions import (
Self,

@@ -223,7 +224,7 @@ class AttributeValueList(List[str]):
"""


-class AttributeDict(dict):
+class AttributeDict(Dict[Any,Any]):
"""Superclass for the dictionary used to hold a tag's
attributes. You can use this, but it's just a regular dict with no
special logic.

@@ -235,7 +236,7 @@ class XMLAttributeDict(AttributeDict):
incoming values for consistency with the HTML spec.
"""

-def __setitem__(self, key: str, value: Any):
+def __setitem__(self, key: str, value: Any) -> None:
"""Set an attribute value, possibly modifying it to comply with
the XML spec.

@@ -273,7 +274,7 @@ class HTMLAttributeDict(AttributeDict):
around boolean attributes that XML doesn't have.
"""

-def __setitem__(self, key: str, value: Any):
+def __setitem__(self, key: str, value: Any) -> None:
"""Set an attribute value, possibly modifying it to comply
with the HTML spec,
"""

@@ -389,7 +390,7 @@ class PageElement(object):
:param previous_element: The element parsed immediately before
this one.

-:param next_element: The element parsed immediately before
+:param next_element: The element parsed immediately after
this one.

:param previous_sibling: The most recently encountered element

@@ -1231,7 +1232,7 @@ class PageElement(object):
"""
return self._self_and(self.parents)

-def _self_and(self, other_generator):
+def _self_and(self, other_generator:Iterator[PageElement]) -> Iterator[PageElement]:
"""Modify a generator by yielding this element, then everything
yielded by the other generator.
"""

@@ -1317,6 +1318,14 @@ class NavigableString(str, PageElement):
def __getnewargs__(self) -> Tuple[str]:
return (str(self),)

+# TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
+# is introduced in 3.8. This can be changed once 3.7 support is dropped.
+def __getitem__(self, key: Union[int|slice]) -> str: # type:ignore
+"""Raise an exception """
+if isinstance(key, str):
+raise TypeError("string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(key.__class__.__name__))
+return super(NavigableString, self).__getitem__(key)
+
@property
def string(self) -> str:
"""Convenience property defined to match `Tag.string`.

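The new __getitem__ keeps ordinary str indexing working, but a string key (almost always a sign the caller meant to index a Tag) now raises a clearer TypeError:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<b id='x'>text</b>", "html.parser")
s = soup.b.string               # a NavigableString

print(s[0], s[1:3])             # t ex  -- integer and slice indexing still work

try:
    s["id"]                     # the caller probably meant soup.b["id"]
except TypeError as e:
    print(e)                    # string indices must be integers, not 'str'.
                                # Are you treating a NavigableString like a Tag?
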
@@ -2188,7 +2197,8 @@ class Tag(PageElement):
elif isinstance(value, list):
list_value = value
else:
-value = cast(str, value)
+if not isinstance(value, str):
+value = cast(str, value)
list_value = self.attribute_value_list_class([value])
return list_value

@@ -2597,6 +2607,22 @@ class Tag(PageElement):
or self.name not in self.preserve_whitespace_tags
)

+@overload
+def prettify(
+self,
+encoding: None = None,
+formatter: _FormatterOrName = "minimal",
+) -> str:
+...
+
+@overload
+def prettify(
+self,
+encoding: _Encoding,
+formatter: _FormatterOrName = "minimal",
+) -> bytes:
+...
+
def prettify(
self,
encoding: Optional[_Encoding] = None,

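The two @overload declarations only describe prettify()'s existing runtime behaviour for type checkers: with no encoding it returns str, with an explicit encoding it returns bytes. For example:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>Hello<br/>world</p>", "html.parser")

text = soup.prettify()                  # no encoding -> str
data = soup.prettify(encoding="utf-8")  # encoding given -> bytes

assert isinstance(text, str)
assert isinstance(data, bytes)
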
@@ -78,6 +78,21 @@ class ElementFilter(object):
"""
self.match_function = match_function

+@property
+def includes_everything(self) -> bool:
+"""Does this `ElementFilter` obviously include everything? If so,
+the filter process can be made much faster.
+
+The `ElementFilter` might turn out to include everything even
+if this returns `False`, but it won't include everything in an
+obvious way.
+
+The base `ElementFilter` implementation includes things based on
+the match function, so includes_everything is only true if
+there is no match function.
+"""
+return not self.match_function
+
@property
def excludes_everything(self) -> bool:
"""Does this `ElementFilter` obviously exclude everything? If

@@ -88,19 +103,25 @@ class ElementFilter(object):
if this returns `False`, but it won't exclude everything in an
obvious way.

-The base `ElementFilter` implementation excludes *nothing*, so
-the base implementation of `excludes_everything` always
-returns `False`.
+The base `ElementFilter` implementation excludes things based
+on a match function we can't inspect, so excludes_everything
+is always false.
"""
return False

-def match(self, element: PageElement) -> bool:
+def match(self, element: PageElement, _known_rules:bool=False) -> bool:
"""Does the given PageElement match the rules set down by this
ElementFilter?

The base implementation delegates to the function passed in to
the constructor.

:param _known_rules: Defined for compatibility with
SoupStrainer._match(). Used more for consistency than because
we need the performance optimization.
"""

if not _known_rules and self.includes_everything:
return True
if not self.match_function:
return True
return self.match_function(element)

@@ -111,13 +132,18 @@ class ElementFilter(object):
Acts like Python's built-in `filter`, using
`ElementFilter.match` as the filtering function.
"""
+# If there are no rules at all, don't bother filtering. Let
+# anything through.
+if self.includes_everything:
+for i in generator:
+yield i
while True:
try:
i = next(generator)
except StopIteration:
break
if i:
-if self.match(i):
+if self.match(i, _known_rules=True):
yield cast("_OneElement", i)

def find(self, generator: Iterator[PageElement]) -> _AtMostOneElement:

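Taken together, the ElementFilter hunks add an includes_everything fast path and a _known_rules flag so filter() can skip redundant per-element checks. A minimal usage sketch of the public pieces (a custom match function, includes_everything, filter()); the no-argument constructor call assumes match_function defaults to None, as the docstring above implies:

from bs4 import BeautifulSoup
from bs4.element import Tag
from bs4.filter import ElementFilter

soup = BeautifulSoup("<div><p class='a'>one</p><p>two</p></div>", "html.parser")

# A filter driven by a custom match function.
has_class = ElementFilter(
    lambda el: isinstance(el, Tag) and el.name == "p" and el.has_attr("class")
)
print(has_class.includes_everything)                                 # False
print([t.get_text() for t in has_class.filter(soup.descendants)])    # ['one']

# With no match function the filter obviously includes everything,
# so filter() can yield elements without calling match() on each one.
everything = ElementFilter()
print(everything.includes_everything)                                # True
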
@@ -190,6 +216,7 @@ class MatchRule(object):
string: Optional[str]
pattern: Optional[_RegularExpressionProtocol]
present: Optional[bool]
+exclude_everything: Optional[bool]
# TODO-TYPING: All MatchRule objects also have an attribute
# ``function``, but the type of the function depends on the
# subclass.

@@ -200,6 +227,7 @@ class MatchRule(object):
pattern: Optional[_RegularExpressionProtocol] = None,
function: Optional[Callable] = None,
present: Optional[bool] = None,
+exclude_everything: Optional[bool] = None
):
if isinstance(string, bytes):
string = string.decode("utf8")

@@ -212,19 +240,20 @@ class MatchRule(object):
self.pattern = pattern
self.function = function
self.present = present
+self.exclude_everything = exclude_everything

values = [
x
-for x in (self.string, self.pattern, self.function, self.present)
+for x in (self.string, self.pattern, self.function, self.present, self.exclude_everything)
if x is not None
]
if len(values) == 0:
raise ValueError(
-"Either string, pattern, function or present must be provided."
+"Either string, pattern, function, present, or exclude_everything must be provided."
)
if len(values) > 1:
raise ValueError(
-"At most one of string, pattern, function and present must be provided."
+"At most one of string, pattern, function, present, and exclude_everything must be provided."
)

def _base_match(self, string: Optional[str]) -> Optional[bool]:

@@ -234,6 +263,10 @@ class MatchRule(object):
:return: True or False if we have a (positive or negative)
match; None if we need to keep trying.
"""
+# self.exclude_everything matches nothing.
+if self.exclude_everything:
+return False
+
# self.present==True matches everything except None.
if self.present is True:
return string is not None

@@ -357,9 +390,15 @@ class SoupStrainer(ElementFilter):
stacklevel=2,
)

-self.name_rules = cast(
-List[TagNameMatchRule], list(self._make_match_rules(name, TagNameMatchRule))
-)
+if name is None and not attrs and not string and not kwargs:
+# Special case for backwards compatibility. Instantiating
+# a SoupStrainer with no arguments whatsoever gets you one
+# that matches all Tags, and only Tags.
+self.name_rules = [TagNameMatchRule(present=True)]
+else:
+self.name_rules = cast(
+List[TagNameMatchRule], list(self._make_match_rules(name, TagNameMatchRule))
+)
self.attribute_rules = defaultdict(list)

if not isinstance(attrs, dict):

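The special case above restores the historical behaviour of an argument-less SoupStrainer: it matches every Tag and nothing else. For instance:

from bs4 import BeautifulSoup, SoupStrainer

soup = BeautifulSoup("<div>text<p>para</p></div>", "html.parser")

strainer = SoupStrainer()             # no arguments at all
print(strainer.match(soup.p))         # True  -- any Tag matches
print(strainer.match(soup.p.string))  # False -- NavigableStrings never match
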
@@ -395,17 +434,35 @@ class SoupStrainer(ElementFilter):
#: variable might have. Look at the .string_rules list instead.
self.__string = string

+@property
+def includes_everything(self) -> bool:
+"""Check whether the provided rules will obviously include
+everything. (They might include everything even if this returns `False`,
+but not in an obvious way.)
+"""
+return not self.name_rules and not self.string_rules and not self.attribute_rules
+
@property
def excludes_everything(self) -> bool:
"""Check whether the provided rules will obviously exclude
everything. (They might exclude everything even if this returns `False`,
but not in an obvious way.)
"""
-return (
-True
-if (self.string_rules and (self.name_rules or self.attribute_rules))
-else False
-)
+if (self.string_rules and (self.name_rules or self.attribute_rules)):
+# This is self-contradictory, so the rules exclude everything.
+return True
+
+# If there's a rule that ended up treated as an "exclude everything"
+# rule due to creating a logical inconsistency, then the rules
+# exclude everything.
+if any(x.exclude_everything for x in self.string_rules):
+return True
+if any(x.exclude_everything for x in self.name_rules):
+return True
+for ruleset in self.attribute_rules.values():
+if any(x.exclude_everything for x in ruleset):
+return True
+return False

@property
def string(self) -> Optional[_StrainableString]:

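The rewritten excludes_everything also reports rule sets that were downgraded to match-nothing rules. Two cases it should flag, following the logic above and the _make_match_rules hunk below (a sketch, not taken from the test suite):

from bs4 import SoupStrainer

# Name rules combined with string rules are self-contradictory: one
# element can never satisfy both, so nothing can ever match.
contradictory = SoupStrainer("p", string="hello")
print(contradictory.excludes_everything)      # True

# Matching an attribute against an empty iterable produces an
# exclude-everything rule.
impossible = SoupStrainer(attrs={"class": []})
print(impossible.excludes_everything)         # True

# An ordinary strainer excludes nothing obvious.
print(SoupStrainer("p").excludes_everything)  # False
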
@@ -454,18 +511,24 @@ class SoupStrainer(ElementFilter):
elif isinstance(obj, _RegularExpressionProtocol):
yield rule_class(pattern=obj)
elif hasattr(obj, "__iter__"):
+if not obj:
+# The attribute is being matched against the null set,
+# which means it should exclude everything.
+yield rule_class(exclude_everything=True)
for o in obj:
if not isinstance(o, (bytes, str)) and hasattr(o, "__iter__"):
# This is almost certainly the user's
# mistake. This list contains another list, which
# opens up the possibility of infinite
# self-reference. In the interests of avoiding
-# infinite recursion, we'll ignore this item
-# rather than looking inside.
+# infinite recursion, we'll treat this as an
+# impossible match and issue a rule that excludes
+# everything, rather than looking inside.
warnings.warn(
f"Ignoring nested list {o} to avoid the possibility of infinite recursion.",
stacklevel=5,
)
+yield rule_class(exclude_everything=True)
continue
for x in cls._make_match_rules(o, rule_class):
yield x

@@ -487,6 +550,10 @@ class SoupStrainer(ElementFilter):
but a `SoupStrainer` that *only* contains `StringMatchRule`
cannot match a `Tag`, only a `NavigableString`.
"""
+# If there are no rules at all, let anything through.
+#if self.includes_everything:
+# return True
+
# String rules cannot not match a Tag on their own.
if not self.name_rules and not self.attribute_rules:
return False

@@ -515,8 +582,12 @@ class SoupStrainer(ElementFilter):
# [f"{k}={v}" for k, v in sorted(tag.attrs.items())]
# )
# print(f"Testing <{tag.name} {attrs}>{tag.string}</{tag.name}> against {rule}")
+
+# If the rule contains a function, the function will be called
+# with `tag`. It will not be called a second time with
+# `prefixed_name`.
if rule.matches_tag(tag) or (
-prefixed_name is not None and rule.matches_string(prefixed_name)
+not rule.function and prefixed_name is not None and rule.matches_string(prefixed_name)
):
name_matches = True
break

@@ -647,24 +718,30 @@ class SoupStrainer(ElementFilter):
return True
return False

-def match(self, element: PageElement) -> bool:
+def match(self, element: PageElement, _known_rules: bool=False) -> bool:
"""Does the given `PageElement` match the rules set down by this
`SoupStrainer`?

The find_* methods rely heavily on this method to find matches.

:param element: A `PageElement`.
+:param _known_rules: Set to true in the common case where
+we already checked and found at least one rule in this SoupStrainer
+that might exclude a PageElement. Without this, we need
+to check .includes_everything every time, just to be safe.
:return: `True` if the element matches this `SoupStrainer`'s rules; `False` otherwise.
"""
+# If there are no rules at all, let anything through.
+if not _known_rules and self.includes_everything:
+return True
if isinstance(element, Tag):
return self.matches_tag(element)
assert isinstance(element, NavigableString)
if not (self.name_rules or self.attribute_rules):
# A NavigableString can only match a SoupStrainer that
-# does not define any name or attribute restrictions.
-for rule in self.string_rules:
-if rule.matches_string(element):
-return True
+# does not define any name or attribute rules.
+# Then it comes down to the string rules.
+return self.matches_any_string_rule(element)
return False

@_deprecated("allow_tag_creation", "4.13.0")

@@ -83,7 +83,7 @@ class Formatter(EntitySubstitution):
void_element_close_prefix: str = "/",
cdata_containing_tags: Optional[Set[str]] = None,
empty_attributes_are_booleans: bool = False,
-indent: int = 1,
+indent: Union[int,str] = 1,
):
r"""Constructor.

@@ -168,7 +168,7 @@ class Formatter(EntitySubstitution):
return self.substitute(value)

def attributes(
-self, tag: bs4.element.Tag
+self, tag: bs4.element.Tag # type:ignore
) -> Iterable[Tuple[str, Optional[_AttributeValue]]]:
"""Reorder a tag's attributes however you want.

@@ -201,7 +201,7 @@ class HTMLFormatter(Formatter):
void_element_close_prefix: str = "/",
cdata_containing_tags: Optional[Set[str]] = None,
empty_attributes_are_booleans: bool = False,
-indent: int = 1,
+indent: Union[int,str] = 1,
):
super(HTMLFormatter, self).__init__(
self.HTML,

@@ -209,6 +209,7 @@ class HTMLFormatter(Formatter):
void_element_close_prefix,
cdata_containing_tags,
empty_attributes_are_booleans,
+indent=indent
)

@@ -223,7 +224,7 @@ class XMLFormatter(Formatter):
void_element_close_prefix: str = "/",
cdata_containing_tags: Optional[Set[str]] = None,
empty_attributes_are_booleans: bool = False,
-indent: int = 1,
+indent: Union[int,str] = 1,
):
super(XMLFormatter, self).__init__(
self.XML,

@@ -231,6 +232,7 @@ class XMLFormatter(Formatter):
void_element_close_prefix,
cdata_containing_tags,
empty_attributes_are_booleans,
+indent=indent,
)

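The widened indent annotation reflects that Formatter already accepts either an int (number of spaces per level) or a literal indentation string here. For example:

from bs4 import BeautifulSoup
from bs4.formatter import HTMLFormatter

soup = BeautifulSoup("<div><p>Hi</p></div>", "html.parser")

print(soup.prettify(formatter=HTMLFormatter(indent=4)))     # four spaces per level
print(soup.prettify(formatter=HTMLFormatter(indent="\t")))  # one tab per level
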