mirror of
https://github.com/SickGear/SickGear.git
synced 2024-11-22 12:55:05 +00:00
Update Beautiful Soup 4.11.1 (r642) → 4.12.2 and soupsieve 2.3.2.post1 (792d566) → 2.4.1 (2e66beb).
This commit is contained in:
parent
18370cebab
commit
997e6955b2
14 changed files with 794 additions and 475 deletions
|
@ -1,4 +1,10 @@
|
||||||
### 3.29.2 (2023-05-28 07:45:00 UTC)
|
### 3.30.0 (2023-0x-xx xx:xx:00 UTC)
|
||||||
|
|
||||||
|
* Update Beautiful Soup 4.11.1 (r642) to 4.12.2
|
||||||
|
* Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb)
|
||||||
|
|
||||||
|
|
||||||
|
### 3.29.2 (2023-05-28 07:45:00 UTC)
|
||||||
|
|
||||||
* Fix find show results returned as newest/oldest that are then sorted z to a
|
* Fix find show results returned as newest/oldest that are then sorted z to a
|
||||||
* Fix add show "TheTVDB via Trakt"
|
* Fix add show "TheTVDB via Trakt"
|
||||||
|
|
|
@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||||
provides methods and Pythonic idioms that make it easy to navigate,
|
provides methods and Pythonic idioms that make it easy to navigate,
|
||||||
search, and modify the parse tree.
|
search, and modify the parse tree.
|
||||||
|
|
||||||
Beautiful Soup works with Python 3.5 and up. It works better if lxml
|
Beautiful Soup works with Python 3.6 and up. It works better if lxml
|
||||||
and/or html5lib is installed.
|
and/or html5lib is installed.
|
||||||
|
|
||||||
For more than you ever wanted to know about Beautiful Soup, see the
|
For more than you ever wanted to know about Beautiful Soup, see the
|
||||||
|
@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
__version__ = "4.11.1"
|
__version__ = "4.12.2"
|
||||||
__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
|
__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
|
||||||
# Use of this source code is governed by the MIT license.
|
# Use of this source code is governed by the MIT license.
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
|
@ -38,11 +38,13 @@ from .builder import (
|
||||||
builder_registry,
|
builder_registry,
|
||||||
ParserRejectedMarkup,
|
ParserRejectedMarkup,
|
||||||
XMLParsedAsHTMLWarning,
|
XMLParsedAsHTMLWarning,
|
||||||
|
HTMLParserTreeBuilder
|
||||||
)
|
)
|
||||||
from .dammit import UnicodeDammit
|
from .dammit import UnicodeDammit
|
||||||
from .element import (
|
from .element import (
|
||||||
CData,
|
CData,
|
||||||
Comment,
|
Comment,
|
||||||
|
CSS,
|
||||||
DEFAULT_OUTPUT_ENCODING,
|
DEFAULT_OUTPUT_ENCODING,
|
||||||
Declaration,
|
Declaration,
|
||||||
Doctype,
|
Doctype,
|
||||||
|
@ -211,7 +213,7 @@ class BeautifulSoup(Tag):
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'The "%s" argument to the BeautifulSoup constructor '
|
'The "%s" argument to the BeautifulSoup constructor '
|
||||||
'has been renamed to "%s."' % (old_name, new_name),
|
'has been renamed to "%s."' % (old_name, new_name),
|
||||||
DeprecationWarning
|
DeprecationWarning, stacklevel=3
|
||||||
)
|
)
|
||||||
return kwargs.pop(old_name)
|
return kwargs.pop(old_name)
|
||||||
return None
|
return None
|
||||||
|
@ -348,26 +350,50 @@ class BeautifulSoup(Tag):
|
||||||
self.markup = None
|
self.markup = None
|
||||||
self.builder.soup = None
|
self.builder.soup = None
|
||||||
|
|
||||||
def __copy__(self):
|
def _clone(self):
|
||||||
"""Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
|
"""Create a new BeautifulSoup object with the same TreeBuilder,
|
||||||
copy = type(self)(
|
but not associated with any markup.
|
||||||
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Although we encoded the tree to UTF-8, that may not have
|
This is the first step of the deepcopy process.
|
||||||
# been the encoding of the original markup. Set the copy's
|
"""
|
||||||
# .original_encoding to reflect the original object's
|
clone = type(self)("", None, self.builder)
|
||||||
# .original_encoding.
|
|
||||||
copy.original_encoding = self.original_encoding
|
# Keep track of the encoding of the original document,
|
||||||
return copy
|
# since we won't be parsing it again.
|
||||||
|
clone.original_encoding = self.original_encoding
|
||||||
|
return clone
|
||||||
|
|
||||||
def __getstate__(self):
|
def __getstate__(self):
|
||||||
# Frequently a tree builder can't be pickled.
|
# Frequently a tree builder can't be pickled.
|
||||||
d = dict(self.__dict__)
|
d = dict(self.__dict__)
|
||||||
if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
|
if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
|
||||||
d['builder'] = None
|
d['builder'] = type(self.builder)
|
||||||
|
# Store the contents as a Unicode string.
|
||||||
|
d['contents'] = []
|
||||||
|
d['markup'] = self.decode()
|
||||||
|
|
||||||
|
# If _most_recent_element is present, it's a Tag object left
|
||||||
|
# over from initial parse. It might not be picklable and we
|
||||||
|
# don't need it.
|
||||||
|
if '_most_recent_element' in d:
|
||||||
|
del d['_most_recent_element']
|
||||||
return d
|
return d
|
||||||
|
|
||||||
|
def __setstate__(self, state):
|
||||||
|
# If necessary, restore the TreeBuilder by looking it up.
|
||||||
|
self.__dict__ = state
|
||||||
|
if isinstance(self.builder, type):
|
||||||
|
self.builder = self.builder()
|
||||||
|
elif not self.builder:
|
||||||
|
# We don't know which builder was used to build this
|
||||||
|
# parse tree, so use a default we know is always available.
|
||||||
|
self.builder = HTMLParserTreeBuilder()
|
||||||
|
self.builder.soup = self
|
||||||
|
self.reset()
|
||||||
|
self._feed()
|
||||||
|
return state
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _decode_markup(cls, markup):
|
def _decode_markup(cls, markup):
|
||||||
"""Ensure `markup` is bytes so it's safe to send into warnings.warn.
|
"""Ensure `markup` is bytes so it's safe to send into warnings.warn.
|
||||||
|
@ -405,7 +431,8 @@ class BeautifulSoup(Tag):
|
||||||
'The input looks more like a URL than markup. You may want to use'
|
'The input looks more like a URL than markup. You may want to use'
|
||||||
' an HTTP client like requests to get the document behind'
|
' an HTTP client like requests to get the document behind'
|
||||||
' the URL, and feed that document to Beautiful Soup.',
|
' the URL, and feed that document to Beautiful Soup.',
|
||||||
MarkupResemblesLocatorWarning
|
MarkupResemblesLocatorWarning,
|
||||||
|
stacklevel=3
|
||||||
)
|
)
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
@ -436,7 +463,7 @@ class BeautifulSoup(Tag):
|
||||||
'The input looks more like a filename than markup. You may'
|
'The input looks more like a filename than markup. You may'
|
||||||
' want to open this file and pass the filehandle into'
|
' want to open this file and pass the filehandle into'
|
||||||
' Beautiful Soup.',
|
' Beautiful Soup.',
|
||||||
MarkupResemblesLocatorWarning
|
MarkupResemblesLocatorWarning, stacklevel=3
|
||||||
)
|
)
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
@ -467,6 +494,7 @@ class BeautifulSoup(Tag):
|
||||||
self.open_tag_counter = Counter()
|
self.open_tag_counter = Counter()
|
||||||
self.preserve_whitespace_tag_stack = []
|
self.preserve_whitespace_tag_stack = []
|
||||||
self.string_container_stack = []
|
self.string_container_stack = []
|
||||||
|
self._most_recent_element = None
|
||||||
self.pushTag(self)
|
self.pushTag(self)
|
||||||
|
|
||||||
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
|
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
|
||||||
|
@ -748,7 +776,7 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
def decode(self, pretty_print=False,
|
def decode(self, pretty_print=False,
|
||||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||||
formatter="minimal"):
|
formatter="minimal", iterator=None):
|
||||||
"""Returns a string or Unicode representation of the parse tree
|
"""Returns a string or Unicode representation of the parse tree
|
||||||
as an HTML or XML document.
|
as an HTML or XML document.
|
||||||
|
|
||||||
|
@ -775,7 +803,7 @@ class BeautifulSoup(Tag):
|
||||||
else:
|
else:
|
||||||
indent_level = 0
|
indent_level = 0
|
||||||
return prefix + super(BeautifulSoup, self).decode(
|
return prefix + super(BeautifulSoup, self).decode(
|
||||||
indent_level, eventual_encoding, formatter)
|
indent_level, eventual_encoding, formatter, iterator)
|
||||||
|
|
||||||
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
|
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
|
||||||
_s = BeautifulSoup
|
_s = BeautifulSoup
|
||||||
|
@ -789,7 +817,7 @@ class BeautifulStoneSoup(BeautifulSoup):
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'The BeautifulStoneSoup class is deprecated. Instead of using '
|
'The BeautifulStoneSoup class is deprecated. Instead of using '
|
||||||
'it, pass features="xml" into the BeautifulSoup constructor.',
|
'it, pass features="xml" into the BeautifulSoup constructor.',
|
||||||
DeprecationWarning
|
DeprecationWarning, stacklevel=2
|
||||||
)
|
)
|
||||||
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
|
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
|
|
@ -70,7 +70,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
# ATM because the html5lib TreeBuilder doesn't use
|
# ATM because the html5lib TreeBuilder doesn't use
|
||||||
# UnicodeDammit.
|
# UnicodeDammit.
|
||||||
if exclude_encodings:
|
if exclude_encodings:
|
||||||
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
|
warnings.warn(
|
||||||
|
"You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
|
||||||
|
stacklevel=3
|
||||||
|
)
|
||||||
|
|
||||||
# html5lib only parses HTML, so if it's given XML that's worth
|
# html5lib only parses HTML, so if it's given XML that's worth
|
||||||
# noting.
|
# noting.
|
||||||
|
@ -81,7 +84,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
# These methods are defined by Beautiful Soup.
|
# These methods are defined by Beautiful Soup.
|
||||||
def feed(self, markup):
|
def feed(self, markup):
|
||||||
if self.soup.parse_only is not None:
|
if self.soup.parse_only is not None:
|
||||||
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
|
warnings.warn(
|
||||||
|
"You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
|
||||||
|
stacklevel=4
|
||||||
|
)
|
||||||
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
||||||
self.underlying_builder.parser = parser
|
self.underlying_builder.parser = parser
|
||||||
extra_kwargs = dict()
|
extra_kwargs = dict()
|
||||||
|
|
|
@ -10,30 +10,9 @@ __all__ = [
|
||||||
|
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
try:
|
|
||||||
from html.parser import HTMLParseError
|
|
||||||
except ImportError as e:
|
|
||||||
# HTMLParseError is removed in Python 3.5. Since it can never be
|
|
||||||
# thrown in 3.5, we can just define our own class as a placeholder.
|
|
||||||
class HTMLParseError(Exception):
|
|
||||||
pass
|
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
|
|
||||||
# argument, which we'd like to set to False. Unfortunately,
|
|
||||||
# http://bugs.python.org/issue13273 makes strict=True a better bet
|
|
||||||
# before Python 3.2.3.
|
|
||||||
#
|
|
||||||
# At the end of this file, we monkeypatch HTMLParser so that
|
|
||||||
# strict=True works well on Python 3.2.2.
|
|
||||||
major, minor, release = sys.version_info[:3]
|
|
||||||
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
|
|
||||||
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
|
|
||||||
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
|
|
||||||
|
|
||||||
|
|
||||||
from ..element import (
|
from ..element import (
|
||||||
CData,
|
CData,
|
||||||
Comment,
|
Comment,
|
||||||
|
@ -45,6 +24,7 @@ from ..dammit import EntitySubstitution, UnicodeDammit
|
||||||
|
|
||||||
from ..builder import (
|
from ..builder import (
|
||||||
DetectsXMLParsedAsHTML,
|
DetectsXMLParsedAsHTML,
|
||||||
|
ParserRejectedMarkup,
|
||||||
HTML,
|
HTML,
|
||||||
HTMLTreeBuilder,
|
HTMLTreeBuilder,
|
||||||
STRICT,
|
STRICT,
|
||||||
|
@ -91,18 +71,21 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
|
||||||
|
|
||||||
self._initialize_xml_detector()
|
self._initialize_xml_detector()
|
||||||
|
|
||||||
def error(self, msg):
|
def error(self, message):
|
||||||
"""In Python 3, HTMLParser subclasses must implement error(), although
|
# NOTE: This method is required so long as Python 3.9 is
|
||||||
this requirement doesn't appear to be documented.
|
# supported. The corresponding code is removed from HTMLParser
|
||||||
|
# in 3.5, but not removed from ParserBase until 3.10.
|
||||||
In Python 2, HTMLParser implements error() by raising an exception,
|
# https://github.com/python/cpython/issues/76025
|
||||||
which we don't want to do.
|
#
|
||||||
|
# The original implementation turned the error into a warning,
|
||||||
In any event, this method is called only on very strange
|
# but in every case I discovered, this made HTMLParser
|
||||||
markup and our best strategy is to pretend it didn't happen
|
# immediately crash with an error message that was less
|
||||||
and keep going.
|
# helpful than the warning. The new implementation makes it
|
||||||
"""
|
# more clear that html.parser just can't parse this
|
||||||
warnings.warn(msg)
|
# markup. The 3.10 implementation does the same, though it
|
||||||
|
# raises AssertionError rather than calling a method. (We
|
||||||
|
# catch this error and wrap it in a ParserRejectedMarkup.)
|
||||||
|
raise ParserRejectedMarkup(message)
|
||||||
|
|
||||||
def handle_startendtag(self, name, attrs):
|
def handle_startendtag(self, name, attrs):
|
||||||
"""Handle an incoming empty-element tag.
|
"""Handle an incoming empty-element tag.
|
||||||
|
@ -203,9 +186,10 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
|
||||||
|
|
||||||
:param name: Character number, possibly in hexadecimal.
|
:param name: Character number, possibly in hexadecimal.
|
||||||
"""
|
"""
|
||||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
# TODO: This was originally a workaround for a bug in
|
||||||
# it's fixed in all supported versions.
|
# HTMLParser. (http://bugs.python.org/issue13633) The bug has
|
||||||
# http://bugs.python.org/issue13633
|
# been fixed, but removing this code still makes some
|
||||||
|
# Beautiful Soup tests fail. This needs investigation.
|
||||||
if name.startswith('x'):
|
if name.startswith('x'):
|
||||||
real_name = int(name.lstrip('x'), 16)
|
real_name = int(name.lstrip('x'), 16)
|
||||||
elif name.startswith('X'):
|
elif name.startswith('X'):
|
||||||
|
@ -333,9 +317,6 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
parser_args = parser_args or []
|
parser_args = parser_args or []
|
||||||
parser_kwargs = parser_kwargs or {}
|
parser_kwargs = parser_kwargs or {}
|
||||||
parser_kwargs.update(extra_parser_kwargs)
|
parser_kwargs.update(extra_parser_kwargs)
|
||||||
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
|
||||||
parser_kwargs['strict'] = False
|
|
||||||
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
|
||||||
parser_kwargs['convert_charrefs'] = False
|
parser_kwargs['convert_charrefs'] = False
|
||||||
self.parser_args = (parser_args, parser_kwargs)
|
self.parser_args = (parser_args, parser_kwargs)
|
||||||
|
|
||||||
|
@ -397,103 +378,10 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
parser.soup = self.soup
|
parser.soup = self.soup
|
||||||
try:
|
try:
|
||||||
parser.feed(markup)
|
parser.feed(markup)
|
||||||
|
except AssertionError as e:
|
||||||
|
# html.parser raises AssertionError in rare cases to
|
||||||
|
# indicate a fatal problem with the markup, especially
|
||||||
|
# when there's an error in the doctype declaration.
|
||||||
|
raise ParserRejectedMarkup(e)
|
||||||
parser.close()
|
parser.close()
|
||||||
except HTMLParseError as e:
|
|
||||||
warnings.warn(RuntimeWarning(
|
|
||||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
|
||||||
raise e
|
|
||||||
parser.already_closed_empty_element = []
|
parser.already_closed_empty_element = []
|
||||||
|
|
||||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
|
||||||
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
|
||||||
# string.
|
|
||||||
#
|
|
||||||
# XXX This code can be removed once most Python 3 users are on 3.2.3.
|
|
||||||
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
|
|
||||||
import re
|
|
||||||
attrfind_tolerant = re.compile(
|
|
||||||
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
|
|
||||||
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
|
|
||||||
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
|
|
||||||
|
|
||||||
locatestarttagend = re.compile(r"""
|
|
||||||
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
|
||||||
(?:\s+ # whitespace before attribute name
|
|
||||||
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
|
|
||||||
(?:\s*=\s* # value indicator
|
|
||||||
(?:'[^']*' # LITA-enclosed value
|
|
||||||
|\"[^\"]*\" # LIT-enclosed value
|
|
||||||
|[^'\">\s]+ # bare value
|
|
||||||
)
|
|
||||||
)?
|
|
||||||
)
|
|
||||||
)*
|
|
||||||
\s* # trailing whitespace
|
|
||||||
""", re.VERBOSE)
|
|
||||||
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
|
|
||||||
|
|
||||||
from html.parser import tagfind, attrfind
|
|
||||||
|
|
||||||
def parse_starttag(self, i):
|
|
||||||
self.__starttag_text = None
|
|
||||||
endpos = self.check_for_whole_start_tag(i)
|
|
||||||
if endpos < 0:
|
|
||||||
return endpos
|
|
||||||
rawdata = self.rawdata
|
|
||||||
self.__starttag_text = rawdata[i:endpos]
|
|
||||||
|
|
||||||
# Now parse the data between i+1 and j into a tag and attrs
|
|
||||||
attrs = []
|
|
||||||
match = tagfind.match(rawdata, i+1)
|
|
||||||
assert match, 'unexpected call to parse_starttag()'
|
|
||||||
k = match.end()
|
|
||||||
self.lasttag = tag = rawdata[i+1:k].lower()
|
|
||||||
while k < endpos:
|
|
||||||
if self.strict:
|
|
||||||
m = attrfind.match(rawdata, k)
|
|
||||||
else:
|
|
||||||
m = attrfind_tolerant.match(rawdata, k)
|
|
||||||
if not m:
|
|
||||||
break
|
|
||||||
attrname, rest, attrvalue = m.group(1, 2, 3)
|
|
||||||
if not rest:
|
|
||||||
attrvalue = None
|
|
||||||
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
|
|
||||||
attrvalue[:1] == '"' == attrvalue[-1:]:
|
|
||||||
attrvalue = attrvalue[1:-1]
|
|
||||||
if attrvalue:
|
|
||||||
attrvalue = self.unescape(attrvalue)
|
|
||||||
attrs.append((attrname.lower(), attrvalue))
|
|
||||||
k = m.end()
|
|
||||||
|
|
||||||
end = rawdata[k:endpos].strip()
|
|
||||||
if end not in (">", "/>"):
|
|
||||||
lineno, offset = self.getpos()
|
|
||||||
if "\n" in self.__starttag_text:
|
|
||||||
lineno = lineno + self.__starttag_text.count("\n")
|
|
||||||
offset = len(self.__starttag_text) \
|
|
||||||
- self.__starttag_text.rfind("\n")
|
|
||||||
else:
|
|
||||||
offset = offset + len(self.__starttag_text)
|
|
||||||
if self.strict:
|
|
||||||
self.error("junk characters in start tag: %r"
|
|
||||||
% (rawdata[k:endpos][:20],))
|
|
||||||
self.handle_data(rawdata[i:endpos])
|
|
||||||
return endpos
|
|
||||||
if end.endswith('/>'):
|
|
||||||
# XHTML-style empty tag: <span attr="value" />
|
|
||||||
self.handle_startendtag(tag, attrs)
|
|
||||||
else:
|
|
||||||
self.handle_starttag(tag, attrs)
|
|
||||||
if tag in self.CDATA_CONTENT_ELEMENTS:
|
|
||||||
self.set_cdata_mode(tag)
|
|
||||||
return endpos
|
|
||||||
|
|
||||||
def set_cdata_mode(self, elem):
|
|
||||||
self.cdata_elem = elem.lower()
|
|
||||||
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
|
|
||||||
|
|
||||||
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
|
|
||||||
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
|
|
||||||
|
|
||||||
CONSTRUCTOR_TAKES_STRICT = True
|
|
||||||
|
|
280
lib/bs4/css.py
Normal file
280
lib/bs4/css.py
Normal file
|
@ -0,0 +1,280 @@
|
||||||
|
"""Integration code for CSS selectors using Soup Sieve (pypi: soupsieve)."""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
try:
|
||||||
|
import soupsieve
|
||||||
|
except ImportError as e:
|
||||||
|
soupsieve = None
|
||||||
|
warnings.warn(
|
||||||
|
'The soupsieve package is not installed. CSS selectors cannot be used.'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class CSS(object):
|
||||||
|
"""A proxy object against the soupsieve library, to simplify its
|
||||||
|
CSS selector API.
|
||||||
|
|
||||||
|
Acquire this object through the .css attribute on the
|
||||||
|
BeautifulSoup object, or on the Tag you want to use as the
|
||||||
|
starting point for a CSS selector.
|
||||||
|
|
||||||
|
The main advantage of doing this is that the tag to be selected
|
||||||
|
against doesn't need to be explicitly specified in the function
|
||||||
|
calls, since it's already scoped to a tag.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, tag, api=soupsieve):
|
||||||
|
"""Constructor.
|
||||||
|
|
||||||
|
You don't need to instantiate this class yourself; instead,
|
||||||
|
access the .css attribute on the BeautifulSoup object, or on
|
||||||
|
the Tag you want to use as the starting point for your CSS
|
||||||
|
selector.
|
||||||
|
|
||||||
|
:param tag: All CSS selectors will use this as their starting
|
||||||
|
point.
|
||||||
|
|
||||||
|
:param api: A plug-in replacement for the soupsieve module,
|
||||||
|
designed mainly for use in tests.
|
||||||
|
"""
|
||||||
|
if api is None:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Cannot execute CSS selectors because the soupsieve package is not installed."
|
||||||
|
)
|
||||||
|
self.api = api
|
||||||
|
self.tag = tag
|
||||||
|
|
||||||
|
def escape(self, ident):
|
||||||
|
"""Escape a CSS identifier.
|
||||||
|
|
||||||
|
This is a simple wrapper around soupselect.escape(). See the
|
||||||
|
documentation for that function for more information.
|
||||||
|
"""
|
||||||
|
if soupsieve is None:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Cannot escape CSS identifiers because the soupsieve package is not installed."
|
||||||
|
)
|
||||||
|
return self.api.escape(ident)
|
||||||
|
|
||||||
|
def _ns(self, ns, select):
|
||||||
|
"""Normalize a dictionary of namespaces."""
|
||||||
|
if not isinstance(select, self.api.SoupSieve) and ns is None:
|
||||||
|
# If the selector is a precompiled pattern, it already has
|
||||||
|
# a namespace context compiled in, which cannot be
|
||||||
|
# replaced.
|
||||||
|
ns = self.tag._namespaces
|
||||||
|
return ns
|
||||||
|
|
||||||
|
def _rs(self, results):
|
||||||
|
"""Normalize a list of results to a Resultset.
|
||||||
|
|
||||||
|
A ResultSet is more consistent with the rest of Beautiful
|
||||||
|
Soup's API, and ResultSet.__getattr__ has a helpful error
|
||||||
|
message if you try to treat a list of results as a single
|
||||||
|
result (a common mistake).
|
||||||
|
"""
|
||||||
|
# Import here to avoid circular import
|
||||||
|
from .element import ResultSet
|
||||||
|
return ResultSet(None, results)
|
||||||
|
|
||||||
|
def compile(self, select, namespaces=None, flags=0, **kwargs):
|
||||||
|
"""Pre-compile a selector and return the compiled object.
|
||||||
|
|
||||||
|
:param selector: A CSS selector.
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will use the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.compile() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.compile() method.
|
||||||
|
|
||||||
|
:return: A precompiled selector object.
|
||||||
|
:rtype: soupsieve.SoupSieve
|
||||||
|
"""
|
||||||
|
return self.api.compile(
|
||||||
|
select, self._ns(namespaces, select), flags, **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
def select_one(self, select, namespaces=None, flags=0, **kwargs):
|
||||||
|
"""Perform a CSS selection operation on the current Tag and return the
|
||||||
|
first result.
|
||||||
|
|
||||||
|
This uses the Soup Sieve library. For more information, see
|
||||||
|
that library's documentation for the soupsieve.select_one()
|
||||||
|
method.
|
||||||
|
|
||||||
|
:param selector: A CSS selector.
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will use the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.select_one() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.select_one() method.
|
||||||
|
|
||||||
|
:return: A Tag, or None if the selector has no match.
|
||||||
|
:rtype: bs4.element.Tag
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self.api.select_one(
|
||||||
|
select, self.tag, self._ns(namespaces, select), flags, **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
def select(self, select, namespaces=None, limit=0, flags=0, **kwargs):
|
||||||
|
"""Perform a CSS selection operation on the current Tag.
|
||||||
|
|
||||||
|
This uses the Soup Sieve library. For more information, see
|
||||||
|
that library's documentation for the soupsieve.select()
|
||||||
|
method.
|
||||||
|
|
||||||
|
:param selector: A string containing a CSS selector.
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will pass in the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param limit: After finding this number of results, stop looking.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.select() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.select() method.
|
||||||
|
|
||||||
|
:return: A ResultSet of Tag objects.
|
||||||
|
:rtype: bs4.element.ResultSet
|
||||||
|
|
||||||
|
"""
|
||||||
|
if limit is None:
|
||||||
|
limit = 0
|
||||||
|
|
||||||
|
return self._rs(
|
||||||
|
self.api.select(
|
||||||
|
select, self.tag, self._ns(namespaces, select), limit, flags,
|
||||||
|
**kwargs
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
def iselect(self, select, namespaces=None, limit=0, flags=0, **kwargs):
|
||||||
|
"""Perform a CSS selection operation on the current Tag.
|
||||||
|
|
||||||
|
This uses the Soup Sieve library. For more information, see
|
||||||
|
that library's documentation for the soupsieve.iselect()
|
||||||
|
method. It is the same as select(), but it returns a generator
|
||||||
|
instead of a list.
|
||||||
|
|
||||||
|
:param selector: A string containing a CSS selector.
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will pass in the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param limit: After finding this number of results, stop looking.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.iselect() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.iselect() method.
|
||||||
|
|
||||||
|
:return: A generator
|
||||||
|
:rtype: types.GeneratorType
|
||||||
|
"""
|
||||||
|
return self.api.iselect(
|
||||||
|
select, self.tag, self._ns(namespaces, select), limit, flags, **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
def closest(self, select, namespaces=None, flags=0, **kwargs):
|
||||||
|
"""Find the Tag closest to this one that matches the given selector.
|
||||||
|
|
||||||
|
This uses the Soup Sieve library. For more information, see
|
||||||
|
that library's documentation for the soupsieve.closest()
|
||||||
|
method.
|
||||||
|
|
||||||
|
:param selector: A string containing a CSS selector.
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will pass in the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.closest() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.closest() method.
|
||||||
|
|
||||||
|
:return: A Tag, or None if there is no match.
|
||||||
|
:rtype: bs4.Tag
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self.api.closest(
|
||||||
|
select, self.tag, self._ns(namespaces, select), flags, **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
def match(self, select, namespaces=None, flags=0, **kwargs):
|
||||||
|
"""Check whether this Tag matches the given CSS selector.
|
||||||
|
|
||||||
|
This uses the Soup Sieve library. For more information, see
|
||||||
|
that library's documentation for the soupsieve.match()
|
||||||
|
method.
|
||||||
|
|
||||||
|
:param: a CSS selector.
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will pass in the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.match() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.match() method.
|
||||||
|
|
||||||
|
:return: True if this Tag matches the selector; False otherwise.
|
||||||
|
:rtype: bool
|
||||||
|
"""
|
||||||
|
return self.api.match(
|
||||||
|
select, self.tag, self._ns(namespaces, select), flags, **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
def filter(self, select, namespaces=None, flags=0, **kwargs):
|
||||||
|
"""Filter this Tag's direct children based on the given CSS selector.
|
||||||
|
|
||||||
|
This uses the Soup Sieve library. It works the same way as
|
||||||
|
passing this Tag into that library's soupsieve.filter()
|
||||||
|
method. For more information, see the
|
||||||
|
documentation for soupsieve.filter().
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will pass in the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.filter() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.filter() method.
|
||||||
|
|
||||||
|
:return: A ResultSet of Tag objects.
|
||||||
|
:rtype: bs4.element.ResultSet
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self._rs(
|
||||||
|
self.api.filter(
|
||||||
|
select, self.tag, self._ns(namespaces, select), flags, **kwargs
|
||||||
|
)
|
||||||
|
)
|
|
@ -59,21 +59,6 @@ def diagnose(data):
|
||||||
|
|
||||||
if hasattr(data, 'read'):
|
if hasattr(data, 'read'):
|
||||||
data = data.read()
|
data = data.read()
|
||||||
elif data.startswith("http:") or data.startswith("https:"):
|
|
||||||
print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
|
|
||||||
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
|
|
||||||
return
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
if os.path.exists(data):
|
|
||||||
print(('"%s" looks like a filename. Reading data from the file.' % data))
|
|
||||||
with open(data) as fp:
|
|
||||||
data = fp.read()
|
|
||||||
except ValueError:
|
|
||||||
# This can happen on some platforms when the 'filename' is
|
|
||||||
# too long. Assume it's data and not a filename.
|
|
||||||
pass
|
|
||||||
print("")
|
|
||||||
|
|
||||||
for parser in basic_parsers:
|
for parser in basic_parsers:
|
||||||
print(("Trying to parse your markup with %s" % parser))
|
print(("Trying to parse your markup with %s" % parser))
|
||||||
|
|
|
@ -8,14 +8,8 @@ except ImportError as e:
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
try:
|
|
||||||
import soupsieve
|
|
||||||
except ImportError as e:
|
|
||||||
soupsieve = None
|
|
||||||
warnings.warn(
|
|
||||||
'The soupsieve package is not installed. CSS selectors cannot be used.'
|
|
||||||
)
|
|
||||||
|
|
||||||
|
from .css import CSS
|
||||||
from .formatter import (
|
from .formatter import (
|
||||||
Formatter,
|
Formatter,
|
||||||
HTMLFormatter,
|
HTMLFormatter,
|
||||||
|
@ -154,6 +148,11 @@ class PageElement(object):
|
||||||
NavigableString, Tag, etc. are all subclasses of PageElement.
|
NavigableString, Tag, etc. are all subclasses of PageElement.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# In general, we can't tell just by looking at an element whether
|
||||||
|
# it's contained in an XML document or an HTML document. But for
|
||||||
|
# Tags (q.v.) we can store this information at parse time.
|
||||||
|
known_xml = None
|
||||||
|
|
||||||
def setup(self, parent=None, previous_element=None, next_element=None,
|
def setup(self, parent=None, previous_element=None, next_element=None,
|
||||||
previous_sibling=None, next_sibling=None):
|
previous_sibling=None, next_sibling=None):
|
||||||
"""Sets up the initial relations between this element and
|
"""Sets up the initial relations between this element and
|
||||||
|
@ -496,13 +495,16 @@ class PageElement(object):
|
||||||
def extend(self, tags):
|
def extend(self, tags):
|
||||||
"""Appends the given PageElements to this one's contents.
|
"""Appends the given PageElements to this one's contents.
|
||||||
|
|
||||||
:param tags: A list of PageElements.
|
:param tags: A list of PageElements. If a single Tag is
|
||||||
|
provided instead, this PageElement's contents will be extended
|
||||||
|
with that Tag's contents.
|
||||||
"""
|
"""
|
||||||
if isinstance(tags, Tag):
|
if isinstance(tags, Tag):
|
||||||
# Calling self.append() on another tag's contents will change
|
tags = tags.contents
|
||||||
# the list we're iterating over. Make a list that won't
|
if isinstance(tags, list):
|
||||||
# change.
|
# Moving items around the tree may change their position in
|
||||||
tags = list(tags.contents)
|
# the original list. Make a list that won't change.
|
||||||
|
tags = list(tags)
|
||||||
for tag in tags:
|
for tag in tags:
|
||||||
self.append(tag)
|
self.append(tag)
|
||||||
|
|
||||||
|
@ -586,8 +588,9 @@ class PageElement(object):
|
||||||
:kwargs: A dictionary of filters on attribute values.
|
:kwargs: A dictionary of filters on attribute values.
|
||||||
:return: A ResultSet containing PageElements.
|
:return: A ResultSet containing PageElements.
|
||||||
"""
|
"""
|
||||||
|
_stacklevel = kwargs.pop('_stacklevel', 2)
|
||||||
return self._find_all(name, attrs, string, limit, self.next_elements,
|
return self._find_all(name, attrs, string, limit, self.next_elements,
|
||||||
**kwargs)
|
_stacklevel=_stacklevel+1, **kwargs)
|
||||||
findAllNext = find_all_next # BS3
|
findAllNext = find_all_next # BS3
|
||||||
|
|
||||||
def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
|
def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
|
||||||
|
@ -624,8 +627,11 @@ class PageElement(object):
|
||||||
:return: A ResultSet of PageElements.
|
:return: A ResultSet of PageElements.
|
||||||
:rtype: bs4.element.ResultSet
|
:rtype: bs4.element.ResultSet
|
||||||
"""
|
"""
|
||||||
return self._find_all(name, attrs, string, limit,
|
_stacklevel = kwargs.pop('_stacklevel', 2)
|
||||||
self.next_siblings, **kwargs)
|
return self._find_all(
|
||||||
|
name, attrs, string, limit,
|
||||||
|
self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
|
||||||
|
)
|
||||||
findNextSiblings = find_next_siblings # BS3
|
findNextSiblings = find_next_siblings # BS3
|
||||||
fetchNextSiblings = find_next_siblings # BS2
|
fetchNextSiblings = find_next_siblings # BS2
|
||||||
|
|
||||||
|
@ -663,8 +669,11 @@ class PageElement(object):
|
||||||
:return: A ResultSet of PageElements.
|
:return: A ResultSet of PageElements.
|
||||||
:rtype: bs4.element.ResultSet
|
:rtype: bs4.element.ResultSet
|
||||||
"""
|
"""
|
||||||
return self._find_all(name, attrs, string, limit, self.previous_elements,
|
_stacklevel = kwargs.pop('_stacklevel', 2)
|
||||||
**kwargs)
|
return self._find_all(
|
||||||
|
name, attrs, string, limit, self.previous_elements,
|
||||||
|
_stacklevel=_stacklevel+1, **kwargs
|
||||||
|
)
|
||||||
findAllPrevious = find_all_previous # BS3
|
findAllPrevious = find_all_previous # BS3
|
||||||
fetchPrevious = find_all_previous # BS2
|
fetchPrevious = find_all_previous # BS2
|
||||||
|
|
||||||
|
@ -702,8 +711,11 @@ class PageElement(object):
|
||||||
:return: A ResultSet of PageElements.
|
:return: A ResultSet of PageElements.
|
||||||
:rtype: bs4.element.ResultSet
|
:rtype: bs4.element.ResultSet
|
||||||
"""
|
"""
|
||||||
return self._find_all(name, attrs, string, limit,
|
_stacklevel = kwargs.pop('_stacklevel', 2)
|
||||||
self.previous_siblings, **kwargs)
|
return self._find_all(
|
||||||
|
name, attrs, string, limit,
|
||||||
|
self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
|
||||||
|
)
|
||||||
findPreviousSiblings = find_previous_siblings # BS3
|
findPreviousSiblings = find_previous_siblings # BS3
|
||||||
fetchPreviousSiblings = find_previous_siblings # BS2
|
fetchPreviousSiblings = find_previous_siblings # BS2
|
||||||
|
|
||||||
|
@ -724,7 +736,7 @@ class PageElement(object):
|
||||||
# NOTE: We can't use _find_one because findParents takes a different
|
# NOTE: We can't use _find_one because findParents takes a different
|
||||||
# set of arguments.
|
# set of arguments.
|
||||||
r = None
|
r = None
|
||||||
l = self.find_parents(name, attrs, 1, **kwargs)
|
l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
|
||||||
if l:
|
if l:
|
||||||
r = l[0]
|
r = l[0]
|
||||||
return r
|
return r
|
||||||
|
@ -744,8 +756,9 @@ class PageElement(object):
|
||||||
:return: A PageElement.
|
:return: A PageElement.
|
||||||
:rtype: bs4.element.Tag | bs4.element.NavigableString
|
:rtype: bs4.element.Tag | bs4.element.NavigableString
|
||||||
"""
|
"""
|
||||||
|
_stacklevel = kwargs.pop('_stacklevel', 2)
|
||||||
return self._find_all(name, attrs, None, limit, self.parents,
|
return self._find_all(name, attrs, None, limit, self.parents,
|
||||||
**kwargs)
|
_stacklevel=_stacklevel+1, **kwargs)
|
||||||
findParents = find_parents # BS3
|
findParents = find_parents # BS3
|
||||||
fetchParents = find_parents # BS2
|
fetchParents = find_parents # BS2
|
||||||
|
|
||||||
|
@ -771,19 +784,20 @@ class PageElement(object):
|
||||||
|
|
||||||
def _find_one(self, method, name, attrs, string, **kwargs):
|
def _find_one(self, method, name, attrs, string, **kwargs):
|
||||||
r = None
|
r = None
|
||||||
l = method(name, attrs, string, 1, **kwargs)
|
l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
|
||||||
if l:
|
if l:
|
||||||
r = l[0]
|
r = l[0]
|
||||||
return r
|
return r
|
||||||
|
|
||||||
def _find_all(self, name, attrs, string, limit, generator, **kwargs):
|
def _find_all(self, name, attrs, string, limit, generator, **kwargs):
|
||||||
"Iterates over a generator looking for things that match."
|
"Iterates over a generator looking for things that match."
|
||||||
|
_stacklevel = kwargs.pop('_stacklevel', 3)
|
||||||
|
|
||||||
if string is None and 'text' in kwargs:
|
if string is None and 'text' in kwargs:
|
||||||
string = kwargs.pop('text')
|
string = kwargs.pop('text')
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
|
"The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
|
||||||
DeprecationWarning
|
DeprecationWarning, stacklevel=_stacklevel
|
||||||
)
|
)
|
||||||
|
|
||||||
if isinstance(name, SoupStrainer):
|
if isinstance(name, SoupStrainer):
|
||||||
|
@ -926,11 +940,6 @@ class NavigableString(str, PageElement):
|
||||||
PREFIX = ''
|
PREFIX = ''
|
||||||
SUFFIX = ''
|
SUFFIX = ''
|
||||||
|
|
||||||
# We can't tell just by looking at a string whether it's contained
|
|
||||||
# in an XML document or an HTML document.
|
|
||||||
|
|
||||||
known_xml = None
|
|
||||||
|
|
||||||
def __new__(cls, value):
|
def __new__(cls, value):
|
||||||
"""Create a new NavigableString.
|
"""Create a new NavigableString.
|
||||||
|
|
||||||
|
@ -946,12 +955,22 @@ class NavigableString(str, PageElement):
|
||||||
u.setup()
|
u.setup()
|
||||||
return u
|
return u
|
||||||
|
|
||||||
def __copy__(self):
|
def __deepcopy__(self, memo, recursive=False):
|
||||||
"""A copy of a NavigableString has the same contents and class
|
"""A copy of a NavigableString has the same contents and class
|
||||||
as the original, but it is not connected to the parse tree.
|
as the original, but it is not connected to the parse tree.
|
||||||
|
|
||||||
|
:param recursive: This parameter is ignored; it's only defined
|
||||||
|
so that NavigableString.__deepcopy__ implements the same
|
||||||
|
signature as Tag.__deepcopy__.
|
||||||
"""
|
"""
|
||||||
return type(self)(self)
|
return type(self)(self)
|
||||||
|
|
||||||
|
def __copy__(self):
|
||||||
|
"""A copy of a NavigableString can only be a deep copy, because
|
||||||
|
only one PageElement can occupy a given place in a parse tree.
|
||||||
|
"""
|
||||||
|
return self.__deepcopy__({})
|
||||||
|
|
||||||
def __getnewargs__(self):
|
def __getnewargs__(self):
|
||||||
return (str(self),)
|
return (str(self),)
|
||||||
|
|
||||||
|
@ -1296,22 +1315,57 @@ class Tag(PageElement):
|
||||||
|
|
||||||
parserClass = _alias("parser_class") # BS3
|
parserClass = _alias("parser_class") # BS3
|
||||||
|
|
||||||
def __copy__(self):
|
def __deepcopy__(self, memo, recursive=True):
|
||||||
"""A copy of a Tag is a new Tag, unconnected to the parse tree.
|
"""A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
|
||||||
Its contents are a copy of the old Tag's contents.
|
Its contents are a copy of the old Tag's contents.
|
||||||
"""
|
"""
|
||||||
|
clone = self._clone()
|
||||||
|
|
||||||
|
if recursive:
|
||||||
|
# Clone this tag's descendants recursively, but without
|
||||||
|
# making any recursive function calls.
|
||||||
|
tag_stack = [clone]
|
||||||
|
for event, element in self._event_stream(self.descendants):
|
||||||
|
if event is Tag.END_ELEMENT_EVENT:
|
||||||
|
# Stop appending incoming Tags to the Tag that was
|
||||||
|
# just closed.
|
||||||
|
tag_stack.pop()
|
||||||
|
else:
|
||||||
|
descendant_clone = element.__deepcopy__(
|
||||||
|
memo, recursive=False
|
||||||
|
)
|
||||||
|
# Add to its parent's .contents
|
||||||
|
tag_stack[-1].append(descendant_clone)
|
||||||
|
|
||||||
|
if event is Tag.START_ELEMENT_EVENT:
|
||||||
|
# Add the Tag itself to the stack so that its
|
||||||
|
# children will be .appended to it.
|
||||||
|
tag_stack.append(descendant_clone)
|
||||||
|
return clone
|
||||||
|
|
||||||
|
def __copy__(self):
|
||||||
|
"""A copy of a Tag must always be a deep copy, because a Tag's
|
||||||
|
children can only have one parent at a time.
|
||||||
|
"""
|
||||||
|
return self.__deepcopy__({})
|
||||||
|
|
||||||
|
def _clone(self):
|
||||||
|
"""Create a new Tag just like this one, but with no
|
||||||
|
contents and unattached to any parse tree.
|
||||||
|
|
||||||
|
This is the first step in the deepcopy process.
|
||||||
|
"""
|
||||||
clone = type(self)(
|
clone = type(self)(
|
||||||
None, self.builder, self.name, self.namespace,
|
None, self.builder, self.name, self.namespace,
|
||||||
self.prefix, self.attrs, is_xml=self._is_xml,
|
self.prefix, self.attrs, is_xml=self._is_xml,
|
||||||
sourceline=self.sourceline, sourcepos=self.sourcepos,
|
sourceline=self.sourceline, sourcepos=self.sourcepos,
|
||||||
can_be_empty_element=self.can_be_empty_element,
|
can_be_empty_element=self.can_be_empty_element,
|
||||||
cdata_list_attributes=self.cdata_list_attributes,
|
cdata_list_attributes=self.cdata_list_attributes,
|
||||||
preserve_whitespace_tags=self.preserve_whitespace_tags
|
preserve_whitespace_tags=self.preserve_whitespace_tags,
|
||||||
|
interesting_string_types=self.interesting_string_types
|
||||||
)
|
)
|
||||||
for attr in ('can_be_empty_element', 'hidden'):
|
for attr in ('can_be_empty_element', 'hidden'):
|
||||||
setattr(clone, attr, getattr(self, attr))
|
setattr(clone, attr, getattr(self, attr))
|
||||||
for child in self.contents:
|
|
||||||
clone.append(child.__copy__())
|
|
||||||
return clone
|
return clone
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -1558,7 +1612,7 @@ class Tag(PageElement):
|
||||||
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
|
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
|
||||||
name=tag_name
|
name=tag_name
|
||||||
),
|
),
|
||||||
DeprecationWarning
|
DeprecationWarning, stacklevel=2
|
||||||
)
|
)
|
||||||
return self.find(tag_name)
|
return self.find(tag_name)
|
||||||
# We special case contents to avoid recursion.
|
# We special case contents to avoid recursion.
|
||||||
|
@ -1634,28 +1688,178 @@ class Tag(PageElement):
|
||||||
|
|
||||||
def decode(self, indent_level=None,
|
def decode(self, indent_level=None,
|
||||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||||
formatter="minimal"):
|
formatter="minimal",
|
||||||
"""Render a Unicode representation of this PageElement and its
|
iterator=None):
|
||||||
contents.
|
pieces = []
|
||||||
|
|
||||||
:param indent_level: Each line of the rendering will be
|
|
||||||
indented this many spaces. Used internally in
|
|
||||||
recursive calls while pretty-printing.
|
|
||||||
:param eventual_encoding: The tag is destined to be
|
|
||||||
encoded into this encoding. This method is _not_
|
|
||||||
responsible for performing that encoding. This information
|
|
||||||
is passed in so that it can be substituted in if the
|
|
||||||
document contains a <META> tag that mentions the document's
|
|
||||||
encoding.
|
|
||||||
:param formatter: A Formatter object, or a string naming one of
|
|
||||||
the standard formatters.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# First off, turn a non-Formatter `formatter` into a Formatter
|
# First off, turn a non-Formatter `formatter` into a Formatter
|
||||||
# object. This will stop the lookup from happening over and
|
# object. This will stop the lookup from happening over and
|
||||||
# over again.
|
# over again.
|
||||||
if not isinstance(formatter, Formatter):
|
if not isinstance(formatter, Formatter):
|
||||||
formatter = self.formatter_for_name(formatter)
|
formatter = self.formatter_for_name(formatter)
|
||||||
|
|
||||||
|
if indent_level is True:
|
||||||
|
indent_level = 0
|
||||||
|
|
||||||
|
# The currently active tag that put us into string literal
|
||||||
|
# mode. Until this element is closed, children will be treated
|
||||||
|
# as string literals and not pretty-printed. String literal
|
||||||
|
# mode is turned on immediately after this tag begins, and
|
||||||
|
# turned off immediately before it's closed. This means there
|
||||||
|
# will be whitespace before and after the tag itself.
|
||||||
|
string_literal_tag = None
|
||||||
|
|
||||||
|
for event, element in self._event_stream(iterator):
|
||||||
|
if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
|
||||||
|
piece = element._format_tag(
|
||||||
|
eventual_encoding, formatter, opening=True
|
||||||
|
)
|
||||||
|
elif event is Tag.END_ELEMENT_EVENT:
|
||||||
|
piece = element._format_tag(
|
||||||
|
eventual_encoding, formatter, opening=False
|
||||||
|
)
|
||||||
|
if indent_level is not None:
|
||||||
|
indent_level -= 1
|
||||||
|
else:
|
||||||
|
piece = element.output_ready(formatter)
|
||||||
|
|
||||||
|
# Now we need to apply the 'prettiness' -- extra
|
||||||
|
# whitespace before and/or after this tag. This can get
|
||||||
|
# complicated because certain tags, like <pre> and
|
||||||
|
# <script>, can't be prettified, since adding whitespace would
|
||||||
|
# change the meaning of the content.
|
||||||
|
|
||||||
|
# The default behavior is to add whitespace before and
|
||||||
|
# after an element when string literal mode is off, and to
|
||||||
|
# leave things as they are when string literal mode is on.
|
||||||
|
if string_literal_tag:
|
||||||
|
indent_before = indent_after = False
|
||||||
|
else:
|
||||||
|
indent_before = indent_after = True
|
||||||
|
|
||||||
|
# The only time the behavior is more complex than that is
|
||||||
|
# when we encounter an opening or closing tag that might
|
||||||
|
# put us into or out of string literal mode.
|
||||||
|
if (event is Tag.START_ELEMENT_EVENT
|
||||||
|
and not string_literal_tag
|
||||||
|
and not element._should_pretty_print()):
|
||||||
|
# We are about to enter string literal mode. Add
|
||||||
|
# whitespace before this tag, but not after. We
|
||||||
|
# will stay in string literal mode until this tag
|
||||||
|
# is closed.
|
||||||
|
indent_before = True
|
||||||
|
indent_after = False
|
||||||
|
string_literal_tag = element
|
||||||
|
elif (event is Tag.END_ELEMENT_EVENT
|
||||||
|
and element is string_literal_tag):
|
||||||
|
# We are about to exit string literal mode by closing
|
||||||
|
# the tag that sent us into that mode. Add whitespace
|
||||||
|
# after this tag, but not before.
|
||||||
|
indent_before = False
|
||||||
|
indent_after = True
|
||||||
|
string_literal_tag = None
|
||||||
|
|
||||||
|
# Now we know whether to add whitespace before and/or
|
||||||
|
# after this element.
|
||||||
|
if indent_level is not None:
|
||||||
|
if (indent_before or indent_after):
|
||||||
|
if isinstance(element, NavigableString):
|
||||||
|
piece = piece.strip()
|
||||||
|
if piece:
|
||||||
|
piece = self._indent_string(
|
||||||
|
piece, indent_level, formatter,
|
||||||
|
indent_before, indent_after
|
||||||
|
)
|
||||||
|
if event == Tag.START_ELEMENT_EVENT:
|
||||||
|
indent_level += 1
|
||||||
|
pieces.append(piece)
|
||||||
|
return "".join(pieces)
|
||||||
|
|
||||||
|
# Names for the different events yielded by _event_stream
|
||||||
|
START_ELEMENT_EVENT = object()
|
||||||
|
END_ELEMENT_EVENT = object()
|
||||||
|
EMPTY_ELEMENT_EVENT = object()
|
||||||
|
STRING_ELEMENT_EVENT = object()
|
||||||
|
|
||||||
|
def _event_stream(self, iterator=None):
|
||||||
|
"""Yield a sequence of events that can be used to reconstruct the DOM
|
||||||
|
for this element.
|
||||||
|
|
||||||
|
This lets us recreate the nested structure of this element
|
||||||
|
(e.g. when formatting it as a string) without using recursive
|
||||||
|
method calls.
|
||||||
|
|
||||||
|
This is similar in concept to the SAX API, but it's a simpler
|
||||||
|
interface designed for internal use. The events are different
|
||||||
|
from SAX and the arguments associated with the events are Tags
|
||||||
|
and other Beautiful Soup objects.
|
||||||
|
|
||||||
|
:param iterator: An alternate iterator to use when traversing
|
||||||
|
the tree.
|
||||||
|
"""
|
||||||
|
tag_stack = []
|
||||||
|
|
||||||
|
iterator = iterator or self.self_and_descendants
|
||||||
|
|
||||||
|
for c in iterator:
|
||||||
|
# If the parent of the element we're about to yield is not
|
||||||
|
# the tag currently on the stack, it means that the tag on
|
||||||
|
# the stack closed before this element appeared.
|
||||||
|
while tag_stack and c.parent != tag_stack[-1]:
|
||||||
|
now_closed_tag = tag_stack.pop()
|
||||||
|
yield Tag.END_ELEMENT_EVENT, now_closed_tag
|
||||||
|
|
||||||
|
if isinstance(c, Tag):
|
||||||
|
if c.is_empty_element:
|
||||||
|
yield Tag.EMPTY_ELEMENT_EVENT, c
|
||||||
|
else:
|
||||||
|
yield Tag.START_ELEMENT_EVENT, c
|
||||||
|
tag_stack.append(c)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
yield Tag.STRING_ELEMENT_EVENT, c
|
||||||
|
|
||||||
|
while tag_stack:
|
||||||
|
now_closed_tag = tag_stack.pop()
|
||||||
|
yield Tag.END_ELEMENT_EVENT, now_closed_tag
|
||||||
|
|
||||||
|
def _indent_string(self, s, indent_level, formatter,
|
||||||
|
indent_before, indent_after):
|
||||||
|
"""Add indentation whitespace before and/or after a string.
|
||||||
|
|
||||||
|
:param s: The string to amend with whitespace.
|
||||||
|
:param indent_level: The indentation level; affects how much
|
||||||
|
whitespace goes before the string.
|
||||||
|
:param indent_before: Whether or not to add whitespace
|
||||||
|
before the string.
|
||||||
|
:param indent_after: Whether or not to add whitespace
|
||||||
|
(a newline) after the string.
|
||||||
|
"""
|
||||||
|
space_before = ''
|
||||||
|
if indent_before and indent_level:
|
||||||
|
space_before = (formatter.indent * indent_level)
|
||||||
|
|
||||||
|
space_after = ''
|
||||||
|
if indent_after:
|
||||||
|
space_after = "\n"
|
||||||
|
|
||||||
|
return space_before + s + space_after
|
||||||
|
|
||||||
|
def _format_tag(self, eventual_encoding, formatter, opening):
|
||||||
|
# A tag starts with the < character (see below).
|
||||||
|
|
||||||
|
# Then the / character, if this is a closing tag.
|
||||||
|
closing_slash = ''
|
||||||
|
if not opening:
|
||||||
|
closing_slash = '/'
|
||||||
|
|
||||||
|
# Then an optional namespace prefix.
|
||||||
|
prefix = ''
|
||||||
|
if self.prefix:
|
||||||
|
prefix = self.prefix + ":"
|
||||||
|
|
||||||
|
# Then a list of attribute values, if this is an opening tag.
|
||||||
|
attribute_string = ''
|
||||||
|
if opening:
|
||||||
attributes = formatter.attributes(self)
|
attributes = formatter.attributes(self)
|
||||||
attrs = []
|
attrs = []
|
||||||
for key, val in attributes:
|
for key, val in attributes:
|
||||||
|
@ -1677,63 +1881,19 @@ class Tag(PageElement):
|
||||||
str(key) + '='
|
str(key) + '='
|
||||||
+ formatter.quoted_attribute_value(text))
|
+ formatter.quoted_attribute_value(text))
|
||||||
attrs.append(decoded)
|
attrs.append(decoded)
|
||||||
close = ''
|
|
||||||
closeTag = ''
|
|
||||||
|
|
||||||
prefix = ''
|
|
||||||
if self.prefix:
|
|
||||||
prefix = self.prefix + ":"
|
|
||||||
|
|
||||||
if self.is_empty_element:
|
|
||||||
close = formatter.void_element_close_prefix or ''
|
|
||||||
else:
|
|
||||||
closeTag = '</%s%s>' % (prefix, self.name)
|
|
||||||
|
|
||||||
pretty_print = self._should_pretty_print(indent_level)
|
|
||||||
space = ''
|
|
||||||
indent_space = ''
|
|
||||||
if indent_level is not None:
|
|
||||||
indent_space = (formatter.indent * (indent_level - 1))
|
|
||||||
if pretty_print:
|
|
||||||
space = indent_space
|
|
||||||
indent_contents = indent_level + 1
|
|
||||||
else:
|
|
||||||
indent_contents = None
|
|
||||||
contents = self.decode_contents(
|
|
||||||
indent_contents, eventual_encoding, formatter
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.hidden:
|
|
||||||
# This is the 'document root' object.
|
|
||||||
s = contents
|
|
||||||
else:
|
|
||||||
s = []
|
|
||||||
attribute_string = ''
|
|
||||||
if attrs:
|
if attrs:
|
||||||
attribute_string = ' ' + ' '.join(attrs)
|
attribute_string = ' ' + ' '.join(attrs)
|
||||||
if indent_level is not None:
|
|
||||||
# Even if this particular tag is not pretty-printed,
|
|
||||||
# we should indent up to the start of the tag.
|
|
||||||
s.append(indent_space)
|
|
||||||
s.append('<%s%s%s%s>' % (
|
|
||||||
prefix, self.name, attribute_string, close))
|
|
||||||
if pretty_print:
|
|
||||||
s.append("\n")
|
|
||||||
s.append(contents)
|
|
||||||
if pretty_print and contents and contents[-1] != "\n":
|
|
||||||
s.append("\n")
|
|
||||||
if pretty_print and closeTag:
|
|
||||||
s.append(space)
|
|
||||||
s.append(closeTag)
|
|
||||||
if indent_level is not None and closeTag and self.next_sibling:
|
|
||||||
# Even if this particular tag is not pretty-printed,
|
|
||||||
# we're now done with the tag, and we should add a
|
|
||||||
# newline if appropriate.
|
|
||||||
s.append("\n")
|
|
||||||
s = ''.join(s)
|
|
||||||
return s
|
|
||||||
|
|
||||||
def _should_pretty_print(self, indent_level):
|
# Then an optional closing slash (for a void element in an
|
||||||
|
# XML document).
|
||||||
|
void_element_closing_slash = ''
|
||||||
|
if self.is_empty_element:
|
||||||
|
void_element_closing_slash = formatter.void_element_close_prefix or ''
|
||||||
|
|
||||||
|
# Put it all together.
|
||||||
|
return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
|
||||||
|
|
||||||
|
def _should_pretty_print(self, indent_level=1):
|
||||||
"""Should this tag be pretty-printed?
|
"""Should this tag be pretty-printed?
|
||||||
|
|
||||||
Most of them should, but some (such as <pre> in HTML
|
Most of them should, but some (such as <pre> in HTML
|
||||||
|
@ -1784,32 +1944,8 @@ class Tag(PageElement):
|
||||||
the standard Formatters.
|
the standard Formatters.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# First off, turn a string formatter into a Formatter object. This
|
return self.decode(indent_level, eventual_encoding, formatter,
|
||||||
# will stop the lookup from happening over and over again.
|
iterator=self.descendants)
|
||||||
if not isinstance(formatter, Formatter):
|
|
||||||
formatter = self.formatter_for_name(formatter)
|
|
||||||
|
|
||||||
pretty_print = (indent_level is not None)
|
|
||||||
s = []
|
|
||||||
for c in self:
|
|
||||||
text = None
|
|
||||||
if isinstance(c, NavigableString):
|
|
||||||
text = c.output_ready(formatter)
|
|
||||||
elif isinstance(c, Tag):
|
|
||||||
s.append(c.decode(indent_level, eventual_encoding,
|
|
||||||
formatter))
|
|
||||||
preserve_whitespace = (
|
|
||||||
self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
|
|
||||||
)
|
|
||||||
if text and indent_level and not preserve_whitespace:
|
|
||||||
text = text.strip()
|
|
||||||
if text:
|
|
||||||
if pretty_print and not preserve_whitespace:
|
|
||||||
s.append(formatter.indent * (indent_level - 1))
|
|
||||||
s.append(text)
|
|
||||||
if pretty_print and not preserve_whitespace:
|
|
||||||
s.append("\n")
|
|
||||||
return ''.join(s)
|
|
||||||
|
|
||||||
def encode_contents(
|
def encode_contents(
|
||||||
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
|
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
|
||||||
|
@ -1862,7 +1998,8 @@ class Tag(PageElement):
|
||||||
:rtype: bs4.element.Tag | bs4.element.NavigableString
|
:rtype: bs4.element.Tag | bs4.element.NavigableString
|
||||||
"""
|
"""
|
||||||
r = None
|
r = None
|
||||||
l = self.find_all(name, attrs, recursive, string, 1, **kwargs)
|
l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
|
||||||
|
**kwargs)
|
||||||
if l:
|
if l:
|
||||||
r = l[0]
|
r = l[0]
|
||||||
return r
|
return r
|
||||||
|
@ -1889,7 +2026,9 @@ class Tag(PageElement):
|
||||||
generator = self.descendants
|
generator = self.descendants
|
||||||
if not recursive:
|
if not recursive:
|
||||||
generator = self.children
|
generator = self.children
|
||||||
return self._find_all(name, attrs, string, limit, generator, **kwargs)
|
_stacklevel = kwargs.pop('_stacklevel', 2)
|
||||||
|
return self._find_all(name, attrs, string, limit, generator,
|
||||||
|
_stacklevel=_stacklevel+1, **kwargs)
|
||||||
findAll = find_all # BS3
|
findAll = find_all # BS3
|
||||||
findChildren = find_all # BS2
|
findChildren = find_all # BS2
|
||||||
|
|
||||||
|
@ -1903,6 +2042,18 @@ class Tag(PageElement):
|
||||||
# return iter() to make the purpose of the method clear
|
# return iter() to make the purpose of the method clear
|
||||||
return iter(self.contents) # XXX This seems to be untested.
|
return iter(self.contents) # XXX This seems to be untested.
|
||||||
|
|
||||||
|
@property
|
||||||
|
def self_and_descendants(self):
|
||||||
|
"""Iterate over this PageElement and its children in a
|
||||||
|
breadth-first sequence.
|
||||||
|
|
||||||
|
:yield: A sequence of PageElements.
|
||||||
|
"""
|
||||||
|
if not self.hidden:
|
||||||
|
yield self
|
||||||
|
for i in self.descendants:
|
||||||
|
yield i
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def descendants(self):
|
def descendants(self):
|
||||||
"""Iterate over all children of this PageElement in a
|
"""Iterate over all children of this PageElement in a
|
||||||
|
@ -1929,16 +2080,13 @@ class Tag(PageElement):
|
||||||
Beautiful Soup will use the prefixes it encountered while
|
Beautiful Soup will use the prefixes it encountered while
|
||||||
parsing the document.
|
parsing the document.
|
||||||
|
|
||||||
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
:param kwargs: Keyword arguments to be passed into Soup Sieve's
|
||||||
soupsieve.select() method.
|
soupsieve.select() method.
|
||||||
|
|
||||||
:return: A Tag.
|
:return: A Tag.
|
||||||
:rtype: bs4.element.Tag
|
:rtype: bs4.element.Tag
|
||||||
"""
|
"""
|
||||||
value = self.select(selector, namespaces, 1, **kwargs)
|
return self.css.select_one(selector, namespaces, **kwargs)
|
||||||
if value:
|
|
||||||
return value[0]
|
|
||||||
return None
|
|
||||||
|
|
||||||
def select(self, selector, namespaces=None, limit=None, **kwargs):
|
def select(self, selector, namespaces=None, limit=None, **kwargs):
|
||||||
"""Perform a CSS selection operation on the current element.
|
"""Perform a CSS selection operation on the current element.
|
||||||
|
@ -1960,21 +2108,12 @@ class Tag(PageElement):
|
||||||
:return: A ResultSet of Tags.
|
:return: A ResultSet of Tags.
|
||||||
:rtype: bs4.element.ResultSet
|
:rtype: bs4.element.ResultSet
|
||||||
"""
|
"""
|
||||||
if namespaces is None:
|
return self.css.select(selector, namespaces, limit, **kwargs)
|
||||||
namespaces = self._namespaces
|
|
||||||
|
|
||||||
if limit is None:
|
@property
|
||||||
limit = 0
|
def css(self):
|
||||||
if soupsieve is None:
|
"""Return an interface to the CSS selector API."""
|
||||||
raise NotImplementedError(
|
return CSS(self)
|
||||||
"Cannot execute CSS selectors because the soupsieve package is not installed."
|
|
||||||
)
|
|
||||||
|
|
||||||
results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
|
|
||||||
|
|
||||||
# We do this because it's more consistent and because
|
|
||||||
# ResultSet.__getattr__ has a helpful error message.
|
|
||||||
return ResultSet(None, results)
|
|
||||||
|
|
||||||
# Old names for backwards compatibility
|
# Old names for backwards compatibility
|
||||||
def childGenerator(self):
|
def childGenerator(self):
|
||||||
|
@ -1993,7 +2132,7 @@ class Tag(PageElement):
|
||||||
"""
|
"""
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'has_key is deprecated. Use has_attr(key) instead.',
|
'has_key is deprecated. Use has_attr(key) instead.',
|
||||||
DeprecationWarning
|
DeprecationWarning, stacklevel=2
|
||||||
)
|
)
|
||||||
return self.has_attr(key)
|
return self.has_attr(key)
|
||||||
|
|
||||||
|
@ -2024,7 +2163,7 @@ class SoupStrainer(object):
|
||||||
string = kwargs.pop('text')
|
string = kwargs.pop('text')
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
|
"The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
|
||||||
DeprecationWarning
|
DeprecationWarning, stacklevel=2
|
||||||
)
|
)
|
||||||
|
|
||||||
self.name = self._normalize_search_value(name)
|
self.name = self._normalize_search_value(name)
|
||||||
|
|
|
@ -149,14 +149,14 @@ class HTMLFormatter(Formatter):
|
||||||
"""A generic Formatter for HTML."""
|
"""A generic Formatter for HTML."""
|
||||||
REGISTRY = {}
|
REGISTRY = {}
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
|
super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class XMLFormatter(Formatter):
|
class XMLFormatter(Formatter):
|
||||||
"""A generic Formatter for XML."""
|
"""A generic Formatter for XML."""
|
||||||
REGISTRY = {}
|
REGISTRY = {}
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
|
super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
# Set up aliases for the default formatters.
|
# Set up aliases for the default formatters.
|
||||||
|
|
|
@ -32,7 +32,7 @@ from . import css_match as cm
|
||||||
from . import css_types as ct
|
from . import css_types as ct
|
||||||
from .util import DEBUG, SelectorSyntaxError # noqa: F401
|
from .util import DEBUG, SelectorSyntaxError # noqa: F401
|
||||||
import bs4 # type: ignore[import]
|
import bs4 # type: ignore[import]
|
||||||
from typing import Optional, Any, Iterator, Iterable
|
from typing import Any, Iterator, Iterable
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
|
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
|
||||||
|
@ -45,17 +45,14 @@ SoupSieve = cm.SoupSieve
|
||||||
|
|
||||||
def compile( # noqa: A001
|
def compile( # noqa: A001
|
||||||
pattern: str,
|
pattern: str,
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> cm.SoupSieve:
|
) -> cm.SoupSieve:
|
||||||
"""Compile CSS pattern."""
|
"""Compile CSS pattern."""
|
||||||
|
|
||||||
ns = ct.Namespaces(namespaces) if namespaces is not None else namespaces # type: Optional[ct.Namespaces]
|
|
||||||
cs = ct.CustomSelectors(custom) if custom is not None else custom # type: Optional[ct.CustomSelectors]
|
|
||||||
|
|
||||||
if isinstance(pattern, SoupSieve):
|
if isinstance(pattern, SoupSieve):
|
||||||
if flags:
|
if flags:
|
||||||
raise ValueError("Cannot process 'flags' argument on a compiled selector list")
|
raise ValueError("Cannot process 'flags' argument on a compiled selector list")
|
||||||
|
@ -65,7 +62,12 @@ def compile( # noqa: A001
|
||||||
raise ValueError("Cannot process 'custom' argument on a compiled selector list")
|
raise ValueError("Cannot process 'custom' argument on a compiled selector list")
|
||||||
return pattern
|
return pattern
|
||||||
|
|
||||||
return cp._cached_css_compile(pattern, ns, cs, flags)
|
return cp._cached_css_compile(
|
||||||
|
pattern,
|
||||||
|
ct.Namespaces(namespaces) if namespaces is not None else namespaces,
|
||||||
|
ct.CustomSelectors(custom) if custom is not None else custom,
|
||||||
|
flags
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def purge() -> None:
|
def purge() -> None:
|
||||||
|
@ -77,10 +79,10 @@ def purge() -> None:
|
||||||
def closest(
|
def closest(
|
||||||
select: str,
|
select: str,
|
||||||
tag: 'bs4.Tag',
|
tag: 'bs4.Tag',
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> 'bs4.Tag':
|
) -> 'bs4.Tag':
|
||||||
"""Match closest ancestor."""
|
"""Match closest ancestor."""
|
||||||
|
@ -91,10 +93,10 @@ def closest(
|
||||||
def match(
|
def match(
|
||||||
select: str,
|
select: str,
|
||||||
tag: 'bs4.Tag',
|
tag: 'bs4.Tag',
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Match node."""
|
"""Match node."""
|
||||||
|
@ -105,10 +107,10 @@ def match(
|
||||||
def filter( # noqa: A001
|
def filter( # noqa: A001
|
||||||
select: str,
|
select: str,
|
||||||
iterable: Iterable['bs4.Tag'],
|
iterable: Iterable['bs4.Tag'],
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> list['bs4.Tag']:
|
) -> list['bs4.Tag']:
|
||||||
"""Filter list of nodes."""
|
"""Filter list of nodes."""
|
||||||
|
@ -119,10 +121,10 @@ def filter( # noqa: A001
|
||||||
def select_one(
|
def select_one(
|
||||||
select: str,
|
select: str,
|
||||||
tag: 'bs4.Tag',
|
tag: 'bs4.Tag',
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> 'bs4.Tag':
|
) -> 'bs4.Tag':
|
||||||
"""Select a single tag."""
|
"""Select a single tag."""
|
||||||
|
@ -133,11 +135,11 @@ def select_one(
|
||||||
def select(
|
def select(
|
||||||
select: str,
|
select: str,
|
||||||
tag: 'bs4.Tag',
|
tag: 'bs4.Tag',
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
limit: int = 0,
|
limit: int = 0,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> list['bs4.Tag']:
|
) -> list['bs4.Tag']:
|
||||||
"""Select the specified tags."""
|
"""Select the specified tags."""
|
||||||
|
@ -148,11 +150,11 @@ def select(
|
||||||
def iselect(
|
def iselect(
|
||||||
select: str,
|
select: str,
|
||||||
tag: 'bs4.Tag',
|
tag: 'bs4.Tag',
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
limit: int = 0,
|
limit: int = 0,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> Iterator['bs4.Tag']:
|
) -> Iterator['bs4.Tag']:
|
||||||
"""Iterate the specified tags."""
|
"""Iterate the specified tags."""
|
||||||
|
|
|
@ -193,5 +193,5 @@ def parse_version(ver: str) -> Version:
|
||||||
return Version(major, minor, micro, release, pre, post, dev)
|
return Version(major, minor, micro, release, pre, post, dev)
|
||||||
|
|
||||||
|
|
||||||
__version_info__ = Version(2, 5, 0, "final", post=1)
|
__version_info__ = Version(2, 4, 1, "final")
|
||||||
__version__ = __version_info__._get_canonical()
|
__version__ = __version_info__._get_canonical()
|
||||||
|
|
|
@ -6,7 +6,7 @@ import re
|
||||||
from . import css_types as ct
|
from . import css_types as ct
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import bs4 # type: ignore[import]
|
import bs4 # type: ignore[import]
|
||||||
from typing import Iterator, Iterable, Any, Optional, Callable, Sequence, cast # noqa: F401
|
from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401
|
||||||
|
|
||||||
# Empty tag pattern (whitespace okay)
|
# Empty tag pattern (whitespace okay)
|
||||||
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
|
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
|
||||||
|
@ -171,7 +171,7 @@ class _DocumentNav:
|
||||||
def get_children(
|
def get_children(
|
||||||
self,
|
self,
|
||||||
el: bs4.Tag,
|
el: bs4.Tag,
|
||||||
start: Optional[int] = None,
|
start: int | None = None,
|
||||||
reverse: bool = False,
|
reverse: bool = False,
|
||||||
tags: bool = True,
|
tags: bool = True,
|
||||||
no_iframe: bool = False
|
no_iframe: bool = False
|
||||||
|
@ -239,22 +239,22 @@ class _DocumentNav:
|
||||||
return parent
|
return parent
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_tag_name(el: bs4.Tag) -> Optional[str]:
|
def get_tag_name(el: bs4.Tag) -> str | None:
|
||||||
"""Get tag."""
|
"""Get tag."""
|
||||||
|
|
||||||
return cast(Optional[str], el.name)
|
return cast('str | None', el.name)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_prefix_name(el: bs4.Tag) -> Optional[str]:
|
def get_prefix_name(el: bs4.Tag) -> str | None:
|
||||||
"""Get prefix."""
|
"""Get prefix."""
|
||||||
|
|
||||||
return cast(Optional[str], el.prefix)
|
return cast('str | None', el.prefix)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_uri(el: bs4.Tag) -> Optional[str]:
|
def get_uri(el: bs4.Tag) -> str | None:
|
||||||
"""Get namespace `URI`."""
|
"""Get namespace `URI`."""
|
||||||
|
|
||||||
return cast(Optional[str], el.namespace)
|
return cast('str | None', el.namespace)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
|
def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
|
||||||
|
@ -287,7 +287,7 @@ class _DocumentNav:
|
||||||
return bool(ns and ns == NS_XHTML)
|
return bool(ns and ns == NS_XHTML)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[Optional[str], Optional[str]]:
|
def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]:
|
||||||
"""Return namespace and attribute name without the prefix."""
|
"""Return namespace and attribute name without the prefix."""
|
||||||
|
|
||||||
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
|
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
|
||||||
|
@ -330,8 +330,8 @@ class _DocumentNav:
|
||||||
cls,
|
cls,
|
||||||
el: bs4.Tag,
|
el: bs4.Tag,
|
||||||
name: str,
|
name: str,
|
||||||
default: Optional[str | Sequence[str]] = None
|
default: str | Sequence[str] | None = None
|
||||||
) -> Optional[str | Sequence[str]]:
|
) -> str | Sequence[str] | None:
|
||||||
"""Get attribute by name."""
|
"""Get attribute by name."""
|
||||||
|
|
||||||
value = default
|
value = default
|
||||||
|
@ -348,7 +348,7 @@ class _DocumentNav:
|
||||||
return value
|
return value
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, Optional[str | Sequence[str]]]]:
|
def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]:
|
||||||
"""Iterate attributes."""
|
"""Iterate attributes."""
|
||||||
|
|
||||||
for k, v in el.attrs.items():
|
for k, v in el.attrs.items():
|
||||||
|
@ -424,10 +424,10 @@ class Inputs:
|
||||||
return 0 <= minutes <= 59
|
return 0 <= minutes <= 59
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def parse_value(cls, itype: str, value: Optional[str]) -> Optional[tuple[float, ...]]:
|
def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
|
||||||
"""Parse the input value."""
|
"""Parse the input value."""
|
||||||
|
|
||||||
parsed = None # type: Optional[tuple[float, ...]]
|
parsed = None # type: tuple[float, ...] | None
|
||||||
if value is None:
|
if value is None:
|
||||||
return value
|
return value
|
||||||
if itype == "date":
|
if itype == "date":
|
||||||
|
@ -486,7 +486,7 @@ class CSSMatch(_DocumentNav):
|
||||||
self,
|
self,
|
||||||
selectors: ct.SelectorList,
|
selectors: ct.SelectorList,
|
||||||
scope: bs4.Tag,
|
scope: bs4.Tag,
|
||||||
namespaces: Optional[ct.Namespaces],
|
namespaces: ct.Namespaces | None,
|
||||||
flags: int
|
flags: int
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
@ -545,19 +545,19 @@ class CSSMatch(_DocumentNav):
|
||||||
|
|
||||||
return self.get_tag_ns(el) == NS_XHTML
|
return self.get_tag_ns(el) == NS_XHTML
|
||||||
|
|
||||||
def get_tag(self, el: bs4.Tag) -> Optional[str]:
|
def get_tag(self, el: bs4.Tag) -> str | None:
|
||||||
"""Get tag."""
|
"""Get tag."""
|
||||||
|
|
||||||
name = self.get_tag_name(el)
|
name = self.get_tag_name(el)
|
||||||
return util.lower(name) if name is not None and not self.is_xml else name
|
return util.lower(name) if name is not None and not self.is_xml else name
|
||||||
|
|
||||||
def get_prefix(self, el: bs4.Tag) -> Optional[str]:
|
def get_prefix(self, el: bs4.Tag) -> str | None:
|
||||||
"""Get prefix."""
|
"""Get prefix."""
|
||||||
|
|
||||||
prefix = self.get_prefix_name(el)
|
prefix = self.get_prefix_name(el)
|
||||||
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
|
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
|
||||||
|
|
||||||
def find_bidi(self, el: bs4.Tag) -> Optional[int]:
|
def find_bidi(self, el: bs4.Tag) -> int | None:
|
||||||
"""Get directionality from element text."""
|
"""Get directionality from element text."""
|
||||||
|
|
||||||
for node in self.get_children(el, tags=False):
|
for node in self.get_children(el, tags=False):
|
||||||
|
@ -653,8 +653,8 @@ class CSSMatch(_DocumentNav):
|
||||||
self,
|
self,
|
||||||
el: bs4.Tag,
|
el: bs4.Tag,
|
||||||
attr: str,
|
attr: str,
|
||||||
prefix: Optional[str]
|
prefix: str | None
|
||||||
) -> Optional[str | Sequence[str]]:
|
) -> str | Sequence[str] | None:
|
||||||
"""Match attribute name and return value if it exists."""
|
"""Match attribute name and return value if it exists."""
|
||||||
|
|
||||||
value = None
|
value = None
|
||||||
|
@ -751,7 +751,7 @@ class CSSMatch(_DocumentNav):
|
||||||
name not in (self.get_tag(el), '*')
|
name not in (self.get_tag(el), '*')
|
||||||
)
|
)
|
||||||
|
|
||||||
def match_tag(self, el: bs4.Tag, tag: Optional[ct.SelectorTag]) -> bool:
|
def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
|
||||||
"""Match the tag."""
|
"""Match the tag."""
|
||||||
|
|
||||||
match = True
|
match = True
|
||||||
|
@ -1030,7 +1030,7 @@ class CSSMatch(_DocumentNav):
|
||||||
"""Match element if it contains text."""
|
"""Match element if it contains text."""
|
||||||
|
|
||||||
match = True
|
match = True
|
||||||
content = None # type: Optional[str | Sequence[str]]
|
content = None # type: str | Sequence[str] | None
|
||||||
for contain_list in contains:
|
for contain_list in contains:
|
||||||
if content is None:
|
if content is None:
|
||||||
if contain_list.own:
|
if contain_list.own:
|
||||||
|
@ -1099,7 +1099,7 @@ class CSSMatch(_DocumentNav):
|
||||||
match = False
|
match = False
|
||||||
name = cast(str, self.get_attribute_by_name(el, 'name'))
|
name = cast(str, self.get_attribute_by_name(el, 'name'))
|
||||||
|
|
||||||
def get_parent_form(el: bs4.Tag) -> Optional[bs4.Tag]:
|
def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
|
||||||
"""Find this input's form."""
|
"""Find this input's form."""
|
||||||
form = None
|
form = None
|
||||||
parent = self.get_parent(el, no_iframe=True)
|
parent = self.get_parent(el, no_iframe=True)
|
||||||
|
@ -1478,7 +1478,7 @@ class CSSMatch(_DocumentNav):
|
||||||
if lim < 1:
|
if lim < 1:
|
||||||
break
|
break
|
||||||
|
|
||||||
def closest(self) -> Optional[bs4.Tag]:
|
def closest(self) -> bs4.Tag | None:
|
||||||
"""Match closest ancestor."""
|
"""Match closest ancestor."""
|
||||||
|
|
||||||
current = self.tag
|
current = self.tag
|
||||||
|
@ -1506,7 +1506,7 @@ class SoupSieve(ct.Immutable):
|
||||||
|
|
||||||
pattern: str
|
pattern: str
|
||||||
selectors: ct.SelectorList
|
selectors: ct.SelectorList
|
||||||
namespaces: Optional[ct.Namespaces]
|
namespaces: ct.Namespaces | None
|
||||||
custom: dict[str, str]
|
custom: dict[str, str]
|
||||||
flags: int
|
flags: int
|
||||||
|
|
||||||
|
@ -1516,8 +1516,8 @@ class SoupSieve(ct.Immutable):
|
||||||
self,
|
self,
|
||||||
pattern: str,
|
pattern: str,
|
||||||
selectors: ct.SelectorList,
|
selectors: ct.SelectorList,
|
||||||
namespaces: Optional[ct.Namespaces],
|
namespaces: ct.Namespaces | None,
|
||||||
custom: Optional[ct.CustomSelectors],
|
custom: ct.CustomSelectors | None,
|
||||||
flags: int
|
flags: int
|
||||||
):
|
):
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
|
|
@ -7,7 +7,7 @@ from . import css_match as cm
|
||||||
from . import css_types as ct
|
from . import css_types as ct
|
||||||
from .util import SelectorSyntaxError
|
from .util import SelectorSyntaxError
|
||||||
import warnings
|
import warnings
|
||||||
from typing import Optional, Match, Any, Iterator, cast
|
from typing import Match, Any, Iterator, cast
|
||||||
|
|
||||||
UNICODE_REPLACEMENT_CHAR = 0xFFFD
|
UNICODE_REPLACEMENT_CHAR = 0xFFFD
|
||||||
|
|
||||||
|
@ -113,7 +113,7 @@ VALUE = r'''
|
||||||
'''.format(nl=NEWLINE, ident=IDENTIFIER)
|
'''.format(nl=NEWLINE, ident=IDENTIFIER)
|
||||||
# Attribute value comparison. `!=` is handled special as it is non-standard.
|
# Attribute value comparison. `!=` is handled special as it is non-standard.
|
||||||
ATTR = r'''
|
ATTR = r'''
|
||||||
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
|
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}*(?P<case>[is]))?)?{ws}*\]
|
||||||
'''.format(ws=WSC, value=VALUE)
|
'''.format(ws=WSC, value=VALUE)
|
||||||
|
|
||||||
# Selector patterns
|
# Selector patterns
|
||||||
|
@ -207,8 +207,8 @@ _MAXCACHE = 500
|
||||||
@lru_cache(maxsize=_MAXCACHE)
|
@lru_cache(maxsize=_MAXCACHE)
|
||||||
def _cached_css_compile(
|
def _cached_css_compile(
|
||||||
pattern: str,
|
pattern: str,
|
||||||
namespaces: Optional[ct.Namespaces],
|
namespaces: ct.Namespaces | None,
|
||||||
custom: Optional[ct.CustomSelectors],
|
custom: ct.CustomSelectors | None,
|
||||||
flags: int
|
flags: int
|
||||||
) -> cm.SoupSieve:
|
) -> cm.SoupSieve:
|
||||||
"""Cached CSS compile."""
|
"""Cached CSS compile."""
|
||||||
|
@ -233,7 +233,7 @@ def _purge_cache() -> None:
|
||||||
_cached_css_compile.cache_clear()
|
_cached_css_compile.cache_clear()
|
||||||
|
|
||||||
|
|
||||||
def process_custom(custom: Optional[ct.CustomSelectors]) -> dict[str, str | ct.SelectorList]:
|
def process_custom(custom: ct.CustomSelectors | None) -> dict[str, str | ct.SelectorList]:
|
||||||
"""Process custom."""
|
"""Process custom."""
|
||||||
|
|
||||||
custom_selectors = {}
|
custom_selectors = {}
|
||||||
|
@ -317,7 +317,7 @@ class SelectorPattern:
|
||||||
|
|
||||||
return self.name
|
return self.name
|
||||||
|
|
||||||
def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
|
def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
|
||||||
"""Match the selector."""
|
"""Match the selector."""
|
||||||
|
|
||||||
return self.re_pattern.match(selector, index)
|
return self.re_pattern.match(selector, index)
|
||||||
|
@ -336,7 +336,7 @@ class SpecialPseudoPattern(SelectorPattern):
|
||||||
for pseudo in p[1]:
|
for pseudo in p[1]:
|
||||||
self.patterns[pseudo] = pattern
|
self.patterns[pseudo] = pattern
|
||||||
|
|
||||||
self.matched_name = None # type: Optional[SelectorPattern]
|
self.matched_name = None # type: SelectorPattern | None
|
||||||
self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)
|
self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)
|
||||||
|
|
||||||
def get_name(self) -> str:
|
def get_name(self) -> str:
|
||||||
|
@ -344,7 +344,7 @@ class SpecialPseudoPattern(SelectorPattern):
|
||||||
|
|
||||||
return '' if self.matched_name is None else self.matched_name.get_name()
|
return '' if self.matched_name is None else self.matched_name.get_name()
|
||||||
|
|
||||||
def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
|
def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
|
||||||
"""Match the selector."""
|
"""Match the selector."""
|
||||||
|
|
||||||
pseudo = None
|
pseudo = None
|
||||||
|
@ -372,14 +372,14 @@ class _Selector:
|
||||||
def __init__(self, **kwargs: Any) -> None:
|
def __init__(self, **kwargs: Any) -> None:
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
|
||||||
self.tag = kwargs.get('tag', None) # type: Optional[ct.SelectorTag]
|
self.tag = kwargs.get('tag', None) # type: ct.SelectorTag | None
|
||||||
self.ids = kwargs.get('ids', []) # type: list[str]
|
self.ids = kwargs.get('ids', []) # type: list[str]
|
||||||
self.classes = kwargs.get('classes', []) # type: list[str]
|
self.classes = kwargs.get('classes', []) # type: list[str]
|
||||||
self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute]
|
self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute]
|
||||||
self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth]
|
self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth]
|
||||||
self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList]
|
self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList]
|
||||||
self.relations = kwargs.get('relations', []) # type: list[_Selector]
|
self.relations = kwargs.get('relations', []) # type: list[_Selector]
|
||||||
self.rel_type = kwargs.get('rel_type', None) # type: Optional[str]
|
self.rel_type = kwargs.get('rel_type', None) # type: str | None
|
||||||
self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains]
|
self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains]
|
||||||
self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang]
|
self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang]
|
||||||
self.flags = kwargs.get('flags', 0) # type: int
|
self.flags = kwargs.get('flags', 0) # type: int
|
||||||
|
@ -462,7 +462,7 @@ class CSSParser:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
selector: str,
|
selector: str,
|
||||||
custom: Optional[dict[str, str | ct.SelectorList]] = None,
|
custom: dict[str, str | ct.SelectorList] | None = None,
|
||||||
flags: int = 0
|
flags: int = 0
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
@ -723,7 +723,7 @@ class CSSParser:
|
||||||
if postfix == '_child':
|
if postfix == '_child':
|
||||||
if m.group('of'):
|
if m.group('of'):
|
||||||
# Parse the rest of `of S`.
|
# Parse the rest of `of S`.
|
||||||
nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN | FLG_FORGIVE)
|
nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
|
||||||
else:
|
else:
|
||||||
# Use default `*|*` for `of S`.
|
# Use default `*|*` for `of S`.
|
||||||
nth_sel = CSS_NTH_OF_S_DEFAULT
|
nth_sel = CSS_NTH_OF_S_DEFAULT
|
||||||
|
@ -753,7 +753,7 @@ class CSSParser:
|
||||||
if name == ':not':
|
if name == ':not':
|
||||||
flags |= FLG_NOT
|
flags |= FLG_NOT
|
||||||
elif name == ':has':
|
elif name == ':has':
|
||||||
flags |= FLG_RELATIVE | FLG_FORGIVE
|
flags |= FLG_RELATIVE
|
||||||
elif name in (':where', ':is'):
|
elif name in (':where', ':is'):
|
||||||
flags |= FLG_FORGIVE
|
flags |= FLG_FORGIVE
|
||||||
|
|
||||||
|
@ -777,11 +777,6 @@ class CSSParser:
|
||||||
if not combinator:
|
if not combinator:
|
||||||
combinator = WS_COMBINATOR
|
combinator = WS_COMBINATOR
|
||||||
if combinator == COMMA_COMBINATOR:
|
if combinator == COMMA_COMBINATOR:
|
||||||
if not has_selector:
|
|
||||||
# If we've not captured any selector parts, the comma is either at the beginning of the pattern
|
|
||||||
# or following another comma, both of which are unexpected. But shouldn't fail the pseudo-class.
|
|
||||||
sel.no_match = True
|
|
||||||
|
|
||||||
sel.rel_type = rel_type
|
sel.rel_type = rel_type
|
||||||
selectors[-1].relations.append(sel)
|
selectors[-1].relations.append(sel)
|
||||||
rel_type = ":" + WS_COMBINATOR
|
rel_type = ":" + WS_COMBINATOR
|
||||||
|
@ -1070,18 +1065,8 @@ class CSSParser:
|
||||||
selectors.append(sel)
|
selectors.append(sel)
|
||||||
|
|
||||||
# Forgive empty slots in pseudo-classes that have lists (and are forgiving)
|
# Forgive empty slots in pseudo-classes that have lists (and are forgiving)
|
||||||
elif is_forgive:
|
elif is_forgive and (not selectors or not relations):
|
||||||
if is_relative:
|
# Handle normal pseudo-classes with empty slots like `:is()` etc.
|
||||||
# Handle relative selectors pseudo-classes with empty slots like `:has()`
|
|
||||||
if selectors and selectors[-1].rel_type is None and rel_type == ': ':
|
|
||||||
sel.rel_type = rel_type
|
|
||||||
sel.no_match = True
|
|
||||||
selectors[-1].relations.append(sel)
|
|
||||||
has_selector = True
|
|
||||||
else:
|
|
||||||
# Handle normal pseudo-classes with empty slots
|
|
||||||
if not selectors or not relations:
|
|
||||||
# Others like `:is()` etc.
|
|
||||||
sel.no_match = True
|
sel.no_match = True
|
||||||
del relations[:]
|
del relations[:]
|
||||||
selectors.append(sel)
|
selectors.append(sel)
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import copyreg
|
import copyreg
|
||||||
from .pretty import pretty
|
from .pretty import pretty
|
||||||
from typing import Any, Iterator, Hashable, Optional, Pattern, Iterable, Mapping
|
from typing import Any, Iterator, Hashable, Pattern, Iterable, Mapping
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
'Selector',
|
'Selector',
|
||||||
|
@ -189,28 +189,28 @@ class Selector(Immutable):
|
||||||
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
|
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
|
||||||
)
|
)
|
||||||
|
|
||||||
tag: Optional[SelectorTag]
|
tag: SelectorTag | None
|
||||||
ids: tuple[str, ...]
|
ids: tuple[str, ...]
|
||||||
classes: tuple[str, ...]
|
classes: tuple[str, ...]
|
||||||
attributes: tuple[SelectorAttribute, ...]
|
attributes: tuple[SelectorAttribute, ...]
|
||||||
nth: tuple[SelectorNth, ...]
|
nth: tuple[SelectorNth, ...]
|
||||||
selectors: tuple[SelectorList, ...]
|
selectors: tuple[SelectorList, ...]
|
||||||
relation: SelectorList
|
relation: SelectorList
|
||||||
rel_type: Optional[str]
|
rel_type: str | None
|
||||||
contains: tuple[SelectorContains, ...]
|
contains: tuple[SelectorContains, ...]
|
||||||
lang: tuple[SelectorLang, ...]
|
lang: tuple[SelectorLang, ...]
|
||||||
flags: int
|
flags: int
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
tag: Optional[SelectorTag],
|
tag: SelectorTag | None,
|
||||||
ids: tuple[str, ...],
|
ids: tuple[str, ...],
|
||||||
classes: tuple[str, ...],
|
classes: tuple[str, ...],
|
||||||
attributes: tuple[SelectorAttribute, ...],
|
attributes: tuple[SelectorAttribute, ...],
|
||||||
nth: tuple[SelectorNth, ...],
|
nth: tuple[SelectorNth, ...],
|
||||||
selectors: tuple[SelectorList, ...],
|
selectors: tuple[SelectorList, ...],
|
||||||
relation: SelectorList,
|
relation: SelectorList,
|
||||||
rel_type: Optional[str],
|
rel_type: str | None,
|
||||||
contains: tuple[SelectorContains, ...],
|
contains: tuple[SelectorContains, ...],
|
||||||
lang: tuple[SelectorLang, ...],
|
lang: tuple[SelectorLang, ...],
|
||||||
flags: int
|
flags: int
|
||||||
|
@ -247,9 +247,9 @@ class SelectorTag(Immutable):
|
||||||
__slots__ = ("name", "prefix", "_hash")
|
__slots__ = ("name", "prefix", "_hash")
|
||||||
|
|
||||||
name: str
|
name: str
|
||||||
prefix: Optional[str]
|
prefix: str | None
|
||||||
|
|
||||||
def __init__(self, name: str, prefix: Optional[str]) -> None:
|
def __init__(self, name: str, prefix: str | None) -> None:
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
|
||||||
super().__init__(name=name, prefix=prefix)
|
super().__init__(name=name, prefix=prefix)
|
||||||
|
@ -262,15 +262,15 @@ class SelectorAttribute(Immutable):
|
||||||
|
|
||||||
attribute: str
|
attribute: str
|
||||||
prefix: str
|
prefix: str
|
||||||
pattern: Optional[Pattern[str]]
|
pattern: Pattern[str] | None
|
||||||
xml_type_pattern: Optional[Pattern[str]]
|
xml_type_pattern: Pattern[str] | None
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
attribute: str,
|
attribute: str,
|
||||||
prefix: str,
|
prefix: str,
|
||||||
pattern: Optional[Pattern[str]],
|
pattern: Pattern[str] | None,
|
||||||
xml_type_pattern: Optional[Pattern[str]]
|
xml_type_pattern: Pattern[str] | None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
|
||||||
|
@ -360,7 +360,7 @@ class SelectorList(Immutable):
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
selectors: Optional[Iterable[Selector | SelectorNull]] = None,
|
selectors: Iterable[Selector | SelectorNull] | None = None,
|
||||||
is_not: bool = False,
|
is_not: bool = False,
|
||||||
is_html: bool = False
|
is_html: bool = False
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||||
from functools import wraps, lru_cache
|
from functools import wraps, lru_cache
|
||||||
import warnings
|
import warnings
|
||||||
import re
|
import re
|
||||||
from typing import Callable, Any, Optional
|
from typing import Callable, Any
|
||||||
|
|
||||||
DEBUG = 0x00001
|
DEBUG = 0x00001
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ def lower(string: str) -> str:
|
||||||
class SelectorSyntaxError(Exception):
|
class SelectorSyntaxError(Exception):
|
||||||
"""Syntax error in a CSS selector."""
|
"""Syntax error in a CSS selector."""
|
||||||
|
|
||||||
def __init__(self, msg: str, pattern: Optional[str] = None, index: Optional[int] = None) -> None:
|
def __init__(self, msg: str, pattern: str | None = None, index: int | None = None) -> None:
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
|
||||||
self.line = None
|
self.line = None
|
||||||
|
@ -84,7 +84,7 @@ def get_pattern_context(pattern: str, index: int) -> tuple[str, int, int]:
|
||||||
col = 1
|
col = 1
|
||||||
text = [] # type: list[str]
|
text = [] # type: list[str]
|
||||||
line = 1
|
line = 1
|
||||||
offset = None # type: Optional[int]
|
offset = None # type: int | None
|
||||||
|
|
||||||
# Split pattern by newline and handle the text before the newline
|
# Split pattern by newline and handle the text before the newline
|
||||||
for m in RE_PATTERN_LINE_SPLIT.finditer(pattern):
|
for m in RE_PATTERN_LINE_SPLIT.finditer(pattern):
|
||||||
|
|
Loading…
Reference in a new issue