diff --git a/CHANGES.md b/CHANGES.md
index ec0a33c9..2194459a 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,4 +1,25 @@
-### 3.29.11 (2023-09-22 23:00:00 UTC)
+### 3.30.0 (2023-09-23 17:20:00 UTC)
+
+* Update Beautiful Soup 4.11.1 (r642) to 4.12.2
+* Update certifi 2023.05.07 to 2023.07.22
+* Update CacheControl 0.12.11 (c05ef9e) to 0.13.1 (783a338)
+* Update feedparser 6.0.10 (859ac57) to 6.0.10 (9865dec)
+* Update filelock 3.12.0 (b4713c9) to 3.12.4 (c1163ae)
+* Update idna library 3.4 (37c7d9b) to 3.4 (cab054c)
+* Update Msgpack 1.0.5 (0516c2c) to 1.0.6 (e1d3d5d)
+* Update package resource API 67.5.1 (f51eccd) to 68.1.2 (1ef36f2)
+* Update Requests library 2.29.0 (87d63de) to 2.31.0 (8812812)
+* Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb)
+* Update Tornado Web Server 6.3.2 (e3aa6c5) to 6.3.3 (e4d6984)
+* Update urllib3 1.26.15 (25cca389) to 2.0.5 (d9f85a7)
+* Add thefuzz 0.19.0 (c2cd4f4) as a replacement with fallback to fuzzywuzzy 0.18.0 (2188520)
+* Fix regex that was not using py312 notation
+* Change sort backlog and manual segment search results episode number
+* Change sort episodes when set to wanted on display show page
+* Add search of grouped options in shows drop down at view-show
+
+
+### 3.29.11 (2023-09-22 23:00:00 UTC)
* Fix pytvmaze country handling in NetworkBase
* Update issue template
diff --git a/gui/slick/interfaces/default/inc_top.tmpl b/gui/slick/interfaces/default/inc_top.tmpl
index 056384e4..2853a85d 100644
--- a/gui/slick/interfaces/default/inc_top.tmpl
+++ b/gui/slick/interfaces/default/inc_top.tmpl
@@ -190,7 +190,7 @@
trakt_played='most played this month', trakt_played_period_year='most played this year',
trakt_collected='most collected this month', trakt_collected_period_year='most collected this year',
trakt_recommended='recommended', trakt_watchlist='watchlist')
-#set $trakt_mode = $trakt_modes.get(re.sub('[\?=]', '_', $sg_var('TRAKT_MRU')), 'trends, tailored suggestions')
+#set $trakt_mode = $trakt_modes.get(re.sub(r'[\?=]', '_', $sg_var('TRAKT_MRU')), 'trends, tailored suggestions')
Trakt Cards
#set $imdb_func = $sg_str('IMDB_MRU').split('-')
diff --git a/gui/slick/js/displayShow.js b/gui/slick/js/displayShow.js
index 4dbc5b11..abd39f74 100644
--- a/gui/slick/js/displayShow.js
+++ b/gui/slick/js/displayShow.js
@@ -23,7 +23,83 @@ $(document).ready(function() {
}
         return $('<span>ended ' + data.text + '</span>');
}
- select$.select2({templateResult: populateItem, templateSelection:populateItem});
+ // https://github.com/bevacqua/fuzzysearch
+ function fuzzysearch(needle, haystack) {
+ var hlen = haystack.length;
+ var nlen = needle.length;
+ if (nlen > hlen) {
+ return false;
+ }
+ if (nlen === hlen) {
+ return needle === haystack;
+ }
+ outer: for (var i = 0, j = 0; i < nlen; i++) {
+ var nch = needle.charCodeAt(i);
+ while (j < hlen) {
+ if (haystack.charCodeAt(j++) === nch) {
+ continue outer;
+ }
+ }
+ return false;
+ }
+ return true;
+ }
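+    // Quick illustration of the rule above: the needle's characters must
+    // appear in the haystack in order, but need not be contiguous, e.g.
+    // fuzzysearch('sg', 'sickgear') === true. This is a subsequence test,
+    // not a substring search.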
+
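+    // N.B. despite its name, this pattern matches any non-word character,
+    // so the replace() calls below strip punctuation as well as spaces.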
+ const white_space_regex = /\W/gui;
+ function sel_matcher(params, data) {
+
+ // If there are no search terms, return all of the data
+ if ($.trim(params.term) === '') {
+ return data;
+ }
+
+ // Do not display the item if there is no 'text' property
+ if (typeof data.text === 'undefined') {
+ return null;
+ }
+
+ // `params.term` should be the term that is used for searching
+ var param_term = params.term.toLowerCase().trim().replace(white_space_regex, '');
+
+ if ('undefined' !== typeof data.children) {
+ // `data.children` contains options to match against
+ var filteredChildren = [];
+ $.each(data.children, function (idx, child) {
+ // `child.text` is the text that is displayed for the data object
+ var param_data = child.text.toLowerCase().trim().replace(white_space_regex, '');
+
+ if (fuzzysearch(param_term, param_data)) {
+ filteredChildren.push(child);
+ }
+ });
+
+ // If any of the group's children match,
+ // then set the matched children on the group and return the group object
+ if (filteredChildren.length) {
+ var modifiedData = $.extend({}, data, true);
+ modifiedData.children = filteredChildren;
+
+ // You can return modified objects from here
+ // This includes matching the `children` how you want in nested data sets
+ return modifiedData;
+ }
+ }
+
+ // `data.text` is the text that is displayed for the data object
+ var param_data = data.text.toLowerCase().trim().replace(white_space_regex, '');
+ if (fuzzysearch(param_term, param_data)) {
+ var modifiedData = $.extend({}, data, true);
+
+ // You can return modified objects from here
+ // This includes matching the `children` how you want in nested data sets
+ return modifiedData;
+ }
+
+ // Return `null` if the term should not be displayed
+ return null;
+ }
+
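+    // Hook the fuzzy matcher into select2 so that, for example, typing
+    // "bbt" can match a grouped option named "The Big Bang Theory",
+    // because those letters occur in that order in the option text.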
+ select$.select2({templateResult: populateItem, templateSelection:populateItem, matcher: sel_matcher});
$('#prevShow, #nextShow').on('click', function() {
var select$ = $('#pickShow'),
diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py
index 4d8ee829..98092923 100644
--- a/lib/bs4/__init__.py
+++ b/lib/bs4/__init__.py
@@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.
-Beautiful Soup works with Python 3.5 and up. It works better if lxml
+Beautiful Soup works with Python 3.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
@@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.11.1"
-__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
+__version__ = "4.12.2"
+__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
@@ -38,11 +38,13 @@ from .builder import (
builder_registry,
ParserRejectedMarkup,
XMLParsedAsHTMLWarning,
+ HTMLParserTreeBuilder
)
from .dammit import UnicodeDammit
from .element import (
CData,
Comment,
+ CSS,
DEFAULT_OUTPUT_ENCODING,
Declaration,
Doctype,
@@ -116,7 +118,7 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
-
+
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
element_classes=None, **kwargs):
@@ -211,7 +213,7 @@ class BeautifulSoup(Tag):
warnings.warn(
'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name),
- DeprecationWarning
+ DeprecationWarning, stacklevel=3
)
return kwargs.pop(old_name)
return None
@@ -348,25 +350,49 @@ class BeautifulSoup(Tag):
self.markup = None
self.builder.soup = None
- def __copy__(self):
- """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
- copy = type(self)(
- self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
- )
+ def _clone(self):
+ """Create a new BeautifulSoup object with the same TreeBuilder,
+ but not associated with any markup.
- # Although we encoded the tree to UTF-8, that may not have
- # been the encoding of the original markup. Set the copy's
- # .original_encoding to reflect the original object's
- # .original_encoding.
- copy.original_encoding = self.original_encoding
- return copy
+ This is the first step of the deepcopy process.
+ """
+ clone = type(self)("", None, self.builder)
+ # Keep track of the encoding of the original document,
+ # since we won't be parsing it again.
+ clone.original_encoding = self.original_encoding
+ return clone
+
def __getstate__(self):
# Frequently a tree builder can't be pickled.
d = dict(self.__dict__)
if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
- d['builder'] = None
+ d['builder'] = type(self.builder)
+ # Store the contents as a Unicode string.
+ d['contents'] = []
+ d['markup'] = self.decode()
+
+ # If _most_recent_element is present, it's a Tag object left
+ # over from initial parse. It might not be picklable and we
+ # don't need it.
+ if '_most_recent_element' in d:
+ del d['_most_recent_element']
return d
+
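+    # Illustrative round trip (standard pickle only): __getstate__ above
+    # stores the markup string and the builder class, and __setstate__
+    # below re-parses it, so pickle.loads(pickle.dumps(soup)) yields an
+    # equivalent tree even when the original builder isn't picklable.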
+ def __setstate__(self, state):
+ # If necessary, restore the TreeBuilder by looking it up.
+ self.__dict__ = state
+ if isinstance(self.builder, type):
+ self.builder = self.builder()
+ elif not self.builder:
+ # We don't know which builder was used to build this
+ # parse tree, so use a default we know is always available.
+ self.builder = HTMLParserTreeBuilder()
+ self.builder.soup = self
+ self.reset()
+ self._feed()
+ return state
+
@classmethod
def _decode_markup(cls, markup):
@@ -405,7 +431,8 @@ class BeautifulSoup(Tag):
'The input looks more like a URL than markup. You may want to use'
' an HTTP client like requests to get the document behind'
' the URL, and feed that document to Beautiful Soup.',
- MarkupResemblesLocatorWarning
+ MarkupResemblesLocatorWarning,
+ stacklevel=3
)
return True
return False
@@ -436,7 +463,7 @@ class BeautifulSoup(Tag):
'The input looks more like a filename than markup. You may'
' want to open this file and pass the filehandle into'
' Beautiful Soup.',
- MarkupResemblesLocatorWarning
+ MarkupResemblesLocatorWarning, stacklevel=3
)
return True
return False
@@ -467,6 +494,7 @@ class BeautifulSoup(Tag):
self.open_tag_counter = Counter()
self.preserve_whitespace_tag_stack = []
self.string_container_stack = []
+ self._most_recent_element = None
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
@@ -748,7 +776,7 @@ class BeautifulSoup(Tag):
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
- formatter="minimal"):
+ formatter="minimal", iterator=None):
"""Returns a string or Unicode representation of the parse tree
as an HTML or XML document.
@@ -775,7 +803,7 @@ class BeautifulSoup(Tag):
else:
indent_level = 0
return prefix + super(BeautifulSoup, self).decode(
- indent_level, eventual_encoding, formatter)
+ indent_level, eventual_encoding, formatter, iterator)
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
_s = BeautifulSoup
@@ -789,7 +817,7 @@ class BeautifulStoneSoup(BeautifulSoup):
warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.',
- DeprecationWarning
+ DeprecationWarning, stacklevel=2
)
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py
index 463613a8..a5711d5d 100644
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@@ -70,7 +70,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# ATM because the html5lib TreeBuilder doesn't use
# UnicodeDammit.
if exclude_encodings:
- warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
+ warnings.warn(
+                "You provided a value for exclude_encodings, but the html5lib tree builder doesn't support exclude_encodings.",
+ stacklevel=3
+ )
# html5lib only parses HTML, so if it's given XML that's worth
# noting.
@@ -81,7 +84,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# These methods are defined by Beautiful Soup.
def feed(self, markup):
if self.soup.parse_only is not None:
- warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
+ warnings.warn(
+ "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
+ stacklevel=4
+ )
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
self.underlying_builder.parser = parser
extra_kwargs = dict()
diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py
index e37cdcde..4c5ced93 100644
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@@ -10,30 +10,9 @@ __all__ = [
from html.parser import HTMLParser
-try:
- from html.parser import HTMLParseError
-except ImportError as e:
- # HTMLParseError is removed in Python 3.5. Since it can never be
- # thrown in 3.5, we can just define our own class as a placeholder.
- class HTMLParseError(Exception):
- pass
-
import sys
import warnings
-# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
-# argument, which we'd like to set to False. Unfortunately,
-# http://bugs.python.org/issue13273 makes strict=True a better bet
-# before Python 3.2.3.
-#
-# At the end of this file, we monkeypatch HTMLParser so that
-# strict=True works well on Python 3.2.2.
-major, minor, release = sys.version_info[:3]
-CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
-CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
-CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
-
-
from ..element import (
CData,
Comment,
@@ -45,6 +24,7 @@ from ..dammit import EntitySubstitution, UnicodeDammit
from ..builder import (
DetectsXMLParsedAsHTML,
+ ParserRejectedMarkup,
HTML,
HTMLTreeBuilder,
STRICT,
@@ -90,20 +70,23 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
self.already_closed_empty_element = []
self._initialize_xml_detector()
-
- def error(self, msg):
- """In Python 3, HTMLParser subclasses must implement error(), although
- this requirement doesn't appear to be documented.
- In Python 2, HTMLParser implements error() by raising an exception,
- which we don't want to do.
+ def error(self, message):
+ # NOTE: This method is required so long as Python 3.9 is
+ # supported. The corresponding code is removed from HTMLParser
+ # in 3.5, but not removed from ParserBase until 3.10.
+ # https://github.com/python/cpython/issues/76025
+ #
+ # The original implementation turned the error into a warning,
+ # but in every case I discovered, this made HTMLParser
+ # immediately crash with an error message that was less
+ # helpful than the warning. The new implementation makes it
+ # more clear that html.parser just can't parse this
+ # markup. The 3.10 implementation does the same, though it
+ # raises AssertionError rather than calling a method. (We
+ # catch this error and wrap it in a ParserRejectedMarkup.)
+ raise ParserRejectedMarkup(message)
- In any event, this method is called only on very strange
- markup and our best strategy is to pretend it didn't happen
- and keep going.
- """
- warnings.warn(msg)
-
def handle_startendtag(self, name, attrs):
"""Handle an incoming empty-element tag.
@@ -203,9 +186,10 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
:param name: Character number, possibly in hexadecimal.
"""
- # XXX workaround for a bug in HTMLParser. Remove this once
- # it's fixed in all supported versions.
- # http://bugs.python.org/issue13633
+ # TODO: This was originally a workaround for a bug in
+ # HTMLParser. (http://bugs.python.org/issue13633) The bug has
+ # been fixed, but removing this code still makes some
+ # Beautiful Soup tests fail. This needs investigation.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
@@ -333,10 +317,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
parser_kwargs.update(extra_parser_kwargs)
- if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
- parser_kwargs['strict'] = False
- if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
- parser_kwargs['convert_charrefs'] = False
+ parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
@@ -397,103 +378,10 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup
try:
parser.feed(markup)
- parser.close()
- except HTMLParseError as e:
- warnings.warn(RuntimeWarning(
- "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
- raise e
+ except AssertionError as e:
+ # html.parser raises AssertionError in rare cases to
+ # indicate a fatal problem with the markup, especially
+ # when there's an error in the doctype declaration.
+ raise ParserRejectedMarkup(e)
+ parser.close()
parser.already_closed_empty_element = []
-
-# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
-# 3.2.3 code. This ensures they don't treat markup like <a href="http://foo.com/"></a> as a
-# string.
-#
-# XXX This code can be removed once most Python 3 users are on 3.2.3.
-if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
- import re
- attrfind_tolerant = re.compile(
- r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
- HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
-
- locatestarttagend = re.compile(r"""
- <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
- (?:\s+ # whitespace before attribute name
- (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
- (?:\s*=\s* # value indicator
- (?:'[^']*' # LITA-enclosed value
- |\"[^\"]*\" # LIT-enclosed value
- |[^'\">\s]+ # bare value
- )
- )?
- )
- )*
- \s* # trailing whitespace
-""", re.VERBOSE)
- BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
-
- from html.parser import tagfind, attrfind
-
- def parse_starttag(self, i):
- self.__starttag_text = None
- endpos = self.check_for_whole_start_tag(i)
- if endpos < 0:
- return endpos
- rawdata = self.rawdata
- self.__starttag_text = rawdata[i:endpos]
-
- # Now parse the data between i+1 and j into a tag and attrs
- attrs = []
- match = tagfind.match(rawdata, i+1)
- assert match, 'unexpected call to parse_starttag()'
- k = match.end()
- self.lasttag = tag = rawdata[i+1:k].lower()
- while k < endpos:
- if self.strict:
- m = attrfind.match(rawdata, k)
- else:
- m = attrfind_tolerant.match(rawdata, k)
- if not m:
- break
- attrname, rest, attrvalue = m.group(1, 2, 3)
- if not rest:
- attrvalue = None
- elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
- attrvalue[:1] == '"' == attrvalue[-1:]:
- attrvalue = attrvalue[1:-1]
- if attrvalue:
- attrvalue = self.unescape(attrvalue)
- attrs.append((attrname.lower(), attrvalue))
- k = m.end()
-
- end = rawdata[k:endpos].strip()
- if end not in (">", "/>"):
- lineno, offset = self.getpos()
- if "\n" in self.__starttag_text:
- lineno = lineno + self.__starttag_text.count("\n")
- offset = len(self.__starttag_text) \
- - self.__starttag_text.rfind("\n")
- else:
- offset = offset + len(self.__starttag_text)
- if self.strict:
- self.error("junk characters in start tag: %r"
- % (rawdata[k:endpos][:20],))
- self.handle_data(rawdata[i:endpos])
- return endpos
- if end.endswith('/>'):
- # XHTML-style empty tag:
- self.handle_startendtag(tag, attrs)
- else:
- self.handle_starttag(tag, attrs)
- if tag in self.CDATA_CONTENT_ELEMENTS:
- self.set_cdata_mode(tag)
- return endpos
-
- def set_cdata_mode(self, elem):
- self.cdata_elem = elem.lower()
- self.interesting = re.compile(r'\s*%s\s*>' % self.cdata_elem, re.I)
-
- BeautifulSoupHTMLParser.parse_starttag = parse_starttag
- BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
-
- CONSTRUCTOR_TAKES_STRICT = True
diff --git a/lib/bs4/css.py b/lib/bs4/css.py
new file mode 100644
index 00000000..572014b1
--- /dev/null
+++ b/lib/bs4/css.py
@@ -0,0 +1,280 @@
+"""Integration code for CSS selectors using Soup Sieve (pypi: soupsieve)."""
+
+import warnings
+try:
+ import soupsieve
+except ImportError as e:
+ soupsieve = None
+ warnings.warn(
+ 'The soupsieve package is not installed. CSS selectors cannot be used.'
+ )
+
+
+class CSS(object):
+ """A proxy object against the soupsieve library, to simplify its
+ CSS selector API.
+
+ Acquire this object through the .css attribute on the
+ BeautifulSoup object, or on the Tag you want to use as the
+ starting point for a CSS selector.
+
+ The main advantage of doing this is that the tag to be selected
+ against doesn't need to be explicitly specified in the function
+ calls, since it's already scoped to a tag.
+ """
+
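+    # Usage sketch, assuming the soupsieve package is installed:
+    #   soup = BeautifulSoup("<p class='a'>x</p>", "html.parser")
+    #   soup.css.select("p.a")  # ResultSet of matching Tags
+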
+ def __init__(self, tag, api=soupsieve):
+ """Constructor.
+
+ You don't need to instantiate this class yourself; instead,
+ access the .css attribute on the BeautifulSoup object, or on
+ the Tag you want to use as the starting point for your CSS
+ selector.
+
+ :param tag: All CSS selectors will use this as their starting
+ point.
+
+ :param api: A plug-in replacement for the soupsieve module,
+ designed mainly for use in tests.
+ """
+ if api is None:
+ raise NotImplementedError(
+ "Cannot execute CSS selectors because the soupsieve package is not installed."
+ )
+ self.api = api
+ self.tag = tag
+
+ def escape(self, ident):
+ """Escape a CSS identifier.
+
+        This is a simple wrapper around soupsieve.escape(). See the
+ documentation for that function for more information.
+ """
+ if soupsieve is None:
+ raise NotImplementedError(
+ "Cannot escape CSS identifiers because the soupsieve package is not installed."
+ )
+ return self.api.escape(ident)
+
+ def _ns(self, ns, select):
+ """Normalize a dictionary of namespaces."""
+ if not isinstance(select, self.api.SoupSieve) and ns is None:
+ # If the selector is a precompiled pattern, it already has
+ # a namespace context compiled in, which cannot be
+ # replaced.
+ ns = self.tag._namespaces
+ return ns
+
+ def _rs(self, results):
+        """Normalize a list of results to a ResultSet.
+
+ A ResultSet is more consistent with the rest of Beautiful
+ Soup's API, and ResultSet.__getattr__ has a helpful error
+ message if you try to treat a list of results as a single
+ result (a common mistake).
+ """
+ # Import here to avoid circular import
+ from .element import ResultSet
+ return ResultSet(None, results)
+
+ def compile(self, select, namespaces=None, flags=0, **kwargs):
+ """Pre-compile a selector and return the compiled object.
+
+        :param select: A CSS selector.
+
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will use the prefixes it encountered while
+ parsing the document.
+
+ :param flags: Flags to be passed into Soup Sieve's
+ soupsieve.compile() method.
+
+ :param kwargs: Keyword arguments to be passed into SoupSieve's
+ soupsieve.compile() method.
+
+ :return: A precompiled selector object.
+ :rtype: soupsieve.SoupSieve
+ """
+ return self.api.compile(
+ select, self._ns(namespaces, select), flags, **kwargs
+ )
+
+ def select_one(self, select, namespaces=None, flags=0, **kwargs):
+ """Perform a CSS selection operation on the current Tag and return the
+ first result.
+
+ This uses the Soup Sieve library. For more information, see
+ that library's documentation for the soupsieve.select_one()
+ method.
+
+        :param select: A CSS selector.
+
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will use the prefixes it encountered while
+ parsing the document.
+
+ :param flags: Flags to be passed into Soup Sieve's
+ soupsieve.select_one() method.
+
+ :param kwargs: Keyword arguments to be passed into SoupSieve's
+ soupsieve.select_one() method.
+
+ :return: A Tag, or None if the selector has no match.
+ :rtype: bs4.element.Tag
+
+ """
+ return self.api.select_one(
+ select, self.tag, self._ns(namespaces, select), flags, **kwargs
+ )
+
+ def select(self, select, namespaces=None, limit=0, flags=0, **kwargs):
+ """Perform a CSS selection operation on the current Tag.
+
+ This uses the Soup Sieve library. For more information, see
+ that library's documentation for the soupsieve.select()
+ method.
+
+        :param select: A string containing a CSS selector.
+
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will pass in the prefixes it encountered while
+ parsing the document.
+
+ :param limit: After finding this number of results, stop looking.
+
+ :param flags: Flags to be passed into Soup Sieve's
+ soupsieve.select() method.
+
+ :param kwargs: Keyword arguments to be passed into SoupSieve's
+ soupsieve.select() method.
+
+ :return: A ResultSet of Tag objects.
+ :rtype: bs4.element.ResultSet
+
+ """
+ if limit is None:
+ limit = 0
+
+ return self._rs(
+ self.api.select(
+ select, self.tag, self._ns(namespaces, select), limit, flags,
+ **kwargs
+ )
+ )
+
+ def iselect(self, select, namespaces=None, limit=0, flags=0, **kwargs):
+ """Perform a CSS selection operation on the current Tag.
+
+ This uses the Soup Sieve library. For more information, see
+ that library's documentation for the soupsieve.iselect()
+ method. It is the same as select(), but it returns a generator
+ instead of a list.
+
+        :param select: A string containing a CSS selector.
+
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will pass in the prefixes it encountered while
+ parsing the document.
+
+ :param limit: After finding this number of results, stop looking.
+
+ :param flags: Flags to be passed into Soup Sieve's
+ soupsieve.iselect() method.
+
+ :param kwargs: Keyword arguments to be passed into SoupSieve's
+ soupsieve.iselect() method.
+
+ :return: A generator
+ :rtype: types.GeneratorType
+ """
+ return self.api.iselect(
+ select, self.tag, self._ns(namespaces, select), limit, flags, **kwargs
+ )
+
+ def closest(self, select, namespaces=None, flags=0, **kwargs):
+ """Find the Tag closest to this one that matches the given selector.
+
+ This uses the Soup Sieve library. For more information, see
+ that library's documentation for the soupsieve.closest()
+ method.
+
+        :param select: A string containing a CSS selector.
+
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will pass in the prefixes it encountered while
+ parsing the document.
+
+ :param flags: Flags to be passed into Soup Sieve's
+ soupsieve.closest() method.
+
+ :param kwargs: Keyword arguments to be passed into SoupSieve's
+ soupsieve.closest() method.
+
+ :return: A Tag, or None if there is no match.
+ :rtype: bs4.Tag
+
+ """
+ return self.api.closest(
+ select, self.tag, self._ns(namespaces, select), flags, **kwargs
+ )
+
+ def match(self, select, namespaces=None, flags=0, **kwargs):
+ """Check whether this Tag matches the given CSS selector.
+
+ This uses the Soup Sieve library. For more information, see
+ that library's documentation for the soupsieve.match()
+ method.
+
+        :param select: A CSS selector.
+
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will pass in the prefixes it encountered while
+ parsing the document.
+
+ :param flags: Flags to be passed into Soup Sieve's
+ soupsieve.match() method.
+
+ :param kwargs: Keyword arguments to be passed into SoupSieve's
+ soupsieve.match() method.
+
+ :return: True if this Tag matches the selector; False otherwise.
+ :rtype: bool
+ """
+ return self.api.match(
+ select, self.tag, self._ns(namespaces, select), flags, **kwargs
+ )
+
+ def filter(self, select, namespaces=None, flags=0, **kwargs):
+ """Filter this Tag's direct children based on the given CSS selector.
+
+ This uses the Soup Sieve library. It works the same way as
+ passing this Tag into that library's soupsieve.filter()
+        method. For more information, see the documentation for
+        soupsieve.filter().
+
+        :param select: A string containing a CSS selector.
+
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will pass in the prefixes it encountered while
+ parsing the document.
+
+ :param flags: Flags to be passed into Soup Sieve's
+ soupsieve.filter() method.
+
+ :param kwargs: Keyword arguments to be passed into SoupSieve's
+ soupsieve.filter() method.
+
+ :return: A ResultSet of Tag objects.
+ :rtype: bs4.element.ResultSet
+
+ """
+ return self._rs(
+ self.api.filter(
+ select, self.tag, self._ns(namespaces, select), flags, **kwargs
+ )
+ )
diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py
index e458729b..0b1c1e53 100644
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@@ -59,21 +59,6 @@ def diagnose(data):
if hasattr(data, 'read'):
data = data.read()
- elif data.startswith("http:") or data.startswith("https:"):
- print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
- print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
- return
- else:
- try:
- if os.path.exists(data):
- print(('"%s" looks like a filename. Reading data from the file.' % data))
- with open(data) as fp:
- data = fp.read()
- except ValueError:
- # This can happen on some platforms when the 'filename' is
- # too long. Assume it's data and not a filename.
- pass
- print("")
for parser in basic_parsers:
print(("Trying to parse your markup with %s" % parser))
diff --git a/lib/bs4/element.py b/lib/bs4/element.py
index 0eea8733..99fc8137 100644
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@@ -8,14 +8,8 @@ except ImportError as e:
import re
import sys
import warnings
-try:
- import soupsieve
-except ImportError as e:
- soupsieve = None
- warnings.warn(
- 'The soupsieve package is not installed. CSS selectors cannot be used.'
- )
+from .css import CSS
from .formatter import (
Formatter,
HTMLFormatter,
@@ -69,13 +63,13 @@ PYTHON_SPECIFIC_ENCODINGS = set([
"string-escape",
"string_escape",
])
-
+
class NamespacedAttribute(str):
"""A namespaced string (e.g. 'xml:lang') that remembers the namespace
('xml') and the name ('lang') that were used to create it.
"""
-
+
def __new__(cls, prefix, name=None, namespace=None):
if not name:
# This is the default namespace. Its name "has no value"
@@ -146,14 +140,19 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
-
+
class PageElement(object):
"""Contains the navigational information for some part of the page:
that is, its current location in the parse tree.
NavigableString, Tag, etc. are all subclasses of PageElement.
"""
-
+
+ # In general, we can't tell just by looking at an element whether
+ # it's contained in an XML document or an HTML document. But for
+ # Tags (q.v.) we can store this information at parse time.
+ known_xml = None
+
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and
@@ -163,7 +162,7 @@ class PageElement(object):
:param previous_element: The element parsed immediately before
this one.
-
+
         :param next_element: The element parsed immediately after
this one.
@@ -257,11 +256,11 @@ class PageElement(object):
default = object()
def _all_strings(self, strip=False, types=default):
"""Yield all strings of certain classes, possibly stripping them.
-
+
This is implemented differently in Tag and NavigableString.
"""
raise NotImplementedError()
-
+
@property
def stripped_strings(self):
"""Yield all strings in this PageElement, stripping them first.
@@ -294,11 +293,11 @@ class PageElement(object):
strip, types=types)])
getText = get_text
text = property(get_text)
-
+
def replace_with(self, *args):
- """Replace this PageElement with one or more PageElements, keeping the
+ """Replace this PageElement with one or more PageElements, keeping the
rest of the tree the same.
-
+
:param args: One or more PageElements.
:return: `self`, no longer part of the tree.
"""
@@ -410,7 +409,7 @@ class PageElement(object):
This works the same way as `list.insert`.
:param position: The numeric position that should be occupied
- in `self.children` by the new PageElement.
+ in `self.children` by the new PageElement.
:param new_child: A PageElement.
"""
if new_child is None:
@@ -496,13 +495,16 @@ class PageElement(object):
def extend(self, tags):
"""Appends the given PageElements to this one's contents.
- :param tags: A list of PageElements.
+ :param tags: A list of PageElements. If a single Tag is
+ provided instead, this PageElement's contents will be extended
+ with that Tag's contents.
"""
if isinstance(tags, Tag):
- # Calling self.append() on another tag's contents will change
- # the list we're iterating over. Make a list that won't
- # change.
- tags = list(tags.contents)
+ tags = tags.contents
+ if isinstance(tags, list):
+ # Moving items around the tree may change their position in
+ # the original list. Make a list that won't change.
+ tags = list(tags)
for tag in tags:
self.append(tag)
@@ -543,7 +545,7 @@ class PageElement(object):
"Element has no parent, so 'after' has no meaning.")
if any(x is self for x in args):
raise ValueError("Can't insert an element after itself.")
-
+
offset = 0
for successor in args:
# Extract first so that the index won't be screwed up if they
@@ -586,8 +588,9 @@ class PageElement(object):
:kwargs: A dictionary of filters on attribute values.
:return: A ResultSet containing PageElements.
"""
+ _stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, string, limit, self.next_elements,
- **kwargs)
+ _stacklevel=_stacklevel+1, **kwargs)
findAllNext = find_all_next # BS3
def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
@@ -624,8 +627,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
- return self._find_all(name, attrs, string, limit,
- self.next_siblings, **kwargs)
+ _stacklevel = kwargs.pop('_stacklevel', 2)
+ return self._find_all(
+ name, attrs, string, limit,
+ self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
+ )
findNextSiblings = find_next_siblings # BS3
fetchNextSiblings = find_next_siblings # BS2
@@ -663,8 +669,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
- return self._find_all(name, attrs, string, limit, self.previous_elements,
- **kwargs)
+ _stacklevel = kwargs.pop('_stacklevel', 2)
+ return self._find_all(
+ name, attrs, string, limit, self.previous_elements,
+ _stacklevel=_stacklevel+1, **kwargs
+ )
findAllPrevious = find_all_previous # BS3
fetchPrevious = find_all_previous # BS2
@@ -702,8 +711,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
- return self._find_all(name, attrs, string, limit,
- self.previous_siblings, **kwargs)
+ _stacklevel = kwargs.pop('_stacklevel', 2)
+ return self._find_all(
+ name, attrs, string, limit,
+ self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
+ )
findPreviousSiblings = find_previous_siblings # BS3
fetchPreviousSiblings = find_previous_siblings # BS2
@@ -724,7 +736,7 @@ class PageElement(object):
# NOTE: We can't use _find_one because findParents takes a different
# set of arguments.
r = None
- l = self.find_parents(name, attrs, 1, **kwargs)
+ l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
if l:
r = l[0]
return r
@@ -744,8 +756,9 @@ class PageElement(object):
:return: A PageElement.
:rtype: bs4.element.Tag | bs4.element.NavigableString
"""
+ _stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, None, limit, self.parents,
- **kwargs)
+ _stacklevel=_stacklevel+1, **kwargs)
findParents = find_parents # BS3
fetchParents = find_parents # BS2
@@ -771,19 +784,20 @@ class PageElement(object):
def _find_one(self, method, name, attrs, string, **kwargs):
r = None
- l = method(name, attrs, string, 1, **kwargs)
+ l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
if l:
r = l[0]
return r
def _find_all(self, name, attrs, string, limit, generator, **kwargs):
"Iterates over a generator looking for things that match."
+ _stacklevel = kwargs.pop('_stacklevel', 3)
if string is None and 'text' in kwargs:
string = kwargs.pop('text')
warnings.warn(
"The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
- DeprecationWarning
+ DeprecationWarning, stacklevel=_stacklevel
)
if isinstance(name, SoupStrainer):
@@ -897,7 +911,7 @@ class PageElement(object):
:rtype: bool
"""
return getattr(self, '_decomposed', False) or False
-
+
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
@@ -921,16 +935,11 @@ class NavigableString(str, PageElement):
     When Beautiful Soup parses the markup <b>penguin</b>, it will
create a NavigableString for the string "penguin".
- """
+ """
PREFIX = ''
SUFFIX = ''
- # We can't tell just by looking at a string whether it's contained
- # in an XML document or an HTML document.
-
- known_xml = None
-
def __new__(cls, value):
"""Create a new NavigableString.
@@ -946,12 +955,22 @@ class NavigableString(str, PageElement):
u.setup()
return u
- def __copy__(self):
+ def __deepcopy__(self, memo, recursive=False):
"""A copy of a NavigableString has the same contents and class
as the original, but it is not connected to the parse tree.
+
+ :param recursive: This parameter is ignored; it's only defined
+ so that NavigableString.__deepcopy__ implements the same
+ signature as Tag.__deepcopy__.
"""
return type(self)(self)
+ def __copy__(self):
+ """A copy of a NavigableString can only be a deep copy, because
+ only one PageElement can occupy a given place in a parse tree.
+ """
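+        # e.g. copy.copy(tag.string) returns an unattached NavigableString
+        # (its .parent is None) holding the same text.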
+ return self.__deepcopy__({})
+
def __getnewargs__(self):
return (str(self),)
@@ -1044,10 +1063,10 @@ class PreformattedString(NavigableString):
as comments (the Comment class) and CDATA blocks (the CData
class).
"""
-
+
PREFIX = ''
SUFFIX = ''
-
+
def output_ready(self, formatter=None):
"""Make this string ready for output by adding any subclass-specific
prefix or suffix.
@@ -1129,7 +1148,7 @@ class Stylesheet(NavigableString):
"""
pass
-
+
class Script(NavigableString):
"""A NavigableString representing an executable script (probably
Javascript).
@@ -1235,7 +1254,7 @@ class Tag(PageElement):
if ((not builder or builder.store_line_numbers)
and (sourceline is not None or sourcepos is not None)):
self.sourceline = sourceline
- self.sourcepos = sourcepos
+ self.sourcepos = sourcepos
if attrs is None:
attrs = {}
elif attrs:
@@ -1293,25 +1312,60 @@ class Tag(PageElement):
self.interesting_string_types = builder.string_containers[self.name]
else:
self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
-
+
parserClass = _alias("parser_class") # BS3
- def __copy__(self):
- """A copy of a Tag is a new Tag, unconnected to the parse tree.
+ def __deepcopy__(self, memo, recursive=True):
+ """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
Its contents are a copy of the old Tag's contents.
"""
+ clone = self._clone()
+
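+        # Note: the loop below consumes START/END events from
+        # _event_stream() instead of calling __deepcopy__ recursively on
+        # each child, so deeply nested documents clone without exhausting
+        # Python's recursion limit.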
+ if recursive:
+ # Clone this tag's descendants recursively, but without
+ # making any recursive function calls.
+ tag_stack = [clone]
+ for event, element in self._event_stream(self.descendants):
+ if event is Tag.END_ELEMENT_EVENT:
+ # Stop appending incoming Tags to the Tag that was
+ # just closed.
+ tag_stack.pop()
+ else:
+ descendant_clone = element.__deepcopy__(
+ memo, recursive=False
+ )
+ # Add to its parent's .contents
+ tag_stack[-1].append(descendant_clone)
+
+ if event is Tag.START_ELEMENT_EVENT:
+ # Add the Tag itself to the stack so that its
+ # children will be .appended to it.
+ tag_stack.append(descendant_clone)
+ return clone
+
+ def __copy__(self):
+ """A copy of a Tag must always be a deep copy, because a Tag's
+ children can only have one parent at a time.
+ """
+ return self.__deepcopy__({})
+
+ def _clone(self):
+ """Create a new Tag just like this one, but with no
+ contents and unattached to any parse tree.
+
+ This is the first step in the deepcopy process.
+ """
clone = type(self)(
None, self.builder, self.name, self.namespace,
self.prefix, self.attrs, is_xml=self._is_xml,
sourceline=self.sourceline, sourcepos=self.sourcepos,
can_be_empty_element=self.can_be_empty_element,
cdata_list_attributes=self.cdata_list_attributes,
- preserve_whitespace_tags=self.preserve_whitespace_tags
+ preserve_whitespace_tags=self.preserve_whitespace_tags,
+ interesting_string_types=self.interesting_string_types
)
for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr))
- for child in self.contents:
- clone.append(child.__copy__())
return clone
@property
@@ -1417,7 +1471,7 @@ class Tag(PageElement):
i.contents = []
i._decomposed = True
i = n
-
+
def clear(self, decompose=False):
"""Wipe out all children of this PageElement by calling extract()
on them.
@@ -1505,7 +1559,7 @@ class Tag(PageElement):
if not isinstance(value, list):
value = [value]
return value
-
+
def has_attr(self, key):
"""Does this PageElement have an attribute with the given name?"""
return key in self.attrs
@@ -1558,7 +1612,7 @@ class Tag(PageElement):
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
name=tag_name
),
- DeprecationWarning
+ DeprecationWarning, stacklevel=2
)
return self.find(tag_name)
# We special case contents to avoid recursion.
@@ -1592,7 +1646,7 @@ class Tag(PageElement):
def __repr__(self, encoding="unicode-escape"):
"""Renders this PageElement as a string.
- :param encoding: The encoding to use (Python 2 only).
+ :param encoding: The encoding to use (Python 2 only).
TODO: This is now ignored and a warning should be issued
if a value is provided.
:return: A (Unicode) string.
@@ -1634,106 +1688,212 @@ class Tag(PageElement):
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
- formatter="minimal"):
- """Render a Unicode representation of this PageElement and its
- contents.
-
- :param indent_level: Each line of the rendering will be
- indented this many spaces. Used internally in
- recursive calls while pretty-printing.
- :param eventual_encoding: The tag is destined to be
- encoded into this encoding. This method is _not_
- responsible for performing that encoding. This information
- is passed in so that it can be substituted in if the
- document contains a tag that mentions the document's
- encoding.
- :param formatter: A Formatter object, or a string naming one of
- the standard formatters.
- """
-
+ formatter="minimal",
+ iterator=None):
+ pieces = []
# First off, turn a non-Formatter `formatter` into a Formatter
# object. This will stop the lookup from happening over and
# over again.
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
- attributes = formatter.attributes(self)
- attrs = []
- for key, val in attributes:
- if val is None:
- decoded = key
+
+ if indent_level is True:
+ indent_level = 0
+
+ # The currently active tag that put us into string literal
+ # mode. Until this element is closed, children will be treated
+ # as string literals and not pretty-printed. String literal
+ # mode is turned on immediately after this tag begins, and
+ # turned off immediately before it's closed. This means there
+ # will be whitespace before and after the tag itself.
+ string_literal_tag = None
+
+ for event, element in self._event_stream(iterator):
+ if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
+ piece = element._format_tag(
+ eventual_encoding, formatter, opening=True
+ )
+ elif event is Tag.END_ELEMENT_EVENT:
+ piece = element._format_tag(
+ eventual_encoding, formatter, opening=False
+ )
+ if indent_level is not None:
+ indent_level -= 1
else:
- if isinstance(val, list) or isinstance(val, tuple):
- val = ' '.join(val)
- elif not isinstance(val, str):
- val = str(val)
- elif (
- isinstance(val, AttributeValueWithCharsetSubstitution)
- and eventual_encoding is not None
- ):
- val = val.encode(eventual_encoding)
+ piece = element.output_ready(formatter)
- text = formatter.attribute_value(val)
- decoded = (
- str(key) + '='
- + formatter.quoted_attribute_value(text))
- attrs.append(decoded)
- close = ''
- closeTag = ''
+ # Now we need to apply the 'prettiness' -- extra
+ # whitespace before and/or after this tag. This can get
+            # complicated because certain tags, like <pre> and
+ #