mirror of
https://github.com/SickGear/SickGear.git
synced 2024-12-03 18:03:37 +00:00
Merge pull request #978 from JackDandy/feature/UpdateBS
Update Beautiful Soup 4.4.0 (r397) to 4.6.0 (r449).
This commit is contained in:
commit
e69960ba2c
5 changed files with 131 additions and 23 deletions
|
@ -10,7 +10,7 @@
|
|||
* Change improve add show search results by comparing search term to an additional unidecoded result set
|
||||
* Change webserver startup to correctly use xheaders in reverse proxy or load balance set-ups
|
||||
* Update backports_abc 0.4 to 0.5
|
||||
* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439)
|
||||
* Update Beautiful Soup 4.4.0 (r397) to 4.6.0 (r449)
|
||||
* Update cachecontrol library 0.11.5 to 0.12.3 (db54c40)
|
||||
* Update Certifi 2015.11.20.1 (385476b) to 2017.07.27 (f808089)
|
||||
* Update chardet packages 2.3.0 (d7fae98) to 3.0.4 (9b8c5c2)
|
||||
|
@ -93,6 +93,7 @@
|
|||
* Fix "too many redirects" or "no CSS/JS content" delivered
|
||||
* Change restart/shutdown to use updated jQuery
|
||||
* Remove AlphaReign torrent provider
|
||||
* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439)
|
||||
* Update cachecontrol library 0.11.5 to 0.11.7 (3b3b776)
|
||||
* Update Certifi 2015.11.20.1 (385476b) to 2017.01.23 (9f9dc30)
|
||||
* Update Tornado Web Server 4.5.dev1 (92f29b8) to 4.5.dev1 (38e493e)
|
||||
|
|
|
@ -82,7 +82,7 @@ class BeautifulSoup(Tag):
|
|||
|
||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||
|
||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
|
||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
|
||||
|
||||
def __init__(self, markup="", features=None, builder=None,
|
||||
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||
|
|
|
@ -232,8 +232,13 @@ class HTMLTreeBuilder(TreeBuilder):
|
|||
"""
|
||||
|
||||
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
|
||||
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
|
||||
'spacer', 'link', 'frame', 'base'])
|
||||
empty_element_tags = set([
|
||||
# These are from HTML5.
|
||||
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||
|
||||
# These are from HTML4, removed in HTML5.
|
||||
'spacer', 'frame'
|
||||
])
|
||||
|
||||
# The HTML standard defines these attributes as containing a
|
||||
# space-separated list of values, not a single value. That is,
|
||||
|
|
|
@ -52,7 +52,31 @@ from bs4.builder import (
|
|||
HTMLPARSER = 'html.parser'
|
||||
|
||||
class BeautifulSoupHTMLParser(HTMLParser):
|
||||
def handle_starttag(self, name, attrs):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
HTMLParser.__init__(self, *args, **kwargs)
|
||||
|
||||
# Keep a list of empty-element tags that were encountered
|
||||
# without an explicit closing tag. If we encounter a closing tag
|
||||
# of this type, we'll associate it with one of those entries.
|
||||
#
|
||||
# This isn't a stack because we don't care about the
|
||||
# order. It's a list of closing tags we've already handled and
|
||||
# will ignore, assuming they ever show up.
|
||||
self.already_closed_empty_element = []
|
||||
|
||||
def handle_startendtag(self, name, attrs):
|
||||
# This is only called when the markup looks like
|
||||
# <tag/>.
|
||||
|
||||
# is_startend() tells handle_starttag not to close the tag
|
||||
# just because its name matches a known empty-element tag. We
|
||||
# know that this is an empty-element tag and we want to call
|
||||
# handle_endtag ourselves.
|
||||
tag = self.handle_starttag(name, attrs, handle_empty_element=False)
|
||||
self.handle_endtag(name)
|
||||
|
||||
def handle_starttag(self, name, attrs, handle_empty_element=True):
|
||||
# XXX namespace
|
||||
attr_dict = {}
|
||||
for key, value in attrs:
|
||||
|
@ -62,9 +86,33 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
|||
value = ''
|
||||
attr_dict[key] = value
|
||||
attrvalue = '""'
|
||||
self.soup.handle_starttag(name, None, None, attr_dict)
|
||||
#print "START", name
|
||||
tag = self.soup.handle_starttag(name, None, None, attr_dict)
|
||||
if tag and tag.is_empty_element and handle_empty_element:
|
||||
# Unlike other parsers, html.parser doesn't send separate end tag
|
||||
# events for empty-element tags. (It's handled in
|
||||
# handle_startendtag, but only if the original markup looked like
|
||||
# <tag/>.)
|
||||
#
|
||||
# So we need to call handle_endtag() ourselves. Since we
|
||||
# know the start event is identical to the end event, we
|
||||
# don't want handle_endtag() to cross off any previous end
|
||||
# events for tags of this name.
|
||||
self.handle_endtag(name, check_already_closed=False)
|
||||
|
||||
def handle_endtag(self, name):
|
||||
# But we might encounter an explicit closing tag for this tag
|
||||
# later on. If so, we want to ignore it.
|
||||
self.already_closed_empty_element.append(name)
|
||||
|
||||
def handle_endtag(self, name, check_already_closed=True):
|
||||
#print "END", name
|
||||
if check_already_closed and name in self.already_closed_empty_element:
|
||||
# This is a redundant end tag for an empty-element tag.
|
||||
# We've already called handle_endtag() for it, so just
|
||||
# check it off the list.
|
||||
# print "ALREADY CLOSED", name
|
||||
self.already_closed_empty_element.remove(name)
|
||||
else:
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def handle_data(self, data):
|
||||
|
@ -169,6 +217,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
|||
warnings.warn(RuntimeWarning(
|
||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||
raise e
|
||||
parser.already_closed_empty_element = []
|
||||
|
||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
||||
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
||||
|
|
|
@ -535,9 +535,16 @@ class PageElement(object):
|
|||
return ResultSet(strainer, result)
|
||||
elif isinstance(name, basestring):
|
||||
# Optimization to find all tags with a given name.
|
||||
if name.count(':') == 1:
|
||||
# This is a name with a prefix.
|
||||
prefix, name = name.split(':', 1)
|
||||
else:
|
||||
prefix = None
|
||||
result = (element for element in generator
|
||||
if isinstance(element, Tag)
|
||||
and element.name == name)
|
||||
and element.name == name
|
||||
and (prefix is None or element.prefix == prefix)
|
||||
)
|
||||
return ResultSet(strainer, result)
|
||||
results = ResultSet(strainer)
|
||||
while True:
|
||||
|
@ -863,7 +870,7 @@ class Tag(PageElement):
|
|||
Its contents are a copy of the old Tag's contents.
|
||||
"""
|
||||
clone = type(self)(None, self.builder, self.name, self.namespace,
|
||||
self.nsprefix, self.attrs, is_xml=self._is_xml)
|
||||
self.prefix, self.attrs, is_xml=self._is_xml)
|
||||
for attr in ('can_be_empty_element', 'hidden'):
|
||||
setattr(clone, attr, getattr(self, attr))
|
||||
for child in self.contents:
|
||||
|
@ -985,6 +992,13 @@ class Tag(PageElement):
|
|||
attribute."""
|
||||
return self.attrs.get(key, default)
|
||||
|
||||
def get_attribute_list(self, key, default=None):
|
||||
"""The same as get(), but always returns a list."""
|
||||
value = self.get(key, default)
|
||||
if not isinstance(value, list):
|
||||
value = [value]
|
||||
return value
|
||||
|
||||
def has_attr(self, key):
|
||||
return key in self.attrs
|
||||
|
||||
|
@ -1698,7 +1712,7 @@ class SoupStrainer(object):
|
|||
"I don't know how to match against a %s" % markup.__class__)
|
||||
return found
|
||||
|
||||
def _matches(self, markup, match_against):
|
||||
def _matches(self, markup, match_against, already_tried=None):
|
||||
# print u"Matching %s against %s" % (markup, match_against)
|
||||
result = False
|
||||
if isinstance(markup, list) or isinstance(markup, tuple):
|
||||
|
@ -1723,6 +1737,7 @@ class SoupStrainer(object):
|
|||
|
||||
# Custom callables take the tag as an argument, but all
|
||||
# other ways of matching match the tag name as a string.
|
||||
original_markup = markup
|
||||
if isinstance(markup, Tag):
|
||||
markup = markup.name
|
||||
|
||||
|
@ -1733,18 +1748,51 @@ class SoupStrainer(object):
|
|||
# None matches None, False, an empty string, an empty list, and so on.
|
||||
return not match_against
|
||||
|
||||
if isinstance(match_against, unicode):
|
||||
# Exact string match
|
||||
return markup == match_against
|
||||
if (hasattr(match_against, '__iter__')
|
||||
and not isinstance(match_against, basestring)):
|
||||
# We're asked to match against an iterable of items.
|
||||
# The markup must be match at least one item in the
|
||||
# iterable. We'll try each one in turn.
|
||||
#
|
||||
# To avoid infinite recursion we need to keep track of
|
||||
# items we've already seen.
|
||||
if not already_tried:
|
||||
already_tried = set()
|
||||
for item in match_against:
|
||||
if item.__hash__:
|
||||
key = item
|
||||
else:
|
||||
key = id(item)
|
||||
if key in already_tried:
|
||||
continue
|
||||
else:
|
||||
already_tried.add(key)
|
||||
if self._matches(original_markup, item, already_tried):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
if hasattr(match_against, 'match'):
|
||||
# Beyond this point we might need to run the test twice: once against
|
||||
# the tag's name and once against its prefixed name.
|
||||
match = False
|
||||
|
||||
if not match and isinstance(match_against, unicode):
|
||||
# Exact string match
|
||||
match = markup == match_against
|
||||
|
||||
if not match and hasattr(match_against, 'search'):
|
||||
# Regexp match
|
||||
return match_against.search(markup)
|
||||
|
||||
if hasattr(match_against, '__iter__'):
|
||||
# The markup must be an exact match against something
|
||||
# in the iterable.
|
||||
return markup in match_against
|
||||
if (not match
|
||||
and isinstance(original_markup, Tag)
|
||||
and original_markup.prefix):
|
||||
# Try the whole thing again with the prefixed tag name.
|
||||
return self._matches(
|
||||
original_markup.prefix + ':' + original_markup.name, match_against
|
||||
)
|
||||
|
||||
return match
|
||||
|
||||
|
||||
class ResultSet(list):
|
||||
|
@ -1753,3 +1801,8 @@ class ResultSet(list):
|
|||
def __init__(self, source, result=()):
|
||||
super(ResultSet, self).__init__(result)
|
||||
self.source = source
|
||||
|
||||
def __getattr__(self, key):
|
||||
raise AttributeError(
|
||||
"ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
|
||||
)
|
||||
|
|
Loading…
Reference in a new issue