mirror of
https://github.com/SickGear/SickGear.git
synced 2025-01-22 09:33:37 +00:00
Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439).
This commit is contained in:
parent
9b497e6df2
commit
447c7e0fa8
9 changed files with 301 additions and 93 deletions
|
@ -9,6 +9,7 @@
|
||||||
* Change improve add show search results by comparing search term to an additional unidecoded result set
|
* Change improve add show search results by comparing search term to an additional unidecoded result set
|
||||||
* Change webserver startup to correctly use xheaders in reverse proxy or load balance set-ups
|
* Change webserver startup to correctly use xheaders in reverse proxy or load balance set-ups
|
||||||
* Update backports_abc 0.4 to 0.5
|
* Update backports_abc 0.4 to 0.5
|
||||||
|
* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439)
|
||||||
|
|
||||||
|
|
||||||
[develop changelog]
|
[develop changelog]
|
||||||
|
|
|
@ -5,26 +5,31 @@ http://www.crummy.com/software/BeautifulSoup/
|
||||||
|
|
||||||
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||||
(possibly invalid) document into a tree representation. Beautiful Soup
|
(possibly invalid) document into a tree representation. Beautiful Soup
|
||||||
provides provides methods and Pythonic idioms that make it easy to
|
provides methods and Pythonic idioms that make it easy to navigate,
|
||||||
navigate, search, and modify the parse tree.
|
search, and modify the parse tree.
|
||||||
|
|
||||||
Beautiful Soup works with Python 2.6 and up. It works better if lxml
|
Beautiful Soup works with Python 2.7 and up. It works better if lxml
|
||||||
and/or html5lib is installed.
|
and/or html5lib is installed.
|
||||||
|
|
||||||
For more than you ever wanted to know about Beautiful Soup, see the
|
For more than you ever wanted to know about Beautiful Soup, see the
|
||||||
documentation:
|
documentation:
|
||||||
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
|
# found in the LICENSE file.
|
||||||
|
|
||||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
__version__ = "4.4.0"
|
__version__ = "4.5.3"
|
||||||
__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
|
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = ['BeautifulSoup']
|
__all__ = ['BeautifulSoup']
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import traceback
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .builder import builder_registry, ParserRejectedMarkup
|
from .builder import builder_registry, ParserRejectedMarkup
|
||||||
|
@ -77,7 +82,7 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||||
|
|
||||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
|
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
|
||||||
|
|
||||||
def __init__(self, markup="", features=None, builder=None,
|
def __init__(self, markup="", features=None, builder=None,
|
||||||
parse_only=None, from_encoding=None, exclude_encodings=None,
|
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||||
|
@ -137,6 +142,10 @@ class BeautifulSoup(Tag):
|
||||||
from_encoding = from_encoding or deprecated_argument(
|
from_encoding = from_encoding or deprecated_argument(
|
||||||
"fromEncoding", "from_encoding")
|
"fromEncoding", "from_encoding")
|
||||||
|
|
||||||
|
if from_encoding and isinstance(markup, unicode):
|
||||||
|
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
|
||||||
|
from_encoding = None
|
||||||
|
|
||||||
if len(kwargs) > 0:
|
if len(kwargs) > 0:
|
||||||
arg = kwargs.keys().pop()
|
arg = kwargs.keys().pop()
|
||||||
raise TypeError(
|
raise TypeError(
|
||||||
|
@ -161,19 +170,29 @@ class BeautifulSoup(Tag):
|
||||||
markup_type = "XML"
|
markup_type = "XML"
|
||||||
else:
|
else:
|
||||||
markup_type = "HTML"
|
markup_type = "HTML"
|
||||||
|
|
||||||
|
caller = traceback.extract_stack()[0]
|
||||||
|
filename = caller[0]
|
||||||
|
line_number = caller[1]
|
||||||
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
|
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
|
||||||
|
filename=filename,
|
||||||
|
line_number=line_number,
|
||||||
parser=builder.NAME,
|
parser=builder.NAME,
|
||||||
markup_type=markup_type))
|
markup_type=markup_type))
|
||||||
|
|
||||||
self.builder = builder
|
self.builder = builder
|
||||||
self.is_xml = builder.is_xml
|
self.is_xml = builder.is_xml
|
||||||
|
self.known_xml = self.is_xml
|
||||||
self.builder.soup = self
|
self.builder.soup = self
|
||||||
|
|
||||||
self.parse_only = parse_only
|
self.parse_only = parse_only
|
||||||
|
|
||||||
if hasattr(markup, 'read'): # It's a file-type object.
|
if hasattr(markup, 'read'): # It's a file-type object.
|
||||||
markup = markup.read()
|
markup = markup.read()
|
||||||
elif len(markup) <= 256:
|
elif len(markup) <= 256 and (
|
||||||
|
(isinstance(markup, bytes) and not b'<' in markup)
|
||||||
|
or (isinstance(markup, unicode) and not u'<' in markup)
|
||||||
|
):
|
||||||
# Print out warnings for a couple beginner problems
|
# Print out warnings for a couple beginner problems
|
||||||
# involving passing non-markup to Beautiful Soup.
|
# involving passing non-markup to Beautiful Soup.
|
||||||
# Beautiful Soup will still parse the input as markup,
|
# Beautiful Soup will still parse the input as markup,
|
||||||
|
@ -195,16 +214,10 @@ class BeautifulSoup(Tag):
|
||||||
if isinstance(markup, unicode):
|
if isinstance(markup, unicode):
|
||||||
markup = markup.encode("utf8")
|
markup = markup.encode("utf8")
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
|
'"%s" looks like a filename, not markup. You should'
|
||||||
if markup[:5] == "http:" or markup[:6] == "https:":
|
'probably open this file and pass the filehandle into'
|
||||||
# TODO: This is ugly but I couldn't get it to work in
|
'Beautiful Soup.' % markup)
|
||||||
# Python 3 otherwise.
|
self._check_markup_is_url(markup)
|
||||||
if ((isinstance(markup, bytes) and not b' ' in markup)
|
|
||||||
or (isinstance(markup, unicode) and not u' ' in markup)):
|
|
||||||
if isinstance(markup, unicode):
|
|
||||||
markup = markup.encode("utf8")
|
|
||||||
warnings.warn(
|
|
||||||
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
|
|
||||||
|
|
||||||
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
||||||
self.contains_replacement_characters) in (
|
self.contains_replacement_characters) in (
|
||||||
|
@ -223,15 +236,52 @@ class BeautifulSoup(Tag):
|
||||||
self.builder.soup = None
|
self.builder.soup = None
|
||||||
|
|
||||||
def __copy__(self):
|
def __copy__(self):
|
||||||
return type(self)(self.encode(), builder=self.builder)
|
copy = type(self)(
|
||||||
|
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Although we encoded the tree to UTF-8, that may not have
|
||||||
|
# been the encoding of the original markup. Set the copy's
|
||||||
|
# .original_encoding to reflect the original object's
|
||||||
|
# .original_encoding.
|
||||||
|
copy.original_encoding = self.original_encoding
|
||||||
|
return copy
|
||||||
|
|
||||||
def __getstate__(self):
|
def __getstate__(self):
|
||||||
# Frequently a tree builder can't be pickled.
|
# Frequently a tree builder can't be pickled.
|
||||||
d = dict(self.__dict__)
|
d = dict(self.__dict__)
|
||||||
if 'builder' in d and not self.builder.picklable:
|
if 'builder' in d and not self.builder.picklable:
|
||||||
del d['builder']
|
d['builder'] = None
|
||||||
return d
|
return d
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _check_markup_is_url(markup):
|
||||||
|
"""
|
||||||
|
Check if markup looks like it's actually a url and raise a warning
|
||||||
|
if so. Markup can be unicode or str (py2) / bytes (py3).
|
||||||
|
"""
|
||||||
|
if isinstance(markup, bytes):
|
||||||
|
space = b' '
|
||||||
|
cant_start_with = (b"http:", b"https:")
|
||||||
|
elif isinstance(markup, unicode):
|
||||||
|
space = u' '
|
||||||
|
cant_start_with = (u"http:", u"https:")
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
|
||||||
|
if any(markup.startswith(prefix) for prefix in cant_start_with):
|
||||||
|
if not space in markup:
|
||||||
|
if isinstance(markup, bytes):
|
||||||
|
decoded_markup = markup.decode('utf-8', 'replace')
|
||||||
|
else:
|
||||||
|
decoded_markup = markup
|
||||||
|
warnings.warn(
|
||||||
|
'"%s" looks like a URL. Beautiful Soup is not an'
|
||||||
|
' HTTP client. You should probably use an HTTP client like'
|
||||||
|
' requests to get the document behind the URL, and feed'
|
||||||
|
' that document to Beautiful Soup.' % decoded_markup
|
||||||
|
)
|
||||||
|
|
||||||
def _feed(self):
|
def _feed(self):
|
||||||
# Convert the document to Unicode.
|
# Convert the document to Unicode.
|
||||||
self.builder.reset()
|
self.builder.reset()
|
||||||
|
@ -335,7 +385,18 @@ class BeautifulSoup(Tag):
|
||||||
if parent.next_sibling:
|
if parent.next_sibling:
|
||||||
# This node is being inserted into an element that has
|
# This node is being inserted into an element that has
|
||||||
# already been parsed. Deal with any dangling references.
|
# already been parsed. Deal with any dangling references.
|
||||||
index = parent.contents.index(o)
|
index = len(parent.contents)-1
|
||||||
|
while index >= 0:
|
||||||
|
if parent.contents[index] is o:
|
||||||
|
break
|
||||||
|
index -= 1
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
"Error building tree: supposedly %r was inserted "
|
||||||
|
"into %r after the fact, but I don't see it!" % (
|
||||||
|
o, parent
|
||||||
|
)
|
||||||
|
)
|
||||||
if index == 0:
|
if index == 0:
|
||||||
previous_element = parent
|
previous_element = parent
|
||||||
previous_sibling = None
|
previous_sibling = None
|
||||||
|
@ -387,7 +448,7 @@ class BeautifulSoup(Tag):
|
||||||
"""Push a start tag on to the stack.
|
"""Push a start tag on to the stack.
|
||||||
|
|
||||||
If this method returns None, the tag was rejected by the
|
If this method returns None, the tag was rejected by the
|
||||||
SoupStrainer. You should proceed as if the tag had not occured
|
SoupStrainer. You should proceed as if the tag had not occurred
|
||||||
in the document. For instance, if this was a self-closing tag,
|
in the document. For instance, if this was a self-closing tag,
|
||||||
don't call handle_endtag.
|
don't call handle_endtag.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,9 +1,13 @@
|
||||||
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
|
# found in the LICENSE file.
|
||||||
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import itertools
|
import itertools
|
||||||
import sys
|
import sys
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
CharsetMetaAttributeValue,
|
CharsetMetaAttributeValue,
|
||||||
ContentMetaAttributeValue,
|
ContentMetaAttributeValue,
|
||||||
|
HTMLAwareEntitySubstitution,
|
||||||
whitespace_re
|
whitespace_re
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -227,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
Such as which tags are empty-element tags.
|
Such as which tags are empty-element tags.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
preserve_whitespace_tags = set(['pre', 'textarea'])
|
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
|
||||||
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
|
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
|
||||||
'spacer', 'link', 'frame', 'base'])
|
'spacer', 'link', 'frame', 'base'])
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
|
# found in the LICENSE file.
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'HTML5TreeBuilder',
|
'HTML5TreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
from pdb import set_trace
|
|
||||||
import warnings
|
import warnings
|
||||||
|
import re
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
PERMISSIVE,
|
PERMISSIVE,
|
||||||
HTML,
|
HTML,
|
||||||
|
@ -15,7 +18,10 @@ from bs4.element import (
|
||||||
whitespace_re,
|
whitespace_re,
|
||||||
)
|
)
|
||||||
import html5lib
|
import html5lib
|
||||||
from html5lib.constants import namespaces
|
from html5lib.constants import (
|
||||||
|
namespaces,
|
||||||
|
prefixes,
|
||||||
|
)
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
Comment,
|
Comment,
|
||||||
Doctype,
|
Doctype,
|
||||||
|
@ -23,6 +29,15 @@ from bs4.element import (
|
||||||
Tag,
|
Tag,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Pre-0.99999999
|
||||||
|
from html5lib.treebuilders import _base as treebuilder_base
|
||||||
|
new_html5lib = False
|
||||||
|
except ImportError, e:
|
||||||
|
# 0.99999999 and up
|
||||||
|
from html5lib.treebuilders import base as treebuilder_base
|
||||||
|
new_html5lib = True
|
||||||
|
|
||||||
class HTML5TreeBuilder(HTMLTreeBuilder):
|
class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
"""Use html5lib to build a tree."""
|
"""Use html5lib to build a tree."""
|
||||||
|
|
||||||
|
@ -47,7 +62,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
if self.soup.parse_only is not None:
|
if self.soup.parse_only is not None:
|
||||||
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
|
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
|
||||||
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
||||||
doc = parser.parse(markup, encoding=self.user_specified_encoding)
|
|
||||||
|
extra_kwargs = dict()
|
||||||
|
if not isinstance(markup, unicode):
|
||||||
|
if new_html5lib:
|
||||||
|
extra_kwargs['override_encoding'] = self.user_specified_encoding
|
||||||
|
else:
|
||||||
|
extra_kwargs['encoding'] = self.user_specified_encoding
|
||||||
|
doc = parser.parse(markup, **extra_kwargs)
|
||||||
|
|
||||||
# Set the character encoding detected by the tokenizer.
|
# Set the character encoding detected by the tokenizer.
|
||||||
if isinstance(markup, unicode):
|
if isinstance(markup, unicode):
|
||||||
|
@ -55,11 +77,17 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
# charEncoding to UTF-8 if it gets Unicode input.
|
# charEncoding to UTF-8 if it gets Unicode input.
|
||||||
doc.original_encoding = None
|
doc.original_encoding = None
|
||||||
else:
|
else:
|
||||||
doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
|
original_encoding = parser.tokenizer.stream.charEncoding[0]
|
||||||
|
if not isinstance(original_encoding, basestring):
|
||||||
|
# In 0.99999999 and up, the encoding is an html5lib
|
||||||
|
# Encoding object. We want to use a string for compatibility
|
||||||
|
# with other tree builders.
|
||||||
|
original_encoding = original_encoding.name
|
||||||
|
doc.original_encoding = original_encoding
|
||||||
|
|
||||||
def create_treebuilder(self, namespaceHTMLElements):
|
def create_treebuilder(self, namespaceHTMLElements):
|
||||||
self.underlying_builder = TreeBuilderForHtml5lib(
|
self.underlying_builder = TreeBuilderForHtml5lib(
|
||||||
self.soup, namespaceHTMLElements)
|
namespaceHTMLElements, self.soup)
|
||||||
return self.underlying_builder
|
return self.underlying_builder
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
|
@ -67,10 +95,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
return u'<html><head></head><body>%s</body></html>' % fragment
|
return u'<html><head></head><body>%s</body></html>' % fragment
|
||||||
|
|
||||||
|
|
||||||
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
|
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
||||||
|
|
||||||
def __init__(self, soup, namespaceHTMLElements):
|
def __init__(self, namespaceHTMLElements, soup=None):
|
||||||
self.soup = soup
|
if soup:
|
||||||
|
self.soup = soup
|
||||||
|
else:
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
self.soup = BeautifulSoup("", "html.parser")
|
||||||
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
||||||
|
|
||||||
def documentClass(self):
|
def documentClass(self):
|
||||||
|
@ -93,7 +125,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
|
||||||
return TextNode(Comment(data), self.soup)
|
return TextNode(Comment(data), self.soup)
|
||||||
|
|
||||||
def fragmentClass(self):
|
def fragmentClass(self):
|
||||||
self.soup = BeautifulSoup("")
|
from bs4 import BeautifulSoup
|
||||||
|
self.soup = BeautifulSoup("", "html.parser")
|
||||||
self.soup.name = "[document_fragment]"
|
self.soup.name = "[document_fragment]"
|
||||||
return Element(self.soup, self.soup, None)
|
return Element(self.soup, self.soup, None)
|
||||||
|
|
||||||
|
@ -105,7 +138,57 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
|
||||||
return self.soup
|
return self.soup
|
||||||
|
|
||||||
def getFragment(self):
|
def getFragment(self):
|
||||||
return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
|
return treebuilder_base.TreeBuilder.getFragment(self).element
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
rv = []
|
||||||
|
doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
|
||||||
|
|
||||||
|
def serializeElement(element, indent=0):
|
||||||
|
if isinstance(element, BeautifulSoup):
|
||||||
|
pass
|
||||||
|
if isinstance(element, Doctype):
|
||||||
|
m = doctype_re.match(element)
|
||||||
|
if m:
|
||||||
|
name = m.group(1)
|
||||||
|
if m.lastindex > 1:
|
||||||
|
publicId = m.group(2) or ""
|
||||||
|
systemId = m.group(3) or m.group(4) or ""
|
||||||
|
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
|
||||||
|
(' ' * indent, name, publicId, systemId))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
|
||||||
|
elif isinstance(element, Comment):
|
||||||
|
rv.append("|%s<!-- %s -->" % (' ' * indent, element))
|
||||||
|
elif isinstance(element, NavigableString):
|
||||||
|
rv.append("|%s\"%s\"" % (' ' * indent, element))
|
||||||
|
else:
|
||||||
|
if element.namespace:
|
||||||
|
name = "%s %s" % (prefixes[element.namespace],
|
||||||
|
element.name)
|
||||||
|
else:
|
||||||
|
name = element.name
|
||||||
|
rv.append("|%s<%s>" % (' ' * indent, name))
|
||||||
|
if element.attrs:
|
||||||
|
attributes = []
|
||||||
|
for name, value in element.attrs.items():
|
||||||
|
if isinstance(name, NamespacedAttribute):
|
||||||
|
name = "%s %s" % (prefixes[name.namespace], name.name)
|
||||||
|
if isinstance(value, list):
|
||||||
|
value = " ".join(value)
|
||||||
|
attributes.append((name, value))
|
||||||
|
|
||||||
|
for name, value in sorted(attributes):
|
||||||
|
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
||||||
|
indent += 2
|
||||||
|
for child in element.children:
|
||||||
|
serializeElement(child, indent)
|
||||||
|
serializeElement(element, 0)
|
||||||
|
|
||||||
|
return "\n".join(rv)
|
||||||
|
|
||||||
class AttrList(object):
|
class AttrList(object):
|
||||||
def __init__(self, element):
|
def __init__(self, element):
|
||||||
|
@ -137,9 +220,9 @@ class AttrList(object):
|
||||||
return name in list(self.attrs.keys())
|
return name in list(self.attrs.keys())
|
||||||
|
|
||||||
|
|
||||||
class Element(html5lib.treebuilders._base.Node):
|
class Element(treebuilder_base.Node):
|
||||||
def __init__(self, element, soup, namespace):
|
def __init__(self, element, soup, namespace):
|
||||||
html5lib.treebuilders._base.Node.__init__(self, element.name)
|
treebuilder_base.Node.__init__(self, element.name)
|
||||||
self.element = element
|
self.element = element
|
||||||
self.soup = soup
|
self.soup = soup
|
||||||
self.namespace = namespace
|
self.namespace = namespace
|
||||||
|
@ -158,8 +241,10 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
child = node
|
child = node
|
||||||
elif node.element.__class__ == NavigableString:
|
elif node.element.__class__ == NavigableString:
|
||||||
string_child = child = node.element
|
string_child = child = node.element
|
||||||
|
node.parent = self
|
||||||
else:
|
else:
|
||||||
child = node.element
|
child = node.element
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
if not isinstance(child, basestring) and child.parent is not None:
|
if not isinstance(child, basestring) and child.parent is not None:
|
||||||
node.element.extract()
|
node.element.extract()
|
||||||
|
@ -197,6 +282,8 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
most_recent_element=most_recent_element)
|
most_recent_element=most_recent_element)
|
||||||
|
|
||||||
def getAttributes(self):
|
def getAttributes(self):
|
||||||
|
if isinstance(self.element, Comment):
|
||||||
|
return {}
|
||||||
return AttrList(self.element)
|
return AttrList(self.element)
|
||||||
|
|
||||||
def setAttributes(self, attributes):
|
def setAttributes(self, attributes):
|
||||||
|
@ -224,11 +311,11 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
attributes = property(getAttributes, setAttributes)
|
attributes = property(getAttributes, setAttributes)
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
def insertText(self, data, insertBefore=None):
|
||||||
|
text = TextNode(self.soup.new_string(data), self.soup)
|
||||||
if insertBefore:
|
if insertBefore:
|
||||||
text = TextNode(self.soup.new_string(data), self.soup)
|
self.insertBefore(text, insertBefore)
|
||||||
self.insertBefore(data, insertBefore)
|
|
||||||
else:
|
else:
|
||||||
self.appendChild(data)
|
self.appendChild(text)
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
def insertBefore(self, node, refNode):
|
||||||
index = self.element.index(refNode.element)
|
index = self.element.index(refNode.element)
|
||||||
|
@ -250,6 +337,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
# print "MOVE", self.element.contents
|
# print "MOVE", self.element.contents
|
||||||
# print "FROM", self.element
|
# print "FROM", self.element
|
||||||
# print "TO", new_parent.element
|
# print "TO", new_parent.element
|
||||||
|
|
||||||
element = self.element
|
element = self.element
|
||||||
new_parent_element = new_parent.element
|
new_parent_element = new_parent.element
|
||||||
# Determine what this tag's next_element will be once all the children
|
# Determine what this tag's next_element will be once all the children
|
||||||
|
@ -268,7 +356,6 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
new_parents_last_descendant_next_element = new_parent_element.next_element
|
new_parents_last_descendant_next_element = new_parent_element.next_element
|
||||||
|
|
||||||
to_append = element.contents
|
to_append = element.contents
|
||||||
append_after = new_parent_element.contents
|
|
||||||
if len(to_append) > 0:
|
if len(to_append) > 0:
|
||||||
# Set the first child's previous_element and previous_sibling
|
# Set the first child's previous_element and previous_sibling
|
||||||
# to elements within the new parent
|
# to elements within the new parent
|
||||||
|
@ -285,12 +372,19 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
if new_parents_last_child:
|
if new_parents_last_child:
|
||||||
new_parents_last_child.next_sibling = first_child
|
new_parents_last_child.next_sibling = first_child
|
||||||
|
|
||||||
# Fix the last child's next_element and next_sibling
|
# Find the very last element being moved. It is now the
|
||||||
last_child = to_append[-1]
|
# parent's last descendant. It has no .next_sibling and
|
||||||
last_child.next_element = new_parents_last_descendant_next_element
|
# its .next_element is whatever the previous last
|
||||||
|
# descendant had.
|
||||||
|
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
|
||||||
|
|
||||||
|
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
|
||||||
if new_parents_last_descendant_next_element:
|
if new_parents_last_descendant_next_element:
|
||||||
new_parents_last_descendant_next_element.previous_element = last_child
|
# TODO: This code has no test coverage and I'm not sure
|
||||||
last_child.next_sibling = None
|
# how to get html5lib to go through this path, but it's
|
||||||
|
# just the other side of the previous line.
|
||||||
|
new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
|
||||||
|
last_childs_last_descendant.next_sibling = None
|
||||||
|
|
||||||
for child in to_append:
|
for child in to_append:
|
||||||
child.parent = new_parent_element
|
child.parent = new_parent_element
|
||||||
|
@ -324,7 +418,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
|
|
||||||
class TextNode(Element):
|
class TextNode(Element):
|
||||||
def __init__(self, element, soup):
|
def __init__(self, element, soup):
|
||||||
html5lib.treebuilders._base.Node.__init__(self, None)
|
treebuilder_base.Node.__init__(self, None)
|
||||||
self.element = element
|
self.element = element
|
||||||
self.soup = soup
|
self.soup = soup
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||||
|
|
||||||
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
|
# found in the LICENSE file.
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'HTMLParserTreeBuilder',
|
'HTMLParserTreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
|
# found in the LICENSE file.
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'LXMLTreeBuilderForXML',
|
'LXMLTreeBuilderForXML',
|
||||||
'LXMLTreeBuilder',
|
'LXMLTreeBuilder',
|
||||||
|
@ -12,6 +14,7 @@ from bs4.element import (
|
||||||
Doctype,
|
Doctype,
|
||||||
NamespacedAttribute,
|
NamespacedAttribute,
|
||||||
ProcessingInstruction,
|
ProcessingInstruction,
|
||||||
|
XMLProcessingInstruction,
|
||||||
)
|
)
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
FAST,
|
FAST,
|
||||||
|
@ -29,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
DEFAULT_PARSER_CLASS = etree.XMLParser
|
DEFAULT_PARSER_CLASS = etree.XMLParser
|
||||||
|
|
||||||
is_xml = True
|
is_xml = True
|
||||||
|
processing_instruction_class = XMLProcessingInstruction
|
||||||
|
|
||||||
NAME = "lxml-xml"
|
NAME = "lxml-xml"
|
||||||
ALTERNATE_NAMES = ["xml"]
|
ALTERNATE_NAMES = ["xml"]
|
||||||
|
@ -87,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
|
|
||||||
Each 4-tuple represents a strategy for parsing the document.
|
Each 4-tuple represents a strategy for parsing the document.
|
||||||
"""
|
"""
|
||||||
|
# Instead of using UnicodeDammit to convert the bytestring to
|
||||||
|
# Unicode using different encodings, use EncodingDetector to
|
||||||
|
# iterate over the encodings, and tell lxml to try to parse
|
||||||
|
# the document as each one in turn.
|
||||||
|
is_html = not self.is_xml
|
||||||
|
if is_html:
|
||||||
|
self.processing_instruction_class = ProcessingInstruction
|
||||||
|
else:
|
||||||
|
self.processing_instruction_class = XMLProcessingInstruction
|
||||||
|
|
||||||
if isinstance(markup, unicode):
|
if isinstance(markup, unicode):
|
||||||
# We were given Unicode. Maybe lxml can parse Unicode on
|
# We were given Unicode. Maybe lxml can parse Unicode on
|
||||||
# this system?
|
# this system?
|
||||||
|
@ -98,11 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
yield (markup.encode("utf8"), "utf8",
|
yield (markup.encode("utf8"), "utf8",
|
||||||
document_declared_encoding, False)
|
document_declared_encoding, False)
|
||||||
|
|
||||||
# Instead of using UnicodeDammit to convert the bytestring to
|
|
||||||
# Unicode using different encodings, use EncodingDetector to
|
|
||||||
# iterate over the encodings, and tell lxml to try to parse
|
|
||||||
# the document as each one in turn.
|
|
||||||
is_html = not self.is_xml
|
|
||||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||||
detector = EncodingDetector(
|
detector = EncodingDetector(
|
||||||
markup, try_encodings, is_html, exclude_encodings)
|
markup, try_encodings, is_html, exclude_encodings)
|
||||||
|
@ -201,7 +210,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
def pi(self, target, data):
|
def pi(self, target, data):
|
||||||
self.soup.endData()
|
self.soup.endData()
|
||||||
self.soup.handle_data(target + ' ' + data)
|
self.soup.handle_data(target + ' ' + data)
|
||||||
self.soup.endData(ProcessingInstruction)
|
self.soup.endData(self.processing_instruction_class)
|
||||||
|
|
||||||
def data(self, content):
|
def data(self, content):
|
||||||
self.soup.handle_data(content)
|
self.soup.handle_data(content)
|
||||||
|
@ -229,6 +238,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||||
|
|
||||||
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
|
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
|
||||||
is_xml = False
|
is_xml = False
|
||||||
|
processing_instruction_class = ProcessingInstruction
|
||||||
|
|
||||||
def default_parser(self, encoding):
|
def default_parser(self, encoding):
|
||||||
return etree.HTMLParser
|
return etree.HTMLParser
|
||||||
|
|
|
@ -6,9 +6,10 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
|
||||||
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
||||||
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||||||
"""
|
"""
|
||||||
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
|
# found in the LICENSE file.
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
from pdb import set_trace
|
|
||||||
import codecs
|
import codecs
|
||||||
from htmlentitydefs import codepoint2name
|
from htmlentitydefs import codepoint2name
|
||||||
import re
|
import re
|
||||||
|
@ -309,7 +310,7 @@ class EncodingDetector:
|
||||||
else:
|
else:
|
||||||
xml_endpos = 1024
|
xml_endpos = 1024
|
||||||
html_endpos = max(2048, int(len(markup) * 0.05))
|
html_endpos = max(2048, int(len(markup) * 0.05))
|
||||||
|
|
||||||
declared_encoding = None
|
declared_encoding = None
|
||||||
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
|
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
|
||||||
if not declared_encoding_match and is_html:
|
if not declared_encoding_match and is_html:
|
||||||
|
@ -346,7 +347,7 @@ class UnicodeDammit:
|
||||||
self.tried_encodings = []
|
self.tried_encodings = []
|
||||||
self.contains_replacement_characters = False
|
self.contains_replacement_characters = False
|
||||||
self.is_html = is_html
|
self.is_html = is_html
|
||||||
|
self.log = logging.getLogger(__name__)
|
||||||
self.detector = EncodingDetector(
|
self.detector = EncodingDetector(
|
||||||
markup, override_encodings, is_html, exclude_encodings)
|
markup, override_encodings, is_html, exclude_encodings)
|
||||||
|
|
||||||
|
@ -376,9 +377,10 @@ class UnicodeDammit:
|
||||||
if encoding != "ascii":
|
if encoding != "ascii":
|
||||||
u = self._convert_from(encoding, "replace")
|
u = self._convert_from(encoding, "replace")
|
||||||
if u is not None:
|
if u is not None:
|
||||||
logging.warning(
|
self.log.warning(
|
||||||
"Some characters could not be decoded, and were "
|
"Some characters could not be decoded, and were "
|
||||||
"replaced with REPLACEMENT CHARACTER.")
|
"replaced with REPLACEMENT CHARACTER."
|
||||||
|
)
|
||||||
self.contains_replacement_characters = True
|
self.contains_replacement_characters = True
|
||||||
break
|
break
|
||||||
|
|
||||||
|
@ -734,7 +736,7 @@ class UnicodeDammit:
|
||||||
0xde : b'\xc3\x9e', # Þ
|
0xde : b'\xc3\x9e', # Þ
|
||||||
0xdf : b'\xc3\x9f', # ß
|
0xdf : b'\xc3\x9f', # ß
|
||||||
0xe0 : b'\xc3\xa0', # à
|
0xe0 : b'\xc3\xa0', # à
|
||||||
0xe1 : b'\xa1', # á
|
0xe1 : b'\xa1', # á
|
||||||
0xe2 : b'\xc3\xa2', # â
|
0xe2 : b'\xc3\xa2', # â
|
||||||
0xe3 : b'\xc3\xa3', # ã
|
0xe3 : b'\xc3\xa3', # ã
|
||||||
0xe4 : b'\xc3\xa4', # ä
|
0xe4 : b'\xc3\xa4', # ä
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
"""Diagnostic functions, mainly for use when doing tech support."""
|
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||||
|
|
||||||
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
|
# found in the LICENSE file.
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
import cProfile
|
import cProfile
|
||||||
|
@ -56,7 +58,8 @@ def diagnose(data):
|
||||||
data = data.read()
|
data = data.read()
|
||||||
elif os.path.exists(data):
|
elif os.path.exists(data):
|
||||||
print '"%s" looks like a filename. Reading data from the file.' % data
|
print '"%s" looks like a filename. Reading data from the file.' % data
|
||||||
data = open(data).read()
|
with open(data) as fp:
|
||||||
|
data = fp.read()
|
||||||
elif data.startswith("http:") or data.startswith("https:"):
|
elif data.startswith("http:") or data.startswith("https:"):
|
||||||
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
|
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
|
||||||
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
|
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
|
# found in the LICENSE file.
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
from pdb import set_trace
|
|
||||||
import collections
|
import collections
|
||||||
import re
|
import re
|
||||||
|
import shlex
|
||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
from bs4.dammit import EntitySubstitution
|
from bs4.dammit import EntitySubstitution
|
||||||
|
@ -99,6 +101,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
|
||||||
|
|
||||||
preformatted_tags = set(["pre"])
|
preformatted_tags = set(["pre"])
|
||||||
|
|
||||||
|
preserve_whitespace_tags = set(['pre', 'textarea'])
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _substitute_if_appropriate(cls, ns, f):
|
def _substitute_if_appropriate(cls, ns, f):
|
||||||
if (isinstance(ns, NavigableString)
|
if (isinstance(ns, NavigableString)
|
||||||
|
@ -169,11 +173,19 @@ class PageElement(object):
|
||||||
|
|
||||||
This is used when mapping a formatter name ("minimal") to an
|
This is used when mapping a formatter name ("minimal") to an
|
||||||
appropriate function (one that performs entity-substitution on
|
appropriate function (one that performs entity-substitution on
|
||||||
the contents of <script> and <style> tags, or not). It's
|
the contents of <script> and <style> tags, or not). It can be
|
||||||
inefficient, but it should be called very rarely.
|
inefficient, but it should be called very rarely.
|
||||||
"""
|
"""
|
||||||
|
if self.known_xml is not None:
|
||||||
|
# Most of the time we will have determined this when the
|
||||||
|
# document is parsed.
|
||||||
|
return self.known_xml
|
||||||
|
|
||||||
|
# Otherwise, it's likely that this element was created by
|
||||||
|
# direct invocation of the constructor from within the user's
|
||||||
|
# Python code.
|
||||||
if self.parent is None:
|
if self.parent is None:
|
||||||
# This is the top-level object. It should have .is_xml set
|
# This is the top-level object. It should have .known_xml set
|
||||||
# from tree creation. If not, take a guess--BS is usually
|
# from tree creation. If not, take a guess--BS is usually
|
||||||
# used on HTML markup.
|
# used on HTML markup.
|
||||||
return getattr(self, 'is_xml', False)
|
return getattr(self, 'is_xml', False)
|
||||||
|
@ -637,7 +649,7 @@ class PageElement(object):
|
||||||
return lambda el: el._attr_value_as_string(
|
return lambda el: el._attr_value_as_string(
|
||||||
attribute, '').startswith(value)
|
attribute, '').startswith(value)
|
||||||
elif operator == '$':
|
elif operator == '$':
|
||||||
# string represenation of `attribute` ends with `value`
|
# string representation of `attribute` ends with `value`
|
||||||
return lambda el: el._attr_value_as_string(
|
return lambda el: el._attr_value_as_string(
|
||||||
attribute, '').endswith(value)
|
attribute, '').endswith(value)
|
||||||
elif operator == '*':
|
elif operator == '*':
|
||||||
|
@ -677,6 +689,11 @@ class NavigableString(unicode, PageElement):
|
||||||
PREFIX = ''
|
PREFIX = ''
|
||||||
SUFFIX = ''
|
SUFFIX = ''
|
||||||
|
|
||||||
|
# We can't tell just by looking at a string whether it's contained
|
||||||
|
# in an XML document or an HTML document.
|
||||||
|
|
||||||
|
known_xml = None
|
||||||
|
|
||||||
def __new__(cls, value):
|
def __new__(cls, value):
|
||||||
"""Create a new NavigableString.
|
"""Create a new NavigableString.
|
||||||
|
|
||||||
|
@ -743,10 +760,16 @@ class CData(PreformattedString):
|
||||||
SUFFIX = u']]>'
|
SUFFIX = u']]>'
|
||||||
|
|
||||||
class ProcessingInstruction(PreformattedString):
|
class ProcessingInstruction(PreformattedString):
|
||||||
|
"""A SGML processing instruction."""
|
||||||
|
|
||||||
PREFIX = u'<?'
|
PREFIX = u'<?'
|
||||||
SUFFIX = u'>'
|
SUFFIX = u'>'
|
||||||
|
|
||||||
|
class XMLProcessingInstruction(ProcessingInstruction):
|
||||||
|
"""An XML processing instruction."""
|
||||||
|
PREFIX = u'<?'
|
||||||
|
SUFFIX = u'?>'
|
||||||
|
|
||||||
class Comment(PreformattedString):
|
class Comment(PreformattedString):
|
||||||
|
|
||||||
PREFIX = u'<!--'
|
PREFIX = u'<!--'
|
||||||
|
@ -781,7 +804,8 @@ class Tag(PageElement):
|
||||||
"""Represents a found HTML tag with its attributes and contents."""
|
"""Represents a found HTML tag with its attributes and contents."""
|
||||||
|
|
||||||
def __init__(self, parser=None, builder=None, name=None, namespace=None,
|
def __init__(self, parser=None, builder=None, name=None, namespace=None,
|
||||||
prefix=None, attrs=None, parent=None, previous=None):
|
prefix=None, attrs=None, parent=None, previous=None,
|
||||||
|
is_xml=None):
|
||||||
"Basic constructor."
|
"Basic constructor."
|
||||||
|
|
||||||
if parser is None:
|
if parser is None:
|
||||||
|
@ -795,6 +819,14 @@ class Tag(PageElement):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.namespace = namespace
|
self.namespace = namespace
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
|
if builder is not None:
|
||||||
|
preserve_whitespace_tags = builder.preserve_whitespace_tags
|
||||||
|
else:
|
||||||
|
if is_xml:
|
||||||
|
preserve_whitespace_tags = []
|
||||||
|
else:
|
||||||
|
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
|
||||||
|
self.preserve_whitespace_tags = preserve_whitespace_tags
|
||||||
if attrs is None:
|
if attrs is None:
|
||||||
attrs = {}
|
attrs = {}
|
||||||
elif attrs:
|
elif attrs:
|
||||||
|
@ -805,6 +837,13 @@ class Tag(PageElement):
|
||||||
attrs = dict(attrs)
|
attrs = dict(attrs)
|
||||||
else:
|
else:
|
||||||
attrs = dict(attrs)
|
attrs = dict(attrs)
|
||||||
|
|
||||||
|
# If possible, determine ahead of time whether this tag is an
|
||||||
|
# XML tag.
|
||||||
|
if builder:
|
||||||
|
self.known_xml = builder.is_xml
|
||||||
|
else:
|
||||||
|
self.known_xml = is_xml
|
||||||
self.attrs = attrs
|
self.attrs = attrs
|
||||||
self.contents = []
|
self.contents = []
|
||||||
self.setup(parent, previous)
|
self.setup(parent, previous)
|
||||||
|
@ -824,7 +863,7 @@ class Tag(PageElement):
|
||||||
Its contents are a copy of the old Tag's contents.
|
Its contents are a copy of the old Tag's contents.
|
||||||
"""
|
"""
|
||||||
clone = type(self)(None, self.builder, self.name, self.namespace,
|
clone = type(self)(None, self.builder, self.name, self.namespace,
|
||||||
self.nsprefix, self.attrs)
|
self.nsprefix, self.attrs, is_xml=self._is_xml)
|
||||||
for attr in ('can_be_empty_element', 'hidden'):
|
for attr in ('can_be_empty_element', 'hidden'):
|
||||||
setattr(clone, attr, getattr(self, attr))
|
setattr(clone, attr, getattr(self, attr))
|
||||||
for child in self.contents:
|
for child in self.contents:
|
||||||
|
@ -997,7 +1036,7 @@ class Tag(PageElement):
|
||||||
tag_name, tag_name))
|
tag_name, tag_name))
|
||||||
return self.find(tag_name)
|
return self.find(tag_name)
|
||||||
# We special case contents to avoid recursion.
|
# We special case contents to avoid recursion.
|
||||||
elif not tag.startswith("__") and not tag=="contents":
|
elif not tag.startswith("__") and not tag == "contents":
|
||||||
return self.find(tag)
|
return self.find(tag)
|
||||||
raise AttributeError(
|
raise AttributeError(
|
||||||
"'%s' object has no attribute '%s'" % (self.__class__, tag))
|
"'%s' object has no attribute '%s'" % (self.__class__, tag))
|
||||||
|
@ -1057,10 +1096,11 @@ class Tag(PageElement):
|
||||||
|
|
||||||
def _should_pretty_print(self, indent_level):
|
def _should_pretty_print(self, indent_level):
|
||||||
"""Should this tag be pretty-printed?"""
|
"""Should this tag be pretty-printed?"""
|
||||||
|
|
||||||
return (
|
return (
|
||||||
indent_level is not None and
|
indent_level is not None
|
||||||
(self.name not in HTMLAwareEntitySubstitution.preformatted_tags
|
and self.name not in self.preserve_whitespace_tags
|
||||||
or self._is_xml))
|
)
|
||||||
|
|
||||||
def decode(self, indent_level=None,
|
def decode(self, indent_level=None,
|
||||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||||
|
@ -1280,6 +1320,7 @@ class Tag(PageElement):
|
||||||
|
|
||||||
_selector_combinators = ['>', '+', '~']
|
_selector_combinators = ['>', '+', '~']
|
||||||
_select_debug = False
|
_select_debug = False
|
||||||
|
quoted_colon = re.compile('"[^"]*:[^"]*"')
|
||||||
def select_one(self, selector):
|
def select_one(self, selector):
|
||||||
"""Perform a CSS selection operation on the current element."""
|
"""Perform a CSS selection operation on the current element."""
|
||||||
value = self.select(selector, limit=1)
|
value = self.select(selector, limit=1)
|
||||||
|
@ -1305,8 +1346,7 @@ class Tag(PageElement):
|
||||||
if limit and len(context) >= limit:
|
if limit and len(context) >= limit:
|
||||||
break
|
break
|
||||||
return context
|
return context
|
||||||
|
tokens = shlex.split(selector)
|
||||||
tokens = selector.split()
|
|
||||||
current_context = [self]
|
current_context = [self]
|
||||||
|
|
||||||
if tokens[-1] in self._selector_combinators:
|
if tokens[-1] in self._selector_combinators:
|
||||||
|
@ -1358,7 +1398,7 @@ class Tag(PageElement):
|
||||||
return classes.issubset(candidate.get('class', []))
|
return classes.issubset(candidate.get('class', []))
|
||||||
checker = classes_match
|
checker = classes_match
|
||||||
|
|
||||||
elif ':' in token:
|
elif ':' in token and not self.quoted_colon.search(token):
|
||||||
# Pseudo-class
|
# Pseudo-class
|
||||||
tag_name, pseudo = token.split(':', 1)
|
tag_name, pseudo = token.split(':', 1)
|
||||||
if tag_name == '':
|
if tag_name == '':
|
||||||
|
@ -1389,11 +1429,8 @@ class Tag(PageElement):
|
||||||
self.count += 1
|
self.count += 1
|
||||||
if self.count == self.destination:
|
if self.count == self.destination:
|
||||||
return True
|
return True
|
||||||
if self.count > self.destination:
|
else:
|
||||||
# Stop the generator that's sending us
|
return False
|
||||||
# these things.
|
|
||||||
raise StopIteration()
|
|
||||||
return False
|
|
||||||
checker = Counter(pseudo_value).nth_child_of_type
|
checker = Counter(pseudo_value).nth_child_of_type
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
|
@ -1498,13 +1535,12 @@ class Tag(PageElement):
|
||||||
# don't include it in the context more than once.
|
# don't include it in the context more than once.
|
||||||
new_context.append(candidate)
|
new_context.append(candidate)
|
||||||
new_context_ids.add(id(candidate))
|
new_context_ids.add(id(candidate))
|
||||||
if limit and len(new_context) >= limit:
|
|
||||||
break
|
|
||||||
elif self._select_debug:
|
elif self._select_debug:
|
||||||
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
|
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
|
||||||
|
|
||||||
|
|
||||||
current_context = new_context
|
current_context = new_context
|
||||||
|
if limit and len(current_context) >= limit:
|
||||||
|
current_context = current_context[:limit]
|
||||||
|
|
||||||
if self._select_debug:
|
if self._select_debug:
|
||||||
print "Final verdict:"
|
print "Final verdict:"
|
||||||
|
@ -1668,21 +1704,15 @@ class SoupStrainer(object):
|
||||||
if isinstance(markup, list) or isinstance(markup, tuple):
|
if isinstance(markup, list) or isinstance(markup, tuple):
|
||||||
# This should only happen when searching a multi-valued attribute
|
# This should only happen when searching a multi-valued attribute
|
||||||
# like 'class'.
|
# like 'class'.
|
||||||
if (isinstance(match_against, unicode)
|
for item in markup:
|
||||||
and ' ' in match_against):
|
if self._matches(item, match_against):
|
||||||
# A bit of a special case. If they try to match "foo
|
return True
|
||||||
# bar" on a multivalue attribute's value, only accept
|
# We didn't match any particular value of the multivalue
|
||||||
# the literal value "foo bar"
|
# attribute, but maybe we match the attribute value when
|
||||||
#
|
# considered as a string.
|
||||||
# XXX This is going to be pretty slow because we keep
|
if self._matches(' '.join(markup), match_against):
|
||||||
# splitting match_against. But it shouldn't come up
|
return True
|
||||||
# too often.
|
return False
|
||||||
return (whitespace_re.split(match_against) == markup)
|
|
||||||
else:
|
|
||||||
for item in markup:
|
|
||||||
if self._matches(item, match_against):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
if match_against is True:
|
if match_against is True:
|
||||||
# True matches any non-None value.
|
# True matches any non-None value.
|
||||||
|
|
Loading…
Reference in a new issue