mirror of
https://github.com/SickGear/SickGear.git
synced 2025-01-07 10:33:38 +00:00
Merge pull request #420 from JackDandy/feature/UpdateBSoup
Update Beautiful Soup to 4.3.2 (r353).
This commit is contained in:
commit
23da0ad9ef
16 changed files with 240 additions and 3425 deletions
|
@ -47,6 +47,7 @@
|
||||||
* Update dateutil library 2.2 to 2.4.2 (a6b8925)
|
* Update dateutil library 2.2 to 2.4.2 (a6b8925)
|
||||||
* Change zoneinfo update/loader to be compatible with dateutil 2.4.2
|
* Change zoneinfo update/loader to be compatible with dateutil 2.4.2
|
||||||
* Update ConfigObj library 4.6.0 to 5.1.0 (a68530a)
|
* Update ConfigObj library 4.6.0 to 5.1.0 (a68530a)
|
||||||
|
* Update Beautiful Soup to 4.3.2 (r353)
|
||||||
|
|
||||||
[develop changelog]
|
[develop changelog]
|
||||||
* Update Requests library 2.7.0 (ab1f493) to 2.7.0 (8b5e457)
|
* Update Requests library 2.7.0 (ab1f493) to 2.7.0 (8b5e457)
|
||||||
|
|
|
@ -45,7 +45,7 @@ from .element import (
|
||||||
|
|
||||||
# The very first thing we do is give a useful error if someone is
|
# The very first thing we do is give a useful error if someone is
|
||||||
# running this code under Python 3 without converting it.
|
# running this code under Python 3 without converting it.
|
||||||
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||||
|
|
||||||
class BeautifulSoup(Tag):
|
class BeautifulSoup(Tag):
|
||||||
"""
|
"""
|
||||||
|
@ -77,6 +77,8 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||||
|
|
||||||
|
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
|
||||||
|
|
||||||
def __init__(self, markup="", features=None, builder=None,
|
def __init__(self, markup="", features=None, builder=None,
|
||||||
parse_only=None, from_encoding=None, **kwargs):
|
parse_only=None, from_encoding=None, **kwargs):
|
||||||
"""The Soup object is initialized as the 'root tag', and the
|
"""The Soup object is initialized as the 'root tag', and the
|
||||||
|
@ -114,9 +116,9 @@ class BeautifulSoup(Tag):
|
||||||
del kwargs['isHTML']
|
del kwargs['isHTML']
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"BS4 does not respect the isHTML argument to the "
|
"BS4 does not respect the isHTML argument to the "
|
||||||
"BeautifulSoup constructor. You can pass in features='html' "
|
"BeautifulSoup constructor. Suggest you use "
|
||||||
"or features='xml' to get a builder capable of handling "
|
"features='lxml' for HTML and features='lxml-xml' for "
|
||||||
"one or the other.")
|
"XML.")
|
||||||
|
|
||||||
def deprecated_argument(old_name, new_name):
|
def deprecated_argument(old_name, new_name):
|
||||||
if old_name in kwargs:
|
if old_name in kwargs:
|
||||||
|
@ -140,6 +142,7 @@ class BeautifulSoup(Tag):
|
||||||
"__init__() got an unexpected keyword argument '%s'" % arg)
|
"__init__() got an unexpected keyword argument '%s'" % arg)
|
||||||
|
|
||||||
if builder is None:
|
if builder is None:
|
||||||
|
original_features = features
|
||||||
if isinstance(features, basestring):
|
if isinstance(features, basestring):
|
||||||
features = [features]
|
features = [features]
|
||||||
if features is None or len(features) == 0:
|
if features is None or len(features) == 0:
|
||||||
|
@ -151,6 +154,11 @@ class BeautifulSoup(Tag):
|
||||||
"requested: %s. Do you need to install a parser library?"
|
"requested: %s. Do you need to install a parser library?"
|
||||||
% ",".join(features))
|
% ",".join(features))
|
||||||
builder = builder_class()
|
builder = builder_class()
|
||||||
|
if not (original_features == builder.NAME or
|
||||||
|
original_features in builder.ALTERNATE_NAMES):
|
||||||
|
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
|
||||||
|
parser=builder.NAME))
|
||||||
|
|
||||||
self.builder = builder
|
self.builder = builder
|
||||||
self.is_xml = builder.is_xml
|
self.is_xml = builder.is_xml
|
||||||
self.builder.soup = self
|
self.builder.soup = self
|
||||||
|
@ -178,6 +186,8 @@ class BeautifulSoup(Tag):
|
||||||
# system. Just let it go.
|
# system. Just let it go.
|
||||||
pass
|
pass
|
||||||
if is_file:
|
if is_file:
|
||||||
|
if isinstance(markup, unicode):
|
||||||
|
markup = markup.encode("utf8")
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
|
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
|
||||||
if markup[:5] == "http:" or markup[:6] == "https:":
|
if markup[:5] == "http:" or markup[:6] == "https:":
|
||||||
|
@ -185,6 +195,8 @@ class BeautifulSoup(Tag):
|
||||||
# Python 3 otherwise.
|
# Python 3 otherwise.
|
||||||
if ((isinstance(markup, bytes) and not b' ' in markup)
|
if ((isinstance(markup, bytes) and not b' ' in markup)
|
||||||
or (isinstance(markup, unicode) and not u' ' in markup)):
|
or (isinstance(markup, unicode) and not u' ' in markup)):
|
||||||
|
if isinstance(markup, unicode):
|
||||||
|
markup = markup.encode("utf8")
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
|
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
|
||||||
|
|
||||||
|
|
|
@ -80,6 +80,8 @@ builder_registry = TreeBuilderRegistry()
|
||||||
class TreeBuilder(object):
|
class TreeBuilder(object):
|
||||||
"""Turn a document into a Beautiful Soup object tree."""
|
"""Turn a document into a Beautiful Soup object tree."""
|
||||||
|
|
||||||
|
NAME = "[Unknown tree builder]"
|
||||||
|
ALTERNATE_NAMES = []
|
||||||
features = []
|
features = []
|
||||||
|
|
||||||
is_xml = False
|
is_xml = False
|
||||||
|
|
|
@ -22,7 +22,9 @@ from bs4.element import (
|
||||||
class HTML5TreeBuilder(HTMLTreeBuilder):
|
class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
"""Use html5lib to build a tree."""
|
"""Use html5lib to build a tree."""
|
||||||
|
|
||||||
features = ['html5lib', PERMISSIVE, HTML_5, HTML]
|
NAME = "html5lib"
|
||||||
|
|
||||||
|
features = [NAME, PERMISSIVE, HTML_5, HTML]
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding):
|
def prepare_markup(self, markup, user_specified_encoding):
|
||||||
# Store the user-specified encoding for use later on.
|
# Store the user-specified encoding for use later on.
|
||||||
|
@ -161,6 +163,12 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
# immediately after the parent, if it has no children.)
|
# immediately after the parent, if it has no children.)
|
||||||
if self.element.contents:
|
if self.element.contents:
|
||||||
most_recent_element = self.element._last_descendant(False)
|
most_recent_element = self.element._last_descendant(False)
|
||||||
|
elif self.element.next_element is not None:
|
||||||
|
# Something from further ahead in the parse tree is
|
||||||
|
# being inserted into this earlier element. This is
|
||||||
|
# very annoying because it means an expensive search
|
||||||
|
# for the last element in the tree.
|
||||||
|
most_recent_element = self.soup._last_descendant()
|
||||||
else:
|
else:
|
||||||
most_recent_element = self.element
|
most_recent_element = self.element
|
||||||
|
|
||||||
|
|
|
@ -19,10 +19,8 @@ import warnings
|
||||||
# At the end of this file, we monkeypatch HTMLParser so that
|
# At the end of this file, we monkeypatch HTMLParser so that
|
||||||
# strict=True works well on Python 3.2.2.
|
# strict=True works well on Python 3.2.2.
|
||||||
major, minor, release = sys.version_info[:3]
|
major, minor, release = sys.version_info[:3]
|
||||||
CONSTRUCTOR_TAKES_STRICT = (
|
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
|
||||||
major > 3
|
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
|
||||||
or (major == 3 and minor > 2)
|
|
||||||
or (major == 3 and minor == 2 and release >= 3))
|
|
||||||
|
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
CData,
|
CData,
|
||||||
|
@ -63,7 +61,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
|
|
||||||
def handle_charref(self, name):
|
def handle_charref(self, name):
|
||||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||||
# it's fixed.
|
# it's fixed in all supported versions.
|
||||||
|
# http://bugs.python.org/issue13633
|
||||||
if name.startswith('x'):
|
if name.startswith('x'):
|
||||||
real_name = int(name.lstrip('x'), 16)
|
real_name = int(name.lstrip('x'), 16)
|
||||||
elif name.startswith('X'):
|
elif name.startswith('X'):
|
||||||
|
@ -113,14 +112,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
|
|
||||||
def handle_pi(self, data):
|
def handle_pi(self, data):
|
||||||
self.soup.endData()
|
self.soup.endData()
|
||||||
if data.endswith("?") and data.lower().startswith("xml"):
|
|
||||||
# "An XHTML processing instruction using the trailing '?'
|
|
||||||
# will cause the '?' to be included in data." - HTMLParser
|
|
||||||
# docs.
|
|
||||||
#
|
|
||||||
# Strip the question mark so we don't end up with two
|
|
||||||
# question marks.
|
|
||||||
data = data[:-1]
|
|
||||||
self.soup.handle_data(data)
|
self.soup.handle_data(data)
|
||||||
self.soup.endData(ProcessingInstruction)
|
self.soup.endData(ProcessingInstruction)
|
||||||
|
|
||||||
|
@ -128,11 +119,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
|
|
||||||
is_xml = False
|
is_xml = False
|
||||||
features = [HTML, STRICT, HTMLPARSER]
|
NAME = HTMLPARSER
|
||||||
|
features = [NAME, HTML, STRICT]
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
if CONSTRUCTOR_TAKES_STRICT:
|
if CONSTRUCTOR_TAKES_STRICT:
|
||||||
kwargs['strict'] = False
|
kwargs['strict'] = False
|
||||||
|
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
||||||
|
kwargs['convert_charrefs'] = False
|
||||||
self.parser_args = (args, kwargs)
|
self.parser_args = (args, kwargs)
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
|
|
|
@ -7,7 +7,12 @@ from io import BytesIO
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
import collections
|
import collections
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from bs4.element import Comment, Doctype, NamespacedAttribute
|
from bs4.element import (
|
||||||
|
Comment,
|
||||||
|
Doctype,
|
||||||
|
NamespacedAttribute,
|
||||||
|
ProcessingInstruction,
|
||||||
|
)
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
FAST,
|
FAST,
|
||||||
HTML,
|
HTML,
|
||||||
|
@ -25,8 +30,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
|
|
||||||
is_xml = True
|
is_xml = True
|
||||||
|
|
||||||
|
NAME = "lxml-xml"
|
||||||
|
|
||||||
# Well, it's permissive by XML parser standards.
|
# Well, it's permissive by XML parser standards.
|
||||||
features = [LXML, XML, FAST, PERMISSIVE]
|
features = [NAME, LXML, XML, FAST, PERMISSIVE]
|
||||||
|
|
||||||
CHUNK_SIZE = 512
|
CHUNK_SIZE = 512
|
||||||
|
|
||||||
|
@ -189,7 +196,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
self.nsmaps.pop()
|
self.nsmaps.pop()
|
||||||
|
|
||||||
def pi(self, target, data):
|
def pi(self, target, data):
|
||||||
pass
|
self.soup.endData()
|
||||||
|
self.soup.handle_data(target + ' ' + data)
|
||||||
|
self.soup.endData(ProcessingInstruction)
|
||||||
|
|
||||||
def data(self, content):
|
def data(self, content):
|
||||||
self.soup.handle_data(content)
|
self.soup.handle_data(content)
|
||||||
|
@ -212,7 +221,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
|
|
||||||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||||
|
|
||||||
features = [LXML, HTML, FAST, PERMISSIVE]
|
NAME = LXML
|
||||||
|
ALTERNATE_NAMES = ["lxml-html"]
|
||||||
|
|
||||||
|
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
|
||||||
is_xml = False
|
is_xml = False
|
||||||
|
|
||||||
def default_parser(self, encoding):
|
def default_parser(self, encoding):
|
||||||
|
|
|
@ -548,17 +548,17 @@ class PageElement(object):
|
||||||
|
|
||||||
# Methods for supporting CSS selectors.
|
# Methods for supporting CSS selectors.
|
||||||
|
|
||||||
tag_name_re = re.compile('^[a-z0-9]+$')
|
tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
|
||||||
|
|
||||||
# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
|
# /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
|
||||||
# \---/ \---/\-------------/ \-------/
|
# \---------------------------/ \---/\-------------/ \-------/
|
||||||
# | | | |
|
# | | | |
|
||||||
# | | | The value
|
# | | | The value
|
||||||
# | | ~,|,^,$,* or =
|
# | | ~,|,^,$,* or =
|
||||||
# | Attribute
|
# | Attribute
|
||||||
# Tag
|
# Tag
|
||||||
attribselect_re = re.compile(
|
attribselect_re = re.compile(
|
||||||
r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
|
r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
|
||||||
r'=?"?(?P<value>[^\]"]*)"?\]$'
|
r'=?"?(?P<value>[^\]"]*)"?\]$'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -707,7 +707,7 @@ class CData(PreformattedString):
|
||||||
class ProcessingInstruction(PreformattedString):
|
class ProcessingInstruction(PreformattedString):
|
||||||
|
|
||||||
PREFIX = u'<?'
|
PREFIX = u'<?'
|
||||||
SUFFIX = u'?>'
|
SUFFIX = u'>'
|
||||||
|
|
||||||
class Comment(PreformattedString):
|
class Comment(PreformattedString):
|
||||||
|
|
||||||
|
@ -1203,192 +1203,206 @@ class Tag(PageElement):
|
||||||
_select_debug = False
|
_select_debug = False
|
||||||
def select(self, selector, _candidate_generator=None):
|
def select(self, selector, _candidate_generator=None):
|
||||||
"""Perform a CSS selection operation on the current element."""
|
"""Perform a CSS selection operation on the current element."""
|
||||||
tokens = selector.split()
|
|
||||||
|
# Remove whitespace directly after the grouping operator ','
|
||||||
|
# then split into tokens.
|
||||||
|
tokens = re.sub(',[\s]*',',', selector).split()
|
||||||
current_context = [self]
|
current_context = [self]
|
||||||
|
|
||||||
if tokens[-1] in self._selector_combinators:
|
if tokens[-1] in self._selector_combinators:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
'Final combinator "%s" is missing an argument.' % tokens[-1])
|
'Final combinator "%s" is missing an argument.' % tokens[-1])
|
||||||
|
|
||||||
if self._select_debug:
|
if self._select_debug:
|
||||||
print 'Running CSS selector "%s"' % selector
|
print 'Running CSS selector "%s"' % selector
|
||||||
for index, token in enumerate(tokens):
|
|
||||||
if self._select_debug:
|
for index, token_group in enumerate(tokens):
|
||||||
print ' Considering token "%s"' % token
|
new_context = []
|
||||||
recursive_candidate_generator = None
|
new_context_ids = set([])
|
||||||
tag_name = None
|
|
||||||
|
# Grouping selectors, ie: p,a
|
||||||
|
grouped_tokens = token_group.split(',')
|
||||||
|
if '' in grouped_tokens:
|
||||||
|
raise ValueError('Invalid group selection syntax: %s' % token_group)
|
||||||
|
|
||||||
if tokens[index-1] in self._selector_combinators:
|
if tokens[index-1] in self._selector_combinators:
|
||||||
# This token was consumed by the previous combinator. Skip it.
|
# This token was consumed by the previous combinator. Skip it.
|
||||||
if self._select_debug:
|
if self._select_debug:
|
||||||
print ' Token was consumed by the previous combinator.'
|
print ' Token was consumed by the previous combinator.'
|
||||||
continue
|
continue
|
||||||
# Each operation corresponds to a checker function, a rule
|
|
||||||
# for determining whether a candidate matches the
|
|
||||||
# selector. Candidates are generated by the active
|
|
||||||
# iterator.
|
|
||||||
checker = None
|
|
||||||
|
|
||||||
m = self.attribselect_re.match(token)
|
for token in grouped_tokens:
|
||||||
if m is not None:
|
if self._select_debug:
|
||||||
# Attribute selector
|
print ' Considering token "%s"' % token
|
||||||
tag_name, attribute, operator, value = m.groups()
|
recursive_candidate_generator = None
|
||||||
checker = self._attribute_checker(operator, attribute, value)
|
tag_name = None
|
||||||
|
|
||||||
elif '#' in token:
|
# Each operation corresponds to a checker function, a rule
|
||||||
# ID selector
|
# for determining whether a candidate matches the
|
||||||
tag_name, tag_id = token.split('#', 1)
|
# selector. Candidates are generated by the active
|
||||||
def id_matches(tag):
|
# iterator.
|
||||||
return tag.get('id', None) == tag_id
|
checker = None
|
||||||
checker = id_matches
|
|
||||||
|
|
||||||
elif '.' in token:
|
m = self.attribselect_re.match(token)
|
||||||
# Class selector
|
if m is not None:
|
||||||
tag_name, klass = token.split('.', 1)
|
# Attribute selector
|
||||||
classes = set(klass.split('.'))
|
tag_name, attribute, operator, value = m.groups()
|
||||||
def classes_match(candidate):
|
checker = self._attribute_checker(operator, attribute, value)
|
||||||
return classes.issubset(candidate.get('class', []))
|
|
||||||
checker = classes_match
|
|
||||||
|
|
||||||
elif ':' in token:
|
elif '#' in token:
|
||||||
# Pseudo-class
|
# ID selector
|
||||||
tag_name, pseudo = token.split(':', 1)
|
tag_name, tag_id = token.split('#', 1)
|
||||||
if tag_name == '':
|
def id_matches(tag):
|
||||||
raise ValueError(
|
return tag.get('id', None) == tag_id
|
||||||
"A pseudo-class must be prefixed with a tag name.")
|
checker = id_matches
|
||||||
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
|
|
||||||
found = []
|
elif '.' in token:
|
||||||
if pseudo_attributes is not None:
|
# Class selector
|
||||||
pseudo_type, pseudo_value = pseudo_attributes.groups()
|
tag_name, klass = token.split('.', 1)
|
||||||
if pseudo_type == 'nth-of-type':
|
classes = set(klass.split('.'))
|
||||||
try:
|
def classes_match(candidate):
|
||||||
pseudo_value = int(pseudo_value)
|
return classes.issubset(candidate.get('class', []))
|
||||||
except:
|
checker = classes_match
|
||||||
|
|
||||||
|
elif ':' in token:
|
||||||
|
# Pseudo-class
|
||||||
|
tag_name, pseudo = token.split(':', 1)
|
||||||
|
if tag_name == '':
|
||||||
|
raise ValueError(
|
||||||
|
"A pseudo-class must be prefixed with a tag name.")
|
||||||
|
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
|
||||||
|
found = []
|
||||||
|
if pseudo_attributes is not None:
|
||||||
|
pseudo_type, pseudo_value = pseudo_attributes.groups()
|
||||||
|
if pseudo_type == 'nth-of-type':
|
||||||
|
try:
|
||||||
|
pseudo_value = int(pseudo_value)
|
||||||
|
except:
|
||||||
|
raise NotImplementedError(
|
||||||
|
'Only numeric values are currently supported for the nth-of-type pseudo-class.')
|
||||||
|
if pseudo_value < 1:
|
||||||
|
raise ValueError(
|
||||||
|
'nth-of-type pseudo-class value must be at least 1.')
|
||||||
|
class Counter(object):
|
||||||
|
def __init__(self, destination):
|
||||||
|
self.count = 0
|
||||||
|
self.destination = destination
|
||||||
|
|
||||||
|
def nth_child_of_type(self, tag):
|
||||||
|
self.count += 1
|
||||||
|
if self.count == self.destination:
|
||||||
|
return True
|
||||||
|
if self.count > self.destination:
|
||||||
|
# Stop the generator that's sending us
|
||||||
|
# these things.
|
||||||
|
raise StopIteration()
|
||||||
|
return False
|
||||||
|
checker = Counter(pseudo_value).nth_child_of_type
|
||||||
|
else:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
'Only numeric values are currently supported for the nth-of-type pseudo-class.')
|
'Only the following pseudo-classes are implemented: nth-of-type.')
|
||||||
if pseudo_value < 1:
|
|
||||||
raise ValueError(
|
|
||||||
'nth-of-type pseudo-class value must be at least 1.')
|
|
||||||
class Counter(object):
|
|
||||||
def __init__(self, destination):
|
|
||||||
self.count = 0
|
|
||||||
self.destination = destination
|
|
||||||
|
|
||||||
def nth_child_of_type(self, tag):
|
elif token == '*':
|
||||||
self.count += 1
|
# Star selector -- matches everything
|
||||||
if self.count == self.destination:
|
pass
|
||||||
return True
|
elif token == '>':
|
||||||
if self.count > self.destination:
|
# Run the next token as a CSS selector against the
|
||||||
# Stop the generator that's sending us
|
# direct children of each tag in the current context.
|
||||||
# these things.
|
recursive_candidate_generator = lambda tag: tag.children
|
||||||
raise StopIteration()
|
elif token == '~':
|
||||||
return False
|
# Run the next token as a CSS selector against the
|
||||||
checker = Counter(pseudo_value).nth_child_of_type
|
# siblings of each tag in the current context.
|
||||||
else:
|
recursive_candidate_generator = lambda tag: tag.next_siblings
|
||||||
raise NotImplementedError(
|
elif token == '+':
|
||||||
'Only the following pseudo-classes are implemented: nth-of-type.')
|
# For each tag in the current context, run the next
|
||||||
|
# token as a CSS selector against the tag's next
|
||||||
|
# sibling that's a tag.
|
||||||
|
def next_tag_sibling(tag):
|
||||||
|
yield tag.find_next_sibling(True)
|
||||||
|
recursive_candidate_generator = next_tag_sibling
|
||||||
|
|
||||||
elif token == '*':
|
elif self.tag_name_re.match(token):
|
||||||
# Star selector -- matches everything
|
# Just a tag name.
|
||||||
pass
|
tag_name = token
|
||||||
elif token == '>':
|
|
||||||
# Run the next token as a CSS selector against the
|
|
||||||
# direct children of each tag in the current context.
|
|
||||||
recursive_candidate_generator = lambda tag: tag.children
|
|
||||||
elif token == '~':
|
|
||||||
# Run the next token as a CSS selector against the
|
|
||||||
# siblings of each tag in the current context.
|
|
||||||
recursive_candidate_generator = lambda tag: tag.next_siblings
|
|
||||||
elif token == '+':
|
|
||||||
# For each tag in the current context, run the next
|
|
||||||
# token as a CSS selector against the tag's next
|
|
||||||
# sibling that's a tag.
|
|
||||||
def next_tag_sibling(tag):
|
|
||||||
yield tag.find_next_sibling(True)
|
|
||||||
recursive_candidate_generator = next_tag_sibling
|
|
||||||
|
|
||||||
elif self.tag_name_re.match(token):
|
|
||||||
# Just a tag name.
|
|
||||||
tag_name = token
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
'Unsupported or invalid CSS selector: "%s"' % token)
|
|
||||||
|
|
||||||
if recursive_candidate_generator:
|
|
||||||
# This happens when the selector looks like "> foo".
|
|
||||||
#
|
|
||||||
# The generator calls select() recursively on every
|
|
||||||
# member of the current context, passing in a different
|
|
||||||
# candidate generator and a different selector.
|
|
||||||
#
|
|
||||||
# In the case of "> foo", the candidate generator is
|
|
||||||
# one that yields a tag's direct children (">"), and
|
|
||||||
# the selector is "foo".
|
|
||||||
next_token = tokens[index+1]
|
|
||||||
def recursive_select(tag):
|
|
||||||
if self._select_debug:
|
|
||||||
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
|
|
||||||
print '-' * 40
|
|
||||||
for i in tag.select(next_token, recursive_candidate_generator):
|
|
||||||
if self._select_debug:
|
|
||||||
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
|
|
||||||
yield i
|
|
||||||
if self._select_debug:
|
|
||||||
print '-' * 40
|
|
||||||
_use_candidate_generator = recursive_select
|
|
||||||
elif _candidate_generator is None:
|
|
||||||
# By default, a tag's candidates are all of its
|
|
||||||
# children. If tag_name is defined, only yield tags
|
|
||||||
# with that name.
|
|
||||||
if self._select_debug:
|
|
||||||
if tag_name:
|
|
||||||
check = "[any]"
|
|
||||||
else:
|
|
||||||
check = tag_name
|
|
||||||
print ' Default candidate generator, tag name="%s"' % check
|
|
||||||
if self._select_debug:
|
|
||||||
# This is redundant with later code, but it stops
|
|
||||||
# a bunch of bogus tags from cluttering up the
|
|
||||||
# debug log.
|
|
||||||
def default_candidate_generator(tag):
|
|
||||||
for child in tag.descendants:
|
|
||||||
if not isinstance(child, Tag):
|
|
||||||
continue
|
|
||||||
if tag_name and not child.name == tag_name:
|
|
||||||
continue
|
|
||||||
yield child
|
|
||||||
_use_candidate_generator = default_candidate_generator
|
|
||||||
else:
|
else:
|
||||||
_use_candidate_generator = lambda tag: tag.descendants
|
raise ValueError(
|
||||||
else:
|
'Unsupported or invalid CSS selector: "%s"' % token)
|
||||||
_use_candidate_generator = _candidate_generator
|
if recursive_candidate_generator:
|
||||||
|
# This happens when the selector looks like "> foo".
|
||||||
new_context = []
|
#
|
||||||
new_context_ids = set([])
|
# The generator calls select() recursively on every
|
||||||
for tag in current_context:
|
# member of the current context, passing in a different
|
||||||
if self._select_debug:
|
# candidate generator and a different selector.
|
||||||
print " Running candidate generator on %s %s" % (
|
#
|
||||||
tag.name, repr(tag.attrs))
|
# In the case of "> foo", the candidate generator is
|
||||||
for candidate in _use_candidate_generator(tag):
|
# one that yields a tag's direct children (">"), and
|
||||||
if not isinstance(candidate, Tag):
|
# the selector is "foo".
|
||||||
continue
|
next_token = tokens[index+1]
|
||||||
if tag_name and candidate.name != tag_name:
|
def recursive_select(tag):
|
||||||
continue
|
|
||||||
if checker is not None:
|
|
||||||
try:
|
|
||||||
result = checker(candidate)
|
|
||||||
except StopIteration:
|
|
||||||
# The checker has decided we should no longer
|
|
||||||
# run the generator.
|
|
||||||
break
|
|
||||||
if checker is None or result:
|
|
||||||
if self._select_debug:
|
if self._select_debug:
|
||||||
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
|
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
|
||||||
if id(candidate) not in new_context_ids:
|
print '-' * 40
|
||||||
# If a tag matches a selector more than once,
|
for i in tag.select(next_token, recursive_candidate_generator):
|
||||||
# don't include it in the context more than once.
|
if self._select_debug:
|
||||||
new_context.append(candidate)
|
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
|
||||||
new_context_ids.add(id(candidate))
|
yield i
|
||||||
elif self._select_debug:
|
if self._select_debug:
|
||||||
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
|
print '-' * 40
|
||||||
|
_use_candidate_generator = recursive_select
|
||||||
|
elif _candidate_generator is None:
|
||||||
|
# By default, a tag's candidates are all of its
|
||||||
|
# children. If tag_name is defined, only yield tags
|
||||||
|
# with that name.
|
||||||
|
if self._select_debug:
|
||||||
|
if tag_name:
|
||||||
|
check = "[any]"
|
||||||
|
else:
|
||||||
|
check = tag_name
|
||||||
|
print ' Default candidate generator, tag name="%s"' % check
|
||||||
|
if self._select_debug:
|
||||||
|
# This is redundant with later code, but it stops
|
||||||
|
# a bunch of bogus tags from cluttering up the
|
||||||
|
# debug log.
|
||||||
|
def default_candidate_generator(tag):
|
||||||
|
for child in tag.descendants:
|
||||||
|
if not isinstance(child, Tag):
|
||||||
|
continue
|
||||||
|
if tag_name and not child.name == tag_name:
|
||||||
|
continue
|
||||||
|
yield child
|
||||||
|
_use_candidate_generator = default_candidate_generator
|
||||||
|
else:
|
||||||
|
_use_candidate_generator = lambda tag: tag.descendants
|
||||||
|
else:
|
||||||
|
_use_candidate_generator = _candidate_generator
|
||||||
|
|
||||||
|
for tag in current_context:
|
||||||
|
if self._select_debug:
|
||||||
|
print " Running candidate generator on %s %s" % (
|
||||||
|
tag.name, repr(tag.attrs))
|
||||||
|
for candidate in _use_candidate_generator(tag):
|
||||||
|
if not isinstance(candidate, Tag):
|
||||||
|
continue
|
||||||
|
if tag_name and candidate.name != tag_name:
|
||||||
|
continue
|
||||||
|
if checker is not None:
|
||||||
|
try:
|
||||||
|
result = checker(candidate)
|
||||||
|
except StopIteration:
|
||||||
|
# The checker has decided we should no longer
|
||||||
|
# run the generator.
|
||||||
|
break
|
||||||
|
if checker is None or result:
|
||||||
|
if self._select_debug:
|
||||||
|
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
|
||||||
|
if id(candidate) not in new_context_ids:
|
||||||
|
# If a tag matches a selector more than once,
|
||||||
|
# don't include it in the context more than once.
|
||||||
|
new_context.append(candidate)
|
||||||
|
new_context_ids.add(id(candidate))
|
||||||
|
elif self._select_debug:
|
||||||
|
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
|
||||||
|
|
||||||
|
|
||||||
current_context = new_context
|
current_context = new_context
|
||||||
|
|
||||||
|
|
|
@ -1,592 +0,0 @@
|
||||||
"""Helper classes for tests."""
|
|
||||||
|
|
||||||
import copy
|
|
||||||
import functools
|
|
||||||
import unittest
|
|
||||||
from unittest import TestCase
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from bs4.element import (
|
|
||||||
CharsetMetaAttributeValue,
|
|
||||||
Comment,
|
|
||||||
ContentMetaAttributeValue,
|
|
||||||
Doctype,
|
|
||||||
SoupStrainer,
|
|
||||||
)
|
|
||||||
|
|
||||||
from bs4.builder import HTMLParserTreeBuilder
|
|
||||||
default_builder = HTMLParserTreeBuilder
|
|
||||||
|
|
||||||
|
|
||||||
class SoupTest(unittest.TestCase):
|
|
||||||
|
|
||||||
@property
|
|
||||||
def default_builder(self):
|
|
||||||
return default_builder()
|
|
||||||
|
|
||||||
def soup(self, markup, **kwargs):
|
|
||||||
"""Build a Beautiful Soup object from markup."""
|
|
||||||
builder = kwargs.pop('builder', self.default_builder)
|
|
||||||
return BeautifulSoup(markup, builder=builder, **kwargs)
|
|
||||||
|
|
||||||
def document_for(self, markup):
|
|
||||||
"""Turn an HTML fragment into a document.
|
|
||||||
|
|
||||||
The details depend on the builder.
|
|
||||||
"""
|
|
||||||
return self.default_builder.test_fragment_to_document(markup)
|
|
||||||
|
|
||||||
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
|
||||||
builder = self.default_builder
|
|
||||||
obj = BeautifulSoup(to_parse, builder=builder)
|
|
||||||
if compare_parsed_to is None:
|
|
||||||
compare_parsed_to = to_parse
|
|
||||||
|
|
||||||
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
|
||||||
|
|
||||||
|
|
||||||
class HTMLTreeBuilderSmokeTest(object):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.

    This class is a mixin: it expects the concrete test case to supply
    ``soup()``, ``assertSoupEquals()`` and the unittest assertions.
    """

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype."""
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_empty_doctype(self):
        """An empty doctype declaration yields an empty Doctype string."""
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_public_doctype_with_url(self):
        """A PUBLIC doctype carrying a DTD URL is handled."""
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        """A SYSTEM doctype is handled."""
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Newlines are ignored on both sides of the comparison.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        """Directly nested formatting elements round-trip unchanged."""
        self.assertSoupEquals("<em><em></em></em>")

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre> </pre>")
        self.assertSoupEquals("<textarea> woo </textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        # Check the doubly nested markup itself; asserting nested_b_tag
        # again would leave this case untested.
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        """Raw angle brackets in an attribute value come out as entities."""
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        """Character references in attribute values become Unicode."""
        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        # Decimal, lower-case hex, upper-case hex and named references
        # to the same character.
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        """Character references in text become Unicode."""
        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        # Decimal, lower-case hex, upper-case hex and named references
        # to the same character.
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        """&quot; in text is output as a literal quotation mark."""
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        """Numeric references beyond the Unicode range become U+FFFD."""
        expect = u"\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        html = soup.html
        self.assertEqual('http://www.w3.org/1999/xhtml', html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        """A space-separated class attribute is exposed as a list."""
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the declaration! The horror!
        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        """Single-quoted attribute values are output double-quoted."""
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        """A value containing double quotes keeps its single-quote delimiters."""
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        """A value holding both quote kinds has its double quotes escaped."""
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        """A bare ampersand in an attribute value is output as &amp;."""
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        """An already-escaped ampersand is not escaped a second time."""
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        """&nbsp; references are parsed into NO-BREAK SPACE characters."""
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        """Angle brackets are re-escaped on output; other characters are encoded."""
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertTrue(isinstance(content, ContentMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        """An attribute can be assigned to a tag parsed without any."""
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
|
||||||
|
|
||||||
class XMLTreeBuilderSmokeTest(object):
    """Smoke tests that any XML tree builder should pass.

    This class is a mixin: it expects the concrete test case to supply
    ``soup()``, ``assertSoupEquals()`` and the unittest assertions.
    """

    def test_docstring_generated(self):
        """Encoding a document prepends an XML declaration."""
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8"), markup)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        """In an XML document the contents of <script> are entity-escaped."""
        doc = """
<script type="text/javascript">
</script>
"""
        soup = BeautifulSoup(doc, "xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        # The angle brackets must come out escaped; looking for the raw
        # characters would not show that the formatter ran at all.
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)

    def test_can_parse_unicode_document(self):
        """Unicode markup parses even when its declared encoding cannot hold it."""
        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        soup = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)

    def test_popping_namespaced_tag(self):
        """Sibling namespaced tags followed by a plain tag round-trip."""
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
            unicode(soup.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        """The XML declaration names the encoding passed to encode()."""
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode("latin1"),
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + b'0' * (2**12)
                  + b'</root>')
        soup = self.soup(markup)
        self.assertEqual(soup.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        """Only tags without content are serialized as empty-element tags."""
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        """Namespace declarations stay available as attributes of the root."""
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        soup = self.soup(markup)
        root = soup.root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        """A namespaced child tag round-trips with its prefix."""
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.p), markup)

    def test_namespaced_attributes(self):
        """A namespaced attribute round-trips with its prefix."""
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        """An attribute in the built-in ``xml`` namespace round-trips."""
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)
|
|
||||||
|
|
||||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # XHTML is not HTML5, so no particular handling of XHTML
        # documents is required from an HTML5 parser; the inherited
        # test is deliberately turned into a no-op.
        pass

    def test_html_tags_have_namespace(self):
        """Ordinary HTML tags live in the XHTML namespace."""
        parsed = self.soup("<a>")
        self.assertEqual("http://www.w3.org/1999/xhtml", parsed.a.namespace)

    def test_svg_tags_have_namespace(self):
        """<svg> and its children live in the SVG namespace."""
        svg_ns = "http://www.w3.org/2000/svg"
        parsed = self.soup('<svg><circle/></svg>')
        self.assertEqual(svg_ns, parsed.svg.namespace)
        self.assertEqual(svg_ns, parsed.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        """<math> and its children live in the MathML namespace."""
        mathml_ns = 'http://www.w3.org/1998/Math/MathML'
        parsed = self.soup('<math><msqrt>5</msqrt></math>')
        self.assertEqual(mathml_ns, parsed.math.namespace)
        self.assertEqual(mathml_ns, parsed.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        """A leading XML declaration is parsed as a Comment node."""
        parsed = self.soup(
            '<?xml version="1.0" encoding="utf-8"?><html></html>')
        self.assertTrue(isinstance(parsed.contents[0], Comment))
        self.assertEqual(
            parsed.contents[0], '?xml version="1.0" encoding="utf-8"?')
        # The <html> tag directly follows the comment in document order.
        self.assertEqual("html", parsed.contents[0].next_element.name)
|
|
||||||
|
|
||||||
def skipIf(condition, reason):
    """Return a decorator that disables a test item when ``condition`` holds.

    If ``condition`` is true, the decorated item is replaced by a stub
    that accepts the test instance plus any further arguments and
    returns None. Otherwise the item is returned untouched. ``reason``
    is accepted for readability at the call site only; it is not used.
    """
    def nothing(test, *args, **kwargs):
        # Stand-in for a skipped test: does no work at all.
        return None

    def decorator(test_item):
        return nothing if condition else test_item

    return decorator
|
|
|
@ -1 +0,0 @@
|
||||||
"The beautifulsoup tests."
|
|
|
@ -1,141 +0,0 @@
|
||||||
"""Tests of the builder registry."""
|
|
||||||
|
|
||||||
import unittest
|
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from bs4.builder import (
|
|
||||||
builder_registry as registry,
|
|
||||||
HTMLParserTreeBuilder,
|
|
||||||
TreeBuilderRegistry,
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
from bs4.builder import HTML5TreeBuilder
|
|
||||||
HTML5LIB_PRESENT = True
|
|
||||||
except ImportError:
|
|
||||||
HTML5LIB_PRESENT = False
|
|
||||||
|
|
||||||
try:
|
|
||||||
from bs4.builder import (
|
|
||||||
LXMLTreeBuilderForXML,
|
|
||||||
LXMLTreeBuilder,
|
|
||||||
)
|
|
||||||
LXML_PRESENT = True
|
|
||||||
except ImportError:
|
|
||||||
LXML_PRESENT = False
|
|
||||||
|
|
||||||
|
|
||||||
class BuiltInRegistryTest(unittest.TestCase):
    """Test the built-in registry with the default builders registered."""

    def test_combination(self):
        """Looking up a combination of features picks the matching builder."""
        if LXML_PRESENT:
            fast_html = registry.lookup('fast', 'html')
            self.assertEqual(fast_html, LXMLTreeBuilder)
            permissive_xml = registry.lookup('permissive', 'xml')
            self.assertEqual(permissive_xml, LXMLTreeBuilderForXML)

        strict_html = registry.lookup('strict', 'html')
        self.assertEqual(strict_html, HTMLParserTreeBuilder)

        if HTML5LIB_PRESENT:
            html5_html = registry.lookup('html5lib', 'html')
            self.assertEqual(html5_html, HTML5TreeBuilder)

    def test_lookup_by_markup_type(self):
        """Looking up a markup type alone picks the preferred builder."""
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
            self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
            return

        # Without lxml nothing handles XML, and HTML falls back to
        # html5lib when installed, else to the standard-library parser.
        self.assertEqual(registry.lookup('xml'), None)
        if HTML5LIB_PRESENT:
            html_builder = HTML5TreeBuilder
        else:
            html_builder = HTMLParserTreeBuilder
        self.assertEqual(registry.lookup('html'), html_builder)

    def test_named_library(self):
        """A builder can be looked up by the name of its parser library."""
        if LXML_PRESENT:
            self.assertEqual(
                registry.lookup('lxml', 'xml'), LXMLTreeBuilderForXML)
            self.assertEqual(
                registry.lookup('lxml', 'html'), LXMLTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib'), HTML5TreeBuilder)

        self.assertEqual(
            registry.lookup('html.parser'), HTMLParserTreeBuilder)

    def test_beautifulsoup_constructor_does_lookup(self):
        """The BeautifulSoup constructor resolves ``features`` via the registry."""
        # A single feature name is accepted ...
        BeautifulSoup("", features="html")
        # ... and so is a list of feature names.
        BeautifulSoup("", features=["html", "fast"])

        # A feature that no builder provides makes the constructor
        # raise ValueError.
        self.assertRaises(
            ValueError, BeautifulSoup, "", features="no-such-feature")
|
|
||||||
|
|
||||||
class RegistryTest(unittest.TestCase):
|
|
||||||
"""Test the TreeBuilderRegistry class in general."""
|
|
||||||
|
|
||||||
def setUp(self):
|
|
||||||
self.registry = TreeBuilderRegistry()
|
|
||||||
|
|
||||||
def builder_for_features(self, *feature_list):
|
|
||||||
cls = type('Builder_' + '_'.join(feature_list),
|
|
||||||
(object,), {'features' : feature_list})
|
|
||||||
|
|
||||||
self.registry.register(cls)
|
|
||||||
return cls
|
|
||||||
|
|
||||||
def test_register_with_no_features(self):
|
|
||||||
builder = self.builder_for_features()
|
|
||||||
|
|
||||||
# Since the builder advertises no features, you can't find it
|
|
||||||
# by looking up features.
|
|
||||||
self.assertEqual(self.registry.lookup('foo'), None)
|
|
||||||
|
|
||||||
# But you can find it by doing a lookup with no features, if
|
|
||||||
# this happens to be the only registered builder.
|
|
||||||
self.assertEqual(self.registry.lookup(), builder)
|
|
||||||
|
|
||||||
def test_register_with_features_makes_lookup_succeed(self):
|
|
||||||
builder = self.builder_for_features('foo', 'bar')
|
|
||||||
self.assertEqual(self.registry.lookup('foo'), builder)
|
|
||||||
self.assertEqual(self.registry.lookup('bar'), builder)
|
|
||||||
|
|
||||||
def test_lookup_fails_when_no_builder_implements_feature(self):
|
|
||||||
builder = self.builder_for_features('foo', 'bar')
|
|
||||||
self.assertEqual(self.registry.lookup('baz'), None)
|
|
||||||
|
|
||||||
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
|
|
||||||
builder1 = self.builder_for_features('foo')
|
|
||||||
builder2 = self.builder_for_features('bar')
|
|
||||||
self.assertEqual(self.registry.lookup(), builder2)
|
|
||||||
|
|
||||||
def test_lookup_fails_when_no_tree_builders_registered(self):
|
|
||||||
self.assertEqual(self.registry.lookup(), None)
|
|
||||||
|
|
||||||
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
|
|
||||||
has_one = self.builder_for_features('foo')
|
|
||||||
has_the_other = self.builder_for_features('bar')
|
|
||||||
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
|
|
||||||
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
|
|
||||||
lacks_one = self.builder_for_features('bar')
|
|
||||||
has_the_other = self.builder_for_features('foo')
|
|
||||||
|
|
||||||
# There are two builders featuring 'foo' and 'bar', but
|
|
||||||
# the one that also features 'quux' was registered later.
|
|
||||||
self.assertEqual(self.registry.lookup('foo', 'bar'),
|
|
||||||
has_both_late)
|
|
||||||
|
|
||||||
# There is only one builder featuring 'foo', 'bar', and 'baz'.
|
|
||||||
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
|
|
||||||
has_both_early)
|
|
||||||
|
|
||||||
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
|
|
||||||
builder1 = self.builder_for_features('foo', 'bar')
|
|
||||||
builder2 = self.builder_for_features('foo', 'baz')
|
|
||||||
self.assertEqual(self.registry.lookup('bar', 'baz'), None)
|
|
|
@ -1,36 +0,0 @@
|
||||||
"Test harness for doctests."
|
|
||||||
|
|
||||||
# pylint: disable-msg=E0611,W0142
|
|
||||||
|
|
||||||
__metaclass__ = type
|
|
||||||
# ``additional_tests`` exists only as commented-out code further down in
# this module, so naming it here would make a star-import of the module
# fail with AttributeError. Export nothing until it is restored.
__all__ = []
|
|
||||||
|
|
||||||
import atexit
|
|
||||||
import doctest
|
|
||||||
import os
|
|
||||||
#from pkg_resources import (
|
|
||||||
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
|
|
||||||
import unittest
|
|
||||||
|
|
||||||
DOCTEST_FLAGS = (
|
|
||||||
doctest.ELLIPSIS |
|
|
||||||
doctest.NORMALIZE_WHITESPACE |
|
|
||||||
doctest.REPORT_NDIFF)
|
|
||||||
|
|
||||||
|
|
||||||
# def additional_tests():
|
|
||||||
# "Run the doc tests (README.txt and docs/*, if any exist)"
|
|
||||||
# doctest_files = [
|
|
||||||
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
|
|
||||||
# if resource_exists('bs4', 'docs'):
|
|
||||||
# for name in resource_listdir('bs4', 'docs'):
|
|
||||||
# if name.endswith('.txt'):
|
|
||||||
# doctest_files.append(
|
|
||||||
# os.path.abspath(
|
|
||||||
# resource_filename('bs4', 'docs/%s' % name)))
|
|
||||||
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
|
|
||||||
# atexit.register(cleanup_resources)
|
|
||||||
# return unittest.TestSuite((
|
|
||||||
# doctest.DocFileSuite(*doctest_files, **kwargs)))
|
|
|
@ -1,85 +0,0 @@
|
||||||
"""Tests to ensure that the html5lib tree builder generates good trees."""
|
|
||||||
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
try:
|
|
||||||
from bs4.builder import HTML5TreeBuilder
|
|
||||||
HTML5LIB_PRESENT = True
|
|
||||||
except ImportError, e:
|
|
||||||
HTML5LIB_PRESENT = False
|
|
||||||
from bs4.element import SoupStrainer
|
|
||||||
from bs4.testing import (
|
|
||||||
HTML5TreeBuilderSmokeTest,
|
|
||||||
SoupTest,
|
|
||||||
skipIf,
|
|
||||||
)
|
|
||||||
|
|
||||||
@skipIf(
|
|
||||||
not HTML5LIB_PRESENT,
|
|
||||||
"html5lib seems not to be present, not testing its tree builder.")
|
|
||||||
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
|
||||||
"""See ``HTML5TreeBuilderSmokeTest``."""
|
|
||||||
|
|
||||||
@property
|
|
||||||
def default_builder(self):
|
|
||||||
return HTML5TreeBuilder()
|
|
||||||
|
|
||||||
def test_soupstrainer(self):
|
|
||||||
# The html5lib tree builder does not support SoupStrainers.
|
|
||||||
strainer = SoupStrainer("b")
|
|
||||||
markup = "<p>A <b>bold</b> statement.</p>"
|
|
||||||
with warnings.catch_warnings(record=True) as w:
|
|
||||||
soup = self.soup(markup, parse_only=strainer)
|
|
||||||
self.assertEqual(
|
|
||||||
soup.decode(), self.document_for(markup))
|
|
||||||
|
|
||||||
self.assertTrue(
|
|
||||||
"the html5lib tree builder doesn't support parse_only" in
|
|
||||||
str(w[0].message))
|
|
||||||
|
|
||||||
def test_correctly_nested_tables(self):
|
|
||||||
"""html5lib inserts <tbody> tags where other parsers don't."""
|
|
||||||
markup = ('<table id="1">'
|
|
||||||
'<tr>'
|
|
||||||
"<td>Here's another table:"
|
|
||||||
'<table id="2">'
|
|
||||||
'<tr><td>foo</td></tr>'
|
|
||||||
'</table></td>')
|
|
||||||
|
|
||||||
self.assertSoupEquals(
|
|
||||||
markup,
|
|
||||||
'<table id="1"><tbody><tr><td>Here\'s another table:'
|
|
||||||
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
|
|
||||||
'</td></tr></tbody></table>')
|
|
||||||
|
|
||||||
self.assertSoupEquals(
|
|
||||||
"<table><thead><tr><td>Foo</td></tr></thead>"
|
|
||||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
|
||||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
|
||||||
|
|
||||||
def test_xml_declaration_followed_by_doctype(self):
|
|
||||||
markup = '''<?xml version="1.0" encoding="utf-8"?>
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<p>foo</p>
|
|
||||||
</body>
|
|
||||||
</html>'''
|
|
||||||
soup = self.soup(markup)
|
|
||||||
# Verify that we can reach the <p> tag; this means the tree is connected.
|
|
||||||
self.assertEqual(b"<p>foo</p>", soup.p.encode())
|
|
||||||
|
|
||||||
def test_reparented_markup(self):
|
|
||||||
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
|
|
||||||
soup = self.soup(markup)
|
|
||||||
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
|
|
||||||
self.assertEqual(2, len(soup.find_all('p')))
|
|
||||||
|
|
||||||
|
|
||||||
def test_reparented_markup_ends_with_whitespace(self):
|
|
||||||
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
|
|
||||||
soup = self.soup(markup)
|
|
||||||
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
|
|
||||||
self.assertEqual(2, len(soup.find_all('p')))
|
|
|
@ -1,19 +0,0 @@
|
||||||
"""Tests to ensure that the html.parser tree builder generates good
|
|
||||||
trees."""
|
|
||||||
|
|
||||||
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
|
|
||||||
from bs4.builder import HTMLParserTreeBuilder
|
|
||||||
|
|
||||||
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
|
||||||
|
|
||||||
@property
|
|
||||||
def default_builder(self):
|
|
||||||
return HTMLParserTreeBuilder()
|
|
||||||
|
|
||||||
def test_namespaced_system_doctype(self):
|
|
||||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
|
||||||
pass
|
|
||||||
|
|
||||||
def test_namespaced_public_doctype(self):
|
|
||||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
|
||||||
pass
|
|
|
@ -1,91 +0,0 @@
|
||||||
"""Tests to ensure that the lxml tree builder generates good trees."""
|
|
||||||
|
|
||||||
import re
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
try:
|
|
||||||
import lxml.etree
|
|
||||||
LXML_PRESENT = True
|
|
||||||
LXML_VERSION = lxml.etree.LXML_VERSION
|
|
||||||
except ImportError, e:
|
|
||||||
LXML_PRESENT = False
|
|
||||||
LXML_VERSION = (0,)
|
|
||||||
|
|
||||||
if LXML_PRESENT:
|
|
||||||
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
|
||||||
|
|
||||||
from bs4 import (
|
|
||||||
BeautifulSoup,
|
|
||||||
BeautifulStoneSoup,
|
|
||||||
)
|
|
||||||
from bs4.element import Comment, Doctype, SoupStrainer
|
|
||||||
from bs4.testing import skipIf
|
|
||||||
from bs4.tests import test_htmlparser
|
|
||||||
from bs4.testing import (
|
|
||||||
HTMLTreeBuilderSmokeTest,
|
|
||||||
XMLTreeBuilderSmokeTest,
|
|
||||||
SoupTest,
|
|
||||||
skipIf,
|
|
||||||
)
|
|
||||||
|
|
||||||
@skipIf(
|
|
||||||
not LXML_PRESENT,
|
|
||||||
"lxml seems not to be present, not testing its tree builder.")
|
|
||||||
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
|
||||||
"""See ``HTMLTreeBuilderSmokeTest``."""
|
|
||||||
|
|
||||||
@property
|
|
||||||
def default_builder(self):
|
|
||||||
return LXMLTreeBuilder()
|
|
||||||
|
|
||||||
def test_out_of_range_entity(self):
|
|
||||||
self.assertSoupEquals(
|
|
||||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
|
||||||
self.assertSoupEquals(
|
|
||||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
|
||||||
self.assertSoupEquals(
|
|
||||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
|
||||||
|
|
||||||
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
|
|
||||||
# test if an old version of lxml is installed.
|
|
||||||
|
|
||||||
@skipIf(
|
|
||||||
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
|
|
||||||
"Skipping doctype test for old version of lxml to avoid segfault.")
|
|
||||||
def test_empty_doctype(self):
|
|
||||||
soup = self.soup("<!DOCTYPE>")
|
|
||||||
doctype = soup.contents[0]
|
|
||||||
self.assertEqual("", doctype.strip())
|
|
||||||
|
|
||||||
def test_beautifulstonesoup_is_xml_parser(self):
|
|
||||||
# Make sure that the deprecated BSS class uses an xml builder
|
|
||||||
# if one is installed.
|
|
||||||
with warnings.catch_warnings(record=True) as w:
|
|
||||||
soup = BeautifulStoneSoup("<b />")
|
|
||||||
self.assertEqual(u"<b/>", unicode(soup.b))
|
|
||||||
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
|
|
||||||
|
|
||||||
def test_real_xhtml_document(self):
|
|
||||||
"""lxml strips the XML definition from an XHTML doc, which is fine."""
|
|
||||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
|
||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
|
||||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
||||||
<head><title>Hello.</title></head>
|
|
||||||
<body>Goodbye.</body>
|
|
||||||
</html>"""
|
|
||||||
soup = self.soup(markup)
|
|
||||||
self.assertEqual(
|
|
||||||
soup.encode("utf-8").replace(b"\n", b''),
|
|
||||||
markup.replace(b'\n', b'').replace(
|
|
||||||
b'<?xml version="1.0" encoding="utf-8"?>', b''))
|
|
||||||
|
|
||||||
|
|
||||||
@skipIf(
|
|
||||||
not LXML_PRESENT,
|
|
||||||
"lxml seems not to be present, not testing its XML tree builder.")
|
|
||||||
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
|
|
||||||
"""See ``HTMLTreeBuilderSmokeTest``."""
|
|
||||||
|
|
||||||
@property
|
|
||||||
def default_builder(self):
|
|
||||||
return LXMLTreeBuilderForXML()
|
|
|
@ -1,434 +0,0 @@
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""Tests of Beautiful Soup as a whole."""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import unittest
|
|
||||||
import sys
|
|
||||||
import tempfile
|
|
||||||
|
|
||||||
from bs4 import (
|
|
||||||
BeautifulSoup,
|
|
||||||
BeautifulStoneSoup,
|
|
||||||
)
|
|
||||||
from bs4.element import (
|
|
||||||
CharsetMetaAttributeValue,
|
|
||||||
ContentMetaAttributeValue,
|
|
||||||
SoupStrainer,
|
|
||||||
NamespacedAttribute,
|
|
||||||
)
|
|
||||||
import bs4.dammit
|
|
||||||
from bs4.dammit import (
|
|
||||||
EntitySubstitution,
|
|
||||||
UnicodeDammit,
|
|
||||||
)
|
|
||||||
from bs4.testing import (
|
|
||||||
SoupTest,
|
|
||||||
skipIf,
|
|
||||||
)
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
try:
|
|
||||||
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
|
||||||
LXML_PRESENT = True
|
|
||||||
except ImportError, e:
|
|
||||||
LXML_PRESENT = False
|
|
||||||
|
|
||||||
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
|
|
||||||
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
|
|
||||||
|
|
||||||
class TestConstructor(SoupTest):
|
|
||||||
|
|
||||||
def test_short_unicode_input(self):
|
|
||||||
data = u"<h1>éé</h1>"
|
|
||||||
soup = self.soup(data)
|
|
||||||
self.assertEqual(u"éé", soup.h1.string)
|
|
||||||
|
|
||||||
def test_embedded_null(self):
|
|
||||||
data = u"<h1>foo\0bar</h1>"
|
|
||||||
soup = self.soup(data)
|
|
||||||
self.assertEqual(u"foo\0bar", soup.h1.string)
|
|
||||||
|
|
||||||
|
|
||||||
class TestDeprecatedConstructorArguments(SoupTest):
|
|
||||||
|
|
||||||
def test_parseOnlyThese_renamed_to_parse_only(self):
|
|
||||||
with warnings.catch_warnings(record=True) as w:
|
|
||||||
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
|
|
||||||
msg = str(w[0].message)
|
|
||||||
self.assertTrue("parseOnlyThese" in msg)
|
|
||||||
self.assertTrue("parse_only" in msg)
|
|
||||||
self.assertEqual(b"<b></b>", soup.encode())
|
|
||||||
|
|
||||||
def test_fromEncoding_renamed_to_from_encoding(self):
|
|
||||||
with warnings.catch_warnings(record=True) as w:
|
|
||||||
utf8 = b"\xc3\xa9"
|
|
||||||
soup = self.soup(utf8, fromEncoding="utf8")
|
|
||||||
msg = str(w[0].message)
|
|
||||||
self.assertTrue("fromEncoding" in msg)
|
|
||||||
self.assertTrue("from_encoding" in msg)
|
|
||||||
self.assertEqual("utf8", soup.original_encoding)
|
|
||||||
|
|
||||||
def test_unrecognized_keyword_argument(self):
|
|
||||||
self.assertRaises(
|
|
||||||
TypeError, self.soup, "<a>", no_such_argument=True)
|
|
||||||
|
|
||||||
class TestWarnings(SoupTest):
|
|
||||||
|
|
||||||
def test_disk_file_warning(self):
|
|
||||||
filehandle = tempfile.NamedTemporaryFile()
|
|
||||||
filename = filehandle.name
|
|
||||||
try:
|
|
||||||
with warnings.catch_warnings(record=True) as w:
|
|
||||||
soup = self.soup(filename)
|
|
||||||
msg = str(w[0].message)
|
|
||||||
self.assertTrue("looks like a filename" in msg)
|
|
||||||
finally:
|
|
||||||
filehandle.close()
|
|
||||||
|
|
||||||
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
|
|
||||||
with warnings.catch_warnings(record=True) as w:
|
|
||||||
soup = self.soup(filename)
|
|
||||||
self.assertEqual(0, len(w))
|
|
||||||
|
|
||||||
def test_url_warning(self):
|
|
||||||
with warnings.catch_warnings(record=True) as w:
|
|
||||||
soup = self.soup("http://www.crummy.com/")
|
|
||||||
msg = str(w[0].message)
|
|
||||||
self.assertTrue("looks like a URL" in msg)
|
|
||||||
|
|
||||||
with warnings.catch_warnings(record=True) as w:
|
|
||||||
soup = self.soup("http://www.crummy.com/ is great")
|
|
||||||
self.assertEqual(0, len(w))
|
|
||||||
|
|
||||||
class TestSelectiveParsing(SoupTest):
|
|
||||||
|
|
||||||
def test_parse_with_soupstrainer(self):
|
|
||||||
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
|
|
||||||
strainer = SoupStrainer("b")
|
|
||||||
soup = self.soup(markup, parse_only=strainer)
|
|
||||||
self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
|
|
||||||
|
|
||||||
|
|
||||||
class TestEntitySubstitution(unittest.TestCase):
|
|
||||||
"""Standalone tests of the EntitySubstitution class."""
|
|
||||||
def setUp(self):
|
|
||||||
self.sub = EntitySubstitution
|
|
||||||
|
|
||||||
def test_simple_html_substitution(self):
|
|
||||||
# Unicode characters corresponding to named HTML entites
|
|
||||||
# are substituted, and no others.
|
|
||||||
s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
|
|
||||||
self.assertEqual(self.sub.substitute_html(s),
|
|
||||||
u"foo∀\N{SNOWMAN}õbar")
|
|
||||||
|
|
||||||
def test_smart_quote_substitution(self):
|
|
||||||
# MS smart quotes are a common source of frustration, so we
|
|
||||||
# give them a special test.
|
|
||||||
quotes = b"\x91\x92foo\x93\x94"
|
|
||||||
dammit = UnicodeDammit(quotes)
|
|
||||||
self.assertEqual(self.sub.substitute_html(dammit.markup),
|
|
||||||
"‘’foo“”")
|
|
||||||
|
|
||||||
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
|
|
||||||
s = 'Welcome to "my bar"'
|
|
||||||
self.assertEqual(self.sub.substitute_xml(s, False), s)
|
|
||||||
|
|
||||||
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
|
|
||||||
self.assertEqual(self.sub.substitute_xml("Welcome", True),
|
|
||||||
'"Welcome"')
|
|
||||||
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
|
|
||||||
'"Bob\'s Bar"')
|
|
||||||
|
|
||||||
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
|
|
||||||
s = 'Welcome to "my bar"'
|
|
||||||
self.assertEqual(self.sub.substitute_xml(s, True),
|
|
||||||
"'Welcome to \"my bar\"'")
|
|
||||||
|
|
||||||
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
|
|
||||||
s = 'Welcome to "Bob\'s Bar"'
|
|
||||||
self.assertEqual(
|
|
||||||
self.sub.substitute_xml(s, True),
|
|
||||||
'"Welcome to "Bob\'s Bar""')
|
|
||||||
|
|
||||||
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
|
|
||||||
quoted = 'Welcome to "Bob\'s Bar"'
|
|
||||||
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
|
|
||||||
|
|
||||||
def test_xml_quoting_handles_angle_brackets(self):
|
|
||||||
self.assertEqual(
|
|
||||||
self.sub.substitute_xml("foo<bar>"),
|
|
||||||
"foo<bar>")
|
|
||||||
|
|
||||||
def test_xml_quoting_handles_ampersands(self):
|
|
||||||
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")
|
|
||||||
|
|
||||||
def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
|
|
||||||
self.assertEqual(
|
|
||||||
self.sub.substitute_xml("ÁT&T"),
|
|
||||||
"&Aacute;T&T")
|
|
||||||
|
|
||||||
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
|
|
||||||
self.assertEqual(
|
|
||||||
self.sub.substitute_xml_containing_entities("ÁT&T"),
|
|
||||||
"ÁT&T")
|
|
||||||
|
|
||||||
def test_quotes_not_html_substituted(self):
|
|
||||||
"""There's no need to do this except inside attribute values."""
|
|
||||||
text = 'Bob\'s "bar"'
|
|
||||||
self.assertEqual(self.sub.substitute_html(text), text)
|
|
||||||
|
|
||||||
|
|
||||||
class TestEncodingConversion(SoupTest):
|
|
||||||
# Test Beautiful Soup's ability to decode and encode from various
|
|
||||||
# encodings.
|
|
||||||
|
|
||||||
def setUp(self):
|
|
||||||
super(TestEncodingConversion, self).setUp()
|
|
||||||
self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
|
|
||||||
self.utf8_data = self.unicode_data.encode("utf-8")
|
|
||||||
# Just so you know what it looks like.
|
|
||||||
self.assertEqual(
|
|
||||||
self.utf8_data,
|
|
||||||
b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
|
|
||||||
|
|
||||||
def test_ascii_in_unicode_out(self):
|
|
||||||
# ASCII input is converted to Unicode. The original_encoding
|
|
||||||
# attribute is set to 'utf-8', a superset of ASCII.
|
|
||||||
chardet = bs4.dammit.chardet_dammit
|
|
||||||
logging.disable(logging.WARNING)
|
|
||||||
try:
|
|
||||||
def noop(str):
|
|
||||||
return None
|
|
||||||
# Disable chardet, which will realize that the ASCII is ASCII.
|
|
||||||
bs4.dammit.chardet_dammit = noop
|
|
||||||
ascii = b"<foo>a</foo>"
|
|
||||||
soup_from_ascii = self.soup(ascii)
|
|
||||||
unicode_output = soup_from_ascii.decode()
|
|
||||||
self.assertTrue(isinstance(unicode_output, unicode))
|
|
||||||
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
|
|
||||||
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
|
|
||||||
finally:
|
|
||||||
logging.disable(logging.NOTSET)
|
|
||||||
bs4.dammit.chardet_dammit = chardet
|
|
||||||
|
|
||||||
def test_unicode_in_unicode_out(self):
|
|
||||||
# Unicode input is left alone. The original_encoding attribute
|
|
||||||
# is not set.
|
|
||||||
soup_from_unicode = self.soup(self.unicode_data)
|
|
||||||
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
|
|
||||||
self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
|
|
||||||
self.assertEqual(soup_from_unicode.original_encoding, None)
|
|
||||||
|
|
||||||
def test_utf8_in_unicode_out(self):
|
|
||||||
# UTF-8 input is converted to Unicode. The original_encoding
|
|
||||||
# attribute is set.
|
|
||||||
soup_from_utf8 = self.soup(self.utf8_data)
|
|
||||||
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
|
|
||||||
self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
|
|
||||||
|
|
||||||
def test_utf8_out(self):
|
|
||||||
# The internal data structures can be encoded as UTF-8.
|
|
||||||
soup_from_unicode = self.soup(self.unicode_data)
|
|
||||||
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
|
|
||||||
|
|
||||||
@skipIf(
|
|
||||||
PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
|
|
||||||
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
|
|
||||||
def test_attribute_name_containing_unicode_characters(self):
|
|
||||||
markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
|
|
||||||
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
|
|
||||||
|
|
||||||
class TestUnicodeDammit(unittest.TestCase):
|
|
||||||
"""Standalone tests of UnicodeDammit."""
|
|
||||||
|
|
||||||
def test_unicode_input(self):
|
|
||||||
markup = u"I'm already Unicode! \N{SNOWMAN}"
|
|
||||||
dammit = UnicodeDammit(markup)
|
|
||||||
self.assertEqual(dammit.unicode_markup, markup)
|
|
||||||
|
|
||||||
def test_smart_quotes_to_unicode(self):
|
|
||||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
|
||||||
dammit = UnicodeDammit(markup)
|
|
||||||
self.assertEqual(
|
|
||||||
dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
|
|
||||||
|
|
||||||
def test_smart_quotes_to_xml_entities(self):
|
|
||||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
|
||||||
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
|
|
||||||
self.assertEqual(
|
|
||||||
dammit.unicode_markup, "<foo>‘’“”</foo>")
|
|
||||||
|
|
||||||
def test_smart_quotes_to_html_entities(self):
|
|
||||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
|
||||||
dammit = UnicodeDammit(markup, smart_quotes_to="html")
|
|
||||||
self.assertEqual(
|
|
||||||
dammit.unicode_markup, "<foo>‘’“”</foo>")
|
|
||||||
|
|
||||||
def test_smart_quotes_to_ascii(self):
|
|
||||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
|
||||||
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
|
|
||||||
self.assertEqual(
|
|
||||||
dammit.unicode_markup, """<foo>''""</foo>""")
|
|
||||||
|
|
||||||
def test_detect_utf8(self):
|
|
||||||
utf8 = b"\xc3\xa9"
|
|
||||||
dammit = UnicodeDammit(utf8)
|
|
||||||
self.assertEqual(dammit.unicode_markup, u'\xe9')
|
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
|
||||||
|
|
||||||
def test_convert_hebrew(self):
|
|
||||||
hebrew = b"\xed\xe5\xec\xf9"
|
|
||||||
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
|
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
|
|
||||||
self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
|
|
||||||
|
|
||||||
def test_dont_see_smart_quotes_where_there_are_none(self):
|
|
||||||
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
|
|
||||||
dammit = UnicodeDammit(utf_8)
|
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
|
||||||
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
|
|
||||||
|
|
||||||
def test_ignore_inappropriate_codecs(self):
|
|
||||||
utf8_data = u"Räksmörgås".encode("utf-8")
|
|
||||||
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
|
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
|
||||||
|
|
||||||
def test_ignore_invalid_codecs(self):
|
|
||||||
utf8_data = u"Räksmörgås".encode("utf-8")
|
|
||||||
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
|
|
||||||
dammit = UnicodeDammit(utf8_data, [bad_encoding])
|
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
|
||||||
|
|
||||||
def test_detect_html5_style_meta_tag(self):
|
|
||||||
|
|
||||||
for data in (
|
|
||||||
b'<html><meta charset="euc-jp" /></html>',
|
|
||||||
b"<html><meta charset='euc-jp' /></html>",
|
|
||||||
b"<html><meta charset=euc-jp /></html>",
|
|
||||||
b"<html><meta charset=euc-jp/></html>"):
|
|
||||||
dammit = UnicodeDammit(data, is_html=True)
|
|
||||||
self.assertEqual(
|
|
||||||
"euc-jp", dammit.original_encoding)
|
|
||||||
|
|
||||||
def test_last_ditch_entity_replacement(self):
|
|
||||||
# This is a UTF-8 document that contains bytestrings
|
|
||||||
# completely incompatible with UTF-8 (ie. encoded with some other
|
|
||||||
# encoding).
|
|
||||||
#
|
|
||||||
# Since there is no consistent encoding for the document,
|
|
||||||
# Unicode, Dammit will eventually encode the document as UTF-8
|
|
||||||
# and encode the incompatible characters as REPLACEMENT
|
|
||||||
# CHARACTER.
|
|
||||||
#
|
|
||||||
# If chardet is installed, it will detect that the document
|
|
||||||
# can be converted into ISO-8859-1 without errors. This happens
|
|
||||||
# to be the wrong encoding, but it is a consistent encoding, so the
|
|
||||||
# code we're testing here won't run.
|
|
||||||
#
|
|
||||||
# So we temporarily disable chardet if it's present.
|
|
||||||
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<html><b>\330\250\330\252\330\261</b>
|
|
||||||
<i>\310\322\321\220\312\321\355\344</i></html>"""
|
|
||||||
chardet = bs4.dammit.chardet_dammit
|
|
||||||
logging.disable(logging.WARNING)
|
|
||||||
try:
|
|
||||||
def noop(str):
|
|
||||||
return None
|
|
||||||
bs4.dammit.chardet_dammit = noop
|
|
||||||
dammit = UnicodeDammit(doc)
|
|
||||||
self.assertEqual(True, dammit.contains_replacement_characters)
|
|
||||||
self.assertTrue(u"\ufffd" in dammit.unicode_markup)
|
|
||||||
|
|
||||||
soup = BeautifulSoup(doc, "html.parser")
|
|
||||||
self.assertTrue(soup.contains_replacement_characters)
|
|
||||||
finally:
|
|
||||||
logging.disable(logging.NOTSET)
|
|
||||||
bs4.dammit.chardet_dammit = chardet
|
|
||||||
|
|
||||||
def test_byte_order_mark_removed(self):
|
|
||||||
# A document written in UTF-16LE will have its byte order marker stripped.
|
|
||||||
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
|
|
||||||
dammit = UnicodeDammit(data)
|
|
||||||
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
|
|
||||||
self.assertEqual("utf-16le", dammit.original_encoding)
|
|
||||||
|
|
||||||
def test_detwingle(self):
|
|
||||||
# Here's a UTF8 document.
|
|
||||||
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
|
|
||||||
|
|
||||||
# Here's a Windows-1252 document.
|
|
||||||
windows_1252 = (
|
|
||||||
u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
|
|
||||||
u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
|
|
||||||
|
|
||||||
# Through some unholy alchemy, they've been stuck together.
|
|
||||||
doc = utf8 + windows_1252 + utf8
|
|
||||||
|
|
||||||
# The document can't be turned into UTF-8:
|
|
||||||
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
|
|
||||||
|
|
||||||
# Unicode, Dammit thinks the whole document is Windows-1252,
|
|
||||||
# and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
|
|
||||||
|
|
||||||
# But if we run it through fix_embedded_windows_1252, it's fixed:
|
|
||||||
|
|
||||||
fixed = UnicodeDammit.detwingle(doc)
|
|
||||||
self.assertEqual(
|
|
||||||
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
|
|
||||||
|
|
||||||
def test_detwingle_ignores_multibyte_characters(self):
|
|
||||||
# Each of these characters has a UTF-8 representation ending
|
|
||||||
# in \x93. \x93 is a smart quote if interpreted as
|
|
||||||
# Windows-1252. But our code knows to skip over multibyte
|
|
||||||
# UTF-8 characters, so they'll survive the process unscathed.
|
|
||||||
for tricky_unicode_char in (
|
|
||||||
u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
|
|
||||||
u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
|
|
||||||
u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
|
|
||||||
):
|
|
||||||
input = tricky_unicode_char.encode("utf8")
|
|
||||||
self.assertTrue(input.endswith(b'\x93'))
|
|
||||||
output = UnicodeDammit.detwingle(input)
|
|
||||||
self.assertEqual(output, input)
|
|
||||||
|
|
||||||
class TestNamedspacedAttribute(SoupTest):
|
|
||||||
|
|
||||||
def test_name_may_be_none(self):
|
|
||||||
a = NamespacedAttribute("xmlns", None)
|
|
||||||
self.assertEqual(a, "xmlns")
|
|
||||||
|
|
||||||
def test_attribute_is_equivalent_to_colon_separated_string(self):
|
|
||||||
a = NamespacedAttribute("a", "b")
|
|
||||||
self.assertEqual("a:b", a)
|
|
||||||
|
|
||||||
def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
|
|
||||||
a = NamespacedAttribute("a", "b", "c")
|
|
||||||
b = NamespacedAttribute("a", "b", "c")
|
|
||||||
self.assertEqual(a, b)
|
|
||||||
|
|
||||||
# The actual namespace is not considered.
|
|
||||||
c = NamespacedAttribute("a", "b", None)
|
|
||||||
self.assertEqual(a, c)
|
|
||||||
|
|
||||||
# But name and prefix are important.
|
|
||||||
d = NamespacedAttribute("a", "z", "c")
|
|
||||||
self.assertNotEqual(a, d)
|
|
||||||
|
|
||||||
e = NamespacedAttribute("z", "b", "c")
|
|
||||||
self.assertNotEqual(a, e)
|
|
||||||
|
|
||||||
|
|
||||||
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
|
|
||||||
|
|
||||||
def test_content_meta_attribute_value(self):
|
|
||||||
value = CharsetMetaAttributeValue("euc-jp")
|
|
||||||
self.assertEqual("euc-jp", value)
|
|
||||||
self.assertEqual("euc-jp", value.original_value)
|
|
||||||
self.assertEqual("utf8", value.encode("utf8"))
|
|
||||||
|
|
||||||
|
|
||||||
def test_content_meta_attribute_value(self):
|
|
||||||
value = ContentMetaAttributeValue("text/html; charset=euc-jp")
|
|
||||||
self.assertEqual("text/html; charset=euc-jp", value)
|
|
||||||
self.assertEqual("text/html; charset=euc-jp", value.original_value)
|
|
||||||
self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
|
|
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue