From f83390e689234d30871a8d8948c7ec277a7ebf89 Mon Sep 17 00:00:00 2001 From: JackDandy Date: Mon, 15 Jun 2015 00:45:29 +0100 Subject: [PATCH] Update Beautiful Soup to 4.3.2 (r353). --- CHANGES.md | 1 + lib/bs4/__init__.py | 20 +- lib/bs4/builder/__init__.py | 2 + lib/bs4/builder/_html5lib.py | 10 +- lib/bs4/builder/_htmlparser.py | 22 +- lib/bs4/builder/_lxml.py | 20 +- lib/bs4/element.py | 362 ++--- lib/bs4/testing.py | 592 -------- lib/bs4/tests/__init__.py | 1 - lib/bs4/tests/test_builder_registry.py | 141 -- lib/bs4/tests/test_docs.py | 36 - lib/bs4/tests/test_html5lib.py | 85 -- lib/bs4/tests/test_htmlparser.py | 19 - lib/bs4/tests/test_lxml.py | 91 -- lib/bs4/tests/test_soup.py | 434 ------ lib/bs4/tests/test_tree.py | 1829 ------------------------ 16 files changed, 240 insertions(+), 3425 deletions(-) delete mode 100644 lib/bs4/testing.py delete mode 100644 lib/bs4/tests/__init__.py delete mode 100644 lib/bs4/tests/test_builder_registry.py delete mode 100644 lib/bs4/tests/test_docs.py delete mode 100644 lib/bs4/tests/test_html5lib.py delete mode 100644 lib/bs4/tests/test_htmlparser.py delete mode 100644 lib/bs4/tests/test_lxml.py delete mode 100644 lib/bs4/tests/test_soup.py delete mode 100644 lib/bs4/tests/test_tree.py diff --git a/CHANGES.md b/CHANGES.md index 9230b664..c70ac37c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -47,6 +47,7 @@ * Update dateutil library 2.2 to 2.4.2 (a6b8925) * Change zoneinfo update/loader to be compatible with dateutil 2.4.2 * Update ConfigObj library 4.6.0 to 5.1.0 (a68530a) +* Update Beautiful Soup to 4.3.2 (r353) [develop changelog] * Update Requests library 2.7.0 (ab1f493) to 2.7.0 (8b5e457) diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py index 7ba34269..a53048d6 100644 --- a/lib/bs4/__init__.py +++ b/lib/bs4/__init__.py @@ -45,7 +45,7 @@ from .element import ( # The very first thing we do is give a useful error if someone is # running this code under Python 3 without converting it. -syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' +'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' class BeautifulSoup(Tag): """ @@ -77,6 +77,8 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" + def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, **kwargs): """The Soup object is initialized as the 'root tag', and the @@ -114,9 +116,9 @@ class BeautifulSoup(Tag): del kwargs['isHTML'] warnings.warn( "BS4 does not respect the isHTML argument to the " - "BeautifulSoup constructor. You can pass in features='html' " - "or features='xml' to get a builder capable of handling " - "one or the other.") + "BeautifulSoup constructor. Suggest you use " + "features='lxml' for HTML and features='lxml-xml' for " + "XML.") def deprecated_argument(old_name, new_name): if old_name in kwargs: @@ -140,6 +142,7 @@ class BeautifulSoup(Tag): "__init__() got an unexpected keyword argument '%s'" % arg) if builder is None: + original_features = features if isinstance(features, basestring): features = [features] if features is None or len(features) == 0: @@ -151,6 +154,11 @@ class BeautifulSoup(Tag): "requested: %s. Do you need to install a parser library?" % ",".join(features)) builder = builder_class() + if not (original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES): + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( + parser=builder.NAME)) + self.builder = builder self.is_xml = builder.is_xml self.builder.soup = self @@ -178,6 +186,8 @@ class BeautifulSoup(Tag): # system. Just let it go. pass if is_file: + if isinstance(markup, unicode): + markup = markup.encode("utf8") warnings.warn( '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) if markup[:5] == "http:" or markup[:6] == "https:": @@ -185,6 +195,8 @@ class BeautifulSoup(Tag): # Python 3 otherwise. if ((isinstance(markup, bytes) and not b' ' in markup) or (isinstance(markup, unicode) and not u' ' in markup)): + if isinstance(markup, unicode): + markup = markup.encode("utf8") warnings.warn( '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py index 740f5f29..820bc801 100644 --- a/lib/bs4/builder/__init__.py +++ b/lib/bs4/builder/__init__.py @@ -80,6 +80,8 @@ builder_registry = TreeBuilderRegistry() class TreeBuilder(object): """Turn a document into a Beautiful Soup object tree.""" + NAME = "[Unknown tree builder]" + ALTERNATE_NAMES = [] features = [] is_xml = False diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py index 7de36ae7..60135752 100644 --- a/lib/bs4/builder/_html5lib.py +++ b/lib/bs4/builder/_html5lib.py @@ -22,7 +22,9 @@ from bs4.element import ( class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" - features = ['html5lib', PERMISSIVE, HTML_5, HTML] + NAME = "html5lib" + + features = [NAME, PERMISSIVE, HTML_5, HTML] def prepare_markup(self, markup, user_specified_encoding): # Store the user-specified encoding for use later on. @@ -161,6 +163,12 @@ class Element(html5lib.treebuilders._base.Node): # immediately after the parent, if it has no children.) if self.element.contents: most_recent_element = self.element._last_descendant(False) + elif self.element.next_element is not None: + # Something from further ahead in the parse tree is + # being inserted into this earlier element. This is + # very annoying because it means an expensive search + # for the last element in the tree. + most_recent_element = self.soup._last_descendant() else: most_recent_element = self.element diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py index ca8d8b89..7f3ae735 100644 --- a/lib/bs4/builder/_htmlparser.py +++ b/lib/bs4/builder/_htmlparser.py @@ -19,10 +19,8 @@ import warnings # At the end of this file, we monkeypatch HTMLParser so that # strict=True works well on Python 3.2.2. major, minor, release = sys.version_info[:3] -CONSTRUCTOR_TAKES_STRICT = ( - major > 3 - or (major == 3 and minor > 2) - or (major == 3 and minor == 2 and release >= 3)) +CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 from bs4.element import ( CData, @@ -63,7 +61,8 @@ class BeautifulSoupHTMLParser(HTMLParser): def handle_charref(self, name): # XXX workaround for a bug in HTMLParser. Remove this once - # it's fixed. + # it's fixed in all supported versions. + # http://bugs.python.org/issue13633 if name.startswith('x'): real_name = int(name.lstrip('x'), 16) elif name.startswith('X'): @@ -113,14 +112,6 @@ class BeautifulSoupHTMLParser(HTMLParser): def handle_pi(self, data): self.soup.endData() - if data.endswith("?") and data.lower().startswith("xml"): - # "An XHTML processing instruction using the trailing '?' - # will cause the '?' to be included in data." - HTMLParser - # docs. - # - # Strip the question mark so we don't end up with two - # question marks. - data = data[:-1] self.soup.handle_data(data) self.soup.endData(ProcessingInstruction) @@ -128,11 +119,14 @@ class BeautifulSoupHTMLParser(HTMLParser): class HTMLParserTreeBuilder(HTMLTreeBuilder): is_xml = False - features = [HTML, STRICT, HTMLPARSER] + NAME = HTMLPARSER + features = [NAME, HTML, STRICT] def __init__(self, *args, **kwargs): if CONSTRUCTOR_TAKES_STRICT: kwargs['strict'] = False + if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: + kwargs['convert_charrefs'] = False self.parser_args = (args, kwargs) def prepare_markup(self, markup, user_specified_encoding=None, diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py index fa5d4987..b0bc8a03 100644 --- a/lib/bs4/builder/_lxml.py +++ b/lib/bs4/builder/_lxml.py @@ -7,7 +7,12 @@ from io import BytesIO from StringIO import StringIO import collections from lxml import etree -from bs4.element import Comment, Doctype, NamespacedAttribute +from bs4.element import ( + Comment, + Doctype, + NamespacedAttribute, + ProcessingInstruction, +) from bs4.builder import ( FAST, HTML, @@ -25,8 +30,10 @@ class LXMLTreeBuilderForXML(TreeBuilder): is_xml = True + NAME = "lxml-xml" + # Well, it's permissive by XML parser standards. - features = [LXML, XML, FAST, PERMISSIVE] + features = [NAME, LXML, XML, FAST, PERMISSIVE] CHUNK_SIZE = 512 @@ -189,7 +196,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.nsmaps.pop() def pi(self, target, data): - pass + self.soup.endData() + self.soup.handle_data(target + ' ' + data) + self.soup.endData(ProcessingInstruction) def data(self, content): self.soup.handle_data(content) @@ -212,7 +221,10 @@ class LXMLTreeBuilderForXML(TreeBuilder): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): - features = [LXML, HTML, FAST, PERMISSIVE] + NAME = LXML + ALTERNATE_NAMES = ["lxml-html"] + + features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] is_xml = False def default_parser(self, encoding): diff --git a/lib/bs4/element.py b/lib/bs4/element.py index da9afdf4..ff716dfb 100644 --- a/lib/bs4/element.py +++ b/lib/bs4/element.py @@ -548,17 +548,17 @@ class PageElement(object): # Methods for supporting CSS selectors. - tag_name_re = re.compile('^[a-z0-9]+$') + tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') - # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ - # \---/ \---/\-------------/ \-------/ - # | | | | - # | | | The value - # | | ~,|,^,$,* or = - # | Attribute + # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ + # \---------------------------/ \---/\-------------/ \-------/ + # | | | | + # | | | The value + # | | ~,|,^,$,* or = + # | Attribute # Tag attribselect_re = re.compile( - r'^(?P\w+)?\[(?P\w+)(?P[=~\|\^\$\*]?)' + + r'^(?P[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P\w+)(?P[=~\|\^\$\*]?)' + r'=?"?(?P[^\]"]*)"?\]$' ) @@ -707,7 +707,7 @@ class CData(PreformattedString): class ProcessingInstruction(PreformattedString): PREFIX = u'' + SUFFIX = u'>' class Comment(PreformattedString): @@ -1203,192 +1203,206 @@ class Tag(PageElement): _select_debug = False def select(self, selector, _candidate_generator=None): """Perform a CSS selection operation on the current element.""" - tokens = selector.split() + + # Remove whitespace directly after the grouping operator ',' + # then split into tokens. + tokens = re.sub(',[\s]*',',', selector).split() current_context = [self] if tokens[-1] in self._selector_combinators: raise ValueError( 'Final combinator "%s" is missing an argument.' % tokens[-1]) + if self._select_debug: print 'Running CSS selector "%s"' % selector - for index, token in enumerate(tokens): - if self._select_debug: - print ' Considering token "%s"' % token - recursive_candidate_generator = None - tag_name = None + + for index, token_group in enumerate(tokens): + new_context = [] + new_context_ids = set([]) + + # Grouping selectors, ie: p,a + grouped_tokens = token_group.split(',') + if '' in grouped_tokens: + raise ValueError('Invalid group selection syntax: %s' % token_group) + if tokens[index-1] in self._selector_combinators: # This token was consumed by the previous combinator. Skip it. if self._select_debug: print ' Token was consumed by the previous combinator.' continue - # Each operation corresponds to a checker function, a rule - # for determining whether a candidate matches the - # selector. Candidates are generated by the active - # iterator. - checker = None - m = self.attribselect_re.match(token) - if m is not None: - # Attribute selector - tag_name, attribute, operator, value = m.groups() - checker = self._attribute_checker(operator, attribute, value) + for token in grouped_tokens: + if self._select_debug: + print ' Considering token "%s"' % token + recursive_candidate_generator = None + tag_name = None - elif '#' in token: - # ID selector - tag_name, tag_id = token.split('#', 1) - def id_matches(tag): - return tag.get('id', None) == tag_id - checker = id_matches + # Each operation corresponds to a checker function, a rule + # for determining whether a candidate matches the + # selector. Candidates are generated by the active + # iterator. + checker = None - elif '.' in token: - # Class selector - tag_name, klass = token.split('.', 1) - classes = set(klass.split('.')) - def classes_match(candidate): - return classes.issubset(candidate.get('class', [])) - checker = classes_match + m = self.attribselect_re.match(token) + if m is not None: + # Attribute selector + tag_name, attribute, operator, value = m.groups() + checker = self._attribute_checker(operator, attribute, value) - elif ':' in token: - # Pseudo-class - tag_name, pseudo = token.split(':', 1) - if tag_name == '': - raise ValueError( - "A pseudo-class must be prefixed with a tag name.") - pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) - found = [] - if pseudo_attributes is not None: - pseudo_type, pseudo_value = pseudo_attributes.groups() - if pseudo_type == 'nth-of-type': - try: - pseudo_value = int(pseudo_value) - except: + elif '#' in token: + # ID selector + tag_name, tag_id = token.split('#', 1) + def id_matches(tag): + return tag.get('id', None) == tag_id + checker = id_matches + + elif '.' in token: + # Class selector + tag_name, klass = token.split('.', 1) + classes = set(klass.split('.')) + def classes_match(candidate): + return classes.issubset(candidate.get('class', [])) + checker = classes_match + + elif ':' in token: + # Pseudo-class + tag_name, pseudo = token.split(':', 1) + if tag_name == '': + raise ValueError( + "A pseudo-class must be prefixed with a tag name.") + pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) + found = [] + if pseudo_attributes is not None: + pseudo_type, pseudo_value = pseudo_attributes.groups() + if pseudo_type == 'nth-of-type': + try: + pseudo_value = int(pseudo_value) + except: + raise NotImplementedError( + 'Only numeric values are currently supported for the nth-of-type pseudo-class.') + if pseudo_value < 1: + raise ValueError( + 'nth-of-type pseudo-class value must be at least 1.') + class Counter(object): + def __init__(self, destination): + self.count = 0 + self.destination = destination + + def nth_child_of_type(self, tag): + self.count += 1 + if self.count == self.destination: + return True + if self.count > self.destination: + # Stop the generator that's sending us + # these things. + raise StopIteration() + return False + checker = Counter(pseudo_value).nth_child_of_type + else: raise NotImplementedError( - 'Only numeric values are currently supported for the nth-of-type pseudo-class.') - if pseudo_value < 1: - raise ValueError( - 'nth-of-type pseudo-class value must be at least 1.') - class Counter(object): - def __init__(self, destination): - self.count = 0 - self.destination = destination + 'Only the following pseudo-classes are implemented: nth-of-type.') - def nth_child_of_type(self, tag): - self.count += 1 - if self.count == self.destination: - return True - if self.count > self.destination: - # Stop the generator that's sending us - # these things. - raise StopIteration() - return False - checker = Counter(pseudo_value).nth_child_of_type - else: - raise NotImplementedError( - 'Only the following pseudo-classes are implemented: nth-of-type.') + elif token == '*': + # Star selector -- matches everything + pass + elif token == '>': + # Run the next token as a CSS selector against the + # direct children of each tag in the current context. + recursive_candidate_generator = lambda tag: tag.children + elif token == '~': + # Run the next token as a CSS selector against the + # siblings of each tag in the current context. + recursive_candidate_generator = lambda tag: tag.next_siblings + elif token == '+': + # For each tag in the current context, run the next + # token as a CSS selector against the tag's next + # sibling that's a tag. + def next_tag_sibling(tag): + yield tag.find_next_sibling(True) + recursive_candidate_generator = next_tag_sibling - elif token == '*': - # Star selector -- matches everything - pass - elif token == '>': - # Run the next token as a CSS selector against the - # direct children of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.children - elif token == '~': - # Run the next token as a CSS selector against the - # siblings of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.next_siblings - elif token == '+': - # For each tag in the current context, run the next - # token as a CSS selector against the tag's next - # sibling that's a tag. - def next_tag_sibling(tag): - yield tag.find_next_sibling(True) - recursive_candidate_generator = next_tag_sibling - - elif self.tag_name_re.match(token): - # Just a tag name. - tag_name = token - else: - raise ValueError( - 'Unsupported or invalid CSS selector: "%s"' % token) - - if recursive_candidate_generator: - # This happens when the selector looks like "> foo". - # - # The generator calls select() recursively on every - # member of the current context, passing in a different - # candidate generator and a different selector. - # - # In the case of "> foo", the candidate generator is - # one that yields a tag's direct children (">"), and - # the selector is "foo". - next_token = tokens[index+1] - def recursive_select(tag): - if self._select_debug: - print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) - print '-' * 40 - for i in tag.select(next_token, recursive_candidate_generator): - if self._select_debug: - print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) - yield i - if self._select_debug: - print '-' * 40 - _use_candidate_generator = recursive_select - elif _candidate_generator is None: - # By default, a tag's candidates are all of its - # children. If tag_name is defined, only yield tags - # with that name. - if self._select_debug: - if tag_name: - check = "[any]" - else: - check = tag_name - print ' Default candidate generator, tag name="%s"' % check - if self._select_debug: - # This is redundant with later code, but it stops - # a bunch of bogus tags from cluttering up the - # debug log. - def default_candidate_generator(tag): - for child in tag.descendants: - if not isinstance(child, Tag): - continue - if tag_name and not child.name == tag_name: - continue - yield child - _use_candidate_generator = default_candidate_generator + elif self.tag_name_re.match(token): + # Just a tag name. + tag_name = token else: - _use_candidate_generator = lambda tag: tag.descendants - else: - _use_candidate_generator = _candidate_generator - - new_context = [] - new_context_ids = set([]) - for tag in current_context: - if self._select_debug: - print " Running candidate generator on %s %s" % ( - tag.name, repr(tag.attrs)) - for candidate in _use_candidate_generator(tag): - if not isinstance(candidate, Tag): - continue - if tag_name and candidate.name != tag_name: - continue - if checker is not None: - try: - result = checker(candidate) - except StopIteration: - # The checker has decided we should no longer - # run the generator. - break - if checker is None or result: + raise ValueError( + 'Unsupported or invalid CSS selector: "%s"' % token) + if recursive_candidate_generator: + # This happens when the selector looks like "> foo". + # + # The generator calls select() recursively on every + # member of the current context, passing in a different + # candidate generator and a different selector. + # + # In the case of "> foo", the candidate generator is + # one that yields a tag's direct children (">"), and + # the selector is "foo". + next_token = tokens[index+1] + def recursive_select(tag): if self._select_debug: - print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) - if id(candidate) not in new_context_ids: - # If a tag matches a selector more than once, - # don't include it in the context more than once. - new_context.append(candidate) - new_context_ids.add(id(candidate)) - elif self._select_debug: - print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) + print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) + print '-' * 40 + for i in tag.select(next_token, recursive_candidate_generator): + if self._select_debug: + print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) + yield i + if self._select_debug: + print '-' * 40 + _use_candidate_generator = recursive_select + elif _candidate_generator is None: + # By default, a tag's candidates are all of its + # children. If tag_name is defined, only yield tags + # with that name. + if self._select_debug: + if tag_name: + check = "[any]" + else: + check = tag_name + print ' Default candidate generator, tag name="%s"' % check + if self._select_debug: + # This is redundant with later code, but it stops + # a bunch of bogus tags from cluttering up the + # debug log. + def default_candidate_generator(tag): + for child in tag.descendants: + if not isinstance(child, Tag): + continue + if tag_name and not child.name == tag_name: + continue + yield child + _use_candidate_generator = default_candidate_generator + else: + _use_candidate_generator = lambda tag: tag.descendants + else: + _use_candidate_generator = _candidate_generator + + for tag in current_context: + if self._select_debug: + print " Running candidate generator on %s %s" % ( + tag.name, repr(tag.attrs)) + for candidate in _use_candidate_generator(tag): + if not isinstance(candidate, Tag): + continue + if tag_name and candidate.name != tag_name: + continue + if checker is not None: + try: + result = checker(candidate) + except StopIteration: + # The checker has decided we should no longer + # run the generator. + break + if checker is None or result: + if self._select_debug: + print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) + if id(candidate) not in new_context_ids: + # If a tag matches a selector more than once, + # don't include it in the context more than once. + new_context.append(candidate) + new_context_ids.add(id(candidate)) + elif self._select_debug: + print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) + current_context = new_context diff --git a/lib/bs4/testing.py b/lib/bs4/testing.py deleted file mode 100644 index fd4495ac..00000000 --- a/lib/bs4/testing.py +++ /dev/null @@ -1,592 +0,0 @@ -"""Helper classes for tests.""" - -import copy -import functools -import unittest -from unittest import TestCase -from bs4 import BeautifulSoup -from bs4.element import ( - CharsetMetaAttributeValue, - Comment, - ContentMetaAttributeValue, - Doctype, - SoupStrainer, -) - -from bs4.builder import HTMLParserTreeBuilder -default_builder = HTMLParserTreeBuilder - - -class SoupTest(unittest.TestCase): - - @property - def default_builder(self): - return default_builder() - - def soup(self, markup, **kwargs): - """Build a Beautiful Soup object from markup.""" - builder = kwargs.pop('builder', self.default_builder) - return BeautifulSoup(markup, builder=builder, **kwargs) - - def document_for(self, markup): - """Turn an HTML fragment into a document. - - The details depend on the builder. - """ - return self.default_builder.test_fragment_to_document(markup) - - def assertSoupEquals(self, to_parse, compare_parsed_to=None): - builder = self.default_builder - obj = BeautifulSoup(to_parse, builder=builder) - if compare_parsed_to is None: - compare_parsed_to = to_parse - - self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) - - -class HTMLTreeBuilderSmokeTest(object): - - """A basic test of a treebuilder's competence. - - Any HTML treebuilder, present or future, should be able to pass - these tests. With invalid markup, there's room for interpretation, - and different parsers can handle it differently. But with the - markup in these tests, there's not much room for interpretation. - """ - - def assertDoctypeHandled(self, doctype_fragment): - """Assert that a given doctype string is handled correctly.""" - doctype_str, soup = self._document_with_doctype(doctype_fragment) - - # Make sure a Doctype object was created. - doctype = soup.contents[0] - self.assertEqual(doctype.__class__, Doctype) - self.assertEqual(doctype, doctype_fragment) - self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) - - # Make sure that the doctype was correctly associated with the - # parse tree and that the rest of the document parsed. - self.assertEqual(soup.p.contents[0], 'foo') - - def _document_with_doctype(self, doctype_fragment): - """Generate and parse a document with the given doctype.""" - doctype = '' % doctype_fragment - markup = doctype + '\n

foo

' - soup = self.soup(markup) - return doctype, soup - - def test_normal_doctypes(self): - """Make sure normal, everyday HTML doctypes are handled correctly.""" - self.assertDoctypeHandled("html") - self.assertDoctypeHandled( - 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') - - def test_empty_doctype(self): - soup = self.soup("") - doctype = soup.contents[0] - self.assertEqual("", doctype.strip()) - - def test_public_doctype_with_url(self): - doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' - self.assertDoctypeHandled(doctype) - - def test_system_doctype(self): - self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') - - def test_namespaced_system_doctype(self): - # We can handle a namespaced doctype with a system ID. - self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') - - def test_namespaced_public_doctype(self): - # Test a namespaced doctype with a public id. - self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') - - def test_real_xhtml_document(self): - """A real XHTML document should come out more or less the same as it went in.""" - markup = b""" - - -Hello. -Goodbye. -""" - soup = self.soup(markup) - self.assertEqual( - soup.encode("utf-8").replace(b"\n", b""), - markup.replace(b"\n", b"")) - - def test_deepcopy(self): - """Make sure you can copy the tree builder. - - This is important because the builder is part of a - BeautifulSoup object, and we want to be able to copy that. - """ - copy.deepcopy(self.default_builder) - - def test_p_tag_is_never_empty_element(self): - """A

tag is never designated as an empty-element tag. - - Even if the markup shows it as an empty-element tag, it - shouldn't be presented that way. - """ - soup = self.soup("

") - self.assertFalse(soup.p.is_empty_element) - self.assertEqual(str(soup.p), "

") - - def test_unclosed_tags_get_closed(self): - """A tag that's not closed by the end of the document should be closed. - - This applies to all tags except empty-element tags. - """ - self.assertSoupEquals("

", "

") - self.assertSoupEquals("", "") - - self.assertSoupEquals("
", "
") - - def test_br_is_always_empty_element_tag(self): - """A
tag is designated as an empty-element tag. - - Some parsers treat

as one
tag, some parsers as - two tags, but it should always be an empty-element tag. - """ - soup = self.soup("

") - self.assertTrue(soup.br.is_empty_element) - self.assertEqual(str(soup.br), "
") - - def test_nested_formatting_elements(self): - self.assertSoupEquals("") - - def test_comment(self): - # Comments are represented as Comment objects. - markup = "

foobaz

" - self.assertSoupEquals(markup) - - soup = self.soup(markup) - comment = soup.find(text="foobar") - self.assertEqual(comment.__class__, Comment) - - # The comment is properly integrated into the tree. - foo = soup.find(text="foo") - self.assertEqual(comment, foo.next_element) - baz = soup.find(text="baz") - self.assertEqual(comment, baz.previous_element) - - def test_preserved_whitespace_in_pre_and_textarea(self): - """Whitespace must be preserved in
 and ")
-
-    def test_nested_inline_elements(self):
-        """Inline elements can be nested indefinitely."""
-        b_tag = "Inside a B tag"
-        self.assertSoupEquals(b_tag)
-
-        nested_b_tag = "

A nested tag

" - self.assertSoupEquals(nested_b_tag) - - double_nested_b_tag = "

A doubly nested tag

" - self.assertSoupEquals(nested_b_tag) - - def test_nested_block_level_elements(self): - """Block elements can be nested.""" - soup = self.soup('

Foo

') - blockquote = soup.blockquote - self.assertEqual(blockquote.p.b.string, 'Foo') - self.assertEqual(blockquote.b.string, 'Foo') - - def test_correctly_nested_tables(self): - """One table can go inside another one.""" - markup = ('' - '' - "') - - self.assertSoupEquals( - markup, - '
Here's another table:" - '' - '' - '
foo
Here\'s another table:' - '
foo
' - '
') - - self.assertSoupEquals( - "" - "" - "
Foo
Bar
Baz
") - - def test_deeply_nested_multivalued_attribute(self): - # html5lib can set the attributes of the same tag many times - # as it rearranges the tree. This has caused problems with - # multivalued attributes. - markup = '
' - soup = self.soup(markup) - self.assertEqual(["css"], soup.div.div['class']) - - def test_angle_brackets_in_attribute_values_are_escaped(self): - self.assertSoupEquals('', '') - - def test_entities_in_attributes_converted_to_unicode(self): - expect = u'

' - self.assertSoupEquals('

', expect) - self.assertSoupEquals('

', expect) - self.assertSoupEquals('

', expect) - self.assertSoupEquals('

', expect) - - def test_entities_in_text_converted_to_unicode(self): - expect = u'

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' - self.assertSoupEquals("

piñata

", expect) - self.assertSoupEquals("

piñata

", expect) - self.assertSoupEquals("

piñata

", expect) - self.assertSoupEquals("

piñata

", expect) - - def test_quot_entity_converted_to_quotation_mark(self): - self.assertSoupEquals("

I said "good day!"

", - '

I said "good day!"

') - - def test_out_of_range_entity(self): - expect = u"\N{REPLACEMENT CHARACTER}" - self.assertSoupEquals("�", expect) - self.assertSoupEquals("�", expect) - self.assertSoupEquals("�", expect) - - def test_multipart_strings(self): - "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." - soup = self.soup("

\nfoo

") - self.assertEqual("p", soup.h2.string.next_element.name) - self.assertEqual("p", soup.p.name) - - def test_basic_namespaces(self): - """Parsers don't need to *understand* namespaces, but at the - very least they should not choke on namespaces or lose - data.""" - - markup = b'4' - soup = self.soup(markup) - self.assertEqual(markup, soup.encode()) - html = soup.html - self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) - self.assertEqual( - 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) - self.assertEqual( - 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) - - def test_multivalued_attribute_value_becomes_list(self): - markup = b'' - soup = self.soup(markup) - self.assertEqual(['foo', 'bar'], soup.a['class']) - - # - # Generally speaking, tests below this point are more tests of - # Beautiful Soup than tests of the tree builders. But parsers are - # weird, so we run these tests separately for every tree builder - # to detect any differences between them. - # - - def test_can_parse_unicode_document(self): - # A seemingly innocuous document... but it's in Unicode! And - # it contains characters that can't be represented in the - # encoding found in the declaration! The horror! - markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' - soup = self.soup(markup) - self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) - - def test_soupstrainer(self): - """Parsers should be able to work with SoupStrainers.""" - strainer = SoupStrainer("b") - soup = self.soup("A bold statement", - parse_only=strainer) - self.assertEqual(soup.decode(), "bold") - - def test_single_quote_attribute_values_become_double_quotes(self): - self.assertSoupEquals("", - '') - - def test_attribute_values_with_nested_quotes_are_left_alone(self): - text = """a""" - self.assertSoupEquals(text) - - def test_attribute_values_with_double_nested_quotes_get_quoted(self): - text = """a""" - soup = self.soup(text) - soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' - self.assertSoupEquals( - soup.foo.decode(), - """a""") - - def test_ampersand_in_attribute_value_gets_escaped(self): - self.assertSoupEquals('', - '') - - self.assertSoupEquals( - 'foo', - 'foo') - - def test_escaped_ampersand_in_attribute_value_is_left_alone(self): - self.assertSoupEquals('') - - def test_entities_in_strings_converted_during_parsing(self): - # Both XML and HTML entities are converted to Unicode characters - # during parsing. - text = "

<<sacré bleu!>>

" - expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" - self.assertSoupEquals(text, expected) - - def test_smart_quotes_converted_on_the_way_in(self): - # Microsoft smart quotes are converted to Unicode characters during - # parsing. - quote = b"

\x91Foo\x92

" - soup = self.soup(quote) - self.assertEqual( - soup.p.string, - u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") - - def test_non_breaking_spaces_converted_on_the_way_in(self): - soup = self.soup("  ") - self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) - - def test_entities_converted_on_the_way_out(self): - text = "

<<sacré bleu!>>

" - expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") - soup = self.soup(text) - self.assertEqual(soup.p.encode("utf-8"), expected) - - def test_real_iso_latin_document(self): - # Smoke test of interrelated functionality, using an - # easy-to-understand document. - - # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. - unicode_html = u'

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' - - # That's because we're going to encode it into ISO-Latin-1, and use - # that to test. - iso_latin_html = unicode_html.encode("iso-8859-1") - - # Parse the ISO-Latin-1 HTML. - soup = self.soup(iso_latin_html) - # Encode it to UTF-8. - result = soup.encode("utf-8") - - # What do we expect the result to look like? Well, it would - # look like unicode_html, except that the META tag would say - # UTF-8 instead of ISO-Latin-1. - expected = unicode_html.replace("ISO-Latin-1", "utf-8") - - # And, of course, it would be in UTF-8, not Unicode. - expected = expected.encode("utf-8") - - # Ta-da! - self.assertEqual(result, expected) - - def test_real_shift_jis_document(self): - # Smoke test to make sure the parser can handle a document in - # Shift-JIS encoding, without choking. - shift_jis_html = ( - b'
'
-            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
-            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
-            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
-            b'
') - unicode_html = shift_jis_html.decode("shift-jis") - soup = self.soup(unicode_html) - - # Make sure the parse tree is correctly encoded to various - # encodings. - self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) - self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) - - def test_real_hebrew_document(self): - # A real-world test to make sure we can convert ISO-8859-9 (a - # Hebrew encoding) to UTF-8. - hebrew_document = b'Hebrew (ISO 8859-8) in Visual Directionality

Hebrew (ISO 8859-8) in Visual Directionality

\xed\xe5\xec\xf9' - soup = self.soup( - hebrew_document, from_encoding="iso8859-8") - self.assertEqual(soup.original_encoding, 'iso8859-8') - self.assertEqual( - soup.encode('utf-8'), - hebrew_document.decode("iso8859-8").encode("utf-8")) - - def test_meta_tag_reflects_current_encoding(self): - # Here's the tag saying that a document is - # encoded in Shift-JIS. - meta_tag = ('') - - # Here's a document incorporating that meta tag. - shift_jis_html = ( - '\n%s\n' - '' - 'Shift-JIS markup goes here.') % meta_tag - soup = self.soup(shift_jis_html) - - # Parse the document, and the charset is seemingly unaffected. - parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) - content = parsed_meta['content'] - self.assertEqual('text/html; charset=x-sjis', content) - - # But that value is actually a ContentMetaAttributeValue object. - self.assertTrue(isinstance(content, ContentMetaAttributeValue)) - - # And it will take on a value that reflects its current - # encoding. - self.assertEqual('text/html; charset=utf8', content.encode("utf8")) - - # For the rest of the story, see TestSubstitutions in - # test_tree.py. - - def test_html5_style_meta_tag_reflects_current_encoding(self): - # Here's the tag saying that a document is - # encoded in Shift-JIS. - meta_tag = ('') - - # Here's a document incorporating that meta tag. - shift_jis_html = ( - '\n%s\n' - '' - 'Shift-JIS markup goes here.') % meta_tag - soup = self.soup(shift_jis_html) - - # Parse the document, and the charset is seemingly unaffected. - parsed_meta = soup.find('meta', id="encoding") - charset = parsed_meta['charset'] - self.assertEqual('x-sjis', charset) - - # But that value is actually a CharsetMetaAttributeValue object. - self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) - - # And it will take on a value that reflects its current - # encoding. - self.assertEqual('utf8', charset.encode("utf8")) - - def test_tag_with_no_attributes_can_have_attributes_added(self): - data = self.soup("text") - data.a['foo'] = 'bar' - self.assertEqual('text', data.a.decode()) - -class XMLTreeBuilderSmokeTest(object): - - def test_docstring_generated(self): - soup = self.soup("") - self.assertEqual( - soup.encode(), b'\n') - - def test_real_xhtml_document(self): - """A real XHTML document should come out *exactly* the same as it went in.""" - markup = b""" - - -Hello. -Goodbye. -""" - soup = self.soup(markup) - self.assertEqual( - soup.encode("utf-8"), markup) - - def test_formatter_processes_script_tag_for_xml_documents(self): - doc = """ - -""" - soup = BeautifulSoup(doc, "xml") - # lxml would have stripped this while parsing, but we can add - # it later. - soup.script.string = 'console.log("< < hey > > ");' - encoded = soup.encode() - self.assertTrue(b"< < hey > >" in encoded) - - def test_can_parse_unicode_document(self): - markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' - soup = self.soup(markup) - self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) - - def test_popping_namespaced_tag(self): - markup = 'b2012-07-02T20:33:42Zcd' - soup = self.soup(markup) - self.assertEqual( - unicode(soup.rss), markup) - - def test_docstring_includes_correct_encoding(self): - soup = self.soup("") - self.assertEqual( - soup.encode("latin1"), - b'\n') - - def test_large_xml_document(self): - """A large XML document should come out the same as it went in.""" - markup = (b'\n' - + b'0' * (2**12) - + b'') - soup = self.soup(markup) - self.assertEqual(soup.encode("utf-8"), markup) - - - def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): - self.assertSoupEquals("

", "

") - self.assertSoupEquals("

foo

") - - def test_namespaces_are_preserved(self): - markup = 'This tag is in the a namespaceThis tag is in the b namespace' - soup = self.soup(markup) - root = soup.root - self.assertEqual("http://example.com/", root['xmlns:a']) - self.assertEqual("http://example.net/", root['xmlns:b']) - - def test_closing_namespaced_tag(self): - markup = '

20010504

' - soup = self.soup(markup) - self.assertEqual(unicode(soup.p), markup) - - def test_namespaced_attributes(self): - markup = '' - soup = self.soup(markup) - self.assertEqual(unicode(soup.foo), markup) - - def test_namespaced_attributes_xml_namespace(self): - markup = 'bar' - soup = self.soup(markup) - self.assertEqual(unicode(soup.foo), markup) - -class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): - """Smoke test for a tree builder that supports HTML5.""" - - def test_real_xhtml_document(self): - # Since XHTML is not HTML5, HTML5 parsers are not tested to handle - # XHTML documents in any particular way. - pass - - def test_html_tags_have_namespace(self): - markup = "" - soup = self.soup(markup) - self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) - - def test_svg_tags_have_namespace(self): - markup = '' - soup = self.soup(markup) - namespace = "http://www.w3.org/2000/svg" - self.assertEqual(namespace, soup.svg.namespace) - self.assertEqual(namespace, soup.circle.namespace) - - - def test_mathml_tags_have_namespace(self): - markup = '5' - soup = self.soup(markup) - namespace = 'http://www.w3.org/1998/Math/MathML' - self.assertEqual(namespace, soup.math.namespace) - self.assertEqual(namespace, soup.msqrt.namespace) - - def test_xml_declaration_becomes_comment(self): - markup = '' - soup = self.soup(markup) - self.assertTrue(isinstance(soup.contents[0], Comment)) - self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') - self.assertEqual("html", soup.contents[0].next_element.name) - -def skipIf(condition, reason): - def nothing(test, *args, **kwargs): - return None - - def decorator(test_item): - if condition: - return nothing - else: - return test_item - - return decorator diff --git a/lib/bs4/tests/__init__.py b/lib/bs4/tests/__init__.py deleted file mode 100644 index 142c8cc3..00000000 --- a/lib/bs4/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"The beautifulsoup tests." diff --git a/lib/bs4/tests/test_builder_registry.py b/lib/bs4/tests/test_builder_registry.py deleted file mode 100644 index 92ad10fb..00000000 --- a/lib/bs4/tests/test_builder_registry.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Tests of the builder registry.""" - -import unittest - -from bs4 import BeautifulSoup -from bs4.builder import ( - builder_registry as registry, - HTMLParserTreeBuilder, - TreeBuilderRegistry, -) - -try: - from bs4.builder import HTML5TreeBuilder - HTML5LIB_PRESENT = True -except ImportError: - HTML5LIB_PRESENT = False - -try: - from bs4.builder import ( - LXMLTreeBuilderForXML, - LXMLTreeBuilder, - ) - LXML_PRESENT = True -except ImportError: - LXML_PRESENT = False - - -class BuiltInRegistryTest(unittest.TestCase): - """Test the built-in registry with the default builders registered.""" - - def test_combination(self): - if LXML_PRESENT: - self.assertEqual(registry.lookup('fast', 'html'), - LXMLTreeBuilder) - - if LXML_PRESENT: - self.assertEqual(registry.lookup('permissive', 'xml'), - LXMLTreeBuilderForXML) - self.assertEqual(registry.lookup('strict', 'html'), - HTMLParserTreeBuilder) - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html5lib', 'html'), - HTML5TreeBuilder) - - def test_lookup_by_markup_type(self): - if LXML_PRESENT: - self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) - self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) - else: - self.assertEqual(registry.lookup('xml'), None) - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) - else: - self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) - - def test_named_library(self): - if LXML_PRESENT: - self.assertEqual(registry.lookup('lxml', 'xml'), - LXMLTreeBuilderForXML) - self.assertEqual(registry.lookup('lxml', 'html'), - LXMLTreeBuilder) - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html5lib'), - HTML5TreeBuilder) - - self.assertEqual(registry.lookup('html.parser'), - HTMLParserTreeBuilder) - - def test_beautifulsoup_constructor_does_lookup(self): - # You can pass in a string. - BeautifulSoup("", features="html") - # Or a list of strings. - BeautifulSoup("", features=["html", "fast"]) - - # You'll get an exception if BS can't find an appropriate - # builder. - self.assertRaises(ValueError, BeautifulSoup, - "", features="no-such-feature") - -class RegistryTest(unittest.TestCase): - """Test the TreeBuilderRegistry class in general.""" - - def setUp(self): - self.registry = TreeBuilderRegistry() - - def builder_for_features(self, *feature_list): - cls = type('Builder_' + '_'.join(feature_list), - (object,), {'features' : feature_list}) - - self.registry.register(cls) - return cls - - def test_register_with_no_features(self): - builder = self.builder_for_features() - - # Since the builder advertises no features, you can't find it - # by looking up features. - self.assertEqual(self.registry.lookup('foo'), None) - - # But you can find it by doing a lookup with no features, if - # this happens to be the only registered builder. - self.assertEqual(self.registry.lookup(), builder) - - def test_register_with_features_makes_lookup_succeed(self): - builder = self.builder_for_features('foo', 'bar') - self.assertEqual(self.registry.lookup('foo'), builder) - self.assertEqual(self.registry.lookup('bar'), builder) - - def test_lookup_fails_when_no_builder_implements_feature(self): - builder = self.builder_for_features('foo', 'bar') - self.assertEqual(self.registry.lookup('baz'), None) - - def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): - builder1 = self.builder_for_features('foo') - builder2 = self.builder_for_features('bar') - self.assertEqual(self.registry.lookup(), builder2) - - def test_lookup_fails_when_no_tree_builders_registered(self): - self.assertEqual(self.registry.lookup(), None) - - def test_lookup_gets_most_recent_builder_supporting_all_features(self): - has_one = self.builder_for_features('foo') - has_the_other = self.builder_for_features('bar') - has_both_early = self.builder_for_features('foo', 'bar', 'baz') - has_both_late = self.builder_for_features('foo', 'bar', 'quux') - lacks_one = self.builder_for_features('bar') - has_the_other = self.builder_for_features('foo') - - # There are two builders featuring 'foo' and 'bar', but - # the one that also features 'quux' was registered later. - self.assertEqual(self.registry.lookup('foo', 'bar'), - has_both_late) - - # There is only one builder featuring 'foo', 'bar', and 'baz'. - self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), - has_both_early) - - def test_lookup_fails_when_cannot_reconcile_requested_features(self): - builder1 = self.builder_for_features('foo', 'bar') - builder2 = self.builder_for_features('foo', 'baz') - self.assertEqual(self.registry.lookup('bar', 'baz'), None) diff --git a/lib/bs4/tests/test_docs.py b/lib/bs4/tests/test_docs.py deleted file mode 100644 index 5b9f6770..00000000 --- a/lib/bs4/tests/test_docs.py +++ /dev/null @@ -1,36 +0,0 @@ -"Test harness for doctests." - -# pylint: disable-msg=E0611,W0142 - -__metaclass__ = type -__all__ = [ - 'additional_tests', - ] - -import atexit -import doctest -import os -#from pkg_resources import ( -# resource_filename, resource_exists, resource_listdir, cleanup_resources) -import unittest - -DOCTEST_FLAGS = ( - doctest.ELLIPSIS | - doctest.NORMALIZE_WHITESPACE | - doctest.REPORT_NDIFF) - - -# def additional_tests(): -# "Run the doc tests (README.txt and docs/*, if any exist)" -# doctest_files = [ -# os.path.abspath(resource_filename('bs4', 'README.txt'))] -# if resource_exists('bs4', 'docs'): -# for name in resource_listdir('bs4', 'docs'): -# if name.endswith('.txt'): -# doctest_files.append( -# os.path.abspath( -# resource_filename('bs4', 'docs/%s' % name))) -# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) -# atexit.register(cleanup_resources) -# return unittest.TestSuite(( -# doctest.DocFileSuite(*doctest_files, **kwargs))) diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py deleted file mode 100644 index 594c3e1f..00000000 --- a/lib/bs4/tests/test_html5lib.py +++ /dev/null @@ -1,85 +0,0 @@ -"""Tests to ensure that the html5lib tree builder generates good trees.""" - -import warnings - -try: - from bs4.builder import HTML5TreeBuilder - HTML5LIB_PRESENT = True -except ImportError, e: - HTML5LIB_PRESENT = False -from bs4.element import SoupStrainer -from bs4.testing import ( - HTML5TreeBuilderSmokeTest, - SoupTest, - skipIf, -) - -@skipIf( - not HTML5LIB_PRESENT, - "html5lib seems not to be present, not testing its tree builder.") -class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): - """See ``HTML5TreeBuilderSmokeTest``.""" - - @property - def default_builder(self): - return HTML5TreeBuilder() - - def test_soupstrainer(self): - # The html5lib tree builder does not support SoupStrainers. - strainer = SoupStrainer("b") - markup = "

A bold statement.

" - with warnings.catch_warnings(record=True) as w: - soup = self.soup(markup, parse_only=strainer) - self.assertEqual( - soup.decode(), self.document_for(markup)) - - self.assertTrue( - "the html5lib tree builder doesn't support parse_only" in - str(w[0].message)) - - def test_correctly_nested_tables(self): - """html5lib inserts tags where other parsers don't.""" - markup = ('' - '' - "') - - self.assertSoupEquals( - markup, - '
Here's another table:" - '' - '' - '
foo
Here\'s another table:' - '
foo
' - '
') - - self.assertSoupEquals( - "" - "" - "
Foo
Bar
Baz
") - - def test_xml_declaration_followed_by_doctype(self): - markup = ''' - - - - - -

foo

- -''' - soup = self.soup(markup) - # Verify that we can reach the

tag; this means the tree is connected. - self.assertEqual(b"

foo

", soup.p.encode()) - - def test_reparented_markup(self): - markup = '

foo

\n

bar

' - soup = self.soup(markup) - self.assertEqual(u"

foo

\n

bar

", soup.body.decode()) - self.assertEqual(2, len(soup.find_all('p'))) - - - def test_reparented_markup_ends_with_whitespace(self): - markup = '

foo

\n

bar

\n' - soup = self.soup(markup) - self.assertEqual(u"

foo

\n

bar

\n", soup.body.decode()) - self.assertEqual(2, len(soup.find_all('p'))) diff --git a/lib/bs4/tests/test_htmlparser.py b/lib/bs4/tests/test_htmlparser.py deleted file mode 100644 index bcb5ed23..00000000 --- a/lib/bs4/tests/test_htmlparser.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Tests to ensure that the html.parser tree builder generates good -trees.""" - -from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest -from bs4.builder import HTMLParserTreeBuilder - -class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): - - @property - def default_builder(self): - return HTMLParserTreeBuilder() - - def test_namespaced_system_doctype(self): - # html.parser can't handle namespaced doctypes, so skip this one. - pass - - def test_namespaced_public_doctype(self): - # html.parser can't handle namespaced doctypes, so skip this one. - pass diff --git a/lib/bs4/tests/test_lxml.py b/lib/bs4/tests/test_lxml.py deleted file mode 100644 index 2b2e9b7e..00000000 --- a/lib/bs4/tests/test_lxml.py +++ /dev/null @@ -1,91 +0,0 @@ -"""Tests to ensure that the lxml tree builder generates good trees.""" - -import re -import warnings - -try: - import lxml.etree - LXML_PRESENT = True - LXML_VERSION = lxml.etree.LXML_VERSION -except ImportError, e: - LXML_PRESENT = False - LXML_VERSION = (0,) - -if LXML_PRESENT: - from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML - -from bs4 import ( - BeautifulSoup, - BeautifulStoneSoup, - ) -from bs4.element import Comment, Doctype, SoupStrainer -from bs4.testing import skipIf -from bs4.tests import test_htmlparser -from bs4.testing import ( - HTMLTreeBuilderSmokeTest, - XMLTreeBuilderSmokeTest, - SoupTest, - skipIf, -) - -@skipIf( - not LXML_PRESENT, - "lxml seems not to be present, not testing its tree builder.") -class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): - """See ``HTMLTreeBuilderSmokeTest``.""" - - @property - def default_builder(self): - return LXMLTreeBuilder() - - def test_out_of_range_entity(self): - self.assertSoupEquals( - "

foo�bar

", "

foobar

") - self.assertSoupEquals( - "

foo�bar

", "

foobar

") - self.assertSoupEquals( - "

foo�bar

", "

foobar

") - - # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this - # test if an old version of lxml is installed. - - @skipIf( - not LXML_PRESENT or LXML_VERSION < (2,3,5,0), - "Skipping doctype test for old version of lxml to avoid segfault.") - def test_empty_doctype(self): - soup = self.soup("") - doctype = soup.contents[0] - self.assertEqual("", doctype.strip()) - - def test_beautifulstonesoup_is_xml_parser(self): - # Make sure that the deprecated BSS class uses an xml builder - # if one is installed. - with warnings.catch_warnings(record=True) as w: - soup = BeautifulStoneSoup("") - self.assertEqual(u"", unicode(soup.b)) - self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) - - def test_real_xhtml_document(self): - """lxml strips the XML definition from an XHTML doc, which is fine.""" - markup = b""" - - -Hello. -Goodbye. -""" - soup = self.soup(markup) - self.assertEqual( - soup.encode("utf-8").replace(b"\n", b''), - markup.replace(b'\n', b'').replace( - b'', b'')) - - -@skipIf( - not LXML_PRESENT, - "lxml seems not to be present, not testing its XML tree builder.") -class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): - """See ``HTMLTreeBuilderSmokeTest``.""" - - @property - def default_builder(self): - return LXMLTreeBuilderForXML() diff --git a/lib/bs4/tests/test_soup.py b/lib/bs4/tests/test_soup.py deleted file mode 100644 index 47ac245f..00000000 --- a/lib/bs4/tests/test_soup.py +++ /dev/null @@ -1,434 +0,0 @@ -# -*- coding: utf-8 -*- -"""Tests of Beautiful Soup as a whole.""" - -import logging -import unittest -import sys -import tempfile - -from bs4 import ( - BeautifulSoup, - BeautifulStoneSoup, -) -from bs4.element import ( - CharsetMetaAttributeValue, - ContentMetaAttributeValue, - SoupStrainer, - NamespacedAttribute, - ) -import bs4.dammit -from bs4.dammit import ( - EntitySubstitution, - UnicodeDammit, -) -from bs4.testing import ( - SoupTest, - skipIf, -) -import warnings - -try: - from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML - LXML_PRESENT = True -except ImportError, e: - LXML_PRESENT = False - -PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) -PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) - -class TestConstructor(SoupTest): - - def test_short_unicode_input(self): - data = u"

éé

" - soup = self.soup(data) - self.assertEqual(u"éé", soup.h1.string) - - def test_embedded_null(self): - data = u"

foo\0bar

" - soup = self.soup(data) - self.assertEqual(u"foo\0bar", soup.h1.string) - - -class TestDeprecatedConstructorArguments(SoupTest): - - def test_parseOnlyThese_renamed_to_parse_only(self): - with warnings.catch_warnings(record=True) as w: - soup = self.soup("", parseOnlyThese=SoupStrainer("b")) - msg = str(w[0].message) - self.assertTrue("parseOnlyThese" in msg) - self.assertTrue("parse_only" in msg) - self.assertEqual(b"", soup.encode()) - - def test_fromEncoding_renamed_to_from_encoding(self): - with warnings.catch_warnings(record=True) as w: - utf8 = b"\xc3\xa9" - soup = self.soup(utf8, fromEncoding="utf8") - msg = str(w[0].message) - self.assertTrue("fromEncoding" in msg) - self.assertTrue("from_encoding" in msg) - self.assertEqual("utf8", soup.original_encoding) - - def test_unrecognized_keyword_argument(self): - self.assertRaises( - TypeError, self.soup, "", no_such_argument=True) - -class TestWarnings(SoupTest): - - def test_disk_file_warning(self): - filehandle = tempfile.NamedTemporaryFile() - filename = filehandle.name - try: - with warnings.catch_warnings(record=True) as w: - soup = self.soup(filename) - msg = str(w[0].message) - self.assertTrue("looks like a filename" in msg) - finally: - filehandle.close() - - # The file no longer exists, so Beautiful Soup will no longer issue the warning. - with warnings.catch_warnings(record=True) as w: - soup = self.soup(filename) - self.assertEqual(0, len(w)) - - def test_url_warning(self): - with warnings.catch_warnings(record=True) as w: - soup = self.soup("http://www.crummy.com/") - msg = str(w[0].message) - self.assertTrue("looks like a URL" in msg) - - with warnings.catch_warnings(record=True) as w: - soup = self.soup("http://www.crummy.com/ is great") - self.assertEqual(0, len(w)) - -class TestSelectiveParsing(SoupTest): - - def test_parse_with_soupstrainer(self): - markup = "NoYesNoYes Yes" - strainer = SoupStrainer("b") - soup = self.soup(markup, parse_only=strainer) - self.assertEqual(soup.encode(), b"YesYes Yes") - - -class TestEntitySubstitution(unittest.TestCase): - """Standalone tests of the EntitySubstitution class.""" - def setUp(self): - self.sub = EntitySubstitution - - def test_simple_html_substitution(self): - # Unicode characters corresponding to named HTML entites - # are substituted, and no others. - s = u"foo\u2200\N{SNOWMAN}\u00f5bar" - self.assertEqual(self.sub.substitute_html(s), - u"foo∀\N{SNOWMAN}õbar") - - def test_smart_quote_substitution(self): - # MS smart quotes are a common source of frustration, so we - # give them a special test. - quotes = b"\x91\x92foo\x93\x94" - dammit = UnicodeDammit(quotes) - self.assertEqual(self.sub.substitute_html(dammit.markup), - "‘’foo“”") - - def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): - s = 'Welcome to "my bar"' - self.assertEqual(self.sub.substitute_xml(s, False), s) - - def test_xml_attribute_quoting_normally_uses_double_quotes(self): - self.assertEqual(self.sub.substitute_xml("Welcome", True), - '"Welcome"') - self.assertEqual(self.sub.substitute_xml("Bob's Bar", True), - '"Bob\'s Bar"') - - def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): - s = 'Welcome to "my bar"' - self.assertEqual(self.sub.substitute_xml(s, True), - "'Welcome to \"my bar\"'") - - def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): - s = 'Welcome to "Bob\'s Bar"' - self.assertEqual( - self.sub.substitute_xml(s, True), - '"Welcome to "Bob\'s Bar""') - - def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): - quoted = 'Welcome to "Bob\'s Bar"' - self.assertEqual(self.sub.substitute_xml(quoted), quoted) - - def test_xml_quoting_handles_angle_brackets(self): - self.assertEqual( - self.sub.substitute_xml("foo"), - "foo<bar>") - - def test_xml_quoting_handles_ampersands(self): - self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T") - - def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self): - self.assertEqual( - self.sub.substitute_xml("ÁT&T"), - "&Aacute;T&T") - - def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self): - self.assertEqual( - self.sub.substitute_xml_containing_entities("ÁT&T"), - "ÁT&T") - - def test_quotes_not_html_substituted(self): - """There's no need to do this except inside attribute values.""" - text = 'Bob\'s "bar"' - self.assertEqual(self.sub.substitute_html(text), text) - - -class TestEncodingConversion(SoupTest): - # Test Beautiful Soup's ability to decode and encode from various - # encodings. - - def setUp(self): - super(TestEncodingConversion, self).setUp() - self.unicode_data = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' - self.utf8_data = self.unicode_data.encode("utf-8") - # Just so you know what it looks like. - self.assertEqual( - self.utf8_data, - b'Sacr\xc3\xa9 bleu!') - - def test_ascii_in_unicode_out(self): - # ASCII input is converted to Unicode. The original_encoding - # attribute is set to 'utf-8', a superset of ASCII. - chardet = bs4.dammit.chardet_dammit - logging.disable(logging.WARNING) - try: - def noop(str): - return None - # Disable chardet, which will realize that the ASCII is ASCII. - bs4.dammit.chardet_dammit = noop - ascii = b"a" - soup_from_ascii = self.soup(ascii) - unicode_output = soup_from_ascii.decode() - self.assertTrue(isinstance(unicode_output, unicode)) - self.assertEqual(unicode_output, self.document_for(ascii.decode())) - self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") - finally: - logging.disable(logging.NOTSET) - bs4.dammit.chardet_dammit = chardet - - def test_unicode_in_unicode_out(self): - # Unicode input is left alone. The original_encoding attribute - # is not set. - soup_from_unicode = self.soup(self.unicode_data) - self.assertEqual(soup_from_unicode.decode(), self.unicode_data) - self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') - self.assertEqual(soup_from_unicode.original_encoding, None) - - def test_utf8_in_unicode_out(self): - # UTF-8 input is converted to Unicode. The original_encoding - # attribute is set. - soup_from_utf8 = self.soup(self.utf8_data) - self.assertEqual(soup_from_utf8.decode(), self.unicode_data) - self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') - - def test_utf8_out(self): - # The internal data structures can be encoded as UTF-8. - soup_from_unicode = self.soup(self.unicode_data) - self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) - - @skipIf( - PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, - "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") - def test_attribute_name_containing_unicode_characters(self): - markup = u'
' - self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) - -class TestUnicodeDammit(unittest.TestCase): - """Standalone tests of UnicodeDammit.""" - - def test_unicode_input(self): - markup = u"I'm already Unicode! \N{SNOWMAN}" - dammit = UnicodeDammit(markup) - self.assertEqual(dammit.unicode_markup, markup) - - def test_smart_quotes_to_unicode(self): - markup = b"\x91\x92\x93\x94" - dammit = UnicodeDammit(markup) - self.assertEqual( - dammit.unicode_markup, u"\u2018\u2019\u201c\u201d") - - def test_smart_quotes_to_xml_entities(self): - markup = b"\x91\x92\x93\x94" - dammit = UnicodeDammit(markup, smart_quotes_to="xml") - self.assertEqual( - dammit.unicode_markup, "‘’“”") - - def test_smart_quotes_to_html_entities(self): - markup = b"\x91\x92\x93\x94" - dammit = UnicodeDammit(markup, smart_quotes_to="html") - self.assertEqual( - dammit.unicode_markup, "‘’“”") - - def test_smart_quotes_to_ascii(self): - markup = b"\x91\x92\x93\x94" - dammit = UnicodeDammit(markup, smart_quotes_to="ascii") - self.assertEqual( - dammit.unicode_markup, """''""""") - - def test_detect_utf8(self): - utf8 = b"\xc3\xa9" - dammit = UnicodeDammit(utf8) - self.assertEqual(dammit.unicode_markup, u'\xe9') - self.assertEqual(dammit.original_encoding.lower(), 'utf-8') - - def test_convert_hebrew(self): - hebrew = b"\xed\xe5\xec\xf9" - dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) - self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') - self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') - - def test_dont_see_smart_quotes_where_there_are_none(self): - utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" - dammit = UnicodeDammit(utf_8) - self.assertEqual(dammit.original_encoding.lower(), 'utf-8') - self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) - - def test_ignore_inappropriate_codecs(self): - utf8_data = u"Räksmörgås".encode("utf-8") - dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) - self.assertEqual(dammit.original_encoding.lower(), 'utf-8') - - def test_ignore_invalid_codecs(self): - utf8_data = u"Räksmörgås".encode("utf-8") - for bad_encoding in ['.utf8', '...', 'utF---16.!']: - dammit = UnicodeDammit(utf8_data, [bad_encoding]) - self.assertEqual(dammit.original_encoding.lower(), 'utf-8') - - def test_detect_html5_style_meta_tag(self): - - for data in ( - b'', - b"", - b"", - b""): - dammit = UnicodeDammit(data, is_html=True) - self.assertEqual( - "euc-jp", dammit.original_encoding) - - def test_last_ditch_entity_replacement(self): - # This is a UTF-8 document that contains bytestrings - # completely incompatible with UTF-8 (ie. encoded with some other - # encoding). - # - # Since there is no consistent encoding for the document, - # Unicode, Dammit will eventually encode the document as UTF-8 - # and encode the incompatible characters as REPLACEMENT - # CHARACTER. - # - # If chardet is installed, it will detect that the document - # can be converted into ISO-8859-1 without errors. This happens - # to be the wrong encoding, but it is a consistent encoding, so the - # code we're testing here won't run. - # - # So we temporarily disable chardet if it's present. - doc = b"""\357\273\277 -\330\250\330\252\330\261 -\310\322\321\220\312\321\355\344""" - chardet = bs4.dammit.chardet_dammit - logging.disable(logging.WARNING) - try: - def noop(str): - return None - bs4.dammit.chardet_dammit = noop - dammit = UnicodeDammit(doc) - self.assertEqual(True, dammit.contains_replacement_characters) - self.assertTrue(u"\ufffd" in dammit.unicode_markup) - - soup = BeautifulSoup(doc, "html.parser") - self.assertTrue(soup.contains_replacement_characters) - finally: - logging.disable(logging.NOTSET) - bs4.dammit.chardet_dammit = chardet - - def test_byte_order_mark_removed(self): - # A document written in UTF-16LE will have its byte order marker stripped. - data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' - dammit = UnicodeDammit(data) - self.assertEqual(u"áé", dammit.unicode_markup) - self.assertEqual("utf-16le", dammit.original_encoding) - - def test_detwingle(self): - # Here's a UTF8 document. - utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") - - # Here's a Windows-1252 document. - windows_1252 = ( - u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" - u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") - - # Through some unholy alchemy, they've been stuck together. - doc = utf8 + windows_1252 + utf8 - - # The document can't be turned into UTF-8: - self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") - - # Unicode, Dammit thinks the whole document is Windows-1252, - # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃" - - # But if we run it through fix_embedded_windows_1252, it's fixed: - - fixed = UnicodeDammit.detwingle(doc) - self.assertEqual( - u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) - - def test_detwingle_ignores_multibyte_characters(self): - # Each of these characters has a UTF-8 representation ending - # in \x93. \x93 is a smart quote if interpreted as - # Windows-1252. But our code knows to skip over multibyte - # UTF-8 characters, so they'll survive the process unscathed. - for tricky_unicode_char in ( - u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' - u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' - u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. - ): - input = tricky_unicode_char.encode("utf8") - self.assertTrue(input.endswith(b'\x93')) - output = UnicodeDammit.detwingle(input) - self.assertEqual(output, input) - -class TestNamedspacedAttribute(SoupTest): - - def test_name_may_be_none(self): - a = NamespacedAttribute("xmlns", None) - self.assertEqual(a, "xmlns") - - def test_attribute_is_equivalent_to_colon_separated_string(self): - a = NamespacedAttribute("a", "b") - self.assertEqual("a:b", a) - - def test_attributes_are_equivalent_if_prefix_and_name_identical(self): - a = NamespacedAttribute("a", "b", "c") - b = NamespacedAttribute("a", "b", "c") - self.assertEqual(a, b) - - # The actual namespace is not considered. - c = NamespacedAttribute("a", "b", None) - self.assertEqual(a, c) - - # But name and prefix are important. - d = NamespacedAttribute("a", "z", "c") - self.assertNotEqual(a, d) - - e = NamespacedAttribute("z", "b", "c") - self.assertNotEqual(a, e) - - -class TestAttributeValueWithCharsetSubstitution(unittest.TestCase): - - def test_content_meta_attribute_value(self): - value = CharsetMetaAttributeValue("euc-jp") - self.assertEqual("euc-jp", value) - self.assertEqual("euc-jp", value.original_value) - self.assertEqual("utf8", value.encode("utf8")) - - - def test_content_meta_attribute_value(self): - value = ContentMetaAttributeValue("text/html; charset=euc-jp") - self.assertEqual("text/html; charset=euc-jp", value) - self.assertEqual("text/html; charset=euc-jp", value.original_value) - self.assertEqual("text/html; charset=utf8", value.encode("utf8")) diff --git a/lib/bs4/tests/test_tree.py b/lib/bs4/tests/test_tree.py deleted file mode 100644 index f8515c0e..00000000 --- a/lib/bs4/tests/test_tree.py +++ /dev/null @@ -1,1829 +0,0 @@ -# -*- coding: utf-8 -*- -"""Tests for Beautiful Soup's tree traversal methods. - -The tree traversal methods are the main advantage of using Beautiful -Soup over just using a parser. - -Different parsers will build different Beautiful Soup trees given the -same markup, but all Beautiful Soup trees can be traversed with the -methods tested here. -""" - -import copy -import pickle -import re -import warnings -from bs4 import BeautifulSoup -from bs4.builder import ( - builder_registry, - HTMLParserTreeBuilder, -) -from bs4.element import ( - CData, - Comment, - Doctype, - NavigableString, - SoupStrainer, - Tag, -) -from bs4.testing import ( - SoupTest, - skipIf, -) - -XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) -LXML_PRESENT = (builder_registry.lookup("lxml") is not None) - -class TreeTest(SoupTest): - - def assertSelects(self, tags, should_match): - """Make sure that the given tags have the correct text. - - This is used in tests that define a bunch of tags, each - containing a single string, and then select certain strings by - some mechanism. - """ - self.assertEqual([tag.string for tag in tags], should_match) - - def assertSelectsIDs(self, tags, should_match): - """Make sure that the given tags have the correct IDs. - - This is used in tests that define a bunch of tags, each - containing a single string, and then select certain strings by - some mechanism. - """ - self.assertEqual([tag['id'] for tag in tags], should_match) - - -class TestFind(TreeTest): - """Basic tests of the find() method. - - find() just calls find_all() with limit=1, so it's not tested all - that thouroughly here. - """ - - def test_find_tag(self): - soup = self.soup("1234") - self.assertEqual(soup.find("b").string, "2") - - def test_unicode_text_find(self): - soup = self.soup(u'

Räksmörgås

') - self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås') - - def test_find_everything(self): - """Test an optimization that finds all tags.""" - soup = self.soup("foobar") - self.assertEqual(2, len(soup.find_all())) - - def test_find_everything_with_name(self): - """Test an optimization that finds all tags with a given name.""" - soup = self.soup("foobarbaz") - self.assertEqual(2, len(soup.find_all('a'))) - -class TestFindAll(TreeTest): - """Basic tests of the find_all() method.""" - - def test_find_all_text_nodes(self): - """You can search the tree for text nodes.""" - soup = self.soup("Foobar\xbb") - # Exact match. - self.assertEqual(soup.find_all(text="bar"), [u"bar"]) - # Match any of a number of strings. - self.assertEqual( - soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) - # Match a regular expression. - self.assertEqual(soup.find_all(text=re.compile('.*')), - [u"Foo", u"bar", u'\xbb']) - # Match anything. - self.assertEqual(soup.find_all(text=True), - [u"Foo", u"bar", u'\xbb']) - - def test_find_all_limit(self): - """You can limit the number of items returned by find_all.""" - soup = self.soup("12345") - self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"]) - self.assertSelects(soup.find_all('a', limit=1), ["1"]) - self.assertSelects( - soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"]) - - # A limit of 0 means no limit. - self.assertSelects( - soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"]) - - def test_calling_a_tag_is_calling_findall(self): - soup = self.soup("123") - self.assertSelects(soup('a', limit=1), ["1"]) - self.assertSelects(soup.b(id="foo"), ["3"]) - - def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self): - soup = self.soup("") - # Create a self-referential list. - l = [] - l.append(l) - - # Without special code in _normalize_search_value, this would cause infinite - # recursion. - self.assertEqual([], soup.find_all(l)) - - def test_find_all_resultset(self): - """All find_all calls return a ResultSet""" - soup = self.soup("") - result = soup.find_all("a") - self.assertTrue(hasattr(result, "source")) - - result = soup.find_all(True) - self.assertTrue(hasattr(result, "source")) - - result = soup.find_all(text="foo") - self.assertTrue(hasattr(result, "source")) - - -class TestFindAllBasicNamespaces(TreeTest): - - def test_find_by_namespaced_name(self): - soup = self.soup('4') - self.assertEqual("4", soup.find("mathml:msqrt").string) - self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name) - - -class TestFindAllByName(TreeTest): - """Test ways of finding tags by tag name.""" - - def setUp(self): - super(TreeTest, self).setUp() - self.tree = self.soup("""First tag. - Second tag. - Third Nested tag. tag.""") - - def test_find_all_by_tag_name(self): - # Find all the tags. - self.assertSelects( - self.tree.find_all('a'), ['First tag.', 'Nested tag.']) - - def test_find_all_by_name_and_text(self): - self.assertSelects( - self.tree.find_all('a', text='First tag.'), ['First tag.']) - - self.assertSelects( - self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.']) - - self.assertSelects( - self.tree.find_all('a', text=re.compile("tag")), - ['First tag.', 'Nested tag.']) - - - def test_find_all_on_non_root_element(self): - # You can call find_all on any node, not just the root. - self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.']) - - def test_calling_element_invokes_find_all(self): - self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.']) - - def test_find_all_by_tag_strainer(self): - self.assertSelects( - self.tree.find_all(SoupStrainer('a')), - ['First tag.', 'Nested tag.']) - - def test_find_all_by_tag_names(self): - self.assertSelects( - self.tree.find_all(['a', 'b']), - ['First tag.', 'Second tag.', 'Nested tag.']) - - def test_find_all_by_tag_dict(self): - self.assertSelects( - self.tree.find_all({'a' : True, 'b' : True}), - ['First tag.', 'Second tag.', 'Nested tag.']) - - def test_find_all_by_tag_re(self): - self.assertSelects( - self.tree.find_all(re.compile('^[ab]$')), - ['First tag.', 'Second tag.', 'Nested tag.']) - - def test_find_all_with_tags_matching_method(self): - # You can define an oracle method that determines whether - # a tag matches the search. - def id_matches_name(tag): - return tag.name == tag.get('id') - - tree = self.soup("""Match 1. - Does not match. - Match 2.""") - - self.assertSelects( - tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) - - -class TestFindAllByAttribute(TreeTest): - - def test_find_all_by_attribute_name(self): - # You can pass in keyword arguments to find_all to search by - # attribute. - tree = self.soup(""" - Matching a. - - Non-matching Matching b.a. - """) - self.assertSelects(tree.find_all(id='first'), - ["Matching a.", "Matching b."]) - - def test_find_all_by_utf8_attribute_value(self): - peace = u"םולש".encode("utf8") - data = u''.encode("utf8") - soup = self.soup(data) - self.assertEqual([soup.a], soup.find_all(title=peace)) - self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) - self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) - - def test_find_all_by_attribute_dict(self): - # You can pass in a dictionary as the argument 'attrs'. This - # lets you search for attributes like 'name' (a fixed argument - # to find_all) and 'class' (a reserved word in Python.) - tree = self.soup(""" - Name match. - Class match. - Non-match. - A tag called 'name1'. - """) - - # This doesn't do what you want. - self.assertSelects(tree.find_all(name='name1'), - ["A tag called 'name1'."]) - # This does what you want. - self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), - ["Name match."]) - - self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), - ["Class match."]) - - def test_find_all_by_class(self): - tree = self.soup(""" - Class 1. - Class 2. - Class 1. - Class 3 and 4. - """) - - # Passing in the class_ keyword argument will search against - # the 'class' attribute. - self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.']) - self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.']) - self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.']) - - # Passing in a string to 'attrs' will also search the CSS class. - self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) - self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) - self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) - self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) - - def test_find_by_class_when_multiple_classes_present(self): - tree = self.soup("Found it") - - f = tree.find_all("gar", class_=re.compile("o")) - self.assertSelects(f, ["Found it"]) - - f = tree.find_all("gar", class_=re.compile("a")) - self.assertSelects(f, ["Found it"]) - - # Since the class is not the string "foo bar", but the two - # strings "foo" and "bar", this will not find anything. - f = tree.find_all("gar", class_=re.compile("o b")) - self.assertSelects(f, []) - - def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): - soup = self.soup("Found it") - - self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) - - def big_attribute_value(value): - return len(value) > 3 - - self.assertSelects(soup.find_all("a", big_attribute_value), []) - - def small_attribute_value(value): - return len(value) <= 3 - - self.assertSelects( - soup.find_all("a", small_attribute_value), ["Found it"]) - - def test_find_all_with_string_for_attrs_finds_multiple_classes(self): - soup = self.soup('') - a, a2 = soup.find_all("a") - self.assertEqual([a, a2], soup.find_all("a", "foo")) - self.assertEqual([a], soup.find_all("a", "bar")) - - # If you specify the class as a string that contains a - # space, only that specific value will be found. - self.assertEqual([a], soup.find_all("a", class_="foo bar")) - self.assertEqual([a], soup.find_all("a", "foo bar")) - self.assertEqual([], soup.find_all("a", "bar foo")) - - def test_find_all_by_attribute_soupstrainer(self): - tree = self.soup(""" - Match. - Non-match.""") - - strainer = SoupStrainer(attrs={'id' : 'first'}) - self.assertSelects(tree.find_all(strainer), ['Match.']) - - def test_find_all_with_missing_atribute(self): - # You can pass in None as the value of an attribute to find_all. - # This will match tags that do not have that attribute set. - tree = self.soup("""ID present. - No ID present. - ID is empty.""") - self.assertSelects(tree.find_all('a', id=None), ["No ID present."]) - - def test_find_all_with_defined_attribute(self): - # You can pass in None as the value of an attribute to find_all. - # This will match tags that have that attribute set to any value. - tree = self.soup("""ID present. - No ID present. - ID is empty.""") - self.assertSelects( - tree.find_all(id=True), ["ID present.", "ID is empty."]) - - def test_find_all_with_numeric_attribute(self): - # If you search for a number, it's treated as a string. - tree = self.soup("""Unquoted attribute. - Quoted attribute.""") - - expected = ["Unquoted attribute.", "Quoted attribute."] - self.assertSelects(tree.find_all(id=1), expected) - self.assertSelects(tree.find_all(id="1"), expected) - - def test_find_all_with_list_attribute_values(self): - # You can pass a list of attribute values instead of just one, - # and you'll get tags that match any of the values. - tree = self.soup("""1 - 2 - 3 - No ID.""") - self.assertSelects(tree.find_all(id=["1", "3", "4"]), - ["1", "3"]) - - def test_find_all_with_regular_expression_attribute_value(self): - # You can pass a regular expression as an attribute value, and - # you'll get tags whose values for that attribute match the - # regular expression. - tree = self.soup("""One a. - Two as. - Mixed as and bs. - One b. - No ID.""") - - self.assertSelects(tree.find_all(id=re.compile("^a+$")), - ["One a.", "Two as."]) - - def test_find_by_name_and_containing_string(self): - soup = self.soup("foobarfoo") - a = soup.a - - self.assertEqual([a], soup.find_all("a", text="foo")) - self.assertEqual([], soup.find_all("a", text="bar")) - self.assertEqual([], soup.find_all("a", text="bar")) - - def test_find_by_name_and_containing_string_when_string_is_buried(self): - soup = self.soup("foofoo") - self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo")) - - def test_find_by_attribute_and_containing_string(self): - soup = self.soup('foofoo') - a = soup.a - - self.assertEqual([a], soup.find_all(id=2, text="foo")) - self.assertEqual([], soup.find_all(id=1, text="bar")) - - - - -class TestIndex(TreeTest): - """Test Tag.index""" - def test_index(self): - tree = self.soup("""
- Identical - Not identical - Identical - - Identical with child - Also not identical - Identical with child -
""") - div = tree.div - for i, element in enumerate(div.contents): - self.assertEqual(i, div.index(element)) - self.assertRaises(ValueError, tree.index, 1) - - -class TestParentOperations(TreeTest): - """Test navigation and searching through an element's parents.""" - - def setUp(self): - super(TestParentOperations, self).setUp() - self.tree = self.soup('''
    -
      -
        -
          - Start here -
        -
      ''') - self.start = self.tree.b - - - def test_parent(self): - self.assertEqual(self.start.parent['id'], 'bottom') - self.assertEqual(self.start.parent.parent['id'], 'middle') - self.assertEqual(self.start.parent.parent.parent['id'], 'top') - - def test_parent_of_top_tag_is_soup_object(self): - top_tag = self.tree.contents[0] - self.assertEqual(top_tag.parent, self.tree) - - def test_soup_object_has_no_parent(self): - self.assertEqual(None, self.tree.parent) - - def test_find_parents(self): - self.assertSelectsIDs( - self.start.find_parents('ul'), ['bottom', 'middle', 'top']) - self.assertSelectsIDs( - self.start.find_parents('ul', id="middle"), ['middle']) - - def test_find_parent(self): - self.assertEqual(self.start.find_parent('ul')['id'], 'bottom') - self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top') - - def test_parent_of_text_element(self): - text = self.tree.find(text="Start here") - self.assertEqual(text.parent.name, 'b') - - def test_text_element_find_parent(self): - text = self.tree.find(text="Start here") - self.assertEqual(text.find_parent('ul')['id'], 'bottom') - - def test_parent_generator(self): - parents = [parent['id'] for parent in self.start.parents - if parent is not None and 'id' in parent.attrs] - self.assertEqual(parents, ['bottom', 'middle', 'top']) - - -class ProximityTest(TreeTest): - - def setUp(self): - super(TreeTest, self).setUp() - self.tree = self.soup( - 'OneTwoThree') - - -class TestNextOperations(ProximityTest): - - def setUp(self): - super(TestNextOperations, self).setUp() - self.start = self.tree.b - - def test_next(self): - self.assertEqual(self.start.next_element, "One") - self.assertEqual(self.start.next_element.next_element['id'], "2") - - def test_next_of_last_item_is_none(self): - last = self.tree.find(text="Three") - self.assertEqual(last.next_element, None) - - def test_next_of_root_is_none(self): - # The document root is outside the next/previous chain. - self.assertEqual(self.tree.next_element, None) - - def test_find_all_next(self): - self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"]) - self.start.find_all_next(id=3) - self.assertSelects(self.start.find_all_next(id=3), ["Three"]) - - def test_find_next(self): - self.assertEqual(self.start.find_next('b')['id'], '2') - self.assertEqual(self.start.find_next(text="Three"), "Three") - - def test_find_next_for_text_element(self): - text = self.tree.find(text="One") - self.assertEqual(text.find_next("b").string, "Two") - self.assertSelects(text.find_all_next("b"), ["Two", "Three"]) - - def test_next_generator(self): - start = self.tree.find(text="Two") - successors = [node for node in start.next_elements] - # There are two successors: the final tag and its text contents. - tag, contents = successors - self.assertEqual(tag['id'], '3') - self.assertEqual(contents, "Three") - -class TestPreviousOperations(ProximityTest): - - def setUp(self): - super(TestPreviousOperations, self).setUp() - self.end = self.tree.find(text="Three") - - def test_previous(self): - self.assertEqual(self.end.previous_element['id'], "3") - self.assertEqual(self.end.previous_element.previous_element, "Two") - - def test_previous_of_first_item_is_none(self): - first = self.tree.find('html') - self.assertEqual(first.previous_element, None) - - def test_previous_of_root_is_none(self): - # The document root is outside the next/previous chain. - # XXX This is broken! - #self.assertEqual(self.tree.previous_element, None) - pass - - def test_find_all_previous(self): - # The tag containing the "Three" node is the predecessor - # of the "Three" node itself, which is why "Three" shows up - # here. - self.assertSelects( - self.end.find_all_previous('b'), ["Three", "Two", "One"]) - self.assertSelects(self.end.find_all_previous(id=1), ["One"]) - - def test_find_previous(self): - self.assertEqual(self.end.find_previous('b')['id'], '3') - self.assertEqual(self.end.find_previous(text="One"), "One") - - def test_find_previous_for_text_element(self): - text = self.tree.find(text="Three") - self.assertEqual(text.find_previous("b").string, "Three") - self.assertSelects( - text.find_all_previous("b"), ["Three", "Two", "One"]) - - def test_previous_generator(self): - start = self.tree.find(text="One") - predecessors = [node for node in start.previous_elements] - - # There are four predecessors: the tag containing "One" - # the tag, the tag, and the tag. - b, body, head, html = predecessors - self.assertEqual(b['id'], '1') - self.assertEqual(body.name, "body") - self.assertEqual(head.name, "head") - self.assertEqual(html.name, "html") - - -class SiblingTest(TreeTest): - - def setUp(self): - super(SiblingTest, self).setUp() - markup = ''' - - - - - - - - - - - ''' - # All that whitespace looks good but makes the tests more - # difficult. Get rid of it. - markup = re.compile("\n\s*").sub("", markup) - self.tree = self.soup(markup) - - -class TestNextSibling(SiblingTest): - - def setUp(self): - super(TestNextSibling, self).setUp() - self.start = self.tree.find(id="1") - - def test_next_sibling_of_root_is_none(self): - self.assertEqual(self.tree.next_sibling, None) - - def test_next_sibling(self): - self.assertEqual(self.start.next_sibling['id'], '2') - self.assertEqual(self.start.next_sibling.next_sibling['id'], '3') - - # Note the difference between next_sibling and next_element. - self.assertEqual(self.start.next_element['id'], '1.1') - - def test_next_sibling_may_not_exist(self): - self.assertEqual(self.tree.html.next_sibling, None) - - nested_span = self.tree.find(id="1.1") - self.assertEqual(nested_span.next_sibling, None) - - last_span = self.tree.find(id="4") - self.assertEqual(last_span.next_sibling, None) - - def test_find_next_sibling(self): - self.assertEqual(self.start.find_next_sibling('span')['id'], '2') - - def test_next_siblings(self): - self.assertSelectsIDs(self.start.find_next_siblings("span"), - ['2', '3', '4']) - - self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3']) - - def test_next_sibling_for_text_element(self): - soup = self.soup("Foobarbaz") - start = soup.find(text="Foo") - self.assertEqual(start.next_sibling.name, 'b') - self.assertEqual(start.next_sibling.next_sibling, 'baz') - - self.assertSelects(start.find_next_siblings('b'), ['bar']) - self.assertEqual(start.find_next_sibling(text="baz"), "baz") - self.assertEqual(start.find_next_sibling(text="nonesuch"), None) - - -class TestPreviousSibling(SiblingTest): - - def setUp(self): - super(TestPreviousSibling, self).setUp() - self.end = self.tree.find(id="4") - - def test_previous_sibling_of_root_is_none(self): - self.assertEqual(self.tree.previous_sibling, None) - - def test_previous_sibling(self): - self.assertEqual(self.end.previous_sibling['id'], '3') - self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2') - - # Note the difference between previous_sibling and previous_element. - self.assertEqual(self.end.previous_element['id'], '3.1') - - def test_previous_sibling_may_not_exist(self): - self.assertEqual(self.tree.html.previous_sibling, None) - - nested_span = self.tree.find(id="1.1") - self.assertEqual(nested_span.previous_sibling, None) - - first_span = self.tree.find(id="1") - self.assertEqual(first_span.previous_sibling, None) - - def test_find_previous_sibling(self): - self.assertEqual(self.end.find_previous_sibling('span')['id'], '3') - - def test_previous_siblings(self): - self.assertSelectsIDs(self.end.find_previous_siblings("span"), - ['3', '2', '1']) - - self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1']) - - def test_previous_sibling_for_text_element(self): - soup = self.soup("Foobarbaz") - start = soup.find(text="baz") - self.assertEqual(start.previous_sibling.name, 'b') - self.assertEqual(start.previous_sibling.previous_sibling, 'Foo') - - self.assertSelects(start.find_previous_siblings('b'), ['bar']) - self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo") - self.assertEqual(start.find_previous_sibling(text="nonesuch"), None) - - -class TestTagCreation(SoupTest): - """Test the ability to create new tags.""" - def test_new_tag(self): - soup = self.soup("") - new_tag = soup.new_tag("foo", bar="baz") - self.assertTrue(isinstance(new_tag, Tag)) - self.assertEqual("foo", new_tag.name) - self.assertEqual(dict(bar="baz"), new_tag.attrs) - self.assertEqual(None, new_tag.parent) - - def test_tag_inherits_self_closing_rules_from_builder(self): - if XML_BUILDER_PRESENT: - xml_soup = BeautifulSoup("", "xml") - xml_br = xml_soup.new_tag("br") - xml_p = xml_soup.new_tag("p") - - # Both the
      and

      tag are empty-element, just because - # they have no contents. - self.assertEqual(b"
      ", xml_br.encode()) - self.assertEqual(b"

      ", xml_p.encode()) - - html_soup = BeautifulSoup("", "html") - html_br = html_soup.new_tag("br") - html_p = html_soup.new_tag("p") - - # The HTML builder users HTML's rules about which tags are - # empty-element tags, and the new tags reflect these rules. - self.assertEqual(b"
      ", html_br.encode()) - self.assertEqual(b"

      ", html_p.encode()) - - def test_new_string_creates_navigablestring(self): - soup = self.soup("") - s = soup.new_string("foo") - self.assertEqual("foo", s) - self.assertTrue(isinstance(s, NavigableString)) - - def test_new_string_can_create_navigablestring_subclass(self): - soup = self.soup("") - s = soup.new_string("foo", Comment) - self.assertEqual("foo", s) - self.assertTrue(isinstance(s, Comment)) - -class TestTreeModification(SoupTest): - - def test_attribute_modification(self): - soup = self.soup('') - soup.a['id'] = 2 - self.assertEqual(soup.decode(), self.document_for('')) - del(soup.a['id']) - self.assertEqual(soup.decode(), self.document_for('')) - soup.a['id2'] = 'foo' - self.assertEqual(soup.decode(), self.document_for('')) - - def test_new_tag_creation(self): - builder = builder_registry.lookup('html')() - soup = self.soup("", builder=builder) - a = Tag(soup, builder, 'a') - ol = Tag(soup, builder, 'ol') - a['href'] = 'http://foo.com/' - soup.body.insert(0, a) - soup.body.insert(1, ol) - self.assertEqual( - soup.body.encode(), - b'
        ') - - def test_append_to_contents_moves_tag(self): - doc = """

        Don't leave me here.

        -

        Don\'t leave!

        """ - soup = self.soup(doc) - second_para = soup.find(id='2') - bold = soup.b - - # Move the tag to the end of the second paragraph. - soup.find(id='2').append(soup.b) - - # The tag is now a child of the second paragraph. - self.assertEqual(bold.parent, second_para) - - self.assertEqual( - soup.decode(), self.document_for( - '

        Don\'t leave me .

        \n' - '

        Don\'t leave!here

        ')) - - def test_replace_with_returns_thing_that_was_replaced(self): - text = "" - soup = self.soup(text) - a = soup.a - new_a = a.replace_with(soup.c) - self.assertEqual(a, new_a) - - def test_unwrap_returns_thing_that_was_replaced(self): - text = "" - soup = self.soup(text) - a = soup.a - new_a = a.unwrap() - self.assertEqual(a, new_a) - - def test_replace_tag_with_itself(self): - text = "Foo" - soup = self.soup(text) - c = soup.c - soup.c.replace_with(c) - self.assertEqual(soup.decode(), self.document_for(text)) - - def test_replace_tag_with_its_parent_raises_exception(self): - text = "" - soup = self.soup(text) - self.assertRaises(ValueError, soup.b.replace_with, soup.a) - - def test_insert_tag_into_itself_raises_exception(self): - text = "" - soup = self.soup(text) - self.assertRaises(ValueError, soup.a.insert, 0, soup.a) - - def test_replace_with_maintains_next_element_throughout(self): - soup = self.soup('

        onethree

        ') - a = soup.a - b = a.contents[0] - # Make it so the tag has two text children. - a.insert(1, "two") - - # Now replace each one with the empty string. - left, right = a.contents - left.replaceWith('') - right.replaceWith('') - - # The tag is still connected to the tree. - self.assertEqual("three", soup.b.string) - - def test_replace_final_node(self): - soup = self.soup("Argh!") - soup.find(text="Argh!").replace_with("Hooray!") - new_text = soup.find(text="Hooray!") - b = soup.b - self.assertEqual(new_text.previous_element, b) - self.assertEqual(new_text.parent, b) - self.assertEqual(new_text.previous_element.next_element, new_text) - self.assertEqual(new_text.next_element, None) - - def test_consecutive_text_nodes(self): - # A builder should never create two consecutive text nodes, - # but if you insert one next to another, Beautiful Soup will - # handle it correctly. - soup = self.soup("Argh!") - soup.b.insert(1, "Hooray!") - - self.assertEqual( - soup.decode(), self.document_for( - "Argh!Hooray!")) - - new_text = soup.find(text="Hooray!") - self.assertEqual(new_text.previous_element, "Argh!") - self.assertEqual(new_text.previous_element.next_element, new_text) - - self.assertEqual(new_text.previous_sibling, "Argh!") - self.assertEqual(new_text.previous_sibling.next_sibling, new_text) - - self.assertEqual(new_text.next_sibling, None) - self.assertEqual(new_text.next_element, soup.c) - - def test_insert_string(self): - soup = self.soup("") - soup.a.insert(0, "bar") - soup.a.insert(0, "foo") - # The string were added to the tag. - self.assertEqual(["foo", "bar"], soup.a.contents) - # And they were converted to NavigableStrings. - self.assertEqual(soup.a.contents[0].next_element, "bar") - - def test_insert_tag(self): - builder = self.default_builder - soup = self.soup( - "Findlady!", builder=builder) - magic_tag = Tag(soup, builder, 'magictag') - magic_tag.insert(0, "the") - soup.a.insert(1, magic_tag) - - self.assertEqual( - soup.decode(), self.document_for( - "Findthelady!")) - - # Make sure all the relationships are hooked up correctly. - b_tag = soup.b - self.assertEqual(b_tag.next_sibling, magic_tag) - self.assertEqual(magic_tag.previous_sibling, b_tag) - - find = b_tag.find(text="Find") - self.assertEqual(find.next_element, magic_tag) - self.assertEqual(magic_tag.previous_element, find) - - c_tag = soup.c - self.assertEqual(magic_tag.next_sibling, c_tag) - self.assertEqual(c_tag.previous_sibling, magic_tag) - - the = magic_tag.find(text="the") - self.assertEqual(the.parent, magic_tag) - self.assertEqual(the.next_element, c_tag) - self.assertEqual(c_tag.previous_element, the) - - def test_append_child_thats_already_at_the_end(self): - data = "" - soup = self.soup(data) - soup.a.append(soup.b) - self.assertEqual(data, soup.decode()) - - def test_move_tag_to_beginning_of_parent(self): - data = "" - soup = self.soup(data) - soup.a.insert(0, soup.d) - self.assertEqual("", soup.decode()) - - def test_insert_works_on_empty_element_tag(self): - # This is a little strange, since most HTML parsers don't allow - # markup like this to come through. But in general, we don't - # know what the parser would or wouldn't have allowed, so - # I'm letting this succeed for now. - soup = self.soup("
        ") - soup.br.insert(1, "Contents") - self.assertEqual(str(soup.br), "
        Contents
        ") - - def test_insert_before(self): - soup = self.soup("foobar") - soup.b.insert_before("BAZ") - soup.a.insert_before("QUUX") - self.assertEqual( - soup.decode(), self.document_for("QUUXfooBAZbar")) - - soup.a.insert_before(soup.b) - self.assertEqual( - soup.decode(), self.document_for("QUUXbarfooBAZ")) - - def test_insert_after(self): - soup = self.soup("foobar") - soup.b.insert_after("BAZ") - soup.a.insert_after("QUUX") - self.assertEqual( - soup.decode(), self.document_for("fooQUUXbarBAZ")) - soup.b.insert_after(soup.a) - self.assertEqual( - soup.decode(), self.document_for("QUUXbarfooBAZ")) - - def test_insert_after_raises_exception_if_after_has_no_meaning(self): - soup = self.soup("") - tag = soup.new_tag("a") - string = soup.new_string("") - self.assertRaises(ValueError, string.insert_after, tag) - self.assertRaises(NotImplementedError, soup.insert_after, tag) - self.assertRaises(ValueError, tag.insert_after, tag) - - def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self): - soup = self.soup("") - tag = soup.new_tag("a") - string = soup.new_string("") - self.assertRaises(ValueError, string.insert_before, tag) - self.assertRaises(NotImplementedError, soup.insert_before, tag) - self.assertRaises(ValueError, tag.insert_before, tag) - - def test_replace_with(self): - soup = self.soup( - "

        There's no business like show business

        ") - no, show = soup.find_all('b') - show.replace_with(no) - self.assertEqual( - soup.decode(), - self.document_for( - "

        There's business like no business

        ")) - - self.assertEqual(show.parent, None) - self.assertEqual(no.parent, soup.p) - self.assertEqual(no.next_element, "no") - self.assertEqual(no.next_sibling, " business") - - def test_replace_first_child(self): - data = "" - soup = self.soup(data) - soup.b.replace_with(soup.c) - self.assertEqual("", soup.decode()) - - def test_replace_last_child(self): - data = "" - soup = self.soup(data) - soup.c.replace_with(soup.b) - self.assertEqual("", soup.decode()) - - def test_nested_tag_replace_with(self): - soup = self.soup( - """Wereservetherighttorefuseservice""") - - # Replace the entire tag and its contents ("reserve the - # right") with the tag ("refuse"). - remove_tag = soup.b - move_tag = soup.f - remove_tag.replace_with(move_tag) - - self.assertEqual( - soup.decode(), self.document_for( - "Werefusetoservice")) - - # The tag is now an orphan. - self.assertEqual(remove_tag.parent, None) - self.assertEqual(remove_tag.find(text="right").next_element, None) - self.assertEqual(remove_tag.previous_element, None) - self.assertEqual(remove_tag.next_sibling, None) - self.assertEqual(remove_tag.previous_sibling, None) - - # The tag is now connected to the tag. - self.assertEqual(move_tag.parent, soup.a) - self.assertEqual(move_tag.previous_element, "We") - self.assertEqual(move_tag.next_element.next_element, soup.e) - self.assertEqual(move_tag.next_sibling, None) - - # The gap where the tag used to be has been mended, and - # the word "to" is now connected to the tag. - to_text = soup.find(text="to") - g_tag = soup.g - self.assertEqual(to_text.next_element, g_tag) - self.assertEqual(to_text.next_sibling, g_tag) - self.assertEqual(g_tag.previous_element, to_text) - self.assertEqual(g_tag.previous_sibling, to_text) - - def test_unwrap(self): - tree = self.soup(""" -

        Unneeded formatting is unneeded

        - """) - tree.em.unwrap() - self.assertEqual(tree.em, None) - self.assertEqual(tree.p.text, "Unneeded formatting is unneeded") - - def test_wrap(self): - soup = self.soup("I wish I was bold.") - value = soup.string.wrap(soup.new_tag("b")) - self.assertEqual(value.decode(), "I wish I was bold.") - self.assertEqual( - soup.decode(), self.document_for("I wish I was bold.")) - - def test_wrap_extracts_tag_from_elsewhere(self): - soup = self.soup("I wish I was bold.") - soup.b.next_sibling.wrap(soup.b) - self.assertEqual( - soup.decode(), self.document_for("I wish I was bold.")) - - def test_wrap_puts_new_contents_at_the_end(self): - soup = self.soup("I like being bold.I wish I was bold.") - soup.b.next_sibling.wrap(soup.b) - self.assertEqual(2, len(soup.b.contents)) - self.assertEqual( - soup.decode(), self.document_for( - "I like being bold.I wish I was bold.")) - - def test_extract(self): - soup = self.soup( - 'Some content. More content.') - - self.assertEqual(len(soup.body.contents), 3) - extracted = soup.find(id="nav").extract() - - self.assertEqual( - soup.decode(), "Some content. More content.") - self.assertEqual(extracted.decode(), '') - - # The extracted tag is now an orphan. - self.assertEqual(len(soup.body.contents), 2) - self.assertEqual(extracted.parent, None) - self.assertEqual(extracted.previous_element, None) - self.assertEqual(extracted.next_element.next_element, None) - - # The gap where the extracted tag used to be has been mended. - content_1 = soup.find(text="Some content. ") - content_2 = soup.find(text=" More content.") - self.assertEqual(content_1.next_element, content_2) - self.assertEqual(content_1.next_sibling, content_2) - self.assertEqual(content_2.previous_element, content_1) - self.assertEqual(content_2.previous_sibling, content_1) - - def test_extract_distinguishes_between_identical_strings(self): - soup = self.soup("
        foobar") - foo_1 = soup.a.string - bar_1 = soup.b.string - foo_2 = soup.new_string("foo") - bar_2 = soup.new_string("bar") - soup.a.append(foo_2) - soup.b.append(bar_2) - - # Now there are two identical strings in the tag, and two - # in the tag. Let's remove the first "foo" and the second - # "bar". - foo_1.extract() - bar_2.extract() - self.assertEqual(foo_2, soup.a.string) - self.assertEqual(bar_2, soup.b.string) - - def test_clear(self): - """Tag.clear()""" - soup = self.soup("

        String Italicized and another

        ") - # clear using extract() - a = soup.a - soup.p.clear() - self.assertEqual(len(soup.p.contents), 0) - self.assertTrue(hasattr(a, "contents")) - - # clear using decompose() - em = a.em - a.clear(decompose=True) - self.assertEqual(0, len(em.contents)) - - def test_string_set(self): - """Tag.string = 'string'""" - soup = self.soup(" ") - soup.a.string = "foo" - self.assertEqual(soup.a.contents, ["foo"]) - soup.b.string = "bar" - self.assertEqual(soup.b.contents, ["bar"]) - - def test_string_set_does_not_affect_original_string(self): - soup = self.soup("foobar") - soup.b.string = soup.c.string - self.assertEqual(soup.a.encode(), b"barbar") - - def test_set_string_preserves_class_of_string(self): - soup = self.soup("") - cdata = CData("foo") - soup.a.string = cdata - self.assertTrue(isinstance(soup.a.string, CData)) - -class TestElementObjects(SoupTest): - """Test various features of element objects.""" - - def test_len(self): - """The length of an element is its number of children.""" - soup = self.soup("123") - - # The BeautifulSoup object itself contains one element: the - # tag. - self.assertEqual(len(soup.contents), 1) - self.assertEqual(len(soup), 1) - - # The tag contains three elements: the text node "1", the - # tag, and the text node "3". - self.assertEqual(len(soup.top), 3) - self.assertEqual(len(soup.top.contents), 3) - - def test_member_access_invokes_find(self): - """Accessing a Python member .foo invokes find('foo')""" - soup = self.soup('') - self.assertEqual(soup.b, soup.find('b')) - self.assertEqual(soup.b.i, soup.find('b').find('i')) - self.assertEqual(soup.a, None) - - def test_deprecated_member_access(self): - soup = self.soup('') - with warnings.catch_warnings(record=True) as w: - tag = soup.bTag - self.assertEqual(soup.b, tag) - self.assertEqual( - '.bTag is deprecated, use .find("b") instead.', - str(w[0].message)) - - def test_has_attr(self): - """has_attr() checks for the presence of an attribute. - - Please note note: has_attr() is different from - __in__. has_attr() checks the tag's attributes and __in__ - checks the tag's chidlren. - """ - soup = self.soup("") - self.assertTrue(soup.foo.has_attr('attr')) - self.assertFalse(soup.foo.has_attr('attr2')) - - - def test_attributes_come_out_in_alphabetical_order(self): - markup = '' - self.assertSoupEquals(markup, '') - - def test_string(self): - # A tag that contains only a text node makes that node - # available as .string. - soup = self.soup("foo") - self.assertEqual(soup.b.string, 'foo') - - def test_empty_tag_has_no_string(self): - # A tag with no children has no .stirng. - soup = self.soup("") - self.assertEqual(soup.b.string, None) - - def test_tag_with_multiple_children_has_no_string(self): - # A tag with no children has no .string. - soup = self.soup("foo") - self.assertEqual(soup.b.string, None) - - soup = self.soup("foobar
        ") - self.assertEqual(soup.b.string, None) - - # Even if all the children are strings, due to trickery, - # it won't work--but this would be a good optimization. - soup = self.soup("foo
        ") - soup.a.insert(1, "bar") - self.assertEqual(soup.a.string, None) - - def test_tag_with_recursive_string_has_string(self): - # A tag with a single child which has a .string inherits that - # .string. - soup = self.soup("foo") - self.assertEqual(soup.a.string, "foo") - self.assertEqual(soup.string, "foo") - - def test_lack_of_string(self): - """Only a tag containing a single text node has a .string.""" - soup = self.soup("feo") - self.assertFalse(soup.b.string) - - soup = self.soup("") - self.assertFalse(soup.b.string) - - def test_all_text(self): - """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" - soup = self.soup("ar t ") - self.assertEqual(soup.a.text, "ar t ") - self.assertEqual(soup.a.get_text(strip=True), "art") - self.assertEqual(soup.a.get_text(","), "a,r, , t ") - self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") - - def test_get_text_ignores_comments(self): - soup = self.soup("foobar") - self.assertEqual(soup.get_text(), "foobar") - - self.assertEqual( - soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar") - self.assertEqual( - soup.get_text(types=None), "fooIGNOREbar") - - def test_all_strings_ignores_comments(self): - soup = self.soup("foobar") - self.assertEqual(['foo', 'bar'], list(soup.strings)) - -class TestCDAtaListAttributes(SoupTest): - - """Testing cdata-list attributes like 'class'. - """ - def test_single_value_becomes_list(self): - soup = self.soup("") - self.assertEqual(["foo"],soup.a['class']) - - def test_multiple_values_becomes_list(self): - soup = self.soup("") - self.assertEqual(["foo", "bar"], soup.a['class']) - - def test_multiple_values_separated_by_weird_whitespace(self): - soup = self.soup("") - self.assertEqual(["foo", "bar", "baz"],soup.a['class']) - - def test_attributes_joined_into_string_on_output(self): - soup = self.soup("") - self.assertEqual(b'', soup.a.encode()) - - def test_accept_charset(self): - soup = self.soup('
        ') - self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset']) - - def test_cdata_attribute_applying_only_to_one_tag(self): - data = '' - soup = self.soup(data) - # We saw in another test that accept-charset is a cdata-list - # attribute for the tag. But it's not a cdata-list - # attribute for any other tag. - self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset']) - - def test_string_has_immutable_name_property(self): - string = self.soup("s").string - self.assertEqual(None, string.name) - def t(): - string.name = 'foo' - self.assertRaises(AttributeError, t) - -class TestPersistence(SoupTest): - "Testing features like pickle and deepcopy." - - def setUp(self): - super(TestPersistence, self).setUp() - self.page = """ - - - -Beautiful Soup: We called him Tortoise because he taught us. - - - - - - -foo -bar - -""" - self.tree = self.soup(self.page) - - def test_pickle_and_unpickle_identity(self): - # Pickling a tree, then unpickling it, yields a tree identical - # to the original. - dumped = pickle.dumps(self.tree, 2) - loaded = pickle.loads(dumped) - self.assertEqual(loaded.__class__, BeautifulSoup) - self.assertEqual(loaded.decode(), self.tree.decode()) - - def test_deepcopy_identity(self): - # Making a deepcopy of a tree yields an identical tree. - copied = copy.deepcopy(self.tree) - self.assertEqual(copied.decode(), self.tree.decode()) - - def test_unicode_pickle(self): - # A tree containing Unicode characters can be pickled. - html = u"\N{SNOWMAN}" - soup = self.soup(html) - dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) - loaded = pickle.loads(dumped) - self.assertEqual(loaded.decode(), soup.decode()) - - -class TestSubstitutions(SoupTest): - - def test_default_formatter_is_minimal(self): - markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" - soup = self.soup(markup) - decoded = soup.decode(formatter="minimal") - # The < is converted back into < but the e-with-acute is left alone. - self.assertEqual( - decoded, - self.document_for( - u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) - - def test_formatter_html(self): - markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" - soup = self.soup(markup) - decoded = soup.decode(formatter="html") - self.assertEqual( - decoded, - self.document_for("<<Sacré bleu!>>")) - - def test_formatter_minimal(self): - markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" - soup = self.soup(markup) - decoded = soup.decode(formatter="minimal") - # The < is converted back into < but the e-with-acute is left alone. - self.assertEqual( - decoded, - self.document_for( - u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) - - def test_formatter_null(self): - markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" - soup = self.soup(markup) - decoded = soup.decode(formatter=None) - # Neither the angle brackets nor the e-with-acute are converted. - # This is not valid HTML, but it's what the user wanted. - self.assertEqual(decoded, - self.document_for(u"<>")) - - def test_formatter_custom(self): - markup = u"<foo>bar" - soup = self.soup(markup) - decoded = soup.decode(formatter = lambda x: x.upper()) - # Instead of normal entity conversion code, the custom - # callable is called on every string. - self.assertEqual( - decoded, - self.document_for(u"BAR")) - - def test_formatter_is_run_on_attribute_values(self): - markup = u'e' - soup = self.soup(markup) - a = soup.a - - expect_minimal = u'e' - - self.assertEqual(expect_minimal, a.decode()) - self.assertEqual(expect_minimal, a.decode(formatter="minimal")) - - expect_html = u'e' - self.assertEqual(expect_html, a.decode(formatter="html")) - - self.assertEqual(markup, a.decode(formatter=None)) - expect_upper = u'E' - self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) - - def test_formatter_skips_script_tag_for_html_documents(self): - doc = """ - -""" - encoded = BeautifulSoup(doc).encode() - self.assertTrue(b"< < hey > >" in encoded) - - def test_formatter_skips_style_tag_for_html_documents(self): - doc = """ - -""" - encoded = BeautifulSoup(doc).encode() - self.assertTrue(b"< < hey > >" in encoded) - - def test_prettify_leaves_preformatted_text_alone(self): - soup = self.soup("
        foo
          \tbar\n  \n  
        baz ") - # Everything outside the
         tag is reformatted, but everything
        -        # inside is left alone.
        -        self.assertEqual(
        -            u'
        \n foo\n
          \tbar\n  \n  
        \n baz\n
        ', - soup.div.prettify()) - - def test_prettify_accepts_formatter(self): - soup = BeautifulSoup("foo") - pretty = soup.prettify(formatter = lambda x: x.upper()) - self.assertTrue("FOO" in pretty) - - def test_prettify_outputs_unicode_by_default(self): - soup = self.soup("") - self.assertEqual(unicode, type(soup.prettify())) - - def test_prettify_can_encode_data(self): - soup = self.soup("") - self.assertEqual(bytes, type(soup.prettify("utf-8"))) - - def test_html_entity_substitution_off_by_default(self): - markup = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" - soup = self.soup(markup) - encoded = soup.b.encode("utf-8") - self.assertEqual(encoded, markup.encode('utf-8')) - - def test_encoding_substitution(self): - # Here's the tag saying that a document is - # encoded in Shift-JIS. - meta_tag = ('') - soup = self.soup(meta_tag) - - # Parse the document, and the charset apprears unchanged. - self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis') - - # Encode the document into some encoding, and the encoding is - # substituted into the meta tag. - utf_8 = soup.encode("utf-8") - self.assertTrue(b"charset=utf-8" in utf_8) - - euc_jp = soup.encode("euc_jp") - self.assertTrue(b"charset=euc_jp" in euc_jp) - - shift_jis = soup.encode("shift-jis") - self.assertTrue(b"charset=shift-jis" in shift_jis) - - utf_16_u = soup.encode("utf-16").decode("utf-16") - self.assertTrue("charset=utf-16" in utf_16_u) - - def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): - markup = ('
        foo
        ') - - # Beautiful Soup used to try to rewrite the meta tag even if the - # meta tag got filtered out by the strainer. This test makes - # sure that doesn't happen. - strainer = SoupStrainer('pre') - soup = self.soup(markup, parse_only=strainer) - self.assertEqual(soup.contents[0].name, 'pre') - -class TestEncoding(SoupTest): - """Test the ability to encode objects into strings.""" - - def test_unicode_string_can_be_encoded(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertEqual(soup.b.string.encode("utf-8"), - u"\N{SNOWMAN}".encode("utf-8")) - - def test_tag_containing_unicode_string_can_be_encoded(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertEqual( - soup.b.encode("utf-8"), html.encode("utf-8")) - - def test_encoding_substitutes_unrecognized_characters_by_default(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertEqual(soup.b.encode("ascii"), b"") - - def test_encoding_can_be_made_strict(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertRaises( - UnicodeEncodeError, soup.encode, "ascii", errors="strict") - - def test_decode_contents(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) - - def test_encode_contents(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertEqual( - u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( - encoding="utf8")) - - def test_deprecated_renderContents(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertEqual( - u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) - -class TestNavigableStringSubclasses(SoupTest): - - def test_cdata(self): - # None of the current builders turn CDATA sections into CData - # objects, but you can create them manually. - soup = self.soup("") - cdata = CData("foo") - soup.insert(1, cdata) - self.assertEqual(str(soup), "") - self.assertEqual(soup.find(text="foo"), "foo") - self.assertEqual(soup.contents[0], "foo") - - def test_cdata_is_never_formatted(self): - """Text inside a CData object is passed into the formatter. - - But the return value is ignored. - """ - - self.count = 0 - def increment(*args): - self.count += 1 - return "BITTER FAILURE" - - soup = self.soup("") - cdata = CData("<><><>") - soup.insert(1, cdata) - self.assertEqual( - b"<><>]]>", soup.encode(formatter=increment)) - self.assertEqual(1, self.count) - - def test_doctype_ends_in_newline(self): - # Unlike other NavigableString subclasses, a DOCTYPE always ends - # in a newline. - doctype = Doctype("foo") - soup = self.soup("") - soup.insert(1, doctype) - self.assertEqual(soup.encode(), b"\n") - - -class TestSoupSelector(TreeTest): - - HTML = """ - - - -The title - - - - -
        -
        -

        An H1

        -

        Some text

        -

        Some more text

        -

        An H2

        -

        Another

        -Bob -

        Another H2

        -me - -span1a1 -span1a2 test - -span2a1 - - - -
        -

        English

        -

        English UK

        -

        English US

        -

        French

        -
        - - -""" - - def setUp(self): - self.soup = BeautifulSoup(self.HTML) - - def assertSelects(self, selector, expected_ids): - el_ids = [el['id'] for el in self.soup.select(selector)] - el_ids.sort() - expected_ids.sort() - self.assertEqual(expected_ids, el_ids, - "Selector %s, expected [%s], got [%s]" % ( - selector, ', '.join(expected_ids), ', '.join(el_ids) - ) - ) - - assertSelect = assertSelects - - def assertSelectMultiple(self, *tests): - for selector, expected_ids in tests: - self.assertSelect(selector, expected_ids) - - def test_one_tag_one(self): - els = self.soup.select('title') - self.assertEqual(len(els), 1) - self.assertEqual(els[0].name, 'title') - self.assertEqual(els[0].contents, [u'The title']) - - def test_one_tag_many(self): - els = self.soup.select('div') - self.assertEqual(len(els), 3) - for div in els: - self.assertEqual(div.name, 'div') - - def test_tag_in_tag_one(self): - els = self.soup.select('div div') - self.assertSelects('div div', ['inner']) - - def test_tag_in_tag_many(self): - for selector in ('html div', 'html body div', 'body div'): - self.assertSelects(selector, ['main', 'inner', 'footer']) - - def test_tag_no_match(self): - self.assertEqual(len(self.soup.select('del')), 0) - - def test_invalid_tag(self): - self.assertRaises(ValueError, self.soup.select, 'tag%t') - - def test_header_tags(self): - self.assertSelectMultiple( - ('h1', ['header1']), - ('h2', ['header2', 'header3']), - ) - - def test_class_one(self): - for selector in ('.onep', 'p.onep', 'html p.onep'): - els = self.soup.select(selector) - self.assertEqual(len(els), 1) - self.assertEqual(els[0].name, 'p') - self.assertEqual(els[0]['class'], ['onep']) - - def test_class_mismatched_tag(self): - els = self.soup.select('div.onep') - self.assertEqual(len(els), 0) - - def test_one_id(self): - for selector in ('div#inner', '#inner', 'div div#inner'): - self.assertSelects(selector, ['inner']) - - def test_bad_id(self): - els = self.soup.select('#doesnotexist') - self.assertEqual(len(els), 0) - - def test_items_in_id(self): - els = self.soup.select('div#inner p') - self.assertEqual(len(els), 3) - for el in els: - self.assertEqual(el.name, 'p') - self.assertEqual(els[1]['class'], ['onep']) - self.assertFalse(els[0].has_attr('class')) - - def test_a_bunch_of_emptys(self): - for selector in ('div#main del', 'div#main div.oops', 'div div#main'): - self.assertEqual(len(self.soup.select(selector)), 0) - - def test_multi_class_support(self): - for selector in ('.class1', 'p.class1', '.class2', 'p.class2', - '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): - self.assertSelects(selector, ['pmulti']) - - def test_multi_class_selection(self): - for selector in ('.class1.class3', '.class3.class2', - '.class1.class2.class3'): - self.assertSelects(selector, ['pmulti']) - - def test_child_selector(self): - self.assertSelects('.s1 > a', ['s1a1', 's1a2']) - self.assertSelects('.s1 > a span', ['s1a2s1']) - - def test_child_selector_id(self): - self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1']) - - def test_attribute_equals(self): - self.assertSelectMultiple( - ('p[class="onep"]', ['p1']), - ('p[id="p1"]', ['p1']), - ('[class="onep"]', ['p1']), - ('[id="p1"]', ['p1']), - ('link[rel="stylesheet"]', ['l1']), - ('link[type="text/css"]', ['l1']), - ('link[href="blah.css"]', ['l1']), - ('link[href="no-blah.css"]', []), - ('[rel="stylesheet"]', ['l1']), - ('[type="text/css"]', ['l1']), - ('[href="blah.css"]', ['l1']), - ('[href="no-blah.css"]', []), - ('p[href="no-blah.css"]', []), - ('[href="no-blah.css"]', []), - ) - - def test_attribute_tilde(self): - self.assertSelectMultiple( - ('p[class~="class1"]', ['pmulti']), - ('p[class~="class2"]', ['pmulti']), - ('p[class~="class3"]', ['pmulti']), - ('[class~="class1"]', ['pmulti']), - ('[class~="class2"]', ['pmulti']), - ('[class~="class3"]', ['pmulti']), - ('a[rel~="friend"]', ['bob']), - ('a[rel~="met"]', ['bob']), - ('[rel~="friend"]', ['bob']), - ('[rel~="met"]', ['bob']), - ) - - def test_attribute_startswith(self): - self.assertSelectMultiple( - ('[rel^="style"]', ['l1']), - ('link[rel^="style"]', ['l1']), - ('notlink[rel^="notstyle"]', []), - ('[rel^="notstyle"]', []), - ('link[rel^="notstyle"]', []), - ('link[href^="bla"]', ['l1']), - ('a[href^="http://"]', ['bob', 'me']), - ('[href^="http://"]', ['bob', 'me']), - ('[id^="p"]', ['pmulti', 'p1']), - ('[id^="m"]', ['me', 'main']), - ('div[id^="m"]', ['main']), - ('a[id^="m"]', ['me']), - ) - - def test_attribute_endswith(self): - self.assertSelectMultiple( - ('[href$=".css"]', ['l1']), - ('link[href$=".css"]', ['l1']), - ('link[id$="1"]', ['l1']), - ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), - ('div[id$="1"]', []), - ('[id$="noending"]', []), - ) - - def test_attribute_contains(self): - self.assertSelectMultiple( - # From test_attribute_startswith - ('[rel*="style"]', ['l1']), - ('link[rel*="style"]', ['l1']), - ('notlink[rel*="notstyle"]', []), - ('[rel*="notstyle"]', []), - ('link[rel*="notstyle"]', []), - ('link[href*="bla"]', ['l1']), - ('a[href*="http://"]', ['bob', 'me']), - ('[href*="http://"]', ['bob', 'me']), - ('[id*="p"]', ['pmulti', 'p1']), - ('div[id*="m"]', ['main']), - ('a[id*="m"]', ['me']), - # From test_attribute_endswith - ('[href*=".css"]', ['l1']), - ('link[href*=".css"]', ['l1']), - ('link[id*="1"]', ['l1']), - ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), - ('div[id*="1"]', []), - ('[id*="noending"]', []), - # New for this test - ('[href*="."]', ['bob', 'me', 'l1']), - ('a[href*="."]', ['bob', 'me']), - ('link[href*="."]', ['l1']), - ('div[id*="n"]', ['main', 'inner']), - ('div[id*="nn"]', ['inner']), - ) - - def test_attribute_exact_or_hypen(self): - self.assertSelectMultiple( - ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), - ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), - ('p[lang|="fr"]', ['lang-fr']), - ('p[lang|="gb"]', []), - ) - - def test_attribute_exists(self): - self.assertSelectMultiple( - ('[rel]', ['l1', 'bob', 'me']), - ('link[rel]', ['l1']), - ('a[rel]', ['bob', 'me']), - ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), - ('p[class]', ['p1', 'pmulti']), - ('[blah]', []), - ('p[blah]', []), - ) - - def test_nth_of_type(self): - # Try to select first paragraph - els = self.soup.select('div#inner p:nth-of-type(1)') - self.assertEqual(len(els), 1) - self.assertEqual(els[0].string, u'Some text') - - # Try to select third paragraph - els = self.soup.select('div#inner p:nth-of-type(3)') - self.assertEqual(len(els), 1) - self.assertEqual(els[0].string, u'Another') - - # Try to select (non-existent!) fourth paragraph - els = self.soup.select('div#inner p:nth-of-type(4)') - self.assertEqual(len(els), 0) - - # Pass in an invalid value. - self.assertRaises( - ValueError, self.soup.select, 'div p:nth-of-type(0)') - - def test_nth_of_type_direct_descendant(self): - els = self.soup.select('div#inner > p:nth-of-type(1)') - self.assertEqual(len(els), 1) - self.assertEqual(els[0].string, u'Some text') - - def test_id_child_selector_nth_of_type(self): - self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) - - def test_select_on_element(self): - # Other tests operate on the tree; this operates on an element - # within the tree. - inner = self.soup.find("div", id="main") - selected = inner.select("div") - # The
        tag was selected. The