Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439).

This commit is contained in:
JackDandy 2017-01-27 14:20:52 +00:00
parent 9b497e6df2
commit 447c7e0fa8
9 changed files with 301 additions and 93 deletions

View file

@ -9,6 +9,7 @@
* Change improve add show search results by comparing search term to an additional unidecoded result set * Change improve add show search results by comparing search term to an additional unidecoded result set
* Change webserver startup to correctly use xheaders in reverse proxy or load balance set-ups * Change webserver startup to correctly use xheaders in reverse proxy or load balance set-ups
* Update backports_abc 0.4 to 0.5 * Update backports_abc 0.4 to 0.5
* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439)
[develop changelog] [develop changelog]

View file

@ -5,26 +5,31 @@ http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup (possibly invalid) document into a tree representation. Beautiful Soup
provides provides methods and Pythonic idioms that make it easy to provides methods and Pythonic idioms that make it easy to navigate,
navigate, search, and modify the parse tree. search, and modify the parse tree.
Beautiful Soup works with Python 2.6 and up. It works better if lxml Beautiful Soup works with Python 2.7 and up. It works better if lxml
and/or html5lib is installed. and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the For more than you ever wanted to know about Beautiful Soup, see the
documentation: documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
""" """
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__author__ = "Leonard Richardson (leonardr@segfault.org)" __author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.4.0" __version__ = "4.5.3"
__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" __copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
__license__ = "MIT" __license__ = "MIT"
__all__ = ['BeautifulSoup'] __all__ = ['BeautifulSoup']
import os import os
import re import re
import traceback
import warnings import warnings
from .builder import builder_registry, ParserRejectedMarkup from .builder import builder_registry, ParserRejectedMarkup
@ -77,7 +82,7 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
def __init__(self, markup="", features=None, builder=None, def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None, parse_only=None, from_encoding=None, exclude_encodings=None,
@ -137,6 +142,10 @@ class BeautifulSoup(Tag):
from_encoding = from_encoding or deprecated_argument( from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding") "fromEncoding", "from_encoding")
if from_encoding and isinstance(markup, unicode):
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
from_encoding = None
if len(kwargs) > 0: if len(kwargs) > 0:
arg = kwargs.keys().pop() arg = kwargs.keys().pop()
raise TypeError( raise TypeError(
@ -161,19 +170,29 @@ class BeautifulSoup(Tag):
markup_type = "XML" markup_type = "XML"
else: else:
markup_type = "HTML" markup_type = "HTML"
caller = traceback.extract_stack()[0]
filename = caller[0]
line_number = caller[1]
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
filename=filename,
line_number=line_number,
parser=builder.NAME, parser=builder.NAME,
markup_type=markup_type)) markup_type=markup_type))
self.builder = builder self.builder = builder
self.is_xml = builder.is_xml self.is_xml = builder.is_xml
self.known_xml = self.is_xml
self.builder.soup = self self.builder.soup = self
self.parse_only = parse_only self.parse_only = parse_only
if hasattr(markup, 'read'): # It's a file-type object. if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read() markup = markup.read()
elif len(markup) <= 256: elif len(markup) <= 256 and (
(isinstance(markup, bytes) and not b'<' in markup)
or (isinstance(markup, unicode) and not u'<' in markup)
):
# Print out warnings for a couple beginner problems # Print out warnings for a couple beginner problems
# involving passing non-markup to Beautiful Soup. # involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup, # Beautiful Soup will still parse the input as markup,
@ -195,16 +214,10 @@ class BeautifulSoup(Tag):
if isinstance(markup, unicode): if isinstance(markup, unicode):
markup = markup.encode("utf8") markup = markup.encode("utf8")
warnings.warn( warnings.warn(
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) '"%s" looks like a filename, not markup. You should'
if markup[:5] == "http:" or markup[:6] == "https:": 'probably open this file and pass the filehandle into'
# TODO: This is ugly but I couldn't get it to work in 'Beautiful Soup.' % markup)
# Python 3 otherwise. self._check_markup_is_url(markup)
if ((isinstance(markup, bytes) and not b' ' in markup)
or (isinstance(markup, unicode) and not u' ' in markup)):
if isinstance(markup, unicode):
markup = markup.encode("utf8")
warnings.warn(
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
for (self.markup, self.original_encoding, self.declared_html_encoding, for (self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) in ( self.contains_replacement_characters) in (
@ -223,15 +236,52 @@ class BeautifulSoup(Tag):
self.builder.soup = None self.builder.soup = None
def __copy__(self): def __copy__(self):
return type(self)(self.encode(), builder=self.builder) copy = type(self)(
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
)
# Although we encoded the tree to UTF-8, that may not have
# been the encoding of the original markup. Set the copy's
# .original_encoding to reflect the original object's
# .original_encoding.
copy.original_encoding = self.original_encoding
return copy
def __getstate__(self): def __getstate__(self):
# Frequently a tree builder can't be pickled. # Frequently a tree builder can't be pickled.
d = dict(self.__dict__) d = dict(self.__dict__)
if 'builder' in d and not self.builder.picklable: if 'builder' in d and not self.builder.picklable:
del d['builder'] d['builder'] = None
return d return d
@staticmethod
def _check_markup_is_url(markup):
    """Warn if `markup` looks like a URL rather than a document.

    Markup that starts with "http:" or "https:" and contains no space
    character triggers a warning; anything else passes silently.

    :param markup: The value handed in as markup. Can be unicode or
        str (py2) / bytes (py3); any other type is ignored.
    """
    # Pick literals of the same string type as the markup so the
    # comparisons below never mix bytes and text.
    if isinstance(markup, bytes):
        space = b' '
        cant_start_with = (b"http:", b"https:")
    elif isinstance(markup, unicode):
        space = u' '
        cant_start_with = (u"http:", u"https:")
    else:
        return

    # startswith() accepts a tuple of prefixes, so no explicit loop
    # over the prefixes is needed.
    if not markup.startswith(cant_start_with) or space in markup:
        return

    if isinstance(markup, bytes):
        # Decode leniently: undecodable bytes are replaced rather than
        # raising while we are only trying to build a warning message.
        decoded_markup = markup.decode('utf-8', 'replace')
    else:
        decoded_markup = markup
    warnings.warn(
        '"%s" looks like a URL. Beautiful Soup is not an'
        ' HTTP client. You should probably use an HTTP client like'
        ' requests to get the document behind the URL, and feed'
        ' that document to Beautiful Soup.' % decoded_markup
    )
def _feed(self): def _feed(self):
# Convert the document to Unicode. # Convert the document to Unicode.
self.builder.reset() self.builder.reset()
@ -335,7 +385,18 @@ class BeautifulSoup(Tag):
if parent.next_sibling: if parent.next_sibling:
# This node is being inserted into an element that has # This node is being inserted into an element that has
# already been parsed. Deal with any dangling references. # already been parsed. Deal with any dangling references.
index = parent.contents.index(o) index = len(parent.contents)-1
while index >= 0:
if parent.contents[index] is o:
break
index -= 1
else:
raise ValueError(
"Error building tree: supposedly %r was inserted "
"into %r after the fact, but I don't see it!" % (
o, parent
)
)
if index == 0: if index == 0:
previous_element = parent previous_element = parent
previous_sibling = None previous_sibling = None
@ -387,7 +448,7 @@ class BeautifulSoup(Tag):
"""Push a start tag on to the stack. """Push a start tag on to the stack.
If this method returns None, the tag was rejected by the If this method returns None, the tag was rejected by the
SoupStrainer. You should proceed as if the tag had not occured SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag, in the document. For instance, if this was a self-closing tag,
don't call handle_endtag. don't call handle_endtag.
""" """

View file

@ -1,9 +1,13 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from collections import defaultdict from collections import defaultdict
import itertools import itertools
import sys import sys
from bs4.element import ( from bs4.element import (
CharsetMetaAttributeValue, CharsetMetaAttributeValue,
ContentMetaAttributeValue, ContentMetaAttributeValue,
HTMLAwareEntitySubstitution,
whitespace_re whitespace_re
) )
@ -227,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags. Such as which tags are empty-element tags.
""" """
preserve_whitespace_tags = set(['pre', 'textarea']) preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base']) 'spacer', 'link', 'frame', 'base'])

View file

@ -1,9 +1,12 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__all__ = [ __all__ = [
'HTML5TreeBuilder', 'HTML5TreeBuilder',
] ]
from pdb import set_trace
import warnings import warnings
import re
from bs4.builder import ( from bs4.builder import (
PERMISSIVE, PERMISSIVE,
HTML, HTML,
@ -15,7 +18,10 @@ from bs4.element import (
whitespace_re, whitespace_re,
) )
import html5lib import html5lib
from html5lib.constants import namespaces from html5lib.constants import (
namespaces,
prefixes,
)
from bs4.element import ( from bs4.element import (
Comment, Comment,
Doctype, Doctype,
@ -23,6 +29,15 @@ from bs4.element import (
Tag, Tag,
) )
# The treebuilder base module is importable as `_base` in html5lib
# releases before 0.99999999 and as `base` from 0.99999999 on. Try the
# old name first and remember which one worked: `new_html5lib` is
# consulted later to choose the keyword argument passed to the parser.
try:
    # Pre-0.99999999
    from html5lib.treebuilders import _base as treebuilder_base
    new_html5lib = False
except ImportError:
    # 0.99999999 and up
    from html5lib.treebuilders import base as treebuilder_base
    new_html5lib = True
class HTML5TreeBuilder(HTMLTreeBuilder): class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree.""" """Use html5lib to build a tree."""
@ -47,7 +62,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
if self.soup.parse_only is not None: if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder) parser = html5lib.HTMLParser(tree=self.create_treebuilder)
doc = parser.parse(markup, encoding=self.user_specified_encoding)
extra_kwargs = dict()
if not isinstance(markup, unicode):
if new_html5lib:
extra_kwargs['override_encoding'] = self.user_specified_encoding
else:
extra_kwargs['encoding'] = self.user_specified_encoding
doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer. # Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode): if isinstance(markup, unicode):
@ -55,11 +77,17 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# charEncoding to UTF-8 if it gets Unicode input. # charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None doc.original_encoding = None
else: else:
doc.original_encoding = parser.tokenizer.stream.charEncoding[0] original_encoding = parser.tokenizer.stream.charEncoding[0]
if not isinstance(original_encoding, basestring):
# In 0.99999999 and up, the encoding is an html5lib
# Encoding object. We want to use a string for compatibility
# with other tree builders.
original_encoding = original_encoding.name
doc.original_encoding = original_encoding
def create_treebuilder(self, namespaceHTMLElements): def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib( self.underlying_builder = TreeBuilderForHtml5lib(
self.soup, namespaceHTMLElements) namespaceHTMLElements, self.soup)
return self.underlying_builder return self.underlying_builder
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
@ -67,10 +95,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
return u'<html><head></head><body>%s</body></html>' % fragment return u'<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements): def __init__(self, namespaceHTMLElements, soup=None):
self.soup = soup if soup:
self.soup = soup
else:
from bs4 import BeautifulSoup
self.soup = BeautifulSoup("", "html.parser")
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self): def documentClass(self):
@ -93,7 +125,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
return TextNode(Comment(data), self.soup) return TextNode(Comment(data), self.soup)
def fragmentClass(self): def fragmentClass(self):
self.soup = BeautifulSoup("") from bs4 import BeautifulSoup
self.soup = BeautifulSoup("", "html.parser")
self.soup.name = "[document_fragment]" self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None) return Element(self.soup, self.soup, None)
@ -105,7 +138,57 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
return self.soup return self.soup
def getFragment(self): def getFragment(self):
return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element return treebuilder_base.TreeBuilder.getFragment(self).element
def testSerializer(self, element):
    """Render `element` and everything below it as an indented text dump.

    Every node becomes one line that starts with "|" followed by two
    spaces of indentation per nesting level. A tag's attributes are
    listed, sorted, on the lines directly beneath the tag, one level
    deeper than the tag itself.

    NOTE(review): presumably this is the format html5lib's own test
    suite compares against -- confirm against html5lib.

    :param element: The root node to serialize.
    :return: The dump as a single string, lines joined with newlines.
    """
    rv = []
    doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

    def serializeElement(element, indent=0):
        pad = ' ' * indent
        # Keep this order: the specific string types are tested before
        # the general NavigableString (presumably their base class --
        # confirm in bs4.element).
        if isinstance(element, Doctype):
            m = doctype_re.match(element)
            if m:
                name = m.group(1)
                if m.lastindex > 1:
                    # A PUBLIC or SYSTEM identifier was present.
                    publicId = m.group(2) or ""
                    systemId = m.group(3) or m.group(4) or ""
                    rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                              (pad, name, publicId, systemId))
                else:
                    rv.append("|%s<!DOCTYPE %s>" % (pad, name))
            else:
                rv.append("|%s<!DOCTYPE >" % (pad,))
        elif isinstance(element, Comment):
            rv.append("|%s<!-- %s -->" % (pad, element))
        elif isinstance(element, NavigableString):
            rv.append("|%s\"%s\"" % (pad, element))
        else:
            # Anything else is handled as a tag (this includes the
            # top-level soup object, which gets no special treatment).
            if element.namespace:
                tag_name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
            else:
                tag_name = element.name
            rv.append("|%s<%s>" % (pad, tag_name))
            if element.attrs:
                attributes = []
                for attr_name, attr_value in element.attrs.items():
                    if isinstance(attr_name, NamespacedAttribute):
                        attr_name = "%s %s" % (
                            prefixes[attr_name.namespace], attr_name.name)
                    if isinstance(attr_value, list):
                        # Multi-valued attributes are shown
                        # space-separated.
                        attr_value = " ".join(attr_value)
                    attributes.append((attr_name, attr_value))

                attr_pad = ' ' * (indent + 2)
                for attr_name, attr_value in sorted(attributes):
                    rv.append('|%s%s="%s"' % (attr_pad, attr_name, attr_value))
            for child in element.children:
                serializeElement(child, indent + 2)

    serializeElement(element, 0)
    return "\n".join(rv)
class AttrList(object): class AttrList(object):
def __init__(self, element): def __init__(self, element):
@ -137,9 +220,9 @@ class AttrList(object):
return name in list(self.attrs.keys()) return name in list(self.attrs.keys())
class Element(html5lib.treebuilders._base.Node): class Element(treebuilder_base.Node):
def __init__(self, element, soup, namespace): def __init__(self, element, soup, namespace):
html5lib.treebuilders._base.Node.__init__(self, element.name) treebuilder_base.Node.__init__(self, element.name)
self.element = element self.element = element
self.soup = soup self.soup = soup
self.namespace = namespace self.namespace = namespace
@ -158,8 +241,10 @@ class Element(html5lib.treebuilders._base.Node):
child = node child = node
elif node.element.__class__ == NavigableString: elif node.element.__class__ == NavigableString:
string_child = child = node.element string_child = child = node.element
node.parent = self
else: else:
child = node.element child = node.element
node.parent = self
if not isinstance(child, basestring) and child.parent is not None: if not isinstance(child, basestring) and child.parent is not None:
node.element.extract() node.element.extract()
@ -197,6 +282,8 @@ class Element(html5lib.treebuilders._base.Node):
most_recent_element=most_recent_element) most_recent_element=most_recent_element)
def getAttributes(self): def getAttributes(self):
if isinstance(self.element, Comment):
return {}
return AttrList(self.element) return AttrList(self.element)
def setAttributes(self, attributes): def setAttributes(self, attributes):
@ -224,11 +311,11 @@ class Element(html5lib.treebuilders._base.Node):
attributes = property(getAttributes, setAttributes) attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None): def insertText(self, data, insertBefore=None):
text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore: if insertBefore:
text = TextNode(self.soup.new_string(data), self.soup) self.insertBefore(text, insertBefore)
self.insertBefore(data, insertBefore)
else: else:
self.appendChild(data) self.appendChild(text)
def insertBefore(self, node, refNode): def insertBefore(self, node, refNode):
index = self.element.index(refNode.element) index = self.element.index(refNode.element)
@ -250,6 +337,7 @@ class Element(html5lib.treebuilders._base.Node):
# print "MOVE", self.element.contents # print "MOVE", self.element.contents
# print "FROM", self.element # print "FROM", self.element
# print "TO", new_parent.element # print "TO", new_parent.element
element = self.element element = self.element
new_parent_element = new_parent.element new_parent_element = new_parent.element
# Determine what this tag's next_element will be once all the children # Determine what this tag's next_element will be once all the children
@ -268,7 +356,6 @@ class Element(html5lib.treebuilders._base.Node):
new_parents_last_descendant_next_element = new_parent_element.next_element new_parents_last_descendant_next_element = new_parent_element.next_element
to_append = element.contents to_append = element.contents
append_after = new_parent_element.contents
if len(to_append) > 0: if len(to_append) > 0:
# Set the first child's previous_element and previous_sibling # Set the first child's previous_element and previous_sibling
# to elements within the new parent # to elements within the new parent
@ -285,12 +372,19 @@ class Element(html5lib.treebuilders._base.Node):
if new_parents_last_child: if new_parents_last_child:
new_parents_last_child.next_sibling = first_child new_parents_last_child.next_sibling = first_child
# Fix the last child's next_element and next_sibling # Find the very last element being moved. It is now the
last_child = to_append[-1] # parent's last descendant. It has no .next_sibling and
last_child.next_element = new_parents_last_descendant_next_element # its .next_element is whatever the previous last
# descendant had.
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
if new_parents_last_descendant_next_element: if new_parents_last_descendant_next_element:
new_parents_last_descendant_next_element.previous_element = last_child # TODO: This code has no test coverage and I'm not sure
last_child.next_sibling = None # how to get html5lib to go through this path, but it's
# just the other side of the previous line.
new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
last_childs_last_descendant.next_sibling = None
for child in to_append: for child in to_append:
child.parent = new_parent_element child.parent = new_parent_element
@ -324,7 +418,7 @@ class Element(html5lib.treebuilders._base.Node):
class TextNode(Element): class TextNode(Element):
def __init__(self, element, soup): def __init__(self, element, soup):
html5lib.treebuilders._base.Node.__init__(self, None) treebuilder_base.Node.__init__(self, None)
self.element = element self.element = element
self.soup = soup self.soup = soup

View file

@ -1,5 +1,8 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad.""" """Use the HTMLParser library to parse HTML files that aren't too bad."""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__all__ = [ __all__ = [
'HTMLParserTreeBuilder', 'HTMLParserTreeBuilder',
] ]

View file

@ -1,3 +1,5 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__all__ = [ __all__ = [
'LXMLTreeBuilderForXML', 'LXMLTreeBuilderForXML',
'LXMLTreeBuilder', 'LXMLTreeBuilder',
@ -12,6 +14,7 @@ from bs4.element import (
Doctype, Doctype,
NamespacedAttribute, NamespacedAttribute,
ProcessingInstruction, ProcessingInstruction,
XMLProcessingInstruction,
) )
from bs4.builder import ( from bs4.builder import (
FAST, FAST,
@ -29,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser DEFAULT_PARSER_CLASS = etree.XMLParser
is_xml = True is_xml = True
processing_instruction_class = XMLProcessingInstruction
NAME = "lxml-xml" NAME = "lxml-xml"
ALTERNATE_NAMES = ["xml"] ALTERNATE_NAMES = ["xml"]
@ -87,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):
Each 4-tuple represents a strategy for parsing the document. Each 4-tuple represents a strategy for parsing the document.
""" """
# Instead of using UnicodeDammit to convert the bytestring to
# Unicode using different encodings, use EncodingDetector to
# iterate over the encodings, and tell lxml to try to parse
# the document as each one in turn.
is_html = not self.is_xml
if is_html:
self.processing_instruction_class = ProcessingInstruction
else:
self.processing_instruction_class = XMLProcessingInstruction
if isinstance(markup, unicode): if isinstance(markup, unicode):
# We were given Unicode. Maybe lxml can parse Unicode on # We were given Unicode. Maybe lxml can parse Unicode on
# this system? # this system?
@ -98,11 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
yield (markup.encode("utf8"), "utf8", yield (markup.encode("utf8"), "utf8",
document_declared_encoding, False) document_declared_encoding, False)
# Instead of using UnicodeDammit to convert the bytestring to
# Unicode using different encodings, use EncodingDetector to
# iterate over the encodings, and tell lxml to try to parse
# the document as each one in turn.
is_html = not self.is_xml
try_encodings = [user_specified_encoding, document_declared_encoding] try_encodings = [user_specified_encoding, document_declared_encoding]
detector = EncodingDetector( detector = EncodingDetector(
markup, try_encodings, is_html, exclude_encodings) markup, try_encodings, is_html, exclude_encodings)
@ -201,7 +210,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def pi(self, target, data): def pi(self, target, data):
self.soup.endData() self.soup.endData()
self.soup.handle_data(target + ' ' + data) self.soup.handle_data(target + ' ' + data)
self.soup.endData(ProcessingInstruction) self.soup.endData(self.processing_instruction_class)
def data(self, content): def data(self, content):
self.soup.handle_data(content) self.soup.handle_data(content)
@ -229,6 +238,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
is_xml = False is_xml = False
processing_instruction_class = ProcessingInstruction
def default_parser(self, encoding): def default_parser(self, encoding):
return etree.HTMLParser return etree.HTMLParser

View file

@ -6,9 +6,10 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job. XML or HTML to reflect a new encoding; that's the tree builder's job.
""" """
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__license__ = "MIT" __license__ = "MIT"
from pdb import set_trace
import codecs import codecs
from htmlentitydefs import codepoint2name from htmlentitydefs import codepoint2name
import re import re
@ -309,7 +310,7 @@ class EncodingDetector:
else: else:
xml_endpos = 1024 xml_endpos = 1024
html_endpos = max(2048, int(len(markup) * 0.05)) html_endpos = max(2048, int(len(markup) * 0.05))
declared_encoding = None declared_encoding = None
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
if not declared_encoding_match and is_html: if not declared_encoding_match and is_html:
@ -346,7 +347,7 @@ class UnicodeDammit:
self.tried_encodings = [] self.tried_encodings = []
self.contains_replacement_characters = False self.contains_replacement_characters = False
self.is_html = is_html self.is_html = is_html
self.log = logging.getLogger(__name__)
self.detector = EncodingDetector( self.detector = EncodingDetector(
markup, override_encodings, is_html, exclude_encodings) markup, override_encodings, is_html, exclude_encodings)
@ -376,9 +377,10 @@ class UnicodeDammit:
if encoding != "ascii": if encoding != "ascii":
u = self._convert_from(encoding, "replace") u = self._convert_from(encoding, "replace")
if u is not None: if u is not None:
logging.warning( self.log.warning(
"Some characters could not be decoded, and were " "Some characters could not be decoded, and were "
"replaced with REPLACEMENT CHARACTER.") "replaced with REPLACEMENT CHARACTER."
)
self.contains_replacement_characters = True self.contains_replacement_characters = True
break break
@ -734,7 +736,7 @@ class UnicodeDammit:
0xde : b'\xc3\x9e', # Þ 0xde : b'\xc3\x9e', # Þ
0xdf : b'\xc3\x9f', # ß 0xdf : b'\xc3\x9f', # ß
0xe0 : b'\xc3\xa0', # à 0xe0 : b'\xc3\xa0', # à
0xe1 : b'\xa1', # á 0xe1 : b'\xa1', # á
0xe2 : b'\xc3\xa2', # â 0xe2 : b'\xc3\xa2', # â
0xe3 : b'\xc3\xa3', # ã 0xe3 : b'\xc3\xa3', # ã
0xe4 : b'\xc3\xa4', # ä 0xe4 : b'\xc3\xa4', # ä

View file

@ -1,5 +1,7 @@
"""Diagnostic functions, mainly for use when doing tech support.""" """Diagnostic functions, mainly for use when doing tech support."""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__license__ = "MIT" __license__ = "MIT"
import cProfile import cProfile
@ -56,7 +58,8 @@ def diagnose(data):
data = data.read() data = data.read()
elif os.path.exists(data): elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data print '"%s" looks like a filename. Reading data from the file.' % data
data = open(data).read() with open(data) as fp:
data = fp.read()
elif data.startswith("http:") or data.startswith("https:"): elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."

View file

@ -1,8 +1,10 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__license__ = "MIT" __license__ = "MIT"
from pdb import set_trace
import collections import collections
import re import re
import shlex
import sys import sys
import warnings import warnings
from bs4.dammit import EntitySubstitution from bs4.dammit import EntitySubstitution
@ -99,6 +101,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
preformatted_tags = set(["pre"]) preformatted_tags = set(["pre"])
preserve_whitespace_tags = set(['pre', 'textarea'])
@classmethod @classmethod
def _substitute_if_appropriate(cls, ns, f): def _substitute_if_appropriate(cls, ns, f):
if (isinstance(ns, NavigableString) if (isinstance(ns, NavigableString)
@ -169,11 +173,19 @@ class PageElement(object):
This is used when mapping a formatter name ("minimal") to an This is used when mapping a formatter name ("minimal") to an
appropriate function (one that performs entity-substitution on appropriate function (one that performs entity-substitution on
the contents of <script> and <style> tags, or not). It's the contents of <script> and <style> tags, or not). It can be
inefficient, but it should be called very rarely. inefficient, but it should be called very rarely.
""" """
if self.known_xml is not None:
# Most of the time we will have determined this when the
# document is parsed.
return self.known_xml
# Otherwise, it's likely that this element was created by
# direct invocation of the constructor from within the user's
# Python code.
if self.parent is None: if self.parent is None:
# This is the top-level object. It should have .is_xml set # This is the top-level object. It should have .known_xml set
# from tree creation. If not, take a guess--BS is usually # from tree creation. If not, take a guess--BS is usually
# used on HTML markup. # used on HTML markup.
return getattr(self, 'is_xml', False) return getattr(self, 'is_xml', False)
@ -637,7 +649,7 @@ class PageElement(object):
return lambda el: el._attr_value_as_string( return lambda el: el._attr_value_as_string(
attribute, '').startswith(value) attribute, '').startswith(value)
elif operator == '$': elif operator == '$':
# string represenation of `attribute` ends with `value` # string representation of `attribute` ends with `value`
return lambda el: el._attr_value_as_string( return lambda el: el._attr_value_as_string(
attribute, '').endswith(value) attribute, '').endswith(value)
elif operator == '*': elif operator == '*':
@ -677,6 +689,11 @@ class NavigableString(unicode, PageElement):
PREFIX = '' PREFIX = ''
SUFFIX = '' SUFFIX = ''
# We can't tell just by looking at a string whether it's contained
# in an XML document or an HTML document.
known_xml = None
def __new__(cls, value): def __new__(cls, value):
"""Create a new NavigableString. """Create a new NavigableString.
@ -743,10 +760,16 @@ class CData(PreformattedString):
SUFFIX = u']]>' SUFFIX = u']]>'
class ProcessingInstruction(PreformattedString): class ProcessingInstruction(PreformattedString):
"""A SGML processing instruction."""
PREFIX = u'<?' PREFIX = u'<?'
SUFFIX = u'>' SUFFIX = u'>'
class XMLProcessingInstruction(ProcessingInstruction):
"""An XML processing instruction."""
PREFIX = u'<?'
SUFFIX = u'?>'
class Comment(PreformattedString): class Comment(PreformattedString):
PREFIX = u'<!--' PREFIX = u'<!--'
@ -781,7 +804,8 @@ class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents.""" """Represents a found HTML tag with its attributes and contents."""
def __init__(self, parser=None, builder=None, name=None, namespace=None, def __init__(self, parser=None, builder=None, name=None, namespace=None,
prefix=None, attrs=None, parent=None, previous=None): prefix=None, attrs=None, parent=None, previous=None,
is_xml=None):
"Basic constructor." "Basic constructor."
if parser is None: if parser is None:
@ -795,6 +819,14 @@ class Tag(PageElement):
self.name = name self.name = name
self.namespace = namespace self.namespace = namespace
self.prefix = prefix self.prefix = prefix
if builder is not None:
preserve_whitespace_tags = builder.preserve_whitespace_tags
else:
if is_xml:
preserve_whitespace_tags = []
else:
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
self.preserve_whitespace_tags = preserve_whitespace_tags
if attrs is None: if attrs is None:
attrs = {} attrs = {}
elif attrs: elif attrs:
@ -805,6 +837,13 @@ class Tag(PageElement):
attrs = dict(attrs) attrs = dict(attrs)
else: else:
attrs = dict(attrs) attrs = dict(attrs)
# If possible, determine ahead of time whether this tag is an
# XML tag.
if builder:
self.known_xml = builder.is_xml
else:
self.known_xml = is_xml
self.attrs = attrs self.attrs = attrs
self.contents = [] self.contents = []
self.setup(parent, previous) self.setup(parent, previous)
@ -824,7 +863,7 @@ class Tag(PageElement):
Its contents are a copy of the old Tag's contents. Its contents are a copy of the old Tag's contents.
""" """
clone = type(self)(None, self.builder, self.name, self.namespace, clone = type(self)(None, self.builder, self.name, self.namespace,
self.nsprefix, self.attrs) self.nsprefix, self.attrs, is_xml=self._is_xml)
for attr in ('can_be_empty_element', 'hidden'): for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr)) setattr(clone, attr, getattr(self, attr))
for child in self.contents: for child in self.contents:
@ -997,7 +1036,7 @@ class Tag(PageElement):
tag_name, tag_name)) tag_name, tag_name))
return self.find(tag_name) return self.find(tag_name)
# We special case contents to avoid recursion. # We special case contents to avoid recursion.
elif not tag.startswith("__") and not tag=="contents": elif not tag.startswith("__") and not tag == "contents":
return self.find(tag) return self.find(tag)
raise AttributeError( raise AttributeError(
"'%s' object has no attribute '%s'" % (self.__class__, tag)) "'%s' object has no attribute '%s'" % (self.__class__, tag))
@ -1057,10 +1096,11 @@ class Tag(PageElement):
def _should_pretty_print(self, indent_level): def _should_pretty_print(self, indent_level):
"""Should this tag be pretty-printed?""" """Should this tag be pretty-printed?"""
return ( return (
indent_level is not None and indent_level is not None
(self.name not in HTMLAwareEntitySubstitution.preformatted_tags and self.name not in self.preserve_whitespace_tags
or self._is_xml)) )
def decode(self, indent_level=None, def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING, eventual_encoding=DEFAULT_OUTPUT_ENCODING,
@ -1280,6 +1320,7 @@ class Tag(PageElement):
_selector_combinators = ['>', '+', '~'] _selector_combinators = ['>', '+', '~']
_select_debug = False _select_debug = False
quoted_colon = re.compile('"[^"]*:[^"]*"')
def select_one(self, selector): def select_one(self, selector):
"""Perform a CSS selection operation on the current element.""" """Perform a CSS selection operation on the current element."""
value = self.select(selector, limit=1) value = self.select(selector, limit=1)
@ -1305,8 +1346,7 @@ class Tag(PageElement):
if limit and len(context) >= limit: if limit and len(context) >= limit:
break break
return context return context
tokens = shlex.split(selector)
tokens = selector.split()
current_context = [self] current_context = [self]
if tokens[-1] in self._selector_combinators: if tokens[-1] in self._selector_combinators:
@ -1358,7 +1398,7 @@ class Tag(PageElement):
return classes.issubset(candidate.get('class', [])) return classes.issubset(candidate.get('class', []))
checker = classes_match checker = classes_match
elif ':' in token: elif ':' in token and not self.quoted_colon.search(token):
# Pseudo-class # Pseudo-class
tag_name, pseudo = token.split(':', 1) tag_name, pseudo = token.split(':', 1)
if tag_name == '': if tag_name == '':
@ -1389,11 +1429,8 @@ class Tag(PageElement):
self.count += 1 self.count += 1
if self.count == self.destination: if self.count == self.destination:
return True return True
if self.count > self.destination: else:
# Stop the generator that's sending us return False
# these things.
raise StopIteration()
return False
checker = Counter(pseudo_value).nth_child_of_type checker = Counter(pseudo_value).nth_child_of_type
else: else:
raise NotImplementedError( raise NotImplementedError(
@ -1498,13 +1535,12 @@ class Tag(PageElement):
# don't include it in the context more than once. # don't include it in the context more than once.
new_context.append(candidate) new_context.append(candidate)
new_context_ids.add(id(candidate)) new_context_ids.add(id(candidate))
if limit and len(new_context) >= limit:
break
elif self._select_debug: elif self._select_debug:
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
current_context = new_context current_context = new_context
if limit and len(current_context) >= limit:
current_context = current_context[:limit]
if self._select_debug: if self._select_debug:
print "Final verdict:" print "Final verdict:"
@ -1668,21 +1704,15 @@ class SoupStrainer(object):
if isinstance(markup, list) or isinstance(markup, tuple): if isinstance(markup, list) or isinstance(markup, tuple):
# This should only happen when searching a multi-valued attribute # This should only happen when searching a multi-valued attribute
# like 'class'. # like 'class'.
if (isinstance(match_against, unicode) for item in markup:
and ' ' in match_against): if self._matches(item, match_against):
# A bit of a special case. If they try to match "foo return True
# bar" on a multivalue attribute's value, only accept # We didn't match any particular value of the multivalue
# the literal value "foo bar" # attribute, but maybe we match the attribute value when
# # considered as a string.
# XXX This is going to be pretty slow because we keep if self._matches(' '.join(markup), match_against):
# splitting match_against. But it shouldn't come up return True
# too often. return False
return (whitespace_re.split(match_against) == markup)
else:
for item in markup:
if self._matches(item, match_against):
return True
return False
if match_against is True: if match_against is True:
# True matches any non-None value. # True matches any non-None value.