Update Beautiful Soup 4.6.0 (r449) → 4.6.3 (r475)

JackDandy 2018-09-01 01:51:31 +01:00
parent a7784b6bb0
commit 5f6edbb5f5
9 changed files with 273 additions and 91 deletions

View file

@@ -1,5 +1,7 @@
 ### 0.18.0 (2018-xx-xx xx:xx:xx UTC)
+* Update Beautiful Soup 4.6.0 (r449) to 4.6.3 (r475)
 [develop changelog]

View file

@@ -21,14 +21,15 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 # found in the LICENSE file.

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.5.3"
-__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
+__version__ = "4.6.3"
+__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson"
 __license__ = "MIT"

 __all__ = ['BeautifulSoup']

 import os
 import re
+import sys
 import traceback
 import warnings
@@ -82,14 +83,46 @@ class BeautifulSoup(Tag):
     ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

-    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, exclude_encodings=None,
                  **kwargs):
-        """The Soup object is initialized as the 'root tag', and the
-        provided markup (which can be a string or a file-like object)
-        is fed into the underlying parser."""
+        """Constructor.
+
+        :param markup: A string or a file-like object representing
+         markup to be parsed.
+        :param features: Desirable features of the parser to be used. This
+        may be the name of a specific parser ("lxml", "lxml-xml",
+        "html.parser", or "html5lib") or it may be the type of markup
+        to be used ("html", "html5", "xml"). It's recommended that you
+        name a specific parser, so that Beautiful Soup gives you the
+        same results across platforms and virtual environments.
+        :param builder: A specific TreeBuilder to use instead of looking one
+        up based on `features`. You shouldn't need to use this.
+        :param parse_only: A SoupStrainer. Only parts of the document
+        matching the SoupStrainer will be considered. This is useful
+        when parsing part of a document that would otherwise be too
+        large to fit into memory.
+        :param from_encoding: A string indicating the encoding of the
+        document to be parsed. Pass this in if Beautiful Soup is
+        guessing wrongly about the document's encoding.
+        :param exclude_encodings: A list of strings indicating
+        encodings known to be wrong. Pass this in if you don't know
+        the document's encoding but you know Beautiful Soup's guess is
+        wrong.
+        :param kwargs: For backwards compatibility purposes, the
+        constructor accepts certain keyword arguments used in
+        Beautiful Soup 3. None of these arguments do anything in
+        Beautiful Soup 4 and there's no need to actually pass keyword
+        arguments into the constructor.
+        """

         if 'convertEntities' in kwargs:
             warnings.warn(
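
Editorial usage note (not part of the diff): with the parameters described in the new docstring, a typical call names a parser explicitly and can restrict parsing with a SoupStrainer. A minimal sketch, assuming the stdlib "html.parser" backend:

    from bs4 import BeautifulSoup, SoupStrainer

    markup = '<div><a href="/one">one</a><p>text</p></div>'

    # Naming the parser avoids the "no parser specified" warning and keeps
    # results consistent across environments.
    soup = BeautifulSoup(markup, features="html.parser")

    # parse_only keeps just the parts matching the strainer.
    only_links = SoupStrainer("a")
    links = BeautifulSoup(markup, "html.parser", parse_only=only_links)
    print(links.a["href"])  # /one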
@@ -171,14 +204,35 @@ class BeautifulSoup(Tag):
             else:
                 markup_type = "HTML"

-            caller = traceback.extract_stack()[0]
-            filename = caller[0]
-            line_number = caller[1]
-            warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
-                filename=filename,
-                line_number=line_number,
-                parser=builder.NAME,
-                markup_type=markup_type))
+            # This code adapted from warnings.py so that we get the same line
+            # of code as our warnings.warn() call gets, even if the answer is wrong
+            # (as it may be in a multithreading situation).
+            caller = None
+            try:
+                caller = sys._getframe(1)
+            except ValueError:
+                pass
+            if caller:
+                globals = caller.f_globals
+                line_number = caller.f_lineno
+            else:
+                globals = sys.__dict__
+                line_number= 1
+            filename = globals.get('__file__')
+            if filename:
+                fnl = filename.lower()
+                if fnl.endswith((".pyc", ".pyo")):
+                    filename = filename[:-1]
+            if filename:
+                # If there is no filename at all, the user is most likely in a REPL,
+                # and the warning is not necessary.
+                values = dict(
+                    filename=filename,
+                    line_number=line_number,
+                    parser=builder.NAME,
+                    markup_type=markup_type
+                )
+                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)

         self.builder = builder
         self.is_xml = builder.is_xml
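
For context on the frame inspection above: sys._getframe(1) returns the caller's frame, whose f_globals and f_lineno supply the '__file__' and line number to report, the same trick warnings.py uses. An illustrative standalone sketch (not part of the library):

    import sys

    def who_called_me():
        # Frame 0 is this function; frame 1 is whoever called it.
        caller = sys._getframe(1)
        return caller.f_globals.get('__file__'), caller.f_lineno

    # Called from a script, this reports that script's path and the call site's line number.
    print(who_called_me())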
@@ -302,9 +356,10 @@ class BeautifulSoup(Tag):
         self.preserve_whitespace_tag_stack = []
         self.pushTag(self)

-    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
+    def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
         """Create a new tag associated with this soup."""
-        return Tag(None, self.builder, name, namespace, nsprefix, attrs)
+        kwattrs.update(attrs)
+        return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)

     def new_string(self, s, subclass=NavigableString):
         """Create a new NavigableString associated with this soup."""

View file

@@ -93,7 +93,7 @@ class TreeBuilder(object):
     preserve_whitespace_tags = set()
     empty_element_tags = None # A tag will be considered an empty-element
                               # tag when and only when it has no contents.
     # A value for these tag/attribute combinations is a space- or
     # comma-separated list of CDATA, rather than a single CDATA.
     cdata_list_attributes = {}
@@ -125,7 +125,7 @@ class TreeBuilder(object):
         if self.empty_element_tags is None:
             return True
         return tag_name in self.empty_element_tags

     def feed(self, markup):
         raise NotImplementedError()
@@ -235,11 +235,17 @@ class HTMLTreeBuilder(TreeBuilder):
     empty_element_tags = set([
         # These are from HTML5.
         'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
-        # These are from HTML4, removed in HTML5.
-        'spacer', 'frame'
+        # These are from earlier versions of HTML and are removed in HTML5.
+        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
     ])

+    # The HTML standard defines these as block-level elements. Beautiful
+    # Soup does not treat these elements differently from other elements,
+    # but it may do so eventually, and this information is available if
+    # you need to use it.
+    block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
     # The HTML standard defines these attributes as containing a
     # space-separated list of values, not a single value. That is,
     # class="foo bar" means that the 'class' attribute has two values,

View file

@@ -30,14 +30,13 @@ from bs4.element import (
     )

 try:
-    # 0.99999999 and up
-    from html5lib.treebuilders import base as treebuilder_base
-    old_html5lib = False
-except ImportError:
     # Pre-0.99999999
     from html5lib.treebuilders import _base as treebuilder_base
-    old_html5lib = True
+    new_html5lib = False
+except ImportError, e:
+    # 0.99999999 and up
+    from html5lib.treebuilders import base as treebuilder_base
+    new_html5lib = True

 class HTML5TreeBuilder(HTMLTreeBuilder):
     """Use html5lib to build a tree."""
@@ -66,7 +65,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         extra_kwargs = dict()
         if not isinstance(markup, unicode):
-            if not old_html5lib:
+            if new_html5lib:
                 extra_kwargs['override_encoding'] = self.user_specified_encoding
             else:
                 extra_kwargs['encoding'] = self.user_specified_encoding

View file

@@ -1,3 +1,4 @@
+# encoding: utf-8
 """Use the HTMLParser library to parse HTML files that aren't too bad."""

 # Use of this source code is governed by a BSD-style license that can be
@@ -64,7 +65,18 @@ class BeautifulSoupHTMLParser(HTMLParser):
         # order. It's a list of closing tags we've already handled and
         # will ignore, assuming they ever show up.
         self.already_closed_empty_element = []

+    def error(self, msg):
+        """In Python 3, HTMLParser subclasses must implement error(), although this
+        requirement doesn't appear to be documented.
+
+        In Python 2, HTMLParser implements error() as raising an exception.
+
+        In any event, this method is called only on very strange markup and our best strategy
+        is to pretend it didn't happen and keep going.
+        """
+        warnings.warn(msg)
+
     def handle_startendtag(self, name, attrs):
         # This is only called when the markup looks like
         # <tag/>.
@@ -129,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
         else:
             real_name = int(name)

-        try:
-            data = unichr(real_name)
-        except (ValueError, OverflowError), e:
-            data = u"\N{REPLACEMENT CHARACTER}"
+        data = None
+        if real_name < 256:
+            # HTML numeric entities are supposed to reference Unicode
+            # code points, but sometimes they reference code points in
+            # some other encoding (ahem, Windows-1252). E.g. &#147;
+            # instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
+            # code tries to detect this situation and compensate.
+            for encoding in (self.soup.original_encoding, 'windows-1252'):
+                if not encoding:
+                    continue
+                try:
+                    data = bytearray([real_name]).decode(encoding)
+                except UnicodeDecodeError, e:
+                    pass
+        if not data:
+            try:
+                data = unichr(real_name)
+            except (ValueError, OverflowError), e:
+                pass
+        data = data or u"\N{REPLACEMENT CHARACTER}"
         self.handle_data(data)

     def handle_entityref(self, name):
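
Editorial usage note on the numeric-entity handling above: a reference such as &#147; is not a printable Unicode code point, but 147 is LEFT DOUBLE QUOTATION MARK in Windows-1252, so the parser now falls back to that interpretation instead of emitting a replacement character. A hedged sketch with the stdlib parser:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>&#147;quoted&#148;</p>", "html.parser")
    # Expected: curly quotation marks (U+201C / U+201D), not U+FFFD.
    print(repr(soup.p.string))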
@@ -141,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
         if character is not None:
             data = character
         else:
-            data = "&%s;" % name
+            # If this were XML, it would be ambiguous whether "&foo"
+            # was an character entity reference with a missing
+            # semicolon or the literal string "&foo". Since this is
+            # HTML, we have a complete list of all character entity references,
+            # and this one wasn't found, so assume it's the literal string "&foo".
+            data = "&%s" % name
         self.handle_data(data)

     def handle_comment(self, data):
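
Editorial usage note on the handle_entityref() change above: an unrecognized entity reference without a semicolon is now passed through as the literal text "&name" rather than having a semicolon appended. A small sketch, stdlib parser assumed:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>AT&amp;T &unknownentity here</p>", "html.parser")
    # Expected: AT&T &unknownentity here
    print(soup.p.string)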
@@ -213,6 +245,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         parser.soup = self.soup
         try:
             parser.feed(markup)
+            parser.close()
         except HTMLParseError, e:
             warnings.warn(RuntimeWarning(
                 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))

View file

@@ -5,9 +5,13 @@ __all__ = [
     'LXMLTreeBuilder',
     ]

+try:
+    from collections.abc import Callable # Python 3.6
+except ImportError , e:
+    from collections import Callable
+
 from io import BytesIO
 from StringIO import StringIO
-import collections
 from lxml import etree
 from bs4.element import (
     Comment,
@@ -58,7 +62,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             # Use the default parser.
             parser = self.default_parser(encoding)

-        if isinstance(parser, collections.Callable):
+        if isinstance(parser, Callable):
             # Instantiate the parser with default arguments
             parser = parser(target=self, strip_cdata=False, encoding=encoding)
         return parser
@@ -147,11 +151,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         attrs = dict(attrs)
         nsprefix = None
         # Invert each namespace map as it comes in.
-        if len(self.nsmaps) > 1:
+        if len(nsmap) == 0 and len(self.nsmaps) > 1:
             # There are no new namespaces for this tag, but
             # non-default namespaces are in play, so we need a
             # separate tag stack to know when they end.
             self.nsmaps.append(None)
         elif len(nsmap) > 0:
             # A new namespace mapping has come into play.
             inverted_nsmap = dict((value, key) for key, value in nsmap.items())
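
Editorial usage note on the nsmap fix above: it matters for XML where a non-default namespace is in scope and a child tag introduces no new namespaces of its own. A hedged sketch, assuming lxml is installed so the "lxml-xml" builder is available:

    from bs4 import BeautifulSoup

    doc = '<root xmlns:ns="http://example.com/ns"><ns:item><child/></ns:item></root>'
    soup = BeautifulSoup(doc, "lxml-xml")
    item = soup.find("ns:item")
    print(item.prefix)  # ns
    print(item.name)    # item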

View file

@@ -46,9 +46,9 @@ except ImportError:
     pass

 xml_encoding_re = re.compile(
-    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
 html_meta_re = re.compile(
-    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+    '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)

 class EntitySubstitution(object):
@@ -82,7 +82,7 @@ class EntitySubstitution(object):
         }

     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
-                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                            ")")
     AMPERSAND_OR_BRACKET = re.compile("([<>&])")

View file

@@ -37,7 +37,7 @@ def diagnose(data):
               name)

     if 'lxml' in basic_parsers:
-        basic_parsers.append(["lxml", "xml"])
+        basic_parsers.append("lxml-xml")
         try:
             from lxml import etree
             print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
@@ -56,21 +56,27 @@ def diagnose(data):
     if hasattr(data, 'read'):
         data = data.read()
-    elif os.path.exists(data):
-        print '"%s" looks like a filename. Reading data from the file.' % data
-        with open(data) as fp:
-            data = fp.read()
     elif data.startswith("http:") or data.startswith("https:"):
         print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
         print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
         return
-    print
+    else:
+        try:
+            if os.path.exists(data):
+                print '"%s" looks like a filename. Reading data from the file.' % data
+                with open(data) as fp:
+                    data = fp.read()
+        except ValueError:
+            # This can happen on some platforms when the 'filename' is
+            # too long. Assume it's data and not a filename.
+            pass
+    print

     for parser in basic_parsers:
         print "Trying to parse your markup with %s" % parser
         success = False
         try:
-            soup = BeautifulSoup(data, parser)
+            soup = BeautifulSoup(data, features=parser)
             success = True
         except Exception, e:
             print "%s could not parse the markup." % parser

View file

@@ -2,7 +2,10 @@
 # found in the LICENSE file.

 __license__ = "MIT"

-import collections
+try:
+    from collections.abc import Callable # Python 3.6
+except ImportError , e:
+    from collections import Callable
 import re
 import shlex
 import sys
@@ -12,7 +15,7 @@ from bs4.dammit import EntitySubstitution
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 PY3K = (sys.version_info[0] > 2)

-whitespace_re = re.compile("\s+")
+whitespace_re = re.compile(r"\s+")

 def _alias(attr):
     """Alias one attribute name to another for backward compatibility"""
@@ -69,7 +72,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
     The value of the 'content' attribute will be one of these objects.
     """

-    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

     def __new__(cls, original_value):
         match = cls.CHARSET_RE.search(original_value)
@@ -123,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
         return cls._substitute_if_appropriate(
             ns, EntitySubstitution.substitute_xml)

+class Formatter(object):
+    """Contains information about how to format a parse tree."""
+
+    # By default, represent void elements as <tag/> rather than <tag>
+    void_element_close_prefix = '/'
+
+    def substitute_entities(self, *args, **kwargs):
+        """Transform certain characters into named entities."""
+        raise NotImplementedError()
+
+class HTMLFormatter(Formatter):
+    """The default HTML formatter."""
+    def substitute(self, *args, **kwargs):
+        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+class MinimalHTMLFormatter(Formatter):
+    """A minimal HTML formatter."""
+    def substitute(self, *args, **kwargs):
+        return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
+
+class HTML5Formatter(HTMLFormatter):
+    """An HTML formatter that omits the slash in a void tag."""
+    void_element_close_prefix = None
+
+class XMLFormatter(Formatter):
+    """Substitute only the essential XML entities."""
+    def substitute(self, *args, **kwargs):
+        return EntitySubstitution.substitute_xml(*args, **kwargs)
+
+class HTMLXMLFormatter(Formatter):
+    """Format XML using HTML rules."""
+    def substitute(self, *args, **kwargs):
+        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
 class PageElement(object):
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
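
Editorial usage note on the Formatter classes above: they are selected by name through the existing formatter argument, and the new "html5" name drops the trailing slash on void elements. A short sketch with the stdlib parser:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(u"<p>AT&amp;T<br></p>", "html.parser")
    print(soup.p.decode(formatter="html"))   # <p>AT&amp;T<br/></p>
    print(soup.p.decode(formatter="html5"))  # <p>AT&amp;T<br></p> -- no slash on the void tag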
@@ -131,40 +169,49 @@ class PageElement(object):
     # to methods like encode() and prettify():
     #
     # "html" - All Unicode characters with corresponding HTML entities
     #   are converted to those entities on output.
-    # "minimal" - Bare ampersands and angle brackets are converted to
+    # "html5" - The same as "html", but empty void tags are represented as
+    #   <tag> rather than <tag/>
+    # "minimal" - Bare ampersands and angle brackets are converted to
     #   XML entities: &amp; &lt; &gt;
     # None - The null formatter. Unicode characters are never
     #   converted to entities. This is not recommended, but it's
     #   faster than "minimal".
-    # A function - This function will be called on every string that
+    # A callable function - it will be called on every string that needs to undergo entity substitution.
+    # A Formatter instance - Formatter.substitute(string) will be called on every string that
     #   needs to undergo entity substitution.
     #
-    # In an HTML document, the default "html" and "minimal" functions
-    # will leave the contents of <script> and <style> tags alone. For
-    # an XML document, all tags will be given the same treatment.
+    # In an HTML document, the default "html", "html5", and "minimal"
+    # functions will leave the contents of <script> and <style> tags
+    # alone. For an XML document, all tags will be given the same
+    # treatment.

     HTML_FORMATTERS = {
-        "html" : HTMLAwareEntitySubstitution.substitute_html,
-        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
+        "html" : HTMLFormatter(),
+        "html5" : HTML5Formatter(),
+        "minimal" : MinimalHTMLFormatter(),
         None : None
         }

     XML_FORMATTERS = {
-        "html" : EntitySubstitution.substitute_html,
-        "minimal" : EntitySubstitution.substitute_xml,
+        "html" : HTMLXMLFormatter(),
+        "minimal" : XMLFormatter(),
         None : None
         }

     def format_string(self, s, formatter='minimal'):
         """Format the given string using the given formatter."""
-        if not callable(formatter):
+        if isinstance(formatter, basestring):
             formatter = self._formatter_for_name(formatter)
         if formatter is None:
             output = s
         else:
-            output = formatter(s)
+            if callable(formatter):
+                # Backwards compatibility -- you used to pass in a formatting method.
+                output = formatter(s)
+            else:
+                output = formatter.substitute(s)
         return output

     @property
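
Editorial usage note on format_string() above: a bare callable is still accepted for backwards compatibility, so code that passed a formatting function keeps working alongside the new Formatter objects. A hedged sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(u"<p>one &amp; two</p>", "html.parser")

    def shout(s):
        # Old-style formatter: a plain function applied to each string.
        return s.upper()

    print(soup.p.decode(formatter=shout))  # <p>ONE & TWO</p>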
@@ -194,11 +241,9 @@ class PageElement(object):
     def _formatter_for_name(self, name):
         "Look up a formatter function based on its name and the tree."
         if self._is_xml:
-            return self.XML_FORMATTERS.get(
-                name, EntitySubstitution.substitute_xml)
+            return self.XML_FORMATTERS.get(name, XMLFormatter())
         else:
-            return self.HTML_FORMATTERS.get(
-                name, HTMLAwareEntitySubstitution.substitute_xml)
+            return self.HTML_FORMATTERS.get(name, HTMLFormatter())

     def setup(self, parent=None, previous_element=None, next_element=None,
               previous_sibling=None, next_sibling=None):
@@ -316,6 +361,14 @@ class PageElement(object):
             and not isinstance(new_child, NavigableString)):
             new_child = NavigableString(new_child)

+        from bs4 import BeautifulSoup
+        if isinstance(new_child, BeautifulSoup):
+            # We don't want to end up with a situation where one BeautifulSoup
+            # object contains another. Insert the children one at a time.
+            for subchild in list(new_child.contents):
+                self.insert(position, subchild)
+                position += 1
+            return
         position = min(position, len(self.contents))
         if hasattr(new_child, 'parent') and new_child.parent is not None:
             # We're 'inserting' an element that's already one
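
Editorial usage note on the insert() change above: appending one BeautifulSoup object into another no longer nests a soup inside the tree; its children are spliced in one at a time. A short sketch:

    from bs4 import BeautifulSoup

    doc = BeautifulSoup('<body><div id="target"></div></body>', "html.parser")
    fragment = BeautifulSoup("<p>one</p><p>two</p>", "html.parser")

    doc.find(id="target").append(fragment)
    print(doc.find(id="target"))  # <div id="target"><p>one</p><p>two</p></div>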
@@ -536,14 +589,21 @@ class PageElement(object):
         elif isinstance(name, basestring):
             # Optimization to find all tags with a given name.
             if name.count(':') == 1:
-                # This is a name with a prefix.
-                prefix, name = name.split(':', 1)
+                # This is a name with a prefix. If this is a namespace-aware document,
+                # we need to match the local name against tag.name. If not,
+                # we need to match the fully-qualified name against tag.name.
+                prefix, local_name = name.split(':', 1)
             else:
                 prefix = None
+                local_name = name
             result = (element for element in generator
                       if isinstance(element, Tag)
-                      and element.name == name
-                      and (prefix is None or element.prefix == prefix)
+                      and (
+                          element.name == name
+                      ) or (
+                          element.name == local_name
+                          and (prefix is None or element.prefix == prefix)
+                      )
             )
             return ResultSet(strainer, result)
         results = ResultSet(strainer)
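
Editorial usage note on the prefixed-name matching above: with a namespace-aware builder the prefix is checked against tag.prefix, while with a parser that is not namespace-aware the full "prefix:name" string is simply the tag's name; both forms now match. A sketch using the stdlib parser (not namespace-aware):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<root><media:thumbnail url="x.jpg"/></root>', "html.parser")
    thumb = soup.find("media:thumbnail")
    print(thumb["url"])  # x.jpg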
@@ -862,7 +922,7 @@ class Tag(PageElement):
             self.can_be_empty_element = builder.can_be_empty_element(name)
         else:
             self.can_be_empty_element = False

     parserClass = _alias("parser_class")  # BS3

     def __copy__(self):
@@ -1046,8 +1106,10 @@ class Tag(PageElement):
             # BS3: soup.aTag -> "soup.find("a")
             tag_name = tag[:-3]
             warnings.warn(
-                '.%sTag is deprecated, use .find("%s") instead.' % (
-                    tag_name, tag_name))
+                '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
+                    name=tag_name
+                )
+            )
             return self.find(tag_name)
         # We special case contents to avoid recursion.
         elif not tag.startswith("__") and not tag == "contents":
@@ -1129,11 +1191,10 @@ class Tag(PageElement):
         encoding.
         """

-        # First off, turn a string formatter into a function. This
+        # First off, turn a string formatter into a Formatter object. This
         # will stop the lookup from happening over and over again.
-        if not callable(formatter):
+        if not isinstance(formatter, Formatter) and not callable(formatter):
             formatter = self._formatter_for_name(formatter)

         attrs = []
         if self.attrs:
             for key, val in sorted(self.attrs.items()):
@@ -1162,7 +1223,9 @@ class Tag(PageElement):
             prefix = self.prefix + ":"

         if self.is_empty_element:
-            close = '/'
+            close = ''
+            if isinstance(formatter, Formatter):
+                close = formatter.void_element_close_prefix or close
         else:
             closeTag = '</%s%s>' % (prefix, self.name)
@@ -1233,9 +1296,9 @@ class Tag(PageElement):
         :param formatter: The output formatter responsible for converting
            entities to Unicode characters.
         """
-        # First off, turn a string formatter into a function. This
+        # First off, turn a string formatter into a Formatter object. This
         # will stop the lookup from happening over and over again.
-        if not callable(formatter):
+        if not isinstance(formatter, Formatter) and not callable(formatter):
             formatter = self._formatter_for_name(formatter)

         pretty_print = (indent_level is not None)
@@ -1348,15 +1411,29 @@ class Tag(PageElement):
         # Handle grouping selectors if ',' exists, ie: p,a
         if ',' in selector:
             context = []
-            for partial_selector in selector.split(','):
-                partial_selector = partial_selector.strip()
+            selectors = [x.strip() for x in selector.split(",")]
+
+            # If a selector is mentioned multiple times we don't want
+            # to use it more than once.
+            used_selectors = set()
+
+            # We also don't want to select the same element more than once,
+            # if it's matched by multiple selectors.
+            selected_object_ids = set()
+            for partial_selector in selectors:
                 if partial_selector == '':
                     raise ValueError('Invalid group selection syntax: %s' % selector)
+                if partial_selector in used_selectors:
+                    continue
+                used_selectors.add(partial_selector)
                 candidates = self.select(partial_selector, limit=limit)
                 for candidate in candidates:
-                    if candidate not in context:
+                    # This lets us distinguish between distinct tags that
+                    # represent the same markup.
+                    object_id = id(candidate)
+                    if object_id not in selected_object_ids:
                         context.append(candidate)
+                        selected_object_ids.add(object_id)
                     if limit and len(context) >= limit:
                         break
             return context
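
Editorial usage note on the grouped-selector handling above: duplicate selectors in a group are only evaluated once, and an element matched by more than one selector appears only once in the result. A short sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p class="a">one</p><p>two</p>', "html.parser")
    print(soup.select("p, .a, p"))  # [<p class="a">one</p>, <p>two</p>]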
@@ -1418,7 +1495,7 @@ class Tag(PageElement):
                 if tag_name == '':
                     raise ValueError(
                         "A pseudo-class must be prefixed with a tag name.")
-                pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+                pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
                 found = []
                 if pseudo_attributes is None:
                     pseudo_type = pseudo
@@ -1652,7 +1729,7 @@ class SoupStrainer(object):
             markup = markup_name
             markup_attrs = markup
         call_function_with_tag_data = (
-            isinstance(self.name, collections.Callable)
+            isinstance(self.name, Callable)
             and not isinstance(markup_name, Tag))

         if ((not self.name)
@@ -1732,7 +1809,7 @@ class SoupStrainer(object):
             # True matches any non-None value.
             return markup is not None

-        if isinstance(match_against, collections.Callable):
+        if isinstance(match_against, Callable):
             return match_against(markup)

         # Custom callables take the tag as an argument, but all