diff --git a/CHANGES.md b/CHANGES.md
index b394812a..7cc80da5 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,7 @@
### 0.18.0 (2018-xx-xx xx:xx:xx UTC)
+* Update Beautiful Soup 4.6.0 (r449) to 4.6.3 (r475)
+
[develop changelog]
diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py
index c984ef6e..470177fd 100644
--- a/lib/bs4/__init__.py
+++ b/lib/bs4/__init__.py
@@ -21,14 +21,15 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
# found in the LICENSE file.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.5.3"
-__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
+__version__ = "4.6.3"
+__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import os
import re
+import sys
import traceback
import warnings
@@ -82,14 +83,46 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
- NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
+ NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
- """The Soup object is initialized as the 'root tag', and the
- provided markup (which can be a string or a file-like object)
- is fed into the underlying parser."""
+ """Constructor.
+
+ :param markup: A string or a file-like object representing
+ markup to be parsed.
+
+ :param features: Desirable features of the parser to be used. This
+ may be the name of a specific parser ("lxml", "lxml-xml",
+ "html.parser", or "html5lib") or it may be the type of markup
+ to be used ("html", "html5", "xml"). It's recommended that you
+ name a specific parser, so that Beautiful Soup gives you the
+ same results across platforms and virtual environments.
+
+ :param builder: A specific TreeBuilder to use instead of looking one
+ up based on `features`. You shouldn't need to use this.
+
+ :param parse_only: A SoupStrainer. Only parts of the document
+ matching the SoupStrainer will be considered. This is useful
+ when parsing part of a document that would otherwise be too
+ large to fit into memory.
+
+ :param from_encoding: A string indicating the encoding of the
+ document to be parsed. Pass this in if Beautiful Soup is
+ guessing wrongly about the document's encoding.
+
+ :param exclude_encodings: A list of strings indicating
+ encodings known to be wrong. Pass this in if you don't know
+ the document's encoding but you know Beautiful Soup's guess is
+ wrong.
+
+ :param kwargs: For backwards compatibility purposes, the
+ constructor accepts certain keyword arguments used in
+ Beautiful Soup 3. None of these arguments do anything in
+ Beautiful Soup 4 and there's no need to actually pass keyword
+ arguments into the constructor.
+ """
if 'convertEntities' in kwargs:
warnings.warn(
@@ -171,14 +204,35 @@ class BeautifulSoup(Tag):
else:
markup_type = "HTML"
- caller = traceback.extract_stack()[0]
- filename = caller[0]
- line_number = caller[1]
- warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
- filename=filename,
- line_number=line_number,
- parser=builder.NAME,
- markup_type=markup_type))
+ # This code adapted from warnings.py so that we get the same line
+ # of code as our warnings.warn() call gets, even if the answer is wrong
+ # (as it may be in a multithreading situation).
+ caller = None
+ try:
+ caller = sys._getframe(1)
+ except ValueError:
+ pass
+ if caller:
+ globals = caller.f_globals
+ line_number = caller.f_lineno
+ else:
+ globals = sys.__dict__
+ line_number= 1
+ filename = globals.get('__file__')
+ if filename:
+ fnl = filename.lower()
+ if fnl.endswith((".pyc", ".pyo")):
+ filename = filename[:-1]
+ if filename:
+ # If there is no filename at all, the user is most likely in a REPL,
+ # and the warning is not necessary.
+ values = dict(
+ filename=filename,
+ line_number=line_number,
+ parser=builder.NAME,
+ markup_type=markup_type
+ )
+ warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
self.builder = builder
self.is_xml = builder.is_xml
@@ -302,9 +356,10 @@ class BeautifulSoup(Tag):
self.preserve_whitespace_tag_stack = []
self.pushTag(self)
- def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
+ def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
"""Create a new tag associated with this soup."""
- return Tag(None, self.builder, name, namespace, nsprefix, attrs)
+ kwattrs.update(attrs)
+ return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup."""
diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py
index fdb3362f..c9e3f3d3 100644
--- a/lib/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@@ -93,7 +93,7 @@ class TreeBuilder(object):
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
-
+
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
@@ -125,7 +125,7 @@ class TreeBuilder(object):
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
-
+
def feed(self, markup):
raise NotImplementedError()
@@ -235,11 +235,17 @@ class HTMLTreeBuilder(TreeBuilder):
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
-
- # These are from HTML4, removed in HTML5.
- 'spacer', 'frame'
+
+ # These are from earlier versions of HTML and are removed in HTML5.
+ 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
])
+ # The HTML standard defines these as block-level elements. Beautiful
+ # Soup does not treat these elements differently from other elements,
+ # but it may do so eventually, and this information is available if
+ # you need to use it.
+ block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py
index 641c2ebe..5f548935 100644
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@@ -30,14 +30,13 @@ from bs4.element import (
)
try:
- # 0.99999999 and up
- from html5lib.treebuilders import base as treebuilder_base
- old_html5lib = False
-except ImportError:
# Pre-0.99999999
from html5lib.treebuilders import _base as treebuilder_base
- old_html5lib = True
-
+ new_html5lib = False
+except ImportError, e:
+ # 0.99999999 and up
+ from html5lib.treebuilders import base as treebuilder_base
+ new_html5lib = True
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
@@ -66,7 +65,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
extra_kwargs = dict()
if not isinstance(markup, unicode):
- if not old_html5lib:
+ if new_html5lib:
extra_kwargs['override_encoding'] = self.user_specified_encoding
else:
extra_kwargs['encoding'] = self.user_specified_encoding
diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py
index 67890b3a..ee6c685d 100644
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
# Use of this source code is governed by a BSD-style license that can be
@@ -64,7 +65,18 @@ class BeautifulSoupHTMLParser(HTMLParser):
# order. It's a list of closing tags we've already handled and
# will ignore, assuming they ever show up.
self.already_closed_empty_element = []
-
+
+ def error(self, msg):
+ """In Python 3, HTMLParser subclasses must implement error(), although this
+ requirement doesn't appear to be documented.
+
+ In Python 2, HTMLParser implements error() as raising an exception.
+
+ In any event, this method is called only on very strange markup and our best strategy
+ is to pretend it didn't happen and keep going.
+ """
+ warnings.warn(msg)
+
def handle_startendtag(self, name, attrs):
# This is only called when the markup looks like
# <tag/>.
@@ -129,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
else:
real_name = int(name)
- try:
- data = unichr(real_name)
- except (ValueError, OverflowError), e:
- data = u"\N{REPLACEMENT CHARACTER}"
-
+ data = None
+ if real_name < 256:
+ # HTML numeric entities are supposed to reference Unicode
+ # code points, but sometimes they reference code points in
+ # some other encoding (ahem, Windows-1252). E.g. &#147;
+ # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
+ # code tries to detect this situation and compensate.
+ for encoding in (self.soup.original_encoding, 'windows-1252'):
+ if not encoding:
+ continue
+ try:
+ data = bytearray([real_name]).decode(encoding)
+ except UnicodeDecodeError, e:
+ pass
+ if not data:
+ try:
+ data = unichr(real_name)
+ except (ValueError, OverflowError), e:
+ pass
+ data = data or u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):
@@ -141,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
if character is not None:
data = character
else:
- data = "&%s;" % name
+ # If this were XML, it would be ambiguous whether "&foo"
+ # was a character entity reference with a missing
+ # semicolon or the literal string "&foo". Since this is
+ # HTML, we have a complete list of all character entity references,
+ # and this one wasn't found, so assume it's the literal string "&foo".
+ data = "&%s" % name
self.handle_data(data)
def handle_comment(self, data):
@@ -213,6 +245,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup
try:
parser.feed(markup)
+ parser.close()
except HTMLParseError, e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py
index d2ca2872..4a0f7de4 100644
--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@@ -5,9 +5,13 @@ __all__ = [
'LXMLTreeBuilder',
]
+try:
+ from collections.abc import Callable # Python 3.6
+except ImportError , e:
+ from collections import Callable
+
from io import BytesIO
from StringIO import StringIO
-import collections
from lxml import etree
from bs4.element import (
Comment,
@@ -58,7 +62,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Use the default parser.
parser = self.default_parser(encoding)
- if isinstance(parser, collections.Callable):
+ if isinstance(parser, Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser
@@ -147,11 +151,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
- if len(self.nsmaps) > 1:
- # There are no new namespaces for this tag, but
- # non-default namespaces are in play, so we need a
- # separate tag stack to know when they end.
- self.nsmaps.append(None)
+ if len(nsmap) == 0 and len(self.nsmaps) > 1:
+ # There are no new namespaces for this tag, but
+ # non-default namespaces are in play, so we need a
+ # separate tag stack to know when they end.
+ self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py
index 7965565f..be46b394 100644
--- a/lib/bs4/dammit.py
+++ b/lib/bs4/dammit.py
@@ -46,9 +46,9 @@ except ImportError:
pass
xml_encoding_re = re.compile(
- '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+ '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
html_meta_re = re.compile(
- '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+ '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
@@ -82,7 +82,7 @@ class EntitySubstitution(object):
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
- "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+ "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
")")
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py
index 8768332f..7a28c09a 100644
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@@ -37,7 +37,7 @@ def diagnose(data):
name)
if 'lxml' in basic_parsers:
- basic_parsers.append(["lxml", "xml"])
+ basic_parsers.append("lxml-xml")
try:
from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
@@ -56,21 +56,27 @@ def diagnose(data):
if hasattr(data, 'read'):
data = data.read()
- elif os.path.exists(data):
- print '"%s" looks like a filename. Reading data from the file.' % data
- with open(data) as fp:
- data = fp.read()
elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
return
- print
+ else:
+ try:
+ if os.path.exists(data):
+ print '"%s" looks like a filename. Reading data from the file.' % data
+ with open(data) as fp:
+ data = fp.read()
+ except ValueError:
+ # This can happen on some platforms when the 'filename' is
+ # too long. Assume it's data and not a filename.
+ pass
+ print
for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser
success = False
try:
- soup = BeautifulSoup(data, parser)
+ soup = BeautifulSoup(data, features=parser)
success = True
except Exception, e:
print "%s could not parse the markup." % parser
diff --git a/lib/bs4/element.py b/lib/bs4/element.py
index 9ef75f81..886eb91f 100644
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@@ -2,7 +2,10 @@
# found in the LICENSE file.
__license__ = "MIT"
-import collections
+try:
+ from collections.abc import Callable # Python 3.6
+except ImportError , e:
+ from collections import Callable
import re
import shlex
import sys
@@ -12,7 +15,7 @@ from bs4.dammit import EntitySubstitution
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
-whitespace_re = re.compile("\s+")
+whitespace_re = re.compile(r"\s+")
def _alias(attr):
"""Alias one attribute name to another for backward compatibility"""
@@ -69,7 +72,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
The value of the 'content' attribute will be one of these objects.
"""
- CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+ CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
def __new__(cls, original_value):
match = cls.CHARSET_RE.search(original_value)
@@ -123,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_xml)
+class Formatter(object):
+ """Contains information about how to format a parse tree."""
+
+ # By default, represent void elements as <tag/> rather than <tag>
+ void_element_close_prefix = '/'
+
+ def substitute_entities(self, *args, **kwargs):
+ """Transform certain characters into named entities."""
+ raise NotImplementedError()
+
+class HTMLFormatter(Formatter):
+ """The default HTML formatter."""
+ def substitute(self, *args, **kwargs):
+ return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+class MinimalHTMLFormatter(Formatter):
+ """A minimal HTML formatter."""
+ def substitute(self, *args, **kwargs):
+ return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
+
+class HTML5Formatter(HTMLFormatter):
+ """An HTML formatter that omits the slash in a void tag."""
+ void_element_close_prefix = None
+
+class XMLFormatter(Formatter):
+ """Substitute only the essential XML entities."""
+ def substitute(self, *args, **kwargs):
+ return EntitySubstitution.substitute_xml(*args, **kwargs)
+
+class HTMLXMLFormatter(Formatter):
+ """Format XML using HTML rules."""
+ def substitute(self, *args, **kwargs):
+ return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -131,40 +169,49 @@ class PageElement(object):
# to methods like encode() and prettify():
#
# "html" - All Unicode characters with corresponding HTML entities
- # are converted to those entities on output.
- # "minimal" - Bare ampersands and angle brackets are converted to
+ # are converted to those entities on output.
+ # "html5" - The same as "html", but empty void tags are represented as
+ # <tag> rather than <tag/>
+ # "minimal" - Bare ampersands and angle brackets are converted to
# XML entities: &amp; &lt; &gt;
# None - The null formatter. Unicode characters are never
# converted to entities. This is not recommended, but it's
# faster than "minimal".
- # A function - This function will be called on every string that
+ # A callable function - it will be called on every string that needs to undergo entity substitution.
+ # A Formatter instance - Formatter.substitute(string) will be called on every string that
# needs to undergo entity substitution.
#
- # In an HTML document, the default "html" and "minimal" functions
- # will leave the contents of