mirror of
https://github.com/SickGear/SickGear.git
synced 2024-12-18 08:43:37 +00:00
Merge branch 'feature/UpdateBSoup' into dev
This commit is contained in:
commit
c24d19beda
11 changed files with 819 additions and 296 deletions
|
@ -1,6 +1,7 @@
|
|||
### 3.27.0 (202x-xx-xx xx:xx:00 UTC)
|
||||
|
||||
* Update attr 20.3.0 (f3762ba) to 22.2.0 (a9960de)
|
||||
* Update Beautiful Soup 4.9.3 (r593) to 4.11.1 (r642)
|
||||
* Update cachecontrol 0.12.6 (167a605) to 0.12.11 (c05ef9e)
|
||||
* Add filelock 3.9.0 (ce3e891)
|
||||
* Remove lockfile no longer used by cachecontrol
|
||||
|
|
|
@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
|||
provides methods and Pythonic idioms that make it easy to navigate,
|
||||
search, and modify the parse tree.
|
||||
|
||||
Beautiful Soup works with Python 2.7 and up. It works better if lxml
|
||||
Beautiful Soup works with Python 3.5 and up. It works better if lxml
|
||||
and/or html5lib is installed.
|
||||
|
||||
For more than you ever wanted to know about Beautiful Soup, see the
|
||||
|
@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
|||
"""
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "4.9.3"
|
||||
__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson"
|
||||
__version__ = "4.11.1"
|
||||
__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
|
@ -29,7 +29,16 @@ import sys
|
|||
import traceback
|
||||
import warnings
|
||||
|
||||
from .builder import builder_registry, ParserRejectedMarkup
|
||||
# The very first thing we do is give a useful error if someone is
|
||||
# running this code under Python 2.
|
||||
if sys.version_info.major < 3:
|
||||
raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
|
||||
|
||||
from .builder import (
|
||||
builder_registry,
|
||||
ParserRejectedMarkup,
|
||||
XMLParsedAsHTMLWarning,
|
||||
)
|
||||
from .dammit import UnicodeDammit
|
||||
from .element import (
|
||||
CData,
|
||||
|
@ -49,10 +58,6 @@ from .element import (
|
|||
TemplateString,
|
||||
)
|
||||
|
||||
# The very first thing we do is give a useful error if someone is
|
||||
# running this code under Python 3 without converting it.
|
||||
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||
|
||||
# Define some custom warnings.
|
||||
class GuessedAtParserWarning(UserWarning):
|
||||
"""The warning issued when BeautifulSoup has to guess what parser to
|
||||
|
@ -205,10 +210,10 @@ class BeautifulSoup(Tag):
|
|||
if old_name in kwargs:
|
||||
warnings.warn(
|
||||
'The "%s" argument to the BeautifulSoup constructor '
|
||||
'has been renamed to "%s."' % (old_name, new_name))
|
||||
value = kwargs[old_name]
|
||||
del kwargs[old_name]
|
||||
return value
|
||||
'has been renamed to "%s."' % (old_name, new_name),
|
||||
DeprecationWarning
|
||||
)
|
||||
return kwargs.pop(old_name)
|
||||
return None
|
||||
|
||||
parse_only = parse_only or deprecated_argument(
|
||||
|
@ -303,39 +308,18 @@ class BeautifulSoup(Tag):
|
|||
self._namespaces = dict()
|
||||
self.parse_only = parse_only
|
||||
|
||||
self.builder.initialize_soup(self)
|
||||
|
||||
if hasattr(markup, 'read'): # It's a file-type object.
|
||||
markup = markup.read()
|
||||
elif len(markup) <= 256 and (
|
||||
(isinstance(markup, bytes) and not b'<' in markup)
|
||||
or (isinstance(markup, str) and not '<' in markup)
|
||||
):
|
||||
# Print out warnings for a couple beginner problems
|
||||
# Issue warnings for a couple beginner problems
|
||||
# involving passing non-markup to Beautiful Soup.
|
||||
# Beautiful Soup will still parse the input as markup,
|
||||
# just in case that's what the user really wants.
|
||||
if (isinstance(markup, str)
|
||||
and not os.path.supports_unicode_filenames):
|
||||
possible_filename = markup.encode("utf8")
|
||||
else:
|
||||
possible_filename = markup
|
||||
is_file = False
|
||||
try:
|
||||
is_file = os.path.exists(possible_filename)
|
||||
except Exception as e:
|
||||
# This is almost certainly a problem involving
|
||||
# characters not valid in filenames on this
|
||||
# system. Just let it go.
|
||||
pass
|
||||
if is_file:
|
||||
warnings.warn(
|
||||
'"%s" looks like a filename, not markup. You should'
|
||||
' probably open this file and pass the filehandle into'
|
||||
' Beautiful Soup.' % self._decode_markup(markup),
|
||||
MarkupResemblesLocatorWarning
|
||||
)
|
||||
self._check_markup_is_url(markup)
|
||||
# since that is sometimes the intended behavior.
|
||||
if not self._markup_is_url(markup):
|
||||
self._markup_resembles_filename(markup)
|
||||
|
||||
rejections = []
|
||||
success = False
|
||||
|
@ -344,6 +328,7 @@ class BeautifulSoup(Tag):
|
|||
self.builder.prepare_markup(
|
||||
markup, from_encoding, exclude_encodings=exclude_encodings)):
|
||||
self.reset()
|
||||
self.builder.initialize_soup(self)
|
||||
try:
|
||||
self._feed()
|
||||
success = True
|
||||
|
@ -379,7 +364,7 @@ class BeautifulSoup(Tag):
|
|||
def __getstate__(self):
|
||||
# Frequently a tree builder can't be pickled.
|
||||
d = dict(self.__dict__)
|
||||
if 'builder' in d and not self.builder.picklable:
|
||||
if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
|
||||
d['builder'] = None
|
||||
return d
|
||||
|
||||
|
@ -397,11 +382,13 @@ class BeautifulSoup(Tag):
|
|||
return decoded
|
||||
|
||||
@classmethod
|
||||
def _check_markup_is_url(cls, markup):
|
||||
def _markup_is_url(cls, markup):
|
||||
"""Error-handling method to raise a warning if incoming markup looks
|
||||
like a URL.
|
||||
|
||||
:param markup: A string.
|
||||
:return: Whether or not the markup resembles a URL
|
||||
closely enough to justify a warning.
|
||||
"""
|
||||
if isinstance(markup, bytes):
|
||||
space = b' '
|
||||
|
@ -410,19 +397,49 @@ class BeautifulSoup(Tag):
|
|||
space = ' '
|
||||
cant_start_with = ("http:", "https:")
|
||||
else:
|
||||
return
|
||||
return False
|
||||
|
||||
if any(markup.startswith(prefix) for prefix in cant_start_with):
|
||||
if not space in markup:
|
||||
warnings.warn(
|
||||
'"%s" looks like a URL. Beautiful Soup is not an'
|
||||
' HTTP client. You should probably use an HTTP client like'
|
||||
' requests to get the document behind the URL, and feed'
|
||||
' that document to Beautiful Soup.' % cls._decode_markup(
|
||||
markup
|
||||
),
|
||||
'The input looks more like a URL than markup. You may want to use'
|
||||
' an HTTP client like requests to get the document behind'
|
||||
' the URL, and feed that document to Beautiful Soup.',
|
||||
MarkupResemblesLocatorWarning
|
||||
)
|
||||
return True
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def _markup_resembles_filename(cls, markup):
|
||||
"""Error-handling method to raise a warning if incoming markup
|
||||
resembles a filename.
|
||||
|
||||
:param markup: A bytestring or string.
|
||||
:return: Whether or not the markup resembles a filename
|
||||
closely enough to justify a warning.
|
||||
"""
|
||||
path_characters = '/\\'
|
||||
extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt']
|
||||
if isinstance(markup, bytes):
|
||||
path_characters = path_characters.encode("utf8")
|
||||
extensions = [x.encode('utf8') for x in extensions]
|
||||
filelike = False
|
||||
if any(x in markup for x in path_characters):
|
||||
filelike = True
|
||||
else:
|
||||
lower = markup.lower()
|
||||
if any(lower.endswith(ext) for ext in extensions):
|
||||
filelike = True
|
||||
if filelike:
|
||||
warnings.warn(
|
||||
'The input looks more like a filename than markup. You may'
|
||||
' want to open this file and pass the filehandle into'
|
||||
' Beautiful Soup.',
|
||||
MarkupResemblesLocatorWarning
|
||||
)
|
||||
return True
|
||||
return False
|
||||
|
||||
def _feed(self):
|
||||
"""Internal method that parses previously set markup, creating a large
|
||||
|
@ -485,7 +502,7 @@ class BeautifulSoup(Tag):
|
|||
|
||||
# On top of that, we may be inside a tag that needs a special
|
||||
# container class.
|
||||
if self.string_container_stack:
|
||||
if self.string_container_stack and container is NavigableString:
|
||||
container = self.builder.string_containers.get(
|
||||
self.string_container_stack[-1].name, container
|
||||
)
|
||||
|
@ -542,8 +559,6 @@ class BeautifulSoup(Tag):
|
|||
"""Method called by the TreeBuilder when the end of a data segment
|
||||
occurs.
|
||||
"""
|
||||
containerClass = self.string_container(containerClass)
|
||||
|
||||
if self.current_data:
|
||||
current_data = ''.join(self.current_data)
|
||||
# If whitespace is not preserved, and this string contains
|
||||
|
@ -570,6 +585,7 @@ class BeautifulSoup(Tag):
|
|||
not self.parse_only.search(current_data)):
|
||||
return
|
||||
|
||||
containerClass = self.string_container(containerClass)
|
||||
o = containerClass(current_data)
|
||||
self.object_was_parsed(o)
|
||||
|
||||
|
@ -676,7 +692,7 @@ class BeautifulSoup(Tag):
|
|||
return most_recently_popped
|
||||
|
||||
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
|
||||
sourcepos=None):
|
||||
sourcepos=None, namespaces=None):
|
||||
"""Called by the tree builder when a new tag is encountered.
|
||||
|
||||
:param name: Name of the tag.
|
||||
|
@ -686,6 +702,8 @@ class BeautifulSoup(Tag):
|
|||
source document.
|
||||
:param sourcepos: The character position within `sourceline` where this
|
||||
tag was found.
|
||||
:param namespaces: A dictionary of all namespace prefix mappings
|
||||
currently in scope in the document.
|
||||
|
||||
If this method returns None, the tag was rejected by an active
|
||||
SoupStrainer. You should proceed as if the tag had not occurred
|
||||
|
@ -703,7 +721,8 @@ class BeautifulSoup(Tag):
|
|||
tag = self.element_classes.get(Tag, Tag)(
|
||||
self, self.builder, name, namespace, nsprefix, attrs,
|
||||
self.currentTag, self._most_recent_element,
|
||||
sourceline=sourceline, sourcepos=sourcepos
|
||||
sourceline=sourceline, sourcepos=sourcepos,
|
||||
namespaces=namespaces
|
||||
)
|
||||
if tag is None:
|
||||
return tag
|
||||
|
@ -769,7 +788,9 @@ class BeautifulStoneSoup(BeautifulSoup):
|
|||
kwargs['features'] = 'xml'
|
||||
warnings.warn(
|
||||
'The BeautifulStoneSoup class is deprecated. Instead of using '
|
||||
'it, pass features="xml" into the BeautifulSoup constructor.')
|
||||
'it, pass features="xml" into the BeautifulSoup constructor.',
|
||||
DeprecationWarning
|
||||
)
|
||||
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
|
||||
|
||||
|
||||
|
|
|
@ -3,10 +3,14 @@ __license__ = "MIT"
|
|||
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
import re
|
||||
import warnings
|
||||
import sys
|
||||
from ..element import (
|
||||
CharsetMetaAttributeValue,
|
||||
ContentMetaAttributeValue,
|
||||
RubyParenthesisString,
|
||||
RubyTextString,
|
||||
Stylesheet,
|
||||
Script,
|
||||
TemplateString,
|
||||
|
@ -28,6 +32,12 @@ XML = 'xml'
|
|||
HTML = 'html'
|
||||
HTML_5 = 'html5'
|
||||
|
||||
class XMLParsedAsHTMLWarning(UserWarning):
|
||||
"""The warning issued when an HTML parser is used to parse
|
||||
XML that is not XHTML.
|
||||
"""
|
||||
MESSAGE = """It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor."""
|
||||
|
||||
|
||||
class TreeBuilderRegistry(object):
|
||||
"""A way of looking up TreeBuilder subclasses by their name or by desired
|
||||
|
@ -112,7 +122,7 @@ class TreeBuilder(object):
|
|||
|
||||
# A value for these tag/attribute combinations is a space- or
|
||||
# comma-separated list of CDATA, rather than a single CDATA.
|
||||
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
|
||||
DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list)
|
||||
|
||||
# Whitespace should be preserved inside these tags.
|
||||
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
|
||||
|
@ -234,7 +244,8 @@ class TreeBuilder(object):
|
|||
:param markup: Some markup -- probably a bytestring.
|
||||
:param user_specified_encoding: The user asked to try this encoding.
|
||||
:param document_declared_encoding: The markup itself claims to be
|
||||
in this encoding.
|
||||
in this encoding. NOTE: This argument is not used by the
|
||||
calling code and can probably be removed.
|
||||
:param exclude_encodings: The user asked _not_ to try any of
|
||||
these encodings.
|
||||
|
||||
|
@ -389,17 +400,25 @@ class HTMLTreeBuilder(TreeBuilder):
|
|||
# you need to use it.
|
||||
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
|
||||
|
||||
# The HTML standard defines an unusual content model for these tags.
|
||||
# We represent this by using a string class other than NavigableString
|
||||
# inside these tags.
|
||||
# These HTML tags need special treatment so they can be
|
||||
# represented by a string class other than NavigableString.
|
||||
#
|
||||
# I made this list by going through the HTML spec
|
||||
# For some of these tags, it's because the HTML standard defines
|
||||
# an unusual content model for them. I made this list by going
|
||||
# through the HTML spec
|
||||
# (https://html.spec.whatwg.org/#metadata-content) and looking for
|
||||
# "metadata content" elements that can contain strings.
|
||||
#
|
||||
# The Ruby tags (<rt> and <rp>) are here despite being normal
|
||||
# "phrasing content" tags, because the content they contain is
|
||||
# qualitatively different from other text in the document, and it
|
||||
# can be useful to be able to distinguish it.
|
||||
#
|
||||
# TODO: Arguably <noscript> could go here but it seems
|
||||
# qualitatively different from the other tags.
|
||||
DEFAULT_STRING_CONTAINERS = {
|
||||
'rt' : RubyTextString,
|
||||
'rp' : RubyParenthesisString,
|
||||
'style': Stylesheet,
|
||||
'script': Script,
|
||||
'template': TemplateString,
|
||||
|
@ -474,6 +493,99 @@ class HTMLTreeBuilder(TreeBuilder):
|
|||
|
||||
return (meta_encoding is not None)
|
||||
|
||||
class DetectsXMLParsedAsHTML(object):
|
||||
"""A mixin class for any class (a TreeBuilder, or some class used by a
|
||||
TreeBuilder) that's in a position to detect whether an XML
|
||||
document is being incorrectly parsed as HTML, and issue an
|
||||
appropriate warning.
|
||||
|
||||
This requires being able to observe an incoming processing
|
||||
instruction that might be an XML declaration, and also able to
|
||||
observe tags as they're opened. If you can't do that for a given
|
||||
TreeBuilder, there's a less reliable implementation based on
|
||||
examining the raw markup.
|
||||
"""
|
||||
|
||||
# Regular expression for seeing if markup has an <html> tag.
|
||||
LOOKS_LIKE_HTML = re.compile("<[^ +]html", re.I)
|
||||
LOOKS_LIKE_HTML_B = re.compile(b"<[^ +]html", re.I)
|
||||
|
||||
XML_PREFIX = '<?xml'
|
||||
XML_PREFIX_B = b'<?xml'
|
||||
|
||||
@classmethod
|
||||
def warn_if_markup_looks_like_xml(cls, markup):
|
||||
"""Perform a check on some markup to see if it looks like XML
|
||||
that's not XHTML. If so, issue a warning.
|
||||
|
||||
This is much less reliable than doing the check while parsing,
|
||||
but some of the tree builders can't do that.
|
||||
|
||||
:return: True if the markup looks like non-XHTML XML, False
|
||||
otherwise.
|
||||
"""
|
||||
if isinstance(markup, bytes):
|
||||
prefix = cls.XML_PREFIX_B
|
||||
looks_like_html = cls.LOOKS_LIKE_HTML_B
|
||||
else:
|
||||
prefix = cls.XML_PREFIX
|
||||
looks_like_html = cls.LOOKS_LIKE_HTML
|
||||
|
||||
if (markup is not None
|
||||
and markup.startswith(prefix)
|
||||
and not looks_like_html.search(markup[:500])
|
||||
):
|
||||
cls._warn()
|
||||
return True
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def _warn(cls):
|
||||
"""Issue a warning about XML being parsed as HTML."""
|
||||
warnings.warn(
|
||||
XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning
|
||||
)
|
||||
|
||||
def _initialize_xml_detector(self):
|
||||
"""Call this method before parsing a document."""
|
||||
self._first_processing_instruction = None
|
||||
self._root_tag = None
|
||||
|
||||
def _document_might_be_xml(self, processing_instruction):
|
||||
"""Call this method when encountering an XML declaration, or a
|
||||
"processing instruction" that might be an XML declaration.
|
||||
"""
|
||||
if (self._first_processing_instruction is not None
|
||||
or self._root_tag is not None):
|
||||
# The document has already started. Don't bother checking
|
||||
# anymore.
|
||||
return
|
||||
|
||||
self._first_processing_instruction = processing_instruction
|
||||
|
||||
# We won't know until we encounter the first tag whether or
|
||||
# not this is actually a problem.
|
||||
|
||||
def _root_tag_encountered(self, name):
|
||||
"""Call this when you encounter the document's root tag.
|
||||
|
||||
This is where we actually check whether an XML document is
|
||||
being incorrectly parsed as HTML, and issue the warning.
|
||||
"""
|
||||
if self._root_tag is not None:
|
||||
# This method was incorrectly called multiple times. Do
|
||||
# nothing.
|
||||
return
|
||||
|
||||
self._root_tag = name
|
||||
if (name != 'html' and self._first_processing_instruction is not None
|
||||
and self._first_processing_instruction.lower().startswith('xml ')):
|
||||
# We encountered an XML declaration and then a tag other
|
||||
# than 'html'. This is a reliable indicator that a
|
||||
# non-XHTML XML document is being parsed as HTML.
|
||||
self._warn()
|
||||
|
||||
|
||||
def register_treebuilders_from(module):
|
||||
"""Copy TreeBuilders from the given module into this module."""
|
||||
this_module = sys.modules[__name__]
|
||||
|
|
|
@ -8,6 +8,7 @@ __all__ = [
|
|||
import warnings
|
||||
import re
|
||||
from ..builder import (
|
||||
DetectsXMLParsedAsHTML,
|
||||
PERMISSIVE,
|
||||
HTML,
|
||||
HTML_5,
|
||||
|
@ -70,6 +71,11 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
|||
# UnicodeDammit.
|
||||
if exclude_encodings:
|
||||
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
|
||||
|
||||
# html5lib only parses HTML, so if it's given XML that's worth
|
||||
# noting.
|
||||
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
|
||||
|
||||
yield (markup, None, None, False)
|
||||
|
||||
# These methods are defined by Beautiful Soup.
|
||||
|
@ -242,10 +248,10 @@ class AttrList(object):
|
|||
def __setitem__(self, name, value):
|
||||
# If this attribute is a multi-valued attribute for this element,
|
||||
# turn its value into a list.
|
||||
list_attr = self.element.cdata_list_attributes
|
||||
if (name in list_attr['*']
|
||||
list_attr = self.element.cdata_list_attributes or {}
|
||||
if (name in list_attr.get('*', [])
|
||||
or (self.element.name in list_attr
|
||||
and name in list_attr[self.element.name])):
|
||||
and name in list_attr.get(self.element.name, []))):
|
||||
# A node that is being cloned may have already undergone
|
||||
# this procedure.
|
||||
if not isinstance(value, list):
|
||||
|
|
|
@ -44,6 +44,7 @@ from ..element import (
|
|||
from ..dammit import EntitySubstitution, UnicodeDammit
|
||||
|
||||
from ..builder import (
|
||||
DetectsXMLParsedAsHTML,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
STRICT,
|
||||
|
@ -52,7 +53,7 @@ from ..builder import (
|
|||
|
||||
HTMLPARSER = 'html.parser'
|
||||
|
||||
class BeautifulSoupHTMLParser(HTMLParser):
|
||||
class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
|
||||
"""A subclass of the Python standard library's HTMLParser class, which
|
||||
listens for HTMLParser events and translates them into calls
|
||||
to Beautiful Soup's tree construction API.
|
||||
|
@ -88,6 +89,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
|||
# will ignore, assuming they ever show up.
|
||||
self.already_closed_empty_element = []
|
||||
|
||||
self._initialize_xml_detector()
|
||||
|
||||
def error(self, msg):
|
||||
"""In Python 3, HTMLParser subclasses must implement error(), although
|
||||
this requirement doesn't appear to be documented.
|
||||
|
@ -168,6 +171,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
|||
# later on. If so, we want to ignore it.
|
||||
self.already_closed_empty_element.append(name)
|
||||
|
||||
if self._root_tag is None:
|
||||
self._root_tag_encountered(name)
|
||||
|
||||
def handle_endtag(self, name, check_already_closed=True):
|
||||
"""Handle a closing tag, e.g. '</tag>'
|
||||
|
||||
|
@ -231,7 +237,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
|||
|
||||
def handle_entityref(self, name):
|
||||
"""Handle a named entity reference by converting it to the
|
||||
corresponding Unicode character and treating it as textual
|
||||
corresponding Unicode character(s) and treating it as textual
|
||||
data.
|
||||
|
||||
:param name: Name of the entity reference.
|
||||
|
@ -288,6 +294,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
|||
"""
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self._document_might_be_xml(data)
|
||||
self.soup.endData(ProcessingInstruction)
|
||||
|
||||
|
||||
|
@ -359,9 +366,24 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
|||
return
|
||||
|
||||
# Ask UnicodeDammit to sniff the most likely encoding.
|
||||
|
||||
# This was provided by the end-user; treat it as a known
|
||||
# definite encoding per the algorithm laid out in the HTML5
|
||||
# spec. (See the EncodingDetector class for details.)
|
||||
known_definite_encodings = [user_specified_encoding]
|
||||
|
||||
# This was found in the document; treat it as a slightly lower-priority
|
||||
# user encoding.
|
||||
user_encodings = [document_declared_encoding]
|
||||
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
|
||||
exclude_encodings=exclude_encodings)
|
||||
dammit = UnicodeDammit(
|
||||
markup,
|
||||
known_definite_encodings=known_definite_encodings,
|
||||
user_encodings=user_encodings,
|
||||
is_html=True,
|
||||
exclude_encodings=exclude_encodings
|
||||
)
|
||||
yield (dammit.markup, dammit.original_encoding,
|
||||
dammit.declared_html_encoding,
|
||||
dammit.contains_replacement_characters)
|
||||
|
|
|
@ -22,6 +22,7 @@ from ..element import (
|
|||
XMLProcessingInstruction,
|
||||
)
|
||||
from ..builder import (
|
||||
DetectsXMLParsedAsHTML,
|
||||
FAST,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
|
@ -79,9 +80,18 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
|
||||
This might be useful later on when creating CSS selectors.
|
||||
|
||||
This will track (almost) all namespaces, even ones that were
|
||||
only in scope for part of the document. If two namespaces have
|
||||
the same prefix, only the first one encountered will be
|
||||
tracked. Un-prefixed namespaces are not tracked.
|
||||
|
||||
:param mapping: A dictionary mapping namespace prefixes to URIs.
|
||||
"""
|
||||
for key, value in list(mapping.items()):
|
||||
# This is 'if key' and not 'if key is not None' because we
|
||||
# don't track un-prefixed namespaces. Soupsieve will
|
||||
# treat an un-prefixed namespace as the default, which
|
||||
# causes confusion in some cases.
|
||||
if key and key not in self.soup._namespaces:
|
||||
# Let the BeautifulSoup object know about a new namespace.
|
||||
# If there are multiple namespaces defined with the same
|
||||
|
@ -125,6 +135,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
self.empty_element_tags = set(empty_element_tags)
|
||||
self.soup = None
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||
self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
|
||||
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
|
||||
|
||||
def _getNsTag(self, tag):
|
||||
|
@ -166,12 +177,21 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
is_html = not self.is_xml
|
||||
if is_html:
|
||||
self.processing_instruction_class = ProcessingInstruction
|
||||
# We're in HTML mode, so if we're given XML, that's worth
|
||||
# noting.
|
||||
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
|
||||
else:
|
||||
self.processing_instruction_class = XMLProcessingInstruction
|
||||
|
||||
if isinstance(markup, str):
|
||||
# We were given Unicode. Maybe lxml can parse Unicode on
|
||||
# this system?
|
||||
|
||||
# TODO: This is a workaround for
|
||||
# https://bugs.launchpad.net/lxml/+bug/1948551.
|
||||
# We can remove it once the upstream issue is fixed.
|
||||
if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
|
||||
markup = markup[1:]
|
||||
yield markup, None, document_declared_encoding, False
|
||||
|
||||
if isinstance(markup, str):
|
||||
|
@ -180,9 +200,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
yield (markup.encode("utf8"), "utf8",
|
||||
document_declared_encoding, False)
|
||||
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
# This was provided by the end-user; treat it as a known
|
||||
# definite encoding per the algorithm laid out in the HTML5
|
||||
# spec. (See the EncodingDetector class for details.)
|
||||
known_definite_encodings = [user_specified_encoding]
|
||||
|
||||
# This was found in the document; treat it as a slightly lower-priority
|
||||
# user encoding.
|
||||
user_encodings = [document_declared_encoding]
|
||||
detector = EncodingDetector(
|
||||
markup, try_encodings, is_html, exclude_encodings)
|
||||
markup, known_definite_encodings=known_definite_encodings,
|
||||
user_encodings=user_encodings, is_html=is_html,
|
||||
exclude_encodings=exclude_encodings
|
||||
)
|
||||
for encoding in detector.encodings:
|
||||
yield (detector.markup, encoding, document_declared_encoding, False)
|
||||
|
||||
|
@ -230,6 +260,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
# mappings.
|
||||
self.nsmaps.append(_invert(nsmap))
|
||||
|
||||
# The currently active namespace prefixes have
|
||||
# changed. Calculate the new mapping so it can be stored
|
||||
# with all Tag objects created while these prefixes are in
|
||||
# scope.
|
||||
current_mapping = dict(self.active_namespace_prefixes[-1])
|
||||
current_mapping.update(nsmap)
|
||||
|
||||
# We should not track un-prefixed namespaces as we can only hold one
|
||||
# and it will be recognized as the default namespace by soupsieve,
|
||||
# which may be confusing in some situations.
|
||||
if '' in current_mapping:
|
||||
del current_mapping['']
|
||||
self.active_namespace_prefixes.append(current_mapping)
|
||||
|
||||
# Also treat the namespace mapping as a set of attributes on the
|
||||
# tag, so we can recreate it later.
|
||||
attrs = attrs.copy()
|
||||
|
@ -254,7 +298,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
|
||||
self.soup.handle_starttag(
|
||||
name, namespace, nsprefix, attrs,
|
||||
namespaces=self.active_namespace_prefixes[-1]
|
||||
)
|
||||
|
||||
def _prefix_for_namespace(self, namespace):
|
||||
"""Find the currently active prefix for the given namespace."""
|
||||
|
@ -279,11 +326,18 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
if len(self.nsmaps) > 1:
|
||||
# This tag, or one of its parents, introduced a namespace
|
||||
# mapping, so pop it off the stack.
|
||||
self.nsmaps.pop()
|
||||
out_of_scope_nsmap = self.nsmaps.pop()
|
||||
|
||||
if out_of_scope_nsmap is not None:
|
||||
# This tag introduced a namespace mapping which is no
|
||||
# longer in scope. Recalculate the currently active
|
||||
# namespace prefixes.
|
||||
self.active_namespace_prefixes.pop()
|
||||
|
||||
def pi(self, target, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(target + ' ' + data)
|
||||
data = target + ' ' + data
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(self.processing_instruction_class)
|
||||
|
||||
def data(self, content):
|
||||
|
|
|
@ -1,4 +0,0 @@
|
|||
import requests
|
||||
data = requests.get("https://www.crummy.com/").content
|
||||
from . import _s
|
||||
data = [x for x in _s(data).block_text()]
|
|
@ -9,48 +9,45 @@ XML or HTML to reflect a new encoding; that's the tree builder's job.
|
|||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
import codecs
|
||||
from html.entities import codepoint2name
|
||||
from collections import defaultdict
|
||||
import codecs
|
||||
import re
|
||||
import logging
|
||||
import string
|
||||
|
||||
# Import a library to autodetect character encodings.
|
||||
chardet_type = None
|
||||
# Import a library to autodetect character encodings. We'll support
|
||||
# any of a number of libraries that all support the same API:
|
||||
#
|
||||
# * cchardet
|
||||
# * chardet
|
||||
# * charset-normalizer
|
||||
chardet_module = None
|
||||
try:
|
||||
# First try the fast C implementation.
|
||||
# PyPI package: cchardet
|
||||
import cchardet
|
||||
def chardet_dammit(s):
|
||||
if isinstance(s, str):
|
||||
return None
|
||||
return cchardet.detect(s)['encoding']
|
||||
import cchardet as chardet_module
|
||||
except ImportError:
|
||||
try:
|
||||
# Fall back to the pure Python implementation
|
||||
# Debian package: python-chardet
|
||||
# PyPI package: chardet
|
||||
import chardet
|
||||
import chardet as chardet_module
|
||||
except ImportError:
|
||||
try:
|
||||
# PyPI package: charset-normalizer
|
||||
import charset_normalizer as chardet_module
|
||||
except ImportError:
|
||||
# No chardet available.
|
||||
chardet_module = None
|
||||
|
||||
if chardet_module:
|
||||
def chardet_dammit(s):
|
||||
if isinstance(s, str):
|
||||
return None
|
||||
return chardet.detect(s)['encoding']
|
||||
#import chardet.constants
|
||||
#chardet.constants._debug = 1
|
||||
except ImportError:
|
||||
# No chardet available.
|
||||
return chardet_module.detect(s)['encoding']
|
||||
else:
|
||||
def chardet_dammit(s):
|
||||
return None
|
||||
|
||||
# Available from http://cjkpython.i18n.org/.
|
||||
#
|
||||
# TODO: This doesn't work anymore and the closest thing, iconv_codecs,
|
||||
# is GPL-licensed. Check whether this is still necessary.
|
||||
try:
|
||||
import iconv_codec
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Build bytestring and Unicode versions of regular expressions for finding
|
||||
# a declared encoding inside an XML or HTML document.
|
||||
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
|
||||
|
@ -65,34 +62,129 @@ encoding_res[str] = {
|
|||
'xml' : re.compile(xml_encoding, re.I)
|
||||
}
|
||||
|
||||
from html.entities import html5
|
||||
|
||||
class EntitySubstitution(object):
|
||||
"""The ability to substitute XML or HTML entities for certain characters."""
|
||||
|
||||
def _populate_class_variables():
|
||||
lookup = {}
|
||||
reverse_lookup = {}
|
||||
characters_for_re = []
|
||||
"""Initialize variables used by this class to manage the plethora of
|
||||
HTML5 named entities.
|
||||
|
||||
# &apos is an XHTML entity and an HTML 5, but not an HTML 4
|
||||
# entity. We don't want to use it, but we want to recognize it on the way in.
|
||||
This function returns a 3-tuple containing two dictionaries
|
||||
and a regular expression:
|
||||
|
||||
unicode_to_name - A mapping of Unicode strings like "⦨" to
|
||||
entity names like "angmsdaa". When a single Unicode string has
|
||||
multiple entity names, we try to choose the most commonly-used
|
||||
name.
|
||||
|
||||
name_to_unicode: A mapping of entity names like "angmsdaa" to
|
||||
Unicode strings like "⦨".
|
||||
|
||||
named_entity_re: A regular expression matching (almost) any
|
||||
Unicode string that corresponds to an HTML5 named entity.
|
||||
"""
|
||||
unicode_to_name = {}
|
||||
name_to_unicode = {}
|
||||
|
||||
short_entities = set()
|
||||
long_entities_by_first_character = defaultdict(set)
|
||||
|
||||
for name_with_semicolon, character in sorted(html5.items()):
|
||||
# "It is intentional, for legacy compatibility, that many
|
||||
# code points have multiple character reference names. For
|
||||
# example, some appear both with and without the trailing
|
||||
# semicolon, or with different capitalizations."
|
||||
# - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
|
||||
#
|
||||
# TODO: Ideally we would be able to recognize all HTML 5 named
|
||||
# entities, but that's a little tricky.
|
||||
extra = [(39, 'apos')]
|
||||
for codepoint, name in list(codepoint2name.items()) + extra:
|
||||
# The parsers are in charge of handling (or not) character
|
||||
# references with no trailing semicolon, so we remove the
|
||||
# semicolon whenever it appears.
|
||||
if name_with_semicolon.endswith(';'):
|
||||
name = name_with_semicolon[:-1]
|
||||
else:
|
||||
name = name_with_semicolon
|
||||
|
||||
# When parsing HTML, we want to recognize any known named
|
||||
# entity and convert it to a sequence of Unicode
|
||||
# characters.
|
||||
if name not in name_to_unicode:
|
||||
name_to_unicode[name] = character
|
||||
|
||||
# When _generating_ HTML, we want to recognize special
|
||||
# character sequences that _could_ be converted to named
|
||||
# entities.
|
||||
unicode_to_name[character] = name
|
||||
|
||||
# We also need to build a regular expression that lets us
|
||||
# _find_ those characters in output strings so we can
|
||||
# replace them.
|
||||
#
|
||||
# This is tricky, for two reasons.
|
||||
|
||||
if (len(character) == 1 and ord(character) < 128
|
||||
and character not in '<>&'):
|
||||
# First, it would be annoying to turn single ASCII
|
||||
# characters like | into named entities like
|
||||
# |. The exceptions are <>&, which we _must_
|
||||
# turn into named entities to produce valid HTML.
|
||||
continue
|
||||
|
||||
if len(character) > 1 and all(ord(x) < 128 for x in character):
|
||||
# We also do not want to turn _combinations_ of ASCII
|
||||
# characters like 'fj' into named entities like 'fj',
|
||||
# though that's more debateable.
|
||||
continue
|
||||
|
||||
# Second, some named entities have a Unicode value that's
|
||||
# a subset of the Unicode value for some _other_ named
|
||||
# entity. As an example, \u2267' is ≧,
|
||||
# but '\u2267\u0338' is ≧̸. Our regular
|
||||
# expression needs to match the first two characters of
|
||||
# "\u2267\u0338foo", but only the first character of
|
||||
# "\u2267foo".
|
||||
#
|
||||
# In this step, we build two sets of characters that
|
||||
# _eventually_ need to go into the regular expression. But
|
||||
# we won't know exactly what the regular expression needs
|
||||
# to look like until we've gone through the entire list of
|
||||
# named entities.
|
||||
if len(character) == 1:
|
||||
short_entities.add(character)
|
||||
else:
|
||||
long_entities_by_first_character[character[0]].add(character)
|
||||
|
||||
# Now that we've been through the entire list of entities, we
|
||||
# can create a regular expression that matches any of them.
|
||||
particles = set()
|
||||
for short in short_entities:
|
||||
long_versions = long_entities_by_first_character[short]
|
||||
if not long_versions:
|
||||
particles.add(short)
|
||||
else:
|
||||
ignore = "".join([x[1] for x in long_versions])
|
||||
# This finds, e.g. \u2267 but only if it is _not_
|
||||
# followed by \u0338.
|
||||
particles.add("%s(?![%s])" % (short, ignore))
|
||||
|
||||
for long_entities in list(long_entities_by_first_character.values()):
|
||||
for long_entity in long_entities:
|
||||
particles.add(long_entity)
|
||||
|
||||
re_definition = "(%s)" % "|".join(particles)
|
||||
|
||||
# If an entity shows up in both html5 and codepoint2name, it's
|
||||
# likely that HTML5 gives it several different names, such as
|
||||
# 'rsquo' and 'rsquor'. When converting Unicode characters to
|
||||
# named entities, the codepoint2name name should take
|
||||
# precedence where possible, since that's the more easily
|
||||
# recognizable one.
|
||||
for codepoint, name in list(codepoint2name.items()):
|
||||
character = chr(codepoint)
|
||||
if codepoint not in (34, 39):
|
||||
# There's no point in turning the quotation mark into
|
||||
# " or the single quote into ', unless it
|
||||
# happens within an attribute value, which is handled
|
||||
# elsewhere.
|
||||
characters_for_re.append(character)
|
||||
lookup[character] = name
|
||||
# But we do want to recognize those entities on the way in and
|
||||
# convert them to Unicode characters.
|
||||
reverse_lookup[name] = character
|
||||
re_definition = "[%s]" % "".join(characters_for_re)
|
||||
return lookup, reverse_lookup, re.compile(re_definition)
|
||||
unicode_to_name[character] = name
|
||||
|
||||
return unicode_to_name, name_to_unicode, re.compile(re_definition)
|
||||
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
|
||||
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
|
||||
|
||||
|
@ -113,14 +205,14 @@ class EntitySubstitution(object):
|
|||
@classmethod
|
||||
def _substitute_html_entity(cls, matchobj):
|
||||
"""Used with a regular expression to substitute the
|
||||
appropriate HTML entity for a special character."""
|
||||
appropriate HTML entity for a special character string."""
|
||||
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
|
||||
return "&%s;" % entity
|
||||
|
||||
@classmethod
|
||||
def _substitute_xml_entity(cls, matchobj):
|
||||
"""Used with a regular expression to substitute the
|
||||
appropriate XML entity for a special character."""
|
||||
appropriate XML entity for a special character string."""
|
||||
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
|
||||
return "&%s;" % entity
|
||||
|
||||
|
@ -228,32 +320,65 @@ class EncodingDetector:
|
|||
Order of precedence:
|
||||
|
||||
1. Encodings you specifically tell EncodingDetector to try first
|
||||
(the override_encodings argument to the constructor).
|
||||
(the known_definite_encodings argument to the constructor).
|
||||
|
||||
2. An encoding declared within the bytestring itself, either in an
|
||||
2. An encoding determined by sniffing the document's byte-order mark.
|
||||
|
||||
3. Encodings you specifically tell EncodingDetector to try if
|
||||
byte-order mark sniffing fails (the user_encodings argument to the
|
||||
constructor).
|
||||
|
||||
4. An encoding declared within the bytestring itself, either in an
|
||||
XML declaration (if the bytestring is to be interpreted as an XML
|
||||
document), or in a <meta> tag (if the bytestring is to be
|
||||
interpreted as an HTML document.)
|
||||
|
||||
3. An encoding detected through textual analysis by chardet,
|
||||
5. An encoding detected through textual analysis by chardet,
|
||||
cchardet, or a similar external library.
|
||||
|
||||
6. UTF-8.
|
||||
|
||||
7. Windows-1252.
|
||||
|
||||
"""
|
||||
def __init__(self, markup, override_encodings=None, is_html=False,
|
||||
exclude_encodings=None):
|
||||
def __init__(self, markup, known_definite_encodings=None,
|
||||
is_html=False, exclude_encodings=None,
|
||||
user_encodings=None, override_encodings=None):
|
||||
"""Constructor.
|
||||
|
||||
:param markup: Some markup in an unknown encoding.
|
||||
:param override_encodings: These encodings will be tried first.
|
||||
:param is_html: If True, this markup is considered to be HTML. Otherwise
|
||||
it's assumed to be XML.
|
||||
:param exclude_encodings: These encodings will not be tried, even
|
||||
if they otherwise would be.
|
||||
|
||||
:param known_definite_encodings: When determining the encoding
|
||||
of `markup`, these encodings will be tried first, in
|
||||
order. In HTML terms, this corresponds to the "known
|
||||
definite encoding" step defined here:
|
||||
https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
|
||||
|
||||
:param user_encodings: These encodings will be tried after the
|
||||
`known_definite_encodings` have been tried and failed, and
|
||||
after an attempt to sniff the encoding by looking at a
|
||||
byte order mark has failed. In HTML terms, this
|
||||
corresponds to the step "user has explicitly instructed
|
||||
the user agent to override the document's character
|
||||
encoding", defined here:
|
||||
https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||
|
||||
:param override_encodings: A deprecated alias for
|
||||
known_definite_encodings. Any encodings here will be tried
|
||||
immediately after the encodings in
|
||||
known_definite_encodings.
|
||||
|
||||
:param is_html: If True, this markup is considered to be
|
||||
HTML. Otherwise it's assumed to be XML.
|
||||
|
||||
:param exclude_encodings: These encodings will not be tried,
|
||||
even if they otherwise would be.
|
||||
|
||||
"""
|
||||
self.override_encodings = override_encodings or []
|
||||
self.known_definite_encodings = list(known_definite_encodings or [])
|
||||
if override_encodings:
|
||||
self.known_definite_encodings += override_encodings
|
||||
self.user_encodings = user_encodings or []
|
||||
exclude_encodings = exclude_encodings or []
|
||||
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
|
||||
self.chardet_encoding = None
|
||||
|
@ -286,7 +411,9 @@ class EncodingDetector:
|
|||
:yield: A sequence of strings.
|
||||
"""
|
||||
tried = set()
|
||||
for e in self.override_encodings:
|
||||
|
||||
# First, try the known definite encodings
|
||||
for e in self.known_definite_encodings:
|
||||
if self._usable(e, tried):
|
||||
yield e
|
||||
|
||||
|
@ -295,6 +422,12 @@ class EncodingDetector:
|
|||
if self._usable(self.sniffed_encoding, tried):
|
||||
yield self.sniffed_encoding
|
||||
|
||||
# Sniffing the byte-order mark did nothing; try the user
|
||||
# encodings.
|
||||
for e in self.user_encodings:
|
||||
if self._usable(e, tried):
|
||||
yield e
|
||||
|
||||
# Look within the document for an XML or HTML encoding
|
||||
# declaration.
|
||||
if self.declared_encoding is None:
|
||||
|
@ -405,13 +538,33 @@ class UnicodeDammit:
|
|||
"iso-8859-2",
|
||||
]
|
||||
|
||||
def __init__(self, markup, override_encodings=[],
|
||||
smart_quotes_to=None, is_html=False, exclude_encodings=[]):
|
||||
def __init__(self, markup, known_definite_encodings=[],
|
||||
smart_quotes_to=None, is_html=False, exclude_encodings=[],
|
||||
user_encodings=None, override_encodings=None
|
||||
):
|
||||
"""Constructor.
|
||||
|
||||
:param markup: A bytestring representing markup in an unknown encoding.
|
||||
:param override_encodings: These encodings will be tried first,
|
||||
before any sniffing code is run.
|
||||
|
||||
:param known_definite_encodings: When determining the encoding
|
||||
of `markup`, these encodings will be tried first, in
|
||||
order. In HTML terms, this corresponds to the "known
|
||||
definite encoding" step defined here:
|
||||
https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
|
||||
|
||||
:param user_encodings: These encodings will be tried after the
|
||||
`known_definite_encodings` have been tried and failed, and
|
||||
after an attempt to sniff the encoding by looking at a
|
||||
byte order mark has failed. In HTML terms, this
|
||||
corresponds to the step "user has explicitly instructed
|
||||
the user agent to override the document's character
|
||||
encoding", defined here:
|
||||
https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||
|
||||
:param override_encodings: A deprecated alias for
|
||||
known_definite_encodings. Any encodings here will be tried
|
||||
immediately after the encodings in
|
||||
known_definite_encodings.
|
||||
|
||||
:param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
|
||||
to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
|
||||
|
@ -421,6 +574,7 @@ class UnicodeDammit:
|
|||
it's assumed to be XML.
|
||||
:param exclude_encodings: These encodings will not be considered, even
|
||||
if the sniffing code thinks they might make sense.
|
||||
|
||||
"""
|
||||
self.smart_quotes_to = smart_quotes_to
|
||||
self.tried_encodings = []
|
||||
|
@ -428,7 +582,9 @@ class UnicodeDammit:
|
|||
self.is_html = is_html
|
||||
self.log = logging.getLogger(__name__)
|
||||
self.detector = EncodingDetector(
|
||||
markup, override_encodings, is_html, exclude_encodings)
|
||||
markup, known_definite_encodings, is_html, exclude_encodings,
|
||||
user_encodings, override_encodings
|
||||
)
|
||||
|
||||
# Short-circuit if the data is in Unicode to begin with.
|
||||
if isinstance(markup, str) or markup == '':
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
__license__ = "MIT"
|
||||
|
||||
import cProfile
|
||||
from io import StringIO
|
||||
from io import BytesIO
|
||||
from html.parser import HTMLParser
|
||||
import bs4
|
||||
from . import BeautifulSoup, __version__
|
||||
|
@ -103,7 +103,13 @@ def lxml_trace(data, html=True, **kwargs):
|
|||
if False, lxml's XML parser will be used.
|
||||
"""
|
||||
from lxml import etree
|
||||
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
||||
recover = kwargs.pop('recover', True)
|
||||
if isinstance(data, str):
|
||||
data = data.encode("utf8")
|
||||
reader = BytesIO(data)
|
||||
for event, element in etree.iterparse(
|
||||
reader, html=html, recover=recover, **kwargs
|
||||
):
|
||||
print(("%s, %4s, %s" % (event, element.tag, element.text)))
|
||||
|
||||
class AnnouncingParser(HTMLParser):
|
||||
|
|
|
@ -23,7 +23,6 @@ from .formatter import (
|
|||
)
|
||||
|
||||
DEFAULT_OUTPUT_ENCODING = "utf-8"
|
||||
PY3K = (sys.version_info[0] > 2)
|
||||
|
||||
nonwhitespace_re = re.compile(r"\S+")
|
||||
|
||||
|
@ -83,9 +82,9 @@ class NamespacedAttribute(str):
|
|||
# per https://www.w3.org/TR/xml-names/#defaulting
|
||||
name = None
|
||||
|
||||
if name is None:
|
||||
if not name:
|
||||
obj = str.__new__(cls, prefix)
|
||||
elif prefix is None:
|
||||
elif not prefix:
|
||||
# Not really namespaced.
|
||||
obj = str.__new__(cls, name)
|
||||
else:
|
||||
|
@ -255,25 +254,67 @@ class PageElement(object):
|
|||
nextSibling = _alias("next_sibling") # BS3
|
||||
previousSibling = _alias("previous_sibling") # BS3
|
||||
|
||||
def replace_with(self, replace_with):
|
||||
"""Replace this PageElement with another one, keeping the rest of the
|
||||
tree the same.
|
||||
default = object()
|
||||
def _all_strings(self, strip=False, types=default):
|
||||
"""Yield all strings of certain classes, possibly stripping them.
|
||||
|
||||
:param replace_with: A PageElement.
|
||||
This is implemented differently in Tag and NavigableString.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
def stripped_strings(self):
    """Yield all strings in this PageElement, stripping them first.

    Delegates to `_all_strings` with stripping turned on.

    :yield: A sequence of stripped strings.
    """
    for string in self._all_strings(True):
        yield string
|
||||
|
||||
def get_text(self, separator="", strip=False,
             types=default):
    """Get all child strings of this PageElement, concatenated using the
    given separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.

    :return: A string.
    """
    return separator.join(self._all_strings(strip, types=types))
getText = get_text
text = property(get_text)
|
||||
|
||||
def replace_with(self, *args):
    """Replace this PageElement with one or more PageElements, keeping the
    rest of the tree the same.

    :param args: One or more PageElements.
    :return: `self`, no longer part of the tree. Returns None, leaving
        the tree untouched, when the only replacement is `self`.
    :raises ValueError: If this element has no parent, or if one of the
        replacements is this element's parent.
    """
    if self.parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree.")
    if len(args) == 1 and args[0] is self:
        return
    if any(x is self.parent for x in args):
        raise ValueError("Cannot replace a Tag with its parent.")
    # Remember the parent and position before extract() detaches us.
    old_parent = self.parent
    my_index = self.parent.index(self)
    self.extract(_self_index=my_index)
    # Insert the replacements at consecutive positions so they keep
    # their argument order in the tree.
    for idx, replacement in enumerate(args, start=my_index):
        old_parent.insert(idx, replacement)
    return self
replaceWith = replace_with  # BS3
|
||||
|
||||
|
@ -513,7 +554,7 @@ class PageElement(object):
|
|||
parent.insert(index+1+offset, successor)
|
||||
offset += 1
|
||||
|
||||
def find_next(self, name=None, attrs={}, text=None, **kwargs):
|
||||
def find_next(self, name=None, attrs={}, string=None, **kwargs):
|
||||
"""Find the first PageElement that matches the given criteria and
|
||||
appears later in the document than this PageElement.
|
||||
|
||||
|
@ -522,15 +563,15 @@ class PageElement(object):
|
|||
|
||||
:param name: A filter on tag name.
|
||||
:param attrs: A dictionary of filters on attribute values.
|
||||
:param text: A filter for a NavigableString with specific text.
|
||||
:param string: A filter for a NavigableString with specific text.
|
||||
:kwargs: A dictionary of filters on attribute values.
|
||||
:return: A PageElement.
|
||||
:rtype: bs4.element.Tag | bs4.element.NavigableString
|
||||
"""
|
||||
return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
|
||||
return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
|
||||
findNext = find_next # BS3
|
||||
|
||||
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
|
||||
def find_all_next(self, name=None, attrs={}, string=None, limit=None,
|
||||
**kwargs):
|
||||
"""Find all PageElements that match the given criteria and appear
|
||||
later in the document than this PageElement.
|
||||
|
@ -540,16 +581,16 @@ class PageElement(object):
|
|||
|
||||
:param name: A filter on tag name.
|
||||
:param attrs: A dictionary of filters on attribute values.
|
||||
:param text: A filter for a NavigableString with specific text.
|
||||
:param string: A filter for a NavigableString with specific text.
|
||||
:param limit: Stop looking after finding this many results.
|
||||
:kwargs: A dictionary of filters on attribute values.
|
||||
:return: A ResultSet containing PageElements.
|
||||
"""
|
||||
return self._find_all(name, attrs, text, limit, self.next_elements,
|
||||
return self._find_all(name, attrs, string, limit, self.next_elements,
|
||||
**kwargs)
|
||||
findAllNext = find_all_next # BS3
|
||||
|
||||
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
|
||||
def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
|
||||
"""Find the closest sibling to this PageElement that matches the
|
||||
given criteria and appears later in the document.
|
||||
|
||||
|
@ -558,16 +599,16 @@ class PageElement(object):
|
|||
|
||||
:param name: A filter on tag name.
|
||||
:param attrs: A dictionary of filters on attribute values.
|
||||
:param text: A filter for a NavigableString with specific text.
|
||||
:param string: A filter for a NavigableString with specific text.
|
||||
:kwargs: A dictionary of filters on attribute values.
|
||||
:return: A PageElement.
|
||||
:rtype: bs4.element.Tag | bs4.element.NavigableString
|
||||
"""
|
||||
return self._find_one(self.find_next_siblings, name, attrs, text,
|
||||
return self._find_one(self.find_next_siblings, name, attrs, string,
|
||||
**kwargs)
|
||||
findNextSibling = find_next_sibling # BS3
|
||||
|
||||
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
|
||||
def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
|
||||
**kwargs):
|
||||
"""Find all siblings of this PageElement that match the given criteria
|
||||
and appear later in the document.
|
||||
|
@ -577,18 +618,18 @@ class PageElement(object):
|
|||
|
||||
:param name: A filter on tag name.
|
||||
:param attrs: A dictionary of filters on attribute values.
|
||||
:param text: A filter for a NavigableString with specific text.
|
||||
:param string: A filter for a NavigableString with specific text.
|
||||
:param limit: Stop looking after finding this many results.
|
||||
:kwargs: A dictionary of filters on attribute values.
|
||||
:return: A ResultSet of PageElements.
|
||||
:rtype: bs4.element.ResultSet
|
||||
"""
|
||||
return self._find_all(name, attrs, text, limit,
|
||||
return self._find_all(name, attrs, string, limit,
|
||||
self.next_siblings, **kwargs)
|
||||
findNextSiblings = find_next_siblings # BS3
|
||||
fetchNextSiblings = find_next_siblings # BS2
|
||||
|
||||
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
|
||||
def find_previous(self, name=None, attrs={}, string=None, **kwargs):
|
||||
"""Look backwards in the document from this PageElement and find the
|
||||
first PageElement that matches the given criteria.
|
||||
|
||||
|
@ -597,16 +638,16 @@ class PageElement(object):
|
|||
|
||||
:param name: A filter on tag name.
|
||||
:param attrs: A dictionary of filters on attribute values.
|
||||
:param text: A filter for a NavigableString with specific text.
|
||||
:param string: A filter for a NavigableString with specific text.
|
||||
:kwargs: A dictionary of filters on attribute values.
|
||||
:return: A PageElement.
|
||||
:rtype: bs4.element.Tag | bs4.element.NavigableString
|
||||
"""
|
||||
return self._find_one(
|
||||
self.find_all_previous, name, attrs, text, **kwargs)
|
||||
self.find_all_previous, name, attrs, string, **kwargs)
|
||||
findPrevious = find_previous # BS3
|
||||
|
||||
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
|
||||
def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
|
||||
**kwargs):
|
||||
"""Look backwards in the document from this PageElement and find all
|
||||
PageElements that match the given criteria.
|
||||
|
@ -616,18 +657,18 @@ class PageElement(object):
|
|||
|
||||
:param name: A filter on tag name.
|
||||
:param attrs: A dictionary of filters on attribute values.
|
||||
:param text: A filter for a NavigableString with specific text.
|
||||
:param string: A filter for a NavigableString with specific text.
|
||||
:param limit: Stop looking after finding this many results.
|
||||
:kwargs: A dictionary of filters on attribute values.
|
||||
:return: A ResultSet of PageElements.
|
||||
:rtype: bs4.element.ResultSet
|
||||
"""
|
||||
return self._find_all(name, attrs, text, limit, self.previous_elements,
|
||||
return self._find_all(name, attrs, string, limit, self.previous_elements,
|
||||
**kwargs)
|
||||
findAllPrevious = find_all_previous # BS3
|
||||
fetchPrevious = find_all_previous # BS2
|
||||
|
||||
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
|
||||
def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
|
||||
"""Returns the closest sibling to this PageElement that matches the
|
||||
given criteria and appears earlier in the document.
|
||||
|
||||
|
@ -636,16 +677,16 @@ class PageElement(object):
|
|||
|
||||
:param name: A filter on tag name.
|
||||
:param attrs: A dictionary of filters on attribute values.
|
||||
:param text: A filter for a NavigableString with specific text.
|
||||
:param string: A filter for a NavigableString with specific text.
|
||||
:kwargs: A dictionary of filters on attribute values.
|
||||
:return: A PageElement.
|
||||
:rtype: bs4.element.Tag | bs4.element.NavigableString
|
||||
"""
|
||||
return self._find_one(self.find_previous_siblings, name, attrs, text,
|
||||
return self._find_one(self.find_previous_siblings, name, attrs, string,
|
||||
**kwargs)
|
||||
findPreviousSibling = find_previous_sibling # BS3
|
||||
|
||||
def find_previous_siblings(self, name=None, attrs={}, text=None,
|
||||
def find_previous_siblings(self, name=None, attrs={}, string=None,
|
||||
limit=None, **kwargs):
|
||||
"""Returns all siblings to this PageElement that match the
|
||||
given criteria and appear earlier in the document.
|
||||
|
@ -655,13 +696,13 @@ class PageElement(object):
|
|||
|
||||
:param name: A filter on tag name.
|
||||
:param attrs: A dictionary of filters on attribute values.
|
||||
:param text: A filter for a NavigableString with specific text.
|
||||
:param string: A filter for a NavigableString with specific text.
|
||||
:param limit: Stop looking after finding this many results.
|
||||
:kwargs: A dictionary of filters on attribute values.
|
||||
:return: A ResultSet of PageElements.
|
||||
:rtype: bs4.element.ResultSet
|
||||
"""
|
||||
return self._find_all(name, attrs, text, limit,
|
||||
return self._find_all(name, attrs, string, limit,
|
||||
self.previous_siblings, **kwargs)
|
||||
findPreviousSiblings = find_previous_siblings # BS3
|
||||
fetchPreviousSiblings = find_previous_siblings # BS2
|
||||
|
@ -728,26 +769,29 @@ class PageElement(object):
|
|||
|
||||
#These methods do the real heavy lifting.
|
||||
|
||||
def _find_one(self, method, name, attrs, text, **kwargs):
|
||||
def _find_one(self, method, name, attrs, string, **kwargs):
|
||||
r = None
|
||||
l = method(name, attrs, text, 1, **kwargs)
|
||||
l = method(name, attrs, string, 1, **kwargs)
|
||||
if l:
|
||||
r = l[0]
|
||||
return r
|
||||
|
||||
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
|
||||
def _find_all(self, name, attrs, string, limit, generator, **kwargs):
|
||||
"Iterates over a generator looking for things that match."
|
||||
|
||||
if text is None and 'string' in kwargs:
|
||||
text = kwargs['string']
|
||||
del kwargs['string']
|
||||
if string is None and 'text' in kwargs:
|
||||
string = kwargs.pop('text')
|
||||
warnings.warn(
|
||||
"The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
|
||||
DeprecationWarning
|
||||
)
|
||||
|
||||
if isinstance(name, SoupStrainer):
|
||||
strainer = name
|
||||
else:
|
||||
strainer = SoupStrainer(name, attrs, text, **kwargs)
|
||||
strainer = SoupStrainer(name, attrs, string, **kwargs)
|
||||
|
||||
if text is None and not limit and not attrs and not kwargs:
|
||||
if string is None and not limit and not attrs and not kwargs:
|
||||
if name is True or name is None:
|
||||
# Optimization to find all tags.
|
||||
result = (element for element in generator
|
||||
|
@ -945,6 +989,53 @@ class NavigableString(str, PageElement):
|
|||
"""Prevent NavigableString.name from ever being set."""
|
||||
raise AttributeError("A NavigableString cannot be given a name.")
|
||||
|
||||
def _all_strings(self, strip=False, types=PageElement.default):
    """Yield all strings of certain classes, possibly stripping them.

    This makes it easy for NavigableString to implement methods
    like get_text() as conveniences, creating a consistent
    text-extraction API across all PageElements.

    :param strip: If True, all strings will be stripped before being
        yielded.

    :param types: A tuple of NavigableString subclasses. If this
        NavigableString isn't one of those subclasses, the
        sequence will be empty. By default, the subclasses
        considered are NavigableString and CData objects. That
        means no comments, processing instructions, etc.

    :yield: A sequence that either contains this string, or is empty.
    """
    if types is self.default:
        # This is kept in Tag because it's full of subclasses of
        # this class, which aren't defined until later in the file.
        types = Tag.DEFAULT_INTERESTING_STRING_TYPES

    # Do nothing if the caller is looking for specific types of
    # string, and we're of a different type.
    #
    # We check specific types instead of using isinstance(self,
    # types) because all of these classes subclass
    # NavigableString. Anyone who's using this feature probably
    # wants generic NavigableStrings but not other stuff.
    my_type = type(self)
    if types is not None:
        if isinstance(types, type):
            # Looking for a single type.
            if my_type is not types:
                return
        elif my_type not in types:
            # Looking for one of a list of types.
            return

    value = self
    if strip:
        value = value.strip()
    # A string that is empty (possibly only after stripping) yields nothing.
    if len(value) > 0:
        yield value
strings = property(_all_strings)
|
||||
|
||||
class PreformattedString(NavigableString):
|
||||
"""A NavigableString not subject to the normal formatting rules.
|
||||
|
@ -1057,6 +1148,27 @@ class TemplateString(NavigableString):
|
|||
pass
|
||||
|
||||
|
||||
class RubyTextString(NavigableString):
    """A NavigableString representing the contents of the <rt> HTML
    element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element

    Can be used to distinguish such strings from the strings they're
    annotating.
    """
    pass
|
||||
|
||||
|
||||
class RubyParenthesisString(NavigableString):
    """A NavigableString representing the contents of the <rp> HTML
    element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
    """
    pass
|
||||
|
||||
|
||||
class Tag(PageElement):
|
||||
"""Represents an HTML or XML tag that is part of a parse tree, along
|
||||
with its attributes and contents.
|
||||
|
@ -1069,7 +1181,9 @@ class Tag(PageElement):
|
|||
prefix=None, attrs=None, parent=None, previous=None,
|
||||
is_xml=None, sourceline=None, sourcepos=None,
|
||||
can_be_empty_element=None, cdata_list_attributes=None,
|
||||
preserve_whitespace_tags=None
|
||||
preserve_whitespace_tags=None,
|
||||
interesting_string_types=None,
|
||||
namespaces=None
|
||||
):
|
||||
"""Basic constructor.
|
||||
|
||||
|
@ -1095,6 +1209,16 @@ class Tag(PageElement):
|
|||
be treated as CDATA if they ever show up on this tag.
|
||||
:param preserve_whitespace_tags: A list of tag names whose contents
|
||||
should have their whitespace preserved.
|
||||
:param interesting_string_types: This is a NavigableString
|
||||
subclass or a tuple of them. When iterating over this
|
||||
Tag's strings in methods like Tag.strings or Tag.get_text,
|
||||
these are the types of strings that are interesting enough
|
||||
to be considered. The default is to consider
|
||||
NavigableString and CData the only interesting string
|
||||
subtypes.
|
||||
:param namespaces: A dictionary mapping currently active
|
||||
namespace prefixes to URIs. This can be used later to
|
||||
construct CSS selectors.
|
||||
"""
|
||||
if parser is None:
|
||||
self.parser_class = None
|
||||
|
@ -1106,6 +1230,7 @@ class Tag(PageElement):
|
|||
raise ValueError("No value provided for new tag's name.")
|
||||
self.name = name
|
||||
self.namespace = namespace
|
||||
self._namespaces = namespaces or {}
|
||||
self.prefix = prefix
|
||||
if ((not builder or builder.store_line_numbers)
|
||||
and (sourceline is not None or sourcepos is not None)):
|
||||
|
@ -1140,6 +1265,7 @@ class Tag(PageElement):
|
|||
self.can_be_empty_element = can_be_empty_element
|
||||
self.cdata_list_attributes = cdata_list_attributes
|
||||
self.preserve_whitespace_tags = preserve_whitespace_tags
|
||||
self.interesting_string_types = interesting_string_types
|
||||
else:
|
||||
# Set up any substitutions for this tag, such as the charset in a META tag.
|
||||
builder.set_up_substitutions(self)
|
||||
|
@ -1161,6 +1287,13 @@ class Tag(PageElement):
|
|||
# whitespace-preserved tag.
|
||||
self.preserve_whitespace_tags = builder.preserve_whitespace_tags
|
||||
|
||||
if self.name in builder.string_containers:
|
||||
# This sort of tag uses a special string container
|
||||
# subclass for most of its strings, so treat that subclass as the interesting string type.
|
||||
self.interesting_string_types = builder.string_containers[self.name]
|
||||
else:
|
||||
self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
|
||||
|
||||
parserClass = _alias("parser_class") # BS3
|
||||
|
||||
def __copy__(self):
|
||||
|
@ -1226,65 +1359,45 @@ class Tag(PageElement):
|
|||
self.clear()
|
||||
self.append(string.__class__(string))
|
||||
|
||||
def _all_strings(self, strip=False, types=(NavigableString, CData)):
|
||||
DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
|
||||
def _all_strings(self, strip=False, types=PageElement.default):
|
||||
"""Yield all strings of certain classes, possibly stripping them.
|
||||
|
||||
:param strip: If True, all strings will be stripped before being
|
||||
yielded.
|
||||
|
||||
:types: A tuple of NavigableString subclasses. Any strings of
|
||||
:param types: A tuple of NavigableString subclasses. Any strings of
|
||||
a subclass not found in this list will be ignored. By
|
||||
default, this means only NavigableString and CData objects
|
||||
will be considered. So no comments, processing instructions,
|
||||
etc.
|
||||
default, the subclasses considered are the ones found in
|
||||
self.interesting_string_types. If that's not specified,
|
||||
only NavigableString and CData objects will be
|
||||
considered. That means no comments, processing
|
||||
instructions, etc.
|
||||
|
||||
:yield: A sequence of strings.
|
||||
|
||||
"""
|
||||
if types is self.default:
|
||||
types = self.interesting_string_types
|
||||
|
||||
for descendant in self.descendants:
|
||||
if (
|
||||
(types is None and not isinstance(descendant, NavigableString))
|
||||
or
|
||||
(types is not None and type(descendant) not in types)):
|
||||
if (types is None and not isinstance(descendant, NavigableString)):
|
||||
continue
|
||||
descendant_type = type(descendant)
|
||||
if isinstance(types, type):
|
||||
if descendant_type is not types:
|
||||
# We're not interested in strings of this type.
|
||||
continue
|
||||
elif types is not None and descendant_type not in types:
|
||||
# We're not interested in strings of this type.
|
||||
continue
|
||||
if strip:
|
||||
descendant = descendant.strip()
|
||||
if len(descendant) == 0:
|
||||
continue
|
||||
yield descendant
|
||||
|
||||
strings = property(_all_strings)
|
||||
|
||||
@property
|
||||
def stripped_strings(self):
|
||||
"""Yield all strings in the document, stripping them first.
|
||||
|
||||
:yield: A sequence of stripped strings.
|
||||
"""
|
||||
for string in self._all_strings(True):
|
||||
yield string
|
||||
|
||||
def get_text(self, separator="", strip=False,
|
||||
types=(NavigableString, CData)):
|
||||
"""Get all child strings, concatenated using the given separator.
|
||||
|
||||
:param separator: Strings will be concatenated using this separator.
|
||||
|
||||
:param strip: If True, strings will be stripped before being
|
||||
concatenated.
|
||||
|
||||
:types: A tuple of NavigableString subclasses. Any strings of
|
||||
a subclass not found in this list will be ignored. By
|
||||
default, this means only NavigableString and CData objects
|
||||
will be considered. So no comments, processing instructions,
|
||||
stylesheets, etc.
|
||||
|
||||
:return: A string.
|
||||
"""
|
||||
return separator.join([s for s in self._all_strings(
|
||||
strip, types=types)])
|
||||
getText = get_text
|
||||
text = property(get_text)
|
||||
|
||||
def decompose(self):
|
||||
"""Recursively destroys this PageElement and its children.
|
||||
|
||||
|
@ -1444,7 +1557,8 @@ class Tag(PageElement):
|
|||
warnings.warn(
|
||||
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
|
||||
name=tag_name
|
||||
)
|
||||
),
|
||||
DeprecationWarning
|
||||
)
|
||||
return self.find(tag_name)
|
||||
# We special case contents to avoid recursion.
|
||||
|
@ -1479,34 +1593,17 @@ class Tag(PageElement):
|
|||
"""Renders this PageElement as a string.
|
||||
|
||||
:param encoding: The encoding to use (Python 2 only).
|
||||
:return: Under Python 2, a bytestring; under Python 3,
|
||||
a Unicode string.
|
||||
TODO: This is now ignored and a warning should be issued
|
||||
if a value is provided.
|
||||
:return: A (Unicode) string.
|
||||
"""
|
||||
if PY3K:
|
||||
# "The return value must be a string object", i.e. Unicode
|
||||
return self.decode()
|
||||
else:
|
||||
# "The return value must be a string object", i.e. a bytestring.
|
||||
# By convention, the return value of __repr__ should also be
|
||||
# an ASCII string.
|
||||
return self.encode(encoding)
|
||||
|
||||
def __unicode__(self):
|
||||
"""Renders this PageElement as a Unicode string."""
|
||||
return self.decode()
|
||||
|
||||
def __str__(self):
|
||||
"""Renders this PageElement as a generic string.
|
||||
|
||||
:return: Under Python 2, a UTF-8 bytestring; under Python 3,
|
||||
a Unicode string.
|
||||
"""
|
||||
if PY3K:
|
||||
return self.decode()
|
||||
else:
|
||||
return self.encode()
|
||||
|
||||
if PY3K:
|
||||
__str__ = __repr__ = __unicode__
|
||||
|
||||
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
|
||||
|
@ -1517,8 +1614,10 @@ class Tag(PageElement):
|
|||
|
||||
:param encoding: The destination encoding.
|
||||
:param indent_level: Each line of the rendering will be
|
||||
indented this many spaces. Used internally in
|
||||
recursive calls while pretty-printing.
|
||||
indented this many levels. (The formatter decides what a
|
||||
'level' means in terms of spaces or other characters
|
||||
output.) Used internally in recursive calls while
|
||||
pretty-printing.
|
||||
:param formatter: A Formatter object, or a string naming one of
|
||||
the standard formatters.
|
||||
:param errors: An error handling strategy such as
|
||||
|
@ -1594,7 +1693,7 @@ class Tag(PageElement):
|
|||
space = ''
|
||||
indent_space = ''
|
||||
if indent_level is not None:
|
||||
indent_space = (' ' * (indent_level - 1))
|
||||
indent_space = (formatter.indent * (indent_level - 1))
|
||||
if pretty_print:
|
||||
space = indent_space
|
||||
indent_contents = indent_level + 1
|
||||
|
@ -1669,8 +1768,10 @@ class Tag(PageElement):
|
|||
"""Renders the contents of this tag as a Unicode string.
|
||||
|
||||
:param indent_level: Each line of the rendering will be
|
||||
indented this many spaces. Used internally in
|
||||
recursive calls while pretty-printing.
|
||||
indented this many levels. (The formatter decides what a
|
||||
'level' means in terms of spaces or other characters
|
||||
output.) Used internally in recursive calls while
|
||||
pretty-printing.
|
||||
|
||||
:param eventual_encoding: The tag is destined to be
|
||||
encoded into this encoding. decode_contents() is _not_
|
||||
|
@ -1681,6 +1782,7 @@ class Tag(PageElement):
|
|||
|
||||
:param formatter: A Formatter object, or a string naming one of
|
||||
the standard Formatters.
|
||||
|
||||
"""
|
||||
# First off, turn a string formatter into a Formatter object. This
|
||||
# will stop the lookup from happening over and over again.
|
||||
|
@ -1703,7 +1805,7 @@ class Tag(PageElement):
|
|||
text = text.strip()
|
||||
if text:
|
||||
if pretty_print and not preserve_whitespace:
|
||||
s.append(" " * (indent_level - 1))
|
||||
s.append(formatter.indent * (indent_level - 1))
|
||||
s.append(text)
|
||||
if pretty_print and not preserve_whitespace:
|
||||
s.append("\n")
|
||||
|
@ -1715,8 +1817,10 @@ class Tag(PageElement):
|
|||
"""Renders the contents of this PageElement as a bytestring.
|
||||
|
||||
:param indent_level: Each line of the rendering will be
|
||||
indented this many spaces. Used internally in
|
||||
recursive calls while pretty-printing.
|
||||
indented this many levels. (The formatter decides what a
|
||||
'level' means in terms of spaces or other characters
|
||||
output.) Used internally in recursive calls while
|
||||
pretty-printing.
|
||||
|
||||
:param eventual_encoding: The bytestring will be in this encoding.
|
||||
|
||||
|
@ -1739,7 +1843,7 @@ class Tag(PageElement):
|
|||
|
||||
#Soup methods
|
||||
|
||||
def find(self, name=None, attrs={}, recursive=True, text=None,
|
||||
def find(self, name=None, attrs={}, recursive=True, string=None,
|
||||
**kwargs):
|
||||
"""Look in the children of this PageElement and find the first
|
||||
PageElement that matches the given criteria.
|
||||
|
@ -1758,13 +1862,13 @@ class Tag(PageElement):
|
|||
:rtype: bs4.element.Tag | bs4.element.NavigableString
|
||||
"""
|
||||
r = None
|
||||
l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
|
||||
l = self.find_all(name, attrs, recursive, string, 1, **kwargs)
|
||||
if l:
|
||||
r = l[0]
|
||||
return r
|
||||
findChild = find #BS2
|
||||
|
||||
def find_all(self, name=None, attrs={}, recursive=True, text=None,
|
||||
def find_all(self, name=None, attrs={}, recursive=True, string=None,
|
||||
limit=None, **kwargs):
|
||||
"""Look in the children of this PageElement and find all
|
||||
PageElements that match the given criteria.
|
||||
|
@ -1785,7 +1889,7 @@ class Tag(PageElement):
|
|||
generator = self.descendants
|
||||
if not recursive:
|
||||
generator = self.children
|
||||
return self._find_all(name, attrs, text, limit, generator, **kwargs)
|
||||
return self._find_all(name, attrs, string, limit, generator, **kwargs)
|
||||
findAll = find_all # BS3
|
||||
findChildren = find_all # BS2
|
||||
|
||||
|
@ -1887,8 +1991,10 @@ class Tag(PageElement):
|
|||
|
||||
has_key() is gone in Python 3, anyway.
|
||||
"""
|
||||
warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
|
||||
key))
|
||||
warnings.warn(
|
||||
'has_key is deprecated. Use has_attr(key) instead.',
|
||||
DeprecationWarning
|
||||
)
|
||||
return self.has_attr(key)
|
||||
|
||||
# Next, a couple classes to represent queries and their results.
|
||||
|
@ -1902,7 +2008,7 @@ class SoupStrainer(object):
|
|||
document.
|
||||
"""
|
||||
|
||||
def __init__(self, name=None, attrs={}, text=None, **kwargs):
|
||||
def __init__(self, name=None, attrs={}, string=None, **kwargs):
|
||||
"""Constructor.
|
||||
|
||||
The SoupStrainer constructor takes the same arguments passed
|
||||
|
@ -1911,9 +2017,16 @@ class SoupStrainer(object):
|
|||
|
||||
:param name: A filter on tag name.
|
||||
:param attrs: A dictionary of filters on attribute values.
|
||||
:param text: A filter for a NavigableString with specific text.
|
||||
:param string: A filter for a NavigableString with specific text.
|
||||
:param kwargs: A dictionary of filters on attribute values.
|
||||
"""
|
||||
if string is None and 'text' in kwargs:
|
||||
string = kwargs.pop('text')
|
||||
warnings.warn(
|
||||
"The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
|
||||
DeprecationWarning
|
||||
)
|
||||
|
||||
self.name = self._normalize_search_value(name)
|
||||
if not isinstance(attrs, dict):
|
||||
# Treat a non-dict value for attrs as a search for the 'class'
|
||||
|
@ -1938,7 +2051,10 @@ class SoupStrainer(object):
|
|||
normalized_attrs[key] = self._normalize_search_value(value)
|
||||
|
||||
self.attrs = normalized_attrs
|
||||
self.text = self._normalize_search_value(text)
|
||||
self.string = self._normalize_search_value(string)
|
||||
|
||||
# DEPRECATED but just in case someone is checking this.
|
||||
self.text = self.string
|
||||
|
||||
def _normalize_search_value(self, value):
|
||||
# Leave it alone if it's a Unicode string, a callable, a
|
||||
|
@ -1972,8 +2088,8 @@ class SoupStrainer(object):
|
|||
|
||||
def __str__(self):
|
||||
"""A human-readable representation of this SoupStrainer."""
|
||||
if self.text:
|
||||
return self.text
|
||||
if self.string:
|
||||
return self.string
|
||||
else:
|
||||
return "%s|%s" % (self.name, self.attrs)
|
||||
|
||||
|
@ -2033,7 +2149,7 @@ class SoupStrainer(object):
|
|||
found = markup
|
||||
else:
|
||||
found = markup_name
|
||||
if found and self.text and not self._matches(found.string, self.text):
|
||||
if found and self.string and not self._matches(found.string, self.string):
|
||||
found = None
|
||||
return found
|
||||
|
||||
|
@ -2061,12 +2177,12 @@ class SoupStrainer(object):
|
|||
# If it's a Tag, make sure its name or attributes match.
|
||||
# Don't bother with Tags if we're searching for text.
|
||||
elif isinstance(markup, Tag):
|
||||
if not self.text or self.name or self.attrs:
|
||||
if not self.string or self.name or self.attrs:
|
||||
found = self.search_tag(markup)
|
||||
# If it's text, make sure the text matches.
|
||||
elif isinstance(markup, NavigableString) or \
|
||||
isinstance(markup, str):
|
||||
if not self.name and not self.attrs and self._matches(markup, self.text):
|
||||
if not self.name and not self.attrs and self._matches(markup, self.string):
|
||||
found = markup
|
||||
else:
|
||||
raise Exception(
|
||||
|
|
|
@ -14,7 +14,8 @@ class Formatter(EntitySubstitution):
|
|||
|
||||
For HTML documents:
|
||||
* 'html' - HTML entity substitution for generic HTML documents. (default)
|
||||
* 'html5' - HTML entity substitution for HTML5 documents.
|
||||
* 'html5' - HTML entity substitution for HTML5 documents, as
|
||||
well as some optimizations in the way tags are rendered.
|
||||
* 'minimal' - Only make the substitutions necessary to guarantee
|
||||
valid HTML.
|
||||
* None - Do not perform any substitution. This will be faster
|
||||
|
@ -48,6 +49,7 @@ class Formatter(EntitySubstitution):
|
|||
def __init__(
|
||||
self, language=None, entity_substitution=None,
|
||||
void_element_close_prefix='/', cdata_containing_tags=None,
|
||||
empty_attributes_are_booleans=False, indent=1,
|
||||
):
|
||||
"""Constructor.
|
||||
|
||||
|
@ -64,6 +66,18 @@ class Formatter(EntitySubstitution):
|
|||
as containing CDATA in this dialect. For example, in HTML,
|
||||
<script> and <style> tags are defined as containing CDATA,
|
||||
and their contents should not be formatted.
|
||||
:param empty_attributes_are_booleans: Render attributes whose value
|
||||
is the empty string as HTML-style boolean attributes.
|
||||
(Attributes whose value is None are always rendered this way.)
|
||||
|
||||
:param indent: If indent is a non-negative integer or string,
|
||||
then the contents of elements will be indented
|
||||
appropriately when pretty-printing. An indent level of 0,
|
||||
negative, or "" will only insert newlines. Using a
|
||||
positive integer indent indents that many spaces per
|
||||
level. If indent is a string (such as "\t"), that string
|
||||
is used to indent each level. The default behavior is to
|
||||
indent one space per level.
|
||||
"""
|
||||
self.language = language
|
||||
self.entity_substitution = entity_substitution
|
||||
|
@ -71,6 +85,18 @@ class Formatter(EntitySubstitution):
|
|||
self.cdata_containing_tags = self._default(
|
||||
language, cdata_containing_tags, 'cdata_containing_tags'
|
||||
)
|
||||
self.empty_attributes_are_booleans=empty_attributes_are_booleans
|
||||
if indent is None:
|
||||
indent = 0
|
||||
if isinstance(indent, int):
|
||||
if indent < 0:
|
||||
indent = 0
|
||||
indent = ' ' * indent
|
||||
elif isinstance(indent, str):
|
||||
indent = indent
|
||||
else:
|
||||
indent = ' '
|
||||
self.indent = indent
|
||||
|
||||
def substitute(self, ns):
|
||||
"""Process a string that needs to undergo entity substitution.
|
||||
|
@ -107,11 +133,17 @@ class Formatter(EntitySubstitution):
|
|||
By default, attributes are sorted alphabetically. This makes
|
||||
behavior consistent between Python 2 and Python 3, and preserves
|
||||
backwards compatibility with older versions of Beautiful Soup.
|
||||
|
||||
If `empty_attributes_are_booleans` is True, then attributes whose
|
||||
values are set to the empty string will be treated as boolean
|
||||
attributes.
|
||||
"""
|
||||
if tag.attrs is None:
|
||||
return []
|
||||
return sorted(tag.attrs.items())
|
||||
|
||||
return sorted(
|
||||
(k, (None if self.empty_attributes_are_booleans and v == '' else v))
|
||||
for k, v in list(tag.attrs.items())
|
||||
)
|
||||
|
||||
class HTMLFormatter(Formatter):
|
||||
"""A generic Formatter for HTML."""
|
||||
|
@ -133,7 +165,8 @@ HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
|
|||
)
|
||||
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_html,
|
||||
void_element_close_prefix = None
|
||||
void_element_close_prefix=None,
|
||||
empty_attributes_are_booleans=True,
|
||||
)
|
||||
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_xml
|
||||
|
|
Loading…
Reference in a new issue