mirror of
https://github.com/SickGear/SickGear.git
synced 2024-12-03 01:43:37 +00:00
Merge pull request #482 from JackDandy/feature/UpdateBSoup
Update Beautiful Soup to 4.4.0 (r390).
This commit is contained in:
commit
8b42315bde
9 changed files with 292 additions and 84 deletions
|
@ -5,6 +5,7 @@
|
||||||
* Add search crawler exclusions
|
* Add search crawler exclusions
|
||||||
* Fix saving default show list group on add new show options page
|
* Fix saving default show list group on add new show options page
|
||||||
* Remove legacy anime split home option from anime settings tab (new option located in general/interface tab)
|
* Remove legacy anime split home option from anime settings tab (new option located in general/interface tab)
|
||||||
|
* Update Beautiful Soup 4.3.2 to 4.4.0 (r390)
|
||||||
|
|
||||||
|
|
||||||
### 0.10.0 (2015-08-06 11:05:00 UTC)
|
### 0.10.0 (2015-08-06 11:05:00 UTC)
|
||||||
|
|
|
@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
__version__ = "4.3.2"
|
__version__ = "4.4.0"
|
||||||
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
|
__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = ['BeautifulSoup']
|
__all__ = ['BeautifulSoup']
|
||||||
|
@ -77,10 +77,11 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||||
|
|
||||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
|
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
|
||||||
|
|
||||||
def __init__(self, markup="", features=None, builder=None,
|
def __init__(self, markup="", features=None, builder=None,
|
||||||
parse_only=None, from_encoding=None, **kwargs):
|
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||||
|
**kwargs):
|
||||||
"""The Soup object is initialized as the 'root tag', and the
|
"""The Soup object is initialized as the 'root tag', and the
|
||||||
provided markup (which can be a string or a file-like object)
|
provided markup (which can be a string or a file-like object)
|
||||||
is fed into the underlying parser."""
|
is fed into the underlying parser."""
|
||||||
|
@ -156,8 +157,13 @@ class BeautifulSoup(Tag):
|
||||||
builder = builder_class()
|
builder = builder_class()
|
||||||
if not (original_features == builder.NAME or
|
if not (original_features == builder.NAME or
|
||||||
original_features in builder.ALTERNATE_NAMES):
|
original_features in builder.ALTERNATE_NAMES):
|
||||||
|
if builder.is_xml:
|
||||||
|
markup_type = "XML"
|
||||||
|
else:
|
||||||
|
markup_type = "HTML"
|
||||||
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
|
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
|
||||||
parser=builder.NAME))
|
parser=builder.NAME,
|
||||||
|
markup_type=markup_type))
|
||||||
|
|
||||||
self.builder = builder
|
self.builder = builder
|
||||||
self.is_xml = builder.is_xml
|
self.is_xml = builder.is_xml
|
||||||
|
@ -202,7 +208,8 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
||||||
self.contains_replacement_characters) in (
|
self.contains_replacement_characters) in (
|
||||||
self.builder.prepare_markup(markup, from_encoding)):
|
self.builder.prepare_markup(
|
||||||
|
markup, from_encoding, exclude_encodings=exclude_encodings)):
|
||||||
self.reset()
|
self.reset()
|
||||||
try:
|
try:
|
||||||
self._feed()
|
self._feed()
|
||||||
|
@ -215,6 +222,16 @@ class BeautifulSoup(Tag):
|
||||||
self.markup = None
|
self.markup = None
|
||||||
self.builder.soup = None
|
self.builder.soup = None
|
||||||
|
|
||||||
|
def __copy__(self):
|
||||||
|
return type(self)(self.encode(), builder=self.builder)
|
||||||
|
|
||||||
|
def __getstate__(self):
|
||||||
|
# Frequently a tree builder can't be pickled.
|
||||||
|
d = dict(self.__dict__)
|
||||||
|
if 'builder' in d and not self.builder.picklable:
|
||||||
|
del d['builder']
|
||||||
|
return d
|
||||||
|
|
||||||
def _feed(self):
|
def _feed(self):
|
||||||
# Convert the document to Unicode.
|
# Convert the document to Unicode.
|
||||||
self.builder.reset()
|
self.builder.reset()
|
||||||
|
@ -241,9 +258,7 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
def new_string(self, s, subclass=NavigableString):
|
def new_string(self, s, subclass=NavigableString):
|
||||||
"""Create a new NavigableString associated with this soup."""
|
"""Create a new NavigableString associated with this soup."""
|
||||||
navigable = subclass(s)
|
return subclass(s)
|
||||||
navigable.setup()
|
|
||||||
return navigable
|
|
||||||
|
|
||||||
def insert_before(self, successor):
|
def insert_before(self, successor):
|
||||||
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
||||||
|
@ -302,14 +317,49 @@ class BeautifulSoup(Tag):
|
||||||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
||||||
"""Add an object to the parse tree."""
|
"""Add an object to the parse tree."""
|
||||||
parent = parent or self.currentTag
|
parent = parent or self.currentTag
|
||||||
most_recent_element = most_recent_element or self._most_recent_element
|
previous_element = most_recent_element or self._most_recent_element
|
||||||
o.setup(parent, most_recent_element)
|
|
||||||
|
next_element = previous_sibling = next_sibling = None
|
||||||
|
if isinstance(o, Tag):
|
||||||
|
next_element = o.next_element
|
||||||
|
next_sibling = o.next_sibling
|
||||||
|
previous_sibling = o.previous_sibling
|
||||||
|
if not previous_element:
|
||||||
|
previous_element = o.previous_element
|
||||||
|
|
||||||
|
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
|
||||||
|
|
||||||
if most_recent_element is not None:
|
|
||||||
most_recent_element.next_element = o
|
|
||||||
self._most_recent_element = o
|
self._most_recent_element = o
|
||||||
parent.contents.append(o)
|
parent.contents.append(o)
|
||||||
|
|
||||||
|
if parent.next_sibling:
|
||||||
|
# This node is being inserted into an element that has
|
||||||
|
# already been parsed. Deal with any dangling references.
|
||||||
|
index = parent.contents.index(o)
|
||||||
|
if index == 0:
|
||||||
|
previous_element = parent
|
||||||
|
previous_sibling = None
|
||||||
|
else:
|
||||||
|
previous_element = previous_sibling = parent.contents[index-1]
|
||||||
|
if index == len(parent.contents)-1:
|
||||||
|
next_element = parent.next_sibling
|
||||||
|
next_sibling = None
|
||||||
|
else:
|
||||||
|
next_element = next_sibling = parent.contents[index+1]
|
||||||
|
|
||||||
|
o.previous_element = previous_element
|
||||||
|
if previous_element:
|
||||||
|
previous_element.next_element = o
|
||||||
|
o.next_element = next_element
|
||||||
|
if next_element:
|
||||||
|
next_element.previous_element = o
|
||||||
|
o.next_sibling = next_sibling
|
||||||
|
if next_sibling:
|
||||||
|
next_sibling.previous_sibling = o
|
||||||
|
o.previous_sibling = previous_sibling
|
||||||
|
if previous_sibling:
|
||||||
|
previous_sibling.next_sibling = o
|
||||||
|
|
||||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||||
"""Pops the tag stack up to and including the most recent
|
"""Pops the tag stack up to and including the most recent
|
||||||
instance of the given tag. If inclusivePop is false, pops the tag
|
instance of the given tag. If inclusivePop is false, pops the tag
|
||||||
|
|
|
@ -85,6 +85,7 @@ class TreeBuilder(object):
|
||||||
features = []
|
features = []
|
||||||
|
|
||||||
is_xml = False
|
is_xml = False
|
||||||
|
picklable = False
|
||||||
preserve_whitespace_tags = set()
|
preserve_whitespace_tags = set()
|
||||||
empty_element_tags = None # A tag will be considered an empty-element
|
empty_element_tags = None # A tag will be considered an empty-element
|
||||||
# tag when and only when it has no contents.
|
# tag when and only when it has no contents.
|
||||||
|
|
|
@ -2,6 +2,7 @@ __all__ = [
|
||||||
'HTML5TreeBuilder',
|
'HTML5TreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
from pdb import set_trace
|
||||||
import warnings
|
import warnings
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
PERMISSIVE,
|
PERMISSIVE,
|
||||||
|
@ -9,7 +10,10 @@ from bs4.builder import (
|
||||||
HTML_5,
|
HTML_5,
|
||||||
HTMLTreeBuilder,
|
HTMLTreeBuilder,
|
||||||
)
|
)
|
||||||
from bs4.element import NamespacedAttribute
|
from bs4.element import (
|
||||||
|
NamespacedAttribute,
|
||||||
|
whitespace_re,
|
||||||
|
)
|
||||||
import html5lib
|
import html5lib
|
||||||
from html5lib.constants import namespaces
|
from html5lib.constants import namespaces
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
|
@ -26,9 +30,16 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
|
|
||||||
features = [NAME, PERMISSIVE, HTML_5, HTML]
|
features = [NAME, PERMISSIVE, HTML_5, HTML]
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding):
|
def prepare_markup(self, markup, user_specified_encoding,
|
||||||
|
document_declared_encoding=None, exclude_encodings=None):
|
||||||
# Store the user-specified encoding for use later on.
|
# Store the user-specified encoding for use later on.
|
||||||
self.user_specified_encoding = user_specified_encoding
|
self.user_specified_encoding = user_specified_encoding
|
||||||
|
|
||||||
|
# document_declared_encoding and exclude_encodings aren't used
|
||||||
|
# ATM because the html5lib TreeBuilder doesn't use
|
||||||
|
# UnicodeDammit.
|
||||||
|
if exclude_encodings:
|
||||||
|
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
|
||||||
yield (markup, None, None, False)
|
yield (markup, None, None, False)
|
||||||
|
|
||||||
# These methods are defined by Beautiful Soup.
|
# These methods are defined by Beautiful Soup.
|
||||||
|
@ -103,7 +114,13 @@ class AttrList(object):
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return list(self.attrs.items()).__iter__()
|
return list(self.attrs.items()).__iter__()
|
||||||
def __setitem__(self, name, value):
|
def __setitem__(self, name, value):
|
||||||
"set attr", name, value
|
# If this attribute is a multi-valued attribute for this element,
|
||||||
|
# turn its value into a list.
|
||||||
|
list_attr = HTML5TreeBuilder.cdata_list_attributes
|
||||||
|
if (name in list_attr['*']
|
||||||
|
or (self.element.name in list_attr
|
||||||
|
and name in list_attr[self.element.name])):
|
||||||
|
value = whitespace_re.split(value)
|
||||||
self.element[name] = value
|
self.element[name] = value
|
||||||
def items(self):
|
def items(self):
|
||||||
return list(self.attrs.items())
|
return list(self.attrs.items())
|
||||||
|
@ -180,6 +197,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
return AttrList(self.element)
|
return AttrList(self.element)
|
||||||
|
|
||||||
def setAttributes(self, attributes):
|
def setAttributes(self, attributes):
|
||||||
|
|
||||||
if attributes is not None and len(attributes) > 0:
|
if attributes is not None and len(attributes) > 0:
|
||||||
|
|
||||||
converted_attributes = []
|
converted_attributes = []
|
||||||
|
@ -226,6 +244,9 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
|
|
||||||
def reparentChildren(self, new_parent):
|
def reparentChildren(self, new_parent):
|
||||||
"""Move all of this tag's children into another tag."""
|
"""Move all of this tag's children into another tag."""
|
||||||
|
# print "MOVE", self.element.contents
|
||||||
|
# print "FROM", self.element
|
||||||
|
# print "TO", new_parent.element
|
||||||
element = self.element
|
element = self.element
|
||||||
new_parent_element = new_parent.element
|
new_parent_element = new_parent.element
|
||||||
# Determine what this tag's next_element will be once all the children
|
# Determine what this tag's next_element will be once all the children
|
||||||
|
@ -244,17 +265,28 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
new_parents_last_descendant_next_element = new_parent_element.next_element
|
new_parents_last_descendant_next_element = new_parent_element.next_element
|
||||||
|
|
||||||
to_append = element.contents
|
to_append = element.contents
|
||||||
append_after = new_parent.element.contents
|
append_after = new_parent_element.contents
|
||||||
if len(to_append) > 0:
|
if len(to_append) > 0:
|
||||||
# Set the first child's previous_element and previous_sibling
|
# Set the first child's previous_element and previous_sibling
|
||||||
# to elements within the new parent
|
# to elements within the new parent
|
||||||
first_child = to_append[0]
|
first_child = to_append[0]
|
||||||
|
if new_parents_last_descendant:
|
||||||
first_child.previous_element = new_parents_last_descendant
|
first_child.previous_element = new_parents_last_descendant
|
||||||
|
else:
|
||||||
|
first_child.previous_element = new_parent_element
|
||||||
first_child.previous_sibling = new_parents_last_child
|
first_child.previous_sibling = new_parents_last_child
|
||||||
|
if new_parents_last_descendant:
|
||||||
|
new_parents_last_descendant.next_element = first_child
|
||||||
|
else:
|
||||||
|
new_parent_element.next_element = first_child
|
||||||
|
if new_parents_last_child:
|
||||||
|
new_parents_last_child.next_sibling = first_child
|
||||||
|
|
||||||
# Fix the last child's next_element and next_sibling
|
# Fix the last child's next_element and next_sibling
|
||||||
last_child = to_append[-1]
|
last_child = to_append[-1]
|
||||||
last_child.next_element = new_parents_last_descendant_next_element
|
last_child.next_element = new_parents_last_descendant_next_element
|
||||||
|
if new_parents_last_descendant_next_element:
|
||||||
|
new_parents_last_descendant_next_element.previous_element = last_child
|
||||||
last_child.next_sibling = None
|
last_child.next_sibling = None
|
||||||
|
|
||||||
for child in to_append:
|
for child in to_append:
|
||||||
|
@ -265,6 +297,10 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
element.contents = []
|
element.contents = []
|
||||||
element.next_element = final_next_element
|
element.next_element = final_next_element
|
||||||
|
|
||||||
|
# print "DONE WITH MOVE"
|
||||||
|
# print "FROM", self.element
|
||||||
|
# print "TO", new_parent_element
|
||||||
|
|
||||||
def cloneNode(self):
|
def cloneNode(self):
|
||||||
tag = self.soup.new_tag(self.element.name, self.namespace)
|
tag = self.soup.new_tag(self.element.name, self.namespace)
|
||||||
node = Element(tag, self.soup, self.namespace)
|
node = Element(tag, self.soup, self.namespace)
|
||||||
|
|
|
@ -4,10 +4,16 @@ __all__ = [
|
||||||
'HTMLParserTreeBuilder',
|
'HTMLParserTreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
from HTMLParser import (
|
from HTMLParser import HTMLParser
|
||||||
HTMLParser,
|
|
||||||
HTMLParseError,
|
try:
|
||||||
)
|
from HTMLParser import HTMLParseError
|
||||||
|
except ImportError, e:
|
||||||
|
# HTMLParseError is removed in Python 3.5. Since it can never be
|
||||||
|
# thrown in 3.5, we can just define our own class as a placeholder.
|
||||||
|
class HTMLParseError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
|
@ -20,8 +26,10 @@ import warnings
|
||||||
# strict=True works well on Python 3.2.2.
|
# strict=True works well on Python 3.2.2.
|
||||||
major, minor, release = sys.version_info[:3]
|
major, minor, release = sys.version_info[:3]
|
||||||
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
|
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
|
||||||
|
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
|
||||||
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
|
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
|
||||||
|
|
||||||
|
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
CData,
|
CData,
|
||||||
Comment,
|
Comment,
|
||||||
|
@ -119,18 +127,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
|
|
||||||
is_xml = False
|
is_xml = False
|
||||||
|
picklable = True
|
||||||
NAME = HTMLPARSER
|
NAME = HTMLPARSER
|
||||||
features = [NAME, HTML, STRICT]
|
features = [NAME, HTML, STRICT]
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
if CONSTRUCTOR_TAKES_STRICT:
|
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
||||||
kwargs['strict'] = False
|
kwargs['strict'] = False
|
||||||
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
||||||
kwargs['convert_charrefs'] = False
|
kwargs['convert_charrefs'] = False
|
||||||
self.parser_args = (args, kwargs)
|
self.parser_args = (args, kwargs)
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
document_declared_encoding=None):
|
document_declared_encoding=None, exclude_encodings=None):
|
||||||
"""
|
"""
|
||||||
:return: A 4-tuple (markup, original encoding, encoding
|
:return: A 4-tuple (markup, original encoding, encoding
|
||||||
declared within markup, whether any characters had to be
|
declared within markup, whether any characters had to be
|
||||||
|
@ -141,7 +150,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
return
|
return
|
||||||
|
|
||||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
|
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
|
||||||
|
exclude_encodings=exclude_encodings)
|
||||||
yield (dammit.markup, dammit.original_encoding,
|
yield (dammit.markup, dammit.original_encoding,
|
||||||
dammit.declared_html_encoding,
|
dammit.declared_html_encoding,
|
||||||
dammit.contains_replacement_characters)
|
dammit.contains_replacement_characters)
|
||||||
|
|
|
@ -31,6 +31,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
is_xml = True
|
is_xml = True
|
||||||
|
|
||||||
NAME = "lxml-xml"
|
NAME = "lxml-xml"
|
||||||
|
ALTERNATE_NAMES = ["xml"]
|
||||||
|
|
||||||
# Well, it's permissive by XML parser standards.
|
# Well, it's permissive by XML parser standards.
|
||||||
features = [NAME, LXML, XML, FAST, PERMISSIVE]
|
features = [NAME, LXML, XML, FAST, PERMISSIVE]
|
||||||
|
@ -77,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
return (None, tag)
|
return (None, tag)
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
|
exclude_encodings=None,
|
||||||
document_declared_encoding=None):
|
document_declared_encoding=None):
|
||||||
"""
|
"""
|
||||||
:yield: A series of 4-tuples.
|
:yield: A series of 4-tuples.
|
||||||
|
@ -102,7 +104,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
# the document as each one in turn.
|
# the document as each one in turn.
|
||||||
is_html = not self.is_xml
|
is_html = not self.is_xml
|
||||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||||
detector = EncodingDetector(markup, try_encodings, is_html)
|
detector = EncodingDetector(
|
||||||
|
markup, try_encodings, is_html, exclude_encodings)
|
||||||
for encoding in detector.encodings:
|
for encoding in detector.encodings:
|
||||||
yield (detector.markup, encoding, document_declared_encoding, False)
|
yield (detector.markup, encoding, document_declared_encoding, False)
|
||||||
|
|
||||||
|
|
|
@ -3,10 +3,11 @@
|
||||||
|
|
||||||
This library converts a bytestream to Unicode through any means
|
This library converts a bytestream to Unicode through any means
|
||||||
necessary. It is heavily based on code from Mark Pilgrim's Universal
|
necessary. It is heavily based on code from Mark Pilgrim's Universal
|
||||||
Feed Parser. It works best on XML and XML, but it does not rewrite the
|
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
||||||
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from pdb import set_trace
|
||||||
import codecs
|
import codecs
|
||||||
from htmlentitydefs import codepoint2name
|
from htmlentitydefs import codepoint2name
|
||||||
import re
|
import re
|
||||||
|
@ -212,8 +213,11 @@ class EncodingDetector:
|
||||||
|
|
||||||
5. Windows-1252.
|
5. Windows-1252.
|
||||||
"""
|
"""
|
||||||
def __init__(self, markup, override_encodings=None, is_html=False):
|
def __init__(self, markup, override_encodings=None, is_html=False,
|
||||||
|
exclude_encodings=None):
|
||||||
self.override_encodings = override_encodings or []
|
self.override_encodings = override_encodings or []
|
||||||
|
exclude_encodings = exclude_encodings or []
|
||||||
|
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
|
||||||
self.chardet_encoding = None
|
self.chardet_encoding = None
|
||||||
self.is_html = is_html
|
self.is_html = is_html
|
||||||
self.declared_encoding = None
|
self.declared_encoding = None
|
||||||
|
@ -224,6 +228,8 @@ class EncodingDetector:
|
||||||
def _usable(self, encoding, tried):
|
def _usable(self, encoding, tried):
|
||||||
if encoding is not None:
|
if encoding is not None:
|
||||||
encoding = encoding.lower()
|
encoding = encoding.lower()
|
||||||
|
if encoding in self.exclude_encodings:
|
||||||
|
return False
|
||||||
if encoding not in tried:
|
if encoding not in tried:
|
||||||
tried.add(encoding)
|
tried.add(encoding)
|
||||||
return True
|
return True
|
||||||
|
@ -266,6 +272,9 @@ class EncodingDetector:
|
||||||
def strip_byte_order_mark(cls, data):
|
def strip_byte_order_mark(cls, data):
|
||||||
"""If a byte-order mark is present, strip it and return the encoding it implies."""
|
"""If a byte-order mark is present, strip it and return the encoding it implies."""
|
||||||
encoding = None
|
encoding = None
|
||||||
|
if isinstance(data, unicode):
|
||||||
|
# Unicode data cannot have a byte-order mark.
|
||||||
|
return data, encoding
|
||||||
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
|
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
|
||||||
and (data[2:4] != '\x00\x00'):
|
and (data[2:4] != '\x00\x00'):
|
||||||
encoding = 'utf-16be'
|
encoding = 'utf-16be'
|
||||||
|
@ -306,7 +315,7 @@ class EncodingDetector:
|
||||||
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
|
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
|
||||||
if declared_encoding_match is not None:
|
if declared_encoding_match is not None:
|
||||||
declared_encoding = declared_encoding_match.groups()[0].decode(
|
declared_encoding = declared_encoding_match.groups()[0].decode(
|
||||||
'ascii')
|
'ascii', 'replace')
|
||||||
if declared_encoding:
|
if declared_encoding:
|
||||||
return declared_encoding.lower()
|
return declared_encoding.lower()
|
||||||
return None
|
return None
|
||||||
|
@ -331,13 +340,14 @@ class UnicodeDammit:
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, markup, override_encodings=[],
|
def __init__(self, markup, override_encodings=[],
|
||||||
smart_quotes_to=None, is_html=False):
|
smart_quotes_to=None, is_html=False, exclude_encodings=[]):
|
||||||
self.smart_quotes_to = smart_quotes_to
|
self.smart_quotes_to = smart_quotes_to
|
||||||
self.tried_encodings = []
|
self.tried_encodings = []
|
||||||
self.contains_replacement_characters = False
|
self.contains_replacement_characters = False
|
||||||
self.is_html = is_html
|
self.is_html = is_html
|
||||||
|
|
||||||
self.detector = EncodingDetector(markup, override_encodings, is_html)
|
self.detector = EncodingDetector(
|
||||||
|
markup, override_encodings, is_html, exclude_encodings)
|
||||||
|
|
||||||
# Short-circuit if the data is in Unicode to begin with.
|
# Short-circuit if the data is in Unicode to begin with.
|
||||||
if isinstance(markup, unicode) or markup == '':
|
if isinstance(markup, unicode) or markup == '':
|
||||||
|
|
|
@ -33,12 +33,21 @@ def diagnose(data):
|
||||||
|
|
||||||
if 'lxml' in basic_parsers:
|
if 'lxml' in basic_parsers:
|
||||||
basic_parsers.append(["lxml", "xml"])
|
basic_parsers.append(["lxml", "xml"])
|
||||||
|
try:
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
||||||
|
except ImportError, e:
|
||||||
|
print (
|
||||||
|
"lxml is not installed or couldn't be imported.")
|
||||||
|
|
||||||
|
|
||||||
if 'html5lib' in basic_parsers:
|
if 'html5lib' in basic_parsers:
|
||||||
|
try:
|
||||||
import html5lib
|
import html5lib
|
||||||
print "Found html5lib version %s" % html5lib.__version__
|
print "Found html5lib version %s" % html5lib.__version__
|
||||||
|
except ImportError, e:
|
||||||
|
print (
|
||||||
|
"html5lib is not installed or couldn't be imported.")
|
||||||
|
|
||||||
if hasattr(data, 'read'):
|
if hasattr(data, 'read'):
|
||||||
data = data.read()
|
data = data.read()
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from pdb import set_trace
|
||||||
import collections
|
import collections
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
@ -185,24 +186,40 @@ class PageElement(object):
|
||||||
return self.HTML_FORMATTERS.get(
|
return self.HTML_FORMATTERS.get(
|
||||||
name, HTMLAwareEntitySubstitution.substitute_xml)
|
name, HTMLAwareEntitySubstitution.substitute_xml)
|
||||||
|
|
||||||
def setup(self, parent=None, previous_element=None):
|
def setup(self, parent=None, previous_element=None, next_element=None,
|
||||||
|
previous_sibling=None, next_sibling=None):
|
||||||
"""Sets up the initial relations between this element and
|
"""Sets up the initial relations between this element and
|
||||||
other elements."""
|
other elements."""
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
|
|
||||||
self.previous_element = previous_element
|
self.previous_element = previous_element
|
||||||
if previous_element is not None:
|
if previous_element is not None:
|
||||||
self.previous_element.next_element = self
|
self.previous_element.next_element = self
|
||||||
self.next_element = None
|
|
||||||
self.previous_sibling = None
|
self.next_element = next_element
|
||||||
self.next_sibling = None
|
if self.next_element:
|
||||||
if self.parent is not None and self.parent.contents:
|
self.next_element.previous_element = self
|
||||||
self.previous_sibling = self.parent.contents[-1]
|
|
||||||
|
self.next_sibling = next_sibling
|
||||||
|
if self.next_sibling:
|
||||||
|
self.next_sibling.previous_sibling = self
|
||||||
|
|
||||||
|
if (not previous_sibling
|
||||||
|
and self.parent is not None and self.parent.contents):
|
||||||
|
previous_sibling = self.parent.contents[-1]
|
||||||
|
|
||||||
|
self.previous_sibling = previous_sibling
|
||||||
|
if previous_sibling:
|
||||||
self.previous_sibling.next_sibling = self
|
self.previous_sibling.next_sibling = self
|
||||||
|
|
||||||
nextSibling = _alias("next_sibling") # BS3
|
nextSibling = _alias("next_sibling") # BS3
|
||||||
previousSibling = _alias("previous_sibling") # BS3
|
previousSibling = _alias("previous_sibling") # BS3
|
||||||
|
|
||||||
def replace_with(self, replace_with):
|
def replace_with(self, replace_with):
|
||||||
|
if not self.parent:
|
||||||
|
raise ValueError(
|
||||||
|
"Cannot replace one element with another when the"
|
||||||
|
"element to be replaced is not part of a tree.")
|
||||||
if replace_with is self:
|
if replace_with is self:
|
||||||
return
|
return
|
||||||
if replace_with is self.parent:
|
if replace_with is self.parent:
|
||||||
|
@ -216,6 +233,10 @@ class PageElement(object):
|
||||||
|
|
||||||
def unwrap(self):
|
def unwrap(self):
|
||||||
my_parent = self.parent
|
my_parent = self.parent
|
||||||
|
if not self.parent:
|
||||||
|
raise ValueError(
|
||||||
|
"Cannot replace an element with its contents when that"
|
||||||
|
"element is not part of a tree.")
|
||||||
my_index = self.parent.index(self)
|
my_index = self.parent.index(self)
|
||||||
self.extract()
|
self.extract()
|
||||||
for child in reversed(self.contents[:]):
|
for child in reversed(self.contents[:]):
|
||||||
|
@ -240,17 +261,20 @@ class PageElement(object):
|
||||||
last_child = self._last_descendant()
|
last_child = self._last_descendant()
|
||||||
next_element = last_child.next_element
|
next_element = last_child.next_element
|
||||||
|
|
||||||
if self.previous_element is not None:
|
if (self.previous_element is not None and
|
||||||
|
self.previous_element != next_element):
|
||||||
self.previous_element.next_element = next_element
|
self.previous_element.next_element = next_element
|
||||||
if next_element is not None:
|
if next_element is not None and next_element != self.previous_element:
|
||||||
next_element.previous_element = self.previous_element
|
next_element.previous_element = self.previous_element
|
||||||
self.previous_element = None
|
self.previous_element = None
|
||||||
last_child.next_element = None
|
last_child.next_element = None
|
||||||
|
|
||||||
self.parent = None
|
self.parent = None
|
||||||
if self.previous_sibling is not None:
|
if (self.previous_sibling is not None
|
||||||
|
and self.previous_sibling != self.next_sibling):
|
||||||
self.previous_sibling.next_sibling = self.next_sibling
|
self.previous_sibling.next_sibling = self.next_sibling
|
||||||
if self.next_sibling is not None:
|
if (self.next_sibling is not None
|
||||||
|
and self.next_sibling != self.previous_sibling):
|
||||||
self.next_sibling.previous_sibling = self.previous_sibling
|
self.next_sibling.previous_sibling = self.previous_sibling
|
||||||
self.previous_sibling = self.next_sibling = None
|
self.previous_sibling = self.next_sibling = None
|
||||||
return self
|
return self
|
||||||
|
@ -478,6 +502,10 @@ class PageElement(object):
|
||||||
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
|
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
|
||||||
"Iterates over a generator looking for things that match."
|
"Iterates over a generator looking for things that match."
|
||||||
|
|
||||||
|
if text is None and 'string' in kwargs:
|
||||||
|
text = kwargs['string']
|
||||||
|
del kwargs['string']
|
||||||
|
|
||||||
if isinstance(name, SoupStrainer):
|
if isinstance(name, SoupStrainer):
|
||||||
strainer = name
|
strainer = name
|
||||||
else:
|
else:
|
||||||
|
@ -558,7 +586,7 @@ class PageElement(object):
|
||||||
# | Attribute
|
# | Attribute
|
||||||
# Tag
|
# Tag
|
||||||
attribselect_re = re.compile(
|
attribselect_re = re.compile(
|
||||||
r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
|
r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
|
||||||
r'=?"?(?P<value>[^\]"]*)"?\]$'
|
r'=?"?(?P<value>[^\]"]*)"?\]$'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -654,11 +682,17 @@ class NavigableString(unicode, PageElement):
|
||||||
how to handle non-ASCII characters.
|
how to handle non-ASCII characters.
|
||||||
"""
|
"""
|
||||||
if isinstance(value, unicode):
|
if isinstance(value, unicode):
|
||||||
return unicode.__new__(cls, value)
|
u = unicode.__new__(cls, value)
|
||||||
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
|
else:
|
||||||
|
u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
|
||||||
|
u.setup()
|
||||||
|
return u
|
||||||
|
|
||||||
def __copy__(self):
|
def __copy__(self):
|
||||||
return self
|
"""A copy of a NavigableString has the same contents and class
|
||||||
|
as the original, but it is not connected to the parse tree.
|
||||||
|
"""
|
||||||
|
return type(self)(self)
|
||||||
|
|
||||||
def __getnewargs__(self):
|
def __getnewargs__(self):
|
||||||
return (unicode(self),)
|
return (unicode(self),)
|
||||||
|
@ -759,11 +793,14 @@ class Tag(PageElement):
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
if attrs is None:
|
if attrs is None:
|
||||||
attrs = {}
|
attrs = {}
|
||||||
elif attrs and builder.cdata_list_attributes:
|
elif attrs:
|
||||||
|
if builder is not None and builder.cdata_list_attributes:
|
||||||
attrs = builder._replace_cdata_list_attribute_values(
|
attrs = builder._replace_cdata_list_attribute_values(
|
||||||
self.name, attrs)
|
self.name, attrs)
|
||||||
else:
|
else:
|
||||||
attrs = dict(attrs)
|
attrs = dict(attrs)
|
||||||
|
else:
|
||||||
|
attrs = dict(attrs)
|
||||||
self.attrs = attrs
|
self.attrs = attrs
|
||||||
self.contents = []
|
self.contents = []
|
||||||
self.setup(parent, previous)
|
self.setup(parent, previous)
|
||||||
|
@ -778,6 +815,18 @@ class Tag(PageElement):
|
||||||
|
|
||||||
parserClass = _alias("parser_class") # BS3
|
parserClass = _alias("parser_class") # BS3
|
||||||
|
|
||||||
|
def __copy__(self):
|
||||||
|
"""A copy of a Tag is a new Tag, unconnected to the parse tree.
|
||||||
|
Its contents are a copy of the old Tag's contents.
|
||||||
|
"""
|
||||||
|
clone = type(self)(None, self.builder, self.name, self.namespace,
|
||||||
|
self.nsprefix, self.attrs)
|
||||||
|
for attr in ('can_be_empty_element', 'hidden'):
|
||||||
|
setattr(clone, attr, getattr(self, attr))
|
||||||
|
for child in self.contents:
|
||||||
|
clone.append(child.__copy__())
|
||||||
|
return clone
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_empty_element(self):
|
def is_empty_element(self):
|
||||||
"""Is this tag an empty-element tag? (aka a self-closing tag)
|
"""Is this tag an empty-element tag? (aka a self-closing tag)
|
||||||
|
@ -971,14 +1020,24 @@ class Tag(PageElement):
|
||||||
as defined in __eq__."""
|
as defined in __eq__."""
|
||||||
return not self == other
|
return not self == other
|
||||||
|
|
||||||
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
|
def __repr__(self, encoding="unicode-escape"):
|
||||||
"""Renders this tag as a string."""
|
"""Renders this tag as a string."""
|
||||||
|
if PY3K:
|
||||||
|
# "The return value must be a string object", i.e. Unicode
|
||||||
|
return self.decode()
|
||||||
|
else:
|
||||||
|
# "The return value must be a string object", i.e. a bytestring.
|
||||||
|
# By convention, the return value of __repr__ should also be
|
||||||
|
# an ASCII string.
|
||||||
return self.encode(encoding)
|
return self.encode(encoding)
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
return self.decode()
|
return self.decode()
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
|
if PY3K:
|
||||||
|
return self.decode()
|
||||||
|
else:
|
||||||
return self.encode()
|
return self.encode()
|
||||||
|
|
||||||
if PY3K:
|
if PY3K:
|
||||||
|
@ -1103,12 +1162,18 @@ class Tag(PageElement):
|
||||||
formatter="minimal"):
|
formatter="minimal"):
|
||||||
"""Renders the contents of this tag as a Unicode string.
|
"""Renders the contents of this tag as a Unicode string.
|
||||||
|
|
||||||
|
:param indent_level: Each line of the rendering will be
|
||||||
|
indented this many spaces.
|
||||||
|
|
||||||
:param eventual_encoding: The tag is destined to be
|
:param eventual_encoding: The tag is destined to be
|
||||||
encoded into this encoding. This method is _not_
|
encoded into this encoding. This method is _not_
|
||||||
responsible for performing that encoding. This information
|
responsible for performing that encoding. This information
|
||||||
is passed in so that it can be substituted in if the
|
is passed in so that it can be substituted in if the
|
||||||
document contains a <META> tag that mentions the document's
|
document contains a <META> tag that mentions the document's
|
||||||
encoding.
|
encoding.
|
||||||
|
|
||||||
|
:param formatter: The output formatter responsible for converting
|
||||||
|
entities to Unicode characters.
|
||||||
"""
|
"""
|
||||||
# First off, turn a string formatter into a function. This
|
# First off, turn a string formatter into a function. This
|
||||||
# will stop the lookup from happening over and over again.
|
# will stop the lookup from happening over and over again.
|
||||||
|
@ -1137,7 +1202,17 @@ class Tag(PageElement):
|
||||||
def encode_contents(
|
def encode_contents(
|
||||||
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
|
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
|
||||||
formatter="minimal"):
|
formatter="minimal"):
|
||||||
"""Renders the contents of this tag as a bytestring."""
|
"""Renders the contents of this tag as a bytestring.
|
||||||
|
|
||||||
|
:param indent_level: Each line of the rendering will be
|
||||||
|
indented this many spaces.
|
||||||
|
|
||||||
|
:param eventual_encoding: The bytestring will be in this encoding.
|
||||||
|
|
||||||
|
:param formatter: The output formatter responsible for converting
|
||||||
|
entities to Unicode characters.
|
||||||
|
"""
|
||||||
|
|
||||||
contents = self.decode_contents(indent_level, encoding, formatter)
|
contents = self.decode_contents(indent_level, encoding, formatter)
|
||||||
return contents.encode(encoding)
|
return contents.encode(encoding)
|
||||||
|
|
||||||
|
@ -1201,7 +1276,14 @@ class Tag(PageElement):
|
||||||
|
|
||||||
_selector_combinators = ['>', '+', '~']
|
_selector_combinators = ['>', '+', '~']
|
||||||
_select_debug = False
|
_select_debug = False
|
||||||
def select(self, selector, _candidate_generator=None):
|
def select_one(self, selector):
|
||||||
|
"""Perform a CSS selection operation on the current element."""
|
||||||
|
value = self.select(selector, limit=1)
|
||||||
|
if value:
|
||||||
|
return value[0]
|
||||||
|
return None
|
||||||
|
|
||||||
|
def select(self, selector, _candidate_generator=None, limit=None):
|
||||||
"""Perform a CSS selection operation on the current element."""
|
"""Perform a CSS selection operation on the current element."""
|
||||||
|
|
||||||
# Remove whitespace directly after the grouping operator ','
|
# Remove whitespace directly after the grouping operator ','
|
||||||
|
@ -1272,7 +1354,10 @@ class Tag(PageElement):
|
||||||
"A pseudo-class must be prefixed with a tag name.")
|
"A pseudo-class must be prefixed with a tag name.")
|
||||||
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
|
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
|
||||||
found = []
|
found = []
|
||||||
if pseudo_attributes is not None:
|
if pseudo_attributes is None:
|
||||||
|
pseudo_type = pseudo
|
||||||
|
pseudo_value = None
|
||||||
|
else:
|
||||||
pseudo_type, pseudo_value = pseudo_attributes.groups()
|
pseudo_type, pseudo_value = pseudo_attributes.groups()
|
||||||
if pseudo_type == 'nth-of-type':
|
if pseudo_type == 'nth-of-type':
|
||||||
try:
|
try:
|
||||||
|
@ -1376,6 +1461,7 @@ class Tag(PageElement):
|
||||||
else:
|
else:
|
||||||
_use_candidate_generator = _candidate_generator
|
_use_candidate_generator = _candidate_generator
|
||||||
|
|
||||||
|
count = 0
|
||||||
for tag in current_context:
|
for tag in current_context:
|
||||||
if self._select_debug:
|
if self._select_debug:
|
||||||
print " Running candidate generator on %s %s" % (
|
print " Running candidate generator on %s %s" % (
|
||||||
|
@ -1400,6 +1486,8 @@ class Tag(PageElement):
|
||||||
# don't include it in the context more than once.
|
# don't include it in the context more than once.
|
||||||
new_context.append(candidate)
|
new_context.append(candidate)
|
||||||
new_context_ids.add(id(candidate))
|
new_context_ids.add(id(candidate))
|
||||||
|
if limit and len(new_context) >= limit:
|
||||||
|
break
|
||||||
elif self._select_debug:
|
elif self._select_debug:
|
||||||
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
|
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue