Merge pull request #612 from JackDandy/feature/UpdateBSoup

Update Beautiful Soup 4.4.0 (r390) to 4.4.0 (r397).
This commit is contained in:
JackDandy 2016-01-12 02:58:37 +00:00
commit 9ecc98fff0
5 changed files with 204 additions and 184 deletions

View file

@ -6,6 +6,7 @@
* Update change to suppress reporting of Tornado exception error 1 to updated package (ref:hacks.txt)
* Change API response header for JSON content type and the return of JSONP data
* Remove redundant MultipartPostHandler
* Update Beautiful Soup 4.4.0 (r390) to 4.4.0 (r397)
### 0.11.0 (2016-01-10 22:30:00 UTC)

View file

@ -120,6 +120,9 @@ class AttrList(object):
if (name in list_attr['*']
or (self.element.name in list_attr
and name in list_attr[self.element.name])):
# A node that is being cloned may have already undergone
# this procedure.
if not isinstance(value, list):
value = whitespace_re.split(value)
self.element[name] = value
def items(self):

View file

@ -6,6 +6,7 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
__license__ = "MIT"
from pdb import set_trace
import codecs

View file

@ -1,4 +1,7 @@
"""Diagnostic functions, mainly for use when doing tech support."""
__license__ = "MIT"
import cProfile
from StringIO import StringIO
from HTMLParser import HTMLParser

View file

@ -1,3 +1,5 @@
__license__ = "MIT"
from pdb import set_trace
import collections
import re
@ -262,19 +264,19 @@ class PageElement(object):
next_element = last_child.next_element
if (self.previous_element is not None and
self.previous_element != next_element):
self.previous_element is not next_element):
self.previous_element.next_element = next_element
if next_element is not None and next_element != self.previous_element:
if next_element is not None and next_element is not self.previous_element:
next_element.previous_element = self.previous_element
self.previous_element = None
last_child.next_element = None
self.parent = None
if (self.previous_sibling is not None
and self.previous_sibling != self.next_sibling):
and self.previous_sibling is not self.next_sibling):
self.previous_sibling.next_sibling = self.next_sibling
if (self.next_sibling is not None
and self.next_sibling != self.previous_sibling):
and self.next_sibling is not self.previous_sibling):
self.next_sibling.previous_sibling = self.previous_sibling
self.previous_sibling = self.next_sibling = None
return self
@ -287,13 +289,15 @@ class PageElement(object):
last_child = self
while isinstance(last_child, Tag) and last_child.contents:
last_child = last_child.contents[-1]
if not accept_self and last_child == self:
if not accept_self and last_child is self:
last_child = None
return last_child
# BS3: Not part of the API!
_lastRecursiveChild = _last_descendant
def insert(self, position, new_child):
if new_child is None:
raise ValueError("Cannot insert None into a tag.")
if new_child is self:
raise ValueError("Cannot insert a tag into itself.")
if (isinstance(new_child, basestring)
@ -750,8 +754,8 @@ class Comment(PreformattedString):
class Declaration(PreformattedString):
PREFIX = u'<!'
SUFFIX = u'!>'
PREFIX = u'<?'
SUFFIX = u'?>'
class Doctype(PreformattedString):
@ -1286,9 +1290,23 @@ class Tag(PageElement):
def select(self, selector, _candidate_generator=None, limit=None):
"""Perform a CSS selection operation on the current element."""
# Remove whitespace directly after the grouping operator ','
# then split into tokens.
tokens = re.sub(',[\s]*',',', selector).split()
# Handle grouping selectors if ',' exists, ie: p,a
if ',' in selector:
context = []
for partial_selector in selector.split(','):
partial_selector = partial_selector.strip()
if partial_selector == '':
raise ValueError('Invalid group selection syntax: %s' % selector)
candidates = self.select(partial_selector, limit=limit)
for candidate in candidates:
if candidate not in context:
context.append(candidate)
if limit and len(context) >= limit:
break
return context
tokens = selector.split()
current_context = [self]
if tokens[-1] in self._selector_combinators:
@ -1298,22 +1316,16 @@ class Tag(PageElement):
if self._select_debug:
print 'Running CSS selector "%s"' % selector
for index, token_group in enumerate(tokens):
for index, token in enumerate(tokens):
new_context = []
new_context_ids = set([])
# Grouping selectors, ie: p,a
grouped_tokens = token_group.split(',')
if '' in grouped_tokens:
raise ValueError('Invalid group selection syntax: %s' % token_group)
if tokens[index-1] in self._selector_combinators:
# This token was consumed by the previous combinator. Skip it.
if self._select_debug:
print ' Token was consumed by the previous combinator.'
continue
for token in grouped_tokens:
if self._select_debug:
print ' Considering token "%s"' % token
recursive_candidate_generator = None