From 7c0c75fc9958927b180f4965b90267a6d315d67d Mon Sep 17 00:00:00 2001 From: JackDandy Date: Mon, 11 Jan 2016 23:46:18 +0000 Subject: [PATCH] Update Beautiful Soup 4.4.0 (r390) to 4.4.0 (r397). --- CHANGES.md | 1 + lib/bs4/builder/_html5lib.py | 5 +- lib/bs4/dammit.py | 1 + lib/bs4/diagnose.py | 3 + lib/bs4/element.py | 378 ++++++++++++++++++----------------- 5 files changed, 204 insertions(+), 184 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 215ee26f..5b2c9731 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,7 @@ * Update change to suppress reporting of Tornado exception error 1 to updated package (ref:hacks.txt) * Change API response header for JSON content type and the return of JSONP data * Remove redundant MultipartPostHandler +* Update Beautiful Soup 4.4.0 (r390) to 4.4.0 (r397) ### 0.11.0 (2016-01-10 22:30:00 UTC) diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py index ab5793c1..8725a658 100644 --- a/lib/bs4/builder/_html5lib.py +++ b/lib/bs4/builder/_html5lib.py @@ -120,7 +120,10 @@ class AttrList(object): if (name in list_attr['*'] or (self.element.name in list_attr and name in list_attr[self.element.name])): - value = whitespace_re.split(value) + # A node that is being cloned may have already undergone + # this procedure. + if not isinstance(value, list): + value = whitespace_re.split(value) self.element[name] = value def items(self): return list(self.attrs.items()) diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py index 317ad6d7..636f81b4 100644 --- a/lib/bs4/dammit.py +++ b/lib/bs4/dammit.py @@ -6,6 +6,7 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal Feed Parser. It works best on XML and HTML, but it does not rewrite the XML or HTML to reflect a new encoding; that's the tree builder's job. """ +__license__ = "MIT" from pdb import set_trace import codecs diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py index 1b719830..c04d23c3 100644 --- a/lib/bs4/diagnose.py +++ b/lib/bs4/diagnose.py @@ -1,4 +1,7 @@ """Diagnostic functions, mainly for use when doing tech support.""" + +__license__ = "MIT" + import cProfile from StringIO import StringIO from HTMLParser import HTMLParser diff --git a/lib/bs4/element.py b/lib/bs4/element.py index c70ad5a0..ecf2b280 100644 --- a/lib/bs4/element.py +++ b/lib/bs4/element.py @@ -1,3 +1,5 @@ +__license__ = "MIT" + from pdb import set_trace import collections import re @@ -262,19 +264,19 @@ class PageElement(object): next_element = last_child.next_element if (self.previous_element is not None and - self.previous_element != next_element): + self.previous_element is not next_element): self.previous_element.next_element = next_element - if next_element is not None and next_element != self.previous_element: + if next_element is not None and next_element is not self.previous_element: next_element.previous_element = self.previous_element self.previous_element = None last_child.next_element = None self.parent = None if (self.previous_sibling is not None - and self.previous_sibling != self.next_sibling): + and self.previous_sibling is not self.next_sibling): self.previous_sibling.next_sibling = self.next_sibling if (self.next_sibling is not None - and self.next_sibling != self.previous_sibling): + and self.next_sibling is not self.previous_sibling): self.next_sibling.previous_sibling = self.previous_sibling self.previous_sibling = self.next_sibling = None return self @@ -287,13 +289,15 @@ class PageElement(object): last_child = self while isinstance(last_child, Tag) and last_child.contents: last_child = last_child.contents[-1] - if not accept_self and last_child == self: + if not accept_self and last_child is self: last_child = None return last_child # BS3: Not part of the API! _lastRecursiveChild = _last_descendant def insert(self, position, new_child): + if new_child is None: + raise ValueError("Cannot insert None into a tag.") if new_child is self: raise ValueError("Cannot insert a tag into itself.") if (isinstance(new_child, basestring) @@ -750,8 +754,8 @@ class Comment(PreformattedString): class Declaration(PreformattedString): - PREFIX = u'' + PREFIX = u'' class Doctype(PreformattedString): @@ -1286,9 +1290,23 @@ class Tag(PageElement): def select(self, selector, _candidate_generator=None, limit=None): """Perform a CSS selection operation on the current element.""" - # Remove whitespace directly after the grouping operator ',' - # then split into tokens. - tokens = re.sub(',[\s]*',',', selector).split() + # Handle grouping selectors if ',' exists, ie: p,a + if ',' in selector: + context = [] + for partial_selector in selector.split(','): + partial_selector = partial_selector.strip() + if partial_selector == '': + raise ValueError('Invalid group selection syntax: %s' % selector) + candidates = self.select(partial_selector, limit=limit) + for candidate in candidates: + if candidate not in context: + context.append(candidate) + + if limit and len(context) >= limit: + break + return context + + tokens = selector.split() current_context = [self] if tokens[-1] in self._selector_combinators: @@ -1298,198 +1316,192 @@ class Tag(PageElement): if self._select_debug: print 'Running CSS selector "%s"' % selector - for index, token_group in enumerate(tokens): + for index, token in enumerate(tokens): new_context = [] new_context_ids = set([]) - # Grouping selectors, ie: p,a - grouped_tokens = token_group.split(',') - if '' in grouped_tokens: - raise ValueError('Invalid group selection syntax: %s' % token_group) - if tokens[index-1] in self._selector_combinators: # This token was consumed by the previous combinator. Skip it. if self._select_debug: print ' Token was consumed by the previous combinator.' continue - for token in grouped_tokens: - if self._select_debug: - print ' Considering token "%s"' % token - recursive_candidate_generator = None - tag_name = None + if self._select_debug: + print ' Considering token "%s"' % token + recursive_candidate_generator = None + tag_name = None - # Each operation corresponds to a checker function, a rule - # for determining whether a candidate matches the - # selector. Candidates are generated by the active - # iterator. - checker = None + # Each operation corresponds to a checker function, a rule + # for determining whether a candidate matches the + # selector. Candidates are generated by the active + # iterator. + checker = None - m = self.attribselect_re.match(token) - if m is not None: - # Attribute selector - tag_name, attribute, operator, value = m.groups() - checker = self._attribute_checker(operator, attribute, value) + m = self.attribselect_re.match(token) + if m is not None: + # Attribute selector + tag_name, attribute, operator, value = m.groups() + checker = self._attribute_checker(operator, attribute, value) - elif '#' in token: - # ID selector - tag_name, tag_id = token.split('#', 1) - def id_matches(tag): - return tag.get('id', None) == tag_id - checker = id_matches + elif '#' in token: + # ID selector + tag_name, tag_id = token.split('#', 1) + def id_matches(tag): + return tag.get('id', None) == tag_id + checker = id_matches - elif '.' in token: - # Class selector - tag_name, klass = token.split('.', 1) - classes = set(klass.split('.')) - def classes_match(candidate): - return classes.issubset(candidate.get('class', [])) - checker = classes_match + elif '.' in token: + # Class selector + tag_name, klass = token.split('.', 1) + classes = set(klass.split('.')) + def classes_match(candidate): + return classes.issubset(candidate.get('class', [])) + checker = classes_match - elif ':' in token: - # Pseudo-class - tag_name, pseudo = token.split(':', 1) - if tag_name == '': - raise ValueError( - "A pseudo-class must be prefixed with a tag name.") - pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) - found = [] - if pseudo_attributes is None: - pseudo_type = pseudo - pseudo_value = None - else: - pseudo_type, pseudo_value = pseudo_attributes.groups() - if pseudo_type == 'nth-of-type': - try: - pseudo_value = int(pseudo_value) - except: - raise NotImplementedError( - 'Only numeric values are currently supported for the nth-of-type pseudo-class.') - if pseudo_value < 1: - raise ValueError( - 'nth-of-type pseudo-class value must be at least 1.') - class Counter(object): - def __init__(self, destination): - self.count = 0 - self.destination = destination - - def nth_child_of_type(self, tag): - self.count += 1 - if self.count == self.destination: - return True - if self.count > self.destination: - # Stop the generator that's sending us - # these things. - raise StopIteration() - return False - checker = Counter(pseudo_value).nth_child_of_type - else: - raise NotImplementedError( - 'Only the following pseudo-classes are implemented: nth-of-type.') - - elif token == '*': - # Star selector -- matches everything - pass - elif token == '>': - # Run the next token as a CSS selector against the - # direct children of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.children - elif token == '~': - # Run the next token as a CSS selector against the - # siblings of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.next_siblings - elif token == '+': - # For each tag in the current context, run the next - # token as a CSS selector against the tag's next - # sibling that's a tag. - def next_tag_sibling(tag): - yield tag.find_next_sibling(True) - recursive_candidate_generator = next_tag_sibling - - elif self.tag_name_re.match(token): - # Just a tag name. - tag_name = token - else: + elif ':' in token: + # Pseudo-class + tag_name, pseudo = token.split(':', 1) + if tag_name == '': raise ValueError( - 'Unsupported or invalid CSS selector: "%s"' % token) - if recursive_candidate_generator: - # This happens when the selector looks like "> foo". - # - # The generator calls select() recursively on every - # member of the current context, passing in a different - # candidate generator and a different selector. - # - # In the case of "> foo", the candidate generator is - # one that yields a tag's direct children (">"), and - # the selector is "foo". - next_token = tokens[index+1] - def recursive_select(tag): - if self._select_debug: - print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) - print '-' * 40 - for i in tag.select(next_token, recursive_candidate_generator): - if self._select_debug: - print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) - yield i - if self._select_debug: - print '-' * 40 - _use_candidate_generator = recursive_select - elif _candidate_generator is None: - # By default, a tag's candidates are all of its - # children. If tag_name is defined, only yield tags - # with that name. - if self._select_debug: - if tag_name: - check = "[any]" - else: - check = tag_name - print ' Default candidate generator, tag name="%s"' % check - if self._select_debug: - # This is redundant with later code, but it stops - # a bunch of bogus tags from cluttering up the - # debug log. - def default_candidate_generator(tag): - for child in tag.descendants: - if not isinstance(child, Tag): - continue - if tag_name and not child.name == tag_name: - continue - yield child - _use_candidate_generator = default_candidate_generator - else: - _use_candidate_generator = lambda tag: tag.descendants + "A pseudo-class must be prefixed with a tag name.") + pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) + found = [] + if pseudo_attributes is None: + pseudo_type = pseudo + pseudo_value = None else: - _use_candidate_generator = _candidate_generator + pseudo_type, pseudo_value = pseudo_attributes.groups() + if pseudo_type == 'nth-of-type': + try: + pseudo_value = int(pseudo_value) + except: + raise NotImplementedError( + 'Only numeric values are currently supported for the nth-of-type pseudo-class.') + if pseudo_value < 1: + raise ValueError( + 'nth-of-type pseudo-class value must be at least 1.') + class Counter(object): + def __init__(self, destination): + self.count = 0 + self.destination = destination - count = 0 - for tag in current_context: + def nth_child_of_type(self, tag): + self.count += 1 + if self.count == self.destination: + return True + if self.count > self.destination: + # Stop the generator that's sending us + # these things. + raise StopIteration() + return False + checker = Counter(pseudo_value).nth_child_of_type + else: + raise NotImplementedError( + 'Only the following pseudo-classes are implemented: nth-of-type.') + + elif token == '*': + # Star selector -- matches everything + pass + elif token == '>': + # Run the next token as a CSS selector against the + # direct children of each tag in the current context. + recursive_candidate_generator = lambda tag: tag.children + elif token == '~': + # Run the next token as a CSS selector against the + # siblings of each tag in the current context. + recursive_candidate_generator = lambda tag: tag.next_siblings + elif token == '+': + # For each tag in the current context, run the next + # token as a CSS selector against the tag's next + # sibling that's a tag. + def next_tag_sibling(tag): + yield tag.find_next_sibling(True) + recursive_candidate_generator = next_tag_sibling + + elif self.tag_name_re.match(token): + # Just a tag name. + tag_name = token + else: + raise ValueError( + 'Unsupported or invalid CSS selector: "%s"' % token) + if recursive_candidate_generator: + # This happens when the selector looks like "> foo". + # + # The generator calls select() recursively on every + # member of the current context, passing in a different + # candidate generator and a different selector. + # + # In the case of "> foo", the candidate generator is + # one that yields a tag's direct children (">"), and + # the selector is "foo". + next_token = tokens[index+1] + def recursive_select(tag): if self._select_debug: - print " Running candidate generator on %s %s" % ( - tag.name, repr(tag.attrs)) - for candidate in _use_candidate_generator(tag): - if not isinstance(candidate, Tag): - continue - if tag_name and candidate.name != tag_name: - continue - if checker is not None: - try: - result = checker(candidate) - except StopIteration: - # The checker has decided we should no longer - # run the generator. + print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) + print '-' * 40 + for i in tag.select(next_token, recursive_candidate_generator): + if self._select_debug: + print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) + yield i + if self._select_debug: + print '-' * 40 + _use_candidate_generator = recursive_select + elif _candidate_generator is None: + # By default, a tag's candidates are all of its + # children. If tag_name is defined, only yield tags + # with that name. + if self._select_debug: + if tag_name: + check = "[any]" + else: + check = tag_name + print ' Default candidate generator, tag name="%s"' % check + if self._select_debug: + # This is redundant with later code, but it stops + # a bunch of bogus tags from cluttering up the + # debug log. + def default_candidate_generator(tag): + for child in tag.descendants: + if not isinstance(child, Tag): + continue + if tag_name and not child.name == tag_name: + continue + yield child + _use_candidate_generator = default_candidate_generator + else: + _use_candidate_generator = lambda tag: tag.descendants + else: + _use_candidate_generator = _candidate_generator + + count = 0 + for tag in current_context: + if self._select_debug: + print " Running candidate generator on %s %s" % ( + tag.name, repr(tag.attrs)) + for candidate in _use_candidate_generator(tag): + if not isinstance(candidate, Tag): + continue + if tag_name and candidate.name != tag_name: + continue + if checker is not None: + try: + result = checker(candidate) + except StopIteration: + # The checker has decided we should no longer + # run the generator. + break + if checker is None or result: + if self._select_debug: + print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) + if id(candidate) not in new_context_ids: + # If a tag matches a selector more than once, + # don't include it in the context more than once. + new_context.append(candidate) + new_context_ids.add(id(candidate)) + if limit and len(new_context) >= limit: break - if checker is None or result: - if self._select_debug: - print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) - if id(candidate) not in new_context_ids: - # If a tag matches a selector more than once, - # don't include it in the context more than once. - new_context.append(candidate) - new_context_ids.add(id(candidate)) - if limit and len(new_context) >= limit: - break - elif self._select_debug: - print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) + elif self._select_debug: + print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) current_context = new_context