"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.

Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
# Package metadata.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.4.0"
__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
__license__ = "MIT"

# Public API of this module: only the main class is exported by a
# star-import.
__all__ = ['BeautifulSoup']
import os
import re
import warnings
from . builder import builder_registry , ParserRejectedMarkup
from . dammit import UnicodeDammit
from . element import (
CData ,
Comment ,
DEFAULT_OUTPUT_ENCODING ,
Declaration ,
Doctype ,
NavigableString ,
PageElement ,
ProcessingInstruction ,
ResultSet ,
SoupStrainer ,
Tag ,
)
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
2015-06-14 23:45:29 +00:00
' You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. ' < > ' You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`). '
2014-03-10 05:18:05 +00:00
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    # Name given to the root of the tree, i.e. to this object itself.
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Characters endData() treats as collapsible whitespace.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    # Template for the warning issued when the caller did not name the
    # parser that ended up being used.
    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, or an object with a read() method.
        :param features: A feature name, or a sequence of feature names,
            used to look up a tree builder in the builder registry. Only
            consulted when `builder` is None.
        :param builder: A ready-made tree builder instance to use.
        :param parse_only: Optional strainer; when set, top-level strings
            and tags it does not match are left out of the tree.
        :param from_encoding: Passed to the builder's prepare_markup().
        :param exclude_encodings: Passed to the builder's prepare_markup().
        :param kwargs: Only legacy Beautiful Soup 3 argument names are
            accepted here; each one triggers a warning.
        :raises TypeError: on any other unexpected keyword argument.
        :raises FeatureNotFound: if no registered tree builder has the
            requested features.
        """
        # Beautiful Soup 3 arguments that are accepted but ignored. Every
        # one of them (including convertEntities) is removed from kwargs
        # so that it does not trip the unexpected-keyword check below.
        ignored_bs3_arguments = (
            ('convertEntities',
             "Entities are always converted to Unicode characters."),
            ('markupMassage',
             "The tree builder is responsible for any necessary "
             "markup massage."),
            ('smartQuotesTo',
             "Smart quotes are always converted to Unicode characters."),
            ('selfClosingTags',
             "The tree builder is responsible for understanding "
             "self-closing tags."),
            ('isHTML',
             "Suggest you use features='lxml' for HTML and "
             "features='lxml-xml' for XML."),
        )
        for old_name, advice in ignored_bs3_arguments:
            if old_name in kwargs:
                del kwargs[old_name]
                warnings.warn(
                    "BS4 does not respect the %s argument to the "
                    "BeautifulSoup constructor. %s" % (old_name, advice))

        def deprecated_argument(old_name, new_name):
            """Pop a renamed argument out of kwargs, warning if present."""
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if len(kwargs) > 0:
            # list() keeps this working where keys() is a view, not a list.
            arg = list(kwargs.keys()).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            # Remember what the caller literally asked for, so we can tell
            # below whether the parser was named explicitly.
            original_features = features
            if isinstance(features, basestring):
                features = [features]
            if not features:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
                markup_type = "XML" if builder.is_xml else "HTML"
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, unicode)
                    and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass

            # The warnings show a UTF-8 encoded rendering of the markup,
            # but the markup handed to the parser is left exactly as the
            # caller supplied it.
            if isinstance(markup, unicode):
                shown_markup = markup.encode("utf8")
            else:
                shown_markup = markup

            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % shown_markup)
            if markup[:5] == "http:" or markup[:6] == "https:":
                # TODO: This is ugly but I couldn't get it to work in
                # Python 3 otherwise.
                if ((isinstance(markup, bytes) and not b' ' in markup)
                        or (isinstance(markup, unicode)
                            and not u' ' in markup)):
                    warnings.warn(
                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % shown_markup)

        # The builder may offer several candidate decodings of the markup;
        # use the first one the parser does not reject.
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                self.builder.prepare_markup(
                    markup, from_encoding,
                    exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def __copy__(self):
        """Return a new soup built by re-parsing this one's encoded output.

        The same builder object is reused for the copy.
        """
        # The tree is serialized as UTF-8 and the copy is told so, which
        # avoids a round of encoding detection on bytes we just produced.
        copy = type(self)(
            self.encode("utf-8"), builder=self.builder,
            from_encoding="utf-8")
        # UTF-8 is only the transport encoding used for the copy; report
        # the encoding recorded for the original document instead.
        copy.original_encoding = self.original_encoding
        return copy

    def __getstate__(self):
        """Return the state to pickle, minus an unpicklable builder."""
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
            del d['builder']
        return d

    def _feed(self):
        """Run the builder over self.markup and close the resulting tree."""
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)

        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Return this object (and its builder) to the pre-parse state."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        # Start every parse attempt without a "previous element".
        # __init__ calls reset() once per candidate markup, and a leftover
        # value from a rejected attempt would link the new tree to nodes
        # of the discarded one.
        self._most_recent_element = None
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        return subclass(s)

    def insert_before(self, successor):
        """Unsupported on the soup object; always raises NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Unsupported on the soup object; always raises NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the innermost open tag and return the new current tag."""
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Make `tag` the current tag, as a child of the previous one."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Turn the text collected by handle_data() into one string node.

        :param containerClass: Class instantiated with the collected text.
        """
        if not self.current_data:
            return

        current_data = u''.join(self.current_data)
        # If whitespace is not preserved, and this string contains
        # nothing but ASCII spaces, replace it with a single space
        # or newline.
        if not self.preserve_whitespace_tag_stack:
            if all(char in self.ASCII_SPACES for char in current_data):
                if '\n' in current_data:
                    current_data = '\n'
                else:
                    current_data = ' '

        # Reset the data collector.
        self.current_data = []

        # Should we add this string to the tree at all?
        if self.parse_only and len(self.tagStack) <= 1 and \
                (not self.parse_only.text or
                 not self.parse_only.search(current_data)):
            return

        o = containerClass(current_data)
        self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        :param o: The node to add.
        :param parent: Tag to append `o` to; defaults to the current tag.
        :param most_recent_element: Node to use as `o`'s previous element;
            defaults to the last node added to the tree.
        :raises ValueError: if `o` cannot be found in `parent` after being
            appended to it.
        """
        parent = parent or self.currentTag
        previous_element = most_recent_element or self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # A tag may arrive already wired into a tree; keep its links.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if not previous_element:
                previous_element = o.previous_element

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        if parent.next_sibling:
            # This node is being inserted into an element that has
            # already been parsed. Deal with any dangling references.
            #
            # Locate `o` by identity, searching from the end where it was
            # just appended. An equality-based contents.index(o) could
            # return an earlier sibling that merely compares equal to `o`
            # and wire the links to the wrong neighbours.
            index = len(parent.contents) - 1
            while index >= 0 and parent.contents[index] is not o:
                index -= 1
            if index < 0:
                raise ValueError(
                    "Error building tree: supposedly %r was inserted "
                    "into %r after the fact, but I don't see it!" % (
                        o, parent))
            if index == 0:
                previous_element = parent
                previous_sibling = None
            else:
                previous_element = previous_sibling = parent.contents[index - 1]
            if index == len(parent.contents) - 1:
                next_element = parent.next_sibling
                next_sibling = None
            else:
                next_element = next_sibling = parent.contents[index + 1]

            o.previous_element = previous_element
            if previous_element:
                previous_element.next_element = o
            o.next_element = next_element
            if next_element:
                next_element.previous_element = o
            o.next_sibling = next_sibling
            if next_sibling:
                next_sibling.previous_sibling = o
            o.previous_sibling = previous_sibling
            if previous_sibling:
                previous_sibling.next_sibling = o

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the value of the last popTag() call made, or None if
        nothing was popped."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk from the innermost tag outwards; index 0 (the soup itself)
        # is never visited.
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
                and (self.parse_only.text
                     or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        if self._most_recent_element:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Finish pending text, then close the most recent matching tag."""
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Buffer a chunk of text until the next endData() call."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        # The parent class pretty-prints only when given an indent level.
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
# Short aliases for the BeautifulSoup class, to make the import easier
# to type: 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Build a soup with features forced to 'xml', and warn.

        Arguments are otherwise passed through to BeautifulSoup.__init__;
        any `features` keyword supplied by the caller is overwritten.
        """
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
class StopParsing(Exception):
    """Exception type declared by this module; nothing here raises it.

    NOTE(review): presumably meant for halting a parse early -- confirm
    against the tree builders that use it.
    """
class FeatureNotFound(ValueError):
    """Raised when no tree builder offers the requested features.

    Being a ValueError subclass, it is also caught by handlers written
    for ValueError.
    """
# By default, act as an HTML pretty-printer: read markup from standard
# input and write the prettified document to standard output.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    # With a single argument, the parenthesized form prints the same text
    # whether print is a statement (Python 2) or a function (Python 3).
    print(soup.prettify())