2014-07-22 04:26:58 +00:00
|
|
|
from bs4 import BeautifulSoup
|
2015-09-18 00:06:34 +00:00
|
|
|
import re
|
2014-07-22 04:26:58 +00:00
|
|
|
|
2015-03-01 02:21:31 +00:00
|
|
|
|
2014-07-22 04:26:58 +00:00
|
|
|
class BS4Parser:
    """Context manager that builds a BeautifulSoup tree and clears it on exit.

    Positional and keyword arguments are forwarded to ``BeautifulSoup`` after
    the adjustments below.

    Keyword arguments handled here:
        features: when given as a list, it is reduced to the first entry that
            is a recognised parser name (see ``_select_feature``). If no entry
            is recognised, the argument is omitted and bs4 chooses its own
            default parser.
        tag: element name used together with ``attr`` to trim the markup
            before parsing. Defaults to ``'table'``. Not forwarded to bs4.
        attr: text that must appear inside the opening ``tag`` for the markup
            to be trimmed to that element. It is interpolated into a regular
            expression unescaped. Defaults to ``''`` (no trimming). Not
            forwarded to bs4.

    Usage::

        with BS4Parser(html, features=['html5lib', 'permissive']) as soup:
            ...
    """

    # Parser names accepted when ``features`` is passed as a list.
    _KNOWN_FEATURES = ('html5lib', 'html.parser', 'html', 'lxml', 'xml')

    def __init__(self, *args, **kwargs):
        # list type param of "feature" arg is not currently correctly tested by bs4 (r353)
        # so for now, adjust param to provide possible values until the issue is addressed
        kwargs_new = {}
        for k, v in kwargs.items():
            if 'features' in k and isinstance(v, list):
                v = self._select_feature(v)
                if v is None:
                    # Nothing usable in the list: leave the argument out
                    # rather than failing with IndexError.
                    continue
            kwargs_new[k] = v

        # 'tag' and 'attr' are consumed here; a missing or falsy value falls
        # back to the default.
        tag = kwargs_new.pop('tag', None) or 'table'
        attr = kwargs_new.pop('attr', None) or ''

        # Trimming only applies when the markup was passed positionally.
        if attr and args:
            args = (self._isolate_tag(args[0], tag, attr),) + args[1:]

        self.soup = BeautifulSoup(*args, **kwargs_new)

    @staticmethod
    def _select_feature(candidates):
        """Return the first recognised parser name in candidates, else None."""
        for item in candidates:
            if item in BS4Parser._KNOWN_FEATURES:
                return item
        return None

    @staticmethod
    def _isolate_tag(markup, tag, attr):
        """Return markup reduced to the matching tag element in a bare HTML page.

        If no ``<tag ... attr ...>...</tag>`` span is found, the markup is
        returned unchanged apart from stripping surrounding whitespace.
        """
        pattern = r'(?is).*(<%(tag)s[^>]+%(attr)s[^>]*>.*</%(tag)s>).*' % {'tag': tag, 'attr': attr}
        return re.sub(pattern, r'<html><head></head><body>\1</body></html>', markup).strip()

    def __enter__(self):
        """Return the parsed soup for use inside the ``with`` block."""
        return self.soup

    def __exit__(self, exc_ty, exc_val, tb):
        """Clear the soup tree and drop the reference to it.

        Returns None, so an exception raised inside the ``with`` block
        propagates to the caller.
        """
        if self.soup is not None:
            self.soup.clear(True)
            self.soup = None
|