Created a context manager wrapper for BeautifulSoup4 so that parsed tags and their context are cleaned up automatically on exit from a `with` statement.

Fixed issues with torrent providers returning no results.
This commit is contained in:
echel0n 2014-07-21 21:26:58 -07:00
parent 77feb5a74c
commit a317ff61c2
12 changed files with 445 additions and 469 deletions

13
sickbeard/bs4_parser.py Normal file
View file

@ -0,0 +1,13 @@
import sickbeard
from bs4 import BeautifulSoup
class BS4Parser(object):
    """Context manager that owns a BeautifulSoup tree and tears it down on exit.

    Usage::

        with BS4Parser(data, "html.parser") as soup:
            rows = soup.find_all('tr')
            # ... use ``rows`` here ...

    Every positional and keyword argument is forwarded unchanged to the
    ``BeautifulSoup`` constructor, and the object bound by ``as`` is the
    soup itself, not this wrapper.

    NOTE(review): leaving the block calls ``soup.clear(True)``, which per
    the bs4 documentation decomposes the children of the tree. Tags pulled
    out of the soup should therefore be consumed inside the ``with`` body;
    confirm that callers do not rely on them afterwards.
    """

    def __init__(self, *args, **kwargs):
        # Parsing happens eagerly, so constructor errors surface at the
        # ``with`` line rather than inside the block.
        self.soup = BeautifulSoup(*args, **kwargs)

    def __enter__(self):
        """Return the parsed soup for use inside the ``with`` block."""
        return self.soup

    def __exit__(self, exc_ty, exc_val, tb):
        """Clear the soup and drop the reference to it.

        Returns ``None`` so that an exception raised inside the ``with``
        body is never suppressed.
        """
        # Detach first so that a repeated exit (or an exit after the soup was
        # already released) is a harmless no-op instead of an AttributeError
        # that would replace the exception currently propagating.
        soup, self.soup = self.soup, None
        if soup is not None:
            soup.clear(True)

View file

@ -31,7 +31,6 @@ import httplib
import urlparse
import uuid
import base64
import string
import zipfile
from lib import requests
@ -1241,7 +1240,7 @@ def mapIndexersToShow(showObj):
return mapped
def touchFile(self, fname, atime=None):
def touchFile(fname, atime=None):
if None != atime:
try:
with file(fname, 'a'):

View file

@ -22,7 +22,7 @@ import datetime
import urlparse
import sickbeard
import generic
from sickbeard.common import Quality, cpu_presets
from sickbeard.common import Quality
from sickbeard import logger
from sickbeard import tvcache
from sickbeard import db
@ -33,7 +33,7 @@ from sickbeard.exceptions import ex
from sickbeard import clients
from lib import requests
from lib.requests import exceptions
from bs4 import BeautifulSoup
from sickbeard.bs4_parser import BS4Parser
from lib.unidecode import unidecode
from sickbeard.helpers import sanitizeSceneName
@ -168,13 +168,10 @@ class BitSoupProvider(generic.TorrentProvider):
continue
try:
html = BeautifulSoup(data, "html.parser")
with BS4Parser(data, "html.parser") as html:
torrent_table = html.find('table', attrs={'class': 'koptekst'})
torrent_rows = torrent_table.find_all('tr') if torrent_table else []
html.clear(True)
#Continue only if one Release is found
if len(torrent_rows) < 2:
logger.log(u"The Data returned from " + self.name + " do not contains any torrent",

View file

@ -33,7 +33,7 @@ from sickbeard.exceptions import ex
from sickbeard import clients
from lib import requests
from lib.requests import exceptions
from bs4 import BeautifulSoup
from sickbeard.bs4_parser import BS4Parser
from lib.unidecode import unidecode
from sickbeard.helpers import sanitizeSceneName
@ -175,7 +175,6 @@ class FreshOnTVProvider(generic.TorrentProvider):
if not self._doLogin():
return []
for mode in search_params.keys():
for search_string in search_params[mode]:
@ -193,13 +192,10 @@ class FreshOnTVProvider(generic.TorrentProvider):
continue
try:
html = BeautifulSoup(data, features=["html5lib", "permissive"])
with BS4Parser(data, features=["html5lib", "permissive"]) as html:
torrent_table = html.find('table', attrs={'class': 'frame'})
torrent_rows = torrent_table.findChildren('tr') if torrent_table else []
html.clear(True)
#Continue only if one Release is found
if len(torrent_rows) < 2:
logger.log(u"The Data returned from " + self.name + " do not contains any torrent",

View file

@ -34,7 +34,7 @@ from sickbeard.exceptions import ex
from sickbeard import clients
from lib import requests
from lib.requests import exceptions
from bs4 import BeautifulSoup
from sickbeard.bs4_parser import BS4Parser
from lib.unidecode import unidecode
from sickbeard.helpers import sanitizeSceneName
@ -196,13 +196,10 @@ class HDTorrentsProvider(generic.TorrentProvider):
data = split_data[2]
try:
html = BeautifulSoup(data, features=["html5lib", "permissive"])
with BS4Parser(data, features=["html5lib", "permissive"]) as html:
#Get first entry in table
entries = html.find_all('td', attrs={'align': 'center'})
html.clear(True)
if not entries:
logger.log(u"The Data returned from " + self.name + " do not contains any torrent",
logger.DEBUG)

View file

@ -33,7 +33,7 @@ from sickbeard.exceptions import ex
from sickbeard import clients
from lib import requests
from lib.requests import exceptions
from bs4 import BeautifulSoup
from sickbeard.bs4_parser import BS4Parser
from lib.unidecode import unidecode
from sickbeard.helpers import sanitizeSceneName
from sickbeard.show_name_helpers import allPossibleShowNames
@ -167,8 +167,7 @@ class IPTorrentsProvider(generic.TorrentProvider):
continue
try:
html = BeautifulSoup(data, features=["html5lib", "permissive"])
with BS4Parser(data, features=["html5lib", "permissive"]) as html:
if not html:
logger.log(u"Invalid HTML data: " + str(data), logger.DEBUG)
continue
@ -180,8 +179,6 @@ class IPTorrentsProvider(generic.TorrentProvider):
torrent_table = html.find('table', attrs={'class': 'torrents'})
torrents = torrent_table.find_all('tr') if torrent_table else []
html.clear(True)
#Continue only if one Release is found
if len(torrents) < 2:
logger.log(u"The Data returned from " + self.name + " do not contains any torrent",

View file

@ -40,11 +40,9 @@ from sickbeard.show_name_helpers import allPossibleShowNames, sanitizeSceneName
from sickbeard.exceptions import ex
from sickbeard import encodingKludge as ek
from sickbeard import clients
from sickbeard import tv
from sickbeard.bs4_parser import BS4Parser
from lib import requests
from lib.requests import exceptions
from bs4 import BeautifulSoup
from lib.unidecode import unidecode
@ -119,12 +117,9 @@ class KATProvider(generic.TorrentProvider):
return None
try:
soup = BeautifulSoup(data, features=["html5lib", "permissive"])
with BS4Parser(data, features=["html5lib", "permissive"]) as soup:
file_table = soup.find('table', attrs={'class': 'torrentFileList'})
# cleanup memory
soup.clear(True)
if not file_table:
return None
@ -230,6 +225,7 @@ class KATProvider(generic.TorrentProvider):
results = []
items = {'Season': [], 'Episode': [], 'RSS': []}
soup = None
for mode in search_params.keys():
for search_string in search_params[mode]:
@ -250,13 +246,10 @@ class KATProvider(generic.TorrentProvider):
continue
try:
soup = BeautifulSoup(html, features=["html5lib", "permissive"])
with BS4Parser(html, features=["html5lib", "permissive"]) as soup:
torrent_table = soup.find('table', attrs={'class': 'data'})
torrent_rows = torrent_table.find_all('tr') if torrent_table else []
soup.clear(True)
#Continue only if one Release is found
if len(torrent_rows) < 2:
logger.log(u"The data returned from " + self.name + " does not contain any torrents",

View file

@ -37,7 +37,7 @@ from sickbeard.exceptions import ex
from sickbeard import clients
from lib import requests
from lib.requests import exceptions
from bs4 import BeautifulSoup
from sickbeard.bs4_parser import BS4Parser
from sickbeard.helpers import sanitizeSceneName
@ -118,7 +118,7 @@ class NextGenProvider(generic.TorrentProvider):
self.session.headers.update(
{'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20130519 Firefox/24.0)'})
data = self.session.get(self.urls['login_page'], verify=False)
bs = BeautifulSoup(data.content.decode('iso-8859-1'))
with BS4Parser(data.content.decode('iso-8859-1')) as bs:
csrfraw = bs.find('form', attrs={'id': 'login'})['action']
output = self.session.post(self.urls['base_url'] + csrfraw, data=login_params)
@ -204,7 +204,7 @@ class NextGenProvider(generic.TorrentProvider):
if data:
try:
html = BeautifulSoup(data.decode('iso-8859-1'), features=["html5lib", "permissive"])
with BS4Parser(data.decode('iso-8859-1'), features=["html5lib", "permissive"]) as html:
resultsTable = html.find('div', attrs={'id': 'torrent-table-wrapper'})
if not resultsTable:
@ -257,7 +257,6 @@ class NextGenProvider(generic.TorrentProvider):
logger.WARNING)
continue
except Exception, e:
logger.log(u"Failed parsing " + self.name + " Traceback: " + traceback.format_exc(),
logger.ERROR)

View file

@ -40,7 +40,7 @@ from sickbeard import clients
from lib import requests
from lib.requests import exceptions
from bs4 import BeautifulSoup
from sickbeard.bs4_parser import BS4Parser
from lib.unidecode import unidecode
@ -150,13 +150,10 @@ class PublicHDProvider(generic.TorrentProvider):
html = os.linesep.join([s for s in html.splitlines() if not optreg.search(s)])
try:
html = BeautifulSoup(html, features=["html5lib", "permissive"])
with BS4Parser(html, features=["html5lib", "permissive"]) as html:
torrent_table = html.find('table', attrs={'id': 'torrbg'})
torrent_rows = torrent_table.find_all('tr') if torrent_table else []
html.clear(True)
#Continue only if one Release is found
if len(torrent_rows) < 2:
logger.log(u"The Data returned from " + self.name + " do not contains any torrent",

View file

@ -34,7 +34,7 @@ from sickbeard.exceptions import ex
from sickbeard import clients
from lib import requests
from lib.requests import exceptions
from bs4 import BeautifulSoup
from sickbeard.bs4_parser import BS4Parser
from lib.unidecode import unidecode
from sickbeard.helpers import sanitizeSceneName
@ -196,13 +196,10 @@ class SCCProvider(generic.TorrentProvider):
try:
for dataItem in data:
html = BeautifulSoup(dataItem, features=["html5lib", "permissive"])
with BS4Parser(dataItem, features=["html5lib", "permissive"]) as html:
torrent_table = html.find('table', attrs={'id': 'torrents-table'})
torrent_rows = torrent_table.find_all('tr') if torrent_table else []
html.clear(True)
#Continue only if at least one Release is found
if len(torrent_rows) < 2:
if html.title:
@ -225,9 +222,8 @@ class SCCProvider(generic.TorrentProvider):
title = link.string
if re.search('\.\.\.', title):
details_html = BeautifulSoup(self.getURL(self.url + "/" + link['href']))
with BS4Parser(self.getURL(self.url + "/" + link['href'])) as details_html:
title = re.search('(?<=").+(?<!")', details_html.title.string).group(0)
details_html.clear(True)
download_url = self.urls['download'] % url['href']
id = int(link['href'].replace('details?id=', ''))

View file

@ -33,7 +33,7 @@ from sickbeard.exceptions import ex
from sickbeard import clients
from lib import requests
from lib.requests import exceptions
from bs4 import BeautifulSoup
from sickbeard.bs4_parser import BS4Parser
from lib.unidecode import unidecode
from sickbeard.helpers import sanitizeSceneName
@ -168,14 +168,10 @@ class TorrentBytesProvider(generic.TorrentProvider):
continue
try:
html = BeautifulSoup(data)
with BS4Parser(data, features=["html5lib", "permissive"]) as html:
torrent_table = html.find('table', attrs={'border': '1'})
torrent_rows = torrent_table.find_all('tr') if torrent_table else []
# cleanup memory
html.clear(True)
#Continue only if one Release is found
if len(torrent_rows) < 2:
logger.log(u"The Data returned from " + self.name + " do not contains any torrent",

View file

@ -34,7 +34,7 @@ from sickbeard.exceptions import ex
from sickbeard import clients
from lib import requests
from lib.requests import exceptions
from bs4 import BeautifulSoup
from sickbeard.bs4_parser import BS4Parser
from lib.unidecode import unidecode
from sickbeard.helpers import sanitizeSceneName
@ -172,14 +172,10 @@ class TorrentLeechProvider(generic.TorrentProvider):
continue
try:
html = BeautifulSoup(data, features=["html5lib", "permissive"])
with BS4Parser(data, features=["html5lib", "permissive"]) as html:
torrent_table = html.find('table', attrs={'id': 'torrenttable'})
torrent_rows = torrent_table.find_all('tr') if torrent_table else []
# cleanup memory
html.clear(True)
#Continue only if one Release is found
if len(torrent_rows) < 2:
logger.log(u"The Data returned from " + self.name + " do not contains any torrent",