From 8c713d66a73d73d1dc764b9e33edf19ff05f1538 Mon Sep 17 00:00:00 2001 From: JackDandy Date: Wed, 17 Aug 2016 15:37:08 +0100 Subject: [PATCH] Change refactor IMDb. --- .../interfaces/default/home_browseShows.tmpl | 3 +- sickbeard/webserve.py | 274 ++++++++---------- 2 files changed, 126 insertions(+), 151 deletions(-) diff --git a/gui/slick/interfaces/default/home_browseShows.tmpl b/gui/slick/interfaces/default/home_browseShows.tmpl index e9774d50..887a97b2 100644 --- a/gui/slick/interfaces/default/home_browseShows.tmpl +++ b/gui/slick/interfaces/default/home_browseShows.tmpl @@ -243,7 +243,8 @@ #if 'poster' in $this_show['images']: #set $image = $this_show['images']['poster']['thumb'] diff --git a/sickbeard/webserve.py b/sickbeard/webserve.py index 78447a76..348bd710 100644 --- a/sickbeard/webserve.py +++ b/sickbeard/webserve.py @@ -2615,6 +2615,118 @@ class NewHomeAddShows(Home): return json.dumps({'result': 'Success', 'accounts': sickbeard.IMDB_ACCOUNTS}) + @staticmethod + def parse_imdb_overview(tag): + paragraphs = tag.select('.lister-item-content p') + filtered = [] + for item in paragraphs: + if not (item.select('span.certificate') or item.select('span.genre') or + item.select('span.runtime') or item.select('span.ghost')): + filtered.append(item.get_text().strip()) + split_lines = [element.split('\n') for element in filtered] + filtered = [] + least_lines = 10 + for item_lines in split_lines: + if len(item_lines) < least_lines: + least_lines = len(item_lines) + filtered = [item_lines] + elif len(item_lines) == least_lines: + filtered.append(item_lines) + overview = None + for item_lines in filtered: + text = ' '.join([item_lines.strip() for item_lines in item_lines]).strip() + if len(text) and (not overview or (len(text) > len(overview))): + overview = text + return overview + + def parse_imdb_html(self, html, filtered, kwargs): + + img_size = re.compile(r'(?im)(V1[^XY]+([XY]))(\d+)([^\d]+)(\d+)([^\d]+)(\d+)([^\d]+)(\d+)([^\d]+)(\d+)(.*?)$') + imdb_id = re.compile(r'(?i).*(tt\d+).*') + + with BS4Parser(html, features=['html5lib', 'permissive']) as soup: + show_list = soup.select('.lister-list') + shows = [] if not show_list else show_list[0].select('.lister-item') + oldest, newest, oldest_dt, newest_dt = None, None, 9999999, 0 + + for row in shows: + try: + title = row.select('.lister-item-header a[href*=title]')[0] + url_path = title['href'].strip('/') + ids = dict(imdb=imdb_id.sub(r'\1', url_path)) + year, ended = 2*[None] + first_aired = row.select('.lister-item-header .lister-item-year') + if len(first_aired): + years = re.findall(r'.*?(\d{4})(?:.*?(\d{4}))?.*', first_aired[0].get_text()) + year, ended = years and years[0] or 2*[None] + dt_ordinal = 0 + if year: + dt = dateutil.parser.parse('01-01-%s' % year) + dt_ordinal = dt.toordinal() + if dt_ordinal < oldest_dt: + oldest_dt = dt_ordinal + oldest = year + if dt_ordinal > newest_dt: + newest_dt = dt_ordinal + newest = year + + genres = row.select('.genre') + images = {} + img = row.select('.lister-item-image img') + overview = self.parse_imdb_overview(row) + rating = row.find('meta', attrs={'itemprop': 'ratingValue'}) + rating = None is not rating and rating.get('content') or '' + voting = row.find('meta', attrs={'itemprop': 'ratingCount'}) + voting = None is not voting and voting.get('content') or '' + if len(img): + img_uri = img[0].get('loadlate') + match = img_size.search(img_uri) + if match and 'tv_series.gif' not in img_uri and 'nopicture' not in img_uri: + scale = lambda low1, high1: int((float(450) / high1) * low1) + high = int(max([match.group(9), match.group(11)])) + scaled = [scale(x, high) for x in + [(int(match.group(n)), high)[high == int(match.group(n))] for n in + 3, 5, 7, 9, 11]] + parts = [match.group(1), match.group(4), match.group(6), match.group(8), match.group(10), + match.group(12)] + img_uri = img_uri.replace(match.group(), ''.join( + [str(y) for x in map(None, parts, scaled) for y in x if y is not None])) + path = ek.ek(os.path.abspath, ek.ek(os.path.join, sickbeard.CACHE_DIR, 'images', 'imdb')) + helpers.make_dirs(path) + file_name = ek.ek(os.path.basename, img_uri) + cached_name = ek.ek(os.path.join, path, file_name) + if not ek.ek(os.path.isfile, cached_name): + helpers.download_file(img_uri, cached_name) + images = dict(poster=dict(thumb='cache/images/imdb/%s' % file_name)) + + filtered.append(dict( + premiered=dt_ordinal, + premiered_str=year or 'No year', + ended_str=ended or '', + when_past=dt_ordinal < datetime.datetime.now().toordinal(), # air time not poss. 16.11.2015 + genres=('No genre yet' if not len(genres) else + genres[0].get_text().strip().lower().replace(' |', ',')), + ids=ids, + images=images, + overview='No overview yet' if not overview else self.encode_html(overview[:250:]), + rating=0 if not len(rating) else int(helpers.tryFloat(rating) * 10), + title=title.get_text().strip(), + url_src_db='http://www.imdb.com/%s/' % url_path.strip('/'), + votes=0 if not len(voting) else helpers.tryInt(voting, 'TBA'))) + + show = filter(lambda x: x.imdbid == ids['imdb'], sickbeard.showList)[0] + src = ((None, 'tvrage')[INDEXER_TVRAGE == show.indexer], 'tvdb')[INDEXER_TVDB == show.indexer] + if src: + filtered[-1]['ids'][src] = show.indexerid + filtered[-1]['url_' + src] = '%s%s' % ( + sickbeard.indexerApi(show.indexer).config['show_url'], show.indexerid) + except (AttributeError, TypeError, KeyError, IndexError): + continue + + kwargs.update(dict(oldest=oldest, newest=newest)) + + return show_list and True or None + def watchlist_imdb(self, *args, **kwargs): if 'add' == kwargs.get('action'): @@ -2645,84 +2757,17 @@ class NewHomeAddShows(Home): html = helpers.getURL(url + url_data) if html: - img_size = re.compile(r'(?im)(V1[^XY]+([XY]))(\d+)([^\d]+)(\d+)([^\d]+)(\d+)([^\d]+)(\d+)([^\d]+)(\d+)(.*?)$') - imdb_id = re.compile(r'(?i).*(tt\d+).*') - - with BS4Parser(html, features=['html5lib', 'permissive']) as soup: - show_list = soup.find('div', {'class': 'lister-list'}) - shows = [] if not show_list else show_list.find_all('div', {'class': 'mode-detail'}) - oldest, newest, oldest_dt, newest_dt = None, None, 9999999, 0 - - for show in shows: - try: - url_path = show.select('h3.lister-item-header a[href*=title]')[0]['href'].strip('/') - ids = dict(imdb=imdb_id.sub(r'\1', url_path)) - first_aired = show.select('h3.lister-item-header span.lister-item-year') - year = None if not len(first_aired) else re.sub(r'.*(\d{4}).*', r'\1', first_aired[0].get_text()) - dt_ordinal = 0 - if year: - dt = dateutil.parser.parse('01-01-%s' % year) - dt_ordinal = dt.toordinal() - if dt_ordinal < oldest_dt: - oldest_dt = dt_ordinal - oldest = year - if dt_ordinal > newest_dt: - newest_dt = dt_ordinal - newest = year - - genres = show.select('span.genre') - images = {} - img = show.select('div.lister-item-image img') - overview = '' if not show.find('p', '') else show.find('p', '').get_text().strip() - rating = show.find('meta', attrs={'itemprop': 'ratingValue'}) - rating = None is not rating and rating.get('content') or '' - voting = show.find('meta', attrs={'itemprop': 'ratingCount'}) - voting = None is not voting and voting.get('content') or '' - if len(img): - img_uri = img[0].get('loadlate') - match = img_size.search(img_uri) - if match and 'tv_series.gif' not in img_uri and 'nopicture' not in img_uri: - scale = lambda low1, high1: int((float(450) / high1) * low1) - high = int(max([match.group(9), match.group(11)])) - scaled = [scale(x, high) for x in [(int(match.group(n)), high)[high == int(match.group(n))] for n in 3, 5, 7, 9, 11]] - parts = [match.group(1), match.group(4), match.group(6), match.group(8), match.group(10), match.group(12)] - img_uri = img_uri.replace(match.group(), ''.join([str(y) for x in map(None, parts, scaled) for y in x if y is not None])) - path = ek.ek(os.path.abspath, ek.ek(os.path.join, sickbeard.CACHE_DIR, 'images', 'imdb')) - helpers.make_dirs(path) - file_name = ek.ek(os.path.basename, img_uri) - cached_name = ek.ek(os.path.join, path, file_name) - if not ek.ek(os.path.isfile, cached_name): - helpers.download_file(img_uri, cached_name) - images = dict(poster=dict(thumb='cache/images/imdb/%s' % file_name)) - - filtered.append(dict( - premiered=dt_ordinal, - premiered_str=year or 'No year', - when_past=dt_ordinal < datetime.datetime.now().toordinal(), # air time not poss. 16.11.2015 - genres='No genre yet' if not len(genres) else genres[0].get_text().strip().lower().replace(' |', ','), - ids=ids, - images=images, - overview='No overview yet' if not len(overview) else self.encode_html(overview[:250:].strip()), - rating=0 if not len(rating) else int(helpers.tryFloat(rating) * 10), - title=show.select('h3.lister-item-header a[href*=title]')[0].get_text().strip(), - url_src_db='http://www.imdb.com/%s/' % url_path, - votes=0 if not len(voting) else helpers.tryInt(voting))) - - tvshow = filter(lambda x: x.imdbid == ids['imdb'], sickbeard.showList)[0] - src = ((None, 'tvrage')[INDEXER_TVRAGE == tvshow.indexer], 'tvdb')[INDEXER_TVDB == tvshow.indexer] - if src: - filtered[-1]['ids'][src] = tvshow.indexerid - filtered[-1]['url_' + src] = '%s%s' % (sickbeard.indexerApi(tvshow.indexer).config['show_url'], tvshow.indexerid) - except (AttributeError, TypeError, KeyError, IndexError): - continue - - kwargs.update(dict(oldest=oldest, newest=newest, start_year=start_year)) + show_list_found = self.parse_imdb_html(html, filtered, kwargs) + kwargs.update(dict(start_year=start_year)) if len(filtered): - footnote = 'Note; Some images on this page may be cropped at source: %s watchlist at IMDb' % (helpers.anon_url(url + url_ui), list_name) - elif None is not show_list: + footnote = ('Note; Some images on this page may be cropped at source: ' + + '%s watchlist at IMDb' % ( + helpers.anon_url(url + url_ui), list_name)) + elif None is not show_list_found: kwargs['show_header'] = True - kwargs['error_msg'] = 'No TV titles in the %s watchlist at IMDb' % (helpers.anon_url(url + url_ui), list_name) + kwargs['error_msg'] = 'No TV titles in the %s watchlist at IMDb' % ( + helpers.anon_url(url + url_ui), list_name) kwargs.update(dict(footnote=footnote, mode='watchlist-%s' % acc_id, periods=periods)) return self.browse_shows(browse_type, '%s IMDb Watchlist' % list_name, filtered, **kwargs) @@ -2746,83 +2791,12 @@ class NewHomeAddShows(Home): url = 'http://www.imdb.com/search/title?at=0&sort=moviemeter&title_type=tv_series&year=%s,%s' % (start_year, end_year) html = helpers.getURL(url) if html: - img_size = re.compile(r'(?im)(V1[^XY]+([XY]))(\d+)([^\d]+)(\d+)([^\d]+)(\d+)([^\d]+)(\d+)([^\d]+)(\d+)(.*?)$') - vote_value = re.compile(r'(?i).*\((\d*).?(\d*)\svotes\).*') - imdb_id = re.compile(r'(?i).*(tt\d+)$') - - with BS4Parser(html, features=['html5lib', 'permissive']) as soup: - show_list = soup.find('table', {'class': 'results'}) - shows = [] if not show_list else show_list.find_all('tr') - oldest, newest, oldest_dt, newest_dt = None, None, 9999999, 0 - - for tr in shows[1:]: - try: - url_path = tr.select('td.title a[href*=title]')[0]['href'].strip('/') - ids = dict(imdb=imdb_id.sub(r'\1', url_path)) - first_aired = tr.select('td.title span.year_type') - year = None if not len(first_aired) else re.sub(r'.*(\d{4}).*', r'\1', first_aired[0].get_text()) - dt_ordinal = 0 - if year: - dt = dateutil.parser.parse('01-01-%s' % year) - dt_ordinal = dt.toordinal() - if dt_ordinal < oldest_dt: - oldest_dt = dt_ordinal - oldest = year - if dt_ordinal > newest_dt: - newest_dt = dt_ordinal - newest = year - - genres = tr.select('td.title span.genre') - images = {} - img = tr.select('td.image img') - overview = tr.select('td.title span.outline') - rating = tr.select('td.title span.rating-rating span.value') - voting = tr.select('td.title div.rating-list') - if len(img): - img_uri = img[0].get('src') - match = img_size.search(img_uri) - if match and 'tv_series.gif' not in img_uri: - scale = lambda low1, high1: int((float(450) / high1) * low1) - high = int(max([match.group(9), match.group(11)])) - scaled = [scale(x, high) for x in [(int(match.group(n)), high)[high == int(match.group(n))] for n in 3, 5, 7, 9, 11]] - parts = [match.group(1), match.group(4), match.group(6), match.group(8), match.group(10), match.group(12)] - img_uri = img_uri.replace(match.group(), ''.join([str(y) for x in map(None, parts, scaled) for y in x if y is not None])) - path = ek.ek(os.path.abspath, ek.ek(os.path.join, sickbeard.CACHE_DIR, 'images', 'imdb')) - helpers.make_dirs(path) - file_name = ek.ek(os.path.basename, img_uri) - cached_name = ek.ek(os.path.join, path, file_name) - if not ek.ek(os.path.isfile, cached_name): - helpers.download_file(img_uri, cached_name) - images = dict(poster=dict(thumb='cache/images/imdb/%s' % file_name)) - - filtered.append(dict( - premiered=dt_ordinal, - premiered_str=year or 'No year', - when_past=dt_ordinal < datetime.datetime.now().toordinal(), # air time not poss. 16.11.2015 - genres=('No genre yet' if not len(genres) else - genres[0].get_text().lower().replace(' |', ',')), - ids=ids, - images=images, - overview=('No overview yet' if not len(overview) else - self.encode_html(overview[0].get_text()[:250:].strip())), - rating=0 if not len(rating) else int(helpers.tryFloat(rating[0].get_text()) * 10), - title=tr.select('td.title a')[0].get_text().strip(), - url_src_db='http://www.imdb.com/%s/' % url_path, - votes=0 if not len(voting) else helpers.tryInt( - vote_value.sub(r'\1\2', voting[0].get('title')), 'TBA'))) - - tvshow = filter(lambda x: x.imdbid == ids['imdb'], sickbeard.showList)[0] - src = ((None, 'tvrage')[INDEXER_TVRAGE == tvshow.indexer], 'tvdb')[INDEXER_TVDB == tvshow.indexer] - if src: - filtered[-1]['ids'][src] = tvshow.indexerid - filtered[-1]['url_' + src] = '%s%s' % (sickbeard.indexerApi(tvshow.indexer).config['show_url'], tvshow.indexerid) - except (AttributeError, TypeError, KeyError, IndexError): - continue - - kwargs.update(dict(oldest=oldest, newest=newest, mode=mode, periods=periods)) + self.parse_imdb_html(html, filtered, kwargs) + kwargs.update(dict(mode=mode, periods=periods)) if len(filtered): - footnote = 'Note; Some images on this page may be cropped at source: IMDb' % helpers.anon_url(url) + footnote = ('Note; Some images on this page may be cropped at source: ' + + 'IMDb' % helpers.anon_url(url)) kwargs.update(dict(footnote=footnote)) return self.browse_shows(browse_type, 'Most Popular IMDb TV', filtered, **kwargs)