SickGear/lib/imdb/parser/http/searchMovieParser.py

183 lines
7.3 KiB
Python
Raw Normal View History

"""
parser.http.searchMovieParser module (imdb package).
This module provides the HTMLSearchMovieParser class (and the
search_movie_parser instance), used to parse the results of a search
for a given title.
E.g., for when searching for the title "the passion", the parsed
page would be:
http://www.imdb.com/find?q=the+passion&tt=on&mx=20
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import re
from imdb.utils import analyze_title, build_title
from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid
class DOMBasicMovieParser(DOMParserBase):
"""Simply get the title of a movie and the imdbID.
It's used by the DOMHTMLSearchMovieParser class to return a result
for a direct match (when a search on IMDb results in a single
movie, the web server sends directly the movie page."""
# Stay generic enough to be used also for other DOMBasic*Parser classes.
_titleAttrPath = ".//text()"
_linkPath = "//link[@rel='canonical']"
_titleFunct = lambda self, x: analyze_title(x or u'')
def _init(self):
self.preprocessors += [('<span class="tv-extra">TV mini-series</span>',
'<span class="tv-extra">(mini)</span>')]
self.extractors = [Extractor(label='title',
path="//h1",
attrs=Attribute(key='title',
path=self._titleAttrPath,
postprocess=self._titleFunct)),
Extractor(label='link',
path=self._linkPath,
attrs=Attribute(key='link', path="./@href",
postprocess=lambda x: \
analyze_imdbid((x or u'').replace(
'http://pro.imdb.com', ''))
))]
# Remove 'More at IMDb Pro' links.
preprocessors = [(re.compile(r'<span class="pro-link".*?</span>'), ''),
(re.compile(r'<a href="http://ad.doubleclick.net.*?;id=(co[0-9]{7});'), r'<a href="http://pro.imdb.com/company/\1"></a>< a href="')]
def postprocess_data(self, data):
if not 'link' in data:
data = []
else:
link = data.pop('link')
if link and data:
data = [(link, data)]
else:
data = []
return data
def custom_analyze_title(title):
"""Remove garbage notes after the (year), (year/imdbIndex) or (year) (TV)"""
# XXX: very crappy. :-(
nt = title.split(' aka ')[0]
if nt:
title = nt
if not title:
return {}
return analyze_title(title)
# Manage AKAs.
_reAKAStitles = re.compile(r'(?:aka) <em>"(.*?)(<br>|<\/td>)', re.I | re.M)
class DOMHTMLSearchMovieParser(DOMParserBase):
"""Parse the html page that the IMDb web server shows when the
"new search system" is used, for movies."""
_BaseParser = DOMBasicMovieParser
_notDirectHitTitle = '<title>find - imdb</title>'
_titleBuilder = lambda self, x: build_title(x)
_linkPrefix = '/title/tt'
_attrs = [Attribute(key='data',
multi=True,
path={
'link': "./a[1]/@href",
'info': ".//text()",
'akas': "./i//text()"
},
postprocess=lambda x: (
analyze_imdbid(x.get('link') or u''),
custom_analyze_title(x.get('info') or u''),
x.get('akas')
))]
extractors = [Extractor(label='search',
path="//td[@class='result_text']",
attrs=_attrs)]
def _init(self):
self.url = u''
def _reset(self):
self.url = u''
def preprocess_string(self, html_string):
if self._notDirectHitTitle in html_string[:10240].lower():
if self._linkPrefix == '/title/tt':
# Only for movies.
# XXX (HTU): does this still apply?
html_string = html_string.replace('(TV mini-series)', '(mini)')
return html_string
# Direct hit!
dbme = self._BaseParser(useModule=self._useModule)
res = dbme.parse(html_string, url=self.url)
if not res: return u''
res = res['data']
if not (res and res[0]): return u''
link = '%s%s' % (self._linkPrefix, res[0][0])
# # Tries to cope with companies for which links to pro.imdb.com
# # are missing.
# link = self.url.replace(imdbURL_base[:-1], '')
title = self._titleBuilder(res[0][1])
if not (link and title): return u''
link = link.replace('http://pro.imdb.com', '')
new_html = '<td class="result_text"><a href="%s">%s</a></td>' % (link,
title)
return new_html
def postprocess_data(self, data):
if not data.has_key('data'):
data['data'] = []
results = getattr(self, 'results', None)
if results is not None:
data['data'][:] = data['data'][:results]
# Horrible hack to support AKAs.
if data and data['data'] and len(data['data'][0]) == 3 and \
isinstance(data['data'][0], tuple):
data['data'] = [x for x in data['data'] if x[0] and x[1]]
for idx, datum in enumerate(data['data']):
if not isinstance(datum, tuple):
continue
if not datum[0] and datum[1]:
continue
if datum[2] is not None:
#akas = filter(None, datum[2].split('::'))
if self._linkPrefix == '/title/tt':
# XXX (HTU): couldn't find a result with multiple akas
aka = datum[2]
akas = [aka[1:-1]] # remove the quotes
#akas = [a.replace('" - ', '::').rstrip() for a in akas]
#akas = [a.replace('aka "', '', 1).replace('aka "',
#'', 1).lstrip() for a in akas]
datum[1]['akas'] = akas
data['data'][idx] = (datum[0], datum[1])
else:
data['data'][idx] = (datum[0], datum[1])
return data
def add_refs(self, data):
return data
_OBJECTS = {
'search_movie_parser': ((DOMHTMLSearchMovieParser,), None)
}