# SickGear/lib/imdb/parser/http/searchMovieParser.py

"""
parser.http.searchMovieParser module (imdb package).

This module provides the DOMHTMLSearchMovieParser class (and the
search_movie_parser instance), used to parse the results of a search
for a given title.
E.g., when searching for the title "the passion", the parsed
page would be:
    http://www.imdb.com/find?q=the+passion&tt=on&mx=20

Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
               2008 H. Turgut Uyar <uyar@tekir.org>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

import re
from imdb.utils import analyze_title, build_title
from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid


class DOMBasicMovieParser(DOMParserBase):
    """Simply get the title of a movie and the imdbID.

    It's used by the DOMHTMLSearchMovieParser class to return a result
    for a direct match (when a search on IMDb results in a single
    movie, the web server sends directly the movie page).

    The parsed result is a list that is either empty or holds a single
    (imdbID, title_info) tuple; see postprocess_data.
    """
    # Stay generic enough to be used also for other DOMBasic*Parser classes.
    # XPath (relative to the <h1> node) of the text holding the title.
    _titleAttrPath = ".//text()"
    # XPath of the element whose href carries the canonical page URL.
    _linkPath = "//link[@rel='canonical']"
    # Converts the raw title text into the structure built by analyze_title.
    _titleFunct = lambda self, x: analyze_title(x or u'')

    # Remove 'More at IMDb Pro' links.
    preprocessors = [
        (re.compile(r'<span class="pro-link".*?</span>'), ''),
        (re.compile(
            r'<a href="http://ad.doubleclick.net.*?;id=(co[0-9]{7});'),
         r'<a href="http://pro.imdb.com/company/\1"></a>< a href="'),
    ]

    def _init(self):
        """Set up the per-instance preprocessors and extractors."""
        # Build a NEW list rather than using "+=": in-place extension would
        # mutate the class-level ``preprocessors`` list, which would then
        # grow by one entry each time a parser instance is created.
        self.preprocessors = self.preprocessors + [
            ('<span class="tv-extra">TV mini-series</span>',
             '<span class="tv-extra">(mini)</span>')]
        self.extractors = [
            Extractor(label='title',
                      path="//h1",
                      attrs=Attribute(key='title',
                                      path=self._titleAttrPath,
                                      postprocess=self._titleFunct)),
            Extractor(label='link',
                      path=self._linkPath,
                      attrs=Attribute(
                          key='link', path="./@href",
                          # Strip the IMDb Pro host before extracting the ID.
                          postprocess=lambda x: analyze_imdbid(
                              (x or u'').replace('http://pro.imdb.com', '')))),
        ]

    def postprocess_data(self, data):
        """Turn the extracted dictionary into a list of results.

        Returns ``[(link, data)]``, where ``link`` is the value popped from
        the 'link' key and ``data`` is what remains of the dictionary, when
        both are non-empty; returns an empty list otherwise.
        """
        if 'link' not in data:
            return []
        link = data.pop('link')
        if link and data:
            return [(link, data)]
        return []


def custom_analyze_title(title):
    """Analyze a search-result title, ignoring any trailing ' aka ' part.

    Only the text before the first ' aka ' separator is analyzed; when
    that leading part is empty the whole string is used instead.
    An empty dictionary is returned when there is nothing to analyze,
    otherwise the result of analyze_title is returned.
    """
    # Crude heuristic: everything after the first ' aka ' is discarded.
    leading = title.partition(' aka ')[0]
    cleaned = leading or title
    if cleaned:
        return analyze_title(cleaned)
    return {}

# Manage AKAs.
# Matches the literal text 'aka <em>"' (compiled with re.I | re.M); group 1
# lazily captures what follows up to the first '<br>' or '</td>', which is
# itself captured as group 2.
# NOTE(review): not referenced anywhere in the visible part of this module.
_reAKAStitles = re.compile(r'(?:aka) <em>"(.*?)(<br>|<\/td>)', re.I | re.M)

class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies.

    Two kinds of pages are handled: a regular list of results, and a
    "direct hit", where the server answers with the page of the single
    match.  A direct hit is rewritten by preprocess_string into a
    one-entry results snippet, so the same extractors work for both.
    """

    # Parser used to read the ID and the title out of a direct-hit page.
    _BaseParser = DOMBasicMovieParser
    # Lowercase <title> tag identifying a regular (non direct-hit) page.
    _notDirectHitTitle = '<title>find - imdb</title>'
    # Rebuilds a printable title from the data parsed by _BaseParser.
    _titleBuilder = lambda self, x: build_title(x)
    # Prefix prepended to the ID when building the link of a direct hit.
    _linkPrefix = '/title/tt'

    # Every result cell yields a (imdbID, title_info, akas) tuple.
    _attrs = [Attribute(key='data',
                        multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'info': ".//text()",
                            'akas': "./i//text()"
                            },
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link') or u''),
                            custom_analyze_title(x.get('info') or u''),
                            x.get('akas')
                        ))]
    extractors = [Extractor(label='search',
                        path="//td[@class='result_text']",
                        attrs=_attrs)]

    def _init(self):
        """Initialize the URL of the page being parsed."""
        self.url = u''

    def _reset(self):
        """Clear the URL of the page being parsed."""
        self.url = u''

    def preprocess_string(self, html_string):
        """Normalize the page before it is parsed.

        A regular results page is returned as is (for titles, after
        replacing '(TV mini-series)' with '(mini)').  Any other page is
        treated as a direct hit: it is parsed with _BaseParser and
        replaced by a single 'result_text' cell linking to the match.
        An empty string is returned when no usable ID/title is found.
        """
        # Only the beginning of the page is inspected to find the <title>.
        if self._notDirectHitTitle in html_string[:10240].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                # XXX (HTU): does this still apply?
                html_string = html_string.replace('(TV mini-series)', '(mini)')
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        # res[0] is the (imdbID, info) pair returned by the base parser.
        link = '%s%s' % (self._linkPrefix, res[0][0])
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        return '<td class="result_text"><a href="%s">%s</a></td>' % (link,
                                                                     title)

    def postprocess_data(self, data):
        """Clean up the list of results stored under data['data'].

        The list is created when missing and truncated to self.results
        entries, when that attribute is set.  If the entries are
        (imdbID, info, akas) tuples, those lacking an ID or info are
        dropped, the AKAs (if any) are stored in info['akas'] and every
        entry is reduced to an (imdbID, info) pair.
        """
        # The "in" operator replaces dict.has_key(), which only exists
        # in Python 2.
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        entries = data['data']
        # Horrible hack to support AKAs: act only on 3-item tuples.
        if not (entries and isinstance(entries[0], tuple)
                and len(entries[0]) == 3):
            return data
        entries = [x for x in entries if x[0] and x[1]]
        data['data'] = entries
        for idx, datum in enumerate(entries):
            if not isinstance(datum, tuple):
                continue
            aka = datum[2]
            if aka is not None:
                if self._linkPrefix == '/title/tt':
                    # XXX (HTU): couldn't find a result with multiple akas
                    akas = [aka[1:-1]]      # remove the quotes
                else:
                    # Previously 'akas' was left unassigned on this path
                    # (UnboundLocalError, or a stale value from an earlier
                    # iteration).  Presumably other kinds of search results
                    # join their AKAs with '::' -- TODO confirm.
                    akas = [a for a in aka.split('::') if a]
                datum[1]['akas'] = akas
            entries[idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        """Return data unchanged."""
        return data


# Maps the name 'search_movie_parser' to a 2-tuple: a tuple of parser
# classes and a second item (None here).
# NOTE(review): presumably this is consumed elsewhere to build the parser
# instance, with the second item being optional keyword arguments -- confirm
# against the code that reads _OBJECTS.
_OBJECTS = {
        'search_movie_parser': ((DOMHTMLSearchMovieParser,), None)
}