SickGear/lib/imdb/parser/http/companyParser.py

"""
parser.http.companyParser module (imdb package).

This module provides the classes (and the instances), used to parse
the IMDb pages on the www.imdb.com server about a company.
E.g., for "Columbia Pictures [us]" the referred page would be:
    main details:   http://www.imdb.com/company/co0071509/

Copyright 2008-2017 Davide Alberani <da@erlug.linux.it>
          2008-2017 H. Turgut Uyar <uyar@tekir.org>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

import re
from utils import build_movie, Attribute, Extractor, DOMParserBase, \
                    analyze_imdbid

from imdb.utils import analyze_company_name


class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    extractors = [
        Extractor(
            label='name',
            path="//h1/span[@class='display-title ']",  # note the extra trailing space in class
            attrs=Attribute(
                key='name',
                path="./text()",
                postprocess=lambda x: analyze_company_name(x, stripNotes=True)
            )
        ),

        Extractor(
            label='filmography',
            group="//b/a[@name]",
            group_key="./text()",
            group_key_normalize=lambda x: x.lower(),
            path="../following-sibling::ol[1]/li",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    'link': "./a[1]/@href",
                    'title': "./a[1]/text()",
                    'year': "./text()[1]"
                },
                postprocess=lambda x: build_movie(
                    '%s %s' % (x.get('title'), x.get('year').strip()),
                    movieID=analyze_imdbid(x.get('link') or u''),
                    _parsingCompany=True
                )
            )
        )
    ]

    preprocessors = [
        (re.compile('(<b><a name=)', re.I), r'</p>\1')
        ]

    def postprocess_data(self, data):
        for key in data.keys():
            new_key = key.replace('company', 'companies')
            new_key = new_key.replace('other', 'miscellaneous')
            new_key = new_key.replace('distributor', 'distributors')
            if new_key != key:
                data[new_key] = data[key]
                del data[key]
        return data


_OBJECTS = {
    'company_main_parser': ((DOMCompanyParser,), None)
}