""" parser.http.companyParser module (imdb package). This module provides the classes (and the instances), used to parse the IMDb pages on the www.imdb.com server about a company. E.g., for "Columbia Pictures [us]" the referred page would be: main details: http://www.imdb.com/company/co0071509/ Copyright 2008-2017 Davide Alberani 2008-2017 H. Turgut Uyar This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import re from utils import build_movie, Attribute, Extractor, DOMParserBase, \ analyze_imdbid from imdb.utils import analyze_company_name class DOMCompanyParser(DOMParserBase): """Parser for the main page of a given company. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: cparser = DOMCompanyParser() result = cparser.parse(company_html_string) """ _containsObjects = True extractors = [ Extractor( label='name', path="//h1/span[@class='display-title ']", # note the extra trailing space in class attrs=Attribute( key='name', path="./text()", postprocess=lambda x: analyze_company_name(x, stripNotes=True) ) ), Extractor( label='filmography', group="//b/a[@name]", group_key="./text()", group_key_normalize=lambda x: x.lower(), path="../following-sibling::ol[1]/li", attrs=Attribute( key=None, multi=True, path={ 'link': "./a[1]/@href", 'title': "./a[1]/text()", 'year': "./text()[1]" }, postprocess=lambda x: build_movie( '%s %s' % (x.get('title'), x.get('year').strip()), movieID=analyze_imdbid(x.get('link') or u''), _parsingCompany=True ) ) ) ] preprocessors = [ (re.compile('(\1') ] def postprocess_data(self, data): for key in data.keys(): new_key = key.replace('company', 'companies') new_key = new_key.replace('other', 'miscellaneous') new_key = new_key.replace('distributor', 'distributors') if new_key != key: data[new_key] = data[key] del data[key] return data _OBJECTS = { 'company_main_parser': ((DOMCompanyParser,), None) }