SickGear/lib/imdb/parser/http/companyParser.py
JackDandy 78026584eb Update IMDb 5.1 (r907) → 5.2.1dev20171113 (f640595).
Thanks to the backport by @MasterMind2k
2018-03-28 00:45:15 +01:00

98 lines
3.2 KiB
Python

"""
parser.http.companyParser module (imdb package).
This module provides the classes (and the instances), used to parse
the IMDb pages on the www.imdb.com server about a company.
E.g., for "Columbia Pictures [us]" the referred page would be:
main details: http://www.imdb.com/company/co0071509/
Copyright 2008-2017 Davide Alberani <da@erlug.linux.it>
2008-2017 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import re
from utils import build_movie, Attribute, Extractor, DOMParserBase, \
analyze_imdbid
from imdb.utils import analyze_company_name
class DOMCompanyParser(DOMParserBase):
"""Parser for the main page of a given company.
The page should be provided as a string, as taken from
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:
cparser = DOMCompanyParser()
result = cparser.parse(company_html_string)
"""
_containsObjects = True
extractors = [
Extractor(
label='name',
path="//h1/span[@class='display-title ']", # note the extra trailing space in class
attrs=Attribute(
key='name',
path="./text()",
postprocess=lambda x: analyze_company_name(x, stripNotes=True)
)
),
Extractor(
label='filmography',
group="//b/a[@name]",
group_key="./text()",
group_key_normalize=lambda x: x.lower(),
path="../following-sibling::ol[1]/li",
attrs=Attribute(
key=None,
multi=True,
path={
'link': "./a[1]/@href",
'title': "./a[1]/text()",
'year': "./text()[1]"
},
postprocess=lambda x: build_movie(
'%s %s' % (x.get('title'), x.get('year').strip()),
movieID=analyze_imdbid(x.get('link') or u''),
_parsingCompany=True
)
)
)
]
preprocessors = [
(re.compile('(<b><a name=)', re.I), r'</p>\1')
]
def postprocess_data(self, data):
for key in data.keys():
new_key = key.replace('company', 'companies')
new_key = new_key.replace('other', 'miscellaneous')
new_key = new_key.replace('distributor', 'distributors')
if new_key != key:
data[new_key] = data[key]
del data[key]
return data
_OBJECTS = {
'company_main_parser': ((DOMCompanyParser,), None)
}