mirror of
https://github.com/SickGear/SickGear.git
synced 2024-12-04 18:33:38 +00:00
78026584eb
Thanks to the backport by @MasterMind2k
98 lines
3.2 KiB
Python
98 lines
3.2 KiB
Python
"""
|
|
parser.http.companyParser module (imdb package).
|
|
|
|
This module provides the classes (and the instances), used to parse
|
|
the IMDb pages on the www.imdb.com server about a company.
|
|
E.g., for "Columbia Pictures [us]" the referred page would be:
|
|
main details: http://www.imdb.com/company/co0071509/
|
|
|
|
Copyright 2008-2017 Davide Alberani <da@erlug.linux.it>
|
|
2008-2017 H. Turgut Uyar <uyar@tekir.org>
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
"""
|
|
|
|
import re
|
|
from utils import build_movie, Attribute, Extractor, DOMParserBase, \
|
|
analyze_imdbid
|
|
|
|
from imdb.utils import analyze_company_name
|
|
|
|
|
|
class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.

    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    extractors = [
        # Company name: text of the <h1> title span, run through
        # analyze_company_name with stripNotes=True.
        Extractor(
            label='name',
            # note the extra trailing space in class
            path="//h1/span[@class='display-title ']",
            attrs=Attribute(
                key='name',
                path="./text()",
                postprocess=lambda x: analyze_company_name(x, stripNotes=True)
            )
        ),

        # Filmography: one group per <b><a name=...> heading, keyed by the
        # lower-cased heading text; the items are the <li> entries of the
        # first <ol> following the heading's parent.
        Extractor(
            label='filmography',
            group="//b/a[@name]",
            group_key="./text()",
            group_key_normalize=lambda x: x.lower(),
            path="../following-sibling::ol[1]/li",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    'link': "./a[1]/@href",
                    'title': "./a[1]/text()",
                    'year': "./text()[1]"
                },
                # Each list item is handed to build_movie as
                # "<title> <year>" plus the ID taken from the link.
                postprocess=lambda x: build_movie(
                    '%s %s' % (x.get('title'), x.get('year').strip()),
                    movieID=analyze_imdbid(x.get('link') or u''),
                    _parsingCompany=True
                )
            )
        )
    ]

    # Insert a closing </p> before every section heading.
    # NOTE(review): presumably this makes the headings siblings of the
    # <ol> lists so the 'filmography' XPath matches -- confirm against a
    # real company page.
    preprocessors = [
        (re.compile('(<b><a name=)', re.I), r'</p>\1')
    ]

    def postprocess_data(self, data):
        """Rename section keys of the parsed data, in place.

        In every key the substring 'company' becomes 'companies',
        'other' becomes 'miscellaneous' and 'distributor' becomes
        'distributors'.  The value is moved to the new key and the old
        key is removed.

        Returns the same (mutated) dictionary.
        """
        # Iterate over a snapshot of the keys: the loop inserts and deletes
        # entries, and on Python 3 doing that while walking the live
        # keys() view may raise RuntimeError or revisit a freshly added
        # key.  Revisiting matters because the replacements are not
        # idempotent ('distributors' still contains 'distributor').
        for key in list(data.keys()):
            new_key = key.replace('company', 'companies')
            new_key = new_key.replace('other', 'miscellaneous')
            new_key = new_key.replace('distributor', 'distributors')
            if new_key != key:
                data[new_key] = data.pop(key)
        return data
|
|
|
|
|
|
# Maps an exported parser name to a 2-tuple: a tuple of parser classes and
# a second element that is None here.
# NOTE(review): presumably consumed by the package's parser loader, with the
# second element holding optional constructor arguments -- confirm.
_OBJECTS = dict(
    company_main_parser=((DOMCompanyParser,), None),
)
|
|
|