2018-03-26 17:16:59 +00:00
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
2014-03-10 05:18:05 +00:00
|
|
|
|
"""
|
|
|
|
|
parser.http.movieParser module (imdb package).
|
|
|
|
|
|
|
|
|
|
This module provides the classes (and the instances), used to parse the
|
2018-03-26 17:16:59 +00:00
|
|
|
|
IMDb pages on the www.imdb.com server about a movie.
|
2014-03-10 05:18:05 +00:00
|
|
|
|
E.g., for Brian De Palma's "The Untouchables", the referred
|
|
|
|
|
pages would be:
|
2018-03-26 17:16:59 +00:00
|
|
|
|
combined details: http://www.imdb.com/title/tt0094226/reference
|
|
|
|
|
plot summary: http://www.imdb.com/title/tt0094226/plotsummary
|
2014-03-10 05:18:05 +00:00
|
|
|
|
...and so on...
|
|
|
|
|
|
2018-03-26 17:16:59 +00:00
|
|
|
|
Copyright 2004-2018 Davide Alberani <da@erlug.linux.it>
|
|
|
|
|
2008-2018 H. Turgut Uyar <uyar@tekir.org>
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
|
along with this program; if not, write to the Free Software
|
|
|
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
|
"""
|
|
|
|
|
|
2018-03-26 17:16:59 +00:00
|
|
|
|
import functools
|
2014-03-10 05:18:05 +00:00
|
|
|
|
import re
|
|
|
|
|
import urllib
|
|
|
|
|
|
|
|
|
|
from imdb import imdbURL_base
|
|
|
|
|
from imdb.Company import Company
|
2018-03-26 17:16:59 +00:00
|
|
|
|
from imdb.Movie import Movie
|
|
|
|
|
from imdb.Person import Person
|
|
|
|
|
from imdb.utils import _Container, KIND_MAP
|
|
|
|
|
|
|
|
|
|
from .utils import Attribute, DOMParserBase, Extractor, analyze_imdbid, build_person
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Dictionary used to convert some section's names.
# Maps the (lowercased) section titles found on IMDb pages to the
# canonical keys used in the result dictionaries; applied by
# DOMHTMLMovieParser.postprocess_data().
_SECT_CONV = {
    # Crew sections: map page headings to canonical role names.
    'directed': 'director',
    'directed by': 'director',
    'directors': 'director',
    'editors': 'editor',
    'writing credits': 'writer',
    'writers': 'writer',
    'produced': 'producer',
    'cinematography': 'cinematographer',
    'film editing': 'editor',
    'casting': 'casting director',
    'costume design': 'costume designer',
    'makeup department': 'make up',
    'production management': 'production manager',
    'second unit director or assistant director': 'assistant director',
    'costume and wardrobe department': 'costume department',
    'sound department': 'sound crew',
    'stunts': 'stunt performer',
    'other crew': 'miscellaneous crew',
    # Movie-info sections: singular headings become plural keys and
    # vice versa, matching the keys used elsewhere in the package.
    'also known as': 'akas',
    'country': 'countries',
    'runtime': 'runtimes',
    'language': 'languages',
    'certification': 'certificates',
    'genre': 'genres',
    'created': 'creator',
    'creators': 'creator',
    'color': 'color info',
    'plot': 'plot outline',
    'art directors': 'art direction',
    'assistant directors': 'assistant director',
    'set decorators': 'set decoration',
    'visual effects department': 'visual effects',
    'miscellaneous': 'miscellaneous crew',
    'make up department': 'make up',
    'plot summary': 'plot outline',
    'cinematographers': 'cinematographer',
    'camera department': 'camera and electrical department',
    'costume designers': 'costume designer',
    'production designers': 'production design',
    'production managers': 'production manager',
    'music original': 'original music',
    'casting directors': 'casting director',
    'other companies': 'miscellaneous companies',
    'producers': 'producer',
    'special effects by': 'special effects department',
    'special effects': 'special effects companies'
}
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _manageRoles(mo):
    """Perform some transformation on the html, so that roleIDs can
    be easily retrieved.

    Used as the replacement callback of _reRolesMovie: mo.group(1) and
    mo.group(3) are the opening/closing <td> tags, mo.group(2) is the
    cell content, holding one or more ' / '-separated character roles.
    Each role is wrapped in a div carrying an explicit roleid attribute,
    so a later xpath can collect character IDs reliably.
    """
    firstHalf = mo.group(1)
    secondHalf = mo.group(2)
    newRoles = []
    # Multiple characters played by the same person are separated by ' / '.
    roles = secondHalf.split(' / ')
    for role in roles:
        role = role.strip()
        if not role:
            continue
        roleID = analyze_imdbid(role)
        if roleID is None:
            # No character ID found; keep a bare '/' placeholder so the
            # later split('/') still yields one entry per role.
            roleID = u'/'
        else:
            roleID += u'/'
        newRoles.append(u'<div class="_imdbpyrole" roleid="%s">%s</div>' % (
            roleID, role.strip()
        ))
    return firstHalf + u' / '.join(newRoles) + mo.group(3)
|
|
|
|
|
|
|
|
|
|
|
2018-03-26 17:16:59 +00:00
|
|
|
|
# Matches the content of 'char' <td> cells (character/role columns), so
# that _manageRoles() can rewrite them with explicit roleid markers.
_reRolesMovie = re.compile(r'(<td class="char">)(.*?)(</td>)', re.I | re.M | re.S)
|
|
|
|
|
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
def _replaceBR(mo):
|
|
|
|
|
"""Replaces <br> tags with '::' (useful for some akas)"""
|
|
|
|
|
txt = mo.group(0)
|
|
|
|
|
return txt.replace('<br>', '::')
|
|
|
|
|
|
2018-03-26 17:16:59 +00:00
|
|
|
|
|
2014-03-10 05:18:05 +00:00
|
|
|
|
# Matches the whole 'also known as' section; its <br>-separated values
# are joined with '::' by _replaceBR() during preprocessing.
_reAkas = re.compile(r'<h5>also known as:</h5>.*?</div>', re.I | re.M | re.S)
|
|
|
|
|
|
2018-03-26 17:16:59 +00:00
|
|
|
|
|
2014-03-10 05:18:05 +00:00
|
|
|
|
def makeSplitter(lstrip=None, sep='|', comments=True,
                 origNotesSep=' (', newNotesSep='::(', strip=None):
    """Return a splitter function suitable for a given set of data.

    The returned function splits its argument on `sep`, trims every
    piece and drops the empty ones.  When `lstrip` is given, those
    characters are first removed from the left of the whole string;
    when `comments` is true, the first `origNotesSep` of each piece is
    replaced with `newNotesSep`; when `strip` is given, those characters
    are stripped from both ends of each piece.  Falsy input (None, '',
    all-whitespace) is returned unchanged after an optional strip.
    """
    def splitter(text):
        if not text:
            return text
        text = text.strip()
        if not text:
            return text
        if lstrip is not None:
            text = text.lstrip(lstrip).lstrip()
        pieces = [piece.strip() for piece in text.split(sep)]
        pieces = [piece for piece in pieces if piece]
        if comments:
            pieces = [piece.replace(origNotesSep, newNotesSep, 1)
                      for piece in pieces]
        if strip:
            pieces = [piece.strip(strip) for piece in pieces]
        return pieces
    return splitter
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _toInt(val, replace=()):
|
|
|
|
|
"""Return the value, converted to integer, or None; if present, 'replace'
|
|
|
|
|
must be a list of tuples of values to replace."""
|
|
|
|
|
for before, after in replace:
|
|
|
|
|
val = val.replace(before, after)
|
|
|
|
|
try:
|
|
|
|
|
return int(val)
|
|
|
|
|
except (TypeError, ValueError):
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
2018-03-26 17:16:59 +00:00
|
|
|
|
_re_og_title = re.compile(
|
|
|
|
|
ur'(.*) \((?:(?:(.+)(?= ))? ?(\d{4})(?:(–)(\d{4}| ))?|(.+))\)',
|
|
|
|
|
re.UNICODE
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_og_title(og_title):
    """Split an og:title meta value into title/kind/year information.

    Returns a dict with 'title' and 'kind' (and possibly 'year',
    'series years' and 'tv series title') when the value matches
    _re_og_title; an empty dict otherwise.
    """
    data = {}
    match = _re_og_title.match(og_title)
    if match:
        data['title'] = match.group(1)

        if match.group(3):
            data['year'] = int(match.group(3))

        # The kind may appear before the year (group 2) or alone in the
        # parentheses (group 6); absent both, assume a plain movie.
        kind = match.group(2) or match.group(6)
        if kind is None:
            kind = 'movie'
        else:
            kind = kind.lower()
            kind = KIND_MAP.get(kind, kind)
        data['kind'] = kind

        year_separator = match.group(4)
        # There is a year separator so assume an ongoing or ended series
        if year_separator is not None:
            end_year = match.group(5)
            if end_year is not None:
                data['series years'] = '%(year)d-%(end_year)s' % {
                    'year': data['year'],
                    'end_year': end_year.strip(),
                }
            elif kind.endswith('series'):
                # Separator but no end year: the series is still running.
                data['series years'] = '%(year)d-' % {'year': data['year']}
        # No year separator and series, so assume that it ended the same year
        elif kind.endswith('series') and 'year' in data:
            data['series years'] = '%(year)d-%(year)d' % {'year': data['year']}

        # Episode titles look like '"Series Title" Episode Title ...':
        # pull the quoted series title out into its own key.
        if data['kind'] == 'episode' and data['title'][0] == '"':
            quote_end = data['title'].find('"', 1)
            data['tv series title'] = data['title'][1:quote_end]
            data['title'] = data['title'][quote_end + 1:].strip()

    return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_certificates(certificates):
    """Turn the newline-separated text of the certificates section into
    a list of 'COUNTRY:RATING[::NOTE]' strings.

    Lines matching 'something:something' start a new certificate; any
    other non-empty line is treated as a note and appended (separated
    by '::') to the previous certificate.  Leading notes with no
    preceding certificate are dropped.
    """
    # Compile once, instead of on every reducer invocation as before.
    cert_re = re.compile(r'^(.+):(.+)$', re.UNICODE)

    def reducer(acc, el):
        if cert_re.match(el):
            # A new 'country:rating' entry.
            acc.append(el)
        elif acc:
            # A continuation line: attach it to the previous entry.
            acc[-1] = u'{}::{}'.format(
                acc[-1],
                el,
            )
        return acc

    certificates = [el.strip() for el in certificates.split('\n') if el.strip()]
    return functools.reduce(reducer, certificates, [])
|
|
|
|
|
|
|
|
|
|
|
2014-03-10 05:18:05 +00:00
|
|
|
|
class DOMHTMLMovieParser(DOMParserBase):
    """Parser for the "combined details" (and if instance.mdparse is
    True also for the "main details") page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        mparser = DOMHTMLMovieParser()
        result = mparser.parse(combined_details_html_string)
    """
    # Extracted values include Movie/Person/Company instances, so the
    # base class must post-process them (access system, mod function).
    _containsObjects = True
|
|
|
|
|
|
2018-03-26 17:16:59 +00:00
|
|
|
|
extractors = [
|
|
|
|
|
Extractor(
|
|
|
|
|
label='title',
|
|
|
|
|
path="//meta[@property='og:title']",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='title',
|
|
|
|
|
path="@content",
|
|
|
|
|
postprocess=analyze_og_title
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
# parser for misc sections like 'casting department', 'stunts', ...
|
|
|
|
|
Extractor(
|
|
|
|
|
label='glossarysections',
|
|
|
|
|
group="//h4[contains(@class, 'ipl-header__content')]",
|
|
|
|
|
group_key="./@name",
|
|
|
|
|
group_key_normalize=lambda x: x.replace('_', ' '),
|
|
|
|
|
path="../../following-sibling::table[1]//tr",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key=None,
|
|
|
|
|
multi=True,
|
|
|
|
|
path={
|
|
|
|
|
'person': ".//text()",
|
|
|
|
|
'link': "./td[1]/a[@href]/@href"
|
|
|
|
|
},
|
|
|
|
|
postprocess=lambda x: build_person(
|
|
|
|
|
x.get('person') or u'',
|
|
|
|
|
personID=analyze_imdbid(x.get('link'))
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='cast',
|
|
|
|
|
path="//table[@class='cast_list']//tr",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key="cast",
|
|
|
|
|
multi=True,
|
|
|
|
|
path={
|
|
|
|
|
'person': ".//text()",
|
|
|
|
|
'link': "td[2]/a/@href",
|
|
|
|
|
'roleID': "td[4]/div[@class='_imdbpyrole']/@roleid"
|
|
|
|
|
},
|
|
|
|
|
postprocess=lambda x: build_person(
|
|
|
|
|
x.get('person') or u'',
|
|
|
|
|
personID=analyze_imdbid(x.get('link')),
|
|
|
|
|
roleID=(x.get('roleID') or u'').split('/'))
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='myrating',
|
|
|
|
|
path="//span[@id='voteuser']",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='myrating',
|
|
|
|
|
path=".//text()"
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='plot summary',
|
|
|
|
|
path=".//td[starts-with(text(), 'Plot')]/..//p",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='plot summary',
|
|
|
|
|
path='./text()',
|
|
|
|
|
postprocess=lambda x: x.strip().rstrip('|').rstrip()
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='genres',
|
|
|
|
|
path="//td[starts-with(text(), 'Genre')]/..//li/a",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key="genres",
|
|
|
|
|
multi=True,
|
|
|
|
|
path="./text()"
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='runtimes',
|
|
|
|
|
path="//td[starts-with(text(), 'Runtime')]/..//li",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='runtimes',
|
|
|
|
|
path="./text()",
|
|
|
|
|
multi=True,
|
|
|
|
|
postprocess=lambda x: x.strip().replace(' min', '')
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='countries',
|
|
|
|
|
path="//td[starts-with(text(), 'Countr')]/..//li/a",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='countries',
|
|
|
|
|
path="./text()",
|
|
|
|
|
multi=True
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='country codes',
|
|
|
|
|
path="//td[starts-with(text(), 'Countr')]/..//li/a",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='country codes',
|
|
|
|
|
path="./@href",
|
|
|
|
|
multi=True,
|
|
|
|
|
postprocess=lambda x: x.split('/')[2].strip().lower()
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='language',
|
|
|
|
|
path="//td[starts-with(text(), 'Language')]/..//li/a",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='language',
|
|
|
|
|
path="./text()",
|
|
|
|
|
multi=True
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='language codes',
|
|
|
|
|
path="//td[starts-with(text(), 'Language')]/..//li/a",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='language codes',
|
|
|
|
|
path="./@href",
|
|
|
|
|
multi=True,
|
|
|
|
|
postprocess=lambda x: x.split('/')[2].strip()
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='color info',
|
|
|
|
|
path="//td[starts-with(text(), 'Color')]/..//li/a",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='color info',
|
|
|
|
|
path="./text()",
|
|
|
|
|
multi=True,
|
|
|
|
|
postprocess=lambda x: x.replace(' (', '::(')
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='aspect ratio',
|
|
|
|
|
path="//td[starts-with(text(), 'Aspect')]/..",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='aspect ratio',
|
|
|
|
|
path=".//li/text()",
|
|
|
|
|
postprocess=lambda x: x.strip()
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='sound mix',
|
|
|
|
|
path="//td[starts-with(text(), 'Sound Mix')]/..//li/a",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='sound mix',
|
|
|
|
|
path="./text()",
|
|
|
|
|
multi=True,
|
|
|
|
|
postprocess=lambda x: x.replace(' (', '::(')
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='certificates',
|
|
|
|
|
path=".//td[starts-with(text(), 'Certificat')]/..",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='certificates',
|
|
|
|
|
path=".//text()",
|
|
|
|
|
postprocess=analyze_certificates
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='h5sections',
|
|
|
|
|
path="//section[contains(@class, 'listo')]",
|
|
|
|
|
attrs=[
|
|
|
|
|
# Collects akas not encosed in <i> tags.
|
|
|
|
|
Attribute(
|
|
|
|
|
key='other akas',
|
|
|
|
|
path=".//td[starts-with(text(), 'Also Known As')]/..//ul//text()",
|
|
|
|
|
postprocess=makeSplitter(
|
|
|
|
|
sep='::', origNotesSep='" - ', newNotesSep='::', strip='"'
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
]
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='creator',
|
|
|
|
|
path="//td[starts-with(text(), 'Creator')]/..//a",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='creator',
|
|
|
|
|
multi=True,
|
|
|
|
|
path={
|
|
|
|
|
'name': "./text()",
|
|
|
|
|
'link': "./@href"
|
|
|
|
|
},
|
|
|
|
|
postprocess=lambda x: build_person(
|
|
|
|
|
x.get('name') or u'',
|
|
|
|
|
personID=analyze_imdbid(x.get('link'))
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='thin writer',
|
|
|
|
|
path="//div[starts-with(normalize-space(text()), 'Writer')]/ul/li[1]/a",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='thin writer',
|
|
|
|
|
multi=True,
|
|
|
|
|
path={
|
|
|
|
|
'name': "./text()",
|
|
|
|
|
'link': "./@href"
|
|
|
|
|
},
|
|
|
|
|
postprocess=lambda x: build_person(
|
|
|
|
|
x.get('name') or u'',
|
|
|
|
|
personID=analyze_imdbid(x.get('link'))
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='thin director',
|
|
|
|
|
path="//div[starts-with(normalize-space(text()), 'Director')]/ul/li[1]/a",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='thin director',
|
|
|
|
|
multi=True,
|
|
|
|
|
path={
|
|
|
|
|
'name': "./text()",
|
|
|
|
|
'link': "./@href"
|
|
|
|
|
},
|
|
|
|
|
postprocess=lambda x: build_person(
|
|
|
|
|
x.get('name') or u'',
|
|
|
|
|
personID=analyze_imdbid(x.get('link'))
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='top 250/bottom 100',
|
|
|
|
|
path="//li[@class='ipl-inline-list__item']//a[starts-with(@href, '/chart/')]",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='top/bottom rank',
|
|
|
|
|
path="./text()"
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='original air date',
|
|
|
|
|
path="//span[@imdbpy='airdate']",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='original air date',
|
|
|
|
|
path="./text()"
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='series years',
|
|
|
|
|
path="//div[@id='tn15title']//span[starts-with(text(), 'TV series')]",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='series years',
|
|
|
|
|
path="./text()",
|
|
|
|
|
postprocess=lambda x: x.replace('TV series', '').strip()
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='season/episode',
|
|
|
|
|
path="//div[@class='titlereference-overview-season-episode-section']/ul",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='season/episode',
|
|
|
|
|
path=".//text()",
|
|
|
|
|
postprocess=lambda x: x.strip()
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='number of episodes',
|
|
|
|
|
path="//a[starts-with(text(), 'All Episodes')]",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='number of episodes',
|
|
|
|
|
path="./text()",
|
|
|
|
|
postprocess=lambda x: int(x.replace('All Episodes', '').strip()[1:-1])
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='episode number',
|
|
|
|
|
path=".//div[@id='tn15epnav']",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='episode number',
|
|
|
|
|
path="./text()",
|
|
|
|
|
postprocess=lambda x: int(re.sub(r'[^a-z0-9 ]', '', x.lower())
|
|
|
|
|
.strip()
|
|
|
|
|
.split()[0])
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='previous episode',
|
|
|
|
|
path=".//span[@class='titlereference-overview-episodes-links']//a[contains(text(), 'Previous')]",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='previous episode',
|
|
|
|
|
path="./@href",
|
|
|
|
|
postprocess=lambda x: analyze_imdbid(x)
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='next episode',
|
|
|
|
|
path=".//span[@class='titlereference-overview-episodes-links']//a[contains(text(), 'Next')]",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='next episode',
|
|
|
|
|
path="./@href",
|
|
|
|
|
postprocess=lambda x: analyze_imdbid(x)
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='number of seasons',
|
|
|
|
|
path=".//span[@class='titlereference-overview-years-links']/../a[1]",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='number of seasons',
|
|
|
|
|
path="./text()",
|
|
|
|
|
postprocess=lambda x: int(x)
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='tv series link',
|
|
|
|
|
path=".//a[starts-with(text(), 'All Episodes')]",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='tv series link',
|
|
|
|
|
path="./@href"
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='akas',
|
|
|
|
|
path="//i[@class='transl']",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='akas',
|
|
|
|
|
multi=True,
|
|
|
|
|
path='text()',
|
|
|
|
|
postprocess=lambda x: x
|
|
|
|
|
.replace(' ', ' ')
|
|
|
|
|
.rstrip('-')
|
|
|
|
|
.replace('" - ', '"::', 1)
|
|
|
|
|
.strip('"')
|
|
|
|
|
.replace(' ', ' ')
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='production notes/status',
|
|
|
|
|
path="//td[starts-with(text(), 'Status:')]/..//div[@class='info-content']",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='production status',
|
|
|
|
|
path=".//text()",
|
|
|
|
|
postprocess=lambda x: x.strip().split('|')[0].strip().lower()
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='production notes/status updated',
|
|
|
|
|
path="//td[starts-with(text(), 'Status Updated:')]/..//div[@class='info-content']",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='production status updated',
|
|
|
|
|
path=".//text()",
|
|
|
|
|
postprocess=lambda x: x.strip()
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='production notes/comments',
|
|
|
|
|
path="//td[starts-with(text(), 'Comments:')]/..//div[@class='info-content']",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='production comments',
|
|
|
|
|
path=".//text()",
|
|
|
|
|
postprocess=lambda x: x.strip()
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='production notes/note',
|
|
|
|
|
path="//td[starts-with(text(), 'Note:')]/..//div[@class='info-content']",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='production note',
|
|
|
|
|
path=".//text()",
|
|
|
|
|
postprocess=lambda x: x.strip()
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='blackcatheader',
|
|
|
|
|
group="//b[@class='blackcatheader']",
|
|
|
|
|
group_key="./text()",
|
|
|
|
|
group_key_normalize=lambda x: x.lower(),
|
|
|
|
|
path="../ul/li",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key=None,
|
|
|
|
|
multi=True,
|
|
|
|
|
path={
|
|
|
|
|
'name': "./a//text()",
|
|
|
|
|
'comp-link': "./a/@href",
|
|
|
|
|
'notes': "./text()"
|
|
|
|
|
},
|
|
|
|
|
postprocess=lambda x: Company(name=x.get('name') or u'',
|
|
|
|
|
companyID=analyze_imdbid(x.get('comp-link')),
|
|
|
|
|
notes=(x.get('notes') or u'').strip())
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='rating',
|
|
|
|
|
path="(//span[@class='ipl-rating-star__rating'])[1]",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='rating',
|
|
|
|
|
path="./text()"
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='votes',
|
|
|
|
|
path="//span[@class='ipl-rating-star__total-votes'][1]",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='votes',
|
|
|
|
|
path="./text()"
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
Extractor(
|
|
|
|
|
label='cover url',
|
|
|
|
|
path="//img[@alt='Poster']",
|
|
|
|
|
attrs=Attribute(
|
|
|
|
|
key='cover url',
|
|
|
|
|
path="@src"
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
]
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
    # (pattern, replacement) pairs applied to the raw HTML before
    # parsing; patterns may be plain strings or compiled regexes.
    preprocessors = [
        # Tag the release-date link so 'original air date' can be
        # extracted via the span[@imdbpy='airdate'] xpath.
        ('/releaseinfo">', '"><span imdbpy="airdate">'),
        # Give each 'blackcatheader' section its own div, so company
        # entries can be grouped per section.
        (re.compile(r'(<b class="blackcatheader">.+?</b>)', re.I), r'</div><div>\1'),
        ('<small>Full cast and crew for<br>', ''),
        # Empty role cells become '...' (unknown character).
        ('<td> </td>', '<td>...</td>'),
        # Normalize 'TV mini-series' to 'TV series ... (mini)'.
        (re.compile(r'<span class="tv-extra">TV mini-series(\s+.*?)</span>', re.I),
         r'<span class="tv-extra">TV series\1</span> (mini)'),
        # Wrap character roles with explicit roleid markers.
        (_reRolesMovie, _manageRoles),
        # Join <br>-separated akas with '::'.
        (_reAkas, _replaceBR)
    ]
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
    def preprocess_dom(self, dom):
        """Adjust the parsed DOM tree before the extractors run.

        Prefixes series-crew glossary anchors with 'series ', and drops
        IMDbPro links plus some 'more' links that would pollute the
        extracted text.  Returns the (mutated) dom.
        """
        # Handle series information.
        xpath = self.xpath(dom, "//b[text()='Series Crew']")
        if xpath:
            b = xpath[-1]  # In doubt, take the last one.
            # Rename the glossary anchors so series crew sections get
            # distinct keys (e.g. 'series writer' vs. 'writer').
            for a in self.xpath(b, "./following::h5/a[@class='glossary']"):
                name = a.get('name')
                if name:
                    a.set('name', 'series %s' % name)
        # Remove links to IMDbPro.
        for proLink in self.xpath(dom, "//span[@class='pro-link']"):
            proLink.drop_tree()
        # Remove some 'more' links (keep others, like the one around
        # the number of votes).
        for tn15more in self.xpath(dom,
                                   "//a[@class='tn15more'][starts-with(@href, '/title/')]"):
            tn15more.drop_tree()
        return dom
|
|
|
|
|
|
|
|
|
|
    # Collapses runs of whitespace; parses 'date (season X, episode Y)'
    # strings (currently only referenced by the commented-out air-date
    # handling in postprocess_data).
    re_space = re.compile(r'\s+')
    re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I)
|
2018-03-26 17:16:59 +00:00
|
|
|
|
|
2014-03-10 05:18:05 +00:00
|
|
|
|
    def postprocess_data(self, data):
        """Normalize the raw extracted data in place and return it.

        Renames sections via _SECT_CONV, drops Person entries without an
        ID, merges/cleans akas, converts numeric fields, expands
        series/episode info and builds the 'episode of' Movie object.
        """
        # Convert section names.
        # NOTE(review): deleting keys while iterating data.keys() is
        # only safe on Python 2, where keys() returns a list.
        for sect in data.keys():
            if sect in _SECT_CONV:
                data[_SECT_CONV[sect]] = data[sect]
                del data[sect]
                sect = _SECT_CONV[sect]
        # Filter out fake values.
        for key in data:
            value = data[key]
            if isinstance(value, list) and value:
                # Drop Person entries with no personID (spurious rows).
                # NOTE(review): 'filter' returning a list assumes
                # Python 2; on Python 3 this would leave an iterator.
                if isinstance(value[0], Person):
                    data[key] = filter(lambda x: x.personID is not None, value)
                # Propagate access system / mod function to contained
                # Movie/Person/Company objects.
                if isinstance(value[0], _Container):
                    for obj in data[key]:
                        obj.accessSystem = self._as
                        obj.modFunct = self._modFunct
        # Merge 'akas' and 'other akas' into a single cleaned 'akas' list.
        if 'akas' in data or 'other akas' in data:
            akas = data.get('akas') or []
            other_akas = data.get('other akas') or []
            akas += other_akas
            nakas = []
            for aka in akas:
                aka = aka.strip()
                # Strip the dangling '" -' left by the page markup.
                if aka.endswith('" -'):
                    aka = aka[:-3].rstrip()
                nakas.append(aka)
            if 'akas' in data:
                del data['akas']
            if 'other akas' in data:
                del data['other akas']
            if nakas:
                data['akas'] = nakas
        if 'color info' in data:
            data['color info'] = [x.replace('Color:', '', 1) for x in data['color info']]
        if 'runtimes' in data:
            data['runtimes'] = [x.replace(' min', u'')
                                for x in data['runtimes']]
        if 'number of seasons' in data:
            # NOTE(review): 'unicode' is Python 2 only.
            data['seasons'] = [unicode(i) for i in range(1, data['number of seasons'] + 1)]
        # data['number of seasons'] = seasons[-1] if seasons else len(data['seasons'])
        if 'season/episode' in data:
            # Value looks like 'Season X Episode Y'.
            tokens = data['season/episode'].split('Episode')
            data['season'] = int(tokens[0].split('Season')[1])
            data['episode'] = int(tokens[1])
            del data['season/episode']
        # if 'original air date' in data:
        #     oid = self.re_space.sub(' ', data['original air date']).strip()
        #     data['original air date'] = oid
        #     aid = self.re_airdate.findall(oid)
        #     if aid and len(aid[0]) == 3:
        #         date, season, episode = aid[0]
        #         date = date.strip()
        #         try:
        #             season = int(season)
        #         except ValueError:
        #             pass
        #         try:
        #             episode = int(episode)
        #         except ValueError:
        #             pass
        #         if date and date != '????':
        #             data['original air date'] = date
        #         else:
        #             del data['original air date']
        #         # Handle also "episode 0".
        #         if season or isinstance(season, int):
        #             data['season'] = season
        #         if episode or isinstance(season, int):
        #             data['episode'] = episode
        # Fall back to the 'thin' writer/director lists when the full
        # ones are missing; either way, drop the 'thin' key.
        for k in ('writer', 'director'):
            t_k = 'thin %s' % k
            if t_k not in data:
                continue
            if k not in data:
                data[k] = data[t_k]
            del data[t_k]
        # Turn 'top/bottom rank' into an integer under the proper key.
        if 'top/bottom rank' in data:
            tbVal = data['top/bottom rank'].lower()
            if tbVal.startswith('top'):
                tbKey = 'top 250 rank'
                tbVal = _toInt(tbVal, [('top rated movies: #', '')])
            else:
                tbKey = 'bottom 100 rank'
                tbVal = _toInt(tbVal, [('bottom rated movies: #', '')])
            if tbVal:
                data[tbKey] = tbVal
            del data['top/bottom rank']
        if 'year' in data and data['year'] == '????':
            del data['year']
        # Build the 'episode of' Movie object for episode pages.
        if 'tv series link' in data:
            if 'tv series title' in data:
                data['episode of'] = Movie(title=data['tv series title'],
                                           movieID=analyze_imdbid(data['tv series link']),
                                           accessSystem=self._as,
                                           modFunct=self._modFunct)
                data['episode of']['kind'] = 'tv series'
                del data['tv series title']
            del data['tv series link']
        if 'rating' in data:
            try:
                data['rating'] = float(data['rating'].replace('/10', ''))
            except (TypeError, ValueError):
                pass
            # A zero rating means 'not rated': drop it.
            if data['rating'] == 0:
                del data['rating']
        if 'votes' in data:
            try:
                # Strip '(...)', thousands separators and the word 'votes'.
                votes = data['votes'].replace('(', '').replace(')', '').replace(',', '').replace('votes', '')
                data['votes'] = int(votes)
            except (TypeError, ValueError):
                pass
        return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _process_plotsummary(x):
|
|
|
|
|
"""Process a plot (contributed by Rdian06)."""
|
|
|
|
|
xauthor = x.get('author')
|
|
|
|
|
xplot = x.get('plot', u'').strip()
|
|
|
|
|
if xauthor:
|
|
|
|
|
xplot += u'::%s' % xauthor
|
|
|
|
|
return xplot
|
|
|
|
|
|
2018-03-26 17:16:59 +00:00
|
|
|
|
|
2014-03-10 05:18:05 +00:00
|
|
|
|
class DOMHTMLPlotParser(DOMParserBase):
    """Parser for the "plot summary" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a 'plot' key, containing a list
    of string with the structure: 'summary::summary_author <author@email>'.

    Example:
        pparser = HTMLPlotParser()
        result = pparser.parse(plot_summary_html_string)
    """
    # Plot summaries can contain references to other titles/names.
    _defGetRefs = True

    # Notice that recently IMDb started to put the email of the
    # author only in the link, that we're not collecting, here.
    extractors = [
        Extractor(
            label='plot',
            # One <li> per plot summary in the zebra-striped list.
            path="//ul[@class='zebraList']/li",
            attrs=Attribute(
                key='plot',
                multi=True,
                path={
                    'plot': './/p[@class="plotSummary"]//text()',
                    'author': './/span/em/a/text()'
                },
                # Joins plot and author as 'summary::author'.
                postprocess=_process_plotsummary)
        )
    ]
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _process_award(x):
|
|
|
|
|
award = {}
|
2014-05-29 05:40:12 +00:00
|
|
|
|
_award = x.get('award')
|
|
|
|
|
if _award is not None:
|
|
|
|
|
_award = _award.strip()
|
|
|
|
|
award['award'] = _award
|
2014-03-10 05:18:05 +00:00
|
|
|
|
if not award['award']:
|
|
|
|
|
return {}
|
|
|
|
|
award['year'] = x.get('year').strip()
|
|
|
|
|
if award['year'] and award['year'].isdigit():
|
|
|
|
|
award['year'] = int(award['year'])
|
|
|
|
|
award['result'] = x.get('result').strip()
|
|
|
|
|
category = x.get('category').strip()
|
|
|
|
|
if category:
|
|
|
|
|
award['category'] = category
|
|
|
|
|
received_with = x.get('with')
|
|
|
|
|
if received_with is not None:
|
|
|
|
|
award['with'] = received_with.strip()
|
|
|
|
|
notes = x.get('notes')
|
|
|
|
|
if notes is not None:
|
|
|
|
|
notes = notes.strip()
|
|
|
|
|
if notes:
|
|
|
|
|
award['notes'] = notes
|
|
|
|
|
award['anchor'] = x.get('anchor')
|
|
|
|
|
return award
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLAwardsParser(DOMParserBase):
    """Parser for the "awards" page of a given person or movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        awparser = DOMHTMLAwardsParser()
        result = awparser.parse(awards_html_string)
    """
    # 'title' when parsing a movie's awards, 'name' for a person's awards.
    subject = 'title'
    _containsObjects = True

    extractors = [
        Extractor(
            label='awards',
            group="//table//big",
            group_key="./a",
            path="./ancestor::tr[1]/following-sibling::tr/td[last()][not(@colspan)]",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    'year': "../td[1]/a/text()",
                    'result': "../td[2]/b/text()",
                    'award': "../td[3]/text()",
                    'category': "./text()[1]",
                    # FIXME: takes only the first co-recipient
                    'with': "./small[starts-with(text(), 'Shared with:')]/"
                            "following-sibling::a[1]/text()",
                    'notes': "./small[last()]//text()",
                    'anchor': ".//text()"
                },
                postprocess=_process_award
            )
        ),
        Extractor(
            label='recipients',
            group="//table//big",
            group_key="./a",
            path="./ancestor::tr[1]/following-sibling::tr"
                 "/td[last()]/small[1]/preceding-sibling::a",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    'name': "./text()",
                    'link': "./@href",
                    'anchor': "..//text()"
                }
            )
        )
    ]

    # Fix the malformed awards-page markup so each awarding body ends up in
    # its own well-formed table before the extractors run.
    preprocessors = [
        (re.compile('(<tr><td[^>]*>.*?</td></tr>\n\n</table>)', re.I),
         r'\1</table>'),
        (re.compile('(<tr><td[^>]*>\n\n<big>.*?</big></td></tr>)', re.I),
         r'</table><table class="_imdbpy">\1'),
        (re.compile('(<table[^>]*>\n\n)</table>(<table)', re.I), r'\1\2'),
        (re.compile('(<small>.*?)<br>(.*?</small)', re.I), r'\1 \2'),
        (re.compile('(</tr>\n\n)(<td)', re.I), r'\1<tr>\2')
    ]

    def preprocess_dom(self, dom):
        """Repeat td elements according to their rowspan attributes
        in subsequent tr elements.
        """
        cols = self.xpath(dom, "//td[@rowspan]")
        for col in cols:
            span = int(col.get('rowspan'))
            del col.attrib['rowspan']
            position = len(self.xpath(col, "./preceding-sibling::td"))
            row = col.getparent()
            for tr in self.xpath(row, "./following-sibling::tr")[:span - 1]:
                # if not cloned, child will be moved to new parent
                clone = self.clone(col)
                # XXX: beware that here we don't use an "adapted" function,
                # because both BeautifulSoup and lxml uses the same
                # "insert" method.
                tr.insert(position, clone)
        return dom

    def postprocess_data(self, data):
        """Attach the assigner and the recipients to every award entry."""
        if len(data) == 0:
            return {}
        nd = []
        for key in data.keys():
            dom = self.get_dom(key)
            assigner = self.xpath(dom, "//a/text()")[0]
            for entry in data[key]:
                # use 'in' instead of the Python-2-only dict.has_key,
                # consistent with the membership test a few lines below
                if 'name' not in entry:
                    if not entry:
                        continue
                    # this is an award, not a recipient
                    entry['assigner'] = assigner.strip()
                    # find the recipients: they share the same anchor text
                    matches = [p for p in data[key]
                               if 'name' in p and (entry['anchor'] == p['anchor'])]
                    if self.subject == 'title':
                        recipients = [
                            Person(name=recipient['name'],
                                   personID=analyze_imdbid(recipient['link']))
                            for recipient in matches
                        ]
                        entry['to'] = recipients
                    elif self.subject == 'name':
                        recipients = [
                            Movie(title=recipient['name'],
                                  movieID=analyze_imdbid(recipient['link']))
                            for recipient in matches
                        ]
                        entry['for'] = recipients
                    nd.append(entry)
                del entry['anchor']
        return {'awards': nd}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLTaglinesParser(DOMParserBase):
    """Parser for the "taglines" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTaglinesParser()
        result = tparser.parse(taglines_html_string)
    """
    # Each child div of #taglines_content (after preprocessing) is one tagline.
    extractors = [
        Extractor(
            label='taglines',
            path="//div[@id='taglines_content']/div",
            attrs=Attribute(
                key='taglines',
                multi=True,
                path=".//text()"
            )
        )
    ]

    def preprocess_dom(self, dom):
        # Drop the section header and the "no content" placeholder so only
        # actual tagline divs are left for the extractor above.
        for node in self.xpath(dom, "//div[@id='taglines_content']/div[@class='header']"):
            node.drop_tree()
        for node in self.xpath(dom, "//div[@id='taglines_content']/div[@id='no_content']"):
            node.drop_tree()
        return dom

    def postprocess_data(self, data):
        # Strip surrounding whitespace from every collected tagline.
        if 'taglines' in data:
            data['taglines'] = [tagline.strip() for tagline in data['taglines']]
        return data
|
|
|
|
|
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
class DOMHTMLKeywordsParser(DOMParserBase):
    """Parser for the "keywords" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        kwparser = DOMHTMLKeywordsParser()
        result = kwparser.parse(keywords_html_string)
    """
    extractors = [
        Extractor(
            label='keywords',
            path="//a[starts-with(@href, '/keyword/')]",
            attrs=Attribute(
                key='keywords',
                path="./text()", multi=True,
                # normalize to IMDb's canonical lowercase, dash-separated form
                postprocess=lambda x: x.lower().replace(' ', '-')
            )
        )
    ]
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLAlternateVersionsParser(DOMParserBase):
    """Parser for the "alternate versions" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        avparser = HTMLAlternateVersionsParser()
        result = avparser.parse(alternateversions_html_string)
    """
    # collect references (titles/names) found in the text — presumably
    # handled by DOMParserBase; TODO confirm
    _defGetRefs = True

    extractors = [
        Extractor(
            label='alternate versions',
            # each <li> in the trivia-styled list is one alternate-version note
            path="//ul[@class='trivia']/li",
            attrs=Attribute(
                key='alternate versions',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip()
            )
        )
    ]
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLTriviaParser(DOMParserBase):
    """Parser for the "trivia" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTriviaParser()
        result = tparser.parse(trivia_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(
            # label fixed: it was 'alternate versions', copy-pasted from
            # DOMHTMLAlternateVersionsParser (labels are informational only)
            label='trivia',
            path="//div[@class='sodatext']",
            attrs=Attribute(
                key='trivia',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip()
            )
        )
    ]

    def preprocess_dom(self, dom):
        # Remove "link this quote" links.
        for qLink in self.xpath(dom, "//span[@class='linksoda']"):
            qLink.drop_tree()
        return dom
|
|
|
|
|
|
|
|
|
|
|
2017-01-27 14:25:20 +00:00
|
|
|
|
class DOMHTMLSoundtrackParser(DOMParserBase):
    """Parser for the "soundtrack" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary with a 'soundtrack' key: a list of dictionaries, one per
    song, mapping the song title to its credits ('by', 'with', ...).
    """
    _defGetRefs = True

    # turn <br> tags into newlines: each credit line of a song becomes
    # one line of text, parsed by postprocess_data below
    preprocessors = [('<br />', '\n'), ('<br>', '\n')]

    extractors = [
        Extractor(
            label='soundtrack',
            path="//div[@class='list']//div",
            attrs=Attribute(
                key='soundtrack',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip()
            )
        )
    ]

    def postprocess_data(self, data):
        """Split each raw soundtrack blurb into a title and its credits."""
        if 'soundtrack' in data:
            nd = []
            for x in data['soundtrack']:
                ds = x.split('\n')
                title = ds[0]
                # guard against an empty first line: title[0] used to raise
                # IndexError on malformed entries
                if title and title[0] == '"' and title[-1] == '"':
                    title = title[1:-1]
                nds = []
                newData = {}
                # glue continuation lines to the previous credit line:
                # a real credit line contains one of the separator words
                for line in ds[1:]:
                    if ' with ' in line or ' by ' in line or ' from ' in line \
                            or ' of ' in line or line.startswith('From '):
                        nds.append(line)
                    else:
                        if nds:
                            nds[-1] += line
                        else:
                            nds.append(line)
                newData[title] = {}
                for line in nds:
                    skip = False
                    # 'From ' at the start of the line: the whole rest is info
                    for sep in ('From ',):
                        if line.startswith(sep):
                            fdix = len(sep)
                            kind = line[:fdix].rstrip().lower()
                            info = line[fdix:].lstrip()
                            newData[title][kind] = info
                            skip = True
                    if not skip:
                        # otherwise split on the first separator found;
                        # the text before it (e.g. "Performed by") is the kind
                        for sep in ' with ', ' by ', ' from ', ' of ':
                            fdix = line.find(sep)
                            if fdix != -1:
                                fdix = fdix + len(sep)
                                kind = line[:fdix].rstrip().lower()
                                info = line[fdix:].lstrip()
                                newData[title][kind] = info
                                break
                nd.append(newData)
            data['soundtrack'] = nd
        return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLCrazyCreditsParser(DOMParserBase):
    """Parser for the "crazy credits" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        ccparser = DOMHTMLCrazyCreditsParser()
        result = ccparser.parse(crazycredits_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(
            label='crazy credits',
            path="//ul/li/tt",
            attrs=Attribute(
                key='crazy credits',
                multi=True,
                path=".//text()",
                # collapse newlines to spaces, then collapse the doubled
                # spaces this can create; the second replace used to be the
                # no-op replace(' ', ' ')
                postprocess=lambda x: x.replace('\n', ' ').replace('  ', ' ')
            )
        )
    ]
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
2014-05-29 05:40:12 +00:00
|
|
|
|
def _process_goof(x):
|
|
|
|
|
if x['spoiler_category']:
|
|
|
|
|
return x['spoiler_category'].strip() + ': SPOILER: ' + x['text'].strip()
|
|
|
|
|
else:
|
|
|
|
|
return x['category'].strip() + ': ' + x['text'].strip()
|
|
|
|
|
|
|
|
|
|
|
2014-03-10 05:18:05 +00:00
|
|
|
|
class DOMHTMLGoofsParser(DOMParserBase):
    """Parser for the "goofs" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLGoofsParser()
        result = gparser.parse(goofs_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(
            label='goofs',
            path="//div[@class='soda odd']",
            attrs=Attribute(
                key='goofs',
                multi=True,
                path={
                    'text': "./text()",
                    # category comes from the nearest preceding section header
                    'category': './preceding-sibling::h4[1]/text()',
                    # spoiler entries carry their own inline h4
                    'spoiler_category': './h4/text()'
                },
                # formats each entry as 'category: text' (see _process_goof)
                postprocess=_process_goof
            )
        )
    ]
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLQuotesParser(DOMParserBase):
    """Parser for the "memorable quotes" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLQuotesParser()
        result = qparser.parse(quotes_html_string)
    """
    _defGetRefs = True

    # Quotes alternate between 'odd' and 'even' styled divs; both extractors
    # apply the same text normalization, turning line breaks into the '::'
    # separator used by postprocess_data to split quote lines.
    extractors = [
        Extractor(
            label='quotes_odd',
            path="//div[@class='quote soda odd']",
            attrs=Attribute(
                key='quotes_odd',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x
                .strip()
                .replace(' \n', '::')
                .replace('::\n', '::')
                .replace('\n', ' ')
            )
        ),
        Extractor(
            label='quotes_even',
            path="//div[@class='quote soda even']",
            attrs=Attribute(
                key='quotes_even',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x
                .strip()
                .replace(' \n', '::')
                .replace('::\n', '::')
                .replace('\n', ' ')
            )
        )
    ]

    # strip the "Hide options" widget before parsing
    preprocessors = [
        (re.compile('<a href="#" class="hidesoda hidden">Hide options</a><br>', re.I), '')
    ]

    def preprocess_dom(self, dom):
        # Remove "link this quote" links.
        for qLink in self.xpath(dom, "//span[@class='linksoda']"):
            qLink.drop_tree()
        for qLink in self.xpath(dom, "//div[@class='sharesoda_pre']"):
            qLink.drop_tree()
        return dom

    def postprocess_data(self, data):
        # Merge odd and even quotes and split each one on '::' into its lines.
        quotes = data.get('quotes_odd', []) + data.get('quotes_even', [])
        if not quotes:
            return {}
        quotes = [q.split('::') for q in quotes]
        return {'quotes': quotes}
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLReleaseinfoParser(DOMParserBase):
    """Parser for the "release dates" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rdparser = DOMHTMLReleaseinfoParser()
        result = rdparser.parse(releaseinfo_html_string)
    """
    extractors = [
        Extractor(
            label='release dates',
            path="//table[@id='release_dates']//tr",
            attrs=Attribute(
                key='release dates',
                multi=True,
                path={
                    'country': ".//td[1]//text()",
                    'date': ".//td[2]//text()",
                    'notes': ".//td[3]//text()"
                }
            )
        ),
        Extractor(
            label='akas',
            path="//table[@id='akas']//tr",
            attrs=Attribute(
                key='akas',
                multi=True,
                path={
                    'title': "./td[1]/text()",
                    'countries': "./td[2]/text()"}
            )
        )
    ]

    # wrap the akas section in a recognizable div
    preprocessors = [
        (re.compile('(<h5><a name="?akas"?.*</table>)', re.I | re.M | re.S),
         r'<div class="_imdbpy_akas">\1</div>')
    ]

    def postprocess_data(self, data):
        """Flatten release dates to 'country::date[notes]' strings and akas
        to 'title::country' strings."""
        if not ('release dates' in data or 'akas' in data):
            return data
        releases = data.get('release dates') or []
        rl = []
        for i in releases:
            country = i.get('country')
            date = i.get('date')
            if not (country and date):
                continue
            country = country.strip()
            date = date.strip()
            if not (country and date):
                continue
            notes = i['notes']
            info = u'%s::%s' % (country, date)
            if notes:
                info += notes
            rl.append(info)
        if releases:
            del data['release dates']
        if rl:
            data['release dates'] = rl
        akas = data.get('akas') or []
        nakas = []
        for aka in akas:
            title = (aka.get('title') or '').strip()
            if not title:
                continue
            countries = (aka.get('countries') or '').split(',')
            # NOTE(review): ''.split(',') returns [''], so the branch below is
            # never taken and a missing country yields 'title::' — confirm
            # whether that trailing separator is intended
            if not countries:
                nakas.append(title)
            else:
                for country in countries:
                    nakas.append('%s::%s' % (title, country.strip()))
        if akas:
            del data['akas']
        if nakas:
            data['akas from release info'] = nakas
        return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLRatingsParser(DOMParserBase):
    """Parser for the "user ratings" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rparser = DOMHTMLRatingsParser()
        result = rparser.parse(userratings_html_string)
    """
    # extracts "mean = X.Y. median = Z" from the page text
    # NOTE(review): should be a raw string (r'...') to silence escape warnings
    re_means = re.compile('mean\s*=\s*([0-9]\.[0-9])\.\s*median\s*=\s*([0-9])', re.I)

    extractors = [
        Extractor(
            label='number of votes',
            path="//td[b='Percentage']/../../tr",
            attrs=[
                Attribute(
                    key='votes',
                    multi=True,
                    path={
                        'votes': "td[1]//text()",
                        'ordinal': "td[3]//text()"
                    }
                )
            ]
        ),
        Extractor(
            label='mean and median',
            path="//p[starts-with(text(), 'Arithmetic mean')]",
            attrs=Attribute(
                key='mean and median',
                path="text()"
            )
        ),
        Extractor(
            label='rating',
            path="//a[starts-with(@href, '/search/title?user_rating=')]",
            attrs=Attribute(
                key='rating',
                path="text()"
            )
        ),
        Extractor(
            label='demographic voters',
            path="//td[b='Average']/../../tr",
            attrs=Attribute(
                key='demographic voters',
                multi=True,
                path={
                    'voters': "td[1]//text()",
                    'votes': "td[2]//text()",
                    'average': "td[3]//text()"
                }
            )
        ),
        Extractor(
            label='top 250',
            path="//a[text()='top 250']",
            attrs=Attribute(
                key='top 250',
                path="./preceding-sibling::text()[1]"
            )
        )
    ]

    def postprocess_data(self, data):
        """Convert the extracted strings to numbers and reorganize them
        under the final result keys."""
        nd = {}
        votes = data.get('votes', [])
        if votes:
            nd['number of votes'] = {}
            # rows 1..10 are the per-rating vote counts (row 0 is the header);
            # assumes the table always has 10 rating rows — TODO confirm
            for i in xrange(1, 11):
                _ordinal = int(votes[i]['ordinal'])
                _strvts = votes[i]['votes'] or '0'
                nd['number of votes'][_ordinal] = int(_strvts.replace(',', ''))
        mean = data.get('mean and median', '')
        if mean:
            means = self.re_means.findall(mean)
            if means and len(means[0]) == 2:
                am, med = means[0]
                try:
                    am = float(am)
                except (ValueError, OverflowError):
                    pass
                if isinstance(am, float):
                    nd['arithmetic mean'] = am
                try:
                    med = int(med)
                except (ValueError, OverflowError):
                    pass
                if isinstance(med, int):
                    nd['median'] = med
        if 'rating' in data:
            nd['rating'] = float(data['rating'])
        dem_voters = data.get('demographic voters')
        if dem_voters:
            nd['demographic'] = {}
            # skip the header row; each row maps a voter group to
            # (number of votes, average rating)
            for i in xrange(1, len(dem_voters)):
                if (dem_voters[i]['votes'] is not None) and (dem_voters[i]['votes'].strip()):
                    nd['demographic'][dem_voters[i]['voters'].strip().lower()] = \
                        (int(dem_voters[i]['votes'].replace(',', '')),
                         float(dem_voters[i]['average']))
        if 'imdb users' in nd.get('demographic', {}):
            # the 'imdb users' group is the site-wide total
            nd['votes'] = nd['demographic']['imdb users'][0]
            nd['demographic']['all votes'] = nd['demographic']['imdb users']
            del nd['demographic']['imdb users']
        top250 = data.get('top 250')
        if top250:
            # the rank follows a fixed 9-character prefix in the text node
            sd = top250[9:]
            i = sd.find(' ')
            if i != -1:
                sd = sd[:i]
            try:
                sd = int(sd)
            except (ValueError, OverflowError):
                pass
            if isinstance(sd, int):
                nd['top 250 rank'] = sd
        return nd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLEpisodesRatings(DOMParserBase):
    """Parser for the "episode ratings ... by date" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        erparser = DOMHTMLEpisodesRatings()
        result = erparser.parse(eprating_html_string)
    """
    _containsObjects = True

    extractors = [
        Extractor(
            label='title',
            path="//title",
            attrs=Attribute(
                key='title',
                path="./text()"
            )
        ),
        Extractor(
            label='ep ratings',
            path="//th/../..//tr",
            attrs=Attribute(
                key='episodes',
                multi=True,
                path={
                    'nr': ".//td[1]/text()",
                    'ep title': ".//td[2]//text()",
                    'movieID': ".//td[2]/a/@href",
                    'rating': ".//td[3]/text()",
                    'votes': ".//td[4]/text()"
                }
            )
        )
    ]

    def postprocess_data(self, data):
        """Build a Movie object for every rated episode."""
        if 'title' not in data or 'episodes' not in data:
            return {}
        nd = []
        title = data['title']
        for i in data['episodes']:
            ept = i['ep title']
            movieID = analyze_imdbid(i['movieID'])
            votes = i['votes']
            rating = i['rating']
            if not (ept and movieID and votes and rating):
                continue
            # narrowed from bare 'except:' for consistency with
            # DOMHTMLRatingsParser (bare except also swallows
            # KeyboardInterrupt/SystemExit)
            try:
                votes = int(votes.replace(',', '').replace('.', ''))
            except (ValueError, OverflowError):
                pass
            try:
                rating = float(rating)
            except (ValueError, OverflowError):
                pass
            # episode titles use the "series {episode (#nr)}" notation
            ept = ept.strip()
            ept = u'%s {%s' % (title, ept)
            nr = i['nr']
            if nr:
                ept += u' (#%s)' % nr.strip()
            ept += '}'
            if movieID is not None:
                movieID = str(movieID)
            m = Movie(title=ept, movieID=movieID, accessSystem=self._as,
                      modFunct=self._modFunct)
            epofdict = m.get('episode of')
            if epofdict is not None:
                m['episode of'] = Movie(data=epofdict, accessSystem=self._as,
                                        modFunct=self._modFunct)
            nd.append({'episode': m, 'votes': votes, 'rating': rating})
        return {'episodes rating': nd}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _normalize_href(href):
    """Make a site-relative href absolute, prefixing it with imdbURL_base."""
    if href is None:
        return None
    if not href.lower().startswith('http://'):
        if href.startswith('/'):
            href = href[1:]
        # TODO: imdbURL_base may be set by the user!
        href = '%s%s' % (imdbURL_base, href)
    return href
|
|
|
|
|
|
2018-03-26 17:16:59 +00:00
|
|
|
|
|
2014-05-29 05:40:12 +00:00
|
|
|
|
class DOMHTMLCriticReviewsParser(DOMParserBase):
    """Parser for the "critic reviews" pages of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        crparser = DOMHTMLCriticReviewsParser()
        result = crparser.parse(criticreviews_html_string)
    """
    kind = 'critic reviews'

    extractors = [
        Extractor(
            label='metascore',
            path="//div[@class='metascore_wrap']/div/span",
            attrs=Attribute(
                key='metascore',
                path=".//text()"
            )
        ),
        Extractor(
            label='metacritic url',
            # the "See more" link pointing to the full Metacritic page
            path="//div[@class='article']/div[@class='see-more']/a",
            attrs=Attribute(
                key='metacritic url',
                path="./@href"
            )
        )
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLReviewsParser(DOMParserBase):
    """Parser for the "reviews" pages of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rvparser = DOMHTMLReviewsParser()
        result = rvparser.parse(reviews_html_string)
    """
    kind = 'reviews'

    extractors = [
        Extractor(
            label='review',
            path="//div[@class='review-container']",
            attrs=Attribute(
                key='self.kind',
                multi=True,
                path={
                    'text': ".//div[@class='text']//text()",
                    'helpful': ".//div[@class='text-muted']/text()[1]",
                    'title': ".//div[@class='title']//text()",
                    'author': ".//span[@class='display-name-link']/a/@href",
                    'date': ".//span[@class='review-date']//text()",
                    'rating': ".//span[@class='point-scale']/preceding-sibling::span[1]/text()"
                },
                postprocess=lambda x: ({
                    # collapse newlines, then the doubled spaces they create;
                    # the second replace used to be the no-op replace(u' ', u' ')
                    'content': (x['text'] or u'').replace(u"\n", u" ").replace(u'  ', u' ').strip(),
                    'helpful': [int(s) for s in (x.get('helpful') or u'').split() if s.isdigit()],
                    'title': (x.get('title') or u'').strip(),
                    'author': analyze_imdbid(x.get('author')),
                    'date': (x.get('date') or u'').strip(),
                    'rating': (x.get('rating') or u'').strip()
                })
            )
        )
    ]

    preprocessors = [('<br>', '<br>\n')]

    def postprocess_data(self, data):
        """Convert rating/helpful fields to numbers and build the author ID."""
        for review in data.get('reviews', []):
            if review.get('rating') and len(review['rating']) == 2:
                review['rating'] = int(review['rating'][0])
            else:
                review['rating'] = None

            # 'helpful' holds [found-helpful, total] when both were extracted
            if review.get('helpful') and len(review['helpful']) == 2:
                review['not_helpful'] = review['helpful'][1] - review['helpful'][0]
                review['helpful'] = review['helpful'][0]
            else:
                review['helpful'] = 0
                review['not_helpful'] = 0

            # only build the "ur..." person ID when an author was actually
            # extracted; previously a missing author produced u"urNone"
            if review['author'] is not None:
                review['author'] = u"ur%s" % review['author']

        return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLFullCreditsParser(DOMParserBase):
    """Parser for the "full credits" (series cast section) page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        fcparser = DOMHTMLFullCreditsParser()
        result = fcparser.parse(fullcredits_html_string)
    """
    kind = 'full credits'

    extractors = [
        Extractor(
            label='cast',
            path="//table[@class='cast_list']//tr[@class='odd' or @class='even']",
            attrs=Attribute(
                key="cast",
                multi=True,
                path={
                    'person': ".//text()",
                    'link': "td[2]/a/@href",
                    # role IDs injected by the _manageRoles preprocessor below
                    'roleID': "td[4]//div[@class='_imdbpyrole']/@roleid"
                },
                # build a Person object with name, personID and role(s)
                postprocess=lambda x: build_person(
                    x.get('person') or u'',
                    personID=analyze_imdbid(x.get('link')),
                    roleID=(x.get('roleID') or u'').split('/')
                )
            )
        ),
    ]

    # rewrite character links into parseable _imdbpyrole divs
    preprocessors = [
        (_reRolesMovie, _manageRoles)
    ]
|
|
|
|
|
|
2016-01-14 08:54:24 +00:00
|
|
|
|
|
2014-03-10 05:18:05 +00:00
|
|
|
|
class DOMHTMLOfficialsitesParser(DOMParserBase):
    """Parser for the "official sites", "external reviews", "newsgroup
    reviews", "miscellaneous links", "sound clips", "video clips" and
    "photographs" pages of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        osparser = DOMHTMLOfficialsitesParser()
        result = osparser.parse(officialsites_html_string)
    """
    # Subclasses/instances override `kind` to label the resulting key.
    kind = 'official sites'

    extractors = [
        Extractor(
            label='site',
            path="//ol/li/a",
            # The literal string 'self.kind' is resolved by the extractor
            # machinery against the instance's `kind` attribute.
            attrs=Attribute(
                key='self.kind',
                multi=True,
                path={
                    'link': "./@href",
                    'info': "./text()"
                },
                # Each entry becomes a (label, unquoted absolute URL) pair.
                postprocess=lambda x: (
                    x.get('info').strip(),
                    urllib.unquote(_normalize_href(x.get('link')))
                )
            )
        )
    ]
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLConnectionParser(DOMParserBase):
    """Parser for the "connections" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        connparser = DOMHTMLConnectionParser()
        result = connparser.parse(connections_html_string)
    """
    # The result contains Movie objects.
    _containsObjects = True

    # Sections are grouped by their <h5> heading (lower-cased).
    extractors = [
        Extractor(
            label='connection',
            group="//div[@class='_imdbpy']",
            group_key="./h5/text()",
            group_key_normalize=lambda x: x.lower(),
            path="./a",
            attrs=Attribute(
                key=None,
                path={
                    'title': "./text()",
                    'movieID': "./@href"
                },
                multi=True
            )
        )
    ]

    # Wrap each <h5> section in a _imdbpy div so grouping works.
    preprocessors = [
        ('<h5>', '</div><div class="_imdbpy"><h5>'),
        # To get the movie's year.
        ('</a> (', ' ('),
        ('\n<br/>', '</a>'),
        # '::' separates the title from its per-connection notes.
        ('<br/> - ', '::')
    ]

    def postprocess_data(self, data):
        """Turn each raw {'title', 'movieID'} dict into a Movie object."""
        for key in data.keys():
            nl = []
            for v in data[key]:
                title = v['title']
                # Split "title::notes" produced by the preprocessor above.
                ts = title.split('::', 1)
                title = ts[0].strip()
                notes = u''
                if len(ts) == 2:
                    notes = ts[1].strip()
                m = Movie(title=title, movieID=analyze_imdbid(v['movieID']),
                          accessSystem=self._as, notes=notes, modFunct=self._modFunct)
                nl.append(m)
            data[key] = nl
        if not data:
            return {}
        return {'connections': data}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLLocationsParser(DOMParserBase):
    """Parser for the "locations" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        lparser = DOMHTMLLocationsParser()
        result = lparser.parse(locations_html_string)
    """
    extractors = [
        Extractor(
            label='locations',
            path="//dt",
            attrs=Attribute(
                key='locations',
                multi=True,
                path={
                    'place': ".//text()",
                    # The note is the <dd> immediately following the <dt>.
                    'note': "./following-sibling::dd[1]//text()"
                },
                # "place::note"; the outer strip(':') removes the separator
                # when the note is empty.
                postprocess=lambda x: (
                    u'%s::%s' % (
                        x['place'].strip(),
                        (x['note'] or u'').strip()
                    )
                ).strip(':')
            )
        )
    ]
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLTechParser(DOMParserBase):
    """Parser for the "technical", "publicity" (for people) and "contacts" (for people)
    pages of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = HTMLTechParser()
        result = tparser.parse(technical_html_string)
    """
    # 'tech', 'publicity' or 'contacts', depending on the page.
    kind = 'tech'
    # Used to collapse runs of whitespace in the extracted values.
    re_space = re.compile(r'\s+')

    # One entry per label cell; values are ':::'-separated by the
    # preprocessors below.
    extractors = [
        Extractor(
            label='tech',
            group="//table//tr/td[@class='label']",
            group_key="./text()",
            group_key_normalize=lambda x: x.lower().strip(),
            path=".",
            attrs=Attribute(
                key=None,
                path="..//td[2]//text()",
                postprocess=lambda x: [t.strip() for t in x.split(':::') if t.strip()]
            )
        )
    ]

    preprocessors = [
        (re.compile('(<h5>.*?</h5>)', re.I), r'</div>\1<div class="_imdbpy">'),
        (re.compile('((<br/>|</p>|</table>))\n?<br/>(?!<a)', re.I), r'\1</div>'),
        # the ones below are for the publicity parser
        (re.compile('<p>(.*?)</p>', re.I), r'\1<br/>'),
        (re.compile('(</td><td valign="top">)', re.I), r'\1::'),
        (re.compile('(</tr><tr>)', re.I), r'\n\1'),
        (re.compile('<span class="ghost">\|</span>', re.I), r':::'),
        (re.compile('<br/?>', re.I), r':::')
        # this is for splitting individual entries
    ]

    def postprocess_data(self, data):
        """Clean up extracted values and normalize a few section keys."""
        for key in data:
            # Drop '|' separators and collapse internal whitespace.
            data[key] = filter(lambda x: x != '|', data[key])
            data[key] = [self.re_space.sub(' ', x).strip() for x in data[key]]
            data[key] = filter(None, data[key])
        if self.kind == 'contacts' and data:
            data = {self.kind: data}
        else:
            if self.kind == 'publicity':
                if 'biography (print)' in data:
                    data['biography-print'] = data['biography (print)']
                    del data['biography (print)']
            # Tech info: fold keys with trailing qualifiers (e.g.
            # 'film length (metric)') into the canonical key.  The exact-key
            # guard matters: without it, a key already equal to the canonical
            # name was assigned to itself and then deleted, losing the data.
            for key in data.keys():
                if key.startswith('film negative format'):
                    if key != 'film negative format':
                        data['film negative format'] = data[key]
                        del data[key]
                elif key.startswith('film length'):
                    if key != 'film length':
                        data['film length'] = data[key]
                        del data[key]
        return data
|
|
|
|
|
|
|
|
|
|
|
2018-03-26 17:16:59 +00:00
|
|
|
|
class DOMHTMLBusinessParser(DOMParserBase):
    """Parser for the "business" and "literature" pages of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLBusinessParser()
        result = bparser.parse(business_html_string)
    """
    # Used to collapse whitespace runs (kept for consistency with
    # DOMHTMLTechParser; not referenced inside this class body).
    re_space = re.compile(r'\s+')

    # Each <h5> heading becomes a key; its following div is the value.
    extractors = [
        Extractor(
            label='business',
            path="//div[@id='tn15content']//h5",
            attrs=Attribute(
                key='./text()',
                path="./following-sibling::div[1]//text()"
            )
        )
    ]

    # Wrap every section in a div and mark entry boundaries with ':::'.
    preprocessors = [
        ('</h5>', '</h5><div class="_imdbpy">'),
        ('<div id="tn15content">', '<div id="tn15content"><div>'),
        ('<h5>', '</div><h5>'),
        ('<hr/><h3>', '</div><hr/><h3>'),
        ('<br/>', ':::')
    ]

    def postprocess_data(self, data):
        """Split each section on ':::', strip and drop empty entries."""
        newData = {}
        for key, value in data.iteritems():
            value = value.strip().split(':::')
            value = [v.strip() for v in value]
            value = filter(None, value)
            if not value:
                continue
            newData[key.lower().strip()] = value
        return newData
|
|
|
|
|
|
|
|
|
|
|
2014-03-10 05:18:05 +00:00
|
|
|
|
class DOMHTMLRecParser(DOMParserBase):
    """Parser for the "recommendations" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rparser = HTMLRecParser()
        result = rparser.parse(recommendations_html_string)
    """
    # The result contains Movie objects.
    _containsObjects = True

    extractors = [
        Extractor(
            label='recommendations',
            path="//td[@valign='middle'][1]",
            attrs=Attribute(
                key='../../tr/td[1]//text()',
                multi=True,
                path={
                    'title': ".//text()",
                    'movieID': ".//a/@href"
                }
            )
        )
    ]

    def postprocess_data(self, data):
        """Rename the section keys and build Movie objects."""
        for key in data.keys():
            n_key = key
            n_keyl = n_key.lower()
            if n_keyl == 'suggested by the database':
                n_key = 'database'
            elif n_keyl == 'imdb users recommend':
                n_key = 'users'
            # NOTE(review): when the heading matches neither known title,
            # n_key == key, so the freshly built list is immediately deleted
            # below and the section is dropped — confirm this is intended.
            data[n_key] = [
                Movie(title=x['title'], movieID=analyze_imdbid(x['movieID']),
                      accessSystem=self._as, modFunct=self._modFunct)
                for x in data[key]
            ]
            del data[key]
        if data:
            return {'recommendations': data}
        return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLNewsParser(DOMParserBase):
    """Parser for the "news" page of a given movie or person.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        nwparser = DOMHTMLNewsParser()
        result = nwparser.parse(news_html_string)
    """
    # Resolve titles/names in the news text into references.
    _defGetRefs = True

    extractors = [
        Extractor(
            label='news',
            path="//h2",
            attrs=Attribute(
                key='news',
                multi=True,
                path={
                    'title': "./text()",
                    'fromdate': "../following-sibling::p[1]/small//text()",
                    # FIXME: sometimes (see The Matrix (1999)) <p> is found
                    # inside news text.
                    'body': "../following-sibling::p[2]//text()",
                    'link': "../..//a[text()='Permalink']/@href",
                    'fulllink': "../..//a[starts-with(text(), 'See full article at')]/@href"
                },
                # 'fromdate' looks like "date | From source"; split on '|'.
                postprocess=lambda x: {
                    'title': x.get('title').strip(),
                    'date': x.get('fromdate').split('|')[0].strip(),
                    'from': x.get('fromdate').split('|')[1].replace('From ', '').strip(),
                    'body': (x.get('body') or u'').strip(),
                    'link': _normalize_href(x.get('link')),
                    'full article link': _normalize_href(x.get('fulllink'))
                }
            )
        )
    ]

    preprocessors = [
        (re.compile('(<a name=[^>]+><h2>)', re.I), r'<div class="_imdbpy">\1'),
        (re.compile('(<hr/>)', re.I), r'</div>\1'),
        (re.compile('<p></p>', re.I), r'')
    ]

    def postprocess_data(self, data):
        """Drop 'full article link' entries that resolved to None."""
        if not data.has_key('news'):
            return {}
        for news in data['news']:
            if news.has_key('full article link'):
                if news['full article link'] is None:
                    del news['full article link']
        return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_review(x):
    """Build a review dictionary out of the raw fields extracted from a
    review block.

    `x` is a dict with the scraped strings: 'title', 'link', 'kind',
    'review' and, optionally, 'author' and 'item'.  Returns a dict with
    'title', 'link', 'review kind', 'review' and possibly 'review author'.
    """
    result = {}
    title = x.get('title').strip()
    # endswith() is safe on an empty string; the previous title[-1]
    # indexing raised IndexError when the title was empty.
    if title.endswith(':'):
        title = title[:-1]
    result['title'] = title
    result['link'] = _normalize_href(x.get('link'))
    kind = x.get('kind').strip()
    if kind.endswith(':'):
        kind = kind[:-1]
    result['review kind'] = kind
    # Paragraph breaks become '||' markers, then single newlines.
    text = x.get('review').replace('\n\n', '||').replace('\n', ' ').split('||')
    review = '\n'.join(text)
    if x.get('author') is not None:
        author = x.get('author').strip()
        # Everything from the author signature onwards is not review text.
        review = review.split(author)[0].strip()
        # Strip the leading "- " (or similar 2-char prefix) from the name.
        result['review author'] = author[2:]
    if x.get('item') is not None:
        item = x.get('item').strip()
        review = review[len(item):].strip()
        review = "%s: %s" % (item, review)
    result['review'] = review
    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLSeasonEpisodesParser(DOMParserBase):
    """Parser for the "episode list" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLSeasonEpisodesParser()
        result = sparser.parse(episodes_html_string)
    """

    extractors = [
        Extractor(
            label='series link',
            path="//div[@class='parent']",
            attrs=[
                Attribute(
                    key='series link',
                    path=".//a/@href"
                )
            ]
        ),

        Extractor(
            label='series title',
            path="//head/meta[@property='og:title']",
            attrs=[
                Attribute(
                    key='series title',
                    path="./@content"
                )
            ]
        ),

        Extractor(
            label='seasons list',
            path="//select[@id='bySeason']//option",
            attrs=[
                Attribute(
                    key='_seasons',
                    multi=True,
                    path="./@value"
                )
            ]
        ),

        Extractor(
            label='selected season',
            path="//select[@id='bySeason']//option[@selected]",
            attrs=[
                Attribute(
                    key='_current_season',
                    path='./@value'
                )
            ]
        ),

        Extractor(
            label='episodes',
            path=".",
            group="//div[@class='info']",
            group_key=".//meta/@content",
            group_key_normalize=lambda x: 'episode %s' % x,
            attrs=[
                Attribute(
                    key=None,
                    multi=True,
                    path={
                        "link": ".//strong//a[@href][1]/@href",
                        "original air date": ".//div[@class='airdate']/text()",
                        "title": ".//strong//text()",
                        "plot": ".//div[@class='item_description']//text()"
                    }
                )
            ]
        )
    ]

    def postprocess_data(self, data):
        """Build the {'episodes': {season: {nr: Movie}}} structure."""
        series_id = analyze_imdbid(data.get('series link'))
        series_title = data.get('series title', '').strip()
        selected_season = data.get('_current_season', 'unknown season').strip()
        if not (series_id and series_title):
            return {}
        series = Movie(title=series_title, movieID=str(series_id),
                       accessSystem=self._as, modFunct=self._modFunct)
        if series.get('kind') == 'movie':
            series['kind'] = u'tv series'
        try:
            selected_season = int(selected_season)
        except ValueError:
            # Keep the raw string (e.g. 'unknown season').
            pass
        nd = {selected_season: {}}
        if 'episode -1' in data:
            # Unnumbered episodes: assign them the first free episode slots.
            counter = 1
            for episode in data['episode -1']:
                while 'episode %d' % counter in data:
                    counter += 1
                k = 'episode %d' % counter
                data[k] = [episode]
            del data['episode -1']
        for episode_nr, episode in data.items():
            if not (episode and episode[0] and
                    episode_nr.startswith('episode ')):
                continue
            episode = episode[0]
            episode_nr = episode_nr[8:].rstrip()
            try:
                episode_nr = int(episode_nr)
            except ValueError:
                pass
            # The default here was originally 'link' '' (a missing comma),
            # i.e. get('link') with no default; this is the intended form.
            episode_id = analyze_imdbid(episode.get('link', ''))
            episode_air_date = episode.get('original air date', '').strip()
            episode_title = episode.get('title', '').strip()
            episode_plot = episode.get('plot', '')
            if not (episode_nr is not None and episode_id and episode_title):
                continue
            ep_obj = Movie(movieID=episode_id, title=episode_title,
                           accessSystem=self._as, modFunct=self._modFunct)
            ep_obj['kind'] = u'episode'
            ep_obj['episode of'] = series
            ep_obj['season'] = selected_season
            ep_obj['episode'] = episode_nr
            if episode_air_date:
                ep_obj['original air date'] = episode_air_date
                if episode_air_date[-4:].isdigit():
                    ep_obj['year'] = episode_air_date[-4:]
            if episode_plot:
                ep_obj['plot'] = episode_plot
            nd[selected_season][episode_nr] = ep_obj
        _seasons = data.get('_seasons') or []
        for idx, season in enumerate(_seasons):
            try:
                _seasons[idx] = int(season)
            except ValueError:
                pass
        return {'episodes': nd, '_seasons': _seasons, '_current_season': selected_season}
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_episode(x):
    """Create a Movie object (kind 'episode') for a given series' episode.

    `x` is a dict of raw strings scraped from the episode list page:
    'link', 'title', 'year', 'episode', 'oad' (original air date), 'plot'.
    """
    ep = Movie(movieID=analyze_imdbid(x.get('link')), title=x.get('title'))
    ep['kind'] = u'episode'
    air_date = x.get('oad')
    if air_date:
        ep['original air date'] = air_date.strip()
    raw_year = x.get('year')
    if raw_year is not None:
        # Drop the 5-char prefix of the anchor name (presumably 'year-'
        # — see the preceding-sibling @name XPath in the extractor).
        raw_year = raw_year[5:]
        if raw_year == 'unknown':
            raw_year = u'????'
        if raw_year and raw_year.isdigit():
            raw_year = int(raw_year)
        ep['year'] = raw_year
    elif air_date and air_date[-4:].isdigit():
        # No explicit year: fall back to the air date's trailing 4 digits.
        ep['year'] = int(air_date[-4:])
    season_info = x.get('episode')
    if season_info is None:
        ep['season'] = 'unknown'
        ep['episode'] = 'unknown'
    else:
        # season_info looks like "Season N, Episode M: ..." — keep the
        # part before ':' and split the two counters on ','.
        season_part, episode_part = season_info.split(':')[0].split(',')
        ep['season'] = int(season_part[7:])
        ep['episode'] = int(episode_part[8:])
    summary = x.get('plot')
    if summary:
        ep['plot'] = summary.strip()
    return ep
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLEpisodesParser(DOMParserBase):
    """Parser for the "episode list" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        eparser = DOMHTMLEpisodesParser()
        result = eparser.parse(episodes_html_string)
    """
    # XXX: no more used for the list of episodes parser,
    # but only for the episodes cast parser (see below).
    _containsObjects = True

    kind = 'episodes list'
    # XPath hooks overridden by DOMHTMLEpisodesCastParser below.
    _episodes_path = "..//h4"
    _oad_path = "./following-sibling::span/strong[1]/text()"

    def _init(self):
        # Extractors are built per-instance because they depend on
        # self._episodes_path / self._oad_path and on self.kind.
        self.extractors = [
            Extractor(
                label='series',
                path="//html",
                attrs=[
                    Attribute(
                        key='series title',
                        path=".//title/text()"
                    ),
                    Attribute(
                        key='series movieID',
                        path=".//h1/a[@class='main']/@href",
                        postprocess=analyze_imdbid
                    )
                ]
            ),
            Extractor(
                label='episodes',
                group="//div[@class='_imdbpy']/h3",
                group_key="./a/@name",
                path=self._episodes_path,
                attrs=Attribute(
                    key=None,
                    multi=True,
                    path={
                        'link': "./a/@href",
                        'title': "./a/text()",
                        'year': "./preceding-sibling::a[1]/@name",
                        'episode': "./text()[1]",
                        'oad': self._oad_path,
                        'plot': "./following-sibling::text()[1]"
                    },
                    postprocess=_build_episode
                )
            )
        ]

        if self.kind == 'episodes cast':
            # Extra extractor: the cast table following each episode header.
            self.extractors += [
                Extractor(
                    label='cast',
                    group="//h4",
                    group_key="./text()[1]",
                    group_key_normalize=lambda x: x.strip(),
                    path="./following-sibling::table[1]//td[@class='nm']",
                    attrs=Attribute(
                        key=None,
                        multi=True,
                        path={
                            'person': "..//text()",
                            'link': "./a/@href",
                            'roleID': "../td[4]/div[@class='_imdbpyrole']/@roleid"
                        },
                        postprocess=lambda x: build_person(
                            x.get('person') or u'',
                            personID=analyze_imdbid(x.get('link')),
                            roleID=(x.get('roleID') or u'').split('/'),
                            accessSystem=self._as,
                            modFunct=self._modFunct
                        )
                    )
                )
            ]

    preprocessors = [
        (re.compile('(<hr/>\n)(<h3>)', re.I), r'</div>\1<div class="_imdbpy">\2'),
        (re.compile('(</p>\n\n)</div>', re.I), r'\1'),
        (re.compile('<h3>(.*?)</h3>', re.I), r'<h4>\1</h4>'),
        (_reRolesMovie, _manageRoles),
        (re.compile('(<br/> <br/>\n)(<hr/>)', re.I), r'\1</div>\2')
    ]

    def postprocess_data(self, data):
        """Assemble {'episodes': {season: {episode_nr: Movie}}}."""
        # A bit extreme?
        if 'series title' not in data:
            return {}
        if 'series movieID' not in data:
            return {}
        # Strip the page-title suffixes to recover the bare series title.
        stitle = data['series title'].replace('- Episode list', '')
        stitle = stitle.replace('- Episodes list', '')
        stitle = stitle.replace('- Episode cast', '')
        stitle = stitle.replace('- Episodes cast', '')
        stitle = stitle.strip()
        if not stitle:
            return {}
        seriesID = data['series movieID']
        if seriesID is None:
            return {}
        series = Movie(title=stitle, movieID=str(seriesID),
                       accessSystem=self._as, modFunct=self._modFunct)
        nd = {}
        for key in data.keys():
            if key.startswith('filter-season-') or key.startswith('season-'):
                season_key = key.replace('filter-season-', '').replace('season-', '')
                try:
                    season_key = int(season_key)
                except:
                    pass
                nd[season_key] = {}
                ep_counter = 1
                for episode in data[key]:
                    if not episode:
                        continue
                    episode_key = episode.get('episode')
                    if episode_key is None:
                        continue
                    if not isinstance(episode_key, int):
                        # Non-numeric episode markers get sequential numbers.
                        episode_key = ep_counter
                        ep_counter += 1
                    cast_key = 'Season %s, Episode %s:' % (season_key, episode_key)
                    if cast_key in data:
                        cast = data[cast_key]
                        for i in xrange(len(cast)):
                            cast[i].billingPos = i + 1
                        episode['cast'] = cast
                    episode['episode of'] = series
                    nd[season_key][episode_key] = episode
        if len(nd) == 0:
            return {}
        return {'episodes': nd}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLEpisodesCastParser(DOMHTMLEpisodesParser):
    """Parser for the "episodes cast" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        eparser = DOMHTMLEpisodesCastParser()
        result = eparser.parse(episodes_html_string)
    """
    # Switching `kind` makes the parent's _init add the cast extractor.
    kind = 'episodes cast'
    _episodes_path = "..//h4"
    _oad_path = "./following-sibling::b[1]/text()"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLFaqsParser(DOMParserBase):
    """Parser for the "FAQ" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        fparser = DOMHTMLFaqsParser()
        result = fparser.parse(faqs_html_string)
    """
    # Resolve titles/names in the answers into references.
    _defGetRefs = True

    # XXX: bsoup and lxml don't match (looks like a minor issue, anyway).

    extractors = [
        Extractor(
            label='faqs',
            path="//div[@class='section']",
            attrs=Attribute(
                key='faqs',
                multi=True,
                path={
                    'question': "./h3/a/span/text()",
                    'answer': "../following-sibling::div[1]//text()"
                },
                # "question::answer" with '||' markers (see preprocessors)
                # turned back into paragraph breaks.
                postprocess=lambda x: u'%s::%s' % (
                    x.get('question').strip(),
                    '\n\n'.join(x.get('answer').replace('\n\n', '\n').strip().split('||'))
                )
            )
        )
    ]

    preprocessors = [
        (re.compile('<br/><br/>', re.I), r'||'),
        (re.compile('<h4>(.*?)</h4>\n', re.I), r'||\1--'),
        (re.compile('<span class="spoiler"><span>(.*?)</span></span>', re.I),
         r'[spoiler]\1[/spoiler]')
    ]
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLAiringParser(DOMParserBase):
    """Parser for the "airing" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        aparser = DOMHTMLAiringParser()
        result = aparser.parse(airing_html_string)
    """
    # The result contains Movie objects.
    _containsObjects = True

    extractors = [
        Extractor(
            label='series title',
            path="//title",
            attrs=Attribute(
                key='series title',
                path="./text()",
                postprocess=lambda x: x.replace(' - TV schedule', u'')
            )
        ),

        Extractor(
            label='series id',
            path="//h1/a[@href]",
            attrs=Attribute(
                key='series id',
                path="./@href"
            )
        ),

        Extractor(
            label='tv airings',
            path="//tr[@class]",
            attrs=Attribute(
                key='airing',
                multi=True,
                path={
                    'date': "./td[1]//text()",
                    'time': "./td[2]//text()",
                    'channel': "./td[3]//text()",
                    'link': "./td[4]/a[1]/@href",
                    'title': "./td[4]//text()",
                    'season': "./td[5]//text()",
                },
                postprocess=lambda x: {
                    'date': x.get('date'),
                    'time': x.get('time'),
                    'channel': x.get('channel').strip(),
                    'link': x.get('link'),
                    'title': x.get('title'),
                    'season': (x.get('season') or '').strip()
                }
            )
        )
    ]

    def postprocess_data(self, data):
        """Turn each airing's link/title pair into an episode Movie."""
        if len(data) == 0:
            return {}
        seriesTitle = data['series title']
        seriesID = analyze_imdbid(data['series id'])
        if data.has_key('airing'):
            for airing in data['airing']:
                title = airing.get('title', '').strip()
                if not title:
                    # No per-episode title: the airing refers to the
                    # series itself.
                    epsTitle = seriesTitle
                    if seriesID is None:
                        continue
                    epsID = seriesID
                else:
                    epsTitle = '%s {%s}' % (data['series title'],
                                            airing['title'])
                    epsID = analyze_imdbid(airing['link'])
                e = Movie(title=epsTitle, movieID=epsID)
                airing['episode'] = e
                del airing['link']
                del airing['title']
                if not airing['season']:
                    del airing['season']
        if 'series title' in data:
            del data['series title']
        if 'series id' in data:
            del data['series id']
        if 'airing' in data:
            data['airing'] = filter(None, data['airing'])
        if 'airing' not in data or not data['airing']:
            return {}
        return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLSynopsisParser(DOMParserBase):
    """Parser for the "synopsis" page of a given movie.

    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLSynopsisParser()
        result = sparser.parse(synopsis_html_string)
    """
    # Paragraph breaks (<br/><br/>) are first collapsed into a '||'
    # marker by the preprocessors, then expanded back to blank lines
    # by the attribute's postprocess callable.
    extractors = [
        Extractor(label='synopsis',
                  path="//ul[@id='plot-synopsis-content'][not(@style)]",
                  attrs=Attribute(key='synopsis',
                                  path=".//text()",
                                  postprocess=lambda text:
                                      text.strip().replace('||', '\n\n')))
    ]

    preprocessors = [(re.compile('<br/><br/>', re.I), r'||')]
|
2014-03-10 05:18:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DOMHTMLParentsGuideParser(DOMParserBase):
    """Parser for the "parents guide" page of a given movie.

    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        pgparser = DOMHTMLParentsGuideParser()
        result = pgparser.parse(parentsguide_html_string)
    """
    # Each guide section is grouped by its heading; paragraph breaks are
    # collapsed to '||' by the preprocessors and split back apart below.
    extractors = [
        Extractor(label='parents guide',
                  group="//div[@class='section']",
                  group_key="./h3/a/span/text()",
                  group_key_normalize=lambda section: section.lower(),
                  path="../following-sibling::div[1]/p",
                  attrs=Attribute(key=None,
                                  path=".//text()",
                                  postprocess=lambda text: [
                                      chunk.strip().replace('\n', ' ')
                                      for chunk in text.split('||')
                                      if chunk.strip()
                                  ]))
    ]

    preprocessors = [(re.compile('<br/><br/>', re.I), r'||')]

    def postprocess_data(self, data):
        """Drop empty sections; wrap the rest under 'parents guide'."""
        guide = {key: value for key, value in data.items() if value}
        if not guide:
            return {}
        return {'parents guide': guide}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Registry consumed by the http data access system: each key names a
# parser; each value is a tuple of (parser class(es) to instantiate,
# optional dict of attributes to set on the created instance).
_OBJECTS = {
    'movie_parser': ((DOMHTMLMovieParser,), None),
    'plot_parser': ((DOMHTMLPlotParser,), None),
    'movie_awards_parser': ((DOMHTMLAwardsParser,), None),
    'taglines_parser': ((DOMHTMLTaglinesParser,), None),
    'keywords_parser': ((DOMHTMLKeywordsParser,), None),
    'crazycredits_parser': ((DOMHTMLCrazyCreditsParser,), None),
    'goofs_parser': ((DOMHTMLGoofsParser,), None),
    'alternateversions_parser': ((DOMHTMLAlternateVersionsParser,), None),
    'trivia_parser': ((DOMHTMLTriviaParser,), None),
    'soundtrack_parser': ((DOMHTMLSoundtrackParser,), None),
    'quotes_parser': ((DOMHTMLQuotesParser,), None),
    'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None),
    'ratings_parser': ((DOMHTMLRatingsParser,), None),
    'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
    'criticrev_parser': ((DOMHTMLCriticReviewsParser,), {'kind': 'critic reviews'}),
    'reviews_parser': ((DOMHTMLReviewsParser,), {'kind': 'reviews'}),
    'externalsites_parser': ((DOMHTMLOfficialsitesParser,), None),
    'externalrev_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'external reviews'}),
    'misclinks_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'misc links'}),
    'soundclips_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'sound clips'}),
    'videoclips_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'video clips'}),
    'photosites_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'photo sites'}),
    'connections_parser': ((DOMHTMLConnectionParser,), None),
    'tech_parser': ((DOMHTMLTechParser,), None),
    'business_parser': ((DOMHTMLBusinessParser,), {'kind': 'business', '_defGetRefs': 1}),
    'literature_parser': ((DOMHTMLBusinessParser,), None),
    'locations_parser': ((DOMHTMLLocationsParser,), None),
    'rec_parser': ((DOMHTMLRecParser,), None),
    'news_parser': ((DOMHTMLNewsParser,), None),
    'episodes_parser': ((DOMHTMLEpisodesParser,), None),
    'season_episodes_parser': ((DOMHTMLSeasonEpisodesParser,), None),
    'episodes_cast_parser': ((DOMHTMLEpisodesCastParser,), None),
    'eprating_parser': ((DOMHTMLEpisodesRatings,), None),
    'movie_faqs_parser': ((DOMHTMLFaqsParser,), None),
    'airing_parser': ((DOMHTMLAiringParser,), None),
    'synopsis_parser': ((DOMHTMLSynopsisParser,), None),
    'parentsguide_parser': ((DOMHTMLParentsGuideParser,), None)
}
|