# -*- coding: utf-8 -*- """ parser.http.movieParser module (imdb package). This module provides the classes (and the instances), used to parse the IMDb pages on the www.imdb.com server about a movie. E.g., for Brian De Palma's "The Untouchables", the referred pages would be: combined details: http://www.imdb.com/title/tt0094226/reference plot summary: http://www.imdb.com/title/tt0094226/plotsummary ...and so on... Copyright 2004-2018 Davide Alberani 2008-2018 H. Turgut Uyar This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import functools import re import urllib from imdb import imdbURL_base from imdb.Company import Company from imdb.Movie import Movie from imdb.Person import Person from imdb.utils import _Container, KIND_MAP from .utils import Attribute, DOMParserBase, Extractor, analyze_imdbid, build_person # Dictionary used to convert some section's names. _SECT_CONV = { 'directed': 'director', 'directed by': 'director', 'directors': 'director', 'editors': 'editor', 'writing credits': 'writer', 'writers': 'writer', 'produced': 'producer', 'cinematography': 'cinematographer', 'film editing': 'editor', 'casting': 'casting director', 'costume design': 'costume designer', 'makeup department': 'make up', 'production management': 'production manager', 'second unit director or assistant director': 'assistant director', 'costume and wardrobe department': 'costume department', 'sound department': 'sound crew', 'stunts': 'stunt performer', 'other crew': 'miscellaneous crew', 'also known as': 'akas', 'country': 'countries', 'runtime': 'runtimes', 'language': 'languages', 'certification': 'certificates', 'genre': 'genres', 'created': 'creator', 'creators': 'creator', 'color': 'color info', 'plot': 'plot outline', 'art directors': 'art direction', 'assistant directors': 'assistant director', 'set decorators': 'set decoration', 'visual effects department': 'visual effects', 'miscellaneous': 'miscellaneous crew', 'make up department': 'make up', 'plot summary': 'plot outline', 'cinematographers': 'cinematographer', 'camera department': 'camera and electrical department', 'costume designers': 'costume designer', 'production designers': 'production design', 'production managers': 'production manager', 'music original': 'original music', 'casting directors': 'casting director', 'other companies': 'miscellaneous companies', 'producers': 'producer', 'special effects by': 'special effects department', 'special effects': 'special effects companies' } def _manageRoles(mo): """Perform some transformation on the html, so that roleIDs can be easily retrieved.""" firstHalf = mo.group(1) secondHalf = mo.group(2) newRoles = [] roles = secondHalf.split(' / ') for role in roles: role = role.strip() if not role: continue roleID = analyze_imdbid(role) if roleID is None: roleID = u'/' else: roleID += u'/' newRoles.append(u'
<div class="_imdbpyrole" roleid="%s">%s</div>
' % (roleID, role.strip()))
    return firstHalf + u' / '.join(newRoles) + mo.group(3)


_reRolesMovie = re.compile(r'(<td class="char">)(.*?)(</td>)', re.I | re.M | re.S)


def _replaceBR(mo):
    """Replaces <br>
tags with '::' (useful for some akas)"""
    txt = mo.group(0)
    return txt.replace('<br>
', '::')


_reAkas = re.compile(r'<h5>
also known as:</h5>
.*?', re.I | re.M | re.S) def makeSplitter(lstrip=None, sep='|', comments=True, origNotesSep=' (', newNotesSep='::(', strip=None): """Return a splitter function suitable for a given set of data.""" def splitter(x): if not x: return x x = x.strip() if not x: return x if lstrip is not None: x = x.lstrip(lstrip).lstrip() lx = x.split(sep) lx[:] = filter(None, [j.strip() for j in lx]) if comments: lx[:] = [j.replace(origNotesSep, newNotesSep, 1) for j in lx] if strip: lx[:] = [j.strip(strip) for j in lx] return lx return splitter def _toInt(val, replace=()): """Return the value, converted to integer, or None; if present, 'replace' must be a list of tuples of values to replace.""" for before, after in replace: val = val.replace(before, after) try: return int(val) except (TypeError, ValueError): return None _re_og_title = re.compile( ur'(.*) \((?:(?:(.+)(?= ))? ?(\d{4})(?:(–)(\d{4}| ))?|(.+))\)', re.UNICODE ) def analyze_og_title(og_title): data = {} match = _re_og_title.match(og_title) if match: data['title'] = match.group(1) if match.group(3): data['year'] = int(match.group(3)) kind = match.group(2) or match.group(6) if kind is None: kind = 'movie' else: kind = kind.lower() kind = KIND_MAP.get(kind, kind) data['kind'] = kind year_separator = match.group(4) # There is a year separator so assume an ongoing or ended series if year_separator is not None: end_year = match.group(5) if end_year is not None: data['series years'] = '%(year)d-%(end_year)s' % { 'year': data['year'], 'end_year': end_year.strip(), } elif kind.endswith('series'): data['series years'] = '%(year)d-' % {'year': data['year']} # No year separator and series, so assume that it ended the same year elif kind.endswith('series') and 'year' in data: data['series years'] = '%(year)d-%(year)d' % {'year': data['year']} if data['kind'] == 'episode' and data['title'][0] == '"': quote_end = data['title'].find('"', 1) data['tv series title'] = data['title'][1:quote_end] data['title'] = data['title'][quote_end + 1:].strip() return data def analyze_certificates(certificates): def reducer(acc, el): cert_re = re.compile(r'^(.+):(.+)$', re.UNICODE) if cert_re.match(el): acc.append(el) elif acc: acc[-1] = u'{}::{}'.format( acc[-1], el, ) return acc certificates = [el.strip() for el in certificates.split('\n') if el.strip()] return functools.reduce(reducer, certificates, []) class DOMHTMLMovieParser(DOMParserBase): """Parser for the "combined details" (and if instance.mdparse is True also for the "main details") page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: mparser = DOMHTMLMovieParser() result = mparser.parse(combined_details_html_string) """ _containsObjects = True extractors = [ Extractor( label='title', path="//meta[@property='og:title']", attrs=Attribute( key='title', path="@content", postprocess=analyze_og_title ) ), # parser for misc sections like 'casting department', 'stunts', ... 
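        # The following extractor groups rows by the "name" attribute of the
        # section's h4 header (e.g. 'costume_design'); group_key_normalize
        # turns that into 'costume design', which postprocess_data() later
        # maps through _SECT_CONV to the package-wide key 'costume designer'.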
Extractor( label='glossarysections', group="//h4[contains(@class, 'ipl-header__content')]", group_key="./@name", group_key_normalize=lambda x: x.replace('_', ' '), path="../../following-sibling::table[1]//tr", attrs=Attribute( key=None, multi=True, path={ 'person': ".//text()", 'link': "./td[1]/a[@href]/@href" }, postprocess=lambda x: build_person( x.get('person') or u'', personID=analyze_imdbid(x.get('link')) ) ) ), Extractor( label='cast', path="//table[@class='cast_list']//tr", attrs=Attribute( key="cast", multi=True, path={ 'person': ".//text()", 'link': "td[2]/a/@href", 'roleID': "td[4]/div[@class='_imdbpyrole']/@roleid" }, postprocess=lambda x: build_person( x.get('person') or u'', personID=analyze_imdbid(x.get('link')), roleID=(x.get('roleID') or u'').split('/')) ) ), Extractor( label='myrating', path="//span[@id='voteuser']", attrs=Attribute( key='myrating', path=".//text()" ) ), Extractor( label='plot summary', path=".//td[starts-with(text(), 'Plot')]/..//p", attrs=Attribute( key='plot summary', path='./text()', postprocess=lambda x: x.strip().rstrip('|').rstrip() ) ), Extractor( label='genres', path="//td[starts-with(text(), 'Genre')]/..//li/a", attrs=Attribute( key="genres", multi=True, path="./text()" ) ), Extractor( label='runtimes', path="//td[starts-with(text(), 'Runtime')]/..//li", attrs=Attribute( key='runtimes', path="./text()", multi=True, postprocess=lambda x: x.strip().replace(' min', '') ) ), Extractor( label='countries', path="//td[starts-with(text(), 'Countr')]/..//li/a", attrs=Attribute( key='countries', path="./text()", multi=True ) ), Extractor( label='country codes', path="//td[starts-with(text(), 'Countr')]/..//li/a", attrs=Attribute( key='country codes', path="./@href", multi=True, postprocess=lambda x: x.split('/')[2].strip().lower() ) ), Extractor( label='language', path="//td[starts-with(text(), 'Language')]/..//li/a", attrs=Attribute( key='language', path="./text()", multi=True ) ), Extractor( label='language codes', path="//td[starts-with(text(), 'Language')]/..//li/a", attrs=Attribute( key='language codes', path="./@href", multi=True, postprocess=lambda x: x.split('/')[2].strip() ) ), Extractor( label='color info', path="//td[starts-with(text(), 'Color')]/..//li/a", attrs=Attribute( key='color info', path="./text()", multi=True, postprocess=lambda x: x.replace(' (', '::(') ) ), Extractor( label='aspect ratio', path="//td[starts-with(text(), 'Aspect')]/..", attrs=Attribute( key='aspect ratio', path=".//li/text()", postprocess=lambda x: x.strip() ) ), Extractor( label='sound mix', path="//td[starts-with(text(), 'Sound Mix')]/..//li/a", attrs=Attribute( key='sound mix', path="./text()", multi=True, postprocess=lambda x: x.replace(' (', '::(') ) ), Extractor( label='certificates', path=".//td[starts-with(text(), 'Certificat')]/..", attrs=Attribute( key='certificates', path=".//text()", postprocess=analyze_certificates ) ), Extractor( label='h5sections', path="//section[contains(@class, 'listo')]", attrs=[ # Collects akas not encosed in tags. 
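        # For example, the makeSplitter() call below turns the raw string
        # u'"Gli intoccabili" - Italy' into [u'Gli intoccabili::Italy']
        # (the '" - ' notes separator becomes '::' and the quotes are stripped).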
Attribute( key='other akas', path=".//td[starts-with(text(), 'Also Known As')]/..//ul//text()", postprocess=makeSplitter( sep='::', origNotesSep='" - ', newNotesSep='::', strip='"' ) ) ] ), Extractor( label='creator', path="//td[starts-with(text(), 'Creator')]/..//a", attrs=Attribute( key='creator', multi=True, path={ 'name': "./text()", 'link': "./@href" }, postprocess=lambda x: build_person( x.get('name') or u'', personID=analyze_imdbid(x.get('link')) ) ) ), Extractor( label='thin writer', path="//div[starts-with(normalize-space(text()), 'Writer')]/ul/li[1]/a", attrs=Attribute( key='thin writer', multi=True, path={ 'name': "./text()", 'link': "./@href" }, postprocess=lambda x: build_person( x.get('name') or u'', personID=analyze_imdbid(x.get('link')) ) ) ), Extractor( label='thin director', path="//div[starts-with(normalize-space(text()), 'Director')]/ul/li[1]/a", attrs=Attribute( key='thin director', multi=True, path={ 'name': "./text()", 'link': "./@href" }, postprocess=lambda x: build_person( x.get('name') or u'', personID=analyze_imdbid(x.get('link')) ) ) ), Extractor( label='top 250/bottom 100', path="//li[@class='ipl-inline-list__item']//a[starts-with(@href, '/chart/')]", attrs=Attribute( key='top/bottom rank', path="./text()" ) ), Extractor( label='original air date', path="//span[@imdbpy='airdate']", attrs=Attribute( key='original air date', path="./text()" ) ), Extractor( label='series years', path="//div[@id='tn15title']//span[starts-with(text(), 'TV series')]", attrs=Attribute( key='series years', path="./text()", postprocess=lambda x: x.replace('TV series', '').strip() ) ), Extractor( label='season/episode', path="//div[@class='titlereference-overview-season-episode-section']/ul", attrs=Attribute( key='season/episode', path=".//text()", postprocess=lambda x: x.strip() ) ), Extractor( label='number of episodes', path="//a[starts-with(text(), 'All Episodes')]", attrs=Attribute( key='number of episodes', path="./text()", postprocess=lambda x: int(x.replace('All Episodes', '').strip()[1:-1]) ) ), Extractor( label='episode number', path=".//div[@id='tn15epnav']", attrs=Attribute( key='episode number', path="./text()", postprocess=lambda x: int(re.sub(r'[^a-z0-9 ]', '', x.lower()) .strip() .split()[0]) ) ), Extractor( label='previous episode', path=".//span[@class='titlereference-overview-episodes-links']//a[contains(text(), 'Previous')]", attrs=Attribute( key='previous episode', path="./@href", postprocess=lambda x: analyze_imdbid(x) ) ), Extractor( label='next episode', path=".//span[@class='titlereference-overview-episodes-links']//a[contains(text(), 'Next')]", attrs=Attribute( key='next episode', path="./@href", postprocess=lambda x: analyze_imdbid(x) ) ), Extractor( label='number of seasons', path=".//span[@class='titlereference-overview-years-links']/../a[1]", attrs=Attribute( key='number of seasons', path="./text()", postprocess=lambda x: int(x) ) ), Extractor( label='tv series link', path=".//a[starts-with(text(), 'All Episodes')]", attrs=Attribute( key='tv series link', path="./@href" ) ), Extractor( label='akas', path="//i[@class='transl']", attrs=Attribute( key='akas', multi=True, path='text()', postprocess=lambda x: x .replace(' ', ' ') .rstrip('-') .replace('" - ', '"::', 1) .strip('"') .replace(' ', ' ') ) ), Extractor( label='production notes/status', path="//td[starts-with(text(), 'Status:')]/..//div[@class='info-content']", attrs=Attribute( key='production status', path=".//text()", postprocess=lambda x: x.strip().split('|')[0].strip().lower() ) ), Extractor( 
label='production notes/status updated', path="//td[starts-with(text(), 'Status Updated:')]/..//div[@class='info-content']", attrs=Attribute( key='production status updated', path=".//text()", postprocess=lambda x: x.strip() ) ), Extractor( label='production notes/comments', path="//td[starts-with(text(), 'Comments:')]/..//div[@class='info-content']", attrs=Attribute( key='production comments', path=".//text()", postprocess=lambda x: x.strip() ) ), Extractor( label='production notes/note', path="//td[starts-with(text(), 'Note:')]/..//div[@class='info-content']", attrs=Attribute( key='production note', path=".//text()", postprocess=lambda x: x.strip() ) ), Extractor( label='blackcatheader', group="//b[@class='blackcatheader']", group_key="./text()", group_key_normalize=lambda x: x.lower(), path="../ul/li", attrs=Attribute( key=None, multi=True, path={ 'name': "./a//text()", 'comp-link': "./a/@href", 'notes': "./text()" }, postprocess=lambda x: Company(name=x.get('name') or u'', companyID=analyze_imdbid(x.get('comp-link')), notes=(x.get('notes') or u'').strip()) ) ), Extractor( label='rating', path="(//span[@class='ipl-rating-star__rating'])[1]", attrs=Attribute( key='rating', path="./text()" ) ), Extractor( label='votes', path="//span[@class='ipl-rating-star__total-votes'][1]", attrs=Attribute( key='votes', path="./text()" ) ), Extractor( label='cover url', path="//img[@alt='Poster']", attrs=Attribute( key='cover url', path="@src" ) ) ] preprocessors = [ ('/releaseinfo">', '">'), (re.compile(r'(.+?)', re.I), r'
<br/>\1'), ('<small>Full cast and crew for<br>
', ''), (' ', '...'), (re.compile(r'TV mini-series(\s+.*?)', re.I), r'TV series\1 (mini)'), (_reRolesMovie, _manageRoles), (_reAkas, _replaceBR) ] def preprocess_dom(self, dom): # Handle series information. xpath = self.xpath(dom, "//b[text()='Series Crew']") if xpath: b = xpath[-1] # In doubt, take the last one. for a in self.xpath(b, "./following::h5/a[@class='glossary']"): name = a.get('name') if name: a.set('name', 'series %s' % name) # Remove links to IMDbPro. for proLink in self.xpath(dom, "//span[@class='pro-link']"): proLink.drop_tree() # Remove some 'more' links (keep others, like the one around # the number of votes). for tn15more in self.xpath(dom, "//a[@class='tn15more'][starts-with(@href, '/title/')]"): tn15more.drop_tree() return dom re_space = re.compile(r'\s+') re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I) def postprocess_data(self, data): # Convert section names. for sect in data.keys(): if sect in _SECT_CONV: data[_SECT_CONV[sect]] = data[sect] del data[sect] sect = _SECT_CONV[sect] # Filter out fake values. for key in data: value = data[key] if isinstance(value, list) and value: if isinstance(value[0], Person): data[key] = filter(lambda x: x.personID is not None, value) if isinstance(value[0], _Container): for obj in data[key]: obj.accessSystem = self._as obj.modFunct = self._modFunct if 'akas' in data or 'other akas' in data: akas = data.get('akas') or [] other_akas = data.get('other akas') or [] akas += other_akas nakas = [] for aka in akas: aka = aka.strip() if aka.endswith('" -'): aka = aka[:-3].rstrip() nakas.append(aka) if 'akas' in data: del data['akas'] if 'other akas' in data: del data['other akas'] if nakas: data['akas'] = nakas if 'color info' in data: data['color info'] = [x.replace('Color:', '', 1) for x in data['color info']] if 'runtimes' in data: data['runtimes'] = [x.replace(' min', u'') for x in data['runtimes']] if 'number of seasons' in data: data['seasons'] = [unicode(i) for i in range(1, data['number of seasons'] + 1)] # data['number of seasons'] = seasons[-1] if seasons else len(data['seasons']) if 'season/episode' in data: tokens = data['season/episode'].split('Episode') data['season'] = int(tokens[0].split('Season')[1]) data['episode'] = int(tokens[1]) del data['season/episode'] # if 'original air date' in data: # oid = self.re_space.sub(' ', data['original air date']).strip() # data['original air date'] = oid # aid = self.re_airdate.findall(oid) # if aid and len(aid[0]) == 3: # date, season, episode = aid[0] # date = date.strip() # try: # season = int(season) # except ValueError: # pass # try: # episode = int(episode) # except ValueError: # pass # if date and date != '????': # data['original air date'] = date # else: # del data['original air date'] # # Handle also "episode 0". 
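        # For reference, self.re_airdate splits a string like
        # u'25 April 2004 (season 1, episode 3)' into the groups
        # ('25 April 2004 ', '1', '3').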
# if season or isinstance(season, int): # data['season'] = season # if episode or isinstance(season, int): # data['episode'] = episode for k in ('writer', 'director'): t_k = 'thin %s' % k if t_k not in data: continue if k not in data: data[k] = data[t_k] del data[t_k] if 'top/bottom rank' in data: tbVal = data['top/bottom rank'].lower() if tbVal.startswith('top'): tbKey = 'top 250 rank' tbVal = _toInt(tbVal, [('top rated movies: #', '')]) else: tbKey = 'bottom 100 rank' tbVal = _toInt(tbVal, [('bottom rated movies: #', '')]) if tbVal: data[tbKey] = tbVal del data['top/bottom rank'] if 'year' in data and data['year'] == '????': del data['year'] if 'tv series link' in data: if 'tv series title' in data: data['episode of'] = Movie(title=data['tv series title'], movieID=analyze_imdbid(data['tv series link']), accessSystem=self._as, modFunct=self._modFunct) data['episode of']['kind'] = 'tv series' del data['tv series title'] del data['tv series link'] if 'rating' in data: try: data['rating'] = float(data['rating'].replace('/10', '')) except (TypeError, ValueError): pass if data['rating'] == 0: del data['rating'] if 'votes' in data: try: votes = data['votes'].replace('(', '').replace(')', '').replace(',', '').replace('votes', '') data['votes'] = int(votes) except (TypeError, ValueError): pass return data def _process_plotsummary(x): """Process a plot (contributed by Rdian06).""" xauthor = x.get('author') xplot = x.get('plot', u'').strip() if xauthor: xplot += u'::%s' % xauthor return xplot class DOMHTMLPlotParser(DOMParserBase): """Parser for the "plot summary" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a 'plot' key, containing a list of string with the structure: 'summary::summary_author '. Example: pparser = HTMLPlotParser() result = pparser.parse(plot_summary_html_string) """ _defGetRefs = True # Notice that recently IMDb started to put the email of the # author only in the link, that we're not collecting, here. extractors = [ Extractor( label='plot', path="//ul[@class='zebraList']/li", attrs=Attribute( key='plot', multi=True, path={ 'plot': './/p[@class="plotSummary"]//text()', 'author': './/span/em/a/text()' }, postprocess=_process_plotsummary) ) ] def _process_award(x): award = {} _award = x.get('award') if _award is not None: _award = _award.strip() award['award'] = _award if not award['award']: return {} award['year'] = x.get('year').strip() if award['year'] and award['year'].isdigit(): award['year'] = int(award['year']) award['result'] = x.get('result').strip() category = x.get('category').strip() if category: award['category'] = category received_with = x.get('with') if received_with is not None: award['with'] = received_with.strip() notes = x.get('notes') if notes is not None: notes = notes.strip() if notes: award['notes'] = notes award['anchor'] = x.get('anchor') return award class DOMHTMLAwardsParser(DOMParserBase): """Parser for the "awards" page of a given person or movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: awparser = HTMLAwardsParser() result = awparser.parse(awards_html_string) """ subject = 'title' _containsObjects = True extractors = [ Extractor( label='awards', group="//table//big", group_key="./a", path="./ancestor::tr[1]/following-sibling::tr/td[last()][not(@colspan)]", attrs=Attribute( key=None, multi=True, path={ 'year': "../td[1]/a/text()", 'result': "../td[2]/b/text()", 'award': "../td[3]/text()", 'category': "./text()[1]", # FIXME: takes only the first co-recipient 'with': "./small[starts-with(text(), 'Shared with:')]/" "following-sibling::a[1]/text()", 'notes': "./small[last()]//text()", 'anchor': ".//text()" }, postprocess=_process_award ) ), Extractor( label='recipients', group="//table//big", group_key="./a", path="./ancestor::tr[1]/following-sibling::tr" "/td[last()]/small[1]/preceding-sibling::a", attrs=Attribute( key=None, multi=True, path={ 'name': "./text()", 'link': "./@href", 'anchor': "..//text()" } ) ) ] preprocessors = [ (re.compile('(]*>.*?\n\n)', re.I), r'\1'), (re.compile('(]*>\n\n.*?)', re.I), r'\1'), (re.compile('(]*>\n\n)
(.*?)
(.*?\n\n)(\2') ] def preprocess_dom(self, dom): """Repeat td elements according to their rowspan attributes in subsequent tr elements. """ cols = self.xpath(dom, "//td[@rowspan]") for col in cols: span = int(col.get('rowspan')) del col.attrib['rowspan'] position = len(self.xpath(col, "./preceding-sibling::td")) row = col.getparent() for tr in self.xpath(row, "./following-sibling::tr")[:span - 1]: # if not cloned, child will be moved to new parent clone = self.clone(col) # XXX: beware that here we don't use an "adapted" function, # because both BeautifulSoup and lxml uses the same # "insert" method. tr.insert(position, clone) return dom def postprocess_data(self, data): if len(data) == 0: return {} nd = [] for key in data.keys(): dom = self.get_dom(key) assigner = self.xpath(dom, "//a/text()")[0] for entry in data[key]: if not entry.has_key('name'): if not entry: continue # this is an award, not a recipient entry['assigner'] = assigner.strip() # find the recipients matches = [p for p in data[key] if 'name' in p and (entry['anchor'] == p['anchor'])] if self.subject == 'title': recipients = [ Person(name=recipient['name'], personID=analyze_imdbid(recipient['link'])) for recipient in matches ] entry['to'] = recipients elif self.subject == 'name': recipients = [ Movie(title=recipient['name'], movieID=analyze_imdbid(recipient['link'])) for recipient in matches ] entry['for'] = recipients nd.append(entry) del entry['anchor'] return {'awards': nd} class DOMHTMLTaglinesParser(DOMParserBase): """Parser for the "taglines" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: tparser = DOMHTMLTaglinesParser() result = tparser.parse(taglines_html_string) """ extractors = [ Extractor( label='taglines', path="//div[@id='taglines_content']/div", attrs=Attribute( key='taglines', multi=True, path=".//text()" ) ) ] def preprocess_dom(self, dom): for node in self.xpath(dom, "//div[@id='taglines_content']/div[@class='header']"): node.drop_tree() for node in self.xpath(dom, "//div[@id='taglines_content']/div[@id='no_content']"): node.drop_tree() return dom def postprocess_data(self, data): if 'taglines' in data: data['taglines'] = [tagline.strip() for tagline in data['taglines']] return data class DOMHTMLKeywordsParser(DOMParserBase): """Parser for the "keywords" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: kwparser = DOMHTMLKeywordsParser() result = kwparser.parse(keywords_html_string) """ extractors = [ Extractor( label='keywords', path="//a[starts-with(@href, '/keyword/')]", attrs=Attribute( key='keywords', path="./text()", multi=True, postprocess=lambda x: x.lower().replace(' ', '-') ) ) ] class DOMHTMLAlternateVersionsParser(DOMParserBase): """Parser for the "alternate versions" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: avparser = HTMLAlternateVersionsParser() result = avparser.parse(alternateversions_html_string) """ _defGetRefs = True extractors = [ Extractor( label='alternate versions', path="//ul[@class='trivia']/li", attrs=Attribute( key='alternate versions', multi=True, path=".//text()", postprocess=lambda x: x.strip() ) ) ] class DOMHTMLTriviaParser(DOMParserBase): """Parser for the "trivia" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: avparser = HTMLAlternateVersionsParser() result = avparser.parse(alternateversions_html_string) """ _defGetRefs = True extractors = [ Extractor( label='alternate versions', path="//div[@class='sodatext']", attrs=Attribute( key='trivia', multi=True, path=".//text()", postprocess=lambda x: x.strip()) ) ] def preprocess_dom(self, dom): # Remove "link this quote" links. for qLink in self.xpath(dom, "//span[@class='linksoda']"): qLink.drop_tree() return dom class DOMHTMLSoundtrackParser(DOMParserBase): _defGetRefs = True preprocessors = [('
<br>', '\n'), ('<br />
', '\n')] extractors = [ Extractor( label='soundtrack', path="//div[@class='list']//div", attrs=Attribute( key='soundtrack', multi=True, path=".//text()", postprocess=lambda x: x.strip() ) ) ] def postprocess_data(self, data): if 'soundtrack' in data: nd = [] for x in data['soundtrack']: ds = x.split('\n') title = ds[0] if title[0] == '"' and title[-1] == '"': title = title[1:-1] nds = [] newData = {} for l in ds[1:]: if ' with ' in l or ' by ' in l or ' from ' in l \ or ' of ' in l or l.startswith('From '): nds.append(l) else: if nds: nds[-1] += l else: nds.append(l) newData[title] = {} for l in nds: skip = False for sep in ('From ',): if l.startswith(sep): fdix = len(sep) kind = l[:fdix].rstrip().lower() info = l[fdix:].lstrip() newData[title][kind] = info skip = True if not skip: for sep in ' with ', ' by ', ' from ', ' of ': fdix = l.find(sep) if fdix != -1: fdix = fdix + len(sep) kind = l[:fdix].rstrip().lower() info = l[fdix:].lstrip() newData[title][kind] = info break nd.append(newData) data['soundtrack'] = nd return data class DOMHTMLCrazyCreditsParser(DOMParserBase): """Parser for the "crazy credits" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: ccparser = DOMHTMLCrazyCreditsParser() result = ccparser.parse(crazycredits_html_string) """ _defGetRefs = True extractors = [ Extractor( label='crazy credits', path="//ul/li/tt", attrs=Attribute( key='crazy credits', multi=True, path=".//text()", postprocess=lambda x: x.replace('\n', ' ').replace(' ', ' ') ) ) ] def _process_goof(x): if x['spoiler_category']: return x['spoiler_category'].strip() + ': SPOILER: ' + x['text'].strip() else: return x['category'].strip() + ': ' + x['text'].strip() class DOMHTMLGoofsParser(DOMParserBase): """Parser for the "goofs" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: gparser = DOMHTMLGoofsParser() result = gparser.parse(goofs_html_string) """ _defGetRefs = True extractors = [ Extractor( label='goofs', path="//div[@class='soda odd']", attrs=Attribute( key='goofs', multi=True, path={ 'text': "./text()", 'category': './preceding-sibling::h4[1]/text()', 'spoiler_category': './h4/text()' }, postprocess=_process_goof ) ) ] class DOMHTMLQuotesParser(DOMParserBase): """Parser for the "memorable quotes" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: qparser = DOMHTMLQuotesParser() result = qparser.parse(quotes_html_string) """ _defGetRefs = True extractors = [ Extractor( label='quotes_odd', path="//div[@class='quote soda odd']", attrs=Attribute( key='quotes_odd', multi=True, path=".//text()", postprocess=lambda x: x .strip() .replace(' \n', '::') .replace('::\n', '::') .replace('\n', ' ') ) ), Extractor( label='quotes_even', path="//div[@class='quote soda even']", attrs=Attribute( key='quotes_even', multi=True, path=".//text()", postprocess=lambda x: x .strip() .replace(' \n', '::') .replace('::\n', '::') .replace('\n', ' ') ) ) ] preprocessors = [ (re.compile('
', re.I), '') ] def preprocess_dom(self, dom): # Remove "link this quote" links. for qLink in self.xpath(dom, "//span[@class='linksoda']"): qLink.drop_tree() for qLink in self.xpath(dom, "//div[@class='sharesoda_pre']"): qLink.drop_tree() return dom def postprocess_data(self, data): quotes = data.get('quotes_odd', []) + data.get('quotes_even', []) if not quotes: return {} quotes = [q.split('::') for q in quotes] return {'quotes': quotes} class DOMHTMLReleaseinfoParser(DOMParserBase): """Parser for the "release dates" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: rdparser = DOMHTMLReleaseinfoParser() result = rdparser.parse(releaseinfo_html_string) """ extractors = [ Extractor( label='release dates', path="//table[@id='release_dates']//tr", attrs=Attribute( key='release dates', multi=True, path={ 'country': ".//td[1]//text()", 'date': ".//td[2]//text()", 'notes': ".//td[3]//text()" } ) ), Extractor( label='akas', path="//table[@id='akas']//tr", attrs=Attribute( key='akas', multi=True, path={ 'title': "./td[1]/text()", 'countries': "./td[2]/text()"} ) ) ] preprocessors = [ (re.compile('(
)', re.I | re.M | re.S), r'
\1
') ] def postprocess_data(self, data): if not ('release dates' in data or 'akas' in data): return data releases = data.get('release dates') or [] rl = [] for i in releases: country = i.get('country') date = i.get('date') if not (country and date): continue country = country.strip() date = date.strip() if not (country and date): continue notes = i['notes'] info = u'%s::%s' % (country, date) if notes: info += notes rl.append(info) if releases: del data['release dates'] if rl: data['release dates'] = rl akas = data.get('akas') or [] nakas = [] for aka in akas: title = (aka.get('title') or '').strip() if not title: continue countries = (aka.get('countries') or '').split(',') if not countries: nakas.append(title) else: for country in countries: nakas.append('%s::%s' % (title, country.strip())) if akas: del data['akas'] if nakas: data['akas from release info'] = nakas return data class DOMHTMLRatingsParser(DOMParserBase): """Parser for the "user ratings" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: rparser = DOMHTMLRatingsParser() result = rparser.parse(userratings_html_string) """ re_means = re.compile('mean\s*=\s*([0-9]\.[0-9])\.\s*median\s*=\s*([0-9])', re.I) extractors = [ Extractor( label='number of votes', path="//td[b='Percentage']/../../tr", attrs=[ Attribute( key='votes', multi=True, path={ 'votes': "td[1]//text()", 'ordinal': "td[3]//text()" } ) ] ), Extractor( label='mean and median', path="//p[starts-with(text(), 'Arithmetic mean')]", attrs=Attribute( key='mean and median', path="text()" ) ), Extractor( label='rating', path="//a[starts-with(@href, '/search/title?user_rating=')]", attrs=Attribute( key='rating', path="text()" ) ), Extractor( label='demographic voters', path="//td[b='Average']/../../tr", attrs=Attribute( key='demographic voters', multi=True, path={ 'voters': "td[1]//text()", 'votes': "td[2]//text()", 'average': "td[3]//text()" } ) ), Extractor( label='top 250', path="//a[text()='top 250']", attrs=Attribute( key='top 250', path="./preceding-sibling::text()[1]" ) ) ] def postprocess_data(self, data): nd = {} votes = data.get('votes', []) if votes: nd['number of votes'] = {} for i in xrange(1, 11): _ordinal = int(votes[i]['ordinal']) _strvts = votes[i]['votes'] or '0' nd['number of votes'][_ordinal] = int(_strvts.replace(',', '')) mean = data.get('mean and median', '') if mean: means = self.re_means.findall(mean) if means and len(means[0]) == 2: am, med = means[0] try: am = float(am) except (ValueError, OverflowError): pass if isinstance(am, float): nd['arithmetic mean'] = am try: med = int(med) except (ValueError, OverflowError): pass if isinstance(med, int): nd['median'] = med if 'rating' in data: nd['rating'] = float(data['rating']) dem_voters = data.get('demographic voters') if dem_voters: nd['demographic'] = {} for i in xrange(1, len(dem_voters)): if (dem_voters[i]['votes'] is not None) and (dem_voters[i]['votes'].strip()): nd['demographic'][dem_voters[i]['voters'].strip().lower()] = \ (int(dem_voters[i]['votes'].replace(',', '')), float(dem_voters[i]['average'])) if 'imdb users' in nd.get('demographic', {}): nd['votes'] = nd['demographic']['imdb users'][0] nd['demographic']['all votes'] = nd['demographic']['imdb users'] del nd['demographic']['imdb users'] top250 = data.get('top 250') if top250: sd = top250[9:] i = sd.find(' ') if i != -1: sd = sd[:i] try: sd = int(sd) except (ValueError, OverflowError): pass if 
isinstance(sd, int): nd['top 250 rank'] = sd return nd class DOMHTMLEpisodesRatings(DOMParserBase): """Parser for the "episode ratings ... by date" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: erparser = DOMHTMLEpisodesRatings() result = erparser.parse(eprating_html_string) """ _containsObjects = True extractors = [ Extractor( label='title', path="//title", attrs=Attribute( key='title', path="./text()" ) ), Extractor( label='ep ratings', path="//th/../..//tr", attrs=Attribute( key='episodes', multi=True, path={ 'nr': ".//td[1]/text()", 'ep title': ".//td[2]//text()", 'movieID': ".//td[2]/a/@href", 'rating': ".//td[3]/text()", 'votes': ".//td[4]/text()" } ) ) ] def postprocess_data(self, data): if 'title' not in data or 'episodes' not in data: return {} nd = [] title = data['title'] for i in data['episodes']: ept = i['ep title'] movieID = analyze_imdbid(i['movieID']) votes = i['votes'] rating = i['rating'] if not (ept and movieID and votes and rating): continue try: votes = int(votes.replace(',', '').replace('.', '')) except: pass try: rating = float(rating) except: pass ept = ept.strip() ept = u'%s {%s' % (title, ept) nr = i['nr'] if nr: ept += u' (#%s)' % nr.strip() ept += '}' if movieID is not None: movieID = str(movieID) m = Movie(title=ept, movieID=movieID, accessSystem=self._as, modFunct=self._modFunct) epofdict = m.get('episode of') if epofdict is not None: m['episode of'] = Movie(data=epofdict, accessSystem=self._as, modFunct=self._modFunct) nd.append({'episode': m, 'votes': votes, 'rating': rating}) return {'episodes rating': nd} def _normalize_href(href): if (href is not None) and (not href.lower().startswith('http://')): if href.startswith('/'): href = href[1:] # TODO: imdbURL_base may be set by the user! href = '%s%s' % (imdbURL_base, href) return href class DOMHTMLCriticReviewsParser(DOMParserBase): """Parser for the "critic reviews" pages of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: osparser = DOMHTMLCriticReviewsParser() result = osparser.parse(officialsites_html_string) """ kind = 'critic reviews' extractors = [ Extractor( label='metascore', path="//div[@class='metascore_wrap']/div/span", attrs=Attribute( key='metascore', path=".//text()" ) ), Extractor( label='metacritic url', path="//div[@class='article']/div[@class='see-more']/a", attrs=Attribute( key='metacritic url', path="./@href" ) ) ] class DOMHTMLReviewsParser(DOMParserBase): """Parser for the "reviews" pages of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: osparser = DOMHTMLReviewsParser() result = osparser.parse(officialsites_html_string) """ kind = 'reviews' extractors = [ Extractor( label='review', path="//div[@class='review-container']", attrs=Attribute( key='self.kind', multi=True, path={ 'text': ".//div[@class='text']//text()", 'helpful': ".//div[@class='text-muted']/text()[1]", 'title': ".//div[@class='title']//text()", 'author': ".//span[@class='display-name-link']/a/@href", 'date': ".//span[@class='review-date']//text()", 'rating': ".//span[@class='point-scale']/preceding-sibling::span[1]/text()" }, postprocess=lambda x: ({ 'content': (x['text'] or u'').replace(u"\n", u" ").replace(u' ', u' ').strip(), 'helpful': [int(s) for s in (x.get('helpful') or u'').split() if s.isdigit()], 'title': (x.get('title') or u'').strip(), 'author': analyze_imdbid(x.get('author')), 'date': (x.get('date') or u'').strip(), 'rating': (x.get('rating') or u'').strip() }) ) ) ] preprocessors = [('
<br>', '<br>
\n')] def postprocess_data(self, data): for review in data.get('reviews', []): if review.get('rating') and len(review['rating']) == 2: review['rating'] = int(review['rating'][0]) else: review['rating'] = None if review.get('helpful') and len(review['helpful']) == 2: review['not_helpful'] = review['helpful'][1] - review['helpful'][0] review['helpful'] = review['helpful'][0] else: review['helpful'] = 0 review['not_helpful'] = 0 review['author'] = u"ur%s" % review['author'] return data class DOMHTMLFullCreditsParser(DOMParserBase): """Parser for the "full credits" (series cast section) page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: osparser = DOMHTMLFullCreditsParser() result = osparser.parse(officialsites_html_string) """ kind = 'full credits' extractors = [ Extractor( label='cast', path="//table[@class='cast_list']//tr[@class='odd' or @class='even']", attrs=Attribute( key="cast", multi=True, path={ 'person': ".//text()", 'link': "td[2]/a/@href", 'roleID': "td[4]//div[@class='_imdbpyrole']/@roleid" }, postprocess=lambda x: build_person( x.get('person') or u'', personID=analyze_imdbid(x.get('link')), roleID=(x.get('roleID') or u'').split('/') ) ) ), ] preprocessors = [ (_reRolesMovie, _manageRoles) ] class DOMHTMLOfficialsitesParser(DOMParserBase): """Parser for the "official sites", "external reviews", "newsgroup reviews", "miscellaneous links", "sound clips", "video clips" and "photographs" pages of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: osparser = DOMHTMLOfficialsitesParser() result = osparser.parse(officialsites_html_string) """ kind = 'official sites' extractors = [ Extractor( label='site', path="//ol/li/a", attrs=Attribute( key='self.kind', multi=True, path={ 'link': "./@href", 'info': "./text()" }, postprocess=lambda x: ( x.get('info').strip(), urllib.unquote(_normalize_href(x.get('link'))) ) ) ) ] class DOMHTMLConnectionParser(DOMParserBase): """Parser for the "connections" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: connparser = DOMHTMLConnectionParser() result = connparser.parse(connections_html_string) """ _containsObjects = True extractors = [ Extractor( label='connection', group="//div[@class='_imdbpy']", group_key="./h5/text()", group_key_normalize=lambda x: x.lower(), path="./a", attrs=Attribute( key=None, path={ 'title': "./text()", 'movieID': "./@href" }, multi=True ) ) ] preprocessors = [ ('
<h5>', '</div><div class="_imdbpy"><h5>
'), # To get the movie's year. ('</a> (', ' ('), ('\n
<br/>', '</a>'), ('<br/>
- ', '::') ] def postprocess_data(self, data): for key in data.keys(): nl = [] for v in data[key]: title = v['title'] ts = title.split('::', 1) title = ts[0].strip() notes = u'' if len(ts) == 2: notes = ts[1].strip() m = Movie(title=title, movieID=analyze_imdbid(v['movieID']), accessSystem=self._as, notes=notes, modFunct=self._modFunct) nl.append(m) data[key] = nl if not data: return {} return {'connections': data} class DOMHTMLLocationsParser(DOMParserBase): """Parser for the "locations" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: lparser = DOMHTMLLocationsParser() result = lparser.parse(locations_html_string) """ extractors = [ Extractor( label='locations', path="//dt", attrs=Attribute( key='locations', multi=True, path={ 'place': ".//text()", 'note': "./following-sibling::dd[1]//text()" }, postprocess=lambda x: ( u'%s::%s' % ( x['place'].strip(), (x['note'] or u'').strip() ) ).strip(':') ) ) ] class DOMHTMLTechParser(DOMParserBase): """Parser for the "technical", "publicity" (for people) and "contacts" (for people) pages of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: tparser = HTMLTechParser() result = tparser.parse(technical_html_string) """ kind = 'tech' re_space = re.compile(r'\s+') extractors = [ Extractor( label='tech', group="//table//tr/td[@class='label']", group_key="./text()", group_key_normalize=lambda x: x.lower().strip(), path=".", attrs=Attribute( key=None, path="..//td[2]//text()", postprocess=lambda x: [t.strip() for t in x.split(':::') if t.strip()] ) ) ] preprocessors = [ (re.compile('(
<h5>.*?</h5>
)', re.I), r'
</div>\1<div class="_imdbpy">
'), (re.compile('((
<br/>|</p>|</table>))\n?<br/>
(?!<a)', re.I), r'\1</div>'),
        # the ones below are for the publicity parser
        (re.compile('<p>
(.*?)</p>
', re.I), r'\1<br/>
'), (re.compile('(</td><td valign="top">)', re.I), r'\1::'),
        (re.compile('(</tr><tr>)', re.I), r'\n\1'),
        (re.compile('\|', re.I), r':::'),
        (re.compile('<br/>
', re.I), r':::') # this is for splitting individual entries ] def postprocess_data(self, data): for key in data: data[key] = filter(lambda x: x != '|', data[key]) data[key] = [self.re_space.sub(' ', x).strip() for x in data[key]] data[key] = filter(None, data[key]) if self.kind == 'contacts' and data: data = {self.kind: data} else: if self.kind == 'publicity': if 'biography (print)' in data: data['biography-print'] = data['biography (print)'] del data['biography (print)'] # Tech info. for key in data.keys(): if key.startswith('film negative format'): data['film negative format'] = data[key] del data[key] elif key.startswith('film length'): data['film length'] = data[key] del data[key] return data class DOMHTMLBusinessParser(DOMParserBase): """Parser for the "business" and "literature" pages of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: bparser = DOMHTMLBusinessParser() result = bparser.parse(business_html_string) """ re_space = re.compile(r'\s+') extractors = [ Extractor( label='business', path="//div[@id='tn15content']//h5", attrs=Attribute( key='./text()', path="./following-sibling::div[1]//text()" ) ) ] preprocessors = [ ('', '
'), ('
', '
'), ('
', '
'), ('

', '


'), ('
', ':::') ] def postprocess_data(self, data): newData = {} for key, value in data.iteritems(): value = value.strip().split(':::') value = [v.strip() for v in value] value = filter(None, value) if not value: continue newData[key.lower().strip()] = value return newData class DOMHTMLRecParser(DOMParserBase): """Parser for the "recommendations" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: rparser = HTMLRecParser() result = rparser.parse(recommendations_html_string) """ _containsObjects = True extractors = [ Extractor( label='recommendations', path="//td[@valign='middle'][1]", attrs=Attribute( key='../../tr/td[1]//text()', multi=True, path={ 'title': ".//text()", 'movieID': ".//a/@href" } ) ) ] def postprocess_data(self, data): for key in data.keys(): n_key = key n_keyl = n_key.lower() if n_keyl == 'suggested by the database': n_key = 'database' elif n_keyl == 'imdb users recommend': n_key = 'users' data[n_key] = [ Movie(title=x['title'], movieID=analyze_imdbid(x['movieID']), accessSystem=self._as, modFunct=self._modFunct) for x in data[key] ] del data[key] if data: return {'recommendations': data} return data class DOMHTMLNewsParser(DOMParserBase): """Parser for the "news" page of a given movie or person. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: nwparser = DOMHTMLNewsParser() result = nwparser.parse(news_html_string) """ _defGetRefs = True extractors = [ Extractor( label='news', path="//h2", attrs=Attribute( key='news', multi=True, path={ 'title': "./text()", 'fromdate': "../following-sibling::p[1]/small//text()", # FIXME: sometimes (see The Matrix (1999))
<p></p>
is found # inside news text. 'body': "../following-sibling::p[2]//text()", 'link': "../..//a[text()='Permalink']/@href", 'fulllink': "../..//a[starts-with(text(), 'See full article at')]/@href" }, postprocess=lambda x: { 'title': x.get('title').strip(), 'date': x.get('fromdate').split('|')[0].strip(), 'from': x.get('fromdate').split('|')[1].replace('From ', '').strip(), 'body': (x.get('body') or u'').strip(), 'link': _normalize_href(x.get('link')), 'full article link': _normalize_href(x.get('fulllink')) } ) ) ] preprocessors = [ (re.compile('(]+>
<h2>)', re.I), r'<div class="_imdbpy">
\1'), (re.compile('(
<hr/>)', re.I), r'</div>
\1'), (re.compile('
<p></p>
', re.I), r'') ] def postprocess_data(self, data): if not data.has_key('news'): return {} for news in data['news']: if news.has_key('full article link'): if news['full article link'] is None: del news['full article link'] return data def _parse_review(x): result = {} title = x.get('title').strip() if title[-1] == ':': title = title[:-1] result['title'] = title result['link'] = _normalize_href(x.get('link')) kind = x.get('kind').strip() if kind[-1] == ':': kind = kind[:-1] result['review kind'] = kind text = x.get('review').replace('\n\n', '||').replace('\n', ' ').split('||') review = '\n'.join(text) if x.get('author') is not None: author = x.get('author').strip() review = review.split(author)[0].strip() result['review author'] = author[2:] if x.get('item') is not None: item = x.get('item').strip() review = review[len(item):].strip() review = "%s: %s" % (item, review) result['review'] = review return result class DOMHTMLSeasonEpisodesParser(DOMParserBase): """Parser for the "episode list" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: sparser = DOMHTMLSeasonEpisodesParser() result = sparser.parse(episodes_html_string) """ extractors = [ Extractor( label='series link', path="//div[@class='parent']", attrs=[ Attribute( key='series link', path=".//a/@href" ) ] ), Extractor( label='series title', path="//head/meta[@property='og:title']", attrs=[ Attribute( key='series title', path="./@content" ) ] ), Extractor( label='seasons list', path="//select[@id='bySeason']//option", attrs=[ Attribute( key='_seasons', multi=True, path="./@value" ) ] ), Extractor( label='selected season', path="//select[@id='bySeason']//option[@selected]", attrs=[ Attribute( key='_current_season', path='./@value' ) ] ), Extractor( label='episodes', path=".", group="//div[@class='info']", group_key=".//meta/@content", group_key_normalize=lambda x: 'episode %s' % x, attrs=[ Attribute( key=None, multi=True, path={ "link": ".//strong//a[@href][1]/@href", "original air date": ".//div[@class='airdate']/text()", "title": ".//strong//text()", "plot": ".//div[@class='item_description']//text()" } ) ] ) ] def postprocess_data(self, data): series_id = analyze_imdbid(data.get('series link')) series_title = data.get('series title', '').strip() selected_season = data.get('_current_season', 'unknown season').strip() if not (series_id and series_title): return {} series = Movie(title=series_title, movieID=str(series_id), accessSystem=self._as, modFunct=self._modFunct) if series.get('kind') == 'movie': series['kind'] = u'tv series' try: selected_season = int(selected_season) except: pass nd = {selected_season: {}} if 'episode -1' in data: counter = 1 for episode in data['episode -1']: while 'episode %d' % counter in data: counter += 1 k = 'episode %d' % counter data[k] = [episode] del data['episode -1'] for episode_nr, episode in data.items(): if not (episode and episode[0] and episode_nr.startswith('episode ')): continue episode = episode[0] episode_nr = episode_nr[8:].rstrip() try: episode_nr = int(episode_nr) except: pass episode_id = analyze_imdbid(episode.get('link' '')) episode_air_date = episode.get('original air date', '').strip() episode_title = episode.get('title', '').strip() episode_plot = episode.get('plot', '') if not (episode_nr is not None and episode_id and episode_title): continue ep_obj = Movie(movieID=episode_id, title=episode_title, accessSystem=self._as, 
modFunct=self._modFunct) ep_obj['kind'] = u'episode' ep_obj['episode of'] = series ep_obj['season'] = selected_season ep_obj['episode'] = episode_nr if episode_air_date: ep_obj['original air date'] = episode_air_date if episode_air_date[-4:].isdigit(): ep_obj['year'] = episode_air_date[-4:] if episode_plot: ep_obj['plot'] = episode_plot nd[selected_season][episode_nr] = ep_obj _seasons = data.get('_seasons') or [] for idx, season in enumerate(_seasons): try: _seasons[idx] = int(season) except: pass return {'episodes': nd, '_seasons': _seasons, '_current_season': selected_season} def _build_episode(x): """Create a Movie object for a given series' episode.""" episode_id = analyze_imdbid(x.get('link')) episode_title = x.get('title') e = Movie(movieID=episode_id, title=episode_title) e['kind'] = u'episode' oad = x.get('oad') if oad: e['original air date'] = oad.strip() year = x.get('year') if year is not None: year = year[5:] if year == 'unknown': year = u'????' if year and year.isdigit(): year = int(year) e['year'] = year else: if oad and oad[-4:].isdigit(): e['year'] = int(oad[-4:]) epinfo = x.get('episode') if epinfo is not None: season, episode = epinfo.split(':')[0].split(',') e['season'] = int(season[7:]) e['episode'] = int(episode[8:]) else: e['season'] = 'unknown' e['episode'] = 'unknown' plot = x.get('plot') if plot: e['plot'] = plot.strip() return e class DOMHTMLEpisodesParser(DOMParserBase): """Parser for the "episode list" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: eparser = DOMHTMLEpisodesParser() result = eparser.parse(episodes_html_string) """ # XXX: no more used for the list of episodes parser, # but only for the episodes cast parser (see below). _containsObjects = True kind = 'episodes list' _episodes_path = "..//h4" _oad_path = "./following-sibling::span/strong[1]/text()" def _init(self): self.extractors = [ Extractor( label='series', path="//html", attrs=[ Attribute( key='series title', path=".//title/text()" ), Attribute( key='series movieID', path=".//h1/a[@class='main']/@href", postprocess=analyze_imdbid ) ] ), Extractor( label='episodes', group="//div[@class='_imdbpy']/h3", group_key="./a/@name", path=self._episodes_path, attrs=Attribute( key=None, multi=True, path={ 'link': "./a/@href", 'title': "./a/text()", 'year': "./preceding-sibling::a[1]/@name", 'episode': "./text()[1]", 'oad': self._oad_path, 'plot': "./following-sibling::text()[1]" }, postprocess=_build_episode ) ) ] if self.kind == 'episodes cast': self.extractors += [ Extractor( label='cast', group="//h4", group_key="./text()[1]", group_key_normalize=lambda x: x.strip(), path="./following-sibling::table[1]//td[@class='nm']", attrs=Attribute( key=None, multi=True, path={ 'person': "..//text()", 'link': "./a/@href", 'roleID': "../td[4]/div[@class='_imdbpyrole']/@roleid" }, postprocess=lambda x: build_person( x.get('person') or u'', personID=analyze_imdbid(x.get('link')), roleID=(x.get('roleID') or u'').split('/'), accessSystem=self._as, modFunct=self._modFunct ) ) ) ] preprocessors = [ (re.compile('(
\n)(

)', re.I), r'

\1
\2'), (re.compile('(

\n\n)
', re.I), r'\1'), (re.compile('

(.*?)

', re.I), r'

\1

'), (_reRolesMovie, _manageRoles), (re.compile('(

\n)(
)', re.I), r'\1
\2') ] def postprocess_data(self, data): # A bit extreme? if 'series title' not in data: return {} if 'series movieID' not in data: return {} stitle = data['series title'].replace('- Episode list', '') stitle = stitle.replace('- Episodes list', '') stitle = stitle.replace('- Episode cast', '') stitle = stitle.replace('- Episodes cast', '') stitle = stitle.strip() if not stitle: return {} seriesID = data['series movieID'] if seriesID is None: return {} series = Movie(title=stitle, movieID=str(seriesID), accessSystem=self._as, modFunct=self._modFunct) nd = {} for key in data.keys(): if key.startswith('filter-season-') or key.startswith('season-'): season_key = key.replace('filter-season-', '').replace('season-', '') try: season_key = int(season_key) except: pass nd[season_key] = {} ep_counter = 1 for episode in data[key]: if not episode: continue episode_key = episode.get('episode') if episode_key is None: continue if not isinstance(episode_key, int): episode_key = ep_counter ep_counter += 1 cast_key = 'Season %s, Episode %s:' % (season_key, episode_key) if cast_key in data: cast = data[cast_key] for i in xrange(len(cast)): cast[i].billingPos = i + 1 episode['cast'] = cast episode['episode of'] = series nd[season_key][episode_key] = episode if len(nd) == 0: return {} return {'episodes': nd} class DOMHTMLEpisodesCastParser(DOMHTMLEpisodesParser): """Parser for the "episodes cast" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: eparser = DOMHTMLEpisodesParser() result = eparser.parse(episodes_html_string) """ kind = 'episodes cast' _episodes_path = "..//h4" _oad_path = "./following-sibling::b[1]/text()" class DOMHTMLFaqsParser(DOMParserBase): """Parser for the "FAQ" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: fparser = DOMHTMLFaqsParser() result = fparser.parse(faqs_html_string) """ _defGetRefs = True # XXX: bsoup and lxml don't match (looks like a minor issue, anyway). extractors = [ Extractor( label='faqs', path="//div[@class='section']", attrs=Attribute( key='faqs', multi=True, path={ 'question': "./h3/a/span/text()", 'answer': "../following-sibling::div[1]//text()" }, postprocess=lambda x: u'%s::%s' % ( x.get('question').strip(), '\n\n'.join(x.get('answer').replace('\n\n', '\n').strip().split('||')) ) ) ) ] preprocessors = [ (re.compile('
<br/><br/>
', re.I), r'||'), (re.compile('
<h3>(.*?)</h3>
\n', re.I), r'||\1--'), (re.compile('(.*?)', re.I), r'[spoiler]\1[/spoiler]') ] class DOMHTMLAiringParser(DOMParserBase): """Parser for the "airing" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: aparser = DOMHTMLAiringParser() result = aparser.parse(airing_html_string) """ _containsObjects = True extractors = [ Extractor( label='series title', path="//title", attrs=Attribute( key='series title', path="./text()", postprocess=lambda x: x.replace(' - TV schedule', u'') ) ), Extractor( label='series id', path="//h1/a[@href]", attrs=Attribute( key='series id', path="./@href" ) ), Extractor( label='tv airings', path="//tr[@class]", attrs=Attribute( key='airing', multi=True, path={ 'date': "./td[1]//text()", 'time': "./td[2]//text()", 'channel': "./td[3]//text()", 'link': "./td[4]/a[1]/@href", 'title': "./td[4]//text()", 'season': "./td[5]//text()", }, postprocess=lambda x: { 'date': x.get('date'), 'time': x.get('time'), 'channel': x.get('channel').strip(), 'link': x.get('link'), 'title': x.get('title'), 'season': (x.get('season') or '').strip() } ) ) ] def postprocess_data(self, data): if len(data) == 0: return {} seriesTitle = data['series title'] seriesID = analyze_imdbid(data['series id']) if data.has_key('airing'): for airing in data['airing']: title = airing.get('title', '').strip() if not title: epsTitle = seriesTitle if seriesID is None: continue epsID = seriesID else: epsTitle = '%s {%s}' % (data['series title'], airing['title']) epsID = analyze_imdbid(airing['link']) e = Movie(title=epsTitle, movieID=epsID) airing['episode'] = e del airing['link'] del airing['title'] if not airing['season']: del airing['season'] if 'series title' in data: del data['series title'] if 'series id' in data: del data['series id'] if 'airing' in data: data['airing'] = filter(None, data['airing']) if 'airing' not in data or not data['airing']: return {} return data class DOMHTMLSynopsisParser(DOMParserBase): """Parser for the "synopsis" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: sparser = HTMLSynopsisParser() result = sparser.parse(synopsis_html_string) """ extractors = [ Extractor( label='synopsis', path="//ul[@id='plot-synopsis-content'][not(@style)]", attrs=Attribute( key='synopsis', path=".//text()", postprocess=lambda x: '\n\n'.join(x.strip().split('||')) ) ) ] preprocessors = [ (re.compile('
<br/><br/>
', re.I), r'||') ] class DOMHTMLParentsGuideParser(DOMParserBase): """Parser for the "parents guide" page of a given movie. The page should be provided as a string, as taken from the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: pgparser = HTMLParentsGuideParser() result = pgparser.parse(parentsguide_html_string) """ extractors = [ Extractor( label='parents guide', group="//div[@class='section']", group_key="./h3/a/span/text()", group_key_normalize=lambda x: x.lower(), path="../following-sibling::div[1]/p", attrs=Attribute( key=None, path=".//text()", postprocess=lambda x: [ t.strip().replace('\n', ' ') for t in x.split('||') if t.strip() ] ) ) ] preprocessors = [ (re.compile('
<br/><br/>
', re.I), r'||') ] def postprocess_data(self, data): data2 = {} for key in data: if data[key]: data2[key] = data[key] if not data2: return {} return {'parents guide': data2} _OBJECTS = { 'movie_parser': ((DOMHTMLMovieParser,), None), 'plot_parser': ((DOMHTMLPlotParser,), None), 'movie_awards_parser': ((DOMHTMLAwardsParser,), None), 'taglines_parser': ((DOMHTMLTaglinesParser,), None), 'keywords_parser': ((DOMHTMLKeywordsParser,), None), 'crazycredits_parser': ((DOMHTMLCrazyCreditsParser,), None), 'goofs_parser': ((DOMHTMLGoofsParser,), None), 'alternateversions_parser': ((DOMHTMLAlternateVersionsParser,), None), 'trivia_parser': ((DOMHTMLTriviaParser,), None), 'soundtrack_parser': ((DOMHTMLSoundtrackParser,), None), 'quotes_parser': ((DOMHTMLQuotesParser,), None), 'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None), 'ratings_parser': ((DOMHTMLRatingsParser,), None), 'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None), 'criticrev_parser': ((DOMHTMLCriticReviewsParser,), {'kind': 'critic reviews'}), 'reviews_parser': ((DOMHTMLReviewsParser,), {'kind': 'reviews'}), 'externalsites_parser': ((DOMHTMLOfficialsitesParser,), None), 'externalrev_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'external reviews'}), 'misclinks_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'misc links'}), 'soundclips_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'sound clips'}), 'videoclips_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'video clips'}), 'photosites_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'photo sites'}), 'connections_parser': ((DOMHTMLConnectionParser,), None), 'tech_parser': ((DOMHTMLTechParser,), None), 'business_parser': ((DOMHTMLBusinessParser,), {'kind': 'business', '_defGetRefs': 1}), 'literature_parser': ((DOMHTMLBusinessParser,), None), 'locations_parser': ((DOMHTMLLocationsParser,), None), 'rec_parser': ((DOMHTMLRecParser,), None), 'news_parser': ((DOMHTMLNewsParser,), None), 'episodes_parser': ((DOMHTMLEpisodesParser,), None), 'season_episodes_parser': ((DOMHTMLSeasonEpisodesParser,), None), 'episodes_cast_parser': ((DOMHTMLEpisodesCastParser,), None), 'eprating_parser': ((DOMHTMLEpisodesRatings,), None), 'movie_faqs_parser': ((DOMHTMLFaqsParser,), None), 'airing_parser': ((DOMHTMLAiringParser,), None), 'synopsis_parser': ((DOMHTMLSynopsisParser,), None), 'parentsguide_parser': ((DOMHTMLParentsGuideParser,), None) }
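

# A minimal usage sketch (assuming `page` already holds the HTML of a
# title's reference page, fetched separately; inside the package these
# classes are normally instantiated through the _OBJECTS mapping above):
#
#     mparser = DOMHTMLMovieParser()
#     result = mparser.parse(page)
#     print result.get('title'), result.get('genres')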