From 78026584ebe483504c9b5af0ce1f9e8b9700b7d6 Mon Sep 17 00:00:00 2001 From: JackDandy Date: Mon, 26 Mar 2018 18:16:59 +0100 Subject: [PATCH] =?UTF-8?q?Update=20IMDb=205.1=20(r907)=20=E2=86=92=205.2.?= =?UTF-8?q?1dev20171113=20(f640595).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thanks to the backport by @MasterMind2k --- CHANGES.md | 1 + lib/imdb/Company.py | 4 +- lib/imdb/Movie.py | 4 +- lib/imdb/__init__.py | 144 +- lib/imdb/_logging.py | 5 +- lib/imdb/helpers.py | 6 +- lib/imdb/parser/http/__init__.py | 27 +- lib/imdb/parser/http/bsouplxml/_bsoup.py | 42 +- lib/imdb/parser/http/bsouplxml/bsoupxpath.py | 2 +- lib/imdb/parser/http/characterParser.py | 8 +- lib/imdb/parser/http/companyParser.py | 65 +- lib/imdb/parser/http/movieParser.py | 2209 +++++++++++------ lib/imdb/parser/http/personParser.py | 264 +- lib/imdb/parser/http/searchCharacterParser.py | 2 +- lib/imdb/parser/http/searchCompanyParser.py | 41 +- lib/imdb/parser/http/searchKeywordParser.py | 2 +- lib/imdb/parser/http/searchMovieParser.py | 4 +- lib/imdb/parser/http/searchPersonParser.py | 2 +- lib/imdb/parser/http/topBottomParser.py | 45 +- lib/imdb/parser/http/utils.py | 33 +- lib/imdb/parser/sql/__init__.py | 4 +- lib/imdb/parser/sql/alchemyadapter.py | 6 +- lib/imdb/parser/sql/dbschema.py | 133 +- lib/imdb/utils.py | 123 +- 24 files changed, 1992 insertions(+), 1184 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 777b35a6..982e07b9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -7,6 +7,7 @@ * Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed * Change Hachoir can't support PY2 so backport their PY3 to prevent a need for system dependant external binaries like mediainfo * Update html5lib 0.99999999/1.0b9 (1a28d72) to 1.1-dev (e9ef538) +* Update IMDb 5.1 (r907) to 5.2.1dev20171113 (f640595) [develop changelog] diff --git a/lib/imdb/Company.py b/lib/imdb/Company.py index 5e05c840..9c8cfa15 100644 --- a/lib/imdb/Company.py +++ b/lib/imdb/Company.py @@ -23,8 +23,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA from copy import deepcopy -from imdb.utils import analyze_company_name, build_company_name, \ - flatten, _Container, cmpCompanies +from imdb.utils import _Container +from imdb.utils import analyze_company_name, build_company_name, cmpCompanies, flatten class Company(_Container): diff --git a/lib/imdb/Movie.py b/lib/imdb/Movie.py index 5cdcde65..9da5cdbe 100644 --- a/lib/imdb/Movie.py +++ b/lib/imdb/Movie.py @@ -24,8 +24,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA from copy import deepcopy from imdb import linguistics -from imdb.utils import analyze_title, build_title, canonicalTitle, \ - flatten, _Container, cmpMovies +from imdb.utils import _Container +from imdb.utils import analyze_title, build_title, canonicalTitle, cmpMovies, flatten class Movie(_Container): diff --git a/lib/imdb/__init__.py b/lib/imdb/__init__.py index 8c5b1943..63fcd597 100644 --- a/lib/imdb/__init__.py +++ b/lib/imdb/__init__.py @@ -6,7 +6,7 @@ a person from the IMDb database. It can fetch data through different media (e.g.: the IMDb web pages, a SQL database, etc.) 
-Copyright 2004-2016 Davide Alberani +Copyright 2004-2018 Davide Alberani This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,12 +25,25 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA __all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company', 'available_access_systems'] -__version__ = VERSION = '5.1' +__version__ = VERSION = '5.2.1dev20171113' + +VERSION_NOTICE = """This is the imdbpy-legacy branch of IMDbPY, and requires Python 2. +Please notice that this version is mostly unsupported. + +For a version compatible with Python 3, see the master branch: + https://github.com/alberanid/imdbpy/ +""" + +import sys + +if sys.hexversion >= 0x3000000: + print(VERSION_NOTICE) + sys.exit(1) # Import compatibility module (importing it is enough). import _compat -import sys, os, ConfigParser, logging +import os, ConfigParser, logging from types import MethodType from imdb import Movie, Person, Character, Company @@ -38,38 +51,39 @@ import imdb._logging from imdb._exceptions import IMDbError, IMDbDataAccessError, IMDbParserError from imdb.utils import build_title, build_name, build_company_name +_imdb_logger = logging.getLogger('imdbpy') _aux_logger = logging.getLogger('imdbpy.aux') # URLs of the main pages for movies, persons, characters and queries. -imdbURL_base = 'http://akas.imdb.com/' +imdbURL_base = 'http://www.imdb.com/' # NOTE: the urls below will be removed in a future version. # please use the values in the 'urls' attribute # of the IMDbBase subclass instance. -# http://akas.imdb.com/title/ +# http://www.imdb.com/title/ imdbURL_movie_base = '%stitle/' % imdbURL_base -# http://akas.imdb.com/title/tt%s/ +# http://www.imdb.com/title/tt%s/ imdbURL_movie_main = imdbURL_movie_base + 'tt%s/' -# http://akas.imdb.com/name/ +# http://www.imdb.com/name/ imdbURL_person_base = '%sname/' % imdbURL_base -# http://akas.imdb.com/name/nm%s/ +# http://www.imdb.com/name/nm%s/ imdbURL_person_main = imdbURL_person_base + 'nm%s/' -# http://akas.imdb.com/character/ +# http://www.imdb.com/character/ imdbURL_character_base = '%scharacter/' % imdbURL_base -# http://akas.imdb.com/character/ch%s/ +# http://www.imdb.com/character/ch%s/ imdbURL_character_main = imdbURL_character_base + 'ch%s/' -# http://akas.imdb.com/company/ +# http://www.imdb.com/company/ imdbURL_company_base = '%scompany/' % imdbURL_base -# http://akas.imdb.com/company/co%s/ +# http://www.imdb.com/company/co%s/ imdbURL_company_main = imdbURL_company_base + 'co%s/' -# http://akas.imdb.com/keyword/%s/ +# http://www.imdb.com/keyword/%s/ imdbURL_keyword_main = imdbURL_base + 'keyword/%s/' -# http://akas.imdb.com/chart/top +# http://www.imdb.com/chart/top imdbURL_top250 = imdbURL_base + 'chart/top' -# http://akas.imdb.com/chart/bottom +# http://www.imdb.com/chart/bottom imdbURL_bottom100 = imdbURL_base + 'chart/bottom' -# http://akas.imdb.com/find?%s +# http://www.imdb.com/find?%s imdbURL_find = imdbURL_base + 'find?%s' # Name of the configuration file. @@ -103,7 +117,7 @@ class ConfigParserWithCase(ConfigParser.ConfigParser): try: self.read(fname) except (ConfigParser.MissingSectionHeaderError, - ConfigParser.ParsingError), e: + ConfigParser.ParsingError) as e: _aux_logger.warn('Troubles reading config file: %s' % e) # Stop at the first valid file. 
if self.has_section('imdbpy'): @@ -159,10 +173,8 @@ def IMDb(accessSystem=None, *arguments, **keywords): accessSystem = 'http' kwds.update(keywords) keywords = kwds - except Exception, e: - import logging - logging.getLogger('imdbpy').warn('Unable to read configuration' \ - ' file; complete error: %s' % e) + except Exception as e: + _imdb_logger.warn('Unable to read configuration file; complete error: %s' % e) # It just LOOKS LIKE a bad habit: we tried to read config # options from some files, but something is gone horribly # wrong: ignore everything and pretend we were called with @@ -177,9 +189,8 @@ def IMDb(accessSystem=None, *arguments, **keywords): try: import logging.config logging.config.fileConfig(os.path.expanduser(logCfg)) - except Exception, e: - logging.getLogger('imdbpy').warn('unable to read logger ' \ - 'config: %s' % e) + except Exception as e: + _imdb_logger.warn('unable to read logger config: %s' % e) if accessSystem in ('httpThin', 'webThin', 'htmlThin'): logging.warn('httpThin was removed since IMDbPY 4.8') accessSystem = 'http' @@ -244,9 +255,6 @@ class IMDbBase: # in the subclasses). accessSystem = 'UNKNOWN' - # Top-level logger for IMDbPY. - _imdb_logger = logging.getLogger('imdbpy') - # Whether to re-raise caught exceptions or not. _reraise_exceptions = False @@ -285,30 +293,30 @@ class IMDbBase: imdbURL_base = 'http://%s' % imdbURL_base if not imdbURL_base.endswith('/'): imdbURL_base = '%s/' % imdbURL_base - # http://akas.imdb.com/title/ - imdbURL_movie_base='%stitle/' % imdbURL_base - # http://akas.imdb.com/title/tt%s/ - imdbURL_movie_main=imdbURL_movie_base + 'tt%s/' - # http://akas.imdb.com/name/ - imdbURL_person_base='%sname/' % imdbURL_base - # http://akas.imdb.com/name/nm%s/ - imdbURL_person_main=imdbURL_person_base + 'nm%s/' - # http://akas.imdb.com/character/ - imdbURL_character_base='%scharacter/' % imdbURL_base - # http://akas.imdb.com/character/ch%s/ - imdbURL_character_main=imdbURL_character_base + 'ch%s/' - # http://akas.imdb.com/company/ - imdbURL_company_base='%scompany/' % imdbURL_base - # http://akas.imdb.com/company/co%s/ - imdbURL_company_main=imdbURL_company_base + 'co%s/' - # http://akas.imdb.com/keyword/%s/ - imdbURL_keyword_main=imdbURL_base + 'keyword/%s/' - # http://akas.imdb.com/chart/top - imdbURL_top250=imdbURL_base + 'chart/top' - # http://akas.imdb.com/chart/bottom - imdbURL_bottom100=imdbURL_base + 'chart/bottom' - # http://akas.imdb.com/find?%s - imdbURL_find=imdbURL_base + 'find?%s' + # http://www.imdb.com/title/ + imdbURL_movie_base = '%stitle/' % imdbURL_base + # http://www.imdb.com/title/tt%s/ + imdbURL_movie_main = imdbURL_movie_base + 'tt%s/' + # http://www.imdb.com/name/ + imdbURL_person_base = '%sname/' % imdbURL_base + # http://www.imdb.com/name/nm%s/ + imdbURL_person_main = imdbURL_person_base + 'nm%s/' + # http://www.imdb.com/character/ + imdbURL_character_base = '%scharacter/' % imdbURL_base + # http://www.imdb.com/character/ch%s/ + imdbURL_character_main = imdbURL_character_base + 'ch%s/' + # http://www.imdb.com/company/ + imdbURL_company_base = '%scompany/' % imdbURL_base + # http://www.imdb.com/company/co%s/ + imdbURL_company_main = imdbURL_company_base + 'co%s/' + # http://www.imdb.com/keyword/%s/ + imdbURL_keyword_main = imdbURL_base + 'keyword/%s/' + # http://www.imdb.com/chart/top + imdbURL_top250 = imdbURL_base + 'chart/top' + # http://www.imdb.com/chart/bottom + imdbURL_bottom100 = imdbURL_base + 'chart/bottom' + # http://www.imdb.com/find?%s + imdbURL_find = imdbURL_base + 'find?%s' self.urls = dict( 
movie_base=imdbURL_movie_base, movie_main=imdbURL_movie_main, @@ -727,16 +735,15 @@ class IMDbBase: mopID = mop.companyID prefix = 'company' else: - raise IMDbError('object ' + repr(mop) + \ - ' is not a Movie, Person, Character or Company instance') + raise IMDbError('object ' + repr(mop) + + ' is not a Movie, Person, Character or Company instance') if mopID is None: # XXX: enough? It's obvious that there are Characters # objects without characterID, so I think they should # just do nothing, when an i.update(character) is tried. if prefix == 'character': return - raise IMDbDataAccessError( \ - 'the supplied object has null movieID, personID or companyID') + raise IMDbDataAccessError('supplied object has null movieID, personID or companyID') if mop.accessSystem == self.accessSystem: aSystem = self else: @@ -760,21 +767,22 @@ class IMDbBase: continue if not i: continue - self._imdb_logger.debug('retrieving "%s" info set', i) + _imdb_logger.debug('retrieving "%s" info set', i) try: method = getattr(aSystem, 'get_%s_%s' % (prefix, i.replace(' ', '_'))) except AttributeError: - self._imdb_logger.error('unknown information set "%s"', i) + _imdb_logger.error('unknown information set "%s"', i) # Keeps going. method = lambda *x: {} try: ret = method(mopID) - except Exception, e: - self._imdb_logger.critical('caught an exception retrieving ' \ - 'or parsing "%s" info set for mopID ' \ - '"%s" (accessSystem: %s)', - i, mopID, mop.accessSystem, exc_info=True) + except Exception: + _imdb_logger.critical( + 'caught an exception retrieving or parsing "%s" info set' + ' for mopID "%s" (accessSystem: %s)', + i, mopID, mop.accessSystem, exc_info=True + ) ret = {} # If requested by the user, reraise the exception. if self._reraise_exceptions: @@ -826,9 +834,7 @@ class IMDbBase: raise NotImplementedError('override this method') def _searchIMDb(self, kind, ton, title_kind=None): - """Search the IMDb akas server for the given title or name.""" - # The Exact Primary search system has gone AWOL, so we resort - # to the mobile search. 
:-/ + """Search the IMDb www server for the given title or name.""" if not ton: return None ton = ton.strip('"') @@ -935,8 +941,8 @@ class IMDbBase: else: imdbID = aSystem.company2imdbID(build_company_name(mop)) else: - raise IMDbError('object ' + repr(mop) + \ - ' is not a Movie, Person or Character instance') + raise IMDbError('object ' + repr(mop) + + ' is not a Movie, Person or Character instance') return imdbID def get_imdbURL(self, mop): @@ -954,8 +960,8 @@ class IMDbBase: elif isinstance(mop, Company.Company): url_firstPart = imdbURL_company_main else: - raise IMDbError('object ' + repr(mop) + \ - ' is not a Movie, Person, Character or Company instance') + raise IMDbError('object ' + repr(mop) + + ' is not a Movie, Person, Character or Company instance') return url_firstPart % imdbID def get_special_methods(self): diff --git a/lib/imdb/_logging.py b/lib/imdb/_logging.py index 2b8a286a..e159f32f 100644 --- a/lib/imdb/_logging.py +++ b/lib/imdb/_logging.py @@ -32,8 +32,9 @@ LEVELS = {'debug': logging.DEBUG, imdbpyLogger = logging.getLogger('imdbpy') imdbpyStreamHandler = logging.StreamHandler() -imdbpyFormatter = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]' \ - ' %(pathname)s:%(lineno)d: %(message)s') +imdbpyFormatter = logging.Formatter( + '%(asctime)s %(levelname)s [%(name)s] %(pathname)s:%(lineno)d: %(message)s' +) imdbpyStreamHandler.setFormatter(imdbpyFormatter) imdbpyLogger.addHandler(imdbpyStreamHandler) diff --git a/lib/imdb/helpers.py b/lib/imdb/helpers.py index f2206142..b54c7e8d 100644 --- a/lib/imdb/helpers.py +++ b/lib/imdb/helpers.py @@ -269,8 +269,8 @@ for k, v in {'lt':u'<','gt':u'>','amp':u'&','quot':u'"','apos':u'\''}.items(): everyentcharrefs[k] = v everyentcharrefs['#%s' % ord(v)] = v everyentcharrefsget = everyentcharrefs.get -re_everyentcharrefs = re.compile('&(%s|\#160|\#\d{1,5});' % - '|'.join(map(re.escape, everyentcharrefs))) +re_everyentcharrefs = re.compile('&(%s|\#160|\#\d{1,5});' % '|'.join(map(re.escape, + everyentcharrefs))) re_everyentcharrefssub = re_everyentcharrefs.sub def _replAllXMLRef(match): @@ -408,7 +408,7 @@ def _valueWithType(tag, tagValue): # Extra tags to get (if values were not already read from title/name). _titleTags = ('imdbindex', 'kind', 'year') -_nameTags = ('imdbindex') +_nameTags = ('imdbindex',) _companyTags = ('imdbindex', 'country') def parseTags(tag, _topLevel=True, _as=None, _infoset2keys=None, diff --git a/lib/imdb/parser/http/__init__.py b/lib/imdb/parser/http/__init__.py index a3001a08..3b832df2 100644 --- a/lib/imdb/parser/http/__init__.py +++ b/lib/imdb/parser/http/__init__.py @@ -7,7 +7,7 @@ the imdb.IMDb function will return an instance of this class when called with the 'accessSystem' argument set to "http" or "web" or "html" (this is the default). -Copyright 2004-2012 Davide Alberani +Copyright 2004-2017 Davide Alberani 2008 H. 
Turgut Uyar This program is free software; you can redistribute it and/or modify @@ -26,6 +26,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import sys +import ssl import socket import logging from urllib import FancyURLopener, quote_plus @@ -68,8 +69,8 @@ class _ModuleProxy: """Initialize a proxy for the given module; defaultKeys, if set, muste be a dictionary of values to set for instanced objects.""" if oldParsers or fallBackToNew: - _aux_logger.warn('The old set of parsers was removed; falling ' \ - 'back to the new parsers.') + _aux_logger.warn('The old set of parsers was removed;' + ' falling back to the new parsers.') self.useModule = useModule if defaultKeys is None: defaultKeys = {} @@ -142,6 +143,7 @@ class IMDbURLopener(FancyURLopener): def __init__(self, *args, **kwargs): self._last_url = u'' + kwargs['context'] = ssl.SSLContext(ssl.PROTOCOL_SSLv23) FancyURLopener.__init__(self, *args, **kwargs) # Headers to add to every request. # XXX: IMDb's web server doesn't like urllib-based programs, @@ -211,9 +213,9 @@ class IMDbURLopener(FancyURLopener): if server_encode is None and content: begin_h = content.find('text/html; charset=') if begin_h != -1: - end_h = content[19+begin_h:].find('"') + end_h = content[19 + begin_h:].find('"') if end_h != -1: - server_encode = content[19+begin_h:19+begin_h+end_h] + server_encode = content[19 + begin_h:19 + begin_h + end_h] if server_encode: try: if lookup(server_encode): @@ -237,9 +239,10 @@ class IMDbURLopener(FancyURLopener): if encode is None: encode = 'latin_1' # The detection of the encoding is error prone... - self._logger.warn('Unable to detect the encoding of the retrieved ' - 'page [%s]; falling back to default latin1.', encode) - ##print unicode(content, encode, 'replace').encode('utf8') + self._logger.warn('Unable to detect the encoding of the retrieved page [%s];' + ' falling back to default utf8.', encode) + if isinstance(content, unicode): + return content return unicode(content, encode, 'replace') def http_error_default(self, url, fp, errcode, errmsg, headers): @@ -288,8 +291,8 @@ class IMDbHTTPAccessSystem(IMDbBase): self._getRefs = True self._mdparse = False if isThin: - self._http_logger.warn('"httpThin" access system no longer ' + - 'supported; "http" used automatically', exc_info=False) + self._http_logger.warn('"httpThin" access system no longer supported;' + ' "http" used automatically', exc_info=False) self.isThin = 0 if self.accessSystem in ('httpThin', 'webThin', 'htmlThin'): self.accessSystem = 'http' @@ -503,7 +506,7 @@ class IMDbHTTPAccessSystem(IMDbBase): return self.smProxy.search_movie_parser.parse(cont, results=results)['data'] def get_movie_main(self, movieID): - cont = self._retrieve(self.urls['movie_main'] % movieID + 'combined') + cont = self._retrieve(self.urls['movie_main'] % movieID + 'reference') return self.mProxy.movie_parser.parse(cont, mdparse=self._mdparse) def get_movie_full_credits(self, movieID): @@ -811,7 +814,7 @@ class IMDbHTTPAccessSystem(IMDbBase): def _search_keyword(self, keyword, results): # XXX: the IMDb web server seems to have some serious problem with # non-ascii keyword. - # E.g.: http://akas.imdb.com/keyword/fianc%E9/ + # E.g.: http://www.imdb.com/keyword/fianc%E9/ # will return a 500 Internal Server Error: Redirect Recursion. 
keyword = keyword.encode('utf8', 'ignore') try: diff --git a/lib/imdb/parser/http/bsouplxml/_bsoup.py b/lib/imdb/parser/http/bsouplxml/_bsoup.py index afab5da9..d5ff3faa 100644 --- a/lib/imdb/parser/http/bsouplxml/_bsoup.py +++ b/lib/imdb/parser/http/bsouplxml/_bsoup.py @@ -171,7 +171,7 @@ class PageElement: return self def _lastRecursiveChild(self): - "Finds the last element beneath this object to be parsed." + """Finds the last element beneath this object to be parsed.""" lastChild = self while hasattr(lastChild, 'contents') and lastChild.contents: lastChild = lastChild.contents[-1] @@ -184,7 +184,7 @@ class PageElement: newChild = NavigableString(newChild) position = min(position, len(self.contents)) - if hasattr(newChild, 'parent') and newChild.parent != None: + if hasattr(newChild, 'parent') and newChild.parent is not None: # We're 'inserting' an element that's already one # of this object's children. if newChild.parent == self: @@ -323,7 +323,7 @@ class PageElement: return r def _findAll(self, name, attrs, text, limit, generator, **kwargs): - "Iterates over a generator looking for things that match." + """Iterates over a generator looking for things that match.""" if isinstance(name, SoupStrainer): strainer = name @@ -415,7 +415,7 @@ class NavigableString(unicode, PageElement): return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) def __getnewargs__(self): - return (NavigableString.__str__(self),) + return NavigableString.__str__(self), def __getattr__(self, attr): """text.string gives you text. This is for backwards @@ -460,7 +460,7 @@ class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" def _invert(h): - "Cheap function to invert a hash." + """Cheap function to invert a hash.""" i = {} for k,v in h.items(): i[v] = k @@ -501,14 +501,14 @@ class Tag(PageElement): def __init__(self, parser, name, attrs=None, parent=None, previous=None): - "Basic constructor." + """Basic constructor.""" # We don't actually store the parser object: that lets extracted # chunks be garbage-collected self.parserClass = parser.__class__ self.isSelfClosing = parser.isSelfClosingTag(name) self.name = name - if attrs == None: + if attrs is None: attrs = [] self.attrs = attrs self.contents = [] @@ -541,18 +541,18 @@ class Tag(PageElement): return self._getAttrMap()[key] def __iter__(self): - "Iterating over a tag iterates over its contents." + """Iterating over a tag iterates over its contents.""" return iter(self.contents) def __len__(self): - "The length of a tag is the length of its list of contents." + """The length of a tag is the length of its list of contents.""" return len(self.contents) def __contains__(self, x): return x in self.contents def __nonzero__(self): - "A tag is non-None even if it has no contents." + """A tag is non-None even if it has no contents.""" return True def __setitem__(self, key, value): @@ -570,7 +570,7 @@ class Tag(PageElement): self._getAttrMap()[key] = value def __delitem__(self, key): - "Deleting tag[key] deletes all 'key' attributes for the tag." 
+ """Deleting tag[key] deletes all 'key' attributes for the tag.""" for item in self.attrs: if item[0] == key: self.attrs.remove(item) @@ -911,7 +911,7 @@ class SoupStrainer: #print "Matching %s against %s" % (markup, matchAgainst) result = False if matchAgainst == True and type(matchAgainst) == types.BooleanType: - result = markup != None + result = markup is not None elif callable(matchAgainst): result = matchAgainst(markup) else: @@ -1130,7 +1130,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. - del(self.markupMassage) + del self.markupMassage self.reset() SGMLParser.feed(self, markup) @@ -1253,7 +1253,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): """ nestingResetTriggers = self.NESTABLE_TAGS.get(name) - isNestable = nestingResetTriggers != None + isNestable = nestingResetTriggers is not None isResetNesting = self.RESET_NESTING_TAGS.has_key(name) popTo = None inclusive = True @@ -1264,9 +1264,9 @@ class BeautifulStoneSoup(Tag, SGMLParser): #last occurance. popTo = name break - if (nestingResetTriggers != None + if (nestingResetTriggers is not None and p.name in nestingResetTriggers) \ - or (nestingResetTriggers == None and isResetNesting + or (nestingResetTriggers is None and isResetNesting and self.RESET_NESTING_TAGS.has_key(p.name)): #If we encounter one of the nesting reset triggers @@ -1342,11 +1342,11 @@ class BeautifulStoneSoup(Tag, SGMLParser): self._toStringSubclass(text, ProcessingInstruction) def handle_comment(self, text): - "Handle comments as Comment objects." + """Handle comments as Comment objects.""" self._toStringSubclass(text, Comment) def handle_charref(self, ref): - "Handle character references as data." + """Handle character references as data.""" if self.convertEntities: data = unichr(int(ref)) else: @@ -1397,7 +1397,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): self.handle_data(data) def handle_decl(self, data): - "Handle DOCTYPEs and the like as Declaration objects." + """Handle DOCTYPEs and the like as Declaration objects.""" self._toStringSubclass(data, Declaration) def parse_declaration(self, i): @@ -1793,8 +1793,8 @@ class UnicodeDammit: return self.markup def _toUnicode(self, data, encoding): - '''Given a string and its encoding, decodes the string into Unicode. - %encoding is a string recognized by encodings.aliases''' + """Given a string and its encoding, decodes the string into Unicode. + %encoding is a string recognized by encodings.aliases""" # strip Byte Order Mark (if present) if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ diff --git a/lib/imdb/parser/http/bsouplxml/bsoupxpath.py b/lib/imdb/parser/http/bsouplxml/bsoupxpath.py index c5c489db..671f62a4 100644 --- a/lib/imdb/parser/http/bsouplxml/bsoupxpath.py +++ b/lib/imdb/parser/http/bsouplxml/bsoupxpath.py @@ -67,7 +67,7 @@ def tokenize_path(path): if path[i] == '/': if i > 0: separators.append((last_position, i)) - if (path[i+1] == '/'): + if path[i+1] == '/': last_position = i i = i + 1 else: diff --git a/lib/imdb/parser/http/characterParser.py b/lib/imdb/parser/http/characterParser.py index ff5ea09b..2950f52e 100644 --- a/lib/imdb/parser/http/characterParser.py +++ b/lib/imdb/parser/http/characterParser.py @@ -2,7 +2,7 @@ parser.http.characterParser module (imdb package). This module provides the classes (and the instances), used to parse -the IMDb pages on the akas.imdb.com server about a character. +the IMDb pages on the www.imdb.com server about a character. 
E.g., for "Jesse James" the referred pages would be: main details: http://www.imdb.com/character/ch0000001/ biography: http://www.imdb.com/character/ch0000001/bio @@ -37,7 +37,7 @@ _personIDs = re.compile(r'/name/nm([0-9]{7})') class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser): """Parser for the "filmography" page of a given character. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -101,7 +101,7 @@ class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser): class DOMHTMLCharacterBioParser(DOMParserBase): """Parser for the "biography" page of a given character. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -146,7 +146,7 @@ class DOMHTMLCharacterBioParser(DOMParserBase): class DOMHTMLCharacterQuotesParser(DOMParserBase): """Parser for the "quotes" page of a given character. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: diff --git a/lib/imdb/parser/http/companyParser.py b/lib/imdb/parser/http/companyParser.py index 84337916..50cd67e9 100644 --- a/lib/imdb/parser/http/companyParser.py +++ b/lib/imdb/parser/http/companyParser.py @@ -2,12 +2,12 @@ parser.http.companyParser module (imdb package). This module provides the classes (and the instances), used to parse -the IMDb pages on the akas.imdb.com server about a company. +the IMDb pages on the www.imdb.com server about a company. E.g., for "Columbia Pictures [us]" the referred page would be: - main details: http://akas.imdb.com/company/co0071509/ + main details: http://www.imdb.com/company/co0071509/ -Copyright 2008-2009 Davide Alberani - 2008 H. Turgut Uyar +Copyright 2008-2017 Davide Alberani + 2008-2017 H. Turgut Uyar This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,7 +34,7 @@ from imdb.utils import analyze_company_name class DOMCompanyParser(DOMParserBase): """Parser for the main page of a given company. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: @@ -44,31 +44,38 @@ class DOMCompanyParser(DOMParserBase): _containsObjects = True extractors = [ - Extractor(label='name', - path="//title", - attrs=Attribute(key='name', - path="./text()", - postprocess=lambda x: \ - analyze_company_name(x, stripNotes=True))), + Extractor( + label='name', + path="//h1/span[@class='display-title ']", # note the extra trailing space in class + attrs=Attribute( + key='name', + path="./text()", + postprocess=lambda x: analyze_company_name(x, stripNotes=True) + ) + ), - Extractor(label='filmography', - group="//b/a[@name]", - group_key="./text()", - group_key_normalize=lambda x: x.lower(), - path="../following-sibling::ol[1]/li", - attrs=Attribute(key=None, - multi=True, - path={ - 'link': "./a[1]/@href", - 'title': "./a[1]/text()", - 'year': "./text()[1]" - }, - postprocess=lambda x: - build_movie(u'%s %s' % \ - (x.get('title'), x.get('year').strip()), - movieID=analyze_imdbid(x.get('link') or u''), - _parsingCompany=True))), - ] + Extractor( + label='filmography', + group="//b/a[@name]", + group_key="./text()", + group_key_normalize=lambda x: x.lower(), + path="../following-sibling::ol[1]/li", + attrs=Attribute( + key=None, + multi=True, + path={ + 'link': "./a[1]/@href", + 'title': "./a[1]/text()", + 'year': "./text()[1]" + }, + postprocess=lambda x: build_movie( + '%s %s' % (x.get('title'), x.get('year').strip()), + movieID=analyze_imdbid(x.get('link') or u''), + _parsingCompany=True + ) + ) + ) + ] preprocessors = [ (re.compile('(<b class="blackcatheader">)', re.I), r'</p><p>\1') diff --git a/lib/imdb/parser/http/movieParser.py b/lib/imdb/parser/http/movieParser.py index f4589d7a..b5789dd7 100644 --- a/lib/imdb/parser/http/movieParser.py +++ b/lib/imdb/parser/http/movieParser.py @@ -1,16 +1,18 @@ +# -*- coding: utf-8 -*- + """ parser.http.movieParser module (imdb package). This module provides the classes (and the instances), used to parse the -IMDb pages on the akas.imdb.com server about a movie. +IMDb pages on the www.imdb.com server about a movie. E.g., for Brian De Palma's "The Untouchables", the referred pages would be: - combined details: http://akas.imdb.com/title/tt0094226/combined - plot summary: http://akas.imdb.com/title/tt0094226/plotsummary + combined details: http://www.imdb.com/title/tt0094226/reference + plot summary: http://www.imdb.com/title/tt0094226/plotsummary ...and so on... -Copyright 2004-2016 Davide Alberani - 2008 H. Turgut Uyar +Copyright 2004-2018 Davide Alberani + 2008-2018 H. Turgut Uyar This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,69 +29,68 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ +import functools import re import urllib from imdb import imdbURL_base -from imdb.Person import Person -from imdb.Movie import Movie from imdb.Company import Company -from imdb.utils import analyze_title, split_company_name_notes, _Container -from utils import build_person, DOMParserBase, Attribute, Extractor, \ - analyze_imdbid +from imdb.Movie import Movie +from imdb.Person import Person +from imdb.utils import _Container, KIND_MAP + +from .utils import Attribute, DOMParserBase, Extractor, analyze_imdbid, build_person # Dictionary used to convert some section's names.
_SECT_CONV = { - 'directed': 'director', - 'directed by': 'director', - 'directors': 'director', - 'editors': 'editor', - 'writing credits': 'writer', - 'writers': 'writer', - 'produced': 'producer', - 'cinematography': 'cinematographer', - 'film editing': 'editor', - 'casting': 'casting director', - 'costume design': 'costume designer', - 'makeup department': 'make up', - 'production management': 'production manager', - 'second unit director or assistant director': 'assistant director', - 'costume and wardrobe department': 'costume department', - 'sound department': 'sound crew', - 'stunts': 'stunt performer', - 'other crew': 'miscellaneous crew', - 'also known as': 'akas', - 'country': 'countries', - 'runtime': 'runtimes', - 'language': 'languages', - 'certification': 'certificates', - 'genre': 'genres', - 'created': 'creator', - 'creators': 'creator', - 'color': 'color info', - 'plot': 'plot outline', - 'seasons': 'number of seasons', - 'art directors': 'art direction', - 'assistant directors': 'assistant director', - 'set decorators': 'set decoration', - 'visual effects department': 'visual effects', - 'production managers': 'production manager', - 'miscellaneous': 'miscellaneous crew', - 'make up department': 'make up', - 'plot summary': 'plot outline', - 'cinematographers': 'cinematographer', - 'camera department': 'camera and electrical department', - 'costume designers': 'costume designer', - 'production designers': 'production design', - 'production managers': 'production manager', - 'music original': 'original music', - 'casting directors': 'casting director', - 'other companies': 'miscellaneous companies', - 'producers': 'producer', - 'special effects by': 'special effects department', - 'special effects': 'special effects companies' - } + 'directed': 'director', + 'directed by': 'director', + 'directors': 'director', + 'editors': 'editor', + 'writing credits': 'writer', + 'writers': 'writer', + 'produced': 'producer', + 'cinematography': 'cinematographer', + 'film editing': 'editor', + 'casting': 'casting director', + 'costume design': 'costume designer', + 'makeup department': 'make up', + 'production management': 'production manager', + 'second unit director or assistant director': 'assistant director', + 'costume and wardrobe department': 'costume department', + 'sound department': 'sound crew', + 'stunts': 'stunt performer', + 'other crew': 'miscellaneous crew', + 'also known as': 'akas', + 'country': 'countries', + 'runtime': 'runtimes', + 'language': 'languages', + 'certification': 'certificates', + 'genre': 'genres', + 'created': 'creator', + 'creators': 'creator', + 'color': 'color info', + 'plot': 'plot outline', + 'art directors': 'art direction', + 'assistant directors': 'assistant director', + 'set decorators': 'set decoration', + 'visual effects department': 'visual effects', + 'miscellaneous': 'miscellaneous crew', + 'make up department': 'make up', + 'plot summary': 'plot outline', + 'cinematographers': 'cinematographer', + 'camera department': 'camera and electrical department', + 'costume designers': 'costume designer', + 'production designers': 'production design', + 'production managers': 'production manager', + 'music original': 'original music', + 'casting directors': 'casting director', + 'other companies': 'miscellaneous companies', + 'producers': 'producer', + 'special effects by': 'special effects department', + 'special effects': 'special effects companies' +} def _manageRoles(mo): @@ -108,28 +109,33 @@ def _manageRoles(mo): roleID = u'/' else: roleID 
+= u'/' - newRoles.append(u'<div class="_imdbpyrole" roleid="%s">%s</div>' % \ - (roleID, role.strip())) + newRoles.append(u'<div class="_imdbpyrole" roleid="%s">%s</div>' % ( + roleID, role.strip() + )) return firstHalf + u' / '.join(newRoles) + mo.group(3) -_reRolesMovie = re.compile(r'(<td class="char">)(.*?)(</td>)', re.I | re.M | re.S) +_reRolesMovie = re.compile(r'(<td class="char">)(.*?)(</td>)', re.I | re.M | re.S) + def _replaceBR(mo): """Replaces <br> tags with '::' (useful for some akas)""" txt = mo.group(0) return txt.replace('<br>', '::') + -_reAkas = re.compile(r'<h5>also known as:</h5>
.*?', re.I | re.M | re.S) + def makeSplitter(lstrip=None, sep='|', comments=True, - origNotesSep=' (', newNotesSep='::(', strip=None): + origNotesSep=' (', newNotesSep='::(', strip=None): """Return a splitter function suitable for a given set of data.""" def splitter(x): - if not x: return x + if not x: + return x x = x.strip() - if not x: return x + if not x: + return x if lstrip is not None: x = x.lstrip(lstrip).lstrip() lx = x.split(sep) @@ -153,11 +159,75 @@ def _toInt(val, replace=()): return None +_re_og_title = re.compile( + ur'(.*) \((?:(?:(.+)(?= ))? ?(\d{4})(?:(–)(\d{4}| ))?|(.+))\)', + re.UNICODE +) + + +def analyze_og_title(og_title): + data = {} + match = _re_og_title.match(og_title) + if match: + data['title'] = match.group(1) + + if match.group(3): + data['year'] = int(match.group(3)) + + kind = match.group(2) or match.group(6) + if kind is None: + kind = 'movie' + else: + kind = kind.lower() + kind = KIND_MAP.get(kind, kind) + data['kind'] = kind + + year_separator = match.group(4) + # There is a year separator so assume an ongoing or ended series + if year_separator is not None: + end_year = match.group(5) + if end_year is not None: + data['series years'] = '%(year)d-%(end_year)s' % { + 'year': data['year'], + 'end_year': end_year.strip(), + } + elif kind.endswith('series'): + data['series years'] = '%(year)d-' % {'year': data['year']} + # No year separator and series, so assume that it ended the same year + elif kind.endswith('series') and 'year' in data: + data['series years'] = '%(year)d-%(year)d' % {'year': data['year']} + + if data['kind'] == 'episode' and data['title'][0] == '"': + quote_end = data['title'].find('"', 1) + data['tv series title'] = data['title'][1:quote_end] + data['title'] = data['title'][quote_end + 1:].strip() + + return data + + +def analyze_certificates(certificates): + def reducer(acc, el): + cert_re = re.compile(r'^(.+):(.+)$', re.UNICODE) + + if cert_re.match(el): + acc.append(el) + elif acc: + acc[-1] = u'{}::{}'.format( + acc[-1], + el, + ) + + return acc + + certificates = [el.strip() for el in certificates.split('\n') if el.strip()] + return functools.reduce(reducer, certificates, []) + + class DOMHTMLMovieParser(DOMParserBase): """Parser for the "combined details" (and if instance.mdparse is True also for the "main details") page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -166,258 +236,465 @@ class DOMHTMLMovieParser(DOMParserBase): """ _containsObjects = True - extractors = [Extractor(label='title', - path="//h1", - attrs=Attribute(key='title', - path=".//text()", - postprocess=analyze_title)), + extractors = [ + Extractor( + label='title', + path="//meta[@property='og:title']", + attrs=Attribute( + key='title', + path="@content", + postprocess=analyze_og_title + ) + ), - Extractor(label='glossarysections', - group="//a[@class='glossary']", - group_key="./@name", - group_key_normalize=lambda x: x.replace('_', ' '), - path="../../../..//tr", - attrs=Attribute(key=None, - multi=True, - path={'person': ".//text()", - 'link': "./td[1]/a[@href]/@href"}, - postprocess=lambda x: \ - build_person(x.get('person') or u'', - personID=analyze_imdbid(x.get('link'))) - )), + # parser for misc sections like 'casting department', 'stunts', ... 
+ Extractor( + label='glossarysections', + group="//h4[contains(@class, 'ipl-header__content')]", + group_key="./@name", + group_key_normalize=lambda x: x.replace('_', ' '), + path="../../following-sibling::table[1]//tr", + attrs=Attribute( + key=None, + multi=True, + path={ + 'person': ".//text()", + 'link': "./td[1]/a[@href]/@href" + }, + postprocess=lambda x: build_person( + x.get('person') or u'', + personID=analyze_imdbid(x.get('link')) + ) + ) + ), - Extractor(label='cast', - path="//table[@class='cast']//tr", - attrs=Attribute(key="cast", - multi=True, - path={'person': ".//text()", - 'link': "td[2]/a/@href", - 'roleID': \ - "td[4]/div[@class='_imdbpyrole']/@roleid"}, - postprocess=lambda x: \ - build_person(x.get('person') or u'', - personID=analyze_imdbid(x.get('link')), - roleID=(x.get('roleID') or u'').split('/')) - )), + Extractor( + label='cast', + path="//table[@class='cast_list']//tr", + attrs=Attribute( + key="cast", + multi=True, + path={ + 'person': ".//text()", + 'link': "td[2]/a/@href", + 'roleID': "td[4]/div[@class='_imdbpyrole']/@roleid" + }, + postprocess=lambda x: build_person( + x.get('person') or u'', + personID=analyze_imdbid(x.get('link')), + roleID=(x.get('roleID') or u'').split('/')) + ) + ), - Extractor(label='genres', - path="//div[@class='info']//a[starts-with(@href," \ - " '/Sections/Genres')]", - attrs=Attribute(key="genres", - multi=True, - path="./text()")), + Extractor( + label='myrating', + path="//span[@id='voteuser']", + attrs=Attribute( + key='myrating', + path=".//text()" + ) + ), - Extractor(label='myrating', - path="//span[@id='voteuser']", - attrs=Attribute(key='myrating', - path=".//text()")), + Extractor( + label='plot summary', + path=".//td[starts-with(text(), 'Plot')]/..//p", + attrs=Attribute( + key='plot summary', + path='./text()', + postprocess=lambda x: x.strip().rstrip('|').rstrip() + ) + ), - Extractor(label='h5sections', - path="//div[@class='info']/h5/..", - attrs=[ - Attribute(key="plot summary", - path="./h5[starts-with(text(), " \ - "'Plot:')]/../div/text()", - postprocess=lambda x: \ - x.strip().rstrip('|').rstrip()), - Attribute(key="aspect ratio", - path="./h5[starts-with(text()," \ - " 'Aspect')]/../div/text()", - postprocess=lambda x: x.strip()), - Attribute(key="mpaa", - path="./h5/a[starts-with(text()," \ - " 'MPAA')]/../../div/text()", - postprocess=lambda x: x.strip()), - Attribute(key="countries", - path="./h5[starts-with(text(), " \ - "'Countr')]/../div[@class='info-content']//text()", - postprocess=makeSplitter('|')), - Attribute(key="language", - path="./h5[starts-with(text(), " \ - "'Language')]/..//text()", - postprocess=makeSplitter('Language:')), - Attribute(key='color info', - path="./h5[starts-with(text(), " \ - "'Color')]/..//text()", - postprocess=makeSplitter('|')), - Attribute(key='sound mix', - path="./h5[starts-with(text(), " \ - "'Sound Mix')]/..//text()", - postprocess=makeSplitter('Sound Mix:')), - # Collects akas not encosed in tags. 
- Attribute(key='other akas', - path="./h5[starts-with(text(), " \ - "'Also Known As')]/../div//text()", - postprocess=makeSplitter(sep='::', - origNotesSep='" - ', - newNotesSep='::', - strip='"')), - Attribute(key='runtimes', - path="./h5[starts-with(text(), " \ - "'Runtime')]/../div/text()", - postprocess=makeSplitter()), - Attribute(key='certificates', - path="./h5[starts-with(text(), " \ - "'Certificat')]/..//text()", - postprocess=makeSplitter('Certification:')), - Attribute(key='number of seasons', - path="./h5[starts-with(text(), " \ - "'Seasons')]/..//text()", - postprocess=lambda x: x.count('|') + 1), - Attribute(key='original air date', - path="./h5[starts-with(text(), " \ - "'Original Air Date')]/../div/text()"), - Attribute(key='tv series link', - path="./h5[starts-with(text(), " \ - "'TV Series')]/..//a/@href"), - Attribute(key='tv series title', - path="./h5[starts-with(text(), " \ - "'TV Series')]/..//a/text()") - ]), + Extractor( + label='genres', + path="//td[starts-with(text(), 'Genre')]/..//li/a", + attrs=Attribute( + key="genres", + multi=True, + path="./text()" + ) + ), - Extractor(label='language codes', - path="//h5[starts-with(text(), 'Language')]/..//a[starts-with(@href, '/language/')]", - attrs=Attribute(key='language codes', multi=True, - path="./@href", - postprocess=lambda x: x.split('/')[2].strip() - )), + Extractor( + label='runtimes', + path="//td[starts-with(text(), 'Runtime')]/..//li", + attrs=Attribute( + key='runtimes', + path="./text()", + multi=True, + postprocess=lambda x: x.strip().replace(' min', '') + ) + ), - Extractor(label='country codes', - path="//h5[starts-with(text(), 'Country')]/..//a[starts-with(@href, '/country/')]", - attrs=Attribute(key='country codes', multi=True, - path="./@href", - postprocess=lambda x: x.split('/')[2].strip() - )), + Extractor( + label='countries', + path="//td[starts-with(text(), 'Countr')]/..//li/a", + attrs=Attribute( + key='countries', + path="./text()", + multi=True + ) + ), - Extractor(label='creator', - path="//h5[starts-with(text(), 'Creator')]/..//a", - attrs=Attribute(key='creator', multi=True, - path={'name': "./text()", - 'link': "./@href"}, - postprocess=lambda x: \ - build_person(x.get('name') or u'', - personID=analyze_imdbid(x.get('link'))) - )), + Extractor( + label='country codes', + path="//td[starts-with(text(), 'Countr')]/..//li/a", + attrs=Attribute( + key='country codes', + path="./@href", + multi=True, + postprocess=lambda x: x.split('/')[2].strip().lower() + ) + ), - Extractor(label='thin writer', - path="//h5[starts-with(text(), 'Writer')]/..//a", - attrs=Attribute(key='thin writer', multi=True, - path={'name': "./text()", - 'link': "./@href"}, - postprocess=lambda x: \ - build_person(x.get('name') or u'', - personID=analyze_imdbid(x.get('link'))) - )), + Extractor( + label='language', + path="//td[starts-with(text(), 'Language')]/..//li/a", + attrs=Attribute( + key='language', + path="./text()", + multi=True + ) + ), - Extractor(label='thin director', - path="//h5[starts-with(text(), 'Director')]/..//a", - attrs=Attribute(key='thin director', multi=True, - path={'name': "./text()", - 'link': "@href"}, - postprocess=lambda x: \ - build_person(x.get('name') or u'', - personID=analyze_imdbid(x.get('link'))) - )), + Extractor( + label='language codes', + path="//td[starts-with(text(), 'Language')]/..//li/a", + attrs=Attribute( + key='language codes', + path="./@href", + multi=True, + postprocess=lambda x: x.split('/')[2].strip() + ) + ), - Extractor(label='top 250/bottom 100', - 
path="//div[@class='starbar-special']/" \ - "a[starts-with(@href, '/chart/')]", - attrs=Attribute(key='top/bottom rank', - path="./text()")), + Extractor( + label='color info', + path="//td[starts-with(text(), 'Color')]/..//li/a", + attrs=Attribute( + key='color info', + path="./text()", + multi=True, + postprocess=lambda x: x.replace(' (', '::(') + ) + ), - Extractor(label='series years', - path="//div[@id='tn15title']//span" \ - "[starts-with(text(), 'TV series')]", - attrs=Attribute(key='series years', - path="./text()", - postprocess=lambda x: \ - x.replace('TV series','').strip())), + Extractor( + label='aspect ratio', + path="//td[starts-with(text(), 'Aspect')]/..", + attrs=Attribute( + key='aspect ratio', + path=".//li/text()", + postprocess=lambda x: x.strip() + ) + ), - Extractor(label='number of episodes', - path="//a[@title='Full Episode List']", - attrs=Attribute(key='number of episodes', - path="./text()", - postprocess=lambda x: \ - _toInt(x, [(' Episodes', '')]))), + Extractor( + label='sound mix', + path="//td[starts-with(text(), 'Sound Mix')]/..//li/a", + attrs=Attribute( + key='sound mix', + path="./text()", + multi=True, + postprocess=lambda x: x.replace(' (', '::(') + ) + ), - Extractor(label='akas', - path="//i[@class='transl']", - attrs=Attribute(key='akas', multi=True, path='text()', - postprocess=lambda x: - x.replace(' ', ' ').rstrip('-').replace('" - ', - '"::', 1).strip('"').replace(' ', ' '))), + Extractor( + label='certificates', + path=".//td[starts-with(text(), 'Certificat')]/..", + attrs=Attribute( + key='certificates', + path=".//text()", + postprocess=analyze_certificates + ) + ), - Extractor(label='production notes/status', - path="//h5[starts-with(text(), 'Status:')]/..//div[@class='info-content']", - attrs=Attribute(key='production status', - path=".//text()", - postprocess=lambda x: x.strip().split('|')[0].strip().lower())), + Extractor( + label='h5sections', + path="//section[contains(@class, 'listo')]", + attrs=[ + # Collects akas not encosed in tags. 
+ Attribute( + key='other akas', + path=".//td[starts-with(text(), 'Also Known As')]/..//ul//text()", + postprocess=makeSplitter( + sep='::', origNotesSep='" - ', newNotesSep='::', strip='"' + ) + ) + ] + ), - Extractor(label='production notes/status updated', - path="//h5[starts-with(text(), 'Status Updated:')]/..//div[@class='info-content']", - attrs=Attribute(key='production status updated', - path=".//text()", - postprocess=lambda x: x.strip())), + Extractor( + label='creator', + path="//td[starts-with(text(), 'Creator')]/..//a", + attrs=Attribute( + key='creator', + multi=True, + path={ + 'name': "./text()", + 'link': "./@href" + }, + postprocess=lambda x: build_person( + x.get('name') or u'', + personID=analyze_imdbid(x.get('link')) + ) + ) + ), - Extractor(label='production notes/comments', - path="//h5[starts-with(text(), 'Comments:')]/..//div[@class='info-content']", - attrs=Attribute(key='production comments', - path=".//text()", - postprocess=lambda x: x.strip())), + Extractor( + label='thin writer', + path="//div[starts-with(normalize-space(text()), 'Writer')]/ul/li[1]/a", + attrs=Attribute( + key='thin writer', + multi=True, + path={ + 'name': "./text()", + 'link': "./@href" + }, + postprocess=lambda x: build_person( + x.get('name') or u'', + personID=analyze_imdbid(x.get('link')) + ) + ) + ), - Extractor(label='production notes/note', - path="//h5[starts-with(text(), 'Note:')]/..//div[@class='info-content']", - attrs=Attribute(key='production note', - path=".//text()", - postprocess=lambda x: x.strip())), + Extractor( + label='thin director', + path="//div[starts-with(normalize-space(text()), 'Director')]/ul/li[1]/a", + attrs=Attribute( + key='thin director', + multi=True, + path={ + 'name': "./text()", + 'link': "./@href" + }, + postprocess=lambda x: build_person( + x.get('name') or u'', + personID=analyze_imdbid(x.get('link')) + ) + ) + ), - Extractor(label='blackcatheader', - group="//b[@class='blackcatheader']", - group_key="./text()", - group_key_normalize=lambda x: x.lower(), - path="../ul/li", - attrs=Attribute(key=None, - multi=True, - path={'name': "./a//text()", - 'comp-link': "./a/@href", - 'notes': "./text()"}, - postprocess=lambda x: \ - Company(name=x.get('name') or u'', - companyID=analyze_imdbid(x.get('comp-link')), - notes=(x.get('notes') or u'').strip()) - )), + Extractor( + label='top 250/bottom 100', + path="//li[@class='ipl-inline-list__item']//a[starts-with(@href, '/chart/')]", + attrs=Attribute( + key='top/bottom rank', + path="./text()" + ) + ), - Extractor(label='rating', - path="//div[@class='starbar-meta']/b", - attrs=Attribute(key='rating', - path=".//text()")), + Extractor( + label='original air date', + path="//span[@imdbpy='airdate']", + attrs=Attribute( + key='original air date', + path="./text()" + ) + ), - Extractor(label='votes', - path="//div[@class='starbar-meta']/a[@href]", - attrs=Attribute(key='votes', - path=".//text()")), + Extractor( + label='series years', + path="//div[@id='tn15title']//span[starts-with(text(), 'TV series')]", + attrs=Attribute( + key='series years', + path="./text()", + postprocess=lambda x: x.replace('TV series', '').strip() + ) + ), - Extractor(label='cover url', - path="//a[@name='poster']", - attrs=Attribute(key='cover url', - path="./img/@src")) - ] + Extractor( + label='season/episode', + path="//div[@class='titlereference-overview-season-episode-section']/ul", + attrs=Attribute( + key='season/episode', + path=".//text()", + postprocess=lambda x: x.strip() + ) + ), + + Extractor( + label='number of 
episodes', + path="//a[starts-with(text(), 'All Episodes')]", + attrs=Attribute( + key='number of episodes', + path="./text()", + postprocess=lambda x: int(x.replace('All Episodes', '').strip()[1:-1]) + ) + ), + + Extractor( + label='episode number', + path=".//div[@id='tn15epnav']", + attrs=Attribute( + key='episode number', + path="./text()", + postprocess=lambda x: int(re.sub(r'[^a-z0-9 ]', '', x.lower()) + .strip() + .split()[0]) + ) + ), + + Extractor( + label='previous episode', + path=".//span[@class='titlereference-overview-episodes-links']//a[contains(text(), 'Previous')]", + attrs=Attribute( + key='previous episode', + path="./@href", + postprocess=lambda x: analyze_imdbid(x) + ) + ), + + Extractor( + label='next episode', + path=".//span[@class='titlereference-overview-episodes-links']//a[contains(text(), 'Next')]", + attrs=Attribute( + key='next episode', + path="./@href", + postprocess=lambda x: analyze_imdbid(x) + ) + ), + + Extractor( + label='number of seasons', + path=".//span[@class='titlereference-overview-years-links']/../a[1]", + attrs=Attribute( + key='number of seasons', + path="./text()", + postprocess=lambda x: int(x) + ) + ), + + Extractor( + label='tv series link', + path=".//a[starts-with(text(), 'All Episodes')]", + attrs=Attribute( + key='tv series link', + path="./@href" + ) + ), + + Extractor( + label='akas', + path="//i[@class='transl']", + attrs=Attribute( + key='akas', + multi=True, + path='text()', + postprocess=lambda x: x + .replace(' ', ' ') + .rstrip('-') + .replace('" - ', '"::', 1) + .strip('"') + .replace(' ', ' ') + ) + ), + + Extractor( + label='production notes/status', + path="//td[starts-with(text(), 'Status:')]/..//div[@class='info-content']", + attrs=Attribute( + key='production status', + path=".//text()", + postprocess=lambda x: x.strip().split('|')[0].strip().lower() + ) + ), + + Extractor( + label='production notes/status updated', + path="//td[starts-with(text(), 'Status Updated:')]/..//div[@class='info-content']", + attrs=Attribute( + key='production status updated', + path=".//text()", + postprocess=lambda x: x.strip() + ) + ), + + Extractor( + label='production notes/comments', + path="//td[starts-with(text(), 'Comments:')]/..//div[@class='info-content']", + attrs=Attribute( + key='production comments', + path=".//text()", + postprocess=lambda x: x.strip() + ) + ), + + Extractor( + label='production notes/note', + path="//td[starts-with(text(), 'Note:')]/..//div[@class='info-content']", + attrs=Attribute( + key='production note', + path=".//text()", + postprocess=lambda x: x.strip() + ) + ), + + Extractor( + label='blackcatheader', + group="//b[@class='blackcatheader']", + group_key="./text()", + group_key_normalize=lambda x: x.lower(), + path="../ul/li", + attrs=Attribute( + key=None, + multi=True, + path={ + 'name': "./a//text()", + 'comp-link': "./a/@href", + 'notes': "./text()" + }, + postprocess=lambda x: Company(name=x.get('name') or u'', + companyID=analyze_imdbid(x.get('comp-link')), + notes=(x.get('notes') or u'').strip()) + ) + ), + + Extractor( + label='rating', + path="(//span[@class='ipl-rating-star__rating'])[1]", + attrs=Attribute( + key='rating', + path="./text()" + ) + ), + + Extractor( + label='votes', + path="//span[@class='ipl-rating-star__total-votes'][1]", + attrs=Attribute( + key='votes', + path="./text()" + ) + ), + + Extractor( + label='cover url', + path="//img[@alt='Poster']", + attrs=Attribute( + key='cover url', + path="@src" + ) + ) + ] preprocessors = [ - (re.compile(r'(.+?)', re.I), - r'
</div><div>\1'), + ('/releaseinfo">', '"><span imdbpy="airdate">'), + (re.compile(r'(<b class="blackcatheader">.+?</b>)', re.I), r'</div><div>\1'), ('<small>Full cast and crew for<br>', ''), ('<td> </td>', '<td>...</td>'), - ('<span class="tv-extra">TV mini-series', - '<span class="tv-extra">(mini)'), + (re.compile(r'<span class="tv-extra">TV mini-series(\s+.*?)</span>', re.I), + r'<span class="tv-extra">TV series\1</span> (mini)'), (_reRolesMovie, _manageRoles), - (_reAkas, _replaceBR)] + (_reAkas, _replaceBR) + ] def preprocess_dom(self, dom): # Handle series information. xpath = self.xpath(dom, "//b[text()='Series Crew']") if xpath: - b = xpath[-1] # In doubt, take the last one. + b = xpath[-1] # In doubt, take the last one. for a in self.xpath(b, "./following::h5/a[@class='glossary']"): name = a.get('name') if name: @@ -428,12 +705,13 @@ class DOMHTMLMovieParser(DOMParserBase): # Remove some 'more' links (keep others, like the one around # the number of votes). for tn15more in self.xpath(dom, - "//a[@class='tn15more'][starts-with(@href, '/title/')]"): + "//a[@class='tn15more'][starts-with(@href, '/title/')]"): tn15more.drop_tree() return dom re_space = re.compile(r'\s+') re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I) + def postprocess_data(self, data): # Convert section names. for sect in data.keys(): @@ -472,26 +750,38 @@ class DOMHTMLMovieParser(DOMParserBase): if 'runtimes' in data: data['runtimes'] = [x.replace(' min', u'') for x in data['runtimes']] - if 'original air date' in data: - oid = self.re_space.sub(' ', data['original air date']).strip() - data['original air date'] = oid - aid = self.re_airdate.findall(oid) - if aid and len(aid[0]) == 3: - date, season, episode = aid[0] - date = date.strip() - try: season = int(season) - except: pass - try: episode = int(episode) - except: pass - if date and date != '????': - data['original air date'] = date - else: - del data['original air date'] - # Handle also "episode 0". - if season or type(season) is type(0): - data['season'] = season - if episode or type(season) is type(0): - data['episode'] = episode + if 'number of seasons' in data: + data['seasons'] = [unicode(i) for i in range(1, data['number of seasons'] + 1)] + # data['number of seasons'] = seasons[-1] if seasons else len(data['seasons']) + if 'season/episode' in data: + tokens = data['season/episode'].split('Episode') + data['season'] = int(tokens[0].split('Season')[1]) + data['episode'] = int(tokens[1]) + del data['season/episode'] + # if 'original air date' in data: + # oid = self.re_space.sub(' ', data['original air date']).strip() + # data['original air date'] = oid + # aid = self.re_airdate.findall(oid) + # if aid and len(aid[0]) == 3: + # date, season, episode = aid[0] + # date = date.strip() + # try: + # season = int(season) + # except ValueError: + # pass + # try: + # episode = int(episode) + # except ValueError: + # pass + # if date and date != '????': + # data['original air date'] = date + # else: + # del data['original air date']
+ # if season or isinstance(season, int): + # data['season'] = season + # if episode or isinstance(season, int): + # data['episode'] = episode for k in ('writer', 'director'): t_k = 'thin %s' % k if t_k not in data: @@ -503,10 +793,10 @@ class DOMHTMLMovieParser(DOMParserBase): tbVal = data['top/bottom rank'].lower() if tbVal.startswith('top'): tbKey = 'top 250 rank' - tbVal = _toInt(tbVal, [('top 250: #', '')]) + tbVal = _toInt(tbVal, [('top rated movies: #', '')]) else: tbKey = 'bottom 100 rank' - tbVal = _toInt(tbVal, [('bottom 100: #', '')]) + tbVal = _toInt(tbVal, [('bottom rated movies: #', '')]) if tbVal: data[tbKey] = tbVal del data['top/bottom rank'] @@ -515,10 +805,10 @@ class DOMHTMLMovieParser(DOMParserBase): if 'tv series link' in data: if 'tv series title' in data: data['episode of'] = Movie(title=data['tv series title'], - movieID=analyze_imdbid( - data['tv series link']), - accessSystem=self._as, - modFunct=self._modFunct) + movieID=analyze_imdbid(data['tv series link']), + accessSystem=self._as, + modFunct=self._modFunct) + data['episode of']['kind'] = 'tv series' del data['tv series title'] del data['tv series link'] if 'rating' in data: @@ -526,9 +816,11 @@ class DOMHTMLMovieParser(DOMParserBase): data['rating'] = float(data['rating'].replace('/10', '')) except (TypeError, ValueError): pass + if data['rating'] == 0: + del data['rating'] if 'votes' in data: try: - votes = data['votes'].replace(',', '').replace('votes', '') + votes = data['votes'].replace('(', '').replace(')', '').replace(',', '').replace('votes', '') data['votes'] = int(votes) except (TypeError, ValueError): pass @@ -543,10 +835,11 @@ def _process_plotsummary(x): xplot += u'::%s' % xauthor return xplot + class DOMHTMLPlotParser(DOMParserBase): """Parser for the "plot summary" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a 'plot' key, containing a list of string with the structure: 'summary::summary_author '. @@ -558,13 +851,20 @@ class DOMHTMLPlotParser(DOMParserBase): # Notice that recently IMDb started to put the email of the # author only in the link, that we're not collecting, here. - extractors = [Extractor(label='plot', - path="//p[@class='plotSummary']", - attrs=Attribute(key='plot', - multi=True, - path={'plot': './/text()', - 'author': './span/em/a/text()'}, - postprocess=_process_plotsummary))] + extractors = [ + Extractor( + label='plot', + path="//ul[@class='zebraList']/li", + attrs=Attribute( + key='plot', + multi=True, + path={ + 'plot': './/p[@class="plotSummary"]//text()', + 'author': './/span/em/a/text()' + }, + postprocess=_process_plotsummary) + ) + ] def _process_award(x): @@ -594,11 +894,10 @@ def _process_award(x): return award - class DOMHTMLAwardsParser(DOMParserBase): """Parser for the "awards" page of a given person or movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
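Note that the behaviour of this parser depends on its 'subject' attribute: when it is 'title', co-recipients are built as Person objects (stored under the 'to' key), while for 'name' they become Movie objects (stored under 'for'). A minimal usage sketch, assuming 'awards_html_string' (a hypothetical variable) already holds the raw page source:

    aparser = DOMHTMLAwardsParser()
    aparser.subject = 'title'   # use 'name' when parsing a person's awards page
    result = aparser.parse(awards_html_string)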
Example: @@ -609,12 +908,13 @@ class DOMHTMLAwardsParser(DOMParserBase): _containsObjects = True extractors = [ - Extractor(label='awards', + Extractor( + label='awards', group="//table//big", group_key="./a", - path="./ancestor::tr[1]/following-sibling::tr/" \ - "td[last()][not(@colspan)]", - attrs=Attribute(key=None, + path="./ancestor::tr[1]/following-sibling::tr/td[last()][not(@colspan)]", + attrs=Attribute( + key=None, multi=True, path={ 'year': "../td[1]/a/text()", @@ -622,26 +922,31 @@ class DOMHTMLAwardsParser(DOMParserBase): 'award': "../td[3]/text()", 'category': "./text()[1]", # FIXME: takes only the first co-recipient - 'with': "./small[starts-with(text()," \ - " 'Shared with:')]/following-sibling::a[1]/text()", + 'with': "./small[starts-with(text(), 'Shared with:')]/" + "following-sibling::a[1]/text()", 'notes': "./small[last()]//text()", 'anchor': ".//text()" - }, + }, postprocess=_process_award - )), - Extractor(label='recipients', + ) + ), + + Extractor( + label='recipients', group="//table//big", group_key="./a", - path="./ancestor::tr[1]/following-sibling::tr/" \ - "td[last()]/small[1]/preceding-sibling::a", - attrs=Attribute(key=None, + path="./ancestor::tr[1]/following-sibling::tr" + "/td[last()]/small[1]/preceding-sibling::a", + attrs=Attribute( + key=None, multi=True, path={ 'name': "./text()", 'link': "./@href", 'anchor': "..//text()" - } - )) + } + ) + ) ] preprocessors = [ @@ -652,7 +957,7 @@ class DOMHTMLAwardsParser(DOMParserBase): (re.compile('(]*>\n\n)(.*?)
(.*?\n\n)(\2') - ] + ] def preprocess_dom(self, dom): """Repeat td elements according to their rowspan attributes @@ -664,7 +969,7 @@ class DOMHTMLAwardsParser(DOMParserBase): del col.attrib['rowspan'] position = len(self.xpath(col, "./preceding-sibling::td")) row = col.getparent() - for tr in self.xpath(row, "./following-sibling::tr")[:span-1]: + for tr in self.xpath(row, "./following-sibling::tr")[:span - 1]: # if not cloned, child will be moved to new parent clone = self.clone(col) # XXX: beware that here we don't use an "adapted" function, @@ -688,17 +993,20 @@ class DOMHTMLAwardsParser(DOMParserBase): entry['assigner'] = assigner.strip() # find the recipients matches = [p for p in data[key] - if p.has_key('name') and (entry['anchor'] == - p['anchor'])] + if 'name' in p and (entry['anchor'] == p['anchor'])] if self.subject == 'title': - recipients = [Person(name=recipient['name'], - personID=analyze_imdbid(recipient['link'])) - for recipient in matches] + recipients = [ + Person(name=recipient['name'], + personID=analyze_imdbid(recipient['link'])) + for recipient in matches + ] entry['to'] = recipients elif self.subject == 'name': - recipients = [Movie(title=recipient['name'], - movieID=analyze_imdbid(recipient['link'])) - for recipient in matches] + recipients = [ + Movie(title=recipient['name'], + movieID=analyze_imdbid(recipient['link'])) + for recipient in matches + ] entry['for'] = recipients nd.append(entry) del entry['anchor'] @@ -708,18 +1016,31 @@ class DOMHTMLAwardsParser(DOMParserBase): class DOMHTMLTaglinesParser(DOMParserBase): """Parser for the "taglines" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: tparser = DOMHTMLTaglinesParser() result = tparser.parse(taglines_html_string) """ - extractors = [Extractor(label='taglines', - path='//*[contains(concat(" ", normalize-space(@class), " "), " soda ")]', - attrs=Attribute(key='taglines', - multi=True, - path="./text()"))] + extractors = [ + Extractor( + label='taglines', + path="//div[@id='taglines_content']/div", + attrs=Attribute( + key='taglines', + multi=True, + path=".//text()" + ) + ) + ] + + def preprocess_dom(self, dom): + for node in self.xpath(dom, "//div[@id='taglines_content']/div[@class='header']"): + node.drop_tree() + for node in self.xpath(dom, "//div[@id='taglines_content']/div[@id='no_content']"): + node.drop_tree() + return dom def postprocess_data(self, data): if 'taglines' in data: @@ -730,25 +1051,30 @@ class DOMHTMLTaglinesParser(DOMParserBase): class DOMHTMLKeywordsParser(DOMParserBase): """Parser for the "keywords" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
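Each keyword is normalized by the extractor's postprocess step below: lower-cased, with spaces replaced by hyphens, so the values match the /keyword/ URL fragments. A rough sketch of the expected shape, with purely hypothetical values:

    # result could look like:
    # {'keywords': [u'time-travel', u'based-on-novel']}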
Example: kwparser = DOMHTMLKeywordsParser() result = kwparser.parse(keywords_html_string) """ - extractors = [Extractor(label='keywords', - path="//a[starts-with(@href, '/keyword/')]", - attrs=Attribute(key='keywords', - path="./text()", multi=True, - postprocess=lambda x: \ - x.lower().replace(' ', '-')))] + extractors = [ + Extractor( + label='keywords', + path="//a[starts-with(@href, '/keyword/')]", + attrs=Attribute( + key='keywords', + path="./text()", multi=True, + postprocess=lambda x: x.lower().replace(' ', '-') + ) + ) + ] class DOMHTMLAlternateVersionsParser(DOMParserBase): """Parser for the "alternate versions" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -756,18 +1082,25 @@ class DOMHTMLAlternateVersionsParser(DOMParserBase): result = avparser.parse(alternateversions_html_string) """ _defGetRefs = True - extractors = [Extractor(label='alternate versions', - path="//ul[@class='trivia']/li", - attrs=Attribute(key='alternate versions', - multi=True, - path=".//text()", - postprocess=lambda x: x.strip()))] + + extractors = [ + Extractor( + label='alternate versions', + path="//ul[@class='trivia']/li", + attrs=Attribute( + key='alternate versions', + multi=True, + path=".//text()", + postprocess=lambda x: x.strip() + ) + ) + ] class DOMHTMLTriviaParser(DOMParserBase): """Parser for the "trivia" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -775,12 +1108,18 @@ class DOMHTMLTriviaParser(DOMParserBase): result = avparser.parse(alternateversions_html_string) """ _defGetRefs = True - extractors = [Extractor(label='alternate versions', - path="//div[@class='sodatext']", - attrs=Attribute(key='trivia', - multi=True, - path=".//text()", - postprocess=lambda x: x.strip()))] + + extractors = [ + Extractor( + label='alternate versions', + path="//div[@class='sodatext']", + attrs=Attribute( + key='trivia', + multi=True, + path=".//text()", + postprocess=lambda x: x.strip()) + ) + ] def preprocess_dom(self, dom): # Remove "link this quote" links. @@ -789,16 +1128,21 @@ class DOMHTMLTriviaParser(DOMParserBase): return dom - class DOMHTMLSoundtrackParser(DOMParserBase): _defGetRefs = True preprocessors = [('
<br />', '\n'), ('<br>
', '\n')] - extractors = [Extractor(label='soundtrack', - path="//div[@class='list']//div", - attrs=Attribute(key='soundtrack', - multi=True, - path=".//text()", - postprocess=lambda x: x.strip()))] + extractors = [ + Extractor( + label='soundtrack', + path="//div[@class='list']//div", + attrs=Attribute( + key='soundtrack', + multi=True, + path=".//text()", + postprocess=lambda x: x.strip() + ) + ) + ] def postprocess_data(self, data): if 'soundtrack' in data: @@ -833,7 +1177,7 @@ class DOMHTMLSoundtrackParser(DOMParserBase): for sep in ' with ', ' by ', ' from ', ' of ': fdix = l.find(sep) if fdix != -1: - fdix = fdix+len(sep) + fdix = fdix + len(sep) kind = l[:fdix].rstrip().lower() info = l[fdix:].lstrip() newData[title][kind] = info @@ -846,7 +1190,7 @@ class DOMHTMLSoundtrackParser(DOMParserBase): class DOMHTMLCrazyCreditsParser(DOMParserBase): """Parser for the "crazy credits" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -855,11 +1199,18 @@ class DOMHTMLCrazyCreditsParser(DOMParserBase): """ _defGetRefs = True - extractors = [Extractor(label='crazy credits', path="//ul/li/tt", - attrs=Attribute(key='crazy credits', multi=True, - path=".//text()", - postprocess=lambda x: \ - x.replace('\n', ' ').replace(' ', ' ')))] + extractors = [ + Extractor( + label='crazy credits', + path="//ul/li/tt", + attrs=Attribute( + key='crazy credits', + multi=True, + path=".//text()", + postprocess=lambda x: x.replace('\n', ' ').replace(' ', ' ') + ) + ) + ] def _process_goof(x): @@ -872,7 +1223,7 @@ def _process_goof(x): class DOMHTMLGoofsParser(DOMParserBase): """Parser for the "goofs" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -881,20 +1232,28 @@ class DOMHTMLGoofsParser(DOMParserBase): """ _defGetRefs = True - extractors = [Extractor(label='goofs', path="//div[@class='soda odd']", - attrs=Attribute(key='goofs', multi=True, - path={ - 'text':"./text()", - 'category':'./preceding-sibling::h4[1]/text()', - 'spoiler_category': './h4/text()' - }, - postprocess=_process_goof))] + extractors = [ + Extractor( + label='goofs', + path="//div[@class='soda odd']", + attrs=Attribute( + key='goofs', + multi=True, + path={ + 'text': "./text()", + 'category': './preceding-sibling::h4[1]/text()', + 'spoiler_category': './h4/text()' + }, + postprocess=_process_goof + ) + ) + ] class DOMHTMLQuotesParser(DOMParserBase): """Parser for the "memorable quotes" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
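Each quote is flattened into a single string by the postprocess lambdas below, with '::' separating the consecutive lines of a dialogue. A rough sketch of the expected shape, with purely hypothetical values:

    # result could look like:
    # {'quotes': [u"Rick: Here's looking at you, kid.",
    #             u"Ilsa: Play it, Sam.::Sam: I don't know what you mean."]}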
Example: @@ -904,21 +1263,36 @@ class DOMHTMLQuotesParser(DOMParserBase): _defGetRefs = True extractors = [ - Extractor(label='quotes_odd', + Extractor( + label='quotes_odd', path="//div[@class='quote soda odd']", - attrs=Attribute(key='quotes_odd', + attrs=Attribute( + key='quotes_odd', multi=True, path=".//text()", - postprocess=lambda x: x.strip().replace(' \n', - '::').replace('::\n', '::').replace('\n', ' '))), - Extractor(label='quotes_even', + postprocess=lambda x: x + .strip() + .replace(' \n', '::') + .replace('::\n', '::') + .replace('\n', ' ') + ) + ), + + Extractor( + label='quotes_even', path="//div[@class='quote soda even']", - attrs=Attribute(key='quotes_even', + attrs=Attribute( + key='quotes_even', multi=True, path=".//text()", - postprocess=lambda x: x.strip().replace(' \n', - '::').replace('::\n', '::').replace('\n', ' '))) - ] + postprocess=lambda x: x + .strip() + .replace(' \n', '::') + .replace('::\n', '::') + .replace('\n', ' ') + ) + ) + ] preprocessors = [ (re.compile('

', re.I), '') @@ -943,40 +1317,60 @@ class DOMHTMLQuotesParser(DOMParserBase): class DOMHTMLReleaseinfoParser(DOMParserBase): """Parser for the "release dates" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: rdparser = DOMHTMLReleaseinfoParser() result = rdparser.parse(releaseinfo_html_string) """ - extractors = [Extractor(label='release dates', - path="//table[@id='release_dates']//tr", - attrs=Attribute(key='release dates', multi=True, - path={'country': ".//td[1]//text()", - 'date': ".//td[2]//text()", - 'notes': ".//td[3]//text()"})), - Extractor(label='akas', - path="//table[@id='akas']//tr", - attrs=Attribute(key='akas', multi=True, - path={'title': "./td[1]/text()", - 'countries': "./td[2]/text()"}))] + extractors = [ + Extractor( + label='release dates', + path="//table[@id='release_dates']//tr", + attrs=Attribute( + key='release dates', + multi=True, + path={ + 'country': ".//td[1]//text()", + 'date': ".//td[2]//text()", + 'notes': ".//td[3]//text()" + } + ) + ), + + Extractor( + label='akas', + path="//table[@id='akas']//tr", + attrs=Attribute( + key='akas', + multi=True, + path={ + 'title': "./td[1]/text()", + 'countries': "./td[2]/text()"} + ) + ) + ] preprocessors = [ (re.compile('(
<h5><a name="?akas"?.*</table>)', re.I | re.M | re.S), - r'<div class="_imdbpy_akas">\1</div>')] + r'<div class="_imdbpy_akas">\1</div>
') + ] def postprocess_data(self, data): - if not ('release dates' in data or 'akas' in data): return data + if not ('release dates' in data or 'akas' in data): + return data releases = data.get('release dates') or [] rl = [] for i in releases: country = i.get('country') date = i.get('date') - if not (country and date): continue + if not (country and date): + continue country = country.strip() date = date.strip() - if not (country and date): continue + if not (country and date): + continue notes = i['notes'] info = u'%s::%s' % (country, date) if notes: @@ -1008,46 +1402,72 @@ class DOMHTMLReleaseinfoParser(DOMParserBase): class DOMHTMLRatingsParser(DOMParserBase): """Parser for the "user ratings" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: rparser = DOMHTMLRatingsParser() result = rparser.parse(userratings_html_string) """ - re_means = re.compile('mean\s*=\s*([0-9]\.[0-9])\.\s*median\s*=\s*([0-9])', - re.I) + re_means = re.compile('mean\s*=\s*([0-9]\.[0-9])\.\s*median\s*=\s*([0-9])', re.I) + extractors = [ - Extractor(label='number of votes', + Extractor( + label='number of votes', path="//td[b='Percentage']/../../tr", - attrs=[Attribute(key='votes', - multi=True, - path={ - 'votes': "td[1]//text()", - 'ordinal': "td[3]//text()" - })]), - Extractor(label='mean and median', + attrs=[ + Attribute( + key='votes', + multi=True, + path={ + 'votes': "td[1]//text()", + 'ordinal': "td[3]//text()" + } + ) + ] + ), + + Extractor( + label='mean and median', path="//p[starts-with(text(), 'Arithmetic mean')]", - attrs=Attribute(key='mean and median', - path="text()")), - Extractor(label='rating', + attrs=Attribute( + key='mean and median', + path="text()" + ) + ), + + Extractor( + label='rating', path="//a[starts-with(@href, '/search/title?user_rating=')]", - attrs=Attribute(key='rating', - path="text()")), - Extractor(label='demographic voters', + attrs=Attribute( + key='rating', + path="text()" + ) + ), + + Extractor( + label='demographic voters', path="//td[b='Average']/../../tr", - attrs=Attribute(key='demographic voters', - multi=True, - path={ - 'voters': "td[1]//text()", - 'votes': "td[2]//text()", - 'average': "td[3]//text()" - })), - Extractor(label='top 250', + attrs=Attribute( + key='demographic voters', + multi=True, + path={ + 'voters': "td[1]//text()", + 'votes': "td[2]//text()", + 'average': "td[3]//text()" + } + ) + ), + + Extractor( + label='top 250', path="//a[text()='top 250']", - attrs=Attribute(key='top 250', - path="./preceding-sibling::text()[1]")) - ] + attrs=Attribute( + key='top 250', + path="./preceding-sibling::text()[1]" + ) + ) + ] def postprocess_data(self, data): nd = {} @@ -1057,20 +1477,23 @@ class DOMHTMLRatingsParser(DOMParserBase): for i in xrange(1, 11): _ordinal = int(votes[i]['ordinal']) _strvts = votes[i]['votes'] or '0' - nd['number of votes'][_ordinal] = \ - int(_strvts.replace(',', '')) + nd['number of votes'][_ordinal] = int(_strvts.replace(',', '')) mean = data.get('mean and median', '') if mean: means = self.re_means.findall(mean) if means and len(means[0]) == 2: am, med = means[0] - try: am = float(am) - except (ValueError, OverflowError): pass - if type(am) is type(1.0): + try: + am = float(am) + except (ValueError, OverflowError): + pass + if isinstance(am, float): nd['arithmetic mean'] = am - try: med = int(med) - except (ValueError, OverflowError): 
pass - if type(med) is type(0): + try: + med = int(med) + except (ValueError, OverflowError): + pass + if isinstance(med, int): nd['median'] = med if 'rating' in data: nd['rating'] = float(data['rating']) @@ -1078,11 +1501,10 @@ class DOMHTMLRatingsParser(DOMParserBase): if dem_voters: nd['demographic'] = {} for i in xrange(1, len(dem_voters)): - if (dem_voters[i]['votes'] is not None) \ - and (dem_voters[i]['votes'].strip()): - nd['demographic'][dem_voters[i]['voters'].strip().lower()] \ - = (int(dem_voters[i]['votes'].replace(',', '')), - float(dem_voters[i]['average'])) + if (dem_voters[i]['votes'] is not None) and (dem_voters[i]['votes'].strip()): + nd['demographic'][dem_voters[i]['voters'].strip().lower()] = \ + (int(dem_voters[i]['votes'].replace(',', '')), + float(dem_voters[i]['average'])) if 'imdb users' in nd.get('demographic', {}): nd['votes'] = nd['demographic']['imdb users'][0] nd['demographic']['all votes'] = nd['demographic']['imdb users'] @@ -1093,9 +1515,11 @@ class DOMHTMLRatingsParser(DOMParserBase): i = sd.find(' ') if i != -1: sd = sd[:i] - try: sd = int(sd) - except (ValueError, OverflowError): pass - if type(sd) is type(0): + try: + sd = int(sd) + except (ValueError, OverflowError): + pass + if isinstance(sd, int): nd['top 250 rank'] = sd return nd @@ -1103,7 +1527,7 @@ class DOMHTMLRatingsParser(DOMParserBase): class DOMHTMLEpisodesRatings(DOMParserBase): """Parser for the "episode ratings ... by date" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -1112,19 +1536,36 @@ class DOMHTMLEpisodesRatings(DOMParserBase): """ _containsObjects = True - extractors = [Extractor(label='title', path="//title", - attrs=Attribute(key='title', path="./text()")), - Extractor(label='ep ratings', - path="//th/../..//tr", - attrs=Attribute(key='episodes', multi=True, - path={'nr': ".//td[1]/text()", - 'ep title': ".//td[2]//text()", - 'movieID': ".//td[2]/a/@href", - 'rating': ".//td[3]/text()", - 'votes': ".//td[4]/text()"}))] + extractors = [ + Extractor( + label='title', + path="//title", + attrs=Attribute( + key='title', + path="./text()" + ) + ), + + Extractor( + label='ep ratings', + path="//th/../..//tr", + attrs=Attribute( + key='episodes', + multi=True, + path={ + 'nr': ".//td[1]/text()", + 'ep title': ".//td[2]//text()", + 'movieID': ".//td[2]/a/@href", + 'rating': ".//td[3]/text()", + 'votes': ".//td[4]/text()" + } + ) + ) + ] def postprocess_data(self, data): - if 'title' not in data or 'episodes' not in data: return {} + if 'title' not in data or 'episodes' not in data: + return {} nd = [] title = data['title'] for i in data['episodes']: @@ -1132,7 +1573,8 @@ class DOMHTMLEpisodesRatings(DOMParserBase): movieID = analyze_imdbid(i['movieID']) votes = i['votes'] rating = i['rating'] - if not (ept and movieID and votes and rating): continue + if not (ept and movieID and votes and rating): + continue try: votes = int(votes.replace(',', '').replace('.', '')) except: @@ -1150,26 +1592,28 @@ class DOMHTMLEpisodesRatings(DOMParserBase): if movieID is not None: movieID = str(movieID) m = Movie(title=ept, movieID=movieID, accessSystem=self._as, - modFunct=self._modFunct) + modFunct=self._modFunct) epofdict = m.get('episode of') if epofdict is not None: m['episode of'] = Movie(data=epofdict, accessSystem=self._as, - modFunct=self._modFunct) + modFunct=self._modFunct) 
nd.append({'episode': m, 'votes': votes, 'rating': rating}) return {'episodes rating': nd} def _normalize_href(href): if (href is not None) and (not href.lower().startswith('http://')): - if href.startswith('/'): href = href[1:] + if href.startswith('/'): + href = href[1:] # TODO: imdbURL_base may be set by the user! href = '%s%s' % (imdbURL_base, href) return href + class DOMHTMLCriticReviewsParser(DOMParserBase): """Parser for the "critic reviews" pages of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -1179,21 +1623,130 @@ class DOMHTMLCriticReviewsParser(DOMParserBase): kind = 'critic reviews' extractors = [ - Extractor(label='metascore', - path="//div[@class='metascore_wrap']/div/span", - attrs=Attribute(key='metascore', - path=".//text()")), - Extractor(label='metacritic url', - path="//div[@class='article']/div[@class='see-more']/a", - attrs=Attribute(key='metacritic url', - path="./@href")) ] + Extractor( + label='metascore', + path="//div[@class='metascore_wrap']/div/span", + attrs=Attribute( + key='metascore', + path=".//text()" + ) + ), + + Extractor( + label='metacritic url', + path="//div[@class='article']/div[@class='see-more']/a", + attrs=Attribute( + key='metacritic url', + path="./@href" + ) + ) + ] + + +class DOMHTMLReviewsParser(DOMParserBase): + """Parser for the "reviews" pages of a given movie. + The page should be provided as a string, as taken from + the www.imdb.com server. The final result will be a + dictionary, with a key for every relevant section. + + Example: + osparser = DOMHTMLReviewsParser() + result = osparser.parse(officialsites_html_string) + """ + kind = 'reviews' + + extractors = [ + Extractor( + label='review', + path="//div[@class='review-container']", + attrs=Attribute( + key='self.kind', + multi=True, + path={ + 'text': ".//div[@class='text']//text()", + 'helpful': ".//div[@class='text-muted']/text()[1]", + 'title': ".//div[@class='title']//text()", + 'author': ".//span[@class='display-name-link']/a/@href", + 'date': ".//span[@class='review-date']//text()", + 'rating': ".//span[@class='point-scale']/preceding-sibling::span[1]/text()" + }, + postprocess=lambda x: ({ + 'content': (x['text'] or u'').replace(u"\n", u" ").replace(u' ', u' ').strip(), + 'helpful': [int(s) for s in (x.get('helpful') or u'').split() if s.isdigit()], + 'title': (x.get('title') or u'').strip(), + 'author': analyze_imdbid(x.get('author')), + 'date': (x.get('date') or u'').strip(), + 'rating': (x.get('rating') or u'').strip() + }) + ) + ) + ] + + preprocessors = [('
<br>', '<br>
\n')] + + def postprocess_data(self, data): + for review in data.get('reviews', []): + if review.get('rating') and len(review['rating']) == 2: + review['rating'] = int(review['rating'][0]) + else: + review['rating'] = None + + if review.get('helpful') and len(review['helpful']) == 2: + review['not_helpful'] = review['helpful'][1] - review['helpful'][0] + review['helpful'] = review['helpful'][0] + else: + review['helpful'] = 0 + review['not_helpful'] = 0 + + review['author'] = u"ur%s" % review['author'] + + return data + + +class DOMHTMLFullCreditsParser(DOMParserBase): + """Parser for the "full credits" (series cast section) page of a given movie. + The page should be provided as a string, as taken from + the www.imdb.com server. The final result will be a + dictionary, with a key for every relevant section. + + Example: + osparser = DOMHTMLFullCreditsParser() + result = osparser.parse(officialsites_html_string) + """ + kind = 'full credits' + + extractors = [ + Extractor( + label='cast', + path="//table[@class='cast_list']//tr[@class='odd' or @class='even']", + attrs=Attribute( + key="cast", + multi=True, + path={ + 'person': ".//text()", + 'link': "td[2]/a/@href", + 'roleID': "td[4]//div[@class='_imdbpyrole']/@roleid" + }, + postprocess=lambda x: build_person( + x.get('person') or u'', + personID=analyze_imdbid(x.get('link')), + roleID=(x.get('roleID') or u'').split('/') + ) + ) + ), + ] + + preprocessors = [ + (_reRolesMovie, _manageRoles) + ] + class DOMHTMLOfficialsitesParser(DOMParserBase): """Parser for the "official sites", "external reviews", "newsgroup reviews", "miscellaneous links", "sound clips", "video clips" and "photographs" pages of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -1203,23 +1756,29 @@ class DOMHTMLOfficialsitesParser(DOMParserBase): kind = 'official sites' extractors = [ - Extractor(label='site', + Extractor( + label='site', path="//ol/li/a", - attrs=Attribute(key='self.kind', + attrs=Attribute( + key='self.kind', multi=True, path={ 'link': "./@href", 'info': "./text()" }, - postprocess=lambda x: (x.get('info').strip(), - urllib.unquote(_normalize_href(x.get('link')))))) - ] + postprocess=lambda x: ( + x.get('info').strip(), + urllib.unquote(_normalize_href(x.get('link'))) + ) + ) + ) + ] class DOMHTMLConnectionParser(DOMParserBase): """Parser for the "connections" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -1228,15 +1787,23 @@ class DOMHTMLConnectionParser(DOMParserBase): """ _containsObjects = True - extractors = [Extractor(label='connection', - group="//div[@class='_imdbpy']", - group_key="./h5/text()", - group_key_normalize=lambda x: x.lower(), - path="./a", - attrs=Attribute(key=None, - path={'title': "./text()", - 'movieID': "./@href"}, - multi=True))] + extractors = [ + Extractor( + label='connection', + group="//div[@class='_imdbpy']", + group_key="./h5/text()", + group_key_normalize=lambda x: x.lower(), + path="./a", + attrs=Attribute( + key=None, + path={ + 'title': "./text()", + 'movieID': "./@href" + }, + multi=True + ) + ) + ] preprocessors = [ ('
<h5>', '</div><div class="_imdbpy"><h5>'), @@ -1244,7 +1811,7 @@ class DOMHTMLConnectionParser(DOMParserBase): ('</a> (', ' ('), ('\n<br/>', '</a>'), ('<br/>
- ', '::') - ] + ] def postprocess_data(self, data): for key in data.keys(): @@ -1256,42 +1823,52 @@ class DOMHTMLConnectionParser(DOMParserBase): notes = u'' if len(ts) == 2: notes = ts[1].strip() - m = Movie(title=title, - movieID=analyze_imdbid(v['movieID']), - accessSystem=self._as, notes=notes, - modFunct=self._modFunct) + m = Movie(title=title, movieID=analyze_imdbid(v['movieID']), + accessSystem=self._as, notes=notes, modFunct=self._modFunct) nl.append(m) data[key] = nl - if not data: return {} + if not data: + return {} return {'connections': data} class DOMHTMLLocationsParser(DOMParserBase): """Parser for the "locations" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: lparser = DOMHTMLLocationsParser() result = lparser.parse(locations_html_string) """ - extractors = [Extractor(label='locations', path="//dt", - attrs=Attribute(key='locations', multi=True, - path={'place': ".//text()", - 'note': "./following-sibling::dd[1]" \ - "//text()"}, - postprocess=lambda x: (u'%s::%s' % ( - x['place'].strip(), - (x['note'] or u'').strip())).strip(':')))] + extractors = [ + Extractor( + label='locations', + path="//dt", + attrs=Attribute( + key='locations', + multi=True, + path={ + 'place': ".//text()", + 'note': "./following-sibling::dd[1]//text()" + }, + postprocess=lambda x: ( + u'%s::%s' % ( + x['place'].strip(), + (x['note'] or u'').strip() + ) + ).strip(':') + ) + ) + ] class DOMHTMLTechParser(DOMParserBase): - """Parser for the "technical", "business", "literature", - "publicity" (for people) and "contacts (for people) pages of - a given movie. + """Parser for the "technical", "publicity" (for people) and "contacts" (for people) + pages of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -1301,38 +1878,39 @@ class DOMHTMLTechParser(DOMParserBase): kind = 'tech' re_space = re.compile(r'\s+') - extractors = [Extractor(label='tech', - group="//table//tr/td[@class='label']", - group_key="./text()", - group_key_normalize=lambda x: x.lower().strip(), - path=".", - attrs=Attribute(key=None, - path="..//td[2]//text()", - postprocess=lambda x: [t.strip() - for t in x.split(':::') if t.strip()]))] + extractors = [ + Extractor( + label='tech', + group="//table//tr/td[@class='label']", + group_key="./text()", + group_key_normalize=lambda x: x.lower().strip(), + path=".", + attrs=Attribute( + key=None, + path="..//td[2]//text()", + postprocess=lambda x: [t.strip() for t in x.split(':::') if t.strip()] + ) + ) + ] preprocessors = [ (re.compile('(
<h5>.*?</h5>)', re.I), r'</div>\1<div class="_imdbpy">
'), - (re.compile('((
|

|))\n?
(?!'), + (re.compile('((
|

|))\n?
(?!'), # the ones below are for the publicity parser (re.compile('<p>(.*?)</p>', re.I), r'\1<br/>'), (re.compile('(</td><td valign="top">)', re.I), r'\1::'), (re.compile('(</tr><tr>)', re.I), r'\n\1'), (re.compile('\|', re.I), r':::'), - (re.compile('<br/>', re.I), r':::'), + (re.compile('<br/>
', re.I), r':::') # this is for splitting individual entries - ] + ] def postprocess_data(self, data): for key in data: data[key] = filter(lambda x: x != '|', data[key]) data[key] = [self.re_space.sub(' ', x).strip() for x in data[key]] data[key] = filter(None, data[key]) - if self.kind in ('literature', 'business', 'contacts') and data: - if 'screenplay/teleplay' in data: - data['screenplay-teleplay'] = data['screenplay/teleplay'] - del data['screenplay/teleplay'] + if self.kind == 'contacts' and data: data = {self.kind: data} else: if self.kind == 'publicity': @@ -1350,10 +1928,53 @@ class DOMHTMLTechParser(DOMParserBase): return data +class DOMHTMLBusinessParser(DOMParserBase): + """Parser for the "business" and "literature" pages of a given movie. + The page should be provided as a string, as taken from + the www.imdb.com server. The final result will be a + dictionary, with a key for every relevant section. + + Example: + bparser = DOMHTMLBusinessParser() + result = bparser.parse(business_html_string) + """ + re_space = re.compile(r'\s+') + + extractors = [ + Extractor( + label='business', + path="//div[@id='tn15content']//h5", + attrs=Attribute( + key='./text()', + path="./following-sibling::div[1]//text()" + ) + ) + ] + + preprocessors = [ + ('', '
'), + ('
', '
'), + ('
', '
'), + ('

', '


'), + ('
', ':::') + ] + + def postprocess_data(self, data): + newData = {} + for key, value in data.iteritems(): + value = value.strip().split(':::') + value = [v.strip() for v in value] + value = filter(None, value) + if not value: + continue + newData[key.lower().strip()] = value + return newData + + class DOMHTMLRecParser(DOMParserBase): """Parser for the "recommendations" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -1362,12 +1983,21 @@ class DOMHTMLRecParser(DOMParserBase): """ _containsObjects = True - extractors = [Extractor(label='recommendations', - path="//td[@valign='middle'][1]", - attrs=Attribute(key='../../tr/td[1]//text()', - multi=True, - path={'title': ".//text()", - 'movieID': ".//a/@href"}))] + extractors = [ + Extractor( + label='recommendations', + path="//td[@valign='middle'][1]", + attrs=Attribute( + key='../../tr/td[1]//text()', + multi=True, + path={ + 'title': ".//text()", + 'movieID': ".//a/@href" + } + ) + ) + ] + def postprocess_data(self, data): for key in data.keys(): n_key = key @@ -1376,19 +2006,21 @@ class DOMHTMLRecParser(DOMParserBase): n_key = 'database' elif n_keyl == 'imdb users recommend': n_key = 'users' - data[n_key] = [Movie(title=x['title'], - movieID=analyze_imdbid(x['movieID']), - accessSystem=self._as, modFunct=self._modFunct) - for x in data[key]] + data[n_key] = [ + Movie(title=x['title'], movieID=analyze_imdbid(x['movieID']), + accessSystem=self._as, modFunct=self._modFunct) + for x in data[key] + ] del data[key] - if data: return {'recommendations': data} + if data: + return {'recommendations': data} return data class DOMHTMLNewsParser(DOMParserBase): """Parser for the "news" page of a given movie or person. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -1398,9 +2030,11 @@ class DOMHTMLNewsParser(DOMParserBase): _defGetRefs = True extractors = [ - Extractor(label='news', + Extractor( + label='news', path="//h2", - attrs=Attribute(key='news', + attrs=Attribute( + key='news', multi=True, path={ 'title': "./text()", @@ -1409,25 +2043,25 @@ class DOMHTMLNewsParser(DOMParserBase): # inside news text. 'body': "../following-sibling::p[2]//text()", 'link': "../..//a[text()='Permalink']/@href", - 'fulllink': "../..//a[starts-with(text(), " \ - "'See full article at')]/@href" - }, + 'fulllink': "../..//a[starts-with(text(), 'See full article at')]/@href" + }, postprocess=lambda x: { 'title': x.get('title').strip(), 'date': x.get('fromdate').split('|')[0].strip(), - 'from': x.get('fromdate').split('|')[1].replace('From ', - '').strip(), + 'from': x.get('fromdate').split('|')[1].replace('From ', '').strip(), 'body': (x.get('body') or u'').strip(), 'link': _normalize_href(x.get('link')), 'full article link': _normalize_href(x.get('fulllink')) - })) - ] + } + ) + ) + ] preprocessors = [ (re.compile('(]+>

<h2>)', re.I), r'<div class="_imdbpy">\1'), (re.compile('(<hr/>)', re.I), r'</div>\1'), (re.compile('<p></p>
', re.I), r'') - ] + ] def postprocess_data(self, data): if not data.has_key('news'): @@ -1442,11 +2076,13 @@ class DOMHTMLNewsParser(DOMParserBase): def _parse_review(x): result = {} title = x.get('title').strip() - if title[-1] == ':': title = title[:-1] + if title[-1] == ':': + title = title[:-1] result['title'] = title result['link'] = _normalize_href(x.get('link')) - kind = x.get('kind').strip() - if kind[-1] == ':': kind = kind[:-1] + kind = x.get('kind').strip() + if kind[-1] == ':': + kind = kind[:-1] result['review kind'] = kind text = x.get('review').replace('\n\n', '||').replace('\n', ' ').split('||') review = '\n'.join(text) @@ -1465,93 +2101,122 @@ def _parse_review(x): class DOMHTMLSeasonEpisodesParser(DOMParserBase): """Parser for the "episode list" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: sparser = DOMHTMLSeasonEpisodesParser() result = sparser.parse(episodes_html_string) """ + extractors = [ - Extractor(label='series link', - path="//div[@class='parent']", - attrs=[Attribute(key='series link', - path=".//a/@href")] - ), - - Extractor(label='series title', - path="//head/meta[@property='og:title']", - attrs=[Attribute(key='series title', - path="./@content")] - ), - - Extractor(label='seasons list', - path="//select[@id='bySeason']//option", - attrs=[Attribute(key='_seasons', - multi=True, - path="./@value")]), - - Extractor(label='selected season', - path="//select[@id='bySeason']//option[@selected]", - attrs=[Attribute(key='_current_season', - path='./@value')]), - - Extractor(label='episodes', - path=".", - group="//div[@class='info']", - group_key=".//meta/@content", - group_key_normalize=lambda x: 'episode %s' % x, - attrs=[Attribute(key=None, - multi=True, - path={ - "link": ".//strong//a[@href][1]/@href", - "original air date": ".//div[@class='airdate']/text()", - "title": ".//strong//text()", - "plot": ".//div[@class='item_description']//text()" - } - )] + Extractor( + label='series link', + path="//div[@class='parent']", + attrs=[ + Attribute( + key='series link', + path=".//a/@href" ) ] + ), + + Extractor( + label='series title', + path="//head/meta[@property='og:title']", + attrs=[ + Attribute( + key='series title', + path="./@content" + ) + ] + ), + + Extractor( + label='seasons list', + path="//select[@id='bySeason']//option", + attrs=[ + Attribute( + key='_seasons', + multi=True, + path="./@value" + ) + ] + ), + + Extractor( + label='selected season', + path="//select[@id='bySeason']//option[@selected]", + attrs=[ + Attribute( + key='_current_season', + path='./@value' + ) + ] + ), + + Extractor( + label='episodes', + path=".", + group="//div[@class='info']", + group_key=".//meta/@content", + group_key_normalize=lambda x: 'episode %s' % x, + attrs=[ + Attribute( + key=None, + multi=True, + path={ + "link": ".//strong//a[@href][1]/@href", + "original air date": ".//div[@class='airdate']/text()", + "title": ".//strong//text()", + "plot": ".//div[@class='item_description']//text()" + } + ) + ] + ) + ] def postprocess_data(self, data): series_id = analyze_imdbid(data.get('series link')) series_title = data.get('series title', '').strip() - selected_season = data.get('_current_season', - 'unknown season').strip() + selected_season = data.get('_current_season', 'unknown season').strip() if not (series_id and series_title): return {} series = 
Movie(title=series_title, movieID=str(series_id), - accessSystem=self._as, modFunct=self._modFunct) + accessSystem=self._as, modFunct=self._modFunct) if series.get('kind') == 'movie': series['kind'] = u'tv series' - try: selected_season = int(selected_season) - except: pass + try: + selected_season = int(selected_season) + except: + pass nd = {selected_season: {}} if 'episode -1' in data: - counter = 1 - for episode in data['episode -1']: - while 'episode %d' % counter in data: - counter += 1 - k = 'episode %d' % counter - data[k] = [episode] - del data['episode -1'] - for episode_nr, episode in data.iteritems(): + counter = 1 + for episode in data['episode -1']: + while 'episode %d' % counter in data: + counter += 1 + k = 'episode %d' % counter + data[k] = [episode] + del data['episode -1'] + for episode_nr, episode in data.items(): if not (episode and episode[0] and episode_nr.startswith('episode ')): continue episode = episode[0] episode_nr = episode_nr[8:].rstrip() - try: episode_nr = int(episode_nr) - except: pass + try: + episode_nr = int(episode_nr) + except: + pass episode_id = analyze_imdbid(episode.get('link' '')) - episode_air_date = episode.get('original air date', - '').strip() + episode_air_date = episode.get('original air date', '').strip() episode_title = episode.get('title', '').strip() episode_plot = episode.get('plot', '') if not (episode_nr is not None and episode_id and episode_title): continue ep_obj = Movie(movieID=episode_id, title=episode_title, - accessSystem=self._as, modFunct=self._modFunct) + accessSystem=self._as, modFunct=self._modFunct) ep_obj['kind'] = u'episode' ep_obj['episode of'] = series ep_obj['season'] = selected_season @@ -1565,10 +2230,11 @@ class DOMHTMLSeasonEpisodesParser(DOMParserBase): nd[selected_season][episode_nr] = ep_obj _seasons = data.get('_seasons') or [] for idx, season in enumerate(_seasons): - try: _seasons[idx] = int(season) - except: pass - return {'episodes': nd, '_seasons': _seasons, - '_current_season': selected_season} + try: + _seasons[idx] = int(season) + except: + pass + return {'episodes': nd, '_seasons': _seasons, '_current_season': selected_season} def _build_episode(x): @@ -1583,7 +2249,8 @@ def _build_episode(x): year = x.get('year') if year is not None: year = year[5:] - if year == 'unknown': year = u'????' + if year == 'unknown': + year = u'????' if year and year.isdigit(): year = int(year) e['year'] = year @@ -1607,7 +2274,7 @@ def _build_episode(x): class DOMHTMLEpisodesParser(DOMParserBase): """Parser for the "episode list" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
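For the season-based parser above (DOMHTMLSeasonEpisodesParser), each episode ends up as a Movie object nested in a per-season dictionary; a hypothetical sketch of its return value:

    # {'episodes': {3: {1: <Movie episode 1>, 2: <Movie episode 2>}},
    #  '_seasons': [1, 2, 3],
    #  '_current_season': 3}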
Example: @@ -1624,19 +2291,28 @@ class DOMHTMLEpisodesParser(DOMParserBase): def _init(self): self.extractors = [ - Extractor(label='series', + Extractor( + label='series', path="//html", - attrs=[Attribute(key='series title', - path=".//title/text()"), - Attribute(key='series movieID', - path=".//h1/a[@class='main']/@href", - postprocess=analyze_imdbid) - ]), - Extractor(label='episodes', + attrs=[ + Attribute( + key='series title', + path=".//title/text()" + ), + Attribute( + key='series movieID', + path=".//h1/a[@class='main']/@href", + postprocess=analyze_imdbid + ) + ] + ), + Extractor( + label='episodes', group="//div[@class='_imdbpy']/h3", group_key="./a/@name", path=self._episodes_path, - attrs=Attribute(key=None, + attrs=Attribute( + key=None, multi=True, path={ 'link': "./a/@href", @@ -1646,69 +2322,85 @@ class DOMHTMLEpisodesParser(DOMParserBase): 'oad': self._oad_path, 'plot': "./following-sibling::text()[1]" }, - postprocess=_build_episode))] + postprocess=_build_episode + ) + ) + ] + if self.kind == 'episodes cast': self.extractors += [ - Extractor(label='cast', + Extractor( + label='cast', group="//h4", group_key="./text()[1]", group_key_normalize=lambda x: x.strip(), path="./following-sibling::table[1]//td[@class='nm']", - attrs=Attribute(key=None, + attrs=Attribute( + key=None, multi=True, - path={'person': "..//text()", + path={ + 'person': "..//text()", 'link': "./a/@href", - 'roleID': \ - "../td[4]/div[@class='_imdbpyrole']/@roleid"}, - postprocess=lambda x: \ - build_person(x.get('person') or u'', - personID=analyze_imdbid(x.get('link')), - roleID=(x.get('roleID') or u'').split('/'), - accessSystem=self._as, - modFunct=self._modFunct))) - ] + 'roleID': "../td[4]/div[@class='_imdbpyrole']/@roleid" + }, + postprocess=lambda x: build_person( + x.get('person') or u'', + personID=analyze_imdbid(x.get('link')), + roleID=(x.get('roleID') or u'').split('/'), + accessSystem=self._as, + modFunct=self._modFunct + ) + ) + ) + ] preprocessors = [ - (re.compile('(
\n)(

)', re.I), - r'

\1
\2'), + (re.compile('(
\n)(

)', re.I), r'

\1
\2'), (re.compile('(

\n\n)
', re.I), r'\1'), (re.compile('

(.*?)

', re.I), r'

\1

'), (_reRolesMovie, _manageRoles), (re.compile('(

\n)(
)', re.I), r'\1
\2') - ] + ] def postprocess_data(self, data): # A bit extreme? - if not 'series title' in data: return {} - if not 'series movieID' in data: return {} + if 'series title' not in data: + return {} + if 'series movieID' not in data: + return {} stitle = data['series title'].replace('- Episode list', '') stitle = stitle.replace('- Episodes list', '') stitle = stitle.replace('- Episode cast', '') stitle = stitle.replace('- Episodes cast', '') stitle = stitle.strip() - if not stitle: return {} + if not stitle: + return {} seriesID = data['series movieID'] - if seriesID is None: return {} + if seriesID is None: + return {} series = Movie(title=stitle, movieID=str(seriesID), - accessSystem=self._as, modFunct=self._modFunct) + accessSystem=self._as, modFunct=self._modFunct) nd = {} for key in data.keys(): if key.startswith('filter-season-') or key.startswith('season-'): season_key = key.replace('filter-season-', '').replace('season-', '') - try: season_key = int(season_key) - except: pass + try: + season_key = int(season_key) + except: + pass nd[season_key] = {} ep_counter = 1 for episode in data[key]: - if not episode: continue + if not episode: + continue episode_key = episode.get('episode') - if episode_key is None: continue + if episode_key is None: + continue if not isinstance(episode_key, int): episode_key = ep_counter ep_counter += 1 - cast_key = 'Season %s, Episode %s:' % (season_key, - episode_key) - if data.has_key(cast_key): + cast_key = 'Season %s, Episode %s:' % (season_key, episode_key) + if cast_key in data: cast = data[cast_key] for i in xrange(len(cast)): cast[i].billingPos = i + 1 @@ -1723,7 +2415,7 @@ class DOMHTMLEpisodesParser(DOMParserBase): class DOMHTMLEpisodesCastParser(DOMHTMLEpisodesParser): """Parser for the "episodes cast" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -1738,7 +2430,7 @@ class DOMHTMLEpisodesCastParser(DOMHTMLEpisodesParser): class DOMHTMLFaqsParser(DOMParserBase): """Parser for the "FAQ" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -1750,31 +2442,36 @@ class DOMHTMLFaqsParser(DOMParserBase): # XXX: bsoup and lxml don't match (looks like a minor issue, anyway). extractors = [ - Extractor(label='faqs', + Extractor( + label='faqs', path="//div[@class='section']", - attrs=Attribute(key='faqs', + attrs=Attribute( + key='faqs', multi=True, path={ 'question': "./h3/a/span/text()", 'answer': "../following-sibling::div[1]//text()" }, - postprocess=lambda x: u'%s::%s' % (x.get('question').strip(), - '\n\n'.join(x.get('answer').replace( - '\n\n', '\n').strip().split('||'))))) - ] + postprocess=lambda x: u'%s::%s' % ( + x.get('question').strip(), + '\n\n'.join(x.get('answer').replace('\n\n', '\n').strip().split('||')) + ) + ) + ) + ] preprocessors = [ (re.compile('

<br/><br/>', re.I), r'||'), (re.compile('<h4>(.*?)</h4>
\n', re.I), r'||\1--'), (re.compile('(.*?)', re.I), r'[spoiler]\1[/spoiler]') - ] + ] class DOMHTMLAiringParser(DOMParserBase): """Parser for the "airing" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -1784,18 +2481,30 @@ class DOMHTMLAiringParser(DOMParserBase): _containsObjects = True extractors = [ - Extractor(label='series title', + Extractor( + label='series title', path="//title", - attrs=Attribute(key='series title', path="./text()", - postprocess=lambda x: \ - x.replace(' - TV schedule', u''))), - Extractor(label='series id', - path="//h1/a[@href]", - attrs=Attribute(key='series id', path="./@href")), + attrs=Attribute( + key='series title', + path="./text()", + postprocess=lambda x: x.replace(' - TV schedule', u'') + ) + ), - Extractor(label='tv airings', + Extractor( + label='series id', + path="//h1/a[@href]", + attrs=Attribute( + key='series id', + path="./@href" + ) + ), + + Extractor( + label='tv airings', path="//tr[@class]", - attrs=Attribute(key='airing', + attrs=Attribute( + key='airing', multi=True, path={ 'date': "./td[1]//text()", @@ -1804,7 +2513,7 @@ class DOMHTMLAiringParser(DOMParserBase): 'link': "./td[4]/a[1]/@href", 'title': "./td[4]//text()", 'season': "./td[5]//text()", - }, + }, postprocess=lambda x: { 'date': x.get('date'), 'time': x.get('time'), @@ -1812,8 +2521,9 @@ class DOMHTMLAiringParser(DOMParserBase): 'link': x.get('link'), 'title': x.get('title'), 'season': (x.get('season') or '').strip() - } - )) + } + ) + ) ] def postprocess_data(self, data): @@ -1853,7 +2563,7 @@ class DOMHTMLAiringParser(DOMParserBase): class DOMHTMLSynopsisParser(DOMParserBase): """Parser for the "synopsis" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -1861,22 +2571,26 @@ class DOMHTMLSynopsisParser(DOMParserBase): result = sparser.parse(synopsis_html_string) """ extractors = [ - Extractor(label='synopsis', - path="//div[@class='display'][not(@style)]", - attrs=Attribute(key='synopsis', + Extractor( + label='synopsis', + path="//ul[@id='plot-synopsis-content'][not(@style)]", + attrs=Attribute( + key='synopsis', path=".//text()", - postprocess=lambda x: '\n\n'.join(x.strip().split('||')))) + postprocess=lambda x: '\n\n'.join(x.strip().split('||')) + ) + ) ] preprocessors = [ (re.compile('
<br/><br/>
', re.I), r'||') - ] + ] class DOMHTMLParentsGuideParser(DOMParserBase): """Parser for the "parents guide" page of a given movie. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -1884,20 +2598,25 @@ class DOMHTMLParentsGuideParser(DOMParserBase): result = pgparser.parse(parentsguide_html_string) """ extractors = [ - Extractor(label='parents guide', + Extractor( + label='parents guide', group="//div[@class='section']", group_key="./h3/a/span/text()", group_key_normalize=lambda x: x.lower(), path="../following-sibling::div[1]/p", - attrs=Attribute(key=None, + attrs=Attribute( + key=None, path=".//text()", - postprocess=lambda x: [t.strip().replace('\n', ' ') - for t in x.split('||') if t.strip()])) + postprocess=lambda x: [ + t.strip().replace('\n', ' ') for t in x.split('||') if t.strip() + ] + ) + ) ] preprocessors = [ (re.compile('
<br/><br/>
', re.I), r'||') - ] + ] def postprocess_data(self, data): data2 = {} @@ -1910,49 +2629,41 @@ class DOMHTMLParentsGuideParser(DOMParserBase): _OBJECTS = { - 'movie_parser': ((DOMHTMLMovieParser,), None), - 'plot_parser': ((DOMHTMLPlotParser,), None), + 'movie_parser': ((DOMHTMLMovieParser,), None), + 'plot_parser': ((DOMHTMLPlotParser,), None), 'movie_awards_parser': ((DOMHTMLAwardsParser,), None), - 'taglines_parser': ((DOMHTMLTaglinesParser,), None), - 'keywords_parser': ((DOMHTMLKeywordsParser,), None), - 'crazycredits_parser': ((DOMHTMLCrazyCreditsParser,), None), - 'goofs_parser': ((DOMHTMLGoofsParser,), None), - 'alternateversions_parser': ((DOMHTMLAlternateVersionsParser,), None), - 'trivia_parser': ((DOMHTMLTriviaParser,), None), - 'soundtrack_parser': ((DOMHTMLSoundtrackParser,), None), - 'quotes_parser': ((DOMHTMLQuotesParser,), None), - 'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None), - 'ratings_parser': ((DOMHTMLRatingsParser,), None), - 'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None), - 'criticrev_parser': ((DOMHTMLCriticReviewsParser,), - {'kind': 'critic reviews'}), - 'externalrev_parser': ((DOMHTMLOfficialsitesParser,), - {'kind': 'external reviews'}), - 'newsgrouprev_parser': ((DOMHTMLOfficialsitesParser,), - {'kind': 'newsgroup reviews'}), - 'misclinks_parser': ((DOMHTMLOfficialsitesParser,), - {'kind': 'misc links'}), - 'soundclips_parser': ((DOMHTMLOfficialsitesParser,), - {'kind': 'sound clips'}), - 'videoclips_parser': ((DOMHTMLOfficialsitesParser,), - {'kind': 'video clips'}), - 'photosites_parser': ((DOMHTMLOfficialsitesParser,), - {'kind': 'photo sites'}), - 'connections_parser': ((DOMHTMLConnectionParser,), None), - 'tech_parser': ((DOMHTMLTechParser,), None), - 'business_parser': ((DOMHTMLTechParser,), - {'kind': 'business', '_defGetRefs': 1}), - 'literature_parser': ((DOMHTMLTechParser,), {'kind': 'literature'}), - 'locations_parser': ((DOMHTMLLocationsParser,), None), - 'rec_parser': ((DOMHTMLRecParser,), None), - 'news_parser': ((DOMHTMLNewsParser,), None), - 'episodes_parser': ((DOMHTMLEpisodesParser,), None), - 'season_episodes_parser': ((DOMHTMLSeasonEpisodesParser,), None), - 'episodes_cast_parser': ((DOMHTMLEpisodesCastParser,), None), - 'eprating_parser': ((DOMHTMLEpisodesRatings,), None), - 'movie_faqs_parser': ((DOMHTMLFaqsParser,), None), - 'airing_parser': ((DOMHTMLAiringParser,), None), - 'synopsis_parser': ((DOMHTMLSynopsisParser,), None), - 'parentsguide_parser': ((DOMHTMLParentsGuideParser,), None) + 'taglines_parser': ((DOMHTMLTaglinesParser,), None), + 'keywords_parser': ((DOMHTMLKeywordsParser,), None), + 'crazycredits_parser': ((DOMHTMLCrazyCreditsParser,), None), + 'goofs_parser': ((DOMHTMLGoofsParser,), None), + 'alternateversions_parser': ((DOMHTMLAlternateVersionsParser,), None), + 'trivia_parser': ((DOMHTMLTriviaParser,), None), + 'soundtrack_parser': ((DOMHTMLSoundtrackParser,), None), + 'quotes_parser': ((DOMHTMLQuotesParser,), None), + 'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None), + 'ratings_parser': ((DOMHTMLRatingsParser,), None), + 'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None), + 'criticrev_parser': ((DOMHTMLCriticReviewsParser,), {'kind': 'critic reviews'}), + 'reviews_parser': ((DOMHTMLReviewsParser,), {'kind': 'reviews'}), + 'externalsites_parser': ((DOMHTMLOfficialsitesParser,), None), + 'externalrev_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'external reviews'}), + 'misclinks_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'misc links'}), + 'soundclips_parser': 
((DOMHTMLOfficialsitesParser,), {'kind': 'sound clips'}), + 'videoclips_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'video clips'}), + 'photosites_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'photo sites'}), + 'connections_parser': ((DOMHTMLConnectionParser,), None), + 'tech_parser': ((DOMHTMLTechParser,), None), + 'business_parser': ((DOMHTMLBusinessParser,), {'kind': 'business', '_defGetRefs': 1}), + 'literature_parser': ((DOMHTMLBusinessParser,), None), + 'locations_parser': ((DOMHTMLLocationsParser,), None), + 'rec_parser': ((DOMHTMLRecParser,), None), + 'news_parser': ((DOMHTMLNewsParser,), None), + 'episodes_parser': ((DOMHTMLEpisodesParser,), None), + 'season_episodes_parser': ((DOMHTMLSeasonEpisodesParser,), None), + 'episodes_cast_parser': ((DOMHTMLEpisodesCastParser,), None), + 'eprating_parser': ((DOMHTMLEpisodesRatings,), None), + 'movie_faqs_parser': ((DOMHTMLFaqsParser,), None), + 'airing_parser': ((DOMHTMLAiringParser,), None), + 'synopsis_parser': ((DOMHTMLSynopsisParser,), None), + 'parentsguide_parser': ((DOMHTMLParentsGuideParser,), None) } - diff --git a/lib/imdb/parser/http/personParser.py b/lib/imdb/parser/http/personParser.py index caf8b2ef..ac02811f 100644 --- a/lib/imdb/parser/http/personParser.py +++ b/lib/imdb/parser/http/personParser.py @@ -2,10 +2,10 @@ parser.http.personParser module (imdb package). This module provides the classes (and the instances), used to parse -the IMDb pages on the akas.imdb.com server about a person. +the IMDb pages on the www.imdb.com server about a person. E.g., for "Mel Gibson" the referred pages would be: - categorized: http://akas.imdb.com/name/nm0000154/maindetails - biography: http://akas.imdb.com/name/nm0000154/bio + categorized: http://www.imdb.com/name/nm0000154/maindetails + biography: http://www.imdb.com/name/nm0000154/bio ...and so on... Copyright 2004-2013 Davide Alberani @@ -52,7 +52,7 @@ def build_date(date): class DOMHTMLMaindetailsParser(DOMParserBase): """Parser for the "categorized" (maindetails) page of a given person. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -192,7 +192,7 @@ class DOMHTMLMaindetailsParser(DOMParserBase): class DOMHTMLBioParser(DOMParserBase): """Parser for the "biography" page of a given person. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
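Birth and death information is read from the 'Date of Birth' and 'Date of Death' cells of the overview table and split into separate date/notes values; a rough sketch of typical result keys (hypothetical values, the exact set depends on the page):

    # {'birth date': u'...', 'birth notes': u'...',
    #  'height': u'...', 'mini biography': [u'<bio text>::<author or Anonymous>']}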
Example: @@ -225,92 +225,157 @@ class DOMHTMLBioParser(DOMParserBase): # TODO: check if this slicing is always correct postprocess=lambda x: u''.join(x).strip()[2:])] extractors = [ - Extractor(label='headshot', - path="//a[@name='headshot']", - attrs=Attribute(key='headshot', - path="./img/@src")), - Extractor(label='birth info', - path="//table[@id='overviewTable']//td[text()='Date of Birth']/following-sibling::td[1]", - attrs=_birth_attrs), - Extractor(label='death info', - path="//table[@id='overviewTable']//td[text()='Date of Death']/following-sibling::td[1]", - attrs=_death_attrs), - Extractor(label='nick names', - path="//table[@id='overviewTable']//td[text()='Nickenames']/following-sibling::td[1]", - attrs=Attribute(key='nick names', - path="./text()", - joiner='|', - postprocess=lambda x: [n.strip().replace(' (', - '::(', 1) for n in x.split('|') - if n.strip()])), - Extractor(label='birth name', - path="//table[@id='overviewTable']//td[text()='Birth Name']/following-sibling::td[1]", - attrs=Attribute(key='birth name', - path="./text()", - postprocess=lambda x: canonicalName(x.strip()))), - Extractor(label='height', - path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]", - attrs=Attribute(key='height', - path="./text()", - postprocess=lambda x: x.strip())), - Extractor(label='mini biography', - path="//a[@name='mini_bio']/following-sibling::div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]", - attrs=Attribute(key='mini biography', - multi=True, - path={ - 'bio': ".//text()", - 'by': ".//a[@name='ba']//text()" - }, - postprocess=lambda x: "%s::%s" % \ - ((x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(), - (x.get('by') or u'').strip() or u'Anonymous'))), - Extractor(label='spouse', - path="//div[h5='Spouse']/table/tr", - attrs=Attribute(key='spouse', - multi=True, - path={ - 'name': "./td[1]//text()", - 'info': "./td[2]//text()" - }, - postprocess=lambda x: ("%s::%s" % \ - (x.get('name').strip(), - (x.get('info') or u'').strip())).strip(':'))), - Extractor(label='trade mark', - path="//div[h5='Trade Mark']/p", - attrs=Attribute(key='trade mark', - multi=True, - path=".//text()", - postprocess=lambda x: x.strip())), - Extractor(label='trivia', - path="//div[h5='Trivia']/p", - attrs=Attribute(key='trivia', - multi=True, - path=".//text()", - postprocess=lambda x: x.strip())), - Extractor(label='quotes', - path="//div[h5='Personal Quotes']/p", - attrs=Attribute(key='quotes', - multi=True, - path=".//text()", - postprocess=lambda x: x.strip())), - Extractor(label='salary', - path="//div[h5='Salary']/table/tr", - attrs=Attribute(key='salary history', - multi=True, - path={ - 'title': "./td[1]//text()", - 'info': "./td[2]/text()", - }, - postprocess=lambda x: "%s::%s" % \ - (x.get('title').strip(), - x.get('info').strip()))), - Extractor(label='where now', - path="//div[h5='Where Are They Now']/p", - attrs=Attribute(key='where now', - multi=True, - path=".//text()", - postprocess=lambda x: x.strip())), - ] + Extractor( + label='headshot', + path="//a[@name='headshot']", + attrs=Attribute( + key='headshot', + path="./img/@src" + ) + ), + + Extractor( + label='birth info', + path="//table[@id='overviewTable']" + "//td[text()='Date of Birth']/following-sibling::td[1]", + attrs=_birth_attrs + ), + + Extractor( + label='death info', + path="//table[@id='overviewTable']" + "//td[text()='Date of Death']/following-sibling::td[1]", + attrs=_death_attrs + ), + + Extractor( + label='nick names', + path="//table[@id='overviewTable']" + 
"//td[text()='Nickenames']/following-sibling::td[1]", + attrs=Attribute( + key='nick names', + path="./text()", + joiner='|', + postprocess=lambda x: [n.strip().replace(' (', '::(', 1) for n in x.split('|') + if n.strip()] + ) + ), + + Extractor( + label='birth name', + path="//table[@id='overviewTable']" + "//td[text()='Birth Name']/following-sibling::td[1]", + attrs=Attribute( + key='birth name', + path="./text()", + postprocess=lambda x: canonicalName(x.strip()) + ) + ), + + Extractor( + label='height', + path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]", + attrs=Attribute( + key='height', + path="./text()", + postprocess=lambda x: x.strip() + ) + ), + + Extractor( + label='mini biography', + path="//a[@name='mini_bio']/following-sibling::" + "div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]", + attrs=Attribute( + key='mini biography', + multi=True, + path={ + 'bio': ".//text()", + 'by': ".//a[@name='ba']//text()" + }, + postprocess=lambda x: "%s::%s" % ( + (x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(), + (x.get('by') or u'').strip() or u'Anonymous' + ) + ) + ), + + Extractor( + label='spouse', + path="//div[h5='Spouse']/table/tr", + attrs=Attribute( + key='spouse', + multi=True, + path={ + 'name': "./td[1]//text()", + 'info': "./td[2]//text()" + }, + postprocess=lambda x: ("%s::%s" % ( + x.get('name').strip(), + (x.get('info') or u'').strip())).strip(':') + ) + ), + + Extractor( + label='trade mark', + path="//div[h5='Trade Mark']/p", + attrs=Attribute( + key='trade mark', + multi=True, + path=".//text()", + postprocess=lambda x: x.strip() + ) + ), + + Extractor( + label='trivia', + path="//div[h5='Trivia']/p", + attrs=Attribute( + key='trivia', + multi=True, + path=".//text()", + postprocess=lambda x: x.strip() + ) + ), + + Extractor( + label='quotes', + path="//div[h5='Personal Quotes']/p", + attrs=Attribute( + key='quotes', + multi=True, + path=".//text()", + postprocess=lambda x: x.strip() + ) + ), + + Extractor( + label='salary', + path="//div[h5='Salary']/table/tr", + attrs=Attribute( + key='salary history', + multi=True, + path={ + 'title': "./td[1]//text()", + 'info': "./td[2]/text()", + }, + postprocess=lambda x: "%s::%s" % ( + x.get('title').strip(), + x.get('info').strip()) + ) + ), + + Extractor( + label='where now', + path="//div[h5='Where Are They Now']/p", + attrs=Attribute( + key='where now', + multi=True, + path=".//text()", + postprocess=lambda x: x.strip() + ) + ) + ] preprocessors = [ (re.compile('(
)', re.I), r'
\1'), @@ -329,7 +394,7 @@ class DOMHTMLBioParser(DOMParserBase): class DOMHTMLResumeParser(DOMParserBase): """Parser for the "resume" page of a given person. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -406,13 +471,13 @@ class DOMHTMLResumeParser(DOMParserBase): continue if len(data[key][0]) == 3: for item in data[key]: - item[:] = [x for x in item if not x == None] + item[:] = [x for x in item if x is not None] continue if len(data[key][0]) == 2: new_key = {} for item in data[key]: - if item[0] == None: + if item[0] is None: continue if ':' in item[0]: if item[1].replace(item[0], '')[1:].strip() == '': @@ -422,15 +487,14 @@ class DOMHTMLResumeParser(DOMParserBase): new_key[item[0]] = item[1] data[key] = new_key - new_data = {} - new_data['resume'] = data + new_data = {'resume': data} return new_data class DOMHTMLOtherWorksParser(DOMParserBase): """Parser for the "other works" and "agent" pages of a given person. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -466,7 +530,7 @@ def _build_episode(link, title, minfo, role, roleA, roleAID): minidx = minfo.find(' -') # Sometimes, for some unknown reason, the role is left in minfo. if minidx != -1: - slfRole = minfo[minidx+3:].lstrip() + slfRole = minfo[minidx + 3:].lstrip() minfo = minfo[:minidx].rstrip() if slfRole.endswith(')'): commidx = slfRole.rfind('(') @@ -504,7 +568,7 @@ def _build_episode(link, title, minfo, role, roleA, roleAID): class DOMHTMLSeriesParser(DOMParserBase): """Parser for the "by TV series" page of a given person. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -559,7 +623,7 @@ class DOMHTMLSeriesParser(DOMParserBase): class DOMHTMLPersonGenresParser(DOMParserBase): """Parser for the "by genre" and "by keywords" pages of a given person. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: diff --git a/lib/imdb/parser/http/searchCharacterParser.py b/lib/imdb/parser/http/searchCharacterParser.py index 5f281fa0..fde5e5a3 100644 --- a/lib/imdb/parser/http/searchCharacterParser.py +++ b/lib/imdb/parser/http/searchCharacterParser.py @@ -5,7 +5,7 @@ This module provides the HTMLSearchCharacterParser class (and the search_character_parser instance), used to parse the results of a search for a given character. E.g., when searching for the name "Jesse James", the parsed page would be: - http://akas.imdb.com/find?s=ch;mx=20;q=Jesse+James + http://www.imdb.com/find?s=ch;mx=20;q=Jesse+James Copyright 2007-2012 Davide Alberani 2008 H.
Turgut Uyar diff --git a/lib/imdb/parser/http/searchCompanyParser.py b/lib/imdb/parser/http/searchCompanyParser.py index 40ea8a72..918591c4 100644 --- a/lib/imdb/parser/http/searchCompanyParser.py +++ b/lib/imdb/parser/http/searchCompanyParser.py @@ -5,7 +5,7 @@ This module provides the HTMLSearchCompanyParser class (and the search_company_parser instance), used to parse the results of a search for a given company. E.g., when searching for the name "Columbia Pictures", the parsed page would be: - http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures + http://www.imdb.com/find?s=co;mx=20;q=Columbia+Pictures Copyright 2008-2012 Davide Alberani 2008 H. Turgut Uyar @@ -46,22 +46,29 @@ class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser): _titleBuilder = lambda self, x: build_company_name(x) _linkPrefix = '/company/co' - _attrs = [Attribute(key='data', - multi=True, - path={ - 'link': "./a[1]/@href", - 'name': "./a[1]/text()", - 'notes': "./text()[1]" - }, - postprocess=lambda x: ( - analyze_imdbid(x.get('link')), - analyze_company_name(x.get('name')+(x.get('notes') - or u''), stripNotes=True) - ))] - extractors = [Extractor(label='search', - path="//td[@class='result_text']/a[starts-with(@href, " \ - "'/company/co')]/..", - attrs=_attrs)] + _attrs = [ + Attribute( + key='data', + multi=True, + path={ + 'link': "./a[1]/@href", + 'name': "./a[1]/text()", + 'notes': "./text()[1]" + }, + postprocess=lambda x: ( + analyze_imdbid(x.get('link')), + analyze_company_name(x.get('name') + (x.get('notes') or u''), stripNotes=True) + ) + ) + ] + + extractors = [ + Extractor( + label='search', + path="//td[@class='result_text']/a[starts-with(@href, '/company/co')]/..", + attrs=_attrs + ) + ] _OBJECTS = { diff --git a/lib/imdb/parser/http/searchKeywordParser.py b/lib/imdb/parser/http/searchKeywordParser.py index 4161fa48..3c6c2b41 100644 --- a/lib/imdb/parser/http/searchKeywordParser.py +++ b/lib/imdb/parser/http/searchKeywordParser.py @@ -5,7 +5,7 @@ This module provides the HTMLSearchKeywordParser class (and the search_company_parser instance), used to parse the results of a search for a given keyword. E.g., when searching for the keyword "alabama", the parsed page would be: - http://akas.imdb.com/find?s=kw;mx=20;q=alabama + http://www.imdb.com/find?s=kw;mx=20;q=alabama Copyright 2009 Davide Alberani diff --git a/lib/imdb/parser/http/searchMovieParser.py b/lib/imdb/parser/http/searchMovieParser.py index 781610cf..11c3dfef 100644 --- a/lib/imdb/parser/http/searchMovieParser.py +++ b/lib/imdb/parser/http/searchMovieParser.py @@ -6,7 +6,7 @@ search_movie_parser instance), used to parse the results of a search for a given title. E.g., for when searching for the title "the passion", the parsed page would be: - http://akas.imdb.com/find?q=the+passion&tt=on&mx=20 + http://www.imdb.com/find?q=the+passion&tt=on&mx=20 Copyright 2004-2013 Davide Alberani 2008 H. Turgut Uyar @@ -67,7 +67,7 @@ class DOMBasicMovieParser(DOMParserBase): data = [] else: link = data.pop('link') - if (link and data): + if link and data: data = [(link, data)] else: data = [] diff --git a/lib/imdb/parser/http/searchPersonParser.py b/lib/imdb/parser/http/searchPersonParser.py index 2dd26941..be532f35 100644 --- a/lib/imdb/parser/http/searchPersonParser.py +++ b/lib/imdb/parser/http/searchPersonParser.py @@ -5,7 +5,7 @@ This module provides the HTMLSearchPersonParser class (and the search_person_parser instance), used to parse the results of a search for a given person. 
E.g., when searching for the name "Mel Gibson", the parsed page would be: - http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20 + http://www.imdb.com/find?q=Mel+Gibson&nm=on&mx=20 Copyright 2004-2013 Davide Alberani 2008 H. Turgut Uyar diff --git a/lib/imdb/parser/http/topBottomParser.py b/lib/imdb/parser/http/topBottomParser.py index 1b8bb9f0..5d79a86f 100644 --- a/lib/imdb/parser/http/topBottomParser.py +++ b/lib/imdb/parser/http/topBottomParser.py @@ -4,8 +4,8 @@ parser.http.topBottomParser module (imdb package). This module provides the classes (and the instances), used to parse the lists of top 250 and bottom 100 movies. E.g.: - http://akas.imdb.com/chart/top - http://akas.imdb.com/chart/bottom + http://www.imdb.com/chart/top + http://www.imdb.com/chart/bottom Copyright 2009-2015 Davide Alberani @@ -31,7 +31,7 @@ from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid class DOMHTMLTop250Parser(DOMParserBase): """Parser for the "top 250" page. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: @@ -42,17 +42,24 @@ class DOMHTMLTop250Parser(DOMParserBase): ranktext = 'top 250 rank' def _init(self): - self.extractors = [Extractor(label=self.label, - path="//div[@id='main']//div[1]//div//table//tbody//tr", - attrs=Attribute(key=None, - multi=True, - path={self.ranktext: "./td[2]//text()", - 'rating': "./td[3]//strong//text()", - 'title': "./td[2]//a//text()", - 'year': "./td[2]//span//text()", - 'movieID': "./td[2]//a/@href", - 'votes': "./td[3]//strong/@title" - }))] + self.extractors = [ + Extractor( + label=self.label, + path="//div[@id='main']//div[1]//div//table//tbody//tr", + attrs=Attribute( + key=None, + multi=True, + path={ + self.ranktext: "./td[2]/text()", + 'rating': "./td[3]//strong//text()", + 'title': "./td[2]//a//text()", + 'year': "./td[2]//span//text()", + 'movieID': "./td[2]//a/@href", + 'votes': "./td[3]//strong/@title" + } + ) + ) + ] def postprocess_data(self, data): if not data or self.label not in data: @@ -73,9 +80,11 @@ class DOMHTMLTop250Parser(DOMParserBase): if theID in seenIDs: continue seenIDs.append(theID) - minfo = analyze_title(d['title']+" "+d['year']) - try: minfo[self.ranktext] = int(d[self.ranktext].replace('.', '')) - except: pass + minfo = analyze_title(d['title'] + ' ' + d['year']) + try: + minfo[self.ranktext] = int(d[self.ranktext].replace('.', '')) + except: + pass if 'votes' in d: try: votes = d['votes'].replace(' votes','') @@ -93,7 +102,7 @@ class DOMHTMLTop250Parser(DOMParserBase): class DOMHTMLBottom100Parser(DOMHTMLTop250Parser): """Parser for the "bottom 100" page. The page should be provided as a string, as taken from - the akas.imdb.com server. The final result will be a + the www.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: diff --git a/lib/imdb/parser/http/utils.py b/lib/imdb/parser/http/utils.py index 5aefb3ce..ae51c432 100644 --- a/lib/imdb/parser/http/utils.py +++ b/lib/imdb/parser/http/utils.py @@ -35,7 +35,9 @@ from imdb.Character import Character # Year, imdbIndex and kind. 
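
An aside on the top/bottom chart parsers updated above: each table row is extracted as a dict of raw strings, and postprocess_data() then coerces the rank and the vote count to integers, silently skipping values that fail to parse. A minimal sketch of that coercion follows; the row dict is hypothetical, and the final vote-count conversion falls outside the quoted hunk, so the comma handling is an assumption:

    # Sketch only: mirrors the coercion in DOMHTMLTop250Parser.postprocess_data().
    d = {'top 250 rank': '42.', 'votes': '1,234,567 votes'}  # hypothetical row
    minfo = {}
    try:
        minfo['top 250 rank'] = int(d['top 250 rank'].replace('.', ''))
    except ValueError:
        pass
    if 'votes' in d:
        try:
            votes = d['votes'].replace(' votes', '')
            minfo['votes'] = int(votes.replace(',', ''))  # assumed comma handling
        except ValueError:
            pass
    # minfo is now {'top 250 rank': 42, 'votes': 1234567}
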
-re_yearKind_index = re.compile(r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)') +re_yearKind_index = re.compile( + r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)' +) # Match imdb ids in href tags re_imdbid = re.compile(r'(title/tt|name/nm|character/ch|company/co)([0-9]+)') @@ -304,7 +306,7 @@ def build_movie(txt, movieID=None, roleID=None, status=None, elif title[-14:] == 'TV mini-series': title = title[:-14] + ' (mini)' if title and title.endswith(_defSep.rstrip()): - title = title[:-len(_defSep)+1] + title = title[:-len(_defSep) + 1] # Try to understand where the movie title ends. while True: if year: @@ -320,18 +322,17 @@ def build_movie(txt, movieID=None, roleID=None, status=None, # Try to match paired parentheses; yes: sometimes there are # parentheses inside comments... nidx = title.rfind('(') - while (nidx != -1 and \ - title[nidx:].count('(') != title[nidx:].count(')')): + while nidx != -1 and title[nidx:].count('(') != title[nidx:].count(')'): nidx = title[:nidx].rfind('(') # Unbalanced parentheses: stop here. if nidx == -1: break # The last item in parentheses seems to be a year: stop here. - first4 = title[nidx+1:nidx+5] - if (first4.isdigit() or first4 == '????') and \ - title[nidx+5:nidx+6] in (')', '/'): break + first4 = title[nidx + 1:nidx + 5] + if (first4.isdigit() or first4 == '????') and title[nidx + 5:nidx + 6] in (')', '/'): + break # The last item in parentheses is a known kind: stop here. - if title[nidx+1:-1] in ('TV', 'V', 'mini', 'VG', 'TV movie', - 'TV series', 'short'): break + if title[nidx + 1:-1] in ('TV', 'V', 'mini', 'VG', 'TV movie', 'TV series', 'short'): + break # Else, in parentheses there are some notes. # XXX: should the notes in the role half be kept separated # from the notes in the movie title half? @@ -471,8 +472,8 @@ class DOMParserBase(object): if _gotError: warnings.warn('falling back to "%s"' % mod) break - except ImportError, e: - if idx+1 >= nrMods: + except ImportError as e: + if idx + 1 >= nrMods: # Raise the exception, if we don't have any more # options to try. 
raise IMDbError('unable to use any parser in %s: %s' % \ @@ -786,10 +787,10 @@ class Extractor(object): def __repr__(self): """String representation of an Extractor object.""" - r = '<Extractor id:%s (label: %s, path: %s, attrs: %s, group: %s, group_key: %s group_key_normalize: %s)>' % (id(self), - self.label, self.path, repr(self.attrs), self.group, - self.group_key, self.group_key_normalize) + t = '<Extractor id:%s (label: %s, path: %s, attrs: %s, group: %s, group_key: %s group_key_normalize: %s)>' + r = t % (id(self), self.label, self.path, repr(self.attrs), self.group, + self.group_key, self.group_key_normalize) return r @@ -825,7 +826,7 @@ def _parse_ref(text, link, info): yearK = re_yearKind_index.match(info) if yearK and yearK.start() == 0: text += ' %s' % info[:yearK.end()] - return (text.replace('\n', ' '), link) + return text.replace('\n', ' '), link class GatherRefs(DOMParserBase): diff --git a/lib/imdb/parser/sql/__init__.py b/lib/imdb/parser/sql/__init__.py index 8b3166e7..4e085e4a 100644 --- a/lib/imdb/parser/sql/__init__.py +++ b/lib/imdb/parser/sql/__init__.py @@ -687,7 +687,7 @@ class IMDbSqlAccessSystem(IMDbBase): elif isinstance(o, dict): for value in o.values(): self._findRefs(value, trefs, nrefs) - return (trefs, nrefs) + return trefs, nrefs def _extractRefs(self, o): """Scan for titles or names references in strings.""" @@ -702,7 +702,7 @@ class IMDbSqlAccessSystem(IMDbBase): "imdb.parser.sql.IMDbSqlAccessSystem; " "if it's not a recursion limit exceeded and we're not " "running in a Symbian environment, it's a bug:\n%s" % e) - return (trefs, nrefs) + return trefs, nrefs def _changeAKAencoding(self, akanotes, akatitle): """Return akatitle in the correct charset, as specified in diff --git a/lib/imdb/parser/sql/alchemyadapter.py b/lib/imdb/parser/sql/alchemyadapter.py index eb14e76c..b1bf9ee2 100644 --- a/lib/imdb/parser/sql/alchemyadapter.py +++ b/lib/imdb/parser/sql/alchemyadapter.py @@ -437,11 +437,13 @@ def ISNULL(x): """Emulate SQLObject's ISNULL.""" # XXX: Should we use null()? Can null() be a global instance? # XXX: Is it safe to test None with the == operator, in this case?
- return x == None + return x is None + def ISNOTNULL(x): """Emulate SQLObject's ISNOTNULL.""" - return x != None + return x is not None + def CONTAINSSTRING(expr, pattern): """Emulate SQLObject's CONTAINSSTRING.""" diff --git a/lib/imdb/parser/sql/dbschema.py b/lib/imdb/parser/sql/dbschema.py index 2f359fba..a97e5c47 100644 --- a/lib/imdb/parser/sql/dbschema.py +++ b/lib/imdb/parser/sql/dbschema.py @@ -122,53 +122,80 @@ class DBTable(object): # Default values to insert in some tables: {'column': (list, of, values, ...)} -kindTypeDefs = {'kind': ('movie', 'tv series', 'tv movie', 'video movie', - 'tv mini series', 'video game', 'episode')} -companyTypeDefs = {'kind': ('distributors', 'production companies', - 'special effects companies', 'miscellaneous companies')} -infoTypeDefs = {'info': ('runtimes', 'color info', 'genres', 'languages', - 'certificates', 'sound mix', 'tech info', 'countries', 'taglines', - 'keywords', 'alternate versions', 'crazy credits', 'goofs', - 'soundtrack', 'quotes', 'release dates', 'trivia', 'locations', - 'mini biography', 'birth notes', 'birth date', 'height', - 'death date', 'spouse', 'other works', 'birth name', - 'salary history', 'nick names', 'books', 'agent address', - 'biographical movies', 'portrayed in', 'where now', 'trade mark', - 'interviews', 'article', 'magazine cover photo', 'pictorial', - 'death notes', 'LD disc format', 'LD year', 'LD digital sound', - 'LD official retail price', 'LD frequency response', 'LD pressing plant', - 'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date', - 'LD production country', 'LD contrast', 'LD color rendition', - 'LD picture format', 'LD video noise', 'LD video artifacts', - 'LD release country', 'LD sharpness', 'LD dynamic range', - 'LD audio noise', 'LD color information', 'LD group genre', - 'LD quality program', 'LD close captions-teletext-ld-g', - 'LD category', 'LD analog left', 'LD certification', - 'LD audio quality', 'LD video quality', 'LD aspect ratio', - 'LD analog right', 'LD additional information', - 'LD number of chapter stops', 'LD dialogue intellegibility', - 'LD disc size', 'LD master format', 'LD subtitles', - 'LD status of availablility', 'LD quality of source', - 'LD number of sides', 'LD video standard', 'LD supplement', - 'LD original title', 'LD sound encoding', 'LD number', 'LD label', - 'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay', - 'novel', 'adaption', 'book', 'production process protocol', - 'printed media reviews', 'essays', 'other literature', 'mpaa', - 'plot', 'votes distribution', 'votes', 'rating', - 'production dates', 'copyright holder', 'filming dates', 'budget', - 'weekend gross', 'gross', 'opening weekend', 'rentals', - 'admissions', 'studios', 'top 250 rank', 'bottom 10 rank')} -compCastTypeDefs = {'kind': ('cast', 'crew', 'complete', 'complete+verified')} -linkTypeDefs = {'link': ('follows', 'followed by', 'remake of', 'remade as', - 'references', 'referenced in', 'spoofs', 'spoofed in', - 'features', 'featured in', 'spin off from', 'spin off', - 'version of', 'similar to', 'edited into', - 'edited from', 'alternate language version of', - 'unknown link')} -roleTypeDefs = {'role': ('actor', 'actress', 'producer', 'writer', - 'cinematographer', 'composer', 'costume designer', - 'director', 'editor', 'miscellaneous crew', - 'production designer', 'guest')} +kindTypeDefs = { + 'kind': ( + 'movie', 'tv series', 'tv movie', 'video movie', + 'tv mini series', 'video game', 'episode', 'short', 'tv short' + ) +} + +companyTypeDefs = { + 
'kind': ( + 'distributors', 'production companies', + 'special effects companies', 'miscellaneous companies' + ) +} + +infoTypeDefs = { + 'info': ( + 'runtimes', 'color info', 'genres', 'languages', + 'certificates', 'sound mix', 'tech info', 'countries', 'taglines', + 'keywords', 'alternate versions', 'crazy credits', 'goofs', + 'soundtrack', 'quotes', 'release dates', 'trivia', 'locations', + 'mini biography', 'birth notes', 'birth date', 'height', + 'death date', 'spouse', 'other works', 'birth name', + 'salary history', 'nick names', 'books', 'agent address', + 'biographical movies', 'portrayed in', 'where now', 'trade mark', + 'interviews', 'article', 'magazine cover photo', 'pictorial', + 'death notes', 'LD disc format', 'LD year', 'LD digital sound', + 'LD official retail price', 'LD frequency response', 'LD pressing plant', + 'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date', + 'LD production country', 'LD contrast', 'LD color rendition', + 'LD picture format', 'LD video noise', 'LD video artifacts', + 'LD release country', 'LD sharpness', 'LD dynamic range', + 'LD audio noise', 'LD color information', 'LD group genre', + 'LD quality program', 'LD close captions-teletext-ld-g', + 'LD category', 'LD analog left', 'LD certification', + 'LD audio quality', 'LD video quality', 'LD aspect ratio', + 'LD analog right', 'LD additional information', + 'LD number of chapter stops', 'LD dialogue intellegibility', + 'LD disc size', 'LD master format', 'LD subtitles', + 'LD status of availablility', 'LD quality of source', + 'LD number of sides', 'LD video standard', 'LD supplement', + 'LD original title', 'LD sound encoding', 'LD number', 'LD label', + 'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay', + 'novel', 'adaption', 'book', 'production process protocol', + 'printed media reviews', 'essays', 'other literature', 'mpaa', + 'plot', 'votes distribution', 'votes', 'rating', + 'production dates', 'copyright holder', 'filming dates', 'budget', + 'weekend gross', 'gross', 'opening weekend', 'rentals', + 'admissions', 'studios', 'top 250 rank', 'bottom 10 rank' + ) +} + +compCastTypeDefs = { + 'kind': ('cast', 'crew', 'complete', 'complete+verified') +} + +linkTypeDefs = { + 'link': ( + 'follows', 'followed by', 'remake of', 'remade as', + 'references', 'referenced in', 'spoofs', 'spoofed in', + 'features', 'featured in', 'spin off from', 'spin off', + 'version of', 'similar to', 'edited into', + 'edited from', 'alternate language version of', + 'unknown link' + ) +} + +roleTypeDefs = { + 'role': ( + 'actor', 'actress', 'producer', 'writer', + 'cinematographer', 'composer', 'costume designer', + 'director', 'editor', 'miscellaneous crew', + 'production designer', 'guest' + ) +} # Schema of tables in our database. # XXX: Foreign keys can be used to create constrains between tables, @@ -186,7 +213,7 @@ DB_SCHEMA = [ # the alternateID attribute here will be ignored by SQLAlchemy. DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6), - DBCol('imdbIndex', UNICODECOL, length=12, default=None), + DBCol('imdbIndex', STRINGCOL, length=12, default=None), DBCol('imdbID', INTCOL, default=None, index='idx_imdb_id'), DBCol('gender', STRINGCOL, length=1, default=None), DBCol('namePcodeCf', STRINGCOL, length=5, default=None, @@ -204,7 +231,7 @@ DB_SCHEMA = [ # from namePcodeNf. 
DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6), - DBCol('imdbIndex', UNICODECOL, length=12, default=None), + DBCol('imdbIndex', STRINGCOL, length=12, default=None), DBCol('imdbID', INTCOL, default=None), DBCol('namePcodeNf', STRINGCOL, length=5, default=None, index='idx_pcodenf'), @@ -218,7 +245,7 @@ DB_SCHEMA = [ # namePcodeSf is the soundex of the name plus the country code. DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6), - DBCol('countryCode', UNICODECOL, length=255, default=None), + DBCol('countryCode', STRINGCOL, length=255, default=None), DBCol('imdbID', INTCOL, default=None), DBCol('namePcodeNf', STRINGCOL, length=5, default=None, index='idx_pcodenf'), @@ -237,7 +264,7 @@ DB_SCHEMA = [ DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('title', UNICODECOL, notNone=True, index='idx_title', indexLen=10), - DBCol('imdbIndex', UNICODECOL, length=12, default=None), + DBCol('imdbIndex', STRINGCOL, length=12, default=None), DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'), DBCol('productionYear', INTCOL, default=None), DBCol('imdbID', INTCOL, default=None, index="idx_imdb_id"), @@ -264,7 +291,7 @@ DB_SCHEMA = [ DBCol('personID', INTCOL, notNone=True, index='idx_person', foreignKey='Name'), DBCol('name', UNICODECOL, notNone=True), - DBCol('imdbIndex', UNICODECOL, length=12, default=None), + DBCol('imdbIndex', STRINGCOL, length=12, default=None), DBCol('namePcodeCf', STRINGCOL, length=5, default=None, index='idx_pcodecf'), DBCol('namePcodeNf', STRINGCOL, length=5, default=None, @@ -291,7 +318,7 @@ DB_SCHEMA = [ DBCol('movieID', INTCOL, notNone=True, index='idx_movieid', foreignKey='Title'), DBCol('title', UNICODECOL, notNone=True), - DBCol('imdbIndex', UNICODECOL, length=12, default=None), + DBCol('imdbIndex', STRINGCOL, length=12, default=None), DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'), DBCol('productionYear', INTCOL, default=None), DBCol('phoneticCode', STRINGCOL, length=5, default=None, diff --git a/lib/imdb/utils.py b/lib/imdb/utils.py index 284edae0..4b5a81be 100644 --- a/lib/imdb/utils.py +++ b/lib/imdb/utils.py @@ -42,8 +42,23 @@ _utils_logger = logging.getLogger('imdbpy.utils') # and year of release. # XXX: probably L, C, D and M are far too much! ;-) re_year_index = re.compile(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)') -re_extended_year_index = re.compile(r'\((TV episode|TV Series|TV mini-series|TV|Video|Video Game)? ?((?:[0-9\?]{4})(?:-[0-9\?]{4})?)(?:/([IVXLCDM]+)?)?\)') -re_remove_kind = re.compile(r'\((TV episode|TV Series|TV mini-series|TV|Video|Video Game)? ?') +re_m_episode = re.compile(r'\(TV Episode\)\s+-\s+', re.I) +re_m_series = re.compile(r'Season\s+\d+\s+\|\s+Episode\s+\d+\s+-', re.I) +re_m_imdbIndex = re.compile(r'\(([IVXLCDM]+)\)') +re_m_kind = re.compile( + r'\((TV episode|TV Series|TV mini-series|mini|TV|Video|Video Game|VG|Short|TV Movie|TV Short|V)\)', + re.I) + +KIND_MAP = { + 'tv': 'tv movie', + 'tv episode': 'episode', + 'v': 'video movie', + 'video': 'video movie', + 'vg': 'video game', + 'mini': 'tv mini series', + 'tv mini-series': 'tv mini series', + 'tv miniseries': 'tv mini series' +} # Match only the imdbIndex (for name strings). re_index = re.compile(r'^\(([IVXLCDM]+)\)$') @@ -283,13 +297,6 @@ def _split_series_episode(title): # that means this is an episode title, as returned by # the web server.
series_title = title[:second_quot] - ##elif episode_or_year[-1:] == '}': - ## # Title of the episode, as in the plain text data files. - ## begin_eps = episode_or_year.find('{') - ## if begin_eps == -1: return series_title, episode_or_year - ## series_title = title[:second_quot+begin_eps].rstrip() - ## # episode_or_year is returned with the {...} - ## episode_or_year = episode_or_year[begin_eps:] return series_title, episode_or_year @@ -383,65 +390,24 @@ def analyze_title(title, canonical=None, canonicalSeries=None, # tv mini series: 5,497 # video game: 5,490 # More up-to-date statistics: http://us.imdb.com/database_statistics - if title.endswith('(TV)'): - kind = u'tv movie' - title = title[:-4].rstrip() - elif title.endswith('(TV Movie)'): - kind = u'tv movie' - title = title[:-10].rstrip() - elif title.endswith('(V)'): - kind = u'video movie' - title = title[:-3].rstrip() - elif title.lower().endswith('(video)'): - kind = u'video movie' - title = title[:-7].rstrip() - elif title.endswith('(TV Short)'): - kind = u'tv short' - title = title[:-10].rstrip() - elif title.endswith('(TV Mini-Series)'): - kind = u'tv mini series' - title = title[:-16].rstrip() - elif title.endswith('(mini)'): - kind = u'tv mini series' - title = title[:-6].rstrip() - elif title.endswith('(VG)'): - kind = u'video game' - title = title[:-4].rstrip() - elif title.endswith('(Video Game)'): - kind = u'video game' - title = title[:-12].rstrip() - elif title.endswith('(TV Series)'): - epindex = title.find('(TV Episode) - ') - if epindex >= 0: - # It's an episode of a series. - kind = u'episode' - series_info = analyze_title(title[epindex + 15:]) - result['episode of'] = series_info.get('title') - result['series year'] = series_info.get('year') - title = title[:epindex] - else: - kind = u'tv series' - title = title[:-11].rstrip() + epindex = re_m_episode.search(title) + if epindex: + # It's an episode of a series. + kind = 'episode' + series_title = title[epindex.end():] + series_title = re_m_series.sub('', series_title) + series_info = analyze_title(series_title) + result['episode of'] = series_info.get('title') + result['series year'] = series_info.get('year') + title = title[:epindex.start()].strip() + else: + detected_kind = re_m_kind.findall(title) + if detected_kind: + kind = detected_kind[-1].lower().replace('-', '') + kind = KIND_MAP.get(kind, kind) + title = re_m_kind.sub('', title).strip() # Search for the year and the optional imdbIndex (a roman number). yi = re_year_index.findall(title) - if not yi: - yi = re_extended_year_index.findall(title) - if yi: - yk, yiy, yii = yi[-1] - yi = [(yiy, yii)] - if yk == 'TV episode': - kind = u'episode' - elif yk in ('TV', 'TV Movie'): - kind = u'tv movie' - elif yk == 'TV Series': - kind = u'tv series' - elif yk == 'Video': - kind = u'video movie' - elif yk in ('TV mini-series', 'TV Mini-Series'): - kind = u'tv mini series' - elif yk == 'Video Game': - kind = u'video game' - title = re_remove_kind.sub('(', title) if yi: last_yi = yi[-1] year = last_yi[0] @@ -450,7 +416,12 @@ def analyze_title(title, canonical=None, canonicalSeries=None, year = year[:-len(imdbIndex)-1] i = title.rfind('(%s)' % last_yi[0]) if i != -1: - title = title[:i-1].rstrip() + title = title[:i - 1].rstrip() + if not imdbIndex: + detect_imdbIndex = re_m_imdbIndex.findall(title) + if detect_imdbIndex: + imdbIndex = detect_imdbIndex[-1] + title = re_m_imdbIndex.sub('', title).strip() # This is a tv (mini) series: strip the '"' at the begin and at the end. 
# XXX: strip('"') is not used for compatibility with Python 2.0. if title and title[0] == title[-1] == '"': @@ -464,8 +435,6 @@ def analyze_title(title, canonical=None, canonicalSeries=None, title = canonicalTitle(title) else: title = normalizeTitle(title) - # 'kind' is one in ('movie', 'episode', 'tv series', 'tv mini series', - # 'tv movie', 'video movie', 'video game') result['title'] = title result['kind'] = kind or u'movie' if year and year != '????': @@ -832,7 +801,7 @@ def date_and_notes(s): """Parse (birth|death) date and notes; returns a tuple in the form (date, notes).""" s = s.strip() - if not s: return (u'', u'') + if not s: return u'', u'' notes = u'' if s[0].isdigit() or s.split()[0].lower() in ('c.', 'january', 'february', 'march', 'april', 'may', 'june', @@ -990,7 +959,7 @@ def _tag4TON(ton, addAccessSystem=False, _containerOnly=False): beginTag += extras if ton.notes: beginTag += u'%s' % _normalizeValue(ton.notes) - return (beginTag, u'' % tag) + return beginTag, u'' % tag TAGS_TO_MODIFY = { @@ -1264,8 +1233,8 @@ class _Container(object): self.__role = role currentRole = property(_get_currentRole, _set_currentRole, - doc="The role of a Person in a Movie" + \ - " or the interpreter of a Character in a Movie.") + doc="The role of a Person in a Movie" + " or the interpreter of a Character in a Movie.") def _init(self, **kwds): pass @@ -1478,10 +1447,10 @@ class _Container(object): except RuntimeError, e: # Symbian/python 2.2 has a poor regexp implementation. import warnings - warnings.warn('RuntimeError in ' - "imdb.utils._Container.__getitem__; if it's not " - "a recursion limit exceeded and we're not running " - "in a Symbian environment, it's a bug:\n%s" % e) + warnings.warn("RuntimeError in imdb.utils._Container.__getitem__;" + " if it's not a recursion limit exceeded and we're" + " not running in a Symbian environment, it's a" + " bug:\n%s" % e) return rawData def __setitem__(self, key, item):