Merge branch 'feature/UpdateIMDb' into develop

commit 655b8e422a
Author: JackDandy
Date:   2018-03-28 00:45:35 +01:00
24 changed files with 1992 additions and 1184 deletions


@@ -7,6 +7,7 @@
 * Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed
 * Change Hachoir can't support PY2 so backport their PY3 to prevent a need for system dependant external binaries like mediainfo
 * Update html5lib 0.99999999/1.0b9 (1a28d72) to 1.1-dev (e9ef538)
+* Update IMDb 5.1 (r907) to 5.2.1dev20171113 (f640595)
 [develop changelog]


@@ -23,8 +23,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 from copy import deepcopy
-from imdb.utils import analyze_company_name, build_company_name, \
-                       flatten, _Container, cmpCompanies
+from imdb.utils import _Container
+from imdb.utils import analyze_company_name, build_company_name, cmpCompanies, flatten
 class Company(_Container):


@@ -24,8 +24,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 from copy import deepcopy
 from imdb import linguistics
-from imdb.utils import analyze_title, build_title, canonicalTitle, \
-                       flatten, _Container, cmpMovies
+from imdb.utils import _Container
+from imdb.utils import analyze_title, build_title, canonicalTitle, cmpMovies, flatten
 class Movie(_Container):


@@ -6,7 +6,7 @@ a person from the IMDb database.
 It can fetch data through different media (e.g.: the IMDb web pages,
 a SQL database, etc.)
-Copyright 2004-2016 Davide Alberani <da@erlug.linux.it>
+Copyright 2004-2018 Davide Alberani <da@erlug.linux.it>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -25,12 +25,25 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 __all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
            'available_access_systems']
-__version__ = VERSION = '5.1'
+__version__ = VERSION = '5.2.1dev20171113'
+
+VERSION_NOTICE = """This is the imdbpy-legacy branch of IMDbPY, and requires Python 2.
+Please notice that this version is mostly unsupported.
+For a version compatible with Python 3, see the master branch:
+https://github.com/alberanid/imdbpy/
+"""
+
+import sys
+if sys.hexversion >= 0x3000000:
+    print(VERSION_NOTICE)
+    sys.exit(1)
 # Import compatibility module (importing it is enough).
 import _compat
-import sys, os, ConfigParser, logging
+import os, ConfigParser, logging
 from types import MethodType
 from imdb import Movie, Person, Character, Company
@@ -38,38 +51,39 @@ import imdb._logging
 from imdb._exceptions import IMDbError, IMDbDataAccessError, IMDbParserError
 from imdb.utils import build_title, build_name, build_company_name

+_imdb_logger = logging.getLogger('imdbpy')
 _aux_logger = logging.getLogger('imdbpy.aux')

 # URLs of the main pages for movies, persons, characters and queries.
-imdbURL_base = 'http://akas.imdb.com/'
+imdbURL_base = 'http://www.imdb.com/'
 # NOTE: the urls below will be removed in a future version.
 # please use the values in the 'urls' attribute
 # of the IMDbBase subclass instance.
-# http://akas.imdb.com/title/
+# http://www.imdb.com/title/
 imdbURL_movie_base = '%stitle/' % imdbURL_base
-# http://akas.imdb.com/title/tt%s/
+# http://www.imdb.com/title/tt%s/
 imdbURL_movie_main = imdbURL_movie_base + 'tt%s/'
-# http://akas.imdb.com/name/
+# http://www.imdb.com/name/
 imdbURL_person_base = '%sname/' % imdbURL_base
-# http://akas.imdb.com/name/nm%s/
+# http://www.imdb.com/name/nm%s/
 imdbURL_person_main = imdbURL_person_base + 'nm%s/'
-# http://akas.imdb.com/character/
+# http://www.imdb.com/character/
 imdbURL_character_base = '%scharacter/' % imdbURL_base
-# http://akas.imdb.com/character/ch%s/
+# http://www.imdb.com/character/ch%s/
 imdbURL_character_main = imdbURL_character_base + 'ch%s/'
-# http://akas.imdb.com/company/
+# http://www.imdb.com/company/
 imdbURL_company_base = '%scompany/' % imdbURL_base
-# http://akas.imdb.com/company/co%s/
+# http://www.imdb.com/company/co%s/
 imdbURL_company_main = imdbURL_company_base + 'co%s/'
-# http://akas.imdb.com/keyword/%s/
+# http://www.imdb.com/keyword/%s/
 imdbURL_keyword_main = imdbURL_base + 'keyword/%s/'
-# http://akas.imdb.com/chart/top
+# http://www.imdb.com/chart/top
 imdbURL_top250 = imdbURL_base + 'chart/top'
-# http://akas.imdb.com/chart/bottom
+# http://www.imdb.com/chart/bottom
 imdbURL_bottom100 = imdbURL_base + 'chart/bottom'
-# http://akas.imdb.com/find?%s
+# http://www.imdb.com/find?%s
 imdbURL_find = imdbURL_base + 'find?%s'

 # Name of the configuration file.
@@ -103,7 +117,7 @@ class ConfigParserWithCase(ConfigParser.ConfigParser):
             try:
                 self.read(fname)
             except (ConfigParser.MissingSectionHeaderError,
-                    ConfigParser.ParsingError), e:
+                    ConfigParser.ParsingError) as e:
                 _aux_logger.warn('Troubles reading config file: %s' % e)
             # Stop at the first valid file.
             if self.has_section('imdbpy'):
@@ -159,10 +173,8 @@ def IMDb(accessSystem=None, *arguments, **keywords):
                 accessSystem = 'http'
             kwds.update(keywords)
             keywords = kwds
-    except Exception, e:
-        import logging
-        logging.getLogger('imdbpy').warn('Unable to read configuration' \
-                                         ' file; complete error: %s' % e)
+    except Exception as e:
+        _imdb_logger.warn('Unable to read configuration file; complete error: %s' % e)
         # It just LOOKS LIKE a bad habit: we tried to read config
         # options from some files, but something is gone horribly
         # wrong: ignore everything and pretend we were called with
@@ -177,9 +189,8 @@ def IMDb(accessSystem=None, *arguments, **keywords):
        try:
            import logging.config
            logging.config.fileConfig(os.path.expanduser(logCfg))
-       except Exception, e:
-           logging.getLogger('imdbpy').warn('unable to read logger ' \
-                                            'config: %s' % e)
+       except Exception as e:
+           _imdb_logger.warn('unable to read logger config: %s' % e)
    if accessSystem in ('httpThin', 'webThin', 'htmlThin'):
        logging.warn('httpThin was removed since IMDbPY 4.8')
        accessSystem = 'http'
@@ -244,9 +255,6 @@ class IMDbBase:
     # in the subclasses).
     accessSystem = 'UNKNOWN'

-    # Top-level logger for IMDbPY.
-    _imdb_logger = logging.getLogger('imdbpy')
-
     # Whether to re-raise caught exceptions or not.
     _reraise_exceptions = False
@@ -285,30 +293,30 @@ class IMDbBase:
             imdbURL_base = 'http://%s' % imdbURL_base
         if not imdbURL_base.endswith('/'):
             imdbURL_base = '%s/' % imdbURL_base
-        # http://akas.imdb.com/title/
-        imdbURL_movie_base='%stitle/' % imdbURL_base
-        # http://akas.imdb.com/title/tt%s/
-        imdbURL_movie_main=imdbURL_movie_base + 'tt%s/'
-        # http://akas.imdb.com/name/
-        imdbURL_person_base='%sname/' % imdbURL_base
-        # http://akas.imdb.com/name/nm%s/
-        imdbURL_person_main=imdbURL_person_base + 'nm%s/'
-        # http://akas.imdb.com/character/
-        imdbURL_character_base='%scharacter/' % imdbURL_base
-        # http://akas.imdb.com/character/ch%s/
-        imdbURL_character_main=imdbURL_character_base + 'ch%s/'
-        # http://akas.imdb.com/company/
-        imdbURL_company_base='%scompany/' % imdbURL_base
-        # http://akas.imdb.com/company/co%s/
-        imdbURL_company_main=imdbURL_company_base + 'co%s/'
-        # http://akas.imdb.com/keyword/%s/
-        imdbURL_keyword_main=imdbURL_base + 'keyword/%s/'
-        # http://akas.imdb.com/chart/top
-        imdbURL_top250=imdbURL_base + 'chart/top'
-        # http://akas.imdb.com/chart/bottom
-        imdbURL_bottom100=imdbURL_base + 'chart/bottom'
-        # http://akas.imdb.com/find?%s
-        imdbURL_find=imdbURL_base + 'find?%s'
+        # http://www.imdb.com/title/
+        imdbURL_movie_base = '%stitle/' % imdbURL_base
+        # http://www.imdb.com/title/tt%s/
+        imdbURL_movie_main = imdbURL_movie_base + 'tt%s/'
+        # http://www.imdb.com/name/
+        imdbURL_person_base = '%sname/' % imdbURL_base
+        # http://www.imdb.com/name/nm%s/
+        imdbURL_person_main = imdbURL_person_base + 'nm%s/'
+        # http://www.imdb.com/character/
+        imdbURL_character_base = '%scharacter/' % imdbURL_base
+        # http://www.imdb.com/character/ch%s/
+        imdbURL_character_main = imdbURL_character_base + 'ch%s/'
+        # http://www.imdb.com/company/
+        imdbURL_company_base = '%scompany/' % imdbURL_base
+        # http://www.imdb.com/company/co%s/
+        imdbURL_company_main = imdbURL_company_base + 'co%s/'
+        # http://www.imdb.com/keyword/%s/
+        imdbURL_keyword_main = imdbURL_base + 'keyword/%s/'
+        # http://www.imdb.com/chart/top
+        imdbURL_top250 = imdbURL_base + 'chart/top'
+        # http://www.imdb.com/chart/bottom
+        imdbURL_bottom100 = imdbURL_base + 'chart/bottom'
+        # http://www.imdb.com/find?%s
+        imdbURL_find = imdbURL_base + 'find?%s'
         self.urls = dict(
             movie_base=imdbURL_movie_base,
             movie_main=imdbURL_movie_main,
@@ -727,16 +735,15 @@ class IMDbBase:
             mopID = mop.companyID
             prefix = 'company'
         else:
-            raise IMDbError('object ' + repr(mop) + \
-                    ' is not a Movie, Person, Character or Company instance')
+            raise IMDbError('object ' + repr(mop) +
+                            ' is not a Movie, Person, Character or Company instance')
         if mopID is None:
             # XXX: enough? It's obvious that there are Characters
             #      objects without characterID, so I think they should
             #      just do nothing, when an i.update(character) is tried.
             if prefix == 'character':
                 return
-            raise IMDbDataAccessError( \
-                'the supplied object has null movieID, personID or companyID')
+            raise IMDbDataAccessError('supplied object has null movieID, personID or companyID')
         if mop.accessSystem == self.accessSystem:
             aSystem = self
         else:
@@ -760,21 +767,22 @@ class IMDbBase:
                 continue
             if not i:
                 continue
-            self._imdb_logger.debug('retrieving "%s" info set', i)
+            _imdb_logger.debug('retrieving "%s" info set', i)
             try:
                 method = getattr(aSystem, 'get_%s_%s' %
                                  (prefix, i.replace(' ', '_')))
             except AttributeError:
-                self._imdb_logger.error('unknown information set "%s"', i)
+                _imdb_logger.error('unknown information set "%s"', i)
                 # Keeps going.
                 method = lambda *x: {}
             try:
                 ret = method(mopID)
-            except Exception, e:
-                self._imdb_logger.critical('caught an exception retrieving ' \
-                    'or parsing "%s" info set for mopID ' \
-                    '"%s" (accessSystem: %s)',
-                    i, mopID, mop.accessSystem, exc_info=True)
+            except Exception:
+                _imdb_logger.critical(
+                    'caught an exception retrieving or parsing "%s" info set'
+                    ' for mopID "%s" (accessSystem: %s)',
+                    i, mopID, mop.accessSystem, exc_info=True
+                )
                 ret = {}
                 # If requested by the user, reraise the exception.
                 if self._reraise_exceptions:
@@ -826,9 +834,7 @@ class IMDbBase:
         raise NotImplementedError('override this method')

     def _searchIMDb(self, kind, ton, title_kind=None):
-        """Search the IMDb akas server for the given title or name."""
-        # The Exact Primary search system has gone AWOL, so we resort
-        # to the mobile search. :-/
+        """Search the IMDb www server for the given title or name."""
         if not ton:
             return None
         ton = ton.strip('"')
@@ -935,8 +941,8 @@ class IMDbBase:
             else:
                 imdbID = aSystem.company2imdbID(build_company_name(mop))
         else:
-            raise IMDbError('object ' + repr(mop) + \
-                    ' is not a Movie, Person or Character instance')
+            raise IMDbError('object ' + repr(mop) +
+                            ' is not a Movie, Person or Character instance')
         return imdbID

     def get_imdbURL(self, mop):
@@ -954,8 +960,8 @@ class IMDbBase:
         elif isinstance(mop, Company.Company):
             url_firstPart = imdbURL_company_main
         else:
-            raise IMDbError('object ' + repr(mop) + \
-                    ' is not a Movie, Person, Character or Company instance')
+            raise IMDbError('object ' + repr(mop) +
+                            ' is not a Movie, Person, Character or Company instance')
         return url_firstPart % imdbID

     def get_special_methods(self):


@@ -32,8 +32,9 @@ LEVELS = {'debug': logging.DEBUG,
 imdbpyLogger = logging.getLogger('imdbpy')
 imdbpyStreamHandler = logging.StreamHandler()
-imdbpyFormatter = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]' \
-    ' %(pathname)s:%(lineno)d: %(message)s')
+imdbpyFormatter = logging.Formatter(
+    '%(asctime)s %(levelname)s [%(name)s] %(pathname)s:%(lineno)d: %(message)s'
+)
 imdbpyStreamHandler.setFormatter(imdbpyFormatter)
 imdbpyLogger.addHandler(imdbpyStreamHandler)


@@ -269,8 +269,8 @@ for k, v in {'lt':u'<','gt':u'>','amp':u'&','quot':u'"','apos':u'\''}.items():
     everyentcharrefs[k] = v
     everyentcharrefs['#%s' % ord(v)] = v
 everyentcharrefsget = everyentcharrefs.get
-re_everyentcharrefs = re.compile('&(%s|\#160|\#\d{1,5});' %
-        '|'.join(map(re.escape, everyentcharrefs)))
+re_everyentcharrefs = re.compile('&(%s|\#160|\#\d{1,5});' % '|'.join(map(re.escape,
+                                                                         everyentcharrefs)))
 re_everyentcharrefssub = re_everyentcharrefs.sub

 def _replAllXMLRef(match):

@@ -408,7 +408,7 @@ def _valueWithType(tag, tagValue):
 # Extra tags to get (if values were not already read from title/name).
 _titleTags = ('imdbindex', 'kind', 'year')
-_nameTags = ('imdbindex')
+_nameTags = ('imdbindex',)
 _companyTags = ('imdbindex', 'country')

 def parseTags(tag, _topLevel=True, _as=None, _infoset2keys=None,


@@ -7,7 +7,7 @@ the imdb.IMDb function will return an instance of this class when
 called with the 'accessSystem' argument set to "http" or "web"
 or "html" (this is the default).

-Copyright 2004-2012 Davide Alberani <da@erlug.linux.it>
+Copyright 2004-2017 Davide Alberani <da@erlug.linux.it>
                2008 H. Turgut Uyar <uyar@tekir.org>

 This program is free software; you can redistribute it and/or modify
@@ -26,6 +26,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 """

 import sys
+import ssl
 import socket
 import logging
 from urllib import FancyURLopener, quote_plus
@@ -68,8 +69,8 @@ class _ModuleProxy:
         """Initialize a proxy for the given module; defaultKeys, if set,
         muste be a dictionary of values to set for instanced objects."""
         if oldParsers or fallBackToNew:
-            _aux_logger.warn('The old set of parsers was removed; falling ' \
-                    'back to the new parsers.')
+            _aux_logger.warn('The old set of parsers was removed;'
+                             ' falling back to the new parsers.')
         self.useModule = useModule
         if defaultKeys is None:
             defaultKeys = {}
@@ -142,6 +143,7 @@ class IMDbURLopener(FancyURLopener):
     def __init__(self, *args, **kwargs):
         self._last_url = u''
+        kwargs['context'] = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
         FancyURLopener.__init__(self, *args, **kwargs)
         # Headers to add to every request.
         # XXX: IMDb's web server doesn't like urllib-based programs,
@@ -211,9 +213,9 @@ class IMDbURLopener(FancyURLopener):
             if server_encode is None and content:
                 begin_h = content.find('text/html; charset=')
                 if begin_h != -1:
-                    end_h = content[19+begin_h:].find('"')
+                    end_h = content[19 + begin_h:].find('"')
                     if end_h != -1:
-                        server_encode = content[19+begin_h:19+begin_h+end_h]
+                        server_encode = content[19 + begin_h:19 + begin_h + end_h]
             if server_encode:
                 try:
                     if lookup(server_encode):
@@ -237,9 +239,10 @@ class IMDbURLopener(FancyURLopener):
         if encode is None:
             encode = 'latin_1'
             # The detection of the encoding is error prone...
-            self._logger.warn('Unable to detect the encoding of the retrieved '
-                        'page [%s]; falling back to default latin1.', encode)
-        ##print unicode(content, encode, 'replace').encode('utf8')
+            self._logger.warn('Unable to detect the encoding of the retrieved page [%s];'
+                              ' falling back to default utf8.', encode)
+        if isinstance(content, unicode):
+            return content
         return unicode(content, encode, 'replace')

     def http_error_default(self, url, fp, errcode, errmsg, headers):
@@ -288,8 +291,8 @@ class IMDbHTTPAccessSystem(IMDbBase):
         self._getRefs = True
         self._mdparse = False
         if isThin:
-            self._http_logger.warn('"httpThin" access system no longer ' +
-                    'supported; "http" used automatically', exc_info=False)
+            self._http_logger.warn('"httpThin" access system no longer supported;'
+                                   ' "http" used automatically', exc_info=False)
             self.isThin = 0
             if self.accessSystem in ('httpThin', 'webThin', 'htmlThin'):
                 self.accessSystem = 'http'
@@ -503,7 +506,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
         return self.smProxy.search_movie_parser.parse(cont, results=results)['data']

     def get_movie_main(self, movieID):
-        cont = self._retrieve(self.urls['movie_main'] % movieID + 'combined')
+        cont = self._retrieve(self.urls['movie_main'] % movieID + 'reference')
         return self.mProxy.movie_parser.parse(cont, mdparse=self._mdparse)

     def get_movie_full_credits(self, movieID):
@@ -811,7 +814,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
     def _search_keyword(self, keyword, results):
         # XXX: the IMDb web server seems to have some serious problem with
         #      non-ascii keyword.
-        #      E.g.: http://akas.imdb.com/keyword/fianc%E9/
+        #      E.g.: http://www.imdb.com/keyword/fianc%E9/
         #      will return a 500 Internal Server Error: Redirect Recursion.
         keyword = keyword.encode('utf8', 'ignore')
         try:


@@ -171,7 +171,7 @@ class PageElement:
         return self

     def _lastRecursiveChild(self):
-        "Finds the last element beneath this object to be parsed."
+        """Finds the last element beneath this object to be parsed."""
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]

@@ -184,7 +184,7 @@ class PageElement:
             newChild = NavigableString(newChild)

         position = min(position, len(self.contents))
-        if hasattr(newChild, 'parent') and newChild.parent != None:
+        if hasattr(newChild, 'parent') and newChild.parent is not None:
             # We're 'inserting' an element that's already one
             # of this object's children.
             if newChild.parent == self:

@@ -323,7 +323,7 @@ class PageElement:
         return r

     def _findAll(self, name, attrs, text, limit, generator, **kwargs):
-        "Iterates over a generator looking for things that match."
+        """Iterates over a generator looking for things that match."""

         if isinstance(name, SoupStrainer):
             strainer = name

@@ -415,7 +415,7 @@ class NavigableString(unicode, PageElement):
         return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

     def __getnewargs__(self):
-        return (NavigableString.__str__(self),)
+        return NavigableString.__str__(self),

     def __getattr__(self, attr):
         """text.string gives you text. This is for backwards
@@ -460,7 +460,7 @@ class Tag(PageElement):
     """Represents a found HTML tag with its attributes and contents."""

     def _invert(h):
-        "Cheap function to invert a hash."
+        """Cheap function to invert a hash."""
         i = {}
         for k,v in h.items():
             i[v] = k

@@ -501,14 +501,14 @@ class Tag(PageElement):

     def __init__(self, parser, name, attrs=None, parent=None,
                  previous=None):
-        "Basic constructor."
+        """Basic constructor."""

         # We don't actually store the parser object: that lets extracted
         # chunks be garbage-collected
         self.parserClass = parser.__class__
         self.isSelfClosing = parser.isSelfClosingTag(name)
         self.name = name
-        if attrs == None:
+        if attrs is None:
             attrs = []
         self.attrs = attrs
         self.contents = []

@@ -541,18 +541,18 @@ class Tag(PageElement):
         return self._getAttrMap()[key]

     def __iter__(self):
-        "Iterating over a tag iterates over its contents."
+        """Iterating over a tag iterates over its contents."""
         return iter(self.contents)

     def __len__(self):
-        "The length of a tag is the length of its list of contents."
+        """The length of a tag is the length of its list of contents."""
         return len(self.contents)

     def __contains__(self, x):
         return x in self.contents

     def __nonzero__(self):
-        "A tag is non-None even if it has no contents."
+        """A tag is non-None even if it has no contents."""
         return True

     def __setitem__(self, key, value):

@@ -570,7 +570,7 @@ class Tag(PageElement):
         self._getAttrMap()[key] = value

     def __delitem__(self, key):
-        "Deleting tag[key] deletes all 'key' attributes for the tag."
+        """Deleting tag[key] deletes all 'key' attributes for the tag."""
         for item in self.attrs:
             if item[0] == key:
                 self.attrs.remove(item)
@@ -911,7 +911,7 @@ class SoupStrainer:
         #print "Matching %s against %s" % (markup, matchAgainst)
         result = False
         if matchAgainst == True and type(matchAgainst) == types.BooleanType:
-            result = markup != None
+            result = markup is not None
         elif callable(matchAgainst):
             result = matchAgainst(markup)
         else:

@@ -1130,7 +1130,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
             # Python installations can't copy regexes. If anyone
             # was relying on the existence of markupMassage, this
             # might cause problems.
-            del(self.markupMassage)
+            del self.markupMassage
         self.reset()

         SGMLParser.feed(self, markup)

@@ -1253,7 +1253,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
         """
         nestingResetTriggers = self.NESTABLE_TAGS.get(name)
-        isNestable = nestingResetTriggers != None
+        isNestable = nestingResetTriggers is not None
         isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
         popTo = None
         inclusive = True

@@ -1264,9 +1264,9 @@ class BeautifulStoneSoup(Tag, SGMLParser):
                 #last occurance.
                 popTo = name
                 break
-            if (nestingResetTriggers != None
+            if (nestingResetTriggers is not None
                 and p.name in nestingResetTriggers) \
-                or (nestingResetTriggers == None and isResetNesting
+                or (nestingResetTriggers is None and isResetNesting
                     and self.RESET_NESTING_TAGS.has_key(p.name)):

                 #If we encounter one of the nesting reset triggers
@@ -1342,11 +1342,11 @@ class BeautifulStoneSoup(Tag, SGMLParser):
             self._toStringSubclass(text, ProcessingInstruction)

     def handle_comment(self, text):
-        "Handle comments as Comment objects."
+        """Handle comments as Comment objects."""
         self._toStringSubclass(text, Comment)

     def handle_charref(self, ref):
-        "Handle character references as data."
+        """Handle character references as data."""
         if self.convertEntities:
             data = unichr(int(ref))
         else:

@@ -1397,7 +1397,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
             self.handle_data(data)

     def handle_decl(self, data):
-        "Handle DOCTYPEs and the like as Declaration objects."
+        """Handle DOCTYPEs and the like as Declaration objects."""
         self._toStringSubclass(data, Declaration)

     def parse_declaration(self, i):

@@ -1793,8 +1793,8 @@ class UnicodeDammit:
         return self.markup

     def _toUnicode(self, data, encoding):
-        '''Given a string and its encoding, decodes the string into Unicode.
-        %encoding is a string recognized by encodings.aliases'''
+        """Given a string and its encoding, decodes the string into Unicode.
+        %encoding is a string recognized by encodings.aliases"""
         # strip Byte Order Mark (if present)
         if (len(data) >= 4) and (data[:2] == '\xfe\xff') \


@@ -67,7 +67,7 @@ def tokenize_path(path):
         if path[i] == '/':
             if i > 0:
                 separators.append((last_position, i))
-            if (path[i+1] == '/'):
+            if path[i+1] == '/':
                 last_position = i
                 i = i + 1
             else:


@@ -2,7 +2,7 @@
 parser.http.characterParser module (imdb package).

 This module provides the classes (and the instances), used to parse
-the IMDb pages on the akas.imdb.com server about a character.
+the IMDb pages on the www.imdb.com server about a character.
 E.g., for "Jesse James" the referred pages would be:
     main details:   http://www.imdb.com/character/ch0000001/
     biography:      http://www.imdb.com/character/ch0000001/bio

@@ -37,7 +37,7 @@ _personIDs = re.compile(r'/name/nm([0-9]{7})')
 class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
     """Parser for the "filmography" page of a given character.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server.  The final result will be a
+    the www.imdb.com server.  The final result will be a
     dictionary, with a key for every relevant section.

     Example:

@@ -101,7 +101,7 @@ class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
 class DOMHTMLCharacterBioParser(DOMParserBase):
     """Parser for the "biography" page of a given character.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server.  The final result will be a
+    the www.imdb.com server.  The final result will be a
     dictionary, with a key for every relevant section.

     Example:

@@ -146,7 +146,7 @@ class DOMHTMLCharacterBioParser(DOMParserBase):
 class DOMHTMLCharacterQuotesParser(DOMParserBase):
     """Parser for the "quotes" page of a given character.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server.  The final result will be a
+    the www.imdb.com server.  The final result will be a
     dictionary, with a key for every relevant section.

     Example:


@@ -2,12 +2,12 @@
 parser.http.companyParser module (imdb package).

 This module provides the classes (and the instances), used to parse
-the IMDb pages on the akas.imdb.com server about a company.
+the IMDb pages on the www.imdb.com server about a company.
 E.g., for "Columbia Pictures [us]" the referred page would be:
-    main details:   http://akas.imdb.com/company/co0071509/
+    main details:   http://www.imdb.com/company/co0071509/

-Copyright 2008-2009 Davide Alberani <da@erlug.linux.it>
-               2008 H. Turgut Uyar <uyar@tekir.org>
+Copyright 2008-2017 Davide Alberani <da@erlug.linux.it>
+          2008-2017 H. Turgut Uyar <uyar@tekir.org>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by

@@ -34,7 +34,7 @@ from imdb.utils import analyze_company_name
 class DOMCompanyParser(DOMParserBase):
     """Parser for the main page of a given company.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server.  The final result will be a
+    the www.imdb.com server.  The final result will be a
     dictionary, with a key for every relevant section.

     Example:

@@ -44,31 +44,38 @@ class DOMCompanyParser(DOMParserBase):
     _containsObjects = True

     extractors = [
-        Extractor(label='name',
-                path="//title",
-                attrs=Attribute(key='name',
-                    path="./text()",
-                    postprocess=lambda x: \
-                        analyze_company_name(x, stripNotes=True))),
-
-        Extractor(label='filmography',
-                group="//b/a[@name]",
-                group_key="./text()",
-                group_key_normalize=lambda x: x.lower(),
-                path="../following-sibling::ol[1]/li",
-                attrs=Attribute(key=None,
-                    multi=True,
-                    path={
-                        'link': "./a[1]/@href",
-                        'title': "./a[1]/text()",
-                        'year': "./text()[1]"
-                        },
-                    postprocess=lambda x:
-                        build_movie(u'%s %s' % \
-                        (x.get('title'), x.get('year').strip()),
-                        movieID=analyze_imdbid(x.get('link') or u''),
-                        _parsingCompany=True))),
-        ]
+        Extractor(
+            label='name',
+            path="//h1/span[@class='display-title ']",  # note the extra trailing space in class
+            attrs=Attribute(
+                key='name',
+                path="./text()",
+                postprocess=lambda x: analyze_company_name(x, stripNotes=True)
+            )
+        ),
+
+        Extractor(
+            label='filmography',
+            group="//b/a[@name]",
+            group_key="./text()",
+            group_key_normalize=lambda x: x.lower(),
+            path="../following-sibling::ol[1]/li",
+            attrs=Attribute(
+                key=None,
+                multi=True,
+                path={
+                    'link': "./a[1]/@href",
+                    'title': "./a[1]/text()",
+                    'year': "./text()[1]"
+                },
+                postprocess=lambda x: build_movie(
+                    '%s %s' % (x.get('title'), x.get('year').strip()),
+                    movieID=analyze_imdbid(x.get('link') or u''),
+                    _parsingCompany=True
+                )
+            )
+        )
+    ]

     preprocessors = [
         (re.compile('(<b><a name=)', re.I), r'</p>\1')

File diff suppressed because it is too large


@@ -2,10 +2,10 @@
 parser.http.personParser module (imdb package).

 This module provides the classes (and the instances), used to parse
-the IMDb pages on the akas.imdb.com server about a person.
+the IMDb pages on the www.imdb.com server about a person.
 E.g., for "Mel Gibson" the referred pages would be:
-    categorized:    http://akas.imdb.com/name/nm0000154/maindetails
-    biography:      http://akas.imdb.com/name/nm0000154/bio
+    categorized:    http://www.imdb.com/name/nm0000154/maindetails
+    biography:      http://www.imdb.com/name/nm0000154/bio
     ...and so on...

 Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>

@@ -52,7 +52,7 @@ def build_date(date):
 class DOMHTMLMaindetailsParser(DOMParserBase):
     """Parser for the "categorized" (maindetails) page of a given person.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server.  The final result will be a
+    the www.imdb.com server.  The final result will be a
     dictionary, with a key for every relevant section.

     Example:

@@ -192,7 +192,7 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
 class DOMHTMLBioParser(DOMParserBase):
     """Parser for the "biography" page of a given person.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server.  The final result will be a
+    the www.imdb.com server.  The final result will be a
     dictionary, with a key for every relevant section.

     Example:
@@ -225,92 +225,157 @@ class DOMHTMLBioParser(DOMParserBase):
                     # TODO: check if this slicing is always correct
                     postprocess=lambda x: u''.join(x).strip()[2:])]

     extractors = [
-        Extractor(label='headshot',
-                  path="//a[@name='headshot']",
-                  attrs=Attribute(key='headshot',
-                                  path="./img/@src")),
-        Extractor(label='birth info',
-                  path="//table[@id='overviewTable']//td[text()='Date of Birth']/following-sibling::td[1]",
-                  attrs=_birth_attrs),
-        Extractor(label='death info',
-                  path="//table[@id='overviewTable']//td[text()='Date of Death']/following-sibling::td[1]",
-                  attrs=_death_attrs),
-        Extractor(label='nick names',
-                  path="//table[@id='overviewTable']//td[text()='Nickenames']/following-sibling::td[1]",
-                  attrs=Attribute(key='nick names',
-                                  path="./text()",
-                                  joiner='|',
-                                  postprocess=lambda x: [n.strip().replace(' (',
-                                      '::(', 1) for n in x.split('|')
-                                      if n.strip()])),
-        Extractor(label='birth name',
-                  path="//table[@id='overviewTable']//td[text()='Birth Name']/following-sibling::td[1]",
-                  attrs=Attribute(key='birth name',
-                                  path="./text()",
-                                  postprocess=lambda x: canonicalName(x.strip()))),
-        Extractor(label='height',
-                  path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
-                  attrs=Attribute(key='height',
-                                  path="./text()",
-                                  postprocess=lambda x: x.strip())),
-        Extractor(label='mini biography',
-                  path="//a[@name='mini_bio']/following-sibling::div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
-                  attrs=Attribute(key='mini biography',
-                                  multi=True,
-                                  path={
-                                      'bio': ".//text()",
-                                      'by': ".//a[@name='ba']//text()"
-                                  },
-                                  postprocess=lambda x: "%s::%s" % \
-                                      ((x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
-                                       (x.get('by') or u'').strip() or u'Anonymous'))),
-        Extractor(label='spouse',
-                  path="//div[h5='Spouse']/table/tr",
-                  attrs=Attribute(key='spouse',
-                                  multi=True,
-                                  path={
-                                      'name': "./td[1]//text()",
-                                      'info': "./td[2]//text()"
-                                  },
-                                  postprocess=lambda x: ("%s::%s" % \
-                                      (x.get('name').strip(),
-                                       (x.get('info') or u'').strip())).strip(':'))),
-        Extractor(label='trade mark',
-                  path="//div[h5='Trade Mark']/p",
-                  attrs=Attribute(key='trade mark',
-                                  multi=True,
-                                  path=".//text()",
-                                  postprocess=lambda x: x.strip())),
-        Extractor(label='trivia',
-                  path="//div[h5='Trivia']/p",
-                  attrs=Attribute(key='trivia',
-                                  multi=True,
-                                  path=".//text()",
-                                  postprocess=lambda x: x.strip())),
-        Extractor(label='quotes',
-                  path="//div[h5='Personal Quotes']/p",
-                  attrs=Attribute(key='quotes',
-                                  multi=True,
-                                  path=".//text()",
-                                  postprocess=lambda x: x.strip())),
-        Extractor(label='salary',
-                  path="//div[h5='Salary']/table/tr",
-                  attrs=Attribute(key='salary history',
-                                  multi=True,
-                                  path={
-                                      'title': "./td[1]//text()",
-                                      'info': "./td[2]/text()",
-                                  },
-                                  postprocess=lambda x: "%s::%s" % \
-                                      (x.get('title').strip(),
-                                       x.get('info').strip()))),
-        Extractor(label='where now',
-                  path="//div[h5='Where Are They Now']/p",
-                  attrs=Attribute(key='where now',
-                                  multi=True,
-                                  path=".//text()",
-                                  postprocess=lambda x: x.strip())),
-    ]
+        Extractor(
+            label='headshot',
+            path="//a[@name='headshot']",
+            attrs=Attribute(
+                key='headshot',
+                path="./img/@src"
+            )
+        ),
+
+        Extractor(
+            label='birth info',
+            path="//table[@id='overviewTable']"
+                 "//td[text()='Date of Birth']/following-sibling::td[1]",
+            attrs=_birth_attrs
+        ),
+
+        Extractor(
+            label='death info',
+            path="//table[@id='overviewTable']"
+                 "//td[text()='Date of Death']/following-sibling::td[1]",
+            attrs=_death_attrs
+        ),
+
+        Extractor(
+            label='nick names',
+            path="//table[@id='overviewTable']"
+                 "//td[text()='Nickenames']/following-sibling::td[1]",
+            attrs=Attribute(
+                key='nick names',
+                path="./text()",
+                joiner='|',
+                postprocess=lambda x: [n.strip().replace(' (', '::(', 1) for n in x.split('|')
+                                       if n.strip()]
+            )
+        ),
+
+        Extractor(
+            label='birth name',
+            path="//table[@id='overviewTable']"
+                 "//td[text()='Birth Name']/following-sibling::td[1]",
+            attrs=Attribute(
+                key='birth name',
+                path="./text()",
+                postprocess=lambda x: canonicalName(x.strip())
+            )
+        ),
+
+        Extractor(
+            label='height',
+            path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
+            attrs=Attribute(
+                key='height',
+                path="./text()",
+                postprocess=lambda x: x.strip()
+            )
+        ),
+
+        Extractor(
+            label='mini biography',
+            path="//a[@name='mini_bio']/following-sibling::"
+                 "div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
+            attrs=Attribute(
+                key='mini biography',
+                multi=True,
+                path={
+                    'bio': ".//text()",
+                    'by': ".//a[@name='ba']//text()"
+                },
+                postprocess=lambda x: "%s::%s" % (
+                    (x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
+                    (x.get('by') or u'').strip() or u'Anonymous'
+                )
+            )
+        ),
+
+        Extractor(
+            label='spouse',
+            path="//div[h5='Spouse']/table/tr",
+            attrs=Attribute(
+                key='spouse',
+                multi=True,
+                path={
+                    'name': "./td[1]//text()",
+                    'info': "./td[2]//text()"
+                },
+                postprocess=lambda x: ("%s::%s" % (
+                    x.get('name').strip(),
+                    (x.get('info') or u'').strip())).strip(':')
+            )
+        ),
+
+        Extractor(
+            label='trade mark',
+            path="//div[h5='Trade Mark']/p",
+            attrs=Attribute(
+                key='trade mark',
+                multi=True,
+                path=".//text()",
+                postprocess=lambda x: x.strip()
+            )
+        ),
+
+        Extractor(
+            label='trivia',
+            path="//div[h5='Trivia']/p",
+            attrs=Attribute(
+                key='trivia',
+                multi=True,
+                path=".//text()",
+                postprocess=lambda x: x.strip()
+            )
+        ),
+
+        Extractor(
+            label='quotes',
+            path="//div[h5='Personal Quotes']/p",
+            attrs=Attribute(
+                key='quotes',
+                multi=True,
+                path=".//text()",
+                postprocess=lambda x: x.strip()
+            )
+        ),
+
+        Extractor(
+            label='salary',
+            path="//div[h5='Salary']/table/tr",
+            attrs=Attribute(
+                key='salary history',
+                multi=True,
+                path={
+                    'title': "./td[1]//text()",
+                    'info': "./td[2]/text()",
+                },
+                postprocess=lambda x: "%s::%s" % (
+                    x.get('title').strip(),
+                    x.get('info').strip())
+            )
+        ),
+
+        Extractor(
+            label='where now',
+            path="//div[h5='Where Are They Now']/p",
+            attrs=Attribute(
+                key='where now',
+                multi=True,
+                path=".//text()",
+                postprocess=lambda x: x.strip()
+            )
+        )
+    ]

     preprocessors = [
         (re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'),
@@ -329,7 +394,7 @@ class DOMHTMLBioParser(DOMParserBase):
 class DOMHTMLResumeParser(DOMParserBase):
     """Parser for the "resume" page of a given person.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server.  The final result will be a
+    the www.imdb.com server.  The final result will be a
     dictionary, with a key for every relevant section.

     Example:
@@ -406,13 +471,13 @@ class DOMHTMLResumeParser(DOMParserBase):
                 continue
             if len(data[key][0]) == 3:
                 for item in data[key]:
-                    item[:] = [x for x in item if not x == None]
+                    item[:] = [x for x in item if not x is None]
                 continue
             if len(data[key][0]) == 2:
                 new_key = {}
                 for item in data[key]:
-                    if item[0] == None:
+                    if item[0] is None:
                         continue
                     if ':' in item[0]:
                         if item[1].replace(item[0], '')[1:].strip() == '':

@@ -422,15 +487,14 @@ class DOMHTMLResumeParser(DOMParserBase):
                         new_key[item[0]] = item[1]
                 data[key] = new_key

-        new_data = {}
-        new_data['resume'] = data
+        new_data = {'resume': data}
         return new_data


 class DOMHTMLOtherWorksParser(DOMParserBase):
     """Parser for the "other works" and "agent" pages of a given person.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server.  The final result will be a
+    the www.imdb.com server.  The final result will be a
     dictionary, with a key for every relevant section.

     Example:
@@ -466,7 +530,7 @@ def _build_episode(link, title, minfo, role, roleA, roleAID):
     minidx = minfo.find(' -')
     # Sometimes, for some unknown reason, the role is left in minfo.
     if minidx != -1:
-        slfRole = minfo[minidx+3:].lstrip()
+        slfRole = minfo[minidx + 3:].lstrip()
         minfo = minfo[:minidx].rstrip()
     if slfRole.endswith(')'):
         commidx = slfRole.rfind('(')
@@ -504,7 +568,7 @@ def _build_episode(link, title, minfo, role, roleA, roleAID):
 class DOMHTMLSeriesParser(DOMParserBase):
     """Parser for the "by TV series" page of a given person.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server.  The final result will be a
+    the www.imdb.com server.  The final result will be a
     dictionary, with a key for every relevant section.

     Example:

@@ -559,7 +623,7 @@ class DOMHTMLSeriesParser(DOMParserBase):
 class DOMHTMLPersonGenresParser(DOMParserBase):
     """Parser for the "by genre" and "by keywords" pages of a given person.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server.  The final result will be a
+    the www.imdb.com server.  The final result will be a
     dictionary, with a key for every relevant section.

     Example:


@@ -5,7 +5,7 @@ This module provides the HTMLSearchCharacterParser class (and the
 search_character_parser instance), used to parse the results of a search
 for a given character.
 E.g., when searching for the name "Jesse James", the parsed page would be:
-    http://akas.imdb.com/find?s=ch;mx=20;q=Jesse+James
+    http://www.imdb.com/find?s=ch;mx=20;q=Jesse+James

 Copyright 2007-2012 Davide Alberani <da@erlug.linux.it>
                2008 H. Turgut Uyar <uyar@tekir.org>


@@ -5,7 +5,7 @@ This module provides the HTMLSearchCompanyParser class (and the
 search_company_parser instance), used to parse the results of a search
 for a given company.
 E.g., when searching for the name "Columbia Pictures", the parsed page would be:
-    http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures
+    http://www.imdb.com/find?s=co;mx=20;q=Columbia+Pictures

 Copyright 2008-2012 Davide Alberani <da@erlug.linux.it>
                2008 H. Turgut Uyar <uyar@tekir.org>

@@ -46,22 +46,29 @@ class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
     _titleBuilder = lambda self, x: build_company_name(x)
     _linkPrefix = '/company/co'

-    _attrs = [Attribute(key='data',
-                        multi=True,
-                        path={
-                            'link': "./a[1]/@href",
-                            'name': "./a[1]/text()",
-                            'notes': "./text()[1]"
-                        },
-                        postprocess=lambda x: (
-                            analyze_imdbid(x.get('link')),
-                            analyze_company_name(x.get('name')+(x.get('notes')
-                                                 or u''), stripNotes=True)
-                        ))]
-
-    extractors = [Extractor(label='search',
-                            path="//td[@class='result_text']/a[starts-with(@href, " \
-                                 "'/company/co')]/..",
-                            attrs=_attrs)]
+    _attrs = [
+        Attribute(
+            key='data',
+            multi=True,
+            path={
+                'link': "./a[1]/@href",
+                'name': "./a[1]/text()",
+                'notes': "./text()[1]"
+            },
+            postprocess=lambda x: (
+                analyze_imdbid(x.get('link')),
+                analyze_company_name(x.get('name') + (x.get('notes') or u''), stripNotes=True)
+            )
+        )
+    ]
+
+    extractors = [
+        Extractor(
+            label='search',
+            path="//td[@class='result_text']/a[starts-with(@href, '/company/co')]/..",
+            attrs=_attrs
+        )
+    ]

 _OBJECTS = {


@@ -5,7 +5,7 @@ This module provides the HTMLSearchKeywordParser class (and the
 search_company_parser instance), used to parse the results of a search
 for a given keyword.
 E.g., when searching for the keyword "alabama", the parsed page would be:
-    http://akas.imdb.com/find?s=kw;mx=20;q=alabama
+    http://www.imdb.com/find?s=kw;mx=20;q=alabama

 Copyright 2009 Davide Alberani <da@erlug.linux.it>


@@ -6,7 +6,7 @@ search_movie_parser instance), used to parse the results of a search
 for a given title.
 E.g., for when searching for the title "the passion", the parsed
 page would be:
-    http://akas.imdb.com/find?q=the+passion&tt=on&mx=20
+    http://www.imdb.com/find?q=the+passion&tt=on&mx=20

 Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
                2008 H. Turgut Uyar <uyar@tekir.org>

@@ -67,7 +67,7 @@ class DOMBasicMovieParser(DOMParserBase):
             data = []
         else:
             link = data.pop('link')
-            if (link and data):
+            if link and data:
                 data = [(link, data)]
             else:
                 data = []


@@ -5,7 +5,7 @@ This module provides the HTMLSearchPersonParser class (and the
 search_person_parser instance), used to parse the results of a search
 for a given person.
 E.g., when searching for the name "Mel Gibson", the parsed page would be:
-    http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20
+    http://www.imdb.com/find?q=Mel+Gibson&nm=on&mx=20

 Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
                2008 H. Turgut Uyar <uyar@tekir.org>


@@ -4,8 +4,8 @@ parser.http.topBottomParser module (imdb package).
 This module provides the classes (and the instances), used to parse the
 lists of top 250 and bottom 100 movies.
 E.g.:
-    http://akas.imdb.com/chart/top
-    http://akas.imdb.com/chart/bottom
+    http://www.imdb.com/chart/top
+    http://www.imdb.com/chart/bottom

 Copyright 2009-2015 Davide Alberani <da@erlug.linux.it>

@@ -31,7 +31,7 @@ from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid
 class DOMHTMLTop250Parser(DOMParserBase):
     """Parser for the "top 250" page.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server.  The final result will be a
+    the www.imdb.com server.  The final result will be a
     dictionary, with a key for every relevant section.

     Example:

@@ -42,17 +42,24 @@ class DOMHTMLTop250Parser(DOMParserBase):
     ranktext = 'top 250 rank'

     def _init(self):
-        self.extractors = [Extractor(label=self.label,
-                           path="//div[@id='main']//div[1]//div//table//tbody//tr",
-                           attrs=Attribute(key=None,
-                               multi=True,
-                               path={self.ranktext: "./td[2]//text()",
-                                     'rating': "./td[3]//strong//text()",
-                                     'title': "./td[2]//a//text()",
-                                     'year': "./td[2]//span//text()",
-                                     'movieID': "./td[2]//a/@href",
-                                     'votes': "./td[3]//strong/@title"
-                                     }))]
+        self.extractors = [
+            Extractor(
+                label=self.label,
+                path="//div[@id='main']//div[1]//div//table//tbody//tr",
+                attrs=Attribute(
+                    key=None,
+                    multi=True,
+                    path={
+                        self.ranktext: "./td[2]/text()",
+                        'rating': "./td[3]//strong//text()",
+                        'title': "./td[2]//a//text()",
+                        'year': "./td[2]//span//text()",
+                        'movieID': "./td[2]//a/@href",
+                        'votes': "./td[3]//strong/@title"
+                    }
+                )
+            )
+        ]

     def postprocess_data(self, data):
         if not data or self.label not in data:

@@ -73,9 +80,11 @@ class DOMHTMLTop250Parser(DOMParserBase):
             if theID in seenIDs:
                 continue
             seenIDs.append(theID)
-            minfo = analyze_title(d['title']+" "+d['year'])
-            try: minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
-            except: pass
+            minfo = analyze_title(d['title'] + ' ' + d['year'])
+            try:
+                minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
+            except:
+                pass
             if 'votes' in d:
                 try:
                     votes = d['votes'].replace(' votes','')

@@ -93,7 +102,7 @@ class DOMHTMLTop250Parser(DOMParserBase):
 class DOMHTMLBottom100Parser(DOMHTMLTop250Parser):
     """Parser for the "bottom 100" page.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server.  The final result will be a
+    the www.imdb.com server.  The final result will be a
     dictionary, with a key for every relevant section.

     Example:


@@ -35,7 +35,9 @@ from imdb.Character import Character

 # Year, imdbIndex and kind.
-re_yearKind_index = re.compile(r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)')
+re_yearKind_index = re.compile(
+    r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)'
+)

 # Match imdb ids in href tags
 re_imdbid = re.compile(r'(title/tt|name/nm|character/ch|company/co)([0-9]+)')

@@ -304,7 +306,7 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
     elif title[-14:] == 'TV mini-series':
         title = title[:-14] + ' (mini)'
     if title and title.endswith(_defSep.rstrip()):
-        title = title[:-len(_defSep)+1]
+        title = title[:-len(_defSep) + 1]
     # Try to understand where the movie title ends.
     while True:
         if year:
@@ -320,18 +322,17 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
         # Try to match paired parentheses; yes: sometimes there are
         # parentheses inside comments...
         nidx = title.rfind('(')
-        while (nidx != -1 and \
-                title[nidx:].count('(') != title[nidx:].count(')')):
+        while nidx != -1 and title[nidx:].count('(') != title[nidx:].count(')'):
             nidx = title[:nidx].rfind('(')
         # Unbalanced parentheses: stop here.
         if nidx == -1: break
         # The last item in parentheses seems to be a year: stop here.
-        first4 = title[nidx+1:nidx+5]
-        if (first4.isdigit() or first4 == '????') and \
-                title[nidx+5:nidx+6] in (')', '/'): break
+        first4 = title[nidx + 1:nidx + 5]
+        if (first4.isdigit() or first4 == '????') and title[nidx + 5:nidx + 6] in (')', '/'):
+            break
         # The last item in parentheses is a known kind: stop here.
-        if title[nidx+1:-1] in ('TV', 'V', 'mini', 'VG', 'TV movie',
-                'TV series', 'short'): break
+        if title[nidx + 1:-1] in ('TV', 'V', 'mini', 'VG', 'TV movie', 'TV series', 'short'):
+            break
         # Else, in parentheses there are some notes.
         # XXX: should the notes in the role half be kept separated
         # from the notes in the movie title half?
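
The rewritten while condition above is the whole balanced-parentheses scan on one line: back up from the rightmost '(' until the tail of the string holds as many opening as closing parentheses. The same idea restated as a standalone sketch (hypothetical helper name, for illustration only):

    def last_balanced_group(title):
        """Index of the '(' opening the last balanced parenthesized
        group in title, or -1 when there is none."""
        nidx = title.rfind('(')
        # Back up while the tail from nidx holds unmatched parentheses.
        while nidx != -1 and title[nidx:].count('(') != title[nidx:].count(')'):
            nidx = title[:nidx].rfind('(')
        return nidx

    # last_balanced_group('Role (as X (uncredited)) (1999)') -> index of '(1999)'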
@@ -471,8 +472,8 @@ class DOMParserBase(object):
                 if _gotError:
                     warnings.warn('falling back to "%s"' % mod)
                 break
-            except ImportError, e:
-                if idx+1 >= nrMods:
+            except ImportError as e:
+                if idx + 1 >= nrMods:
                     # Raise the exception, if we don't have any more
                     # options to try.
                     raise IMDbError('unable to use any parser in %s: %s' % \
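
Switching to `except ImportError as e` is more than style: the comma form is a SyntaxError on Python 3, while the `as` form parses on Python 2.6+ and 3.x alike, which matters for a codebase whose master branch is Python 3. A minimal illustration:

    try:
        import lxml.html
    except ImportError as e:  # 'except ImportError, e:' would not parse on Python 3
        lxml_error, lxml = e, None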
@@ -786,10 +787,10 @@ class Extractor(object):

     def __repr__(self):
         """String representation of an Extractor object."""
-        r = '<Extractor id:%s (label=%s, path=%s, attrs=%s, group=%s, ' \
-            'group_key=%s group_key_normalize=%s)>' % (id(self),
-            self.label, self.path, repr(self.attrs), self.group,
-            self.group_key, self.group_key_normalize)
+        t = '<Extractor id:%s (label=%s, path=%s, attrs=%s, group=%s, group_key=%s' + \
+            ', group_key_normalize=%s)>'
+        r = t % (id(self), self.label, self.path, repr(self.attrs), self.group,
+                 self.group_key, self.group_key_normalize)
         return r
@@ -825,7 +826,7 @@ def _parse_ref(text, link, info):
     yearK = re_yearKind_index.match(info)
     if yearK and yearK.start() == 0:
         text += ' %s' % info[:yearK.end()]
-    return (text.replace('\n', ' '), link)
+    return text.replace('\n', ' '), link


 class GatherRefs(DOMParserBase):

View file

@@ -687,7 +687,7 @@ class IMDbSqlAccessSystem(IMDbBase):
         elif isinstance(o, dict):
             for value in o.values():
                 self._findRefs(value, trefs, nrefs)
-        return (trefs, nrefs)
+        return trefs, nrefs

     def _extractRefs(self, o):
         """Scan for titles or names references in strings."""
@@ -702,7 +702,7 @@ class IMDbSqlAccessSystem(IMDbBase):
                     "imdb.parser.sql.IMDbSqlAccessSystem; "
                     "if it's not a recursion limit exceeded and we're not "
                     "running in a Symbian environment, it's a bug:\n%s" % e)
-        return (trefs, nrefs)
+        return trefs, nrefs

     def _changeAKAencoding(self, akanotes, akatitle):
         """Return akatitle in the correct charset, as specified in

View file

@@ -437,11 +437,13 @@ def ISNULL(x):
     """Emulate SQLObject's ISNULL."""
     # XXX: Should we use null()? Can null() be a global instance?
     # XXX: Is it safe to test None with the == operator, in this case?
-    return x == None
+    return x is None
+

 def ISNOTNULL(x):
     """Emulate SQLObject's ISNOTNULL."""
-    return x != None
+    return x is not None
+

 def CONTAINSSTRING(expr, pattern):
     """Emulate SQLObject's CONTAINSSTRING."""

View file

@@ -122,53 +122,80 @@ class DBTable(object):

 # Default values to insert in some tables: {'column': (list, of, values, ...)}
-kindTypeDefs = {'kind': ('movie', 'tv series', 'tv movie', 'video movie',
-                        'tv mini series', 'video game', 'episode')}
-companyTypeDefs = {'kind': ('distributors', 'production companies',
-                        'special effects companies', 'miscellaneous companies')}
-infoTypeDefs = {'info': ('runtimes', 'color info', 'genres', 'languages',
-    'certificates', 'sound mix', 'tech info', 'countries', 'taglines',
-    'keywords', 'alternate versions', 'crazy credits', 'goofs',
-    'soundtrack', 'quotes', 'release dates', 'trivia', 'locations',
-    'mini biography', 'birth notes', 'birth date', 'height',
-    'death date', 'spouse', 'other works', 'birth name',
-    'salary history', 'nick names', 'books', 'agent address',
-    'biographical movies', 'portrayed in', 'where now', 'trade mark',
-    'interviews', 'article', 'magazine cover photo', 'pictorial',
-    'death notes', 'LD disc format', 'LD year', 'LD digital sound',
-    'LD official retail price', 'LD frequency response', 'LD pressing plant',
-    'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date',
-    'LD production country', 'LD contrast', 'LD color rendition',
-    'LD picture format', 'LD video noise', 'LD video artifacts',
-    'LD release country', 'LD sharpness', 'LD dynamic range',
-    'LD audio noise', 'LD color information', 'LD group genre',
-    'LD quality program', 'LD close captions-teletext-ld-g',
-    'LD category', 'LD analog left', 'LD certification',
-    'LD audio quality', 'LD video quality', 'LD aspect ratio',
-    'LD analog right', 'LD additional information',
-    'LD number of chapter stops', 'LD dialogue intellegibility',
-    'LD disc size', 'LD master format', 'LD subtitles',
-    'LD status of availablility', 'LD quality of source',
-    'LD number of sides', 'LD video standard', 'LD supplement',
-    'LD original title', 'LD sound encoding', 'LD number', 'LD label',
-    'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay',
-    'novel', 'adaption', 'book', 'production process protocol',
-    'printed media reviews', 'essays', 'other literature', 'mpaa',
-    'plot', 'votes distribution', 'votes', 'rating',
-    'production dates', 'copyright holder', 'filming dates', 'budget',
-    'weekend gross', 'gross', 'opening weekend', 'rentals',
-    'admissions', 'studios', 'top 250 rank', 'bottom 10 rank')}
-compCastTypeDefs = {'kind': ('cast', 'crew', 'complete', 'complete+verified')}
-linkTypeDefs = {'link': ('follows', 'followed by', 'remake of', 'remade as',
-                        'references', 'referenced in', 'spoofs', 'spoofed in',
-                        'features', 'featured in', 'spin off from', 'spin off',
-                        'version of', 'similar to', 'edited into',
-                        'edited from', 'alternate language version of',
-                        'unknown link')}
-roleTypeDefs = {'role': ('actor', 'actress', 'producer', 'writer',
-                        'cinematographer', 'composer', 'costume designer',
-                        'director', 'editor', 'miscellaneous crew',
-                        'production designer', 'guest')}
+kindTypeDefs = {
+    'kind': (
+        'movie', 'tv series', 'tv movie', 'video movie',
+        'tv mini series', 'video game', 'episode', 'short', 'tv short'
+    )
+}
+
+companyTypeDefs = {
+    'kind': (
+        'distributors', 'production companies',
+        'special effects companies', 'miscellaneous companies'
+    )
+}
+
+infoTypeDefs = {
+    'info': (
+        'runtimes', 'color info', 'genres', 'languages',
+        'certificates', 'sound mix', 'tech info', 'countries', 'taglines',
+        'keywords', 'alternate versions', 'crazy credits', 'goofs',
+        'soundtrack', 'quotes', 'release dates', 'trivia', 'locations',
+        'mini biography', 'birth notes', 'birth date', 'height',
+        'death date', 'spouse', 'other works', 'birth name',
+        'salary history', 'nick names', 'books', 'agent address',
+        'biographical movies', 'portrayed in', 'where now', 'trade mark',
+        'interviews', 'article', 'magazine cover photo', 'pictorial',
+        'death notes', 'LD disc format', 'LD year', 'LD digital sound',
+        'LD official retail price', 'LD frequency response', 'LD pressing plant',
+        'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date',
+        'LD production country', 'LD contrast', 'LD color rendition',
+        'LD picture format', 'LD video noise', 'LD video artifacts',
+        'LD release country', 'LD sharpness', 'LD dynamic range',
+        'LD audio noise', 'LD color information', 'LD group genre',
+        'LD quality program', 'LD close captions-teletext-ld-g',
+        'LD category', 'LD analog left', 'LD certification',
+        'LD audio quality', 'LD video quality', 'LD aspect ratio',
+        'LD analog right', 'LD additional information',
+        'LD number of chapter stops', 'LD dialogue intellegibility',
+        'LD disc size', 'LD master format', 'LD subtitles',
+        'LD status of availablility', 'LD quality of source',
+        'LD number of sides', 'LD video standard', 'LD supplement',
+        'LD original title', 'LD sound encoding', 'LD number', 'LD label',
+        'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay',
+        'novel', 'adaption', 'book', 'production process protocol',
+        'printed media reviews', 'essays', 'other literature', 'mpaa',
+        'plot', 'votes distribution', 'votes', 'rating',
+        'production dates', 'copyright holder', 'filming dates', 'budget',
+        'weekend gross', 'gross', 'opening weekend', 'rentals',
+        'admissions', 'studios', 'top 250 rank', 'bottom 10 rank'
+    )
+}
+
+compCastTypeDefs = {
+    'kind': ('cast', 'crew', 'complete', 'complete+verified')
+}
+
+linkTypeDefs = {
+    'link': (
+        'follows', 'followed by', 'remake of', 'remade as',
+        'references', 'referenced in', 'spoofs', 'spoofed in',
+        'features', 'featured in', 'spin off from', 'spin off',
+        'version of', 'similar to', 'edited into',
+        'edited from', 'alternate language version of',
+        'unknown link'
+    )
+}
+
+roleTypeDefs = {
+    'role': (
+        'actor', 'actress', 'producer', 'writer',
+        'cinematographer', 'composer', 'costume designer',
+        'director', 'editor', 'miscellaneous crew',
+        'production designer', 'guest'
+    )
+}

 # Schema of tables in our database.
 # XXX: Foreign keys can be used to create constrains between tables,
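
These *TypeDefs mappings are the seed rows for the small lookup tables (KindType, CompanyType, InfoType, ...); note that the kind list now also carries 'short' and 'tv short'. A rough sketch of how a {'column': (values, ...)} default expands into rows; the helper below is hypothetical, for illustration only, and not the module's actual insert logic:

    def rows_from_defs(defs):
        """Expand {'column': (v1, v2, ...)} into [{'column': v1}, ...]."""
        (column, values), = defs.items()
        return [{column: value} for value in values]

    # rows_from_defs(kindTypeDefs)[:2] -> [{'kind': 'movie'}, {'kind': 'tv series'}]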
@@ -186,7 +213,7 @@ DB_SCHEMA = [
     # the alternateID attribute here will be ignored by SQLAlchemy.
     DBCol('id', INTCOL, notNone=True, alternateID=True),
     DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
-    DBCol('imdbIndex', UNICODECOL, length=12, default=None),
+    DBCol('imdbIndex', STRINGCOL, length=12, default=None),
     DBCol('imdbID', INTCOL, default=None, index='idx_imdb_id'),
     DBCol('gender', STRINGCOL, length=1, default=None),
     DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
@@ -204,7 +231,7 @@ DB_SCHEMA = [
     # from namePcodeNf.
     DBCol('id', INTCOL, notNone=True, alternateID=True),
     DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
-    DBCol('imdbIndex', UNICODECOL, length=12, default=None),
+    DBCol('imdbIndex', STRINGCOL, length=12, default=None),
     DBCol('imdbID', INTCOL, default=None),
     DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
           index='idx_pcodenf'),
@@ -218,7 +245,7 @@ DB_SCHEMA = [
     # namePcodeSf is the soundex of the name plus the country code.
     DBCol('id', INTCOL, notNone=True, alternateID=True),
     DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
-    DBCol('countryCode', UNICODECOL, length=255, default=None),
+    DBCol('countryCode', STRINGCOL, length=255, default=None),
     DBCol('imdbID', INTCOL, default=None),
     DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
           index='idx_pcodenf'),
@@ -237,7 +264,7 @@ DB_SCHEMA = [
     DBCol('id', INTCOL, notNone=True, alternateID=True),
     DBCol('title', UNICODECOL, notNone=True,
           index='idx_title', indexLen=10),
-    DBCol('imdbIndex', UNICODECOL, length=12, default=None),
+    DBCol('imdbIndex', STRINGCOL, length=12, default=None),
     DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
     DBCol('productionYear', INTCOL, default=None),
     DBCol('imdbID', INTCOL, default=None, index="idx_imdb_id"),
@@ -264,7 +291,7 @@ DB_SCHEMA = [
     DBCol('personID', INTCOL, notNone=True, index='idx_person',
           foreignKey='Name'),
     DBCol('name', UNICODECOL, notNone=True),
-    DBCol('imdbIndex', UNICODECOL, length=12, default=None),
+    DBCol('imdbIndex', STRINGCOL, length=12, default=None),
     DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
           index='idx_pcodecf'),
     DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
@@ -291,7 +318,7 @@ DB_SCHEMA = [
     DBCol('movieID', INTCOL, notNone=True, index='idx_movieid',
           foreignKey='Title'),
     DBCol('title', UNICODECOL, notNone=True),
-    DBCol('imdbIndex', UNICODECOL, length=12, default=None),
+    DBCol('imdbIndex', STRINGCOL, length=12, default=None),
     DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
     DBCol('productionYear', INTCOL, default=None),
     DBCol('phoneticCode', STRINGCOL, length=5, default=None,

View file

@@ -42,8 +42,22 @@ _utils_logger = logging.getLogger('imdbpy.utils')
 # and year of release.
 # XXX: probably L, C, D and M are far too much! ;-)
 re_year_index = re.compile(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
-re_extended_year_index = re.compile(r'\((TV episode|TV Series|TV mini-series|TV|Video|Video Game)? ?((?:[0-9\?]{4})(?:-[0-9\?]{4})?)(?:/([IVXLCDM]+)?)?\)')
-re_remove_kind = re.compile(r'\((TV episode|TV Series|TV mini-series|TV|Video|Video Game)? ?')
+re_m_episode = re.compile(r'\(TV Episode\)\s+-\s+', re.I)
+re_m_series = re.compile(r'Season\s+\d+\s+\|\s+Episode\s+\d+\s+-', re.I)
+re_m_imdbIndex = re.compile(r'\(([IVXLCDM]+)\)')
+re_m_kind = re.compile(
+    r'\((TV episode|TV Series|TV mini-series|mini|TV|Video|Video Game|VG|Short|TV Movie|TV Short|V)\)',
+    re.I)
+
+KIND_MAP = {
+    'tv': 'tv movie',
+    'tv episode': 'episode',
+    'v': 'video movie',
+    'video': 'video movie',
+    'vg': 'video game',
+    'mini': 'tv mini series',
+    'tv mini-series': 'tv mini series'
+}

 # Match only the imdbIndex (for name strings).
 re_index = re.compile(r'^\(([IVXLCDM]+)\)$')
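
re_m_kind pulls a parenthesized kind marker out of a title string, and KIND_MAP folds the short or mixed-case variants onto the canonical kind names used elsewhere in the library. Reusing the two definitions above, the normalization step boils down to:

    for sample in ('The Pentagon Papers (2003) (TV)', 'Myst (1993) (VG)'):
        detected = re_m_kind.findall(sample)
        if detected:
            kind = detected[-1].lower().replace('-', '')
            print KIND_MAP.get(kind, kind)
    # prints 'tv movie', then 'video game'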
@@ -283,13 +297,6 @@ def _split_series_episode(title):
             # that means this is an episode title, as returned by
             # the web server.
             series_title = title[:second_quot]
-        ##elif episode_or_year[-1:] == '}':
-        ##    # Title of the episode, as in the plain text data files.
-        ##    begin_eps = episode_or_year.find('{')
-        ##    if begin_eps == -1: return series_title, episode_or_year
-        ##    series_title = title[:second_quot+begin_eps].rstrip()
-        ##    # episode_or_year is returned with the {...}
-        ##    episode_or_year = episode_or_year[begin_eps:]
     return series_title, episode_or_year
@@ -383,65 +390,24 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
     # tv mini series: 5,497
     # video game: 5,490
     # More up-to-date statistics: http://us.imdb.com/database_statistics
-    if title.endswith('(TV)'):
-        kind = u'tv movie'
-        title = title[:-4].rstrip()
-    elif title.endswith('(TV Movie)'):
-        kind = u'tv movie'
-        title = title[:-10].rstrip()
-    elif title.endswith('(V)'):
-        kind = u'video movie'
-        title = title[:-3].rstrip()
-    elif title.lower().endswith('(video)'):
-        kind = u'video movie'
-        title = title[:-7].rstrip()
-    elif title.endswith('(TV Short)'):
-        kind = u'tv short'
-        title = title[:-10].rstrip()
-    elif title.endswith('(TV Mini-Series)'):
-        kind = u'tv mini series'
-        title = title[:-16].rstrip()
-    elif title.endswith('(mini)'):
-        kind = u'tv mini series'
-        title = title[:-6].rstrip()
-    elif title.endswith('(VG)'):
-        kind = u'video game'
-        title = title[:-4].rstrip()
-    elif title.endswith('(Video Game)'):
-        kind = u'video game'
-        title = title[:-12].rstrip()
-    elif title.endswith('(TV Series)'):
-        epindex = title.find('(TV Episode) - ')
-        if epindex >= 0:
-            # It's an episode of a series.
-            kind = u'episode'
-            series_info = analyze_title(title[epindex + 15:])
-            result['episode of'] = series_info.get('title')
-            result['series year'] = series_info.get('year')
-            title = title[:epindex]
-        else:
-            kind = u'tv series'
-            title = title[:-11].rstrip()
+    epindex = re_m_episode.search(title)
+    if epindex:
+        # It's an episode of a series.
+        kind = 'episode'
+        series_title = title[epindex.end():]
+        series_title = re_m_series.sub('', series_title)
+        series_info = analyze_title(series_title)
+        result['episode of'] = series_info.get('title')
+        result['series year'] = series_info.get('year')
+        title = title[:epindex.start()].strip()
+    else:
+        detected_kind = re_m_kind.findall(title)
+        if detected_kind:
+            kind = detected_kind[-1].lower().replace('-', '')
+            kind = KIND_MAP.get(kind, kind)
+            title = re_m_kind.sub('', title).strip()
     # Search for the year and the optional imdbIndex (a roman number).
     yi = re_year_index.findall(title)
-    if not yi:
-        yi = re_extended_year_index.findall(title)
-        if yi:
-            yk, yiy, yii = yi[-1]
-            yi = [(yiy, yii)]
-            if yk == 'TV episode':
-                kind = u'episode'
-            elif yk in ('TV', 'TV Movie'):
-                kind = u'tv movie'
-            elif yk == 'TV Series':
-                kind = u'tv series'
-            elif yk == 'Video':
-                kind = u'video movie'
-            elif yk in ('TV mini-series', 'TV Mini-Series'):
-                kind = u'tv mini series'
-            elif yk == 'Video Game':
-                kind = u'video game'
-            title = re_remove_kind.sub('(', title)
     if yi:
         last_yi = yi[-1]
         year = last_yi[0]
@@ -450,7 +416,12 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
             year = year[:-len(imdbIndex)-1]
         i = title.rfind('(%s)' % last_yi[0])
         if i != -1:
-            title = title[:i-1].rstrip()
+            title = title[:i - 1].rstrip()
+    if not imdbIndex:
+        detect_imdbIndex = re_m_imdbIndex.findall(title)
+        if detect_imdbIndex:
+            imdbIndex = detect_imdbIndex[-1]
+            title = re_m_imdbIndex.sub('', title).strip()
     # This is a tv (mini) series: strip the '"' at the begin and at the end.
     # XXX: strip('"') is not used for compatibility with Python 2.0.
     if title and title[0] == title[-1] == '"':
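
The rewritten flow first splits '(TV Episode) - ... Season N | Episode M - ...' strings into episode and series halves, then falls back to the generic re_m_kind marker, and finally recovers a roman-numeral imdbIndex that no longer sits inside the year parentheses. For a plain title the long-standing contract still holds; roughly (exact value types may differ by version):

    from imdb.utils import analyze_title

    analyze_title('The Untouchables (1987)')
    # roughly: {'title': 'The Untouchables', 'kind': 'movie', 'year': 1987}

    analyze_title('Twin Peaks (1990) (TV Series)')
    # the (TV Series) marker now drives the kind -> 'tv series'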
@@ -464,8 +435,6 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
         title = canonicalTitle(title)
     else:
         title = normalizeTitle(title)
-    # 'kind' is one in ('movie', 'episode', 'tv series', 'tv mini series',
-    # 'tv movie', 'video movie', 'video game')
     result['title'] = title
     result['kind'] = kind or u'movie'
     if year and year != '????':
@@ -832,7 +801,7 @@ def date_and_notes(s):
     """Parse (birth|death) date and notes; returns a tuple in the
     form (date, notes)."""
     s = s.strip()
-    if not s: return (u'', u'')
+    if not s: return u'', u''
     notes = u''
     if s[0].isdigit() or s.split()[0].lower() in ('c.', 'january', 'february',
                                     'march', 'april', 'may', 'june',
@@ -990,7 +959,7 @@ def _tag4TON(ton, addAccessSystem=False, _containerOnly=False):
         beginTag += extras
     if ton.notes:
         beginTag += u'<notes>%s</notes>' % _normalizeValue(ton.notes)
-    return (beginTag, u'</%s>' % tag)
+    return beginTag, u'</%s>' % tag


 TAGS_TO_MODIFY = {
@@ -1264,8 +1233,8 @@ class _Container(object):
         self.__role = role

     currentRole = property(_get_currentRole, _set_currentRole,
-                           doc="The role of a Person in a Movie" + \
-                           " or the interpreter of a Character in a Movie.")
+                           doc="The role of a Person in a Movie"
+                               " or the interpreter of a Character in a Movie.")

     def _init(self, **kwds): pass
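
The doc-string tweak above swaps an explicit `+ \` continuation for implicit concatenation of adjacent string literals, which the compiler joins into one constant; the resulting string is identical. For example:

    doc = ("The role of a Person in a Movie"
           " or the interpreter of a Character in a Movie.")
    assert doc == ("The role of a Person in a Movie or the interpreter "
                   "of a Character in a Movie.")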
@@ -1478,10 +1447,10 @@ class _Container(object):
         except RuntimeError, e:
             # Symbian/python 2.2 has a poor regexp implementation.
             import warnings
-            warnings.warn('RuntimeError in '
-                    "imdb.utils._Container.__getitem__; if it's not "
-                    "a recursion limit exceeded and we're not running "
-                    "in a Symbian environment, it's a bug:\n%s" % e)
+            warnings.warn("RuntimeError in imdb.utils._Container.__getitem__;"
+                          " if it's not a recursion limit exceeded and we're"
+                          " not running in a Symbian environment, it's a"
+                          " bug:\n%s" % e)
         return rawData

     def __setitem__(self, key, item):