mirror of
https://github.com/SickGear/SickGear.git
synced 2025-01-22 09:33:37 +00:00
Merge branch 'feature/UpdateIMDb' into develop
This commit is contained in:
commit
655b8e422a
24 changed files with 1992 additions and 1184 deletions
|
@ -7,6 +7,7 @@
|
||||||
* Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed
|
* Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed
|
||||||
* Change Hachoir can't support PY2 so backport their PY3 to prevent a need for system dependant external binaries like mediainfo
|
* Change Hachoir can't support PY2 so backport their PY3 to prevent a need for system dependant external binaries like mediainfo
|
||||||
* Update html5lib 0.99999999/1.0b9 (1a28d72) to 1.1-dev (e9ef538)
|
* Update html5lib 0.99999999/1.0b9 (1a28d72) to 1.1-dev (e9ef538)
|
||||||
|
* Update IMDb 5.1 (r907) to 5.2.1dev20171113 (f640595)
|
||||||
|
|
||||||
[develop changelog]
|
[develop changelog]
|
||||||
|
|
||||||
|
|
|
@ -23,8 +23,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
|
||||||
from imdb.utils import analyze_company_name, build_company_name, \
|
from imdb.utils import _Container
|
||||||
flatten, _Container, cmpCompanies
|
from imdb.utils import analyze_company_name, build_company_name, cmpCompanies, flatten
|
||||||
|
|
||||||
|
|
||||||
class Company(_Container):
|
class Company(_Container):
|
||||||
|
|
|
@ -24,8 +24,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
|
||||||
from imdb import linguistics
|
from imdb import linguistics
|
||||||
from imdb.utils import analyze_title, build_title, canonicalTitle, \
|
from imdb.utils import _Container
|
||||||
flatten, _Container, cmpMovies
|
from imdb.utils import analyze_title, build_title, canonicalTitle, cmpMovies, flatten
|
||||||
|
|
||||||
|
|
||||||
class Movie(_Container):
|
class Movie(_Container):
|
||||||
|
|
|
@ -6,7 +6,7 @@ a person from the IMDb database.
|
||||||
It can fetch data through different media (e.g.: the IMDb web pages,
|
It can fetch data through different media (e.g.: the IMDb web pages,
|
||||||
a SQL database, etc.)
|
a SQL database, etc.)
|
||||||
|
|
||||||
Copyright 2004-2016 Davide Alberani <da@erlug.linux.it>
|
Copyright 2004-2018 Davide Alberani <da@erlug.linux.it>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
|
@ -25,12 +25,25 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
|
||||||
__all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
|
__all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
|
||||||
'available_access_systems']
|
'available_access_systems']
|
||||||
__version__ = VERSION = '5.1'
|
__version__ = VERSION = '5.2.1dev20171113'
|
||||||
|
|
||||||
|
VERSION_NOTICE = """This is the imdbpy-legacy branch of IMDbPY, and requires Python 2.
|
||||||
|
Please notice that this version is mostly unsupported.
|
||||||
|
|
||||||
|
For a version compatible with Python 3, see the master branch:
|
||||||
|
https://github.com/alberanid/imdbpy/
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
if sys.hexversion >= 0x3000000:
|
||||||
|
print(VERSION_NOTICE)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
# Import compatibility module (importing it is enough).
|
# Import compatibility module (importing it is enough).
|
||||||
import _compat
|
import _compat
|
||||||
|
|
||||||
import sys, os, ConfigParser, logging
|
import os, ConfigParser, logging
|
||||||
from types import MethodType
|
from types import MethodType
|
||||||
|
|
||||||
from imdb import Movie, Person, Character, Company
|
from imdb import Movie, Person, Character, Company
|
||||||
|
@ -38,38 +51,39 @@ import imdb._logging
|
||||||
from imdb._exceptions import IMDbError, IMDbDataAccessError, IMDbParserError
|
from imdb._exceptions import IMDbError, IMDbDataAccessError, IMDbParserError
|
||||||
from imdb.utils import build_title, build_name, build_company_name
|
from imdb.utils import build_title, build_name, build_company_name
|
||||||
|
|
||||||
|
_imdb_logger = logging.getLogger('imdbpy')
|
||||||
_aux_logger = logging.getLogger('imdbpy.aux')
|
_aux_logger = logging.getLogger('imdbpy.aux')
|
||||||
|
|
||||||
|
|
||||||
# URLs of the main pages for movies, persons, characters and queries.
|
# URLs of the main pages for movies, persons, characters and queries.
|
||||||
imdbURL_base = 'http://akas.imdb.com/'
|
imdbURL_base = 'http://www.imdb.com/'
|
||||||
|
|
||||||
# NOTE: the urls below will be removed in a future version.
|
# NOTE: the urls below will be removed in a future version.
|
||||||
# please use the values in the 'urls' attribute
|
# please use the values in the 'urls' attribute
|
||||||
# of the IMDbBase subclass instance.
|
# of the IMDbBase subclass instance.
|
||||||
# http://akas.imdb.com/title/
|
# http://www.imdb.com/title/
|
||||||
imdbURL_movie_base = '%stitle/' % imdbURL_base
|
imdbURL_movie_base = '%stitle/' % imdbURL_base
|
||||||
# http://akas.imdb.com/title/tt%s/
|
# http://www.imdb.com/title/tt%s/
|
||||||
imdbURL_movie_main = imdbURL_movie_base + 'tt%s/'
|
imdbURL_movie_main = imdbURL_movie_base + 'tt%s/'
|
||||||
# http://akas.imdb.com/name/
|
# http://www.imdb.com/name/
|
||||||
imdbURL_person_base = '%sname/' % imdbURL_base
|
imdbURL_person_base = '%sname/' % imdbURL_base
|
||||||
# http://akas.imdb.com/name/nm%s/
|
# http://www.imdb.com/name/nm%s/
|
||||||
imdbURL_person_main = imdbURL_person_base + 'nm%s/'
|
imdbURL_person_main = imdbURL_person_base + 'nm%s/'
|
||||||
# http://akas.imdb.com/character/
|
# http://www.imdb.com/character/
|
||||||
imdbURL_character_base = '%scharacter/' % imdbURL_base
|
imdbURL_character_base = '%scharacter/' % imdbURL_base
|
||||||
# http://akas.imdb.com/character/ch%s/
|
# http://www.imdb.com/character/ch%s/
|
||||||
imdbURL_character_main = imdbURL_character_base + 'ch%s/'
|
imdbURL_character_main = imdbURL_character_base + 'ch%s/'
|
||||||
# http://akas.imdb.com/company/
|
# http://www.imdb.com/company/
|
||||||
imdbURL_company_base = '%scompany/' % imdbURL_base
|
imdbURL_company_base = '%scompany/' % imdbURL_base
|
||||||
# http://akas.imdb.com/company/co%s/
|
# http://www.imdb.com/company/co%s/
|
||||||
imdbURL_company_main = imdbURL_company_base + 'co%s/'
|
imdbURL_company_main = imdbURL_company_base + 'co%s/'
|
||||||
# http://akas.imdb.com/keyword/%s/
|
# http://www.imdb.com/keyword/%s/
|
||||||
imdbURL_keyword_main = imdbURL_base + 'keyword/%s/'
|
imdbURL_keyword_main = imdbURL_base + 'keyword/%s/'
|
||||||
# http://akas.imdb.com/chart/top
|
# http://www.imdb.com/chart/top
|
||||||
imdbURL_top250 = imdbURL_base + 'chart/top'
|
imdbURL_top250 = imdbURL_base + 'chart/top'
|
||||||
# http://akas.imdb.com/chart/bottom
|
# http://www.imdb.com/chart/bottom
|
||||||
imdbURL_bottom100 = imdbURL_base + 'chart/bottom'
|
imdbURL_bottom100 = imdbURL_base + 'chart/bottom'
|
||||||
# http://akas.imdb.com/find?%s
|
# http://www.imdb.com/find?%s
|
||||||
imdbURL_find = imdbURL_base + 'find?%s'
|
imdbURL_find = imdbURL_base + 'find?%s'
|
||||||
|
|
||||||
# Name of the configuration file.
|
# Name of the configuration file.
|
||||||
|
@ -103,7 +117,7 @@ class ConfigParserWithCase(ConfigParser.ConfigParser):
|
||||||
try:
|
try:
|
||||||
self.read(fname)
|
self.read(fname)
|
||||||
except (ConfigParser.MissingSectionHeaderError,
|
except (ConfigParser.MissingSectionHeaderError,
|
||||||
ConfigParser.ParsingError), e:
|
ConfigParser.ParsingError) as e:
|
||||||
_aux_logger.warn('Troubles reading config file: %s' % e)
|
_aux_logger.warn('Troubles reading config file: %s' % e)
|
||||||
# Stop at the first valid file.
|
# Stop at the first valid file.
|
||||||
if self.has_section('imdbpy'):
|
if self.has_section('imdbpy'):
|
||||||
|
@ -159,10 +173,8 @@ def IMDb(accessSystem=None, *arguments, **keywords):
|
||||||
accessSystem = 'http'
|
accessSystem = 'http'
|
||||||
kwds.update(keywords)
|
kwds.update(keywords)
|
||||||
keywords = kwds
|
keywords = kwds
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
import logging
|
_imdb_logger.warn('Unable to read configuration file; complete error: %s' % e)
|
||||||
logging.getLogger('imdbpy').warn('Unable to read configuration' \
|
|
||||||
' file; complete error: %s' % e)
|
|
||||||
# It just LOOKS LIKE a bad habit: we tried to read config
|
# It just LOOKS LIKE a bad habit: we tried to read config
|
||||||
# options from some files, but something is gone horribly
|
# options from some files, but something is gone horribly
|
||||||
# wrong: ignore everything and pretend we were called with
|
# wrong: ignore everything and pretend we were called with
|
||||||
|
@ -177,9 +189,8 @@ def IMDb(accessSystem=None, *arguments, **keywords):
|
||||||
try:
|
try:
|
||||||
import logging.config
|
import logging.config
|
||||||
logging.config.fileConfig(os.path.expanduser(logCfg))
|
logging.config.fileConfig(os.path.expanduser(logCfg))
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
logging.getLogger('imdbpy').warn('unable to read logger ' \
|
_imdb_logger.warn('unable to read logger config: %s' % e)
|
||||||
'config: %s' % e)
|
|
||||||
if accessSystem in ('httpThin', 'webThin', 'htmlThin'):
|
if accessSystem in ('httpThin', 'webThin', 'htmlThin'):
|
||||||
logging.warn('httpThin was removed since IMDbPY 4.8')
|
logging.warn('httpThin was removed since IMDbPY 4.8')
|
||||||
accessSystem = 'http'
|
accessSystem = 'http'
|
||||||
|
@ -244,9 +255,6 @@ class IMDbBase:
|
||||||
# in the subclasses).
|
# in the subclasses).
|
||||||
accessSystem = 'UNKNOWN'
|
accessSystem = 'UNKNOWN'
|
||||||
|
|
||||||
# Top-level logger for IMDbPY.
|
|
||||||
_imdb_logger = logging.getLogger('imdbpy')
|
|
||||||
|
|
||||||
# Whether to re-raise caught exceptions or not.
|
# Whether to re-raise caught exceptions or not.
|
||||||
_reraise_exceptions = False
|
_reraise_exceptions = False
|
||||||
|
|
||||||
|
@ -285,30 +293,30 @@ class IMDbBase:
|
||||||
imdbURL_base = 'http://%s' % imdbURL_base
|
imdbURL_base = 'http://%s' % imdbURL_base
|
||||||
if not imdbURL_base.endswith('/'):
|
if not imdbURL_base.endswith('/'):
|
||||||
imdbURL_base = '%s/' % imdbURL_base
|
imdbURL_base = '%s/' % imdbURL_base
|
||||||
# http://akas.imdb.com/title/
|
# http://www.imdb.com/title/
|
||||||
imdbURL_movie_base='%stitle/' % imdbURL_base
|
imdbURL_movie_base = '%stitle/' % imdbURL_base
|
||||||
# http://akas.imdb.com/title/tt%s/
|
# http://www.imdb.com/title/tt%s/
|
||||||
imdbURL_movie_main=imdbURL_movie_base + 'tt%s/'
|
imdbURL_movie_main = imdbURL_movie_base + 'tt%s/'
|
||||||
# http://akas.imdb.com/name/
|
# http://www.imdb.com/name/
|
||||||
imdbURL_person_base='%sname/' % imdbURL_base
|
imdbURL_person_base = '%sname/' % imdbURL_base
|
||||||
# http://akas.imdb.com/name/nm%s/
|
# http://www.imdb.com/name/nm%s/
|
||||||
imdbURL_person_main=imdbURL_person_base + 'nm%s/'
|
imdbURL_person_main = imdbURL_person_base + 'nm%s/'
|
||||||
# http://akas.imdb.com/character/
|
# http://www.imdb.com/character/
|
||||||
imdbURL_character_base='%scharacter/' % imdbURL_base
|
imdbURL_character_base = '%scharacter/' % imdbURL_base
|
||||||
# http://akas.imdb.com/character/ch%s/
|
# http://www.imdb.com/character/ch%s/
|
||||||
imdbURL_character_main=imdbURL_character_base + 'ch%s/'
|
imdbURL_character_main = imdbURL_character_base + 'ch%s/'
|
||||||
# http://akas.imdb.com/company/
|
# http://www.imdb.com/company/
|
||||||
imdbURL_company_base='%scompany/' % imdbURL_base
|
imdbURL_company_base = '%scompany/' % imdbURL_base
|
||||||
# http://akas.imdb.com/company/co%s/
|
# http://www.imdb.com/company/co%s/
|
||||||
imdbURL_company_main=imdbURL_company_base + 'co%s/'
|
imdbURL_company_main = imdbURL_company_base + 'co%s/'
|
||||||
# http://akas.imdb.com/keyword/%s/
|
# http://www.imdb.com/keyword/%s/
|
||||||
imdbURL_keyword_main=imdbURL_base + 'keyword/%s/'
|
imdbURL_keyword_main = imdbURL_base + 'keyword/%s/'
|
||||||
# http://akas.imdb.com/chart/top
|
# http://www.imdb.com/chart/top
|
||||||
imdbURL_top250=imdbURL_base + 'chart/top'
|
imdbURL_top250 = imdbURL_base + 'chart/top'
|
||||||
# http://akas.imdb.com/chart/bottom
|
# http://www.imdb.com/chart/bottom
|
||||||
imdbURL_bottom100=imdbURL_base + 'chart/bottom'
|
imdbURL_bottom100 = imdbURL_base + 'chart/bottom'
|
||||||
# http://akas.imdb.com/find?%s
|
# http://www.imdb.com/find?%s
|
||||||
imdbURL_find=imdbURL_base + 'find?%s'
|
imdbURL_find = imdbURL_base + 'find?%s'
|
||||||
self.urls = dict(
|
self.urls = dict(
|
||||||
movie_base=imdbURL_movie_base,
|
movie_base=imdbURL_movie_base,
|
||||||
movie_main=imdbURL_movie_main,
|
movie_main=imdbURL_movie_main,
|
||||||
|
@ -727,16 +735,15 @@ class IMDbBase:
|
||||||
mopID = mop.companyID
|
mopID = mop.companyID
|
||||||
prefix = 'company'
|
prefix = 'company'
|
||||||
else:
|
else:
|
||||||
raise IMDbError('object ' + repr(mop) + \
|
raise IMDbError('object ' + repr(mop) +
|
||||||
' is not a Movie, Person, Character or Company instance')
|
' is not a Movie, Person, Character or Company instance')
|
||||||
if mopID is None:
|
if mopID is None:
|
||||||
# XXX: enough? It's obvious that there are Characters
|
# XXX: enough? It's obvious that there are Characters
|
||||||
# objects without characterID, so I think they should
|
# objects without characterID, so I think they should
|
||||||
# just do nothing, when an i.update(character) is tried.
|
# just do nothing, when an i.update(character) is tried.
|
||||||
if prefix == 'character':
|
if prefix == 'character':
|
||||||
return
|
return
|
||||||
raise IMDbDataAccessError( \
|
raise IMDbDataAccessError('supplied object has null movieID, personID or companyID')
|
||||||
'the supplied object has null movieID, personID or companyID')
|
|
||||||
if mop.accessSystem == self.accessSystem:
|
if mop.accessSystem == self.accessSystem:
|
||||||
aSystem = self
|
aSystem = self
|
||||||
else:
|
else:
|
||||||
|
@ -760,21 +767,22 @@ class IMDbBase:
|
||||||
continue
|
continue
|
||||||
if not i:
|
if not i:
|
||||||
continue
|
continue
|
||||||
self._imdb_logger.debug('retrieving "%s" info set', i)
|
_imdb_logger.debug('retrieving "%s" info set', i)
|
||||||
try:
|
try:
|
||||||
method = getattr(aSystem, 'get_%s_%s' %
|
method = getattr(aSystem, 'get_%s_%s' %
|
||||||
(prefix, i.replace(' ', '_')))
|
(prefix, i.replace(' ', '_')))
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
self._imdb_logger.error('unknown information set "%s"', i)
|
_imdb_logger.error('unknown information set "%s"', i)
|
||||||
# Keeps going.
|
# Keeps going.
|
||||||
method = lambda *x: {}
|
method = lambda *x: {}
|
||||||
try:
|
try:
|
||||||
ret = method(mopID)
|
ret = method(mopID)
|
||||||
except Exception, e:
|
except Exception:
|
||||||
self._imdb_logger.critical('caught an exception retrieving ' \
|
_imdb_logger.critical(
|
||||||
'or parsing "%s" info set for mopID ' \
|
'caught an exception retrieving or parsing "%s" info set'
|
||||||
'"%s" (accessSystem: %s)',
|
' for mopID "%s" (accessSystem: %s)',
|
||||||
i, mopID, mop.accessSystem, exc_info=True)
|
i, mopID, mop.accessSystem, exc_info=True
|
||||||
|
)
|
||||||
ret = {}
|
ret = {}
|
||||||
# If requested by the user, reraise the exception.
|
# If requested by the user, reraise the exception.
|
||||||
if self._reraise_exceptions:
|
if self._reraise_exceptions:
|
||||||
|
@ -826,9 +834,7 @@ class IMDbBase:
|
||||||
raise NotImplementedError('override this method')
|
raise NotImplementedError('override this method')
|
||||||
|
|
||||||
def _searchIMDb(self, kind, ton, title_kind=None):
|
def _searchIMDb(self, kind, ton, title_kind=None):
|
||||||
"""Search the IMDb akas server for the given title or name."""
|
"""Search the IMDb www server for the given title or name."""
|
||||||
# The Exact Primary search system has gone AWOL, so we resort
|
|
||||||
# to the mobile search. :-/
|
|
||||||
if not ton:
|
if not ton:
|
||||||
return None
|
return None
|
||||||
ton = ton.strip('"')
|
ton = ton.strip('"')
|
||||||
|
@ -935,8 +941,8 @@ class IMDbBase:
|
||||||
else:
|
else:
|
||||||
imdbID = aSystem.company2imdbID(build_company_name(mop))
|
imdbID = aSystem.company2imdbID(build_company_name(mop))
|
||||||
else:
|
else:
|
||||||
raise IMDbError('object ' + repr(mop) + \
|
raise IMDbError('object ' + repr(mop) +
|
||||||
' is not a Movie, Person or Character instance')
|
' is not a Movie, Person or Character instance')
|
||||||
return imdbID
|
return imdbID
|
||||||
|
|
||||||
def get_imdbURL(self, mop):
|
def get_imdbURL(self, mop):
|
||||||
|
@ -954,8 +960,8 @@ class IMDbBase:
|
||||||
elif isinstance(mop, Company.Company):
|
elif isinstance(mop, Company.Company):
|
||||||
url_firstPart = imdbURL_company_main
|
url_firstPart = imdbURL_company_main
|
||||||
else:
|
else:
|
||||||
raise IMDbError('object ' + repr(mop) + \
|
raise IMDbError('object ' + repr(mop) +
|
||||||
' is not a Movie, Person, Character or Company instance')
|
' is not a Movie, Person, Character or Company instance')
|
||||||
return url_firstPart % imdbID
|
return url_firstPart % imdbID
|
||||||
|
|
||||||
def get_special_methods(self):
|
def get_special_methods(self):
|
||||||
|
|
|
@ -32,8 +32,9 @@ LEVELS = {'debug': logging.DEBUG,
|
||||||
|
|
||||||
imdbpyLogger = logging.getLogger('imdbpy')
|
imdbpyLogger = logging.getLogger('imdbpy')
|
||||||
imdbpyStreamHandler = logging.StreamHandler()
|
imdbpyStreamHandler = logging.StreamHandler()
|
||||||
imdbpyFormatter = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]' \
|
imdbpyFormatter = logging.Formatter(
|
||||||
' %(pathname)s:%(lineno)d: %(message)s')
|
'%(asctime)s %(levelname)s [%(name)s] %(pathname)s:%(lineno)d: %(message)s'
|
||||||
|
)
|
||||||
imdbpyStreamHandler.setFormatter(imdbpyFormatter)
|
imdbpyStreamHandler.setFormatter(imdbpyFormatter)
|
||||||
imdbpyLogger.addHandler(imdbpyStreamHandler)
|
imdbpyLogger.addHandler(imdbpyStreamHandler)
|
||||||
|
|
||||||
|
|
|
@ -269,8 +269,8 @@ for k, v in {'lt':u'<','gt':u'>','amp':u'&','quot':u'"','apos':u'\''}.items():
|
||||||
everyentcharrefs[k] = v
|
everyentcharrefs[k] = v
|
||||||
everyentcharrefs['#%s' % ord(v)] = v
|
everyentcharrefs['#%s' % ord(v)] = v
|
||||||
everyentcharrefsget = everyentcharrefs.get
|
everyentcharrefsget = everyentcharrefs.get
|
||||||
re_everyentcharrefs = re.compile('&(%s|\#160|\#\d{1,5});' %
|
re_everyentcharrefs = re.compile('&(%s|\#160|\#\d{1,5});' % '|'.join(map(re.escape,
|
||||||
'|'.join(map(re.escape, everyentcharrefs)))
|
everyentcharrefs)))
|
||||||
re_everyentcharrefssub = re_everyentcharrefs.sub
|
re_everyentcharrefssub = re_everyentcharrefs.sub
|
||||||
|
|
||||||
def _replAllXMLRef(match):
|
def _replAllXMLRef(match):
|
||||||
|
@ -408,7 +408,7 @@ def _valueWithType(tag, tagValue):
|
||||||
|
|
||||||
# Extra tags to get (if values were not already read from title/name).
|
# Extra tags to get (if values were not already read from title/name).
|
||||||
_titleTags = ('imdbindex', 'kind', 'year')
|
_titleTags = ('imdbindex', 'kind', 'year')
|
||||||
_nameTags = ('imdbindex')
|
_nameTags = ('imdbindex',)
|
||||||
_companyTags = ('imdbindex', 'country')
|
_companyTags = ('imdbindex', 'country')
|
||||||
|
|
||||||
def parseTags(tag, _topLevel=True, _as=None, _infoset2keys=None,
|
def parseTags(tag, _topLevel=True, _as=None, _infoset2keys=None,
|
||||||
|
|
|
@ -7,7 +7,7 @@ the imdb.IMDb function will return an instance of this class when
|
||||||
called with the 'accessSystem' argument set to "http" or "web"
|
called with the 'accessSystem' argument set to "http" or "web"
|
||||||
or "html" (this is the default).
|
or "html" (this is the default).
|
||||||
|
|
||||||
Copyright 2004-2012 Davide Alberani <da@erlug.linux.it>
|
Copyright 2004-2017 Davide Alberani <da@erlug.linux.it>
|
||||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
@ -26,6 +26,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
import ssl
|
||||||
import socket
|
import socket
|
||||||
import logging
|
import logging
|
||||||
from urllib import FancyURLopener, quote_plus
|
from urllib import FancyURLopener, quote_plus
|
||||||
|
@ -68,8 +69,8 @@ class _ModuleProxy:
|
||||||
"""Initialize a proxy for the given module; defaultKeys, if set,
|
"""Initialize a proxy for the given module; defaultKeys, if set,
|
||||||
muste be a dictionary of values to set for instanced objects."""
|
muste be a dictionary of values to set for instanced objects."""
|
||||||
if oldParsers or fallBackToNew:
|
if oldParsers or fallBackToNew:
|
||||||
_aux_logger.warn('The old set of parsers was removed; falling ' \
|
_aux_logger.warn('The old set of parsers was removed;'
|
||||||
'back to the new parsers.')
|
' falling back to the new parsers.')
|
||||||
self.useModule = useModule
|
self.useModule = useModule
|
||||||
if defaultKeys is None:
|
if defaultKeys is None:
|
||||||
defaultKeys = {}
|
defaultKeys = {}
|
||||||
|
@ -142,6 +143,7 @@ class IMDbURLopener(FancyURLopener):
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self._last_url = u''
|
self._last_url = u''
|
||||||
|
kwargs['context'] = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
|
||||||
FancyURLopener.__init__(self, *args, **kwargs)
|
FancyURLopener.__init__(self, *args, **kwargs)
|
||||||
# Headers to add to every request.
|
# Headers to add to every request.
|
||||||
# XXX: IMDb's web server doesn't like urllib-based programs,
|
# XXX: IMDb's web server doesn't like urllib-based programs,
|
||||||
|
@ -211,9 +213,9 @@ class IMDbURLopener(FancyURLopener):
|
||||||
if server_encode is None and content:
|
if server_encode is None and content:
|
||||||
begin_h = content.find('text/html; charset=')
|
begin_h = content.find('text/html; charset=')
|
||||||
if begin_h != -1:
|
if begin_h != -1:
|
||||||
end_h = content[19+begin_h:].find('"')
|
end_h = content[19 + begin_h:].find('"')
|
||||||
if end_h != -1:
|
if end_h != -1:
|
||||||
server_encode = content[19+begin_h:19+begin_h+end_h]
|
server_encode = content[19 + begin_h:19 + begin_h + end_h]
|
||||||
if server_encode:
|
if server_encode:
|
||||||
try:
|
try:
|
||||||
if lookup(server_encode):
|
if lookup(server_encode):
|
||||||
|
@ -237,9 +239,10 @@ class IMDbURLopener(FancyURLopener):
|
||||||
if encode is None:
|
if encode is None:
|
||||||
encode = 'latin_1'
|
encode = 'latin_1'
|
||||||
# The detection of the encoding is error prone...
|
# The detection of the encoding is error prone...
|
||||||
self._logger.warn('Unable to detect the encoding of the retrieved '
|
self._logger.warn('Unable to detect the encoding of the retrieved page [%s];'
|
||||||
'page [%s]; falling back to default latin1.', encode)
|
' falling back to default utf8.', encode)
|
||||||
##print unicode(content, encode, 'replace').encode('utf8')
|
if isinstance(content, unicode):
|
||||||
|
return content
|
||||||
return unicode(content, encode, 'replace')
|
return unicode(content, encode, 'replace')
|
||||||
|
|
||||||
def http_error_default(self, url, fp, errcode, errmsg, headers):
|
def http_error_default(self, url, fp, errcode, errmsg, headers):
|
||||||
|
@ -288,8 +291,8 @@ class IMDbHTTPAccessSystem(IMDbBase):
|
||||||
self._getRefs = True
|
self._getRefs = True
|
||||||
self._mdparse = False
|
self._mdparse = False
|
||||||
if isThin:
|
if isThin:
|
||||||
self._http_logger.warn('"httpThin" access system no longer ' +
|
self._http_logger.warn('"httpThin" access system no longer supported;'
|
||||||
'supported; "http" used automatically', exc_info=False)
|
' "http" used automatically', exc_info=False)
|
||||||
self.isThin = 0
|
self.isThin = 0
|
||||||
if self.accessSystem in ('httpThin', 'webThin', 'htmlThin'):
|
if self.accessSystem in ('httpThin', 'webThin', 'htmlThin'):
|
||||||
self.accessSystem = 'http'
|
self.accessSystem = 'http'
|
||||||
|
@ -503,7 +506,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
|
||||||
return self.smProxy.search_movie_parser.parse(cont, results=results)['data']
|
return self.smProxy.search_movie_parser.parse(cont, results=results)['data']
|
||||||
|
|
||||||
def get_movie_main(self, movieID):
|
def get_movie_main(self, movieID):
|
||||||
cont = self._retrieve(self.urls['movie_main'] % movieID + 'combined')
|
cont = self._retrieve(self.urls['movie_main'] % movieID + 'reference')
|
||||||
return self.mProxy.movie_parser.parse(cont, mdparse=self._mdparse)
|
return self.mProxy.movie_parser.parse(cont, mdparse=self._mdparse)
|
||||||
|
|
||||||
def get_movie_full_credits(self, movieID):
|
def get_movie_full_credits(self, movieID):
|
||||||
|
@ -811,7 +814,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
|
||||||
def _search_keyword(self, keyword, results):
|
def _search_keyword(self, keyword, results):
|
||||||
# XXX: the IMDb web server seems to have some serious problem with
|
# XXX: the IMDb web server seems to have some serious problem with
|
||||||
# non-ascii keyword.
|
# non-ascii keyword.
|
||||||
# E.g.: http://akas.imdb.com/keyword/fianc%E9/
|
# E.g.: http://www.imdb.com/keyword/fianc%E9/
|
||||||
# will return a 500 Internal Server Error: Redirect Recursion.
|
# will return a 500 Internal Server Error: Redirect Recursion.
|
||||||
keyword = keyword.encode('utf8', 'ignore')
|
keyword = keyword.encode('utf8', 'ignore')
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -171,7 +171,7 @@ class PageElement:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def _lastRecursiveChild(self):
|
def _lastRecursiveChild(self):
|
||||||
"Finds the last element beneath this object to be parsed."
|
"""Finds the last element beneath this object to be parsed."""
|
||||||
lastChild = self
|
lastChild = self
|
||||||
while hasattr(lastChild, 'contents') and lastChild.contents:
|
while hasattr(lastChild, 'contents') and lastChild.contents:
|
||||||
lastChild = lastChild.contents[-1]
|
lastChild = lastChild.contents[-1]
|
||||||
|
@ -184,7 +184,7 @@ class PageElement:
|
||||||
newChild = NavigableString(newChild)
|
newChild = NavigableString(newChild)
|
||||||
|
|
||||||
position = min(position, len(self.contents))
|
position = min(position, len(self.contents))
|
||||||
if hasattr(newChild, 'parent') and newChild.parent != None:
|
if hasattr(newChild, 'parent') and newChild.parent is not None:
|
||||||
# We're 'inserting' an element that's already one
|
# We're 'inserting' an element that's already one
|
||||||
# of this object's children.
|
# of this object's children.
|
||||||
if newChild.parent == self:
|
if newChild.parent == self:
|
||||||
|
@ -323,7 +323,7 @@ class PageElement:
|
||||||
return r
|
return r
|
||||||
|
|
||||||
def _findAll(self, name, attrs, text, limit, generator, **kwargs):
|
def _findAll(self, name, attrs, text, limit, generator, **kwargs):
|
||||||
"Iterates over a generator looking for things that match."
|
"""Iterates over a generator looking for things that match."""
|
||||||
|
|
||||||
if isinstance(name, SoupStrainer):
|
if isinstance(name, SoupStrainer):
|
||||||
strainer = name
|
strainer = name
|
||||||
|
@ -415,7 +415,7 @@ class NavigableString(unicode, PageElement):
|
||||||
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
|
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
|
||||||
|
|
||||||
def __getnewargs__(self):
|
def __getnewargs__(self):
|
||||||
return (NavigableString.__str__(self),)
|
return NavigableString.__str__(self),
|
||||||
|
|
||||||
def __getattr__(self, attr):
|
def __getattr__(self, attr):
|
||||||
"""text.string gives you text. This is for backwards
|
"""text.string gives you text. This is for backwards
|
||||||
|
@ -460,7 +460,7 @@ class Tag(PageElement):
|
||||||
"""Represents a found HTML tag with its attributes and contents."""
|
"""Represents a found HTML tag with its attributes and contents."""
|
||||||
|
|
||||||
def _invert(h):
|
def _invert(h):
|
||||||
"Cheap function to invert a hash."
|
"""Cheap function to invert a hash."""
|
||||||
i = {}
|
i = {}
|
||||||
for k,v in h.items():
|
for k,v in h.items():
|
||||||
i[v] = k
|
i[v] = k
|
||||||
|
@ -501,14 +501,14 @@ class Tag(PageElement):
|
||||||
|
|
||||||
def __init__(self, parser, name, attrs=None, parent=None,
|
def __init__(self, parser, name, attrs=None, parent=None,
|
||||||
previous=None):
|
previous=None):
|
||||||
"Basic constructor."
|
"""Basic constructor."""
|
||||||
|
|
||||||
# We don't actually store the parser object: that lets extracted
|
# We don't actually store the parser object: that lets extracted
|
||||||
# chunks be garbage-collected
|
# chunks be garbage-collected
|
||||||
self.parserClass = parser.__class__
|
self.parserClass = parser.__class__
|
||||||
self.isSelfClosing = parser.isSelfClosingTag(name)
|
self.isSelfClosing = parser.isSelfClosingTag(name)
|
||||||
self.name = name
|
self.name = name
|
||||||
if attrs == None:
|
if attrs is None:
|
||||||
attrs = []
|
attrs = []
|
||||||
self.attrs = attrs
|
self.attrs = attrs
|
||||||
self.contents = []
|
self.contents = []
|
||||||
|
@ -541,18 +541,18 @@ class Tag(PageElement):
|
||||||
return self._getAttrMap()[key]
|
return self._getAttrMap()[key]
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"Iterating over a tag iterates over its contents."
|
"""Iterating over a tag iterates over its contents."""
|
||||||
return iter(self.contents)
|
return iter(self.contents)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"The length of a tag is the length of its list of contents."
|
"""The length of a tag is the length of its list of contents."""
|
||||||
return len(self.contents)
|
return len(self.contents)
|
||||||
|
|
||||||
def __contains__(self, x):
|
def __contains__(self, x):
|
||||||
return x in self.contents
|
return x in self.contents
|
||||||
|
|
||||||
def __nonzero__(self):
|
def __nonzero__(self):
|
||||||
"A tag is non-None even if it has no contents."
|
"""A tag is non-None even if it has no contents."""
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def __setitem__(self, key, value):
|
def __setitem__(self, key, value):
|
||||||
|
@ -570,7 +570,7 @@ class Tag(PageElement):
|
||||||
self._getAttrMap()[key] = value
|
self._getAttrMap()[key] = value
|
||||||
|
|
||||||
def __delitem__(self, key):
|
def __delitem__(self, key):
|
||||||
"Deleting tag[key] deletes all 'key' attributes for the tag."
|
"""Deleting tag[key] deletes all 'key' attributes for the tag."""
|
||||||
for item in self.attrs:
|
for item in self.attrs:
|
||||||
if item[0] == key:
|
if item[0] == key:
|
||||||
self.attrs.remove(item)
|
self.attrs.remove(item)
|
||||||
|
@ -911,7 +911,7 @@ class SoupStrainer:
|
||||||
#print "Matching %s against %s" % (markup, matchAgainst)
|
#print "Matching %s against %s" % (markup, matchAgainst)
|
||||||
result = False
|
result = False
|
||||||
if matchAgainst == True and type(matchAgainst) == types.BooleanType:
|
if matchAgainst == True and type(matchAgainst) == types.BooleanType:
|
||||||
result = markup != None
|
result = markup is not None
|
||||||
elif callable(matchAgainst):
|
elif callable(matchAgainst):
|
||||||
result = matchAgainst(markup)
|
result = matchAgainst(markup)
|
||||||
else:
|
else:
|
||||||
|
@ -1130,7 +1130,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
||||||
# Python installations can't copy regexes. If anyone
|
# Python installations can't copy regexes. If anyone
|
||||||
# was relying on the existence of markupMassage, this
|
# was relying on the existence of markupMassage, this
|
||||||
# might cause problems.
|
# might cause problems.
|
||||||
del(self.markupMassage)
|
del self.markupMassage
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
SGMLParser.feed(self, markup)
|
SGMLParser.feed(self, markup)
|
||||||
|
@ -1253,7 +1253,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
|
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
|
||||||
isNestable = nestingResetTriggers != None
|
isNestable = nestingResetTriggers is not None
|
||||||
isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
|
isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
|
||||||
popTo = None
|
popTo = None
|
||||||
inclusive = True
|
inclusive = True
|
||||||
|
@ -1264,9 +1264,9 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
||||||
#last occurance.
|
#last occurance.
|
||||||
popTo = name
|
popTo = name
|
||||||
break
|
break
|
||||||
if (nestingResetTriggers != None
|
if (nestingResetTriggers is not None
|
||||||
and p.name in nestingResetTriggers) \
|
and p.name in nestingResetTriggers) \
|
||||||
or (nestingResetTriggers == None and isResetNesting
|
or (nestingResetTriggers is None and isResetNesting
|
||||||
and self.RESET_NESTING_TAGS.has_key(p.name)):
|
and self.RESET_NESTING_TAGS.has_key(p.name)):
|
||||||
|
|
||||||
#If we encounter one of the nesting reset triggers
|
#If we encounter one of the nesting reset triggers
|
||||||
|
@ -1342,11 +1342,11 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
||||||
self._toStringSubclass(text, ProcessingInstruction)
|
self._toStringSubclass(text, ProcessingInstruction)
|
||||||
|
|
||||||
def handle_comment(self, text):
|
def handle_comment(self, text):
|
||||||
"Handle comments as Comment objects."
|
"""Handle comments as Comment objects."""
|
||||||
self._toStringSubclass(text, Comment)
|
self._toStringSubclass(text, Comment)
|
||||||
|
|
||||||
def handle_charref(self, ref):
|
def handle_charref(self, ref):
|
||||||
"Handle character references as data."
|
"""Handle character references as data."""
|
||||||
if self.convertEntities:
|
if self.convertEntities:
|
||||||
data = unichr(int(ref))
|
data = unichr(int(ref))
|
||||||
else:
|
else:
|
||||||
|
@ -1397,7 +1397,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
||||||
self.handle_data(data)
|
self.handle_data(data)
|
||||||
|
|
||||||
def handle_decl(self, data):
|
def handle_decl(self, data):
|
||||||
"Handle DOCTYPEs and the like as Declaration objects."
|
"""Handle DOCTYPEs and the like as Declaration objects."""
|
||||||
self._toStringSubclass(data, Declaration)
|
self._toStringSubclass(data, Declaration)
|
||||||
|
|
||||||
def parse_declaration(self, i):
|
def parse_declaration(self, i):
|
||||||
|
@ -1793,8 +1793,8 @@ class UnicodeDammit:
|
||||||
return self.markup
|
return self.markup
|
||||||
|
|
||||||
def _toUnicode(self, data, encoding):
|
def _toUnicode(self, data, encoding):
|
||||||
'''Given a string and its encoding, decodes the string into Unicode.
|
"""Given a string and its encoding, decodes the string into Unicode.
|
||||||
%encoding is a string recognized by encodings.aliases'''
|
%encoding is a string recognized by encodings.aliases"""
|
||||||
|
|
||||||
# strip Byte Order Mark (if present)
|
# strip Byte Order Mark (if present)
|
||||||
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
|
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
|
||||||
|
|
|
@ -67,7 +67,7 @@ def tokenize_path(path):
|
||||||
if path[i] == '/':
|
if path[i] == '/':
|
||||||
if i > 0:
|
if i > 0:
|
||||||
separators.append((last_position, i))
|
separators.append((last_position, i))
|
||||||
if (path[i+1] == '/'):
|
if path[i+1] == '/':
|
||||||
last_position = i
|
last_position = i
|
||||||
i = i + 1
|
i = i + 1
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
parser.http.characterParser module (imdb package).
|
parser.http.characterParser module (imdb package).
|
||||||
|
|
||||||
This module provides the classes (and the instances), used to parse
|
This module provides the classes (and the instances), used to parse
|
||||||
the IMDb pages on the akas.imdb.com server about a character.
|
the IMDb pages on the www.imdb.com server about a character.
|
||||||
E.g., for "Jesse James" the referred pages would be:
|
E.g., for "Jesse James" the referred pages would be:
|
||||||
main details: http://www.imdb.com/character/ch0000001/
|
main details: http://www.imdb.com/character/ch0000001/
|
||||||
biography: http://www.imdb.com/character/ch0000001/bio
|
biography: http://www.imdb.com/character/ch0000001/bio
|
||||||
|
@ -37,7 +37,7 @@ _personIDs = re.compile(r'/name/nm([0-9]{7})')
|
||||||
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
|
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
|
||||||
"""Parser for the "filmography" page of a given character.
|
"""Parser for the "filmography" page of a given character.
|
||||||
The page should be provided as a string, as taken from
|
The page should be provided as a string, as taken from
|
||||||
the akas.imdb.com server. The final result will be a
|
the www.imdb.com server. The final result will be a
|
||||||
dictionary, with a key for every relevant section.
|
dictionary, with a key for every relevant section.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
@ -101,7 +101,7 @@ class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
|
||||||
class DOMHTMLCharacterBioParser(DOMParserBase):
|
class DOMHTMLCharacterBioParser(DOMParserBase):
|
||||||
"""Parser for the "biography" page of a given character.
|
"""Parser for the "biography" page of a given character.
|
||||||
The page should be provided as a string, as taken from
|
The page should be provided as a string, as taken from
|
||||||
the akas.imdb.com server. The final result will be a
|
the www.imdb.com server. The final result will be a
|
||||||
dictionary, with a key for every relevant section.
|
dictionary, with a key for every relevant section.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
@ -146,7 +146,7 @@ class DOMHTMLCharacterBioParser(DOMParserBase):
|
||||||
class DOMHTMLCharacterQuotesParser(DOMParserBase):
|
class DOMHTMLCharacterQuotesParser(DOMParserBase):
|
||||||
"""Parser for the "quotes" page of a given character.
|
"""Parser for the "quotes" page of a given character.
|
||||||
The page should be provided as a string, as taken from
|
The page should be provided as a string, as taken from
|
||||||
the akas.imdb.com server. The final result will be a
|
the www.imdb.com server. The final result will be a
|
||||||
dictionary, with a key for every relevant section.
|
dictionary, with a key for every relevant section.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
|
@ -2,12 +2,12 @@
|
||||||
parser.http.companyParser module (imdb package).
|
parser.http.companyParser module (imdb package).
|
||||||
|
|
||||||
This module provides the classes (and the instances), used to parse
|
This module provides the classes (and the instances), used to parse
|
||||||
the IMDb pages on the akas.imdb.com server about a company.
|
the IMDb pages on the www.imdb.com server about a company.
|
||||||
E.g., for "Columbia Pictures [us]" the referred page would be:
|
E.g., for "Columbia Pictures [us]" the referred page would be:
|
||||||
main details: http://akas.imdb.com/company/co0071509/
|
main details: http://www.imdb.com/company/co0071509/
|
||||||
|
|
||||||
Copyright 2008-2009 Davide Alberani <da@erlug.linux.it>
|
Copyright 2008-2017 Davide Alberani <da@erlug.linux.it>
|
||||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
2008-2017 H. Turgut Uyar <uyar@tekir.org>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
|
@ -34,7 +34,7 @@ from imdb.utils import analyze_company_name
|
||||||
class DOMCompanyParser(DOMParserBase):
|
class DOMCompanyParser(DOMParserBase):
|
||||||
"""Parser for the main page of a given company.
|
"""Parser for the main page of a given company.
|
||||||
The page should be provided as a string, as taken from
|
The page should be provided as a string, as taken from
|
||||||
the akas.imdb.com server. The final result will be a
|
the www.imdb.com server. The final result will be a
|
||||||
dictionary, with a key for every relevant section.
|
dictionary, with a key for every relevant section.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
@ -44,31 +44,38 @@ class DOMCompanyParser(DOMParserBase):
|
||||||
_containsObjects = True
|
_containsObjects = True
|
||||||
|
|
||||||
extractors = [
|
extractors = [
|
||||||
Extractor(label='name',
|
Extractor(
|
||||||
path="//title",
|
label='name',
|
||||||
attrs=Attribute(key='name',
|
path="//h1/span[@class='display-title ']", # note the extra trailing space in class
|
||||||
path="./text()",
|
attrs=Attribute(
|
||||||
postprocess=lambda x: \
|
key='name',
|
||||||
analyze_company_name(x, stripNotes=True))),
|
path="./text()",
|
||||||
|
postprocess=lambda x: analyze_company_name(x, stripNotes=True)
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
Extractor(label='filmography',
|
Extractor(
|
||||||
group="//b/a[@name]",
|
label='filmography',
|
||||||
group_key="./text()",
|
group="//b/a[@name]",
|
||||||
group_key_normalize=lambda x: x.lower(),
|
group_key="./text()",
|
||||||
path="../following-sibling::ol[1]/li",
|
group_key_normalize=lambda x: x.lower(),
|
||||||
attrs=Attribute(key=None,
|
path="../following-sibling::ol[1]/li",
|
||||||
multi=True,
|
attrs=Attribute(
|
||||||
path={
|
key=None,
|
||||||
'link': "./a[1]/@href",
|
multi=True,
|
||||||
'title': "./a[1]/text()",
|
path={
|
||||||
'year': "./text()[1]"
|
'link': "./a[1]/@href",
|
||||||
},
|
'title': "./a[1]/text()",
|
||||||
postprocess=lambda x:
|
'year': "./text()[1]"
|
||||||
build_movie(u'%s %s' % \
|
},
|
||||||
(x.get('title'), x.get('year').strip()),
|
postprocess=lambda x: build_movie(
|
||||||
movieID=analyze_imdbid(x.get('link') or u''),
|
'%s %s' % (x.get('title'), x.get('year').strip()),
|
||||||
_parsingCompany=True))),
|
movieID=analyze_imdbid(x.get('link') or u''),
|
||||||
]
|
_parsingCompany=True
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
preprocessors = [
|
preprocessors = [
|
||||||
(re.compile('(<b><a name=)', re.I), r'</p>\1')
|
(re.compile('(<b><a name=)', re.I), r'</p>\1')
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -2,10 +2,10 @@
|
||||||
parser.http.personParser module (imdb package).
|
parser.http.personParser module (imdb package).
|
||||||
|
|
||||||
This module provides the classes (and the instances), used to parse
|
This module provides the classes (and the instances), used to parse
|
||||||
the IMDb pages on the akas.imdb.com server about a person.
|
the IMDb pages on the www.imdb.com server about a person.
|
||||||
E.g., for "Mel Gibson" the referred pages would be:
|
E.g., for "Mel Gibson" the referred pages would be:
|
||||||
categorized: http://akas.imdb.com/name/nm0000154/maindetails
|
categorized: http://www.imdb.com/name/nm0000154/maindetails
|
||||||
biography: http://akas.imdb.com/name/nm0000154/bio
|
biography: http://www.imdb.com/name/nm0000154/bio
|
||||||
...and so on...
|
...and so on...
|
||||||
|
|
||||||
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
|
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
|
||||||
|
@ -52,7 +52,7 @@ def build_date(date):
|
||||||
class DOMHTMLMaindetailsParser(DOMParserBase):
|
class DOMHTMLMaindetailsParser(DOMParserBase):
|
||||||
"""Parser for the "categorized" (maindetails) page of a given person.
|
"""Parser for the "categorized" (maindetails) page of a given person.
|
||||||
The page should be provided as a string, as taken from
|
The page should be provided as a string, as taken from
|
||||||
the akas.imdb.com server. The final result will be a
|
the www.imdb.com server. The final result will be a
|
||||||
dictionary, with a key for every relevant section.
|
dictionary, with a key for every relevant section.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
@ -192,7 +192,7 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
|
||||||
class DOMHTMLBioParser(DOMParserBase):
|
class DOMHTMLBioParser(DOMParserBase):
|
||||||
"""Parser for the "biography" page of a given person.
|
"""Parser for the "biography" page of a given person.
|
||||||
The page should be provided as a string, as taken from
|
The page should be provided as a string, as taken from
|
||||||
the akas.imdb.com server. The final result will be a
|
the www.imdb.com server. The final result will be a
|
||||||
dictionary, with a key for every relevant section.
|
dictionary, with a key for every relevant section.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
@ -225,92 +225,157 @@ class DOMHTMLBioParser(DOMParserBase):
|
||||||
# TODO: check if this slicing is always correct
|
# TODO: check if this slicing is always correct
|
||||||
postprocess=lambda x: u''.join(x).strip()[2:])]
|
postprocess=lambda x: u''.join(x).strip()[2:])]
|
||||||
extractors = [
|
extractors = [
|
||||||
Extractor(label='headshot',
|
Extractor(
|
||||||
path="//a[@name='headshot']",
|
label='headshot',
|
||||||
attrs=Attribute(key='headshot',
|
path="//a[@name='headshot']",
|
||||||
path="./img/@src")),
|
attrs=Attribute(
|
||||||
Extractor(label='birth info',
|
key='headshot',
|
||||||
path="//table[@id='overviewTable']//td[text()='Date of Birth']/following-sibling::td[1]",
|
path="./img/@src"
|
||||||
attrs=_birth_attrs),
|
)
|
||||||
Extractor(label='death info',
|
),
|
||||||
path="//table[@id='overviewTable']//td[text()='Date of Death']/following-sibling::td[1]",
|
|
||||||
attrs=_death_attrs),
|
Extractor(
|
||||||
Extractor(label='nick names',
|
label='birth info',
|
||||||
path="//table[@id='overviewTable']//td[text()='Nickenames']/following-sibling::td[1]",
|
path="//table[@id='overviewTable']"
|
||||||
attrs=Attribute(key='nick names',
|
"//td[text()='Date of Birth']/following-sibling::td[1]",
|
||||||
path="./text()",
|
attrs=_birth_attrs
|
||||||
joiner='|',
|
),
|
||||||
postprocess=lambda x: [n.strip().replace(' (',
|
|
||||||
'::(', 1) for n in x.split('|')
|
Extractor(
|
||||||
if n.strip()])),
|
label='death info',
|
||||||
Extractor(label='birth name',
|
path="//table[@id='overviewTable']"
|
||||||
path="//table[@id='overviewTable']//td[text()='Birth Name']/following-sibling::td[1]",
|
"//td[text()='Date of Death']/following-sibling::td[1]",
|
||||||
attrs=Attribute(key='birth name',
|
attrs=_death_attrs
|
||||||
path="./text()",
|
),
|
||||||
postprocess=lambda x: canonicalName(x.strip()))),
|
|
||||||
Extractor(label='height',
|
Extractor(
|
||||||
path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
|
label='nick names',
|
||||||
attrs=Attribute(key='height',
|
path="//table[@id='overviewTable']"
|
||||||
path="./text()",
|
"//td[text()='Nickenames']/following-sibling::td[1]",
|
||||||
postprocess=lambda x: x.strip())),
|
attrs=Attribute(
|
||||||
Extractor(label='mini biography',
|
key='nick names',
|
||||||
path="//a[@name='mini_bio']/following-sibling::div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
|
path="./text()",
|
||||||
attrs=Attribute(key='mini biography',
|
joiner='|',
|
||||||
multi=True,
|
postprocess=lambda x: [n.strip().replace(' (', '::(', 1) for n in x.split('|')
|
||||||
path={
|
if n.strip()]
|
||||||
'bio': ".//text()",
|
)
|
||||||
'by': ".//a[@name='ba']//text()"
|
),
|
||||||
},
|
|
||||||
postprocess=lambda x: "%s::%s" % \
|
Extractor(
|
||||||
((x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
|
label='birth name',
|
||||||
(x.get('by') or u'').strip() or u'Anonymous'))),
|
path="//table[@id='overviewTable']"
|
||||||
Extractor(label='spouse',
|
"//td[text()='Birth Name']/following-sibling::td[1]",
|
||||||
path="//div[h5='Spouse']/table/tr",
|
attrs=Attribute(
|
||||||
attrs=Attribute(key='spouse',
|
key='birth name',
|
||||||
multi=True,
|
path="./text()",
|
||||||
path={
|
postprocess=lambda x: canonicalName(x.strip())
|
||||||
'name': "./td[1]//text()",
|
)
|
||||||
'info': "./td[2]//text()"
|
),
|
||||||
},
|
|
||||||
postprocess=lambda x: ("%s::%s" % \
|
Extractor(
|
||||||
(x.get('name').strip(),
|
label='height',
|
||||||
(x.get('info') or u'').strip())).strip(':'))),
|
path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
|
||||||
Extractor(label='trade mark',
|
attrs=Attribute(
|
||||||
path="//div[h5='Trade Mark']/p",
|
key='height',
|
||||||
attrs=Attribute(key='trade mark',
|
path="./text()",
|
||||||
multi=True,
|
postprocess=lambda x: x.strip()
|
||||||
path=".//text()",
|
)
|
||||||
postprocess=lambda x: x.strip())),
|
),
|
||||||
Extractor(label='trivia',
|
|
||||||
path="//div[h5='Trivia']/p",
|
Extractor(
|
||||||
attrs=Attribute(key='trivia',
|
label='mini biography',
|
||||||
multi=True,
|
path="//a[@name='mini_bio']/following-sibling::"
|
||||||
path=".//text()",
|
"div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
|
||||||
postprocess=lambda x: x.strip())),
|
attrs=Attribute(
|
||||||
Extractor(label='quotes',
|
key='mini biography',
|
||||||
path="//div[h5='Personal Quotes']/p",
|
multi=True,
|
||||||
attrs=Attribute(key='quotes',
|
path={
|
||||||
multi=True,
|
'bio': ".//text()",
|
||||||
path=".//text()",
|
'by': ".//a[@name='ba']//text()"
|
||||||
postprocess=lambda x: x.strip())),
|
},
|
||||||
Extractor(label='salary',
|
postprocess=lambda x: "%s::%s" % (
|
||||||
path="//div[h5='Salary']/table/tr",
|
(x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
|
||||||
attrs=Attribute(key='salary history',
|
(x.get('by') or u'').strip() or u'Anonymous'
|
||||||
multi=True,
|
)
|
||||||
path={
|
)
|
||||||
'title': "./td[1]//text()",
|
),
|
||||||
'info': "./td[2]/text()",
|
|
||||||
},
|
Extractor(
|
||||||
postprocess=lambda x: "%s::%s" % \
|
label='spouse',
|
||||||
(x.get('title').strip(),
|
path="//div[h5='Spouse']/table/tr",
|
||||||
x.get('info').strip()))),
|
attrs=Attribute(
|
||||||
Extractor(label='where now',
|
key='spouse',
|
||||||
path="//div[h5='Where Are They Now']/p",
|
multi=True,
|
||||||
attrs=Attribute(key='where now',
|
path={
|
||||||
multi=True,
|
'name': "./td[1]//text()",
|
||||||
path=".//text()",
|
'info': "./td[2]//text()"
|
||||||
postprocess=lambda x: x.strip())),
|
},
|
||||||
]
|
postprocess=lambda x: ("%s::%s" % (
|
||||||
|
x.get('name').strip(),
|
||||||
|
(x.get('info') or u'').strip())).strip(':')
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
|
Extractor(
|
||||||
|
label='trade mark',
|
||||||
|
path="//div[h5='Trade Mark']/p",
|
||||||
|
attrs=Attribute(
|
||||||
|
key='trade mark',
|
||||||
|
multi=True,
|
||||||
|
path=".//text()",
|
||||||
|
postprocess=lambda x: x.strip()
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
|
Extractor(
|
||||||
|
label='trivia',
|
||||||
|
path="//div[h5='Trivia']/p",
|
||||||
|
attrs=Attribute(
|
||||||
|
key='trivia',
|
||||||
|
multi=True,
|
||||||
|
path=".//text()",
|
||||||
|
postprocess=lambda x: x.strip()
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
|
Extractor(
|
||||||
|
label='quotes',
|
||||||
|
path="//div[h5='Personal Quotes']/p",
|
||||||
|
attrs=Attribute(
|
||||||
|
key='quotes',
|
||||||
|
multi=True,
|
||||||
|
path=".//text()",
|
||||||
|
postprocess=lambda x: x.strip()
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
|
Extractor(
|
||||||
|
label='salary',
|
||||||
|
path="//div[h5='Salary']/table/tr",
|
||||||
|
attrs=Attribute(
|
||||||
|
key='salary history',
|
||||||
|
multi=True,
|
||||||
|
path={
|
||||||
|
'title': "./td[1]//text()",
|
||||||
|
'info': "./td[2]/text()",
|
||||||
|
},
|
||||||
|
postprocess=lambda x: "%s::%s" % (
|
||||||
|
x.get('title').strip(),
|
||||||
|
x.get('info').strip())
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
|
Extractor(
|
||||||
|
label='where now',
|
||||||
|
path="//div[h5='Where Are They Now']/p",
|
||||||
|
attrs=Attribute(
|
||||||
|
key='where now',
|
||||||
|
multi=True,
|
||||||
|
path=".//text()",
|
||||||
|
postprocess=lambda x: x.strip()
|
||||||
|
)
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
preprocessors = [
|
preprocessors = [
|
||||||
(re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'),
|
(re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'),
|
||||||
|
@ -329,7 +394,7 @@ class DOMHTMLBioParser(DOMParserBase):
|
||||||
class DOMHTMLResumeParser(DOMParserBase):
|
class DOMHTMLResumeParser(DOMParserBase):
|
||||||
"""Parser for the "resume" page of a given person.
|
"""Parser for the "resume" page of a given person.
|
||||||
The page should be provided as a string, as taken from
|
The page should be provided as a string, as taken from
|
||||||
the akas.imdb.com server. The final result will be a
|
the www.imdb.com server. The final result will be a
|
||||||
dictionary, with a key for every relevant section.
|
dictionary, with a key for every relevant section.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
@ -406,13 +471,13 @@ class DOMHTMLResumeParser(DOMParserBase):
|
||||||
continue
|
continue
|
||||||
if len(data[key][0]) == 3:
|
if len(data[key][0]) == 3:
|
||||||
for item in data[key]:
|
for item in data[key]:
|
||||||
item[:] = [x for x in item if not x == None]
|
item[:] = [x for x in item if not x is None]
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if len(data[key][0]) == 2:
|
if len(data[key][0]) == 2:
|
||||||
new_key = {}
|
new_key = {}
|
||||||
for item in data[key]:
|
for item in data[key]:
|
||||||
if item[0] == None:
|
if item[0] is None:
|
||||||
continue
|
continue
|
||||||
if ':' in item[0]:
|
if ':' in item[0]:
|
||||||
if item[1].replace(item[0], '')[1:].strip() == '':
|
if item[1].replace(item[0], '')[1:].strip() == '':
|
||||||
|
@ -422,15 +487,14 @@ class DOMHTMLResumeParser(DOMParserBase):
|
||||||
new_key[item[0]] = item[1]
|
new_key[item[0]] = item[1]
|
||||||
data[key] = new_key
|
data[key] = new_key
|
||||||
|
|
||||||
new_data = {}
|
new_data = {'resume': data}
|
||||||
new_data['resume'] = data
|
|
||||||
return new_data
|
return new_data
|
||||||
|
|
||||||
|
|
||||||
class DOMHTMLOtherWorksParser(DOMParserBase):
|
class DOMHTMLOtherWorksParser(DOMParserBase):
|
||||||
"""Parser for the "other works" and "agent" pages of a given person.
|
"""Parser for the "other works" and "agent" pages of a given person.
|
||||||
The page should be provided as a string, as taken from
|
The page should be provided as a string, as taken from
|
||||||
the akas.imdb.com server. The final result will be a
|
the www.imdb.com server. The final result will be a
|
||||||
dictionary, with a key for every relevant section.
|
dictionary, with a key for every relevant section.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
@ -466,7 +530,7 @@ def _build_episode(link, title, minfo, role, roleA, roleAID):
|
||||||
minidx = minfo.find(' -')
|
minidx = minfo.find(' -')
|
||||||
# Sometimes, for some unknown reason, the role is left in minfo.
|
# Sometimes, for some unknown reason, the role is left in minfo.
|
||||||
if minidx != -1:
|
if minidx != -1:
|
||||||
slfRole = minfo[minidx+3:].lstrip()
|
slfRole = minfo[minidx + 3:].lstrip()
|
||||||
minfo = minfo[:minidx].rstrip()
|
minfo = minfo[:minidx].rstrip()
|
||||||
if slfRole.endswith(')'):
|
if slfRole.endswith(')'):
|
||||||
commidx = slfRole.rfind('(')
|
commidx = slfRole.rfind('(')
|
||||||
|
@ -504,7 +568,7 @@ def _build_episode(link, title, minfo, role, roleA, roleAID):
|
||||||
class DOMHTMLSeriesParser(DOMParserBase):
|
class DOMHTMLSeriesParser(DOMParserBase):
|
||||||
"""Parser for the "by TV series" page of a given person.
|
"""Parser for the "by TV series" page of a given person.
|
||||||
The page should be provided as a string, as taken from
|
The page should be provided as a string, as taken from
|
||||||
the akas.imdb.com server. The final result will be a
|
the www.imdb.com server. The final result will be a
|
||||||
dictionary, with a key for every relevant section.
|
dictionary, with a key for every relevant section.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
@ -559,7 +623,7 @@ class DOMHTMLSeriesParser(DOMParserBase):
|
||||||
class DOMHTMLPersonGenresParser(DOMParserBase):
|
class DOMHTMLPersonGenresParser(DOMParserBase):
|
||||||
"""Parser for the "by genre" and "by keywords" pages of a given person.
|
"""Parser for the "by genre" and "by keywords" pages of a given person.
|
||||||
The page should be provided as a string, as taken from
|
The page should be provided as a string, as taken from
|
||||||
the akas.imdb.com server. The final result will be a
|
the www.imdb.com server. The final result will be a
|
||||||
dictionary, with a key for every relevant section.
|
dictionary, with a key for every relevant section.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
|
@ -5,7 +5,7 @@ This module provides the HTMLSearchCharacterParser class (and the
|
||||||
search_character_parser instance), used to parse the results of a search
|
search_character_parser instance), used to parse the results of a search
|
||||||
for a given character.
|
for a given character.
|
||||||
E.g., when searching for the name "Jesse James", the parsed page would be:
|
E.g., when searching for the name "Jesse James", the parsed page would be:
|
||||||
http://akas.imdb.com/find?s=ch;mx=20;q=Jesse+James
|
http://www.imdb.com/find?s=ch;mx=20;q=Jesse+James
|
||||||
|
|
||||||
Copyright 2007-2012 Davide Alberani <da@erlug.linux.it>
|
Copyright 2007-2012 Davide Alberani <da@erlug.linux.it>
|
||||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||||
|
|
|
@ -5,7 +5,7 @@ This module provides the HTMLSearchCompanyParser class (and the
|
||||||
search_company_parser instance), used to parse the results of a search
|
search_company_parser instance), used to parse the results of a search
|
||||||
for a given company.
|
for a given company.
|
||||||
E.g., when searching for the name "Columbia Pictures", the parsed page would be:
|
E.g., when searching for the name "Columbia Pictures", the parsed page would be:
|
||||||
http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures
|
http://www.imdb.com/find?s=co;mx=20;q=Columbia+Pictures
|
||||||
|
|
||||||
Copyright 2008-2012 Davide Alberani <da@erlug.linux.it>
|
Copyright 2008-2012 Davide Alberani <da@erlug.linux.it>
|
||||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||||
|
@ -46,22 +46,29 @@ class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
|
||||||
_titleBuilder = lambda self, x: build_company_name(x)
|
_titleBuilder = lambda self, x: build_company_name(x)
|
||||||
_linkPrefix = '/company/co'
|
_linkPrefix = '/company/co'
|
||||||
|
|
||||||
_attrs = [Attribute(key='data',
|
_attrs = [
|
||||||
multi=True,
|
Attribute(
|
||||||
path={
|
key='data',
|
||||||
'link': "./a[1]/@href",
|
multi=True,
|
||||||
'name': "./a[1]/text()",
|
path={
|
||||||
'notes': "./text()[1]"
|
'link': "./a[1]/@href",
|
||||||
},
|
'name': "./a[1]/text()",
|
||||||
postprocess=lambda x: (
|
'notes': "./text()[1]"
|
||||||
analyze_imdbid(x.get('link')),
|
},
|
||||||
analyze_company_name(x.get('name')+(x.get('notes')
|
postprocess=lambda x: (
|
||||||
or u''), stripNotes=True)
|
analyze_imdbid(x.get('link')),
|
||||||
))]
|
analyze_company_name(x.get('name') + (x.get('notes') or u''), stripNotes=True)
|
||||||
extractors = [Extractor(label='search',
|
)
|
||||||
path="//td[@class='result_text']/a[starts-with(@href, " \
|
)
|
||||||
"'/company/co')]/..",
|
]
|
||||||
attrs=_attrs)]
|
|
||||||
|
extractors = [
|
||||||
|
Extractor(
|
||||||
|
label='search',
|
||||||
|
path="//td[@class='result_text']/a[starts-with(@href, '/company/co')]/..",
|
||||||
|
attrs=_attrs
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
_OBJECTS = {
|
_OBJECTS = {
|
||||||
|
|
|
@ -5,7 +5,7 @@ This module provides the HTMLSearchKeywordParser class (and the
|
||||||
search_company_parser instance), used to parse the results of a search
|
search_company_parser instance), used to parse the results of a search
|
||||||
for a given keyword.
|
for a given keyword.
|
||||||
E.g., when searching for the keyword "alabama", the parsed page would be:
|
E.g., when searching for the keyword "alabama", the parsed page would be:
|
||||||
http://akas.imdb.com/find?s=kw;mx=20;q=alabama
|
http://www.imdb.com/find?s=kw;mx=20;q=alabama
|
||||||
|
|
||||||
Copyright 2009 Davide Alberani <da@erlug.linux.it>
|
Copyright 2009 Davide Alberani <da@erlug.linux.it>
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ search_movie_parser instance), used to parse the results of a search
|
||||||
for a given title.
|
for a given title.
|
||||||
E.g., for when searching for the title "the passion", the parsed
|
E.g., for when searching for the title "the passion", the parsed
|
||||||
page would be:
|
page would be:
|
||||||
http://akas.imdb.com/find?q=the+passion&tt=on&mx=20
|
http://www.imdb.com/find?q=the+passion&tt=on&mx=20
|
||||||
|
|
||||||
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
|
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
|
||||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||||
|
@ -67,7 +67,7 @@ class DOMBasicMovieParser(DOMParserBase):
|
||||||
data = []
|
data = []
|
||||||
else:
|
else:
|
||||||
link = data.pop('link')
|
link = data.pop('link')
|
||||||
if (link and data):
|
if link and data:
|
||||||
data = [(link, data)]
|
data = [(link, data)]
|
||||||
else:
|
else:
|
||||||
data = []
|
data = []
|
||||||
|
|
|
@ -5,7 +5,7 @@ This module provides the HTMLSearchPersonParser class (and the
|
||||||
search_person_parser instance), used to parse the results of a search
|
search_person_parser instance), used to parse the results of a search
|
||||||
for a given person.
|
for a given person.
|
||||||
E.g., when searching for the name "Mel Gibson", the parsed page would be:
|
E.g., when searching for the name "Mel Gibson", the parsed page would be:
|
||||||
http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20
|
http://www.imdb.com/find?q=Mel+Gibson&nm=on&mx=20
|
||||||
|
|
||||||
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
|
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
|
||||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||||
|
|
|
@ -4,8 +4,8 @@ parser.http.topBottomParser module (imdb package).
|
||||||
This module provides the classes (and the instances), used to parse the
|
This module provides the classes (and the instances), used to parse the
|
||||||
lists of top 250 and bottom 100 movies.
|
lists of top 250 and bottom 100 movies.
|
||||||
E.g.:
|
E.g.:
|
||||||
http://akas.imdb.com/chart/top
|
http://www.imdb.com/chart/top
|
||||||
http://akas.imdb.com/chart/bottom
|
http://www.imdb.com/chart/bottom
|
||||||
|
|
||||||
Copyright 2009-2015 Davide Alberani <da@erlug.linux.it>
|
Copyright 2009-2015 Davide Alberani <da@erlug.linux.it>
|
||||||
|
|
||||||
|
@ -31,7 +31,7 @@ from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid
|
||||||
class DOMHTMLTop250Parser(DOMParserBase):
|
class DOMHTMLTop250Parser(DOMParserBase):
|
||||||
"""Parser for the "top 250" page.
|
"""Parser for the "top 250" page.
|
||||||
The page should be provided as a string, as taken from
|
The page should be provided as a string, as taken from
|
||||||
the akas.imdb.com server. The final result will be a
|
the www.imdb.com server. The final result will be a
|
||||||
dictionary, with a key for every relevant section.
|
dictionary, with a key for every relevant section.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
@ -42,17 +42,24 @@ class DOMHTMLTop250Parser(DOMParserBase):
|
||||||
ranktext = 'top 250 rank'
|
ranktext = 'top 250 rank'
|
||||||
|
|
||||||
def _init(self):
|
def _init(self):
|
||||||
self.extractors = [Extractor(label=self.label,
|
self.extractors = [
|
||||||
path="//div[@id='main']//div[1]//div//table//tbody//tr",
|
Extractor(
|
||||||
attrs=Attribute(key=None,
|
label=self.label,
|
||||||
multi=True,
|
path="//div[@id='main']//div[1]//div//table//tbody//tr",
|
||||||
path={self.ranktext: "./td[2]//text()",
|
attrs=Attribute(
|
||||||
'rating': "./td[3]//strong//text()",
|
key=None,
|
||||||
'title': "./td[2]//a//text()",
|
multi=True,
|
||||||
'year': "./td[2]//span//text()",
|
path={
|
||||||
'movieID': "./td[2]//a/@href",
|
self.ranktext: "./td[2]/text()",
|
||||||
'votes': "./td[3]//strong/@title"
|
'rating': "./td[3]//strong//text()",
|
||||||
}))]
|
'title': "./td[2]//a//text()",
|
||||||
|
'year': "./td[2]//span//text()",
|
||||||
|
'movieID': "./td[2]//a/@href",
|
||||||
|
'votes': "./td[3]//strong/@title"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
def postprocess_data(self, data):
|
def postprocess_data(self, data):
|
||||||
if not data or self.label not in data:
|
if not data or self.label not in data:
|
||||||
|
@ -73,9 +80,11 @@ class DOMHTMLTop250Parser(DOMParserBase):
|
||||||
if theID in seenIDs:
|
if theID in seenIDs:
|
||||||
continue
|
continue
|
||||||
seenIDs.append(theID)
|
seenIDs.append(theID)
|
||||||
minfo = analyze_title(d['title']+" "+d['year'])
|
minfo = analyze_title(d['title'] + ' ' + d['year'])
|
||||||
try: minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
|
try:
|
||||||
except: pass
|
minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
|
||||||
|
except:
|
||||||
|
pass
|
||||||
if 'votes' in d:
|
if 'votes' in d:
|
||||||
try:
|
try:
|
||||||
votes = d['votes'].replace(' votes','')
|
votes = d['votes'].replace(' votes','')
|
||||||
|
@ -93,7 +102,7 @@ class DOMHTMLTop250Parser(DOMParserBase):
|
||||||
class DOMHTMLBottom100Parser(DOMHTMLTop250Parser):
|
class DOMHTMLBottom100Parser(DOMHTMLTop250Parser):
|
||||||
"""Parser for the "bottom 100" page.
|
"""Parser for the "bottom 100" page.
|
||||||
The page should be provided as a string, as taken from
|
The page should be provided as a string, as taken from
|
||||||
the akas.imdb.com server. The final result will be a
|
the www.imdb.com server. The final result will be a
|
||||||
dictionary, with a key for every relevant section.
|
dictionary, with a key for every relevant section.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
|
@ -35,7 +35,9 @@ from imdb.Character import Character
|
||||||
|
|
||||||
|
|
||||||
# Year, imdbIndex and kind.
|
# Year, imdbIndex and kind.
|
||||||
re_yearKind_index = re.compile(r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)')
|
re_yearKind_index = re.compile(
|
||||||
|
r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)'
|
||||||
|
)
|
||||||
|
|
||||||
# Match imdb ids in href tags
|
# Match imdb ids in href tags
|
||||||
re_imdbid = re.compile(r'(title/tt|name/nm|character/ch|company/co)([0-9]+)')
|
re_imdbid = re.compile(r'(title/tt|name/nm|character/ch|company/co)([0-9]+)')
|
||||||
|
@ -304,7 +306,7 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
|
||||||
elif title[-14:] == 'TV mini-series':
|
elif title[-14:] == 'TV mini-series':
|
||||||
title = title[:-14] + ' (mini)'
|
title = title[:-14] + ' (mini)'
|
||||||
if title and title.endswith(_defSep.rstrip()):
|
if title and title.endswith(_defSep.rstrip()):
|
||||||
title = title[:-len(_defSep)+1]
|
title = title[:-len(_defSep) + 1]
|
||||||
# Try to understand where the movie title ends.
|
# Try to understand where the movie title ends.
|
||||||
while True:
|
while True:
|
||||||
if year:
|
if year:
|
||||||
|
@ -320,18 +322,17 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
|
||||||
# Try to match paired parentheses; yes: sometimes there are
|
# Try to match paired parentheses; yes: sometimes there are
|
||||||
# parentheses inside comments...
|
# parentheses inside comments...
|
||||||
nidx = title.rfind('(')
|
nidx = title.rfind('(')
|
||||||
while (nidx != -1 and \
|
while nidx != -1 and title[nidx:].count('(') != title[nidx:].count(')'):
|
||||||
title[nidx:].count('(') != title[nidx:].count(')')):
|
|
||||||
nidx = title[:nidx].rfind('(')
|
nidx = title[:nidx].rfind('(')
|
||||||
# Unbalanced parentheses: stop here.
|
# Unbalanced parentheses: stop here.
|
||||||
if nidx == -1: break
|
if nidx == -1: break
|
||||||
# The last item in parentheses seems to be a year: stop here.
|
# The last item in parentheses seems to be a year: stop here.
|
||||||
first4 = title[nidx+1:nidx+5]
|
first4 = title[nidx + 1:nidx + 5]
|
||||||
if (first4.isdigit() or first4 == '????') and \
|
if (first4.isdigit() or first4 == '????') and title[nidx + 5:nidx + 6] in (')', '/'):
|
||||||
title[nidx+5:nidx+6] in (')', '/'): break
|
break
|
||||||
# The last item in parentheses is a known kind: stop here.
|
# The last item in parentheses is a known kind: stop here.
|
||||||
if title[nidx+1:-1] in ('TV', 'V', 'mini', 'VG', 'TV movie',
|
if title[nidx + 1:-1] in ('TV', 'V', 'mini', 'VG', 'TV movie', 'TV series', 'short'):
|
||||||
'TV series', 'short'): break
|
break
|
||||||
# Else, in parentheses there are some notes.
|
# Else, in parentheses there are some notes.
|
||||||
# XXX: should the notes in the role half be kept separated
|
# XXX: should the notes in the role half be kept separated
|
||||||
# from the notes in the movie title half?
|
# from the notes in the movie title half?
|
||||||
|
@ -471,8 +472,8 @@ class DOMParserBase(object):
|
||||||
if _gotError:
|
if _gotError:
|
||||||
warnings.warn('falling back to "%s"' % mod)
|
warnings.warn('falling back to "%s"' % mod)
|
||||||
break
|
break
|
||||||
except ImportError, e:
|
except ImportError as e:
|
||||||
if idx+1 >= nrMods:
|
if idx + 1 >= nrMods:
|
||||||
# Raise the exception, if we don't have any more
|
# Raise the exception, if we don't have any more
|
||||||
# options to try.
|
# options to try.
|
||||||
raise IMDbError('unable to use any parser in %s: %s' % \
|
raise IMDbError('unable to use any parser in %s: %s' % \
|
||||||
|
@ -786,10 +787,10 @@ class Extractor(object):
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
"""String representation of an Extractor object."""
|
"""String representation of an Extractor object."""
|
||||||
r = '<Extractor id:%s (label=%s, path=%s, attrs=%s, group=%s, ' \
|
t = '<Extractor id:%s (label=%s, path=%s, attrs=%s, group=%s, group_key=%s' + \
|
||||||
'group_key=%s group_key_normalize=%s)>' % (id(self),
|
', group_key_normalize=%s)>'
|
||||||
self.label, self.path, repr(self.attrs), self.group,
|
r = t % (id(self), self.label, self.path, repr(self.attrs), self.group,
|
||||||
self.group_key, self.group_key_normalize)
|
self.group_key, self.group_key_normalize)
|
||||||
return r
|
return r
|
||||||
|
|
||||||
|
|
||||||
|
@ -825,7 +826,7 @@ def _parse_ref(text, link, info):
|
||||||
yearK = re_yearKind_index.match(info)
|
yearK = re_yearKind_index.match(info)
|
||||||
if yearK and yearK.start() == 0:
|
if yearK and yearK.start() == 0:
|
||||||
text += ' %s' % info[:yearK.end()]
|
text += ' %s' % info[:yearK.end()]
|
||||||
return (text.replace('\n', ' '), link)
|
return text.replace('\n', ' '), link
|
||||||
|
|
||||||
|
|
||||||
class GatherRefs(DOMParserBase):
|
class GatherRefs(DOMParserBase):
|
||||||
|
|
|
@ -687,7 +687,7 @@ class IMDbSqlAccessSystem(IMDbBase):
|
||||||
elif isinstance(o, dict):
|
elif isinstance(o, dict):
|
||||||
for value in o.values():
|
for value in o.values():
|
||||||
self._findRefs(value, trefs, nrefs)
|
self._findRefs(value, trefs, nrefs)
|
||||||
return (trefs, nrefs)
|
return trefs, nrefs
|
||||||
|
|
||||||
def _extractRefs(self, o):
|
def _extractRefs(self, o):
|
||||||
"""Scan for titles or names references in strings."""
|
"""Scan for titles or names references in strings."""
|
||||||
|
@ -702,7 +702,7 @@ class IMDbSqlAccessSystem(IMDbBase):
|
||||||
"imdb.parser.sql.IMDbSqlAccessSystem; "
|
"imdb.parser.sql.IMDbSqlAccessSystem; "
|
||||||
"if it's not a recursion limit exceeded and we're not "
|
"if it's not a recursion limit exceeded and we're not "
|
||||||
"running in a Symbian environment, it's a bug:\n%s" % e)
|
"running in a Symbian environment, it's a bug:\n%s" % e)
|
||||||
return (trefs, nrefs)
|
return trefs, nrefs
|
||||||
|
|
||||||
def _changeAKAencoding(self, akanotes, akatitle):
|
def _changeAKAencoding(self, akanotes, akatitle):
|
||||||
"""Return akatitle in the correct charset, as specified in
|
"""Return akatitle in the correct charset, as specified in
|
||||||
|
|
|
@ -437,11 +437,13 @@ def ISNULL(x):
|
||||||
"""Emulate SQLObject's ISNULL."""
|
"""Emulate SQLObject's ISNULL."""
|
||||||
# XXX: Should we use null()? Can null() be a global instance?
|
# XXX: Should we use null()? Can null() be a global instance?
|
||||||
# XXX: Is it safe to test None with the == operator, in this case?
|
# XXX: Is it safe to test None with the == operator, in this case?
|
||||||
return x == None
|
return x is None
|
||||||
|
|
||||||
|
|
||||||
def ISNOTNULL(x):
|
def ISNOTNULL(x):
|
||||||
"""Emulate SQLObject's ISNOTNULL."""
|
"""Emulate SQLObject's ISNOTNULL."""
|
||||||
return x != None
|
return x is not None
|
||||||
|
|
||||||
|
|
||||||
def CONTAINSSTRING(expr, pattern):
|
def CONTAINSSTRING(expr, pattern):
|
||||||
"""Emulate SQLObject's CONTAINSSTRING."""
|
"""Emulate SQLObject's CONTAINSSTRING."""
|
||||||
|
|
|
@ -122,53 +122,80 @@ class DBTable(object):
|
||||||
|
|
||||||
|
|
||||||
# Default values to insert in some tables: {'column': (list, of, values, ...)}
|
# Default values to insert in some tables: {'column': (list, of, values, ...)}
|
||||||
kindTypeDefs = {'kind': ('movie', 'tv series', 'tv movie', 'video movie',
|
kindTypeDefs = {
|
||||||
'tv mini series', 'video game', 'episode')}
|
'kind': (
|
||||||
companyTypeDefs = {'kind': ('distributors', 'production companies',
|
'movie', 'tv series', 'tv movie', 'video movie',
|
||||||
'special effects companies', 'miscellaneous companies')}
|
'tv mini series', 'video game', 'episode', 'short', 'tv short'
|
||||||
infoTypeDefs = {'info': ('runtimes', 'color info', 'genres', 'languages',
|
)
|
||||||
'certificates', 'sound mix', 'tech info', 'countries', 'taglines',
|
}
|
||||||
'keywords', 'alternate versions', 'crazy credits', 'goofs',
|
|
||||||
'soundtrack', 'quotes', 'release dates', 'trivia', 'locations',
|
companyTypeDefs = {
|
||||||
'mini biography', 'birth notes', 'birth date', 'height',
|
'kind': (
|
||||||
'death date', 'spouse', 'other works', 'birth name',
|
'distributors', 'production companies',
|
||||||
'salary history', 'nick names', 'books', 'agent address',
|
'special effects companies', 'miscellaneous companies'
|
||||||
'biographical movies', 'portrayed in', 'where now', 'trade mark',
|
)
|
||||||
'interviews', 'article', 'magazine cover photo', 'pictorial',
|
}
|
||||||
'death notes', 'LD disc format', 'LD year', 'LD digital sound',
|
|
||||||
'LD official retail price', 'LD frequency response', 'LD pressing plant',
|
infoTypeDefs = {
|
||||||
'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date',
|
'info': (
|
||||||
'LD production country', 'LD contrast', 'LD color rendition',
|
'runtimes', 'color info', 'genres', 'languages',
|
||||||
'LD picture format', 'LD video noise', 'LD video artifacts',
|
'certificates', 'sound mix', 'tech info', 'countries', 'taglines',
|
||||||
'LD release country', 'LD sharpness', 'LD dynamic range',
|
'keywords', 'alternate versions', 'crazy credits', 'goofs',
|
||||||
'LD audio noise', 'LD color information', 'LD group genre',
|
'soundtrack', 'quotes', 'release dates', 'trivia', 'locations',
|
||||||
'LD quality program', 'LD close captions-teletext-ld-g',
|
'mini biography', 'birth notes', 'birth date', 'height',
|
||||||
'LD category', 'LD analog left', 'LD certification',
|
'death date', 'spouse', 'other works', 'birth name',
|
||||||
'LD audio quality', 'LD video quality', 'LD aspect ratio',
|
'salary history', 'nick names', 'books', 'agent address',
|
||||||
'LD analog right', 'LD additional information',
|
'biographical movies', 'portrayed in', 'where now', 'trade mark',
|
||||||
'LD number of chapter stops', 'LD dialogue intellegibility',
|
'interviews', 'article', 'magazine cover photo', 'pictorial',
|
||||||
'LD disc size', 'LD master format', 'LD subtitles',
|
'death notes', 'LD disc format', 'LD year', 'LD digital sound',
|
||||||
'LD status of availablility', 'LD quality of source',
|
'LD official retail price', 'LD frequency response', 'LD pressing plant',
|
||||||
'LD number of sides', 'LD video standard', 'LD supplement',
|
'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date',
|
||||||
'LD original title', 'LD sound encoding', 'LD number', 'LD label',
|
'LD production country', 'LD contrast', 'LD color rendition',
|
||||||
'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay',
|
'LD picture format', 'LD video noise', 'LD video artifacts',
|
||||||
'novel', 'adaption', 'book', 'production process protocol',
|
'LD release country', 'LD sharpness', 'LD dynamic range',
|
||||||
'printed media reviews', 'essays', 'other literature', 'mpaa',
|
'LD audio noise', 'LD color information', 'LD group genre',
|
||||||
'plot', 'votes distribution', 'votes', 'rating',
|
'LD quality program', 'LD close captions-teletext-ld-g',
|
||||||
'production dates', 'copyright holder', 'filming dates', 'budget',
|
'LD category', 'LD analog left', 'LD certification',
|
||||||
'weekend gross', 'gross', 'opening weekend', 'rentals',
|
'LD audio quality', 'LD video quality', 'LD aspect ratio',
|
||||||
'admissions', 'studios', 'top 250 rank', 'bottom 10 rank')}
|
'LD analog right', 'LD additional information',
|
||||||
compCastTypeDefs = {'kind': ('cast', 'crew', 'complete', 'complete+verified')}
|
'LD number of chapter stops', 'LD dialogue intellegibility',
|
||||||
linkTypeDefs = {'link': ('follows', 'followed by', 'remake of', 'remade as',
|
'LD disc size', 'LD master format', 'LD subtitles',
|
||||||
'references', 'referenced in', 'spoofs', 'spoofed in',
|
'LD status of availablility', 'LD quality of source',
|
||||||
'features', 'featured in', 'spin off from', 'spin off',
|
'LD number of sides', 'LD video standard', 'LD supplement',
|
||||||
'version of', 'similar to', 'edited into',
|
'LD original title', 'LD sound encoding', 'LD number', 'LD label',
|
||||||
'edited from', 'alternate language version of',
|
'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay',
|
||||||
'unknown link')}
|
'novel', 'adaption', 'book', 'production process protocol',
|
||||||
roleTypeDefs = {'role': ('actor', 'actress', 'producer', 'writer',
|
'printed media reviews', 'essays', 'other literature', 'mpaa',
|
||||||
'cinematographer', 'composer', 'costume designer',
|
'plot', 'votes distribution', 'votes', 'rating',
|
||||||
'director', 'editor', 'miscellaneous crew',
|
'production dates', 'copyright holder', 'filming dates', 'budget',
|
||||||
'production designer', 'guest')}
|
'weekend gross', 'gross', 'opening weekend', 'rentals',
|
||||||
|
'admissions', 'studios', 'top 250 rank', 'bottom 10 rank'
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
compCastTypeDefs = {
|
||||||
|
'kind': ('cast', 'crew', 'complete', 'complete+verified')
|
||||||
|
}
|
||||||
|
|
||||||
|
linkTypeDefs = {
|
||||||
|
'link': (
|
||||||
|
'follows', 'followed by', 'remake of', 'remade as',
|
||||||
|
'references', 'referenced in', 'spoofs', 'spoofed in',
|
||||||
|
'features', 'featured in', 'spin off from', 'spin off',
|
||||||
|
'version of', 'similar to', 'edited into',
|
||||||
|
'edited from', 'alternate language version of',
|
||||||
|
'unknown link'
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
roleTypeDefs = {
|
||||||
|
'role': (
|
||||||
|
'actor', 'actress', 'producer', 'writer',
|
||||||
|
'cinematographer', 'composer', 'costume designer',
|
||||||
|
'director', 'editor', 'miscellaneous crew',
|
||||||
|
'production designer', 'guest'
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
# Schema of tables in our database.
|
# Schema of tables in our database.
|
||||||
# XXX: Foreign keys can be used to create constrains between tables,
|
# XXX: Foreign keys can be used to create constrains between tables,
|
||||||
|
@ -186,7 +213,7 @@ DB_SCHEMA = [
|
||||||
# the alternateID attribute here will be ignored by SQLAlchemy.
|
# the alternateID attribute here will be ignored by SQLAlchemy.
|
||||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||||
DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
|
DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
|
||||||
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
|
DBCol('imdbIndex', STRINGCOL, length=12, default=None),
|
||||||
DBCol('imdbID', INTCOL, default=None, index='idx_imdb_id'),
|
DBCol('imdbID', INTCOL, default=None, index='idx_imdb_id'),
|
||||||
DBCol('gender', STRINGCOL, length=1, default=None),
|
DBCol('gender', STRINGCOL, length=1, default=None),
|
||||||
DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
|
DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
|
||||||
|
@ -204,7 +231,7 @@ DB_SCHEMA = [
|
||||||
# from namePcodeNf.
|
# from namePcodeNf.
|
||||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||||
DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
|
DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
|
||||||
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
|
DBCol('imdbIndex', STRINGCOL, length=12, default=None),
|
||||||
DBCol('imdbID', INTCOL, default=None),
|
DBCol('imdbID', INTCOL, default=None),
|
||||||
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
|
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
|
||||||
index='idx_pcodenf'),
|
index='idx_pcodenf'),
|
||||||
|
@ -218,7 +245,7 @@ DB_SCHEMA = [
|
||||||
# namePcodeSf is the soundex of the name plus the country code.
|
# namePcodeSf is the soundex of the name plus the country code.
|
||||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||||
DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
|
DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
|
||||||
DBCol('countryCode', UNICODECOL, length=255, default=None),
|
DBCol('countryCode', STRINGCOL, length=255, default=None),
|
||||||
DBCol('imdbID', INTCOL, default=None),
|
DBCol('imdbID', INTCOL, default=None),
|
||||||
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
|
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
|
||||||
index='idx_pcodenf'),
|
index='idx_pcodenf'),
|
||||||
|
@ -237,7 +264,7 @@ DB_SCHEMA = [
|
||||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||||
DBCol('title', UNICODECOL, notNone=True,
|
DBCol('title', UNICODECOL, notNone=True,
|
||||||
index='idx_title', indexLen=10),
|
index='idx_title', indexLen=10),
|
||||||
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
|
DBCol('imdbIndex', STRINGCOL, length=12, default=None),
|
||||||
DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
|
DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
|
||||||
DBCol('productionYear', INTCOL, default=None),
|
DBCol('productionYear', INTCOL, default=None),
|
||||||
DBCol('imdbID', INTCOL, default=None, index="idx_imdb_id"),
|
DBCol('imdbID', INTCOL, default=None, index="idx_imdb_id"),
|
||||||
|
@ -264,7 +291,7 @@ DB_SCHEMA = [
|
||||||
DBCol('personID', INTCOL, notNone=True, index='idx_person',
|
DBCol('personID', INTCOL, notNone=True, index='idx_person',
|
||||||
foreignKey='Name'),
|
foreignKey='Name'),
|
||||||
DBCol('name', UNICODECOL, notNone=True),
|
DBCol('name', UNICODECOL, notNone=True),
|
||||||
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
|
DBCol('imdbIndex', STRINGCOL, length=12, default=None),
|
||||||
DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
|
DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
|
||||||
index='idx_pcodecf'),
|
index='idx_pcodecf'),
|
||||||
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
|
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
|
||||||
|
@ -291,7 +318,7 @@ DB_SCHEMA = [
|
||||||
DBCol('movieID', INTCOL, notNone=True, index='idx_movieid',
|
DBCol('movieID', INTCOL, notNone=True, index='idx_movieid',
|
||||||
foreignKey='Title'),
|
foreignKey='Title'),
|
||||||
DBCol('title', UNICODECOL, notNone=True),
|
DBCol('title', UNICODECOL, notNone=True),
|
||||||
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
|
DBCol('imdbIndex', STRINGCOL, length=12, default=None),
|
||||||
DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
|
DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
|
||||||
DBCol('productionYear', INTCOL, default=None),
|
DBCol('productionYear', INTCOL, default=None),
|
||||||
DBCol('phoneticCode', STRINGCOL, length=5, default=None,
|
DBCol('phoneticCode', STRINGCOL, length=5, default=None,
|
||||||
|
|
|
@ -42,8 +42,22 @@ _utils_logger = logging.getLogger('imdbpy.utils')
|
||||||
# and year of release.
|
# and year of release.
|
||||||
# XXX: probably L, C, D and M are far too much! ;-)
|
# XXX: probably L, C, D and M are far too much! ;-)
|
||||||
re_year_index = re.compile(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
|
re_year_index = re.compile(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
|
||||||
re_extended_year_index = re.compile(r'\((TV episode|TV Series|TV mini-series|TV|Video|Video Game)? ?((?:[0-9\?]{4})(?:-[0-9\?]{4})?)(?:/([IVXLCDM]+)?)?\)')
|
re_m_episode = re.compile(r'\(TV Episode\)\s+-\s+', re.I)
|
||||||
re_remove_kind = re.compile(r'\((TV episode|TV Series|TV mini-series|TV|Video|Video Game)? ?')
|
re_m_series = re.compile(r'Season\s+\d+\s+\|\s+Episode\s+\d+\s+-', re.I)
|
||||||
|
re_m_imdbIndex = re.compile(r'\(([IVXLCDM]+)\)')
|
||||||
|
re_m_kind = re.compile(
|
||||||
|
r'\((TV episode|TV Series|TV mini-series|mini|TV|Video|Video Game|VG|Short|TV Movie|TV Short|V)\)',
|
||||||
|
re.I)
|
||||||
|
|
||||||
|
KIND_MAP = {
|
||||||
|
'tv': 'tv movie',
|
||||||
|
'tv episode': 'episode',
|
||||||
|
'v': 'video movie',
|
||||||
|
'video': 'video movie',
|
||||||
|
'vg': 'video game',
|
||||||
|
'mini': 'tv mini series',
|
||||||
|
'tv mini-series': 'tv mini series'
|
||||||
|
}
|
||||||
|
|
||||||
# Match only the imdbIndex (for name strings).
|
# Match only the imdbIndex (for name strings).
|
||||||
re_index = re.compile(r'^\(([IVXLCDM]+)\)$')
|
re_index = re.compile(r'^\(([IVXLCDM]+)\)$')
|
||||||
|
@ -283,13 +297,6 @@ def _split_series_episode(title):
|
||||||
# that means this is an episode title, as returned by
|
# that means this is an episode title, as returned by
|
||||||
# the web server.
|
# the web server.
|
||||||
series_title = title[:second_quot]
|
series_title = title[:second_quot]
|
||||||
##elif episode_or_year[-1:] == '}':
|
|
||||||
## # Title of the episode, as in the plain text data files.
|
|
||||||
## begin_eps = episode_or_year.find('{')
|
|
||||||
## if begin_eps == -1: return series_title, episode_or_year
|
|
||||||
## series_title = title[:second_quot+begin_eps].rstrip()
|
|
||||||
## # episode_or_year is returned with the {...}
|
|
||||||
## episode_or_year = episode_or_year[begin_eps:]
|
|
||||||
return series_title, episode_or_year
|
return series_title, episode_or_year
|
||||||
|
|
||||||
|
|
||||||
|
@ -383,65 +390,24 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
|
||||||
# tv mini series: 5,497
|
# tv mini series: 5,497
|
||||||
# video game: 5,490
|
# video game: 5,490
|
||||||
# More up-to-date statistics: http://us.imdb.com/database_statistics
|
# More up-to-date statistics: http://us.imdb.com/database_statistics
|
||||||
if title.endswith('(TV)'):
|
epindex = re_m_episode.search(title)
|
||||||
kind = u'tv movie'
|
if epindex:
|
||||||
title = title[:-4].rstrip()
|
# It's an episode of a series.
|
||||||
elif title.endswith('(TV Movie)'):
|
kind = 'episode'
|
||||||
kind = u'tv movie'
|
series_title = title[epindex.end():]
|
||||||
title = title[:-10].rstrip()
|
series_title = re_m_series.sub('', series_title)
|
||||||
elif title.endswith('(V)'):
|
series_info = analyze_title(series_title)
|
||||||
kind = u'video movie'
|
result['episode of'] = series_info.get('title')
|
||||||
title = title[:-3].rstrip()
|
result['series year'] = series_info.get('year')
|
||||||
elif title.lower().endswith('(video)'):
|
title = title[:epindex.start()].strip()
|
||||||
kind = u'video movie'
|
else:
|
||||||
title = title[:-7].rstrip()
|
detected_kind = re_m_kind.findall(title)
|
||||||
elif title.endswith('(TV Short)'):
|
if detected_kind:
|
||||||
kind = u'tv short'
|
kind = detected_kind[-1].lower().replace('-', '')
|
||||||
title = title[:-10].rstrip()
|
kind = KIND_MAP.get(kind, kind)
|
||||||
elif title.endswith('(TV Mini-Series)'):
|
title = re_m_kind.sub('', title).strip()
|
||||||
kind = u'tv mini series'
|
|
||||||
title = title[:-16].rstrip()
|
|
||||||
elif title.endswith('(mini)'):
|
|
||||||
kind = u'tv mini series'
|
|
||||||
title = title[:-6].rstrip()
|
|
||||||
elif title.endswith('(VG)'):
|
|
||||||
kind = u'video game'
|
|
||||||
title = title[:-4].rstrip()
|
|
||||||
elif title.endswith('(Video Game)'):
|
|
||||||
kind = u'video game'
|
|
||||||
title = title[:-12].rstrip()
|
|
||||||
elif title.endswith('(TV Series)'):
|
|
||||||
epindex = title.find('(TV Episode) - ')
|
|
||||||
if epindex >= 0:
|
|
||||||
# It's an episode of a series.
|
|
||||||
kind = u'episode'
|
|
||||||
series_info = analyze_title(title[epindex + 15:])
|
|
||||||
result['episode of'] = series_info.get('title')
|
|
||||||
result['series year'] = series_info.get('year')
|
|
||||||
title = title[:epindex]
|
|
||||||
else:
|
|
||||||
kind = u'tv series'
|
|
||||||
title = title[:-11].rstrip()
|
|
||||||
# Search for the year and the optional imdbIndex (a roman number).
|
# Search for the year and the optional imdbIndex (a roman number).
|
||||||
yi = re_year_index.findall(title)
|
yi = re_year_index.findall(title)
|
||||||
if not yi:
|
|
||||||
yi = re_extended_year_index.findall(title)
|
|
||||||
if yi:
|
|
||||||
yk, yiy, yii = yi[-1]
|
|
||||||
yi = [(yiy, yii)]
|
|
||||||
if yk == 'TV episode':
|
|
||||||
kind = u'episode'
|
|
||||||
elif yk in ('TV', 'TV Movie'):
|
|
||||||
kind = u'tv movie'
|
|
||||||
elif yk == 'TV Series':
|
|
||||||
kind = u'tv series'
|
|
||||||
elif yk == 'Video':
|
|
||||||
kind = u'video movie'
|
|
||||||
elif yk in ('TV mini-series', 'TV Mini-Series'):
|
|
||||||
kind = u'tv mini series'
|
|
||||||
elif yk == 'Video Game':
|
|
||||||
kind = u'video game'
|
|
||||||
title = re_remove_kind.sub('(', title)
|
|
||||||
if yi:
|
if yi:
|
||||||
last_yi = yi[-1]
|
last_yi = yi[-1]
|
||||||
year = last_yi[0]
|
year = last_yi[0]
|
||||||
|
@ -450,7 +416,12 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
|
||||||
year = year[:-len(imdbIndex)-1]
|
year = year[:-len(imdbIndex)-1]
|
||||||
i = title.rfind('(%s)' % last_yi[0])
|
i = title.rfind('(%s)' % last_yi[0])
|
||||||
if i != -1:
|
if i != -1:
|
||||||
title = title[:i-1].rstrip()
|
title = title[:i - 1].rstrip()
|
||||||
|
if not imdbIndex:
|
||||||
|
detect_imdbIndex = re_m_imdbIndex.findall(title)
|
||||||
|
if detect_imdbIndex:
|
||||||
|
imdbIndex = detect_imdbIndex[-1]
|
||||||
|
title = re_m_imdbIndex.sub('', title).strip()
|
||||||
# This is a tv (mini) series: strip the '"' at the begin and at the end.
|
# This is a tv (mini) series: strip the '"' at the begin and at the end.
|
||||||
# XXX: strip('"') is not used for compatibility with Python 2.0.
|
# XXX: strip('"') is not used for compatibility with Python 2.0.
|
||||||
if title and title[0] == title[-1] == '"':
|
if title and title[0] == title[-1] == '"':
|
||||||
|
@ -464,8 +435,6 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
|
||||||
title = canonicalTitle(title)
|
title = canonicalTitle(title)
|
||||||
else:
|
else:
|
||||||
title = normalizeTitle(title)
|
title = normalizeTitle(title)
|
||||||
# 'kind' is one in ('movie', 'episode', 'tv series', 'tv mini series',
|
|
||||||
# 'tv movie', 'video movie', 'video game')
|
|
||||||
result['title'] = title
|
result['title'] = title
|
||||||
result['kind'] = kind or u'movie'
|
result['kind'] = kind or u'movie'
|
||||||
if year and year != '????':
|
if year and year != '????':
|
||||||
|
@ -832,7 +801,7 @@ def date_and_notes(s):
|
||||||
"""Parse (birth|death) date and notes; returns a tuple in the
|
"""Parse (birth|death) date and notes; returns a tuple in the
|
||||||
form (date, notes)."""
|
form (date, notes)."""
|
||||||
s = s.strip()
|
s = s.strip()
|
||||||
if not s: return (u'', u'')
|
if not s: return u'', u''
|
||||||
notes = u''
|
notes = u''
|
||||||
if s[0].isdigit() or s.split()[0].lower() in ('c.', 'january', 'february',
|
if s[0].isdigit() or s.split()[0].lower() in ('c.', 'january', 'february',
|
||||||
'march', 'april', 'may', 'june',
|
'march', 'april', 'may', 'june',
|
||||||
|
@ -990,7 +959,7 @@ def _tag4TON(ton, addAccessSystem=False, _containerOnly=False):
|
||||||
beginTag += extras
|
beginTag += extras
|
||||||
if ton.notes:
|
if ton.notes:
|
||||||
beginTag += u'<notes>%s</notes>' % _normalizeValue(ton.notes)
|
beginTag += u'<notes>%s</notes>' % _normalizeValue(ton.notes)
|
||||||
return (beginTag, u'</%s>' % tag)
|
return beginTag, u'</%s>' % tag
|
||||||
|
|
||||||
|
|
||||||
TAGS_TO_MODIFY = {
|
TAGS_TO_MODIFY = {
|
||||||
|
@ -1264,8 +1233,8 @@ class _Container(object):
|
||||||
self.__role = role
|
self.__role = role
|
||||||
|
|
||||||
currentRole = property(_get_currentRole, _set_currentRole,
|
currentRole = property(_get_currentRole, _set_currentRole,
|
||||||
doc="The role of a Person in a Movie" + \
|
doc="The role of a Person in a Movie"
|
||||||
" or the interpreter of a Character in a Movie.")
|
" or the interpreter of a Character in a Movie.")
|
||||||
|
|
||||||
def _init(self, **kwds): pass
|
def _init(self, **kwds): pass
|
||||||
|
|
||||||
|
@ -1478,10 +1447,10 @@ class _Container(object):
|
||||||
except RuntimeError, e:
|
except RuntimeError, e:
|
||||||
# Symbian/python 2.2 has a poor regexp implementation.
|
# Symbian/python 2.2 has a poor regexp implementation.
|
||||||
import warnings
|
import warnings
|
||||||
warnings.warn('RuntimeError in '
|
warnings.warn("RuntimeError in imdb.utils._Container.__getitem__;"
|
||||||
"imdb.utils._Container.__getitem__; if it's not "
|
" if it's not a recursion limit exceeded and we're"
|
||||||
"a recursion limit exceeded and we're not running "
|
" not running in a Symbian environment, it's a"
|
||||||
"in a Symbian environment, it's a bug:\n%s" % e)
|
" bug:\n%s" % e)
|
||||||
return rawData
|
return rawData
|
||||||
|
|
||||||
def __setitem__(self, key, item):
|
def __setitem__(self, key, item):
|
||||||
|
|
Loading…
Reference in a new issue