Mirror of https://github.com/SickGear/SickGear.git (synced 2025-01-05 17:43:37 +00:00)
Update IMDb 5.1 (r907) → 5.2.1dev20171113 (f640595).
Thanks to the backport by @MasterMind2k
This commit is contained in: parent 18c400acec, commit 78026584eb
24 changed files with 1992 additions and 1184 deletions
@@ -7,6 +7,7 @@
 * Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed
 * Change Hachoir can't support PY2 so backport their PY3 to prevent a need for system dependant external binaries like mediainfo
 * Update html5lib 0.99999999/1.0b9 (1a28d72) to 1.1-dev (e9ef538)
+* Update IMDb 5.1 (r907) to 5.2.1dev20171113 (f640595)
 
 [develop changelog]
 

@@ -23,8 +23,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
 from copy import deepcopy
 
-from imdb.utils import analyze_company_name, build_company_name, \
-    flatten, _Container, cmpCompanies
+from imdb.utils import _Container
+from imdb.utils import analyze_company_name, build_company_name, cmpCompanies, flatten
 
 
 class Company(_Container):

@@ -24,8 +24,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 from copy import deepcopy
 
 from imdb import linguistics
-from imdb.utils import analyze_title, build_title, canonicalTitle, \
-    flatten, _Container, cmpMovies
+from imdb.utils import _Container
+from imdb.utils import analyze_title, build_title, canonicalTitle, cmpMovies, flatten
 
 
 class Movie(_Container):

@@ -6,7 +6,7 @@ a person from the IMDb database.
 It can fetch data through different media (e.g.: the IMDb web pages,
 a SQL database, etc.)
 
-Copyright 2004-2016 Davide Alberani <da@erlug.linux.it>
+Copyright 2004-2018 Davide Alberani <da@erlug.linux.it>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by

@@ -25,12 +25,25 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
 __all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
            'available_access_systems']
-__version__ = VERSION = '5.1'
+__version__ = VERSION = '5.2.1dev20171113'
+
+VERSION_NOTICE = """This is the imdbpy-legacy branch of IMDbPY, and requires Python 2.
+Please notice that this version is mostly unsupported.
+
+For a version compatible with Python 3, see the master branch:
+https://github.com/alberanid/imdbpy/
+"""
+
+import sys
+
+if sys.hexversion >= 0x3000000:
+    print(VERSION_NOTICE)
+    sys.exit(1)
 
 # Import compatibility module (importing it is enough).
 import _compat
 
-import sys, os, ConfigParser, logging
+import os, ConfigParser, logging
 from types import MethodType
 
 from imdb import Movie, Person, Character, Company
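Note on the version gate added above: sys.hexversion packs the interpreter version into a single integer (major is the highest byte), so any Python 3 interpreter reports a value of at least 0x3000000 and the comparison is a cheap "running under Python 3?" test. A minimal sketch of the same idea (the exit message is illustrative, not from the commit):

    import sys

    # 3.0.0 final reports 0x30000f0, so every 3.x value is >= 0x3000000
    if sys.hexversion >= 0x3000000:
        sys.exit('this code requires Python 2')
    print('running under Python %d.%d' % sys.version_info[:2])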
@@ -38,38 +51,39 @@ import imdb._logging
 from imdb._exceptions import IMDbError, IMDbDataAccessError, IMDbParserError
 from imdb.utils import build_title, build_name, build_company_name
 
+_imdb_logger = logging.getLogger('imdbpy')
 _aux_logger = logging.getLogger('imdbpy.aux')
 
 
 # URLs of the main pages for movies, persons, characters and queries.
-imdbURL_base = 'http://akas.imdb.com/'
+imdbURL_base = 'http://www.imdb.com/'
 
 # NOTE: the urls below will be removed in a future version.
 # please use the values in the 'urls' attribute
 # of the IMDbBase subclass instance.
-# http://akas.imdb.com/title/
+# http://www.imdb.com/title/
 imdbURL_movie_base = '%stitle/' % imdbURL_base
-# http://akas.imdb.com/title/tt%s/
+# http://www.imdb.com/title/tt%s/
 imdbURL_movie_main = imdbURL_movie_base + 'tt%s/'
-# http://akas.imdb.com/name/
+# http://www.imdb.com/name/
 imdbURL_person_base = '%sname/' % imdbURL_base
-# http://akas.imdb.com/name/nm%s/
+# http://www.imdb.com/name/nm%s/
 imdbURL_person_main = imdbURL_person_base + 'nm%s/'
-# http://akas.imdb.com/character/
+# http://www.imdb.com/character/
 imdbURL_character_base = '%scharacter/' % imdbURL_base
-# http://akas.imdb.com/character/ch%s/
+# http://www.imdb.com/character/ch%s/
 imdbURL_character_main = imdbURL_character_base + 'ch%s/'
-# http://akas.imdb.com/company/
+# http://www.imdb.com/company/
 imdbURL_company_base = '%scompany/' % imdbURL_base
-# http://akas.imdb.com/company/co%s/
+# http://www.imdb.com/company/co%s/
 imdbURL_company_main = imdbURL_company_base + 'co%s/'
-# http://akas.imdb.com/keyword/%s/
+# http://www.imdb.com/keyword/%s/
 imdbURL_keyword_main = imdbURL_base + 'keyword/%s/'
-# http://akas.imdb.com/chart/top
+# http://www.imdb.com/chart/top
 imdbURL_top250 = imdbURL_base + 'chart/top'
-# http://akas.imdb.com/chart/bottom
+# http://www.imdb.com/chart/bottom
 imdbURL_bottom100 = imdbURL_base + 'chart/bottom'
-# http://akas.imdb.com/find?%s
+# http://www.imdb.com/find?%s
 imdbURL_find = imdbURL_base + 'find?%s'
 
 # Name of the configuration file.
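Every endpoint above moves from the retired akas.imdb.com alias to www.imdb.com. Each URL is derived from imdbURL_base with % formatting, so changing the base string once rewires all derived URLs. A short sketch of that derivation (variable names here are illustrative):

    base = 'http://www.imdb.com/'
    movie_main = '%stitle/' % base + 'tt%s/'  # 'http://www.imdb.com/title/tt%s/'
    print(movie_main % '0133093')             # http://www.imdb.com/title/tt0133093/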
@@ -103,7 +117,7 @@ class ConfigParserWithCase(ConfigParser.ConfigParser):
         try:
             self.read(fname)
         except (ConfigParser.MissingSectionHeaderError,
-                ConfigParser.ParsingError), e:
+                ConfigParser.ParsingError) as e:
             _aux_logger.warn('Troubles reading config file: %s' % e)
         # Stop at the first valid file.
         if self.has_section('imdbpy'):
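This hunk is the first of many in the commit that replace the Python 2-only comma form of an except clause with the `as` form, which every interpreter from 2.6 onward accepts. A minimal sketch of the modern syntax:

    try:
        int('not a number')
    except (TypeError, ValueError) as e:  # 'as' works on Python 2.6+ and 3.x
        print('caught: %s' % e)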
@@ -159,10 +173,8 @@ def IMDb(accessSystem=None, *arguments, **keywords):
             accessSystem = 'http'
         kwds.update(keywords)
         keywords = kwds
-    except Exception, e:
-        import logging
-        logging.getLogger('imdbpy').warn('Unable to read configuration' \
-                                         ' file; complete error: %s' % e)
+    except Exception as e:
+        _imdb_logger.warn('Unable to read configuration file; complete error: %s' % e)
         # It just LOOKS LIKE a bad habit: we tried to read config
         # options from some files, but something is gone horribly
         # wrong: ignore everything and pretend we were called with

@@ -177,9 +189,8 @@ def IMDb(accessSystem=None, *arguments, **keywords):
         try:
             import logging.config
             logging.config.fileConfig(os.path.expanduser(logCfg))
-        except Exception, e:
-            logging.getLogger('imdbpy').warn('unable to read logger ' \
-                                             'config: %s' % e)
+        except Exception as e:
+            _imdb_logger.warn('unable to read logger config: %s' % e)
     if accessSystem in ('httpThin', 'webThin', 'htmlThin'):
         logging.warn('httpThin was removed since IMDbPY 4.8')
         accessSystem = 'http'

@@ -244,9 +255,6 @@ class IMDbBase:
     # in the subclasses).
     accessSystem = 'UNKNOWN'
 
-    # Top-level logger for IMDbPY.
-    _imdb_logger = logging.getLogger('imdbpy')
-
     # Whether to re-raise caught exceptions or not.
     _reraise_exceptions = False
 

@@ -285,30 +293,30 @@ class IMDbBase:
             imdbURL_base = 'http://%s' % imdbURL_base
         if not imdbURL_base.endswith('/'):
             imdbURL_base = '%s/' % imdbURL_base
-        # http://akas.imdb.com/title/
-        imdbURL_movie_base='%stitle/' % imdbURL_base
-        # http://akas.imdb.com/title/tt%s/
-        imdbURL_movie_main=imdbURL_movie_base + 'tt%s/'
-        # http://akas.imdb.com/name/
-        imdbURL_person_base='%sname/' % imdbURL_base
-        # http://akas.imdb.com/name/nm%s/
-        imdbURL_person_main=imdbURL_person_base + 'nm%s/'
-        # http://akas.imdb.com/character/
-        imdbURL_character_base='%scharacter/' % imdbURL_base
-        # http://akas.imdb.com/character/ch%s/
-        imdbURL_character_main=imdbURL_character_base + 'ch%s/'
-        # http://akas.imdb.com/company/
-        imdbURL_company_base='%scompany/' % imdbURL_base
-        # http://akas.imdb.com/company/co%s/
-        imdbURL_company_main=imdbURL_company_base + 'co%s/'
-        # http://akas.imdb.com/keyword/%s/
-        imdbURL_keyword_main=imdbURL_base + 'keyword/%s/'
-        # http://akas.imdb.com/chart/top
-        imdbURL_top250=imdbURL_base + 'chart/top'
-        # http://akas.imdb.com/chart/bottom
-        imdbURL_bottom100=imdbURL_base + 'chart/bottom'
-        # http://akas.imdb.com/find?%s
-        imdbURL_find=imdbURL_base + 'find?%s'
+        # http://www.imdb.com/title/
+        imdbURL_movie_base = '%stitle/' % imdbURL_base
+        # http://www.imdb.com/title/tt%s/
+        imdbURL_movie_main = imdbURL_movie_base + 'tt%s/'
+        # http://www.imdb.com/name/
+        imdbURL_person_base = '%sname/' % imdbURL_base
+        # http://www.imdb.com/name/nm%s/
+        imdbURL_person_main = imdbURL_person_base + 'nm%s/'
+        # http://www.imdb.com/character/
+        imdbURL_character_base = '%scharacter/' % imdbURL_base
+        # http://www.imdb.com/character/ch%s/
+        imdbURL_character_main = imdbURL_character_base + 'ch%s/'
+        # http://www.imdb.com/company/
+        imdbURL_company_base = '%scompany/' % imdbURL_base
+        # http://www.imdb.com/company/co%s/
+        imdbURL_company_main = imdbURL_company_base + 'co%s/'
+        # http://www.imdb.com/keyword/%s/
+        imdbURL_keyword_main = imdbURL_base + 'keyword/%s/'
+        # http://www.imdb.com/chart/top
+        imdbURL_top250 = imdbURL_base + 'chart/top'
+        # http://www.imdb.com/chart/bottom
+        imdbURL_bottom100 = imdbURL_base + 'chart/bottom'
+        # http://www.imdb.com/find?%s
+        imdbURL_find = imdbURL_base + 'find?%s'
         self.urls = dict(
             movie_base=imdbURL_movie_base,
             movie_main=imdbURL_movie_main,
@@ -727,16 +735,15 @@ class IMDbBase:
             mopID = mop.companyID
             prefix = 'company'
         else:
-            raise IMDbError('object ' + repr(mop) + \
-                        ' is not a Movie, Person, Character or Company instance')
+            raise IMDbError('object ' + repr(mop) +
+                            ' is not a Movie, Person, Character or Company instance')
         if mopID is None:
             # XXX: enough? It's obvious that there are Characters
             # objects without characterID, so I think they should
             # just do nothing, when an i.update(character) is tried.
             if prefix == 'character':
                 return
-            raise IMDbDataAccessError( \
-                'the supplied object has null movieID, personID or companyID')
+            raise IMDbDataAccessError('supplied object has null movieID, personID or companyID')
         if mop.accessSystem == self.accessSystem:
             aSystem = self
         else:

@@ -760,21 +767,22 @@ class IMDbBase:
                 continue
             if not i:
                 continue
-            self._imdb_logger.debug('retrieving "%s" info set', i)
+            _imdb_logger.debug('retrieving "%s" info set', i)
             try:
                 method = getattr(aSystem, 'get_%s_%s' %
                                  (prefix, i.replace(' ', '_')))
             except AttributeError:
-                self._imdb_logger.error('unknown information set "%s"', i)
+                _imdb_logger.error('unknown information set "%s"', i)
                 # Keeps going.
                 method = lambda *x: {}
             try:
                 ret = method(mopID)
-            except Exception, e:
-                self._imdb_logger.critical('caught an exception retrieving ' \
-                                    'or parsing "%s" info set for mopID ' \
-                                    '"%s" (accessSystem: %s)',
-                                    i, mopID, mop.accessSystem, exc_info=True)
+            except Exception:
+                _imdb_logger.critical(
+                    'caught an exception retrieving or parsing "%s" info set'
+                    ' for mopID "%s" (accessSystem: %s)',
+                    i, mopID, mop.accessSystem, exc_info=True
+                )
                 ret = {}
                 # If requested by the user, reraise the exception.
                 if self._reraise_exceptions:
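The loop above builds the accessor name from the object prefix and the info-set name, looks it up with getattr(), and substitutes a lambda returning an empty dict when the set is unknown, so one bad info set cannot abort the rest of the update. A self-contained sketch of that dispatch (class and names invented for illustration):

    class Backend(object):
        def get_movie_plot(self, movieID):
            return {'plot': '...'}

    def fetch(backend, prefix, info_set, mopID):
        # e.g. ('movie', 'plot summary') -> 'get_movie_plot_summary'
        name = 'get_%s_%s' % (prefix, info_set.replace(' ', '_'))
        method = getattr(backend, name, lambda *x: {})  # unknown set -> {}
        return method(mopID)

    print(fetch(Backend(), 'movie', 'plot', '0133093'))  # {'plot': '...'}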
@@ -826,9 +834,7 @@ class IMDbBase:
         raise NotImplementedError('override this method')
 
     def _searchIMDb(self, kind, ton, title_kind=None):
-        """Search the IMDb akas server for the given title or name."""
-        # The Exact Primary search system has gone AWOL, so we resort
-        # to the mobile search. :-/
+        """Search the IMDb www server for the given title or name."""
         if not ton:
             return None
         ton = ton.strip('"')

@@ -935,8 +941,8 @@ class IMDbBase:
             else:
                 imdbID = aSystem.company2imdbID(build_company_name(mop))
         else:
-            raise IMDbError('object ' + repr(mop) + \
-                        ' is not a Movie, Person or Character instance')
+            raise IMDbError('object ' + repr(mop) +
+                            ' is not a Movie, Person or Character instance')
         return imdbID
 
     def get_imdbURL(self, mop):

@@ -954,8 +960,8 @@ class IMDbBase:
         elif isinstance(mop, Company.Company):
             url_firstPart = imdbURL_company_main
         else:
-            raise IMDbError('object ' + repr(mop) + \
-                        ' is not a Movie, Person, Character or Company instance')
+            raise IMDbError('object ' + repr(mop) +
+                            ' is not a Movie, Person, Character or Company instance')
         return url_firstPart % imdbID
 
     def get_special_methods(self):

@@ -32,8 +32,9 @@ LEVELS = {'debug': logging.DEBUG,
 
 imdbpyLogger = logging.getLogger('imdbpy')
 imdbpyStreamHandler = logging.StreamHandler()
-imdbpyFormatter = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]' \
-                                    ' %(pathname)s:%(lineno)d: %(message)s')
+imdbpyFormatter = logging.Formatter(
+    '%(asctime)s %(levelname)s [%(name)s] %(pathname)s:%(lineno)d: %(message)s'
+)
 imdbpyStreamHandler.setFormatter(imdbpyFormatter)
 imdbpyLogger.addHandler(imdbpyStreamHandler)
 
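The formatter string is only reflowed here, but the surrounding lines show the standard logging recipe: fetch a named logger, attach a StreamHandler, and give the handler a Formatter. A runnable sketch of the same setup:

    import logging

    logger = logging.getLogger('imdbpy')
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(name)s] %(pathname)s:%(lineno)d: %(message)s'
    ))
    logger.addHandler(handler)
    logger.warning('hello')  # prints a timestamped, located warning to stderr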
@@ -269,8 +269,8 @@ for k, v in {'lt':u'<','gt':u'>','amp':u'&','quot':u'"','apos':u'\''}.items():
     everyentcharrefs[k] = v
     everyentcharrefs['#%s' % ord(v)] = v
 everyentcharrefsget = everyentcharrefs.get
-re_everyentcharrefs = re.compile('&(%s|\#160|\#\d{1,5});' %
-                                 '|'.join(map(re.escape, everyentcharrefs)))
+re_everyentcharrefs = re.compile('&(%s|\#160|\#\d{1,5});' % '|'.join(map(re.escape,
+                                 everyentcharrefs)))
 re_everyentcharrefssub = re_everyentcharrefs.sub
 
 def _replAllXMLRef(match):

@@ -408,7 +408,7 @@ def _valueWithType(tag, tagValue):
 
 # Extra tags to get (if values were not already read from title/name).
 _titleTags = ('imdbindex', 'kind', 'year')
-_nameTags = ('imdbindex')
+_nameTags = ('imdbindex',)
 _companyTags = ('imdbindex', 'country')
 
 def parseTags(tag, _topLevel=True, _as=None, _infoset2keys=None,
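The _nameTags fix above is easy to miss: ('imdbindex') is just a parenthesized string, so iterating over it yields single characters, while the trailing comma makes a real one-element tuple. A quick demonstration:

    not_a_tuple = ('imdbindex')   # parentheses only group: this is a str
    real_tuple = ('imdbindex',)   # the comma makes the tuple

    print(type(not_a_tuple).__name__)  # str
    print(list(not_a_tuple)[:3])       # ['i', 'm', 'd']
    print(list(real_tuple))            # ['imdbindex']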
@@ -7,7 +7,7 @@ the imdb.IMDb function will return an instance of this class when
 called with the 'accessSystem' argument set to "http" or "web"
 or "html" (this is the default).
 
-Copyright 2004-2012 Davide Alberani <da@erlug.linux.it>
+Copyright 2004-2017 Davide Alberani <da@erlug.linux.it>
                2008 H. Turgut Uyar <uyar@tekir.org>
 
 This program is free software; you can redistribute it and/or modify

@@ -26,6 +26,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 """
 
 import sys
+import ssl
 import socket
 import logging
 from urllib import FancyURLopener, quote_plus

@@ -68,8 +69,8 @@ class _ModuleProxy:
         """Initialize a proxy for the given module; defaultKeys, if set,
         muste be a dictionary of values to set for instanced objects."""
         if oldParsers or fallBackToNew:
-            _aux_logger.warn('The old set of parsers was removed; falling ' \
-                             'back to the new parsers.')
+            _aux_logger.warn('The old set of parsers was removed;'
+                             ' falling back to the new parsers.')
         self.useModule = useModule
         if defaultKeys is None:
             defaultKeys = {}

@@ -142,6 +143,7 @@ class IMDbURLopener(FancyURLopener):
 
     def __init__(self, *args, **kwargs):
         self._last_url = u''
+        kwargs['context'] = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
         FancyURLopener.__init__(self, *args, **kwargs)
         # Headers to add to every request.
         # XXX: IMDb's web server doesn't like urllib-based programs,
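The added kwargs['context'] line hands the opener an SSLContext so HTTPS requests can be made; despite its name, ssl.PROTOCOL_SSLv23 negotiates the highest TLS/SSL version both peers support. A hedged sketch of the same idea (note: urllib's URLopener family only grew a context argument in Python 2.7.9, so this assumes a recent 2.7 interpreter):

    import ssl
    from urllib import FancyURLopener  # Python 2 only

    ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)  # negotiate best available version
    opener = FancyURLopener(context=ctx)       # context kwarg: Python 2.7.9+
    # handle = opener.open('https://www.imdb.com/')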
@@ -211,9 +213,9 @@ class IMDbURLopener(FancyURLopener):
         if server_encode is None and content:
             begin_h = content.find('text/html; charset=')
             if begin_h != -1:
-                end_h = content[19+begin_h:].find('"')
+                end_h = content[19 + begin_h:].find('"')
                 if end_h != -1:
-                    server_encode = content[19+begin_h:19+begin_h+end_h]
+                    server_encode = content[19 + begin_h:19 + begin_h + end_h]
         if server_encode:
             try:
                 if lookup(server_encode):

@@ -237,9 +239,10 @@ class IMDbURLopener(FancyURLopener):
         if encode is None:
             encode = 'latin_1'
             # The detection of the encoding is error prone...
-            self._logger.warn('Unable to detect the encoding of the retrieved '
-                        'page [%s]; falling back to default latin1.', encode)
-        ##print unicode(content, encode, 'replace').encode('utf8')
+            self._logger.warn('Unable to detect the encoding of the retrieved page [%s];'
+                              ' falling back to default utf8.', encode)
         if isinstance(content, unicode):
             return content
         return unicode(content, encode, 'replace')
 
     def http_error_default(self, url, fp, errcode, errmsg, headers):
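The two hunks above touch the charset handling: the raw body is scanned for a 'text/html; charset=' declaration (the 19 in the slices is the length of that marker), and when nothing usable is found the body is decoded with a fallback codec and 'replace' so undecodable bytes cannot raise. A sketch of that logic (Python 2; the helper name is invented):

    def sniff_charset(content, default='latin_1'):
        marker = 'text/html; charset='
        begin = content.find(marker)
        if begin != -1:
            start = begin + len(marker)  # same offset as 19 + begin_h
            end = content[start:].find('"')
            if end != -1:
                return content[start:start + end]
        return default

    raw = '<meta content="text/html; charset=utf-8">'
    print(sniff_charset(raw))                                # utf-8
    print(unicode('caf\xe9', sniff_charset(''), 'replace'))  # decoded via latin_1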
@@ -288,8 +291,8 @@ class IMDbHTTPAccessSystem(IMDbBase):
         self._getRefs = True
         self._mdparse = False
         if isThin:
-            self._http_logger.warn('"httpThin" access system no longer ' +
-                                   'supported; "http" used automatically', exc_info=False)
+            self._http_logger.warn('"httpThin" access system no longer supported;'
+                                   ' "http" used automatically', exc_info=False)
             self.isThin = 0
             if self.accessSystem in ('httpThin', 'webThin', 'htmlThin'):
                 self.accessSystem = 'http'

@@ -503,7 +506,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
         return self.smProxy.search_movie_parser.parse(cont, results=results)['data']
 
     def get_movie_main(self, movieID):
-        cont = self._retrieve(self.urls['movie_main'] % movieID + 'combined')
+        cont = self._retrieve(self.urls['movie_main'] % movieID + 'reference')
         return self.mProxy.movie_parser.parse(cont, mdparse=self._mdparse)
 
     def get_movie_full_credits(self, movieID):

@@ -811,7 +814,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
     def _search_keyword(self, keyword, results):
         # XXX: the IMDb web server seems to have some serious problem with
         # non-ascii keyword.
-        # E.g.: http://akas.imdb.com/keyword/fianc%E9/
+        # E.g.: http://www.imdb.com/keyword/fianc%E9/
         # will return a 500 Internal Server Error: Redirect Recursion.
         keyword = keyword.encode('utf8', 'ignore')
         try:
@@ -171,7 +171,7 @@ class PageElement:
         return self
 
     def _lastRecursiveChild(self):
-        "Finds the last element beneath this object to be parsed."
+        """Finds the last element beneath this object to be parsed."""
         lastChild = self
         while hasattr(lastChild, 'contents') and lastChild.contents:
             lastChild = lastChild.contents[-1]

@@ -184,7 +184,7 @@ class PageElement:
             newChild = NavigableString(newChild)
 
         position = min(position, len(self.contents))
-        if hasattr(newChild, 'parent') and newChild.parent != None:
+        if hasattr(newChild, 'parent') and newChild.parent is not None:
             # We're 'inserting' an element that's already one
             # of this object's children.
             if newChild.parent == self:

@@ -323,7 +323,7 @@ class PageElement:
         return r
 
     def _findAll(self, name, attrs, text, limit, generator, **kwargs):
-        "Iterates over a generator looking for things that match."
+        """Iterates over a generator looking for things that match."""
 
         if isinstance(name, SoupStrainer):
             strainer = name

@@ -415,7 +415,7 @@ class NavigableString(unicode, PageElement):
         return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
 
     def __getnewargs__(self):
-        return (NavigableString.__str__(self),)
+        return NavigableString.__str__(self),
 
     def __getattr__(self, attr):
         """text.string gives you text. This is for backwards

@@ -460,7 +460,7 @@ class Tag(PageElement):
     """Represents a found HTML tag with its attributes and contents."""
 
     def _invert(h):
-        "Cheap function to invert a hash."
+        """Cheap function to invert a hash."""
         i = {}
         for k,v in h.items():
             i[v] = k

@@ -501,14 +501,14 @@ class Tag(PageElement):
 
     def __init__(self, parser, name, attrs=None, parent=None,
                  previous=None):
-        "Basic constructor."
+        """Basic constructor."""
 
         # We don't actually store the parser object: that lets extracted
         # chunks be garbage-collected
         self.parserClass = parser.__class__
         self.isSelfClosing = parser.isSelfClosingTag(name)
         self.name = name
-        if attrs == None:
+        if attrs is None:
             attrs = []
         self.attrs = attrs
         self.contents = []

@@ -541,18 +541,18 @@ class Tag(PageElement):
         return self._getAttrMap()[key]
 
     def __iter__(self):
-        "Iterating over a tag iterates over its contents."
+        """Iterating over a tag iterates over its contents."""
         return iter(self.contents)
 
     def __len__(self):
-        "The length of a tag is the length of its list of contents."
+        """The length of a tag is the length of its list of contents."""
         return len(self.contents)
 
     def __contains__(self, x):
         return x in self.contents
 
     def __nonzero__(self):
-        "A tag is non-None even if it has no contents."
+        """A tag is non-None even if it has no contents."""
         return True
 
     def __setitem__(self, key, value):

@@ -570,7 +570,7 @@ class Tag(PageElement):
         self._getAttrMap()[key] = value
 
     def __delitem__(self, key):
-        "Deleting tag[key] deletes all 'key' attributes for the tag."
+        """Deleting tag[key] deletes all 'key' attributes for the tag."""
         for item in self.attrs:
             if item[0] == key:
                 self.attrs.remove(item)

@@ -911,7 +911,7 @@ class SoupStrainer:
         #print "Matching %s against %s" % (markup, matchAgainst)
         result = False
         if matchAgainst == True and type(matchAgainst) == types.BooleanType:
-            result = markup != None
+            result = markup is not None
         elif callable(matchAgainst):
             result = matchAgainst(markup)
         else:

@@ -1130,7 +1130,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
             # Python installations can't copy regexes. If anyone
             # was relying on the existence of markupMassage, this
             # might cause problems.
-            del(self.markupMassage)
+            del self.markupMassage
         self.reset()
 
         SGMLParser.feed(self, markup)

@@ -1253,7 +1253,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
         """
 
         nestingResetTriggers = self.NESTABLE_TAGS.get(name)
-        isNestable = nestingResetTriggers != None
+        isNestable = nestingResetTriggers is not None
         isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
         popTo = None
         inclusive = True

@@ -1264,9 +1264,9 @@ class BeautifulStoneSoup(Tag, SGMLParser):
                 #last occurance.
                 popTo = name
                 break
-            if (nestingResetTriggers != None
+            if (nestingResetTriggers is not None
                 and p.name in nestingResetTriggers) \
-                or (nestingResetTriggers == None and isResetNesting
+                or (nestingResetTriggers is None and isResetNesting
                     and self.RESET_NESTING_TAGS.has_key(p.name)):
 
                 #If we encounter one of the nesting reset triggers

@@ -1342,11 +1342,11 @@ class BeautifulStoneSoup(Tag, SGMLParser):
         self._toStringSubclass(text, ProcessingInstruction)
 
     def handle_comment(self, text):
-        "Handle comments as Comment objects."
+        """Handle comments as Comment objects."""
         self._toStringSubclass(text, Comment)
 
     def handle_charref(self, ref):
-        "Handle character references as data."
+        """Handle character references as data."""
         if self.convertEntities:
             data = unichr(int(ref))
         else:

@@ -1397,7 +1397,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
         self.handle_data(data)
 
     def handle_decl(self, data):
-        "Handle DOCTYPEs and the like as Declaration objects."
+        """Handle DOCTYPEs and the like as Declaration objects."""
         self._toStringSubclass(data, Declaration)
 
     def parse_declaration(self, i):

@@ -1793,8 +1793,8 @@ class UnicodeDammit:
         return self.markup
 
     def _toUnicode(self, data, encoding):
-        '''Given a string and its encoding, decodes the string into Unicode.
-        %encoding is a string recognized by encodings.aliases'''
+        """Given a string and its encoding, decodes the string into Unicode.
+        %encoding is a string recognized by encodings.aliases"""
 
         # strip Byte Order Mark (if present)
         if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
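The docstring change above sits at the start of UnicodeDammit._toUnicode, which strips a byte order mark before decoding; a BOM is a fixed byte signature that identifies the encoding of the data that follows. A small sketch of BOM detection with the codecs constants (helper name invented):

    import codecs

    def strip_bom(data):
        for sig, enc in ((codecs.BOM_UTF8, 'utf-8'),
                         (codecs.BOM_UTF16_BE, 'utf-16-be'),
                         (codecs.BOM_UTF16_LE, 'utf-16-le')):
            if data.startswith(sig):
                return data[len(sig):], enc
        return data, None

    print(strip_bom(codecs.BOM_UTF8 + 'hi'))  # ('hi', 'utf-8')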
@@ -67,7 +67,7 @@ def tokenize_path(path):
         if path[i] == '/':
             if i > 0:
                 separators.append((last_position, i))
-            if (path[i+1] == '/'):
+            if path[i+1] == '/':
                 last_position = i
                 i = i + 1
             else:
@@ -2,7 +2,7 @@
 parser.http.characterParser module (imdb package).
 
 This module provides the classes (and the instances), used to parse
-the IMDb pages on the akas.imdb.com server about a character.
+the IMDb pages on the www.imdb.com server about a character.
 E.g., for "Jesse James" the referred pages would be:
     main details: http://www.imdb.com/character/ch0000001/
     biography: http://www.imdb.com/character/ch0000001/bio

@@ -37,7 +37,7 @@ _personIDs = re.compile(r'/name/nm([0-9]{7})')
 class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
     """Parser for the "filmography" page of a given character.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server. The final result will be a
+    the www.imdb.com server. The final result will be a
     dictionary, with a key for every relevant section.
 
     Example:

@@ -101,7 +101,7 @@ class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
 class DOMHTMLCharacterBioParser(DOMParserBase):
     """Parser for the "biography" page of a given character.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server. The final result will be a
+    the www.imdb.com server. The final result will be a
     dictionary, with a key for every relevant section.
 
     Example:

@@ -146,7 +146,7 @@ class DOMHTMLCharacterBioParser(DOMParserBase):
 class DOMHTMLCharacterQuotesParser(DOMParserBase):
     """Parser for the "quotes" page of a given character.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server. The final result will be a
+    the www.imdb.com server. The final result will be a
     dictionary, with a key for every relevant section.
 
     Example:
@@ -2,12 +2,12 @@
 parser.http.companyParser module (imdb package).
 
 This module provides the classes (and the instances), used to parse
-the IMDb pages on the akas.imdb.com server about a company.
+the IMDb pages on the www.imdb.com server about a company.
 E.g., for "Columbia Pictures [us]" the referred page would be:
-    main details: http://akas.imdb.com/company/co0071509/
+    main details: http://www.imdb.com/company/co0071509/
 
-Copyright 2008-2009 Davide Alberani <da@erlug.linux.it>
-               2008 H. Turgut Uyar <uyar@tekir.org>
+Copyright 2008-2017 Davide Alberani <da@erlug.linux.it>
+          2008-2017 H. Turgut Uyar <uyar@tekir.org>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by

@@ -34,7 +34,7 @@ from imdb.utils import analyze_company_name
 class DOMCompanyParser(DOMParserBase):
     """Parser for the main page of a given company.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server. The final result will be a
+    the www.imdb.com server. The final result will be a
     dictionary, with a key for every relevant section.
 
     Example:

@@ -44,31 +44,38 @@ class DOMCompanyParser(DOMParserBase):
     _containsObjects = True
 
     extractors = [
-        Extractor(label='name',
-                  path="//title",
-                  attrs=Attribute(key='name',
-                                  path="./text()",
-                                  postprocess=lambda x: \
-                                  analyze_company_name(x, stripNotes=True))),
+        Extractor(
+            label='name',
+            path="//h1/span[@class='display-title ']",  # note the extra trailing space in class
+            attrs=Attribute(
+                key='name',
+                path="./text()",
+                postprocess=lambda x: analyze_company_name(x, stripNotes=True)
+            )
+        ),
 
-        Extractor(label='filmography',
-                  group="//b/a[@name]",
-                  group_key="./text()",
-                  group_key_normalize=lambda x: x.lower(),
-                  path="../following-sibling::ol[1]/li",
-                  attrs=Attribute(key=None,
-                                  multi=True,
-                                  path={
-                                      'link': "./a[1]/@href",
-                                      'title': "./a[1]/text()",
-                                      'year': "./text()[1]"
-                                  },
-                                  postprocess=lambda x:
-                                  build_movie(u'%s %s' % \
-                                  (x.get('title'), x.get('year').strip()),
-                                  movieID=analyze_imdbid(x.get('link') or u''),
-                                  _parsingCompany=True))),
-    ]
+        Extractor(
+            label='filmography',
+            group="//b/a[@name]",
+            group_key="./text()",
+            group_key_normalize=lambda x: x.lower(),
+            path="../following-sibling::ol[1]/li",
+            attrs=Attribute(
+                key=None,
+                multi=True,
+                path={
+                    'link': "./a[1]/@href",
+                    'title': "./a[1]/text()",
+                    'year': "./text()[1]"
+                },
+                postprocess=lambda x: build_movie(
+                    '%s %s' % (x.get('title'), x.get('year').strip()),
+                    movieID=analyze_imdbid(x.get('link') or u''),
+                    _parsingCompany=True
+                )
+            )
+        )
+    ]
 
     preprocessors = [
         (re.compile('(<b><a name=)', re.I), r'</p>\1')
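The restructured 'name' extractor now reads the company name from an h1 span whose class attribute is literally 'display-title ' (with a trailing space) instead of the page title. For a sense of what such an XPath matches, here is a hedged lxml sketch; the markup is invented:

    from lxml import html

    page = html.fromstring(
        '<html><body><h1>'
        '<span class="display-title ">Columbia Pictures [us]</span>'
        '</h1></body></html>'
    )
    # @class must match exactly, including the trailing space
    print(page.xpath("//h1/span[@class='display-title ']/text()"))
    # ['Columbia Pictures [us]']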
File diff suppressed because it is too large
@@ -2,10 +2,10 @@
 parser.http.personParser module (imdb package).
 
 This module provides the classes (and the instances), used to parse
-the IMDb pages on the akas.imdb.com server about a person.
+the IMDb pages on the www.imdb.com server about a person.
 E.g., for "Mel Gibson" the referred pages would be:
-    categorized: http://akas.imdb.com/name/nm0000154/maindetails
-    biography: http://akas.imdb.com/name/nm0000154/bio
+    categorized: http://www.imdb.com/name/nm0000154/maindetails
+    biography: http://www.imdb.com/name/nm0000154/bio
 ...and so on...
 
 Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>

@@ -52,7 +52,7 @@ def build_date(date):
 class DOMHTMLMaindetailsParser(DOMParserBase):
     """Parser for the "categorized" (maindetails) page of a given person.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server. The final result will be a
+    the www.imdb.com server. The final result will be a
     dictionary, with a key for every relevant section.
 
     Example:

@@ -192,7 +192,7 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
 class DOMHTMLBioParser(DOMParserBase):
     """Parser for the "biography" page of a given person.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server. The final result will be a
+    the www.imdb.com server. The final result will be a
     dictionary, with a key for every relevant section.
 
     Example:

@@ -225,92 +225,157 @@ class DOMHTMLBioParser(DOMParserBase):
             # TODO: check if this slicing is always correct
             postprocess=lambda x: u''.join(x).strip()[2:])]
     extractors = [
-        Extractor(label='headshot',
-                  path="//a[@name='headshot']",
-                  attrs=Attribute(key='headshot',
-                                  path="./img/@src")),
-        Extractor(label='birth info',
-                  path="//table[@id='overviewTable']//td[text()='Date of Birth']/following-sibling::td[1]",
-                  attrs=_birth_attrs),
-        Extractor(label='death info',
-                  path="//table[@id='overviewTable']//td[text()='Date of Death']/following-sibling::td[1]",
-                  attrs=_death_attrs),
-        Extractor(label='nick names',
-                  path="//table[@id='overviewTable']//td[text()='Nickenames']/following-sibling::td[1]",
-                  attrs=Attribute(key='nick names',
-                                  path="./text()",
-                                  joiner='|',
-                                  postprocess=lambda x: [n.strip().replace(' (',
-                                  '::(', 1) for n in x.split('|')
-                                  if n.strip()])),
-        Extractor(label='birth name',
-                  path="//table[@id='overviewTable']//td[text()='Birth Name']/following-sibling::td[1]",
-                  attrs=Attribute(key='birth name',
-                                  path="./text()",
-                                  postprocess=lambda x: canonicalName(x.strip()))),
-        Extractor(label='height',
-                  path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
-                  attrs=Attribute(key='height',
-                                  path="./text()",
-                                  postprocess=lambda x: x.strip())),
-        Extractor(label='mini biography',
-                  path="//a[@name='mini_bio']/following-sibling::div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
-                  attrs=Attribute(key='mini biography',
-                                  multi=True,
-                                  path={
-                                      'bio': ".//text()",
-                                      'by': ".//a[@name='ba']//text()"
-                                  },
-                                  postprocess=lambda x: "%s::%s" % \
-                                  ((x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
-                                  (x.get('by') or u'').strip() or u'Anonymous'))),
-        Extractor(label='spouse',
-                  path="//div[h5='Spouse']/table/tr",
-                  attrs=Attribute(key='spouse',
-                                  multi=True,
-                                  path={
-                                      'name': "./td[1]//text()",
-                                      'info': "./td[2]//text()"
-                                  },
-                                  postprocess=lambda x: ("%s::%s" % \
-                                  (x.get('name').strip(),
-                                  (x.get('info') or u'').strip())).strip(':'))),
-        Extractor(label='trade mark',
-                  path="//div[h5='Trade Mark']/p",
-                  attrs=Attribute(key='trade mark',
-                                  multi=True,
-                                  path=".//text()",
-                                  postprocess=lambda x: x.strip())),
-        Extractor(label='trivia',
-                  path="//div[h5='Trivia']/p",
-                  attrs=Attribute(key='trivia',
-                                  multi=True,
-                                  path=".//text()",
-                                  postprocess=lambda x: x.strip())),
-        Extractor(label='quotes',
-                  path="//div[h5='Personal Quotes']/p",
-                  attrs=Attribute(key='quotes',
-                                  multi=True,
-                                  path=".//text()",
-                                  postprocess=lambda x: x.strip())),
-        Extractor(label='salary',
-                  path="//div[h5='Salary']/table/tr",
-                  attrs=Attribute(key='salary history',
-                                  multi=True,
-                                  path={
-                                      'title': "./td[1]//text()",
-                                      'info': "./td[2]/text()",
-                                  },
-                                  postprocess=lambda x: "%s::%s" % \
-                                  (x.get('title').strip(),
-                                  x.get('info').strip()))),
-        Extractor(label='where now',
-                  path="//div[h5='Where Are They Now']/p",
-                  attrs=Attribute(key='where now',
-                                  multi=True,
-                                  path=".//text()",
-                                  postprocess=lambda x: x.strip())),
-    ]
+        Extractor(
+            label='headshot',
+            path="//a[@name='headshot']",
+            attrs=Attribute(
+                key='headshot',
+                path="./img/@src"
+            )
+        ),
+
+        Extractor(
+            label='birth info',
+            path="//table[@id='overviewTable']"
+                 "//td[text()='Date of Birth']/following-sibling::td[1]",
+            attrs=_birth_attrs
+        ),
+
+        Extractor(
+            label='death info',
+            path="//table[@id='overviewTable']"
+                 "//td[text()='Date of Death']/following-sibling::td[1]",
+            attrs=_death_attrs
+        ),
+
+        Extractor(
+            label='nick names',
+            path="//table[@id='overviewTable']"
+                 "//td[text()='Nickenames']/following-sibling::td[1]",
+            attrs=Attribute(
+                key='nick names',
+                path="./text()",
+                joiner='|',
+                postprocess=lambda x: [n.strip().replace(' (', '::(', 1) for n in x.split('|')
+                                       if n.strip()]
+            )
+        ),
+
+        Extractor(
+            label='birth name',
+            path="//table[@id='overviewTable']"
+                 "//td[text()='Birth Name']/following-sibling::td[1]",
+            attrs=Attribute(
+                key='birth name',
+                path="./text()",
+                postprocess=lambda x: canonicalName(x.strip())
+            )
+        ),
+
+        Extractor(
+            label='height',
+            path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
+            attrs=Attribute(
+                key='height',
+                path="./text()",
+                postprocess=lambda x: x.strip()
+            )
+        ),
+
+        Extractor(
+            label='mini biography',
+            path="//a[@name='mini_bio']/following-sibling::"
+                 "div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
+            attrs=Attribute(
+                key='mini biography',
+                multi=True,
+                path={
+                    'bio': ".//text()",
+                    'by': ".//a[@name='ba']//text()"
+                },
+                postprocess=lambda x: "%s::%s" % (
+                    (x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
+                    (x.get('by') or u'').strip() or u'Anonymous'
+                )
+            )
+        ),
+
+        Extractor(
+            label='spouse',
+            path="//div[h5='Spouse']/table/tr",
+            attrs=Attribute(
+                key='spouse',
+                multi=True,
+                path={
+                    'name': "./td[1]//text()",
+                    'info': "./td[2]//text()"
+                },
+                postprocess=lambda x: ("%s::%s" % (
+                    x.get('name').strip(),
+                    (x.get('info') or u'').strip())).strip(':')
+            )
+        ),
+
+        Extractor(
+            label='trade mark',
+            path="//div[h5='Trade Mark']/p",
+            attrs=Attribute(
+                key='trade mark',
+                multi=True,
+                path=".//text()",
+                postprocess=lambda x: x.strip()
+            )
+        ),
+
+        Extractor(
+            label='trivia',
+            path="//div[h5='Trivia']/p",
+            attrs=Attribute(
+                key='trivia',
+                multi=True,
+                path=".//text()",
+                postprocess=lambda x: x.strip()
+            )
+        ),
+
+        Extractor(
+            label='quotes',
+            path="//div[h5='Personal Quotes']/p",
+            attrs=Attribute(
+                key='quotes',
+                multi=True,
+                path=".//text()",
+                postprocess=lambda x: x.strip()
+            )
+        ),
+
+        Extractor(
+            label='salary',
+            path="//div[h5='Salary']/table/tr",
+            attrs=Attribute(
+                key='salary history',
+                multi=True,
+                path={
+                    'title': "./td[1]//text()",
+                    'info': "./td[2]/text()",
+                },
+                postprocess=lambda x: "%s::%s" % (
+                    x.get('title').strip(),
+                    x.get('info').strip())
+            )
+        ),
+
+        Extractor(
+            label='where now',
+            path="//div[h5='Where Are They Now']/p",
+            attrs=Attribute(
+                key='where now',
+                multi=True,
+                path=".//text()",
+                postprocess=lambda x: x.strip()
+            )
+        )
+    ]
 
     preprocessors = [
         (re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'),

@@ -329,7 +394,7 @@ class DOMHTMLBioParser(DOMParserBase):
 class DOMHTMLResumeParser(DOMParserBase):
     """Parser for the "resume" page of a given person.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server. The final result will be a
+    the www.imdb.com server. The final result will be a
     dictionary, with a key for every relevant section.
 
     Example:

@@ -406,13 +471,13 @@ class DOMHTMLResumeParser(DOMParserBase):
                 continue
             if len(data[key][0]) == 3:
                 for item in data[key]:
-                    item[:] = [x for x in item if not x == None]
+                    item[:] = [x for x in item if not x is None]
                 continue
 
             if len(data[key][0]) == 2:
                 new_key = {}
                 for item in data[key]:
-                    if item[0] == None:
+                    if item[0] is None:
                         continue
                     if ':' in item[0]:
                         if item[1].replace(item[0], '')[1:].strip() == '':

@@ -422,15 +487,14 @@ class DOMHTMLResumeParser(DOMParserBase):
                     new_key[item[0]] = item[1]
                 data[key] = new_key
 
-        new_data = {}
-        new_data['resume'] = data
+        new_data = {'resume': data}
         return new_data
 
 
 class DOMHTMLOtherWorksParser(DOMParserBase):
     """Parser for the "other works" and "agent" pages of a given person.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server. The final result will be a
+    the www.imdb.com server. The final result will be a
     dictionary, with a key for every relevant section.
 
     Example:

@@ -466,7 +530,7 @@ def _build_episode(link, title, minfo, role, roleA, roleAID):
     minidx = minfo.find(' -')
     # Sometimes, for some unknown reason, the role is left in minfo.
     if minidx != -1:
-        slfRole = minfo[minidx+3:].lstrip()
+        slfRole = minfo[minidx + 3:].lstrip()
         minfo = minfo[:minidx].rstrip()
     if slfRole.endswith(')'):
         commidx = slfRole.rfind('(')

@@ -504,7 +568,7 @@ def _build_episode(link, title, minfo, role, roleA, roleAID):
 class DOMHTMLSeriesParser(DOMParserBase):
     """Parser for the "by TV series" page of a given person.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server. The final result will be a
+    the www.imdb.com server. The final result will be a
     dictionary, with a key for every relevant section.
 
     Example:

@@ -559,7 +623,7 @@ class DOMHTMLSeriesParser(DOMParserBase):
 class DOMHTMLPersonGenresParser(DOMParserBase):
     """Parser for the "by genre" and "by keywords" pages of a given person.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server. The final result will be a
+    the www.imdb.com server. The final result will be a
     dictionary, with a key for every relevant section.
 
     Example:
@@ -5,7 +5,7 @@ This module provides the HTMLSearchCharacterParser class (and the
 search_character_parser instance), used to parse the results of a search
 for a given character.
 E.g., when searching for the name "Jesse James", the parsed page would be:
-    http://akas.imdb.com/find?s=ch;mx=20;q=Jesse+James
+    http://www.imdb.com/find?s=ch;mx=20;q=Jesse+James
 
 Copyright 2007-2012 Davide Alberani <da@erlug.linux.it>
                2008 H. Turgut Uyar <uyar@tekir.org>

@@ -5,7 +5,7 @@ This module provides the HTMLSearchCompanyParser class (and the
 search_company_parser instance), used to parse the results of a search
 for a given company.
 E.g., when searching for the name "Columbia Pictures", the parsed page would be:
-    http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures
+    http://www.imdb.com/find?s=co;mx=20;q=Columbia+Pictures
 
 Copyright 2008-2012 Davide Alberani <da@erlug.linux.it>
                2008 H. Turgut Uyar <uyar@tekir.org>

@@ -46,22 +46,29 @@ class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
     _titleBuilder = lambda self, x: build_company_name(x)
     _linkPrefix = '/company/co'
 
-    _attrs = [Attribute(key='data',
-                        multi=True,
-                        path={
-                            'link': "./a[1]/@href",
-                            'name': "./a[1]/text()",
-                            'notes': "./text()[1]"
-                        },
-                        postprocess=lambda x: (
-                            analyze_imdbid(x.get('link')),
-                            analyze_company_name(x.get('name')+(x.get('notes')
-                                                 or u''), stripNotes=True)
-                        ))]
-    extractors = [Extractor(label='search',
-                            path="//td[@class='result_text']/a[starts-with(@href, " \
-                                 "'/company/co')]/..",
-                            attrs=_attrs)]
+    _attrs = [
+        Attribute(
+            key='data',
+            multi=True,
+            path={
+                'link': "./a[1]/@href",
+                'name': "./a[1]/text()",
+                'notes': "./text()[1]"
+            },
+            postprocess=lambda x: (
+                analyze_imdbid(x.get('link')),
+                analyze_company_name(x.get('name') + (x.get('notes') or u''), stripNotes=True)
+            )
+        )
+    ]
+
+    extractors = [
+        Extractor(
+            label='search',
+            path="//td[@class='result_text']/a[starts-with(@href, '/company/co')]/..",
+            attrs=_attrs
+        )
+    ]
 
 
 _OBJECTS = {
@@ -5,7 +5,7 @@ This module provides the HTMLSearchKeywordParser class (and the
 search_company_parser instance), used to parse the results of a search
 for a given keyword.
 E.g., when searching for the keyword "alabama", the parsed page would be:
-    http://akas.imdb.com/find?s=kw;mx=20;q=alabama
+    http://www.imdb.com/find?s=kw;mx=20;q=alabama
 
 Copyright 2009 Davide Alberani <da@erlug.linux.it>
 

@@ -6,7 +6,7 @@ search_movie_parser instance), used to parse the results of a search
 for a given title.
 E.g., for when searching for the title "the passion", the parsed
 page would be:
-    http://akas.imdb.com/find?q=the+passion&tt=on&mx=20
+    http://www.imdb.com/find?q=the+passion&tt=on&mx=20
 
 Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
                2008 H. Turgut Uyar <uyar@tekir.org>

@@ -67,7 +67,7 @@ class DOMBasicMovieParser(DOMParserBase):
             data = []
         else:
             link = data.pop('link')
-            if (link and data):
+            if link and data:
                 data = [(link, data)]
             else:
                 data = []
@@ -5,7 +5,7 @@ This module provides the HTMLSearchPersonParser class (and the
 search_person_parser instance), used to parse the results of a search
 for a given person.
 E.g., when searching for the name "Mel Gibson", the parsed page would be:
-    http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20
+    http://www.imdb.com/find?q=Mel+Gibson&nm=on&mx=20
 
 Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
                2008 H. Turgut Uyar <uyar@tekir.org>
@@ -4,8 +4,8 @@ parser.http.topBottomParser module (imdb package).
 This module provides the classes (and the instances), used to parse the
 lists of top 250 and bottom 100 movies.
 E.g.:
-    http://akas.imdb.com/chart/top
-    http://akas.imdb.com/chart/bottom
+    http://www.imdb.com/chart/top
+    http://www.imdb.com/chart/bottom
 
 Copyright 2009-2015 Davide Alberani <da@erlug.linux.it>
 

@@ -31,7 +31,7 @@ from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid
 class DOMHTMLTop250Parser(DOMParserBase):
     """Parser for the "top 250" page.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server. The final result will be a
+    the www.imdb.com server. The final result will be a
     dictionary, with a key for every relevant section.
 
     Example:

@@ -42,17 +42,24 @@ class DOMHTMLTop250Parser(DOMParserBase):
     ranktext = 'top 250 rank'
 
     def _init(self):
-        self.extractors = [Extractor(label=self.label,
-                           path="//div[@id='main']//div[1]//div//table//tbody//tr",
-                           attrs=Attribute(key=None,
-                                           multi=True,
-                                           path={self.ranktext: "./td[2]//text()",
-                                                 'rating': "./td[3]//strong//text()",
-                                                 'title': "./td[2]//a//text()",
-                                                 'year': "./td[2]//span//text()",
-                                                 'movieID': "./td[2]//a/@href",
-                                                 'votes': "./td[3]//strong/@title"
-                                                 }))]
+        self.extractors = [
+            Extractor(
+                label=self.label,
+                path="//div[@id='main']//div[1]//div//table//tbody//tr",
+                attrs=Attribute(
+                    key=None,
+                    multi=True,
+                    path={
+                        self.ranktext: "./td[2]/text()",
+                        'rating': "./td[3]//strong//text()",
+                        'title': "./td[2]//a//text()",
+                        'year': "./td[2]//span//text()",
+                        'movieID': "./td[2]//a/@href",
+                        'votes': "./td[3]//strong/@title"
+                    }
+                )
+            )
+        ]
 
     def postprocess_data(self, data):
         if not data or self.label not in data:

@@ -73,9 +80,11 @@ class DOMHTMLTop250Parser(DOMParserBase):
             if theID in seenIDs:
                 continue
             seenIDs.append(theID)
-            minfo = analyze_title(d['title']+" "+d['year'])
-            try: minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
-            except: pass
+            minfo = analyze_title(d['title'] + ' ' + d['year'])
+            try:
+                minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
+            except:
+                pass
             if 'votes' in d:
                 try:
                     votes = d['votes'].replace(' votes','')
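The postprocessing rewrite above also splits the one-line try/except; the parse itself turns rank strings such as u'250.' into integers by dropping the dot. Isolated sketch of that conversion (function name invented):

    def parse_rank(text):
        try:
            return int(text.replace('.', ''))  # u'250.' -> 250
        except (ValueError, AttributeError):
            return None

    print(parse_rank(u'250.'))  # 250
    print(parse_rank(u'n/a'))   # None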
@@ -93,7 +102,7 @@ class DOMHTMLTop250Parser(DOMParserBase):
 class DOMHTMLBottom100Parser(DOMHTMLTop250Parser):
     """Parser for the "bottom 100" page.
     The page should be provided as a string, as taken from
-    the akas.imdb.com server. The final result will be a
+    the www.imdb.com server. The final result will be a
     dictionary, with a key for every relevant section.
 
     Example:

@@ -35,7 +35,9 @@ from imdb.Character import Character
 
 
 # Year, imdbIndex and kind.
-re_yearKind_index = re.compile(r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)')
+re_yearKind_index = re.compile(
+    r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)'
+)
 
 # Match imdb ids in href tags
 re_imdbid = re.compile(r'(title/tt|name/nm|character/ch|company/co)([0-9]+)')
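The reflowed re_yearKind_index pattern matches a parenthesized year (digits or '?'), an optional roman-numeral index, and an optional kind marker. What it captures, demonstrated:

    import re

    re_yearKind_index = re.compile(
        r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)'
    )

    for s in ('(1999)', '(1999/II) (TV)', '(????) (VG)'):
        print(re_yearKind_index.match(s).group(1))
    # (1999)
    # (1999/II) (TV)
    # (????) (VG)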
@@ -304,7 +306,7 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
     elif title[-14:] == 'TV mini-series':
         title = title[:-14] + ' (mini)'
     if title and title.endswith(_defSep.rstrip()):
-        title = title[:-len(_defSep)+1]
+        title = title[:-len(_defSep) + 1]
     # Try to understand where the movie title ends.
     while True:
         if year:

@@ -320,18 +322,17 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
         # Try to match paired parentheses; yes: sometimes there are
         # parentheses inside comments...
         nidx = title.rfind('(')
-        while (nidx != -1 and \
-                    title[nidx:].count('(') != title[nidx:].count(')')):
+        while nidx != -1 and title[nidx:].count('(') != title[nidx:].count(')'):
             nidx = title[:nidx].rfind('(')
         # Unbalanced parentheses: stop here.
         if nidx == -1: break
         # The last item in parentheses seems to be a year: stop here.
-        first4 = title[nidx+1:nidx+5]
-        if (first4.isdigit() or first4 == '????') and \
-                title[nidx+5:nidx+6] in (')', '/'): break
+        first4 = title[nidx + 1:nidx + 5]
+        if (first4.isdigit() or first4 == '????') and title[nidx + 5:nidx + 6] in (')', '/'):
+            break
         # The last item in parentheses is a known kind: stop here.
-        if title[nidx+1:-1] in ('TV', 'V', 'mini', 'VG', 'TV movie',
-                                'TV series', 'short'): break
+        if title[nidx + 1:-1] in ('TV', 'V', 'mini', 'VG', 'TV movie', 'TV series', 'short'):
+            break
         # Else, in parentheses there are some notes.
         # XXX: should the notes in the role half be kept separated
         # from the notes in the movie title half?
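The while loop that was reflowed above walks backwards from the last '(' until the tail of the string has balanced parentheses, so a note containing nested parentheses is skipped as one unit. Standalone sketch of that scan (function name invented):

    def last_balanced_paren(title):
        # index of the '(' opening the last balanced group, or -1
        nidx = title.rfind('(')
        while nidx != -1 and title[nidx:].count('(') != title[nidx:].count(')'):
            nidx = title[:nidx].rfind('(')
        return nidx

    t = 'An Example ((VG))'
    print(t[last_balanced_paren(t):])  # ((VG)) - the scan steps past the inner '('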
@@ -471,8 +472,8 @@ class DOMParserBase(object):
                 if _gotError:
                     warnings.warn('falling back to "%s"' % mod)
                 break
-            except ImportError, e:
-                if idx+1 >= nrMods:
+            except ImportError as e:
+                if idx + 1 >= nrMods:
                     # Raise the exception, if we don't have any more
                     # options to try.
                     raise IMDbError('unable to use any parser in %s: %s' % \

@@ -786,10 +787,10 @@ class Extractor(object):
 
     def __repr__(self):
         """String representation of an Extractor object."""
-        r = '<Extractor id:%s (label=%s, path=%s, attrs=%s, group=%s, ' \
-            'group_key=%s group_key_normalize=%s)>' % (id(self),
-            self.label, self.path, repr(self.attrs), self.group,
-            self.group_key, self.group_key_normalize)
+        t = '<Extractor id:%s (label=%s, path=%s, attrs=%s, group=%s, group_key=%s' + \
+            ', group_key_normalize=%s)>'
+        r = t % (id(self), self.label, self.path, repr(self.attrs), self.group,
+                 self.group_key, self.group_key_normalize)
         return r
 
 

@@ -825,7 +826,7 @@ def _parse_ref(text, link, info):
     yearK = re_yearKind_index.match(info)
     if yearK and yearK.start() == 0:
         text += ' %s' % info[:yearK.end()]
-    return (text.replace('\n', ' '), link)
+    return text.replace('\n', ' '), link
 
 
 class GatherRefs(DOMParserBase):
@@ -687,7 +687,7 @@ class IMDbSqlAccessSystem(IMDbBase):
         elif isinstance(o, dict):
             for value in o.values():
                 self._findRefs(value, trefs, nrefs)
-        return (trefs, nrefs)
+        return trefs, nrefs
 
     def _extractRefs(self, o):
         """Scan for titles or names references in strings."""

@@ -702,7 +702,7 @@ class IMDbSqlAccessSystem(IMDbBase):
                 "imdb.parser.sql.IMDbSqlAccessSystem; "
                 "if it's not a recursion limit exceeded and we're not "
                 "running in a Symbian environment, it's a bug:\n%s" % e)
-        return (trefs, nrefs)
+        return trefs, nrefs
 
     def _changeAKAencoding(self, akanotes, akatitle):
         """Return akatitle in the correct charset, as specified in

@@ -437,11 +437,13 @@ def ISNULL(x):
     """Emulate SQLObject's ISNULL."""
     # XXX: Should we use null()? Can null() be a global instance?
     # XXX: Is it safe to test None with the == operator, in this case?
-    return x == None
+    return x is None
 
+
 def ISNOTNULL(x):
     """Emulate SQLObject's ISNOTNULL."""
-    return x != None
+    return x is not None
 
+
 def CONTAINSSTRING(expr, pattern):
     """Emulate SQLObject's CONTAINSSTRING."""
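Switching these helpers from == to identity tests is more than style: they emulate SQLObject operators, and == can be overloaded (SQLObject column expressions do exactly that), so `is None` is the only reliable null check. The pitfall, demonstrated with an invented class:

    class Column(object):
        def __eq__(self, other):
            return 'col = %r' % other  # overloaded: builds SQL text, not a bool

    c = Column()
    print(c == None)  # 'col = None' -> a truthy string, not a real comparison
    print(c is None)  # False: identity can never be overloaded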
|
@@ -122,53 +122,80 @@ class DBTable(object):


 # Default values to insert in some tables: {'column': (list, of, values, ...)}
-kindTypeDefs = {'kind': ('movie', 'tv series', 'tv movie', 'video movie',
-    'tv mini series', 'video game', 'episode')}
-companyTypeDefs = {'kind': ('distributors', 'production companies',
-    'special effects companies', 'miscellaneous companies')}
-infoTypeDefs = {'info': ('runtimes', 'color info', 'genres', 'languages',
-    'certificates', 'sound mix', 'tech info', 'countries', 'taglines',
-    'keywords', 'alternate versions', 'crazy credits', 'goofs',
-    'soundtrack', 'quotes', 'release dates', 'trivia', 'locations',
-    'mini biography', 'birth notes', 'birth date', 'height',
-    'death date', 'spouse', 'other works', 'birth name',
-    'salary history', 'nick names', 'books', 'agent address',
-    'biographical movies', 'portrayed in', 'where now', 'trade mark',
-    'interviews', 'article', 'magazine cover photo', 'pictorial',
-    'death notes', 'LD disc format', 'LD year', 'LD digital sound',
-    'LD official retail price', 'LD frequency response', 'LD pressing plant',
-    'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date',
-    'LD production country', 'LD contrast', 'LD color rendition',
-    'LD picture format', 'LD video noise', 'LD video artifacts',
-    'LD release country', 'LD sharpness', 'LD dynamic range',
-    'LD audio noise', 'LD color information', 'LD group genre',
-    'LD quality program', 'LD close captions-teletext-ld-g',
-    'LD category', 'LD analog left', 'LD certification',
-    'LD audio quality', 'LD video quality', 'LD aspect ratio',
-    'LD analog right', 'LD additional information',
-    'LD number of chapter stops', 'LD dialogue intellegibility',
-    'LD disc size', 'LD master format', 'LD subtitles',
-    'LD status of availablility', 'LD quality of source',
-    'LD number of sides', 'LD video standard', 'LD supplement',
-    'LD original title', 'LD sound encoding', 'LD number', 'LD label',
-    'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay',
-    'novel', 'adaption', 'book', 'production process protocol',
-    'printed media reviews', 'essays', 'other literature', 'mpaa',
-    'plot', 'votes distribution', 'votes', 'rating',
-    'production dates', 'copyright holder', 'filming dates', 'budget',
-    'weekend gross', 'gross', 'opening weekend', 'rentals',
-    'admissions', 'studios', 'top 250 rank', 'bottom 10 rank')}
-compCastTypeDefs = {'kind': ('cast', 'crew', 'complete', 'complete+verified')}
-linkTypeDefs = {'link': ('follows', 'followed by', 'remake of', 'remade as',
-    'references', 'referenced in', 'spoofs', 'spoofed in',
-    'features', 'featured in', 'spin off from', 'spin off',
-    'version of', 'similar to', 'edited into',
-    'edited from', 'alternate language version of',
-    'unknown link')}
-roleTypeDefs = {'role': ('actor', 'actress', 'producer', 'writer',
-    'cinematographer', 'composer', 'costume designer',
-    'director', 'editor', 'miscellaneous crew',
-    'production designer', 'guest')}
+kindTypeDefs = {
+    'kind': (
+        'movie', 'tv series', 'tv movie', 'video movie',
+        'tv mini series', 'video game', 'episode', 'short', 'tv short'
+    )
+}
+
+companyTypeDefs = {
+    'kind': (
+        'distributors', 'production companies',
+        'special effects companies', 'miscellaneous companies'
+    )
+}
+
+infoTypeDefs = {
+    'info': (
+        'runtimes', 'color info', 'genres', 'languages',
+        'certificates', 'sound mix', 'tech info', 'countries', 'taglines',
+        'keywords', 'alternate versions', 'crazy credits', 'goofs',
+        'soundtrack', 'quotes', 'release dates', 'trivia', 'locations',
+        'mini biography', 'birth notes', 'birth date', 'height',
+        'death date', 'spouse', 'other works', 'birth name',
+        'salary history', 'nick names', 'books', 'agent address',
+        'biographical movies', 'portrayed in', 'where now', 'trade mark',
+        'interviews', 'article', 'magazine cover photo', 'pictorial',
+        'death notes', 'LD disc format', 'LD year', 'LD digital sound',
+        'LD official retail price', 'LD frequency response', 'LD pressing plant',
+        'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date',
+        'LD production country', 'LD contrast', 'LD color rendition',
+        'LD picture format', 'LD video noise', 'LD video artifacts',
+        'LD release country', 'LD sharpness', 'LD dynamic range',
+        'LD audio noise', 'LD color information', 'LD group genre',
+        'LD quality program', 'LD close captions-teletext-ld-g',
+        'LD category', 'LD analog left', 'LD certification',
+        'LD audio quality', 'LD video quality', 'LD aspect ratio',
+        'LD analog right', 'LD additional information',
+        'LD number of chapter stops', 'LD dialogue intellegibility',
+        'LD disc size', 'LD master format', 'LD subtitles',
+        'LD status of availablility', 'LD quality of source',
+        'LD number of sides', 'LD video standard', 'LD supplement',
+        'LD original title', 'LD sound encoding', 'LD number', 'LD label',
+        'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay',
+        'novel', 'adaption', 'book', 'production process protocol',
+        'printed media reviews', 'essays', 'other literature', 'mpaa',
+        'plot', 'votes distribution', 'votes', 'rating',
+        'production dates', 'copyright holder', 'filming dates', 'budget',
+        'weekend gross', 'gross', 'opening weekend', 'rentals',
+        'admissions', 'studios', 'top 250 rank', 'bottom 10 rank'
+    )
+}
+
+compCastTypeDefs = {
+    'kind': ('cast', 'crew', 'complete', 'complete+verified')
+}
+
+linkTypeDefs = {
+    'link': (
+        'follows', 'followed by', 'remake of', 'remade as',
+        'references', 'referenced in', 'spoofs', 'spoofed in',
+        'features', 'featured in', 'spin off from', 'spin off',
+        'version of', 'similar to', 'edited into',
+        'edited from', 'alternate language version of',
+        'unknown link'
+    )
+}
+
+roleTypeDefs = {
+    'role': (
+        'actor', 'actress', 'producer', 'writer',
+        'cinematographer', 'composer', 'costume designer',
+        'director', 'editor', 'miscellaneous crew',
+        'production designer', 'guest'
+    )
+}

 # Schema of tables in our database.
 # XXX: Foreign keys can be used to create constrains between tables,
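Beyond the reformatting, kindTypeDefs gains the new 'short' and 'tv short' kinds. Each {'column': (values, ...)} mapping seeds one lookup table; a hedged sketch of how such a mapping expands into rows (the helper name is invented for illustration, not the project's loader):

    def iter_default_rows(defaults):
        # Expand {'column': (v1, v2, ...)} into one (column, value) pair per row.
        for column, values in defaults.items():
            for value in values:
                yield column, value

    kindTypeDefs = {'kind': ('movie', 'tv series', 'short', 'tv short')}
    for column, value in iter_default_rows(kindTypeDefs):
        print("INSERT INTO kind_type (%s) VALUES ('%s');" % (column, value))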
@@ -186,7 +213,7 @@ DB_SCHEMA = [
     # the alternateID attribute here will be ignored by SQLAlchemy.
     DBCol('id', INTCOL, notNone=True, alternateID=True),
     DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
-    DBCol('imdbIndex', UNICODECOL, length=12, default=None),
+    DBCol('imdbIndex', STRINGCOL, length=12, default=None),
     DBCol('imdbID', INTCOL, default=None, index='idx_imdb_id'),
     DBCol('gender', STRINGCOL, length=1, default=None),
     DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
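This hunk and the five that follow apply one change: imdbIndex (and, in one table, countryCode) switches from UNICODECOL to STRINGCOL. That is presumably safe because imdbIndex only ever carries an ASCII roman-numeral disambiguator such as 'I' or 'XIV', so a plain 12-character string column suffices; a quick check of that assumption (not project code):

    import re

    roman = re.compile(r'^[IVXLCDM]+$')
    for value in ('I', 'II', 'XIV', 'not-roman'):
        print('%s -> %s' % (value, bool(roman.match(value))))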
@@ -204,7 +231,7 @@ DB_SCHEMA = [
     # from namePcodeNf.
     DBCol('id', INTCOL, notNone=True, alternateID=True),
     DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
-    DBCol('imdbIndex', UNICODECOL, length=12, default=None),
+    DBCol('imdbIndex', STRINGCOL, length=12, default=None),
     DBCol('imdbID', INTCOL, default=None),
     DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
           index='idx_pcodenf'),
@@ -218,7 +245,7 @@ DB_SCHEMA = [
     # namePcodeSf is the soundex of the name plus the country code.
     DBCol('id', INTCOL, notNone=True, alternateID=True),
     DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
-    DBCol('countryCode', UNICODECOL, length=255, default=None),
+    DBCol('countryCode', STRINGCOL, length=255, default=None),
     DBCol('imdbID', INTCOL, default=None),
     DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
           index='idx_pcodenf'),
@@ -237,7 +264,7 @@ DB_SCHEMA = [
     DBCol('id', INTCOL, notNone=True, alternateID=True),
     DBCol('title', UNICODECOL, notNone=True,
           index='idx_title', indexLen=10),
-    DBCol('imdbIndex', UNICODECOL, length=12, default=None),
+    DBCol('imdbIndex', STRINGCOL, length=12, default=None),
     DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
     DBCol('productionYear', INTCOL, default=None),
     DBCol('imdbID', INTCOL, default=None, index="idx_imdb_id"),
@@ -264,7 +291,7 @@ DB_SCHEMA = [
     DBCol('personID', INTCOL, notNone=True, index='idx_person',
           foreignKey='Name'),
     DBCol('name', UNICODECOL, notNone=True),
-    DBCol('imdbIndex', UNICODECOL, length=12, default=None),
+    DBCol('imdbIndex', STRINGCOL, length=12, default=None),
     DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
           index='idx_pcodecf'),
     DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
@@ -291,7 +318,7 @@ DB_SCHEMA = [
     DBCol('movieID', INTCOL, notNone=True, index='idx_movieid',
           foreignKey='Title'),
     DBCol('title', UNICODECOL, notNone=True),
-    DBCol('imdbIndex', UNICODECOL, length=12, default=None),
+    DBCol('imdbIndex', STRINGCOL, length=12, default=None),
     DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
     DBCol('productionYear', INTCOL, default=None),
     DBCol('phoneticCode', STRINGCOL, length=5, default=None,
@@ -42,8 +42,22 @@ _utils_logger = logging.getLogger('imdbpy.utils')
 # and year of release.
 # XXX: probably L, C, D and M are far too much! ;-)
 re_year_index = re.compile(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
 re_extended_year_index = re.compile(r'\((TV episode|TV Series|TV mini-series|TV|Video|Video Game)? ?((?:[0-9\?]{4})(?:-[0-9\?]{4})?)(?:/([IVXLCDM]+)?)?\)')
 re_remove_kind = re.compile(r'\((TV episode|TV Series|TV mini-series|TV|Video|Video Game)? ?')
+re_m_episode = re.compile(r'\(TV Episode\)\s+-\s+', re.I)
+re_m_series = re.compile(r'Season\s+\d+\s+\|\s+Episode\s+\d+\s+-', re.I)
+re_m_imdbIndex = re.compile(r'\(([IVXLCDM]+)\)')
+re_m_kind = re.compile(
+    r'\((TV episode|TV Series|TV mini-series|mini|TV|Video|Video Game|VG|Short|TV Movie|TV Short|V)\)',
+    re.I)
+
+KIND_MAP = {
+    'tv': 'tv movie',
+    'tv episode': 'episode',
+    'v': 'video movie',
+    'video': 'video movie',
+    'vg': 'video game',
+    'mini': 'tv mini series',
+    'tv mini-series': 'tv mini series'
+}

 # Match only the imdbIndex (for name strings).
 re_index = re.compile(r'^\(([IVXLCDM]+)\)$')
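To see how the new patterns cooperate, here is a hedged walk-through (regex and mapping copied from the hunk above; the title and expected prints are illustrative): re_m_kind extracts the parenthesised kind marker, and KIND_MAP normalizes its spelling to the canonical kind names.

    import re

    re_m_kind = re.compile(
        r'\((TV episode|TV Series|TV mini-series|mini|TV|Video|Video Game|VG|Short|TV Movie|TV Short|V)\)',
        re.I)
    KIND_MAP = {'tv': 'tv movie', 'tv episode': 'episode', 'v': 'video movie',
                'video': 'video movie', 'vg': 'video game',
                'mini': 'tv mini series', 'tv mini-series': 'tv mini series'}

    title = 'Troopers (Video) (1997)'
    detected = re_m_kind.findall(title)
    kind = KIND_MAP.get(detected[-1].lower(), detected[-1].lower())
    print(kind)                               # video movie
    print(re_m_kind.sub('', title).strip())   # 'Troopers  (1997)' (year parsed later)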
@@ -283,13 +297,6 @@ def _split_series_episode(title):
             # that means this is an episode title, as returned by
             # the web server.
             series_title = title[:second_quot]
-        ##elif episode_or_year[-1:] == '}':
-        ##    # Title of the episode, as in the plain text data files.
-        ##    begin_eps = episode_or_year.find('{')
-        ##    if begin_eps == -1: return series_title, episode_or_year
-        ##    series_title = title[:second_quot+begin_eps].rstrip()
-        ##    # episode_or_year is returned with the {...}
-        ##    episode_or_year = episode_or_year[begin_eps:]
     return series_title, episode_or_year


@@ -383,65 +390,24 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
     # tv mini series: 5,497
     # video game: 5,490
     # More up-to-date statistics: http://us.imdb.com/database_statistics
-    if title.endswith('(TV)'):
-        kind = u'tv movie'
-        title = title[:-4].rstrip()
-    elif title.endswith('(TV Movie)'):
-        kind = u'tv movie'
-        title = title[:-10].rstrip()
-    elif title.endswith('(V)'):
-        kind = u'video movie'
-        title = title[:-3].rstrip()
-    elif title.lower().endswith('(video)'):
-        kind = u'video movie'
-        title = title[:-7].rstrip()
-    elif title.endswith('(TV Short)'):
-        kind = u'tv short'
-        title = title[:-10].rstrip()
-    elif title.endswith('(TV Mini-Series)'):
-        kind = u'tv mini series'
-        title = title[:-16].rstrip()
-    elif title.endswith('(mini)'):
-        kind = u'tv mini series'
-        title = title[:-6].rstrip()
-    elif title.endswith('(VG)'):
-        kind = u'video game'
-        title = title[:-4].rstrip()
-    elif title.endswith('(Video Game)'):
-        kind = u'video game'
-        title = title[:-12].rstrip()
-    elif title.endswith('(TV Series)'):
-        epindex = title.find('(TV Episode) - ')
-        if epindex >= 0:
-            # It's an episode of a series.
-            kind = u'episode'
-            series_info = analyze_title(title[epindex + 15:])
-            result['episode of'] = series_info.get('title')
-            result['series year'] = series_info.get('year')
-            title = title[:epindex]
-        else:
-            kind = u'tv series'
-            title = title[:-11].rstrip()
+    epindex = re_m_episode.search(title)
+    if epindex:
+        # It's an episode of a series.
+        kind = 'episode'
+        series_title = title[epindex.end():]
+        series_title = re_m_series.sub('', series_title)
+        series_info = analyze_title(series_title)
+        result['episode of'] = series_info.get('title')
+        result['series year'] = series_info.get('year')
+        title = title[:epindex.start()].strip()
+    else:
+        detected_kind = re_m_kind.findall(title)
+        if detected_kind:
+            kind = detected_kind[-1].lower().replace('-', '')
+            kind = KIND_MAP.get(kind, kind)
+            title = re_m_kind.sub('', title).strip()
     # Search for the year and the optional imdbIndex (a roman number).
     yi = re_year_index.findall(title)
     if not yi:
         yi = re_extended_year_index.findall(title)
         if yi:
             yk, yiy, yii = yi[-1]
             yi = [(yiy, yii)]
             if yk == 'TV episode':
                 kind = u'episode'
             elif yk in ('TV', 'TV Movie'):
                 kind = u'tv movie'
             elif yk == 'TV Series':
                 kind = u'tv series'
             elif yk == 'Video':
                 kind = u'video movie'
             elif yk in ('TV mini-series', 'TV Mini-Series'):
                 kind = u'tv mini series'
             elif yk == 'Video Game':
                 kind = u'video game'
             title = re_remove_kind.sub('(', title)
     if yi:
         last_yi = yi[-1]
         year = last_yi[0]
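The net effect of this hunk: the long endswith() ladder collapses into the re_m_kind/KIND_MAP pair, and an episode is now recognized by the '(TV Episode) - ' separator anywhere in the string instead of only under a trailing '(TV Series)'. A standalone sketch of the split performed by the new branch (title invented; the real code then recurses into analyze_title() on the series part to fill 'episode of' and 'series year'):

    import re

    re_m_episode = re.compile(r'\(TV Episode\)\s+-\s+', re.I)

    title = 'Pilot (TV Episode) - Some Show (TV Series 2010-2012)'
    m = re_m_episode.search(title)
    if m:
        print(title[:m.start()].strip())   # Pilot
        print(title[m.end():])             # Some Show (TV Series 2010-2012)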
@@ -450,7 +416,12 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
             year = year[:-len(imdbIndex)-1]
         i = title.rfind('(%s)' % last_yi[0])
         if i != -1:
-            title = title[:i-1].rstrip()
+            title = title[:i - 1].rstrip()
+    if not imdbIndex:
+        detect_imdbIndex = re_m_imdbIndex.findall(title)
+        if detect_imdbIndex:
+            imdbIndex = detect_imdbIndex[-1]
+            title = re_m_imdbIndex.sub('', title).strip()
     # This is a tv (mini) series: strip the '"' at the begin and at the end.
     # XXX: strip('"') is not used for compatibility with Python 2.0.
     if title and title[0] == title[-1] == '"':
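The added fallback recovers a roman-numeral imdbIndex from whatever remains of the title when the year regexes did not capture one. A small sketch with an invented title:

    import re

    re_m_imdbIndex = re.compile(r'\(([IVXLCDM]+)\)')

    title = 'Hamlet (II)'
    found = re_m_imdbIndex.findall(title)
    if found:
        imdbIndex = found[-1]
        title = re_m_imdbIndex.sub('', title).strip()
    print('%s / %s' % (imdbIndex, title))   # II / Hamlet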
@@ -464,8 +435,6 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
             title = canonicalTitle(title)
         else:
             title = normalizeTitle(title)
-    # 'kind' is one in ('movie', 'episode', 'tv series', 'tv mini series',
-    # 'tv movie', 'video movie', 'video game')
     result['title'] = title
     result['kind'] = kind or u'movie'
     if year and year != '????':
@@ -832,7 +801,7 @@ def date_and_notes(s):
     """Parse (birth|death) date and notes; returns a tuple in the
     form (date, notes)."""
     s = s.strip()
-    if not s: return (u'', u'')
+    if not s: return u'', u''
     notes = u''
     if s[0].isdigit() or s.split()[0].lower() in ('c.', 'january', 'february',
                                                   'march', 'april', 'may', 'june',
@@ -990,7 +959,7 @@ def _tag4TON(ton, addAccessSystem=False, _containerOnly=False):
         beginTag += extras
     if ton.notes:
         beginTag += u'<notes>%s</notes>' % _normalizeValue(ton.notes)
-    return (beginTag, u'</%s>' % tag)
+    return beginTag, u'</%s>' % tag


 TAGS_TO_MODIFY = {
@@ -1264,8 +1233,8 @@ class _Container(object):
         self.__role = role

     currentRole = property(_get_currentRole, _set_currentRole,
-                           doc="The role of a Person in a Movie" + \
-                           " or the interpreter of a Character in a Movie.")
+                           doc="The role of a Person in a Movie"
+                               " or the interpreter of a Character in a Movie.")

     def _init(self, **kwds): pass
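The doc string fix drops the explicit '+' and backslash continuation in favour of adjacent string literals, which the parser joins at compile time and which need no continuation character inside parentheses:

    doc = ("The role of a Person in a Movie"
           " or the interpreter of a Character in a Movie.")
    assert doc == "The role of a Person in a Movie or the interpreter of a Character in a Movie."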
@@ -1478,10 +1447,10 @@ class _Container(object):
         except RuntimeError, e:
             # Symbian/python 2.2 has a poor regexp implementation.
             import warnings
-            warnings.warn('RuntimeError in '
-                    "imdb.utils._Container.__getitem__; if it's not "
-                    "a recursion limit exceeded and we're not running "
-                    "in a Symbian environment, it's a bug:\n%s" % e)
+            warnings.warn("RuntimeError in imdb.utils._Container.__getitem__;"
+                          " if it's not a recursion limit exceeded and we're"
+                          " not running in a Symbian environment, it's a"
+                          " bug:\n%s" % e)
         return rawData

     def __setitem__(self, key, item):