Update IMDb 5.1 (r907) → 5.2.1dev20171113 (f640595).

Thanks to @MasterMind2k for the backport.
JackDandy 2018-03-26 18:16:59 +01:00
parent 18c400acec
commit 78026584eb
24 changed files with 1992 additions and 1184 deletions


@ -7,6 +7,7 @@
* Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed
* Change Hachoir: upstream can't support PY2, so backport their PY3 code to avoid needing system-dependent external binaries like mediainfo
* Update html5lib 0.99999999/1.0b9 (1a28d72) to 1.1-dev (e9ef538)
* Update IMDb 5.1 (r907) to 5.2.1dev20171113 (f640595)
[develop changelog]
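A note on the feedparser item above: cchardet is a C-accelerated drop-in replacement for the pure-Python chardet encoding detector. A minimal sketch of the try-it-first pattern such libraries use (the guess_encoding helper is illustrative, not feedparser's API):

# Prefer the C-accelerated detector when available; fall back gracefully.
try:
    import cchardet as chardet
except ImportError:
    try:
        import chardet
    except ImportError:
        chardet = None

def guess_encoding(data):
    # data: raw bytes whose character set is unknown
    if chardet is None:
        return 'utf-8'  # last-resort assumption
    return chardet.detect(data)['encoding'] or 'utf-8'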


@ -23,8 +23,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
from copy import deepcopy
from imdb.utils import analyze_company_name, build_company_name, \
flatten, _Container, cmpCompanies
from imdb.utils import _Container
from imdb.utils import analyze_company_name, build_company_name, cmpCompanies, flatten
class Company(_Container):


@ -24,8 +24,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
from copy import deepcopy
from imdb import linguistics
from imdb.utils import analyze_title, build_title, canonicalTitle, \
flatten, _Container, cmpMovies
from imdb.utils import _Container
from imdb.utils import analyze_title, build_title, canonicalTitle, cmpMovies, flatten
class Movie(_Container):


@ -6,7 +6,7 @@ a person from the IMDb database.
It can fetch data through different media (e.g.: the IMDb web pages,
a SQL database, etc.)
Copyright 2004-2016 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2018 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -25,12 +25,25 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
__all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
'available_access_systems']
__version__ = VERSION = '5.1'
__version__ = VERSION = '5.2.1dev20171113'
VERSION_NOTICE = """This is the imdbpy-legacy branch of IMDbPY, and requires Python 2.
Please notice that this version is mostly unsupported.
For a version compatible with Python 3, see the master branch:
https://github.com/alberanid/imdbpy/
"""
import sys
if sys.hexversion >= 0x3000000:
print(VERSION_NOTICE)
sys.exit(1)
# Import compatibility module (importing it is enough).
import _compat
import sys, os, ConfigParser, logging
import os, ConfigParser, logging
from types import MethodType
from imdb import Movie, Person, Character, Company
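For context on the version guard added above: sys.hexversion packs (major, minor, micro, release level, serial) into one integer, so a single numeric comparison selects whole interpreter generations. An illustrative check:

import sys

# Layout is 0xMMmmppRS: major, minor and micro bytes, then release level
# and serial. Python 2.7.13 final is 0x02070DF0, Python 3.0.0 final is
# 0x030000F0, so every Python 3 interpreter satisfies >= 0x3000000.
if sys.hexversion >= 0x3000000:
    print('running under Python 3')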
@ -38,38 +51,39 @@ import imdb._logging
from imdb._exceptions import IMDbError, IMDbDataAccessError, IMDbParserError
from imdb.utils import build_title, build_name, build_company_name
_imdb_logger = logging.getLogger('imdbpy')
_aux_logger = logging.getLogger('imdbpy.aux')
# URLs of the main pages for movies, persons, characters and queries.
imdbURL_base = 'http://akas.imdb.com/'
imdbURL_base = 'http://www.imdb.com/'
# NOTE: the urls below will be removed in a future version.
# please use the values in the 'urls' attribute
# of the IMDbBase subclass instance.
# http://akas.imdb.com/title/
# http://www.imdb.com/title/
imdbURL_movie_base = '%stitle/' % imdbURL_base
# http://akas.imdb.com/title/tt%s/
# http://www.imdb.com/title/tt%s/
imdbURL_movie_main = imdbURL_movie_base + 'tt%s/'
# http://akas.imdb.com/name/
# http://www.imdb.com/name/
imdbURL_person_base = '%sname/' % imdbURL_base
# http://akas.imdb.com/name/nm%s/
# http://www.imdb.com/name/nm%s/
imdbURL_person_main = imdbURL_person_base + 'nm%s/'
# http://akas.imdb.com/character/
# http://www.imdb.com/character/
imdbURL_character_base = '%scharacter/' % imdbURL_base
# http://akas.imdb.com/character/ch%s/
# http://www.imdb.com/character/ch%s/
imdbURL_character_main = imdbURL_character_base + 'ch%s/'
# http://akas.imdb.com/company/
# http://www.imdb.com/company/
imdbURL_company_base = '%scompany/' % imdbURL_base
# http://akas.imdb.com/company/co%s/
# http://www.imdb.com/company/co%s/
imdbURL_company_main = imdbURL_company_base + 'co%s/'
# http://akas.imdb.com/keyword/%s/
# http://www.imdb.com/keyword/%s/
imdbURL_keyword_main = imdbURL_base + 'keyword/%s/'
# http://akas.imdb.com/chart/top
# http://www.imdb.com/chart/top
imdbURL_top250 = imdbURL_base + 'chart/top'
# http://akas.imdb.com/chart/bottom
# http://www.imdb.com/chart/bottom
imdbURL_bottom100 = imdbURL_base + 'chart/bottom'
# http://akas.imdb.com/find?%s
# http://www.imdb.com/find?%s
imdbURL_find = imdbURL_base + 'find?%s'
# Name of the configuration file.
@ -103,7 +117,7 @@ class ConfigParserWithCase(ConfigParser.ConfigParser):
try:
self.read(fname)
except (ConfigParser.MissingSectionHeaderError,
ConfigParser.ParsingError), e:
ConfigParser.ParsingError) as e:
_aux_logger.warn('Troubles reading config file: %s' % e)
# Stop at the first valid file.
if self.has_section('imdbpy'):
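The except-clause rewrites in this hunk, repeated throughout the commit, are the forward-compatible spelling: "except E, e" is a syntax error on Python 3, while "except E as e" works on Python 2.6 and later as well as on Python 3. A minimal before/after:

try:
    int('not a number')
# Old spelling, Python 2 only (SyntaxError on Python 3):
#     except ValueError, e:
except ValueError as e:  # valid on Python 2.6+ and Python 3
    print('caught: %s' % e)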
@ -159,10 +173,8 @@ def IMDb(accessSystem=None, *arguments, **keywords):
accessSystem = 'http'
kwds.update(keywords)
keywords = kwds
except Exception, e:
import logging
logging.getLogger('imdbpy').warn('Unable to read configuration' \
' file; complete error: %s' % e)
except Exception as e:
_imdb_logger.warn('Unable to read configuration file; complete error: %s' % e)
# It just LOOKS LIKE a bad habit: we tried to read config
# options from some files, but something is gone horribly
# wrong: ignore everything and pretend we were called with
@ -177,9 +189,8 @@ def IMDb(accessSystem=None, *arguments, **keywords):
try:
import logging.config
logging.config.fileConfig(os.path.expanduser(logCfg))
except Exception, e:
logging.getLogger('imdbpy').warn('unable to read logger ' \
'config: %s' % e)
except Exception as e:
_imdb_logger.warn('unable to read logger config: %s' % e)
if accessSystem in ('httpThin', 'webThin', 'htmlThin'):
logging.warn('httpThin was removed since IMDbPY 4.8')
accessSystem = 'http'
@ -244,9 +255,6 @@ class IMDbBase:
# in the subclasses).
accessSystem = 'UNKNOWN'
# Top-level logger for IMDbPY.
_imdb_logger = logging.getLogger('imdbpy')
# Whether to re-raise caught exceptions or not.
_reraise_exceptions = False
@ -285,30 +293,30 @@ class IMDbBase:
imdbURL_base = 'http://%s' % imdbURL_base
if not imdbURL_base.endswith('/'):
imdbURL_base = '%s/' % imdbURL_base
# http://akas.imdb.com/title/
imdbURL_movie_base='%stitle/' % imdbURL_base
# http://akas.imdb.com/title/tt%s/
imdbURL_movie_main=imdbURL_movie_base + 'tt%s/'
# http://akas.imdb.com/name/
imdbURL_person_base='%sname/' % imdbURL_base
# http://akas.imdb.com/name/nm%s/
imdbURL_person_main=imdbURL_person_base + 'nm%s/'
# http://akas.imdb.com/character/
imdbURL_character_base='%scharacter/' % imdbURL_base
# http://akas.imdb.com/character/ch%s/
imdbURL_character_main=imdbURL_character_base + 'ch%s/'
# http://akas.imdb.com/company/
imdbURL_company_base='%scompany/' % imdbURL_base
# http://akas.imdb.com/company/co%s/
imdbURL_company_main=imdbURL_company_base + 'co%s/'
# http://akas.imdb.com/keyword/%s/
imdbURL_keyword_main=imdbURL_base + 'keyword/%s/'
# http://akas.imdb.com/chart/top
imdbURL_top250=imdbURL_base + 'chart/top'
# http://akas.imdb.com/chart/bottom
imdbURL_bottom100=imdbURL_base + 'chart/bottom'
# http://akas.imdb.com/find?%s
imdbURL_find=imdbURL_base + 'find?%s'
# http://www.imdb.com/title/
imdbURL_movie_base = '%stitle/' % imdbURL_base
# http://www.imdb.com/title/tt%s/
imdbURL_movie_main = imdbURL_movie_base + 'tt%s/'
# http://www.imdb.com/name/
imdbURL_person_base = '%sname/' % imdbURL_base
# http://www.imdb.com/name/nm%s/
imdbURL_person_main = imdbURL_person_base + 'nm%s/'
# http://www.imdb.com/character/
imdbURL_character_base = '%scharacter/' % imdbURL_base
# http://www.imdb.com/character/ch%s/
imdbURL_character_main = imdbURL_character_base + 'ch%s/'
# http://www.imdb.com/company/
imdbURL_company_base = '%scompany/' % imdbURL_base
# http://www.imdb.com/company/co%s/
imdbURL_company_main = imdbURL_company_base + 'co%s/'
# http://www.imdb.com/keyword/%s/
imdbURL_keyword_main = imdbURL_base + 'keyword/%s/'
# http://www.imdb.com/chart/top
imdbURL_top250 = imdbURL_base + 'chart/top'
# http://www.imdb.com/chart/bottom
imdbURL_bottom100 = imdbURL_base + 'chart/bottom'
# http://www.imdb.com/find?%s
imdbURL_find = imdbURL_base + 'find?%s'
self.urls = dict(
movie_base=imdbURL_movie_base,
movie_main=imdbURL_movie_main,
@ -727,16 +735,15 @@ class IMDbBase:
mopID = mop.companyID
prefix = 'company'
else:
raise IMDbError('object ' + repr(mop) + \
' is not a Movie, Person, Character or Company instance')
raise IMDbError('object ' + repr(mop) +
' is not a Movie, Person, Character or Company instance')
if mopID is None:
# XXX: enough? It's obvious that there are Characters
# objects without characterID, so I think they should
# just do nothing, when an i.update(character) is tried.
if prefix == 'character':
return
raise IMDbDataAccessError( \
'the supplied object has null movieID, personID or companyID')
raise IMDbDataAccessError('supplied object has null movieID, personID or companyID')
if mop.accessSystem == self.accessSystem:
aSystem = self
else:
@ -760,21 +767,22 @@ class IMDbBase:
continue
if not i:
continue
self._imdb_logger.debug('retrieving "%s" info set', i)
_imdb_logger.debug('retrieving "%s" info set', i)
try:
method = getattr(aSystem, 'get_%s_%s' %
(prefix, i.replace(' ', '_')))
except AttributeError:
self._imdb_logger.error('unknown information set "%s"', i)
_imdb_logger.error('unknown information set "%s"', i)
# Keeps going.
method = lambda *x: {}
try:
ret = method(mopID)
except Exception, e:
self._imdb_logger.critical('caught an exception retrieving ' \
'or parsing "%s" info set for mopID ' \
'"%s" (accessSystem: %s)',
i, mopID, mop.accessSystem, exc_info=True)
except Exception:
_imdb_logger.critical(
'caught an exception retrieving or parsing "%s" info set'
' for mopID "%s" (accessSystem: %s)',
i, mopID, mop.accessSystem, exc_info=True
)
ret = {}
# If requested by the user, reraise the exception.
if self._reraise_exceptions:
@ -826,9 +834,7 @@ class IMDbBase:
raise NotImplementedError('override this method')
def _searchIMDb(self, kind, ton, title_kind=None):
"""Search the IMDb akas server for the given title or name."""
# The Exact Primary search system has gone AWOL, so we resort
# to the mobile search. :-/
"""Search the IMDb www server for the given title or name."""
if not ton:
return None
ton = ton.strip('"')
@ -935,8 +941,8 @@ class IMDbBase:
else:
imdbID = aSystem.company2imdbID(build_company_name(mop))
else:
raise IMDbError('object ' + repr(mop) + \
' is not a Movie, Person or Character instance')
raise IMDbError('object ' + repr(mop) +
' is not a Movie, Person or Character instance')
return imdbID
def get_imdbURL(self, mop):
@ -954,8 +960,8 @@ class IMDbBase:
elif isinstance(mop, Company.Company):
url_firstPart = imdbURL_company_main
else:
raise IMDbError('object ' + repr(mop) + \
' is not a Movie, Person, Character or Company instance')
raise IMDbError('object ' + repr(mop) +
' is not a Movie, Person, Character or Company instance')
return url_firstPart % imdbID
def get_special_methods(self):


@ -32,8 +32,9 @@ LEVELS = {'debug': logging.DEBUG,
imdbpyLogger = logging.getLogger('imdbpy')
imdbpyStreamHandler = logging.StreamHandler()
imdbpyFormatter = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]' \
' %(pathname)s:%(lineno)d: %(message)s')
imdbpyFormatter = logging.Formatter(
'%(asctime)s %(levelname)s [%(name)s] %(pathname)s:%(lineno)d: %(message)s'
)
imdbpyStreamHandler.setFormatter(imdbpyFormatter)
imdbpyLogger.addHandler(imdbpyStreamHandler)
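For reference, the formatter above (the old and new code build the same format string) produces records like the sample below; the logger name is illustrative:

import logging

logger = logging.getLogger('imdbpy.demo')
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(
    '%(asctime)s %(levelname)s [%(name)s] %(pathname)s:%(lineno)d: %(message)s'
))
logger.addHandler(handler)
logger.warning('sample record')
# e.g.: 2018-03-26 18:16:59,123 WARNING [imdbpy.demo] demo.py:10: sample record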


@ -269,8 +269,8 @@ for k, v in {'lt':u'<','gt':u'>','amp':u'&','quot':u'"','apos':u'\''}.items():
everyentcharrefs[k] = v
everyentcharrefs['#%s' % ord(v)] = v
everyentcharrefsget = everyentcharrefs.get
re_everyentcharrefs = re.compile('&(%s|\#160|\#\d{1,5});' %
'|'.join(map(re.escape, everyentcharrefs)))
re_everyentcharrefs = re.compile('&(%s|\#160|\#\d{1,5});' % '|'.join(map(re.escape,
everyentcharrefs)))
re_everyentcharrefssub = re_everyentcharrefs.sub
def _replAllXMLRef(match):
@ -408,7 +408,7 @@ def _valueWithType(tag, tagValue):
# Extra tags to get (if values were not already read from title/name).
_titleTags = ('imdbindex', 'kind', 'year')
_nameTags = ('imdbindex')
_nameTags = ('imdbindex',)
_companyTags = ('imdbindex', 'country')
def parseTags(tag, _topLevel=True, _as=None, _infoset2keys=None,
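The _nameTags change above is a bug fix rather than a style tweak: without the trailing comma the parentheses are plain grouping, the value stays a string, and membership tests match substrings instead of tuple items. A quick demonstration:

not_a_tuple = ('imdbindex')    # just a parenthesized string
real_tuple = ('imdbindex',)    # a one-element tuple

print(type(not_a_tuple).__name__)   # str
print('index' in not_a_tuple)       # True  (substring match, surprising)
print('index' in real_tuple)        # False (whole-item membership)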


@ -7,7 +7,7 @@ the imdb.IMDb function will return an instance of this class when
called with the 'accessSystem' argument set to "http" or "web"
or "html" (this is the default).
Copyright 2004-2012 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2017 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
@ -26,6 +26,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import sys
import ssl
import socket
import logging
from urllib import FancyURLopener, quote_plus
@ -68,8 +69,8 @@ class _ModuleProxy:
"""Initialize a proxy for the given module; defaultKeys, if set,
must be a dictionary of values to set for instanced objects."""
if oldParsers or fallBackToNew:
_aux_logger.warn('The old set of parsers was removed; falling ' \
'back to the new parsers.')
_aux_logger.warn('The old set of parsers was removed;'
' falling back to the new parsers.')
self.useModule = useModule
if defaultKeys is None:
defaultKeys = {}
@ -142,6 +143,7 @@ class IMDbURLopener(FancyURLopener):
def __init__(self, *args, **kwargs):
self._last_url = u''
kwargs['context'] = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
FancyURLopener.__init__(self, *args, **kwargs)
# Headers to add to every request.
# XXX: IMDb's web server doesn't like urllib-based programs,
@ -211,9 +213,9 @@ class IMDbURLopener(FancyURLopener):
if server_encode is None and content:
begin_h = content.find('text/html; charset=')
if begin_h != -1:
end_h = content[19+begin_h:].find('"')
end_h = content[19 + begin_h:].find('"')
if end_h != -1:
server_encode = content[19+begin_h:19+begin_h+end_h]
server_encode = content[19 + begin_h:19 + begin_h + end_h]
if server_encode:
try:
if lookup(server_encode):
@ -237,9 +239,10 @@ class IMDbURLopener(FancyURLopener):
if encode is None:
encode = 'latin_1'
# The detection of the encoding is error prone...
self._logger.warn('Unable to detect the encoding of the retrieved '
'page [%s]; falling back to default latin1.', encode)
##print unicode(content, encode, 'replace').encode('utf8')
self._logger.warn('Unable to detect the encoding of the retrieved page [%s];'
' falling back to default utf8.', encode)
if isinstance(content, unicode):
return content
return unicode(content, encode, 'replace')
def http_error_default(self, url, fp, errcode, errmsg, headers):
@ -288,8 +291,8 @@ class IMDbHTTPAccessSystem(IMDbBase):
self._getRefs = True
self._mdparse = False
if isThin:
self._http_logger.warn('"httpThin" access system no longer ' +
'supported; "http" used automatically', exc_info=False)
self._http_logger.warn('"httpThin" access system no longer supported;'
' "http" used automatically', exc_info=False)
self.isThin = 0
if self.accessSystem in ('httpThin', 'webThin', 'htmlThin'):
self.accessSystem = 'http'
@ -503,7 +506,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
return self.smProxy.search_movie_parser.parse(cont, results=results)['data']
def get_movie_main(self, movieID):
cont = self._retrieve(self.urls['movie_main'] % movieID + 'combined')
cont = self._retrieve(self.urls['movie_main'] % movieID + 'reference')
return self.mProxy.movie_parser.parse(cont, mdparse=self._mdparse)
def get_movie_full_credits(self, movieID):
@ -811,7 +814,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
def _search_keyword(self, keyword, results):
# XXX: the IMDb web server seems to have some serious problem with
# non-ascii keyword.
# E.g.: http://akas.imdb.com/keyword/fianc%E9/
# E.g.: http://www.imdb.com/keyword/fianc%E9/
# will return a 500 Internal Server Error: Redirect Recursion.
keyword = keyword.encode('utf8', 'ignore')
try:


@ -171,7 +171,7 @@ class PageElement:
return self
def _lastRecursiveChild(self):
"Finds the last element beneath this object to be parsed."
"""Finds the last element beneath this object to be parsed."""
lastChild = self
while hasattr(lastChild, 'contents') and lastChild.contents:
lastChild = lastChild.contents[-1]
@ -184,7 +184,7 @@ class PageElement:
newChild = NavigableString(newChild)
position = min(position, len(self.contents))
if hasattr(newChild, 'parent') and newChild.parent != None:
if hasattr(newChild, 'parent') and newChild.parent is not None:
# We're 'inserting' an element that's already one
# of this object's children.
if newChild.parent == self:
@ -323,7 +323,7 @@ class PageElement:
return r
def _findAll(self, name, attrs, text, limit, generator, **kwargs):
"Iterates over a generator looking for things that match."
"""Iterates over a generator looking for things that match."""
if isinstance(name, SoupStrainer):
strainer = name
@ -415,7 +415,7 @@ class NavigableString(unicode, PageElement):
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __getnewargs__(self):
return (NavigableString.__str__(self),)
return NavigableString.__str__(self),
def __getattr__(self, attr):
"""text.string gives you text. This is for backwards
@ -460,7 +460,7 @@ class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents."""
def _invert(h):
"Cheap function to invert a hash."
"""Cheap function to invert a hash."""
i = {}
for k,v in h.items():
i[v] = k
@ -501,14 +501,14 @@ class Tag(PageElement):
def __init__(self, parser, name, attrs=None, parent=None,
previous=None):
"Basic constructor."
"""Basic constructor."""
# We don't actually store the parser object: that lets extracted
# chunks be garbage-collected
self.parserClass = parser.__class__
self.isSelfClosing = parser.isSelfClosingTag(name)
self.name = name
if attrs == None:
if attrs is None:
attrs = []
self.attrs = attrs
self.contents = []
@ -541,18 +541,18 @@ class Tag(PageElement):
return self._getAttrMap()[key]
def __iter__(self):
"Iterating over a tag iterates over its contents."
"""Iterating over a tag iterates over its contents."""
return iter(self.contents)
def __len__(self):
"The length of a tag is the length of its list of contents."
"""The length of a tag is the length of its list of contents."""
return len(self.contents)
def __contains__(self, x):
return x in self.contents
def __nonzero__(self):
"A tag is non-None even if it has no contents."
"""A tag is non-None even if it has no contents."""
return True
def __setitem__(self, key, value):
@ -570,7 +570,7 @@ class Tag(PageElement):
self._getAttrMap()[key] = value
def __delitem__(self, key):
"Deleting tag[key] deletes all 'key' attributes for the tag."
"""Deleting tag[key] deletes all 'key' attributes for the tag."""
for item in self.attrs:
if item[0] == key:
self.attrs.remove(item)
@ -911,7 +911,7 @@ class SoupStrainer:
#print "Matching %s against %s" % (markup, matchAgainst)
result = False
if matchAgainst == True and type(matchAgainst) == types.BooleanType:
result = markup != None
result = markup is not None
elif callable(matchAgainst):
result = matchAgainst(markup)
else:
@ -1130,7 +1130,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
# Python installations can't copy regexes. If anyone
# was relying on the existence of markupMassage, this
# might cause problems.
del(self.markupMassage)
del self.markupMassage
self.reset()
SGMLParser.feed(self, markup)
@ -1253,7 +1253,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
"""
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
isNestable = nestingResetTriggers != None
isNestable = nestingResetTriggers is not None
isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
popTo = None
inclusive = True
@ -1264,9 +1264,9 @@ class BeautifulStoneSoup(Tag, SGMLParser):
#last occurance.
popTo = name
break
if (nestingResetTriggers != None
if (nestingResetTriggers is not None
and p.name in nestingResetTriggers) \
or (nestingResetTriggers == None and isResetNesting
or (nestingResetTriggers is None and isResetNesting
and self.RESET_NESTING_TAGS.has_key(p.name)):
#If we encounter one of the nesting reset triggers
@ -1342,11 +1342,11 @@ class BeautifulStoneSoup(Tag, SGMLParser):
self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
"Handle comments as Comment objects."
"""Handle comments as Comment objects."""
self._toStringSubclass(text, Comment)
def handle_charref(self, ref):
"Handle character references as data."
"""Handle character references as data."""
if self.convertEntities:
data = unichr(int(ref))
else:
@ -1397,7 +1397,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
self.handle_data(data)
def handle_decl(self, data):
"Handle DOCTYPEs and the like as Declaration objects."
"""Handle DOCTYPEs and the like as Declaration objects."""
self._toStringSubclass(data, Declaration)
def parse_declaration(self, i):
@ -1793,8 +1793,8 @@ class UnicodeDammit:
return self.markup
def _toUnicode(self, data, encoding):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
"""Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases"""
# strip Byte Order Mark (if present)
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \


@ -67,7 +67,7 @@ def tokenize_path(path):
if path[i] == '/':
if i > 0:
separators.append((last_position, i))
if (path[i+1] == '/'):
if path[i+1] == '/':
last_position = i
i = i + 1
else:


@ -2,7 +2,7 @@
parser.http.characterParser module (imdb package).
This module provides the classes (and the instances), used to parse
the IMDb pages on the akas.imdb.com server about a character.
the IMDb pages on the www.imdb.com server about a character.
E.g., for "Jesse James" the referred pages would be:
main details: http://www.imdb.com/character/ch0000001/
biography: http://www.imdb.com/character/ch0000001/bio
@ -37,7 +37,7 @@ _personIDs = re.compile(r'/name/nm([0-9]{7})')
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
"""Parser for the "filmography" page of a given character.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:
@ -101,7 +101,7 @@ class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
class DOMHTMLCharacterBioParser(DOMParserBase):
"""Parser for the "biography" page of a given character.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:
@ -146,7 +146,7 @@ class DOMHTMLCharacterBioParser(DOMParserBase):
class DOMHTMLCharacterQuotesParser(DOMParserBase):
"""Parser for the "quotes" page of a given character.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:


@ -2,12 +2,12 @@
parser.http.companyParser module (imdb package).
This module provides the classes (and the instances), used to parse
the IMDb pages on the akas.imdb.com server about a company.
the IMDb pages on the www.imdb.com server about a company.
E.g., for "Columbia Pictures [us]" the referred page would be:
main details: http://akas.imdb.com/company/co0071509/
main details: http://www.imdb.com/company/co0071509/
Copyright 2008-2009 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
Copyright 2008-2017 Davide Alberani <da@erlug.linux.it>
2008-2017 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -34,7 +34,7 @@ from imdb.utils import analyze_company_name
class DOMCompanyParser(DOMParserBase):
"""Parser for the main page of a given company.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:
@ -44,31 +44,38 @@ class DOMCompanyParser(DOMParserBase):
_containsObjects = True
extractors = [
Extractor(label='name',
path="//title",
attrs=Attribute(key='name',
path="./text()",
postprocess=lambda x: \
analyze_company_name(x, stripNotes=True))),
Extractor(
label='name',
path="//h1/span[@class='display-title ']", # note the extra trailing space in class
attrs=Attribute(
key='name',
path="./text()",
postprocess=lambda x: analyze_company_name(x, stripNotes=True)
)
),
Extractor(label='filmography',
group="//b/a[@name]",
group_key="./text()",
group_key_normalize=lambda x: x.lower(),
path="../following-sibling::ol[1]/li",
attrs=Attribute(key=None,
multi=True,
path={
'link': "./a[1]/@href",
'title': "./a[1]/text()",
'year': "./text()[1]"
},
postprocess=lambda x:
build_movie(u'%s %s' % \
(x.get('title'), x.get('year').strip()),
movieID=analyze_imdbid(x.get('link') or u''),
_parsingCompany=True))),
]
Extractor(
label='filmography',
group="//b/a[@name]",
group_key="./text()",
group_key_normalize=lambda x: x.lower(),
path="../following-sibling::ol[1]/li",
attrs=Attribute(
key=None,
multi=True,
path={
'link': "./a[1]/@href",
'title': "./a[1]/text()",
'year': "./text()[1]"
},
postprocess=lambda x: build_movie(
'%s %s' % (x.get('title'), x.get('year').strip()),
movieID=analyze_imdbid(x.get('link') or u''),
_parsingCompany=True
)
)
)
]
preprocessors = [
(re.compile('(<b><a name=)', re.I), r'</p>\1')
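About the "extra trailing space" comment in the name extractor above: an XPath @class='...' test is an exact string comparison, so the literal class value from IMDb's markup, trailing space included, has to be reproduced (or a contains() test used instead). A sketch, assuming lxml as the DOM backend:

from lxml import html

doc = html.fromstring('<div><h1><span class="display-title ">'
                      'Columbia Pictures [us]</span></h1></div>')
print(doc.xpath("//h1/span[@class='display-title ']/text()"))  # exact match hits
print(doc.xpath("//h1/span[@class='display-title']/text()"))   # [] -- no match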

File diff suppressed because it is too large.


@ -2,10 +2,10 @@
parser.http.personParser module (imdb package).
This module provides the classes (and the instances), used to parse
the IMDb pages on the akas.imdb.com server about a person.
the IMDb pages on the www.imdb.com server about a person.
E.g., for "Mel Gibson" the referred pages would be:
categorized: http://akas.imdb.com/name/nm0000154/maindetails
biography: http://akas.imdb.com/name/nm0000154/bio
categorized: http://www.imdb.com/name/nm0000154/maindetails
biography: http://www.imdb.com/name/nm0000154/bio
...and so on...
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
@ -52,7 +52,7 @@ def build_date(date):
class DOMHTMLMaindetailsParser(DOMParserBase):
"""Parser for the "categorized" (maindetails) page of a given person.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:
@ -192,7 +192,7 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
class DOMHTMLBioParser(DOMParserBase):
"""Parser for the "biography" page of a given person.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:
@ -225,92 +225,157 @@ class DOMHTMLBioParser(DOMParserBase):
# TODO: check if this slicing is always correct
postprocess=lambda x: u''.join(x).strip()[2:])]
extractors = [
Extractor(label='headshot',
path="//a[@name='headshot']",
attrs=Attribute(key='headshot',
path="./img/@src")),
Extractor(label='birth info',
path="//table[@id='overviewTable']//td[text()='Date of Birth']/following-sibling::td[1]",
attrs=_birth_attrs),
Extractor(label='death info',
path="//table[@id='overviewTable']//td[text()='Date of Death']/following-sibling::td[1]",
attrs=_death_attrs),
Extractor(label='nick names',
path="//table[@id='overviewTable']//td[text()='Nickenames']/following-sibling::td[1]",
attrs=Attribute(key='nick names',
path="./text()",
joiner='|',
postprocess=lambda x: [n.strip().replace(' (',
'::(', 1) for n in x.split('|')
if n.strip()])),
Extractor(label='birth name',
path="//table[@id='overviewTable']//td[text()='Birth Name']/following-sibling::td[1]",
attrs=Attribute(key='birth name',
path="./text()",
postprocess=lambda x: canonicalName(x.strip()))),
Extractor(label='height',
path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
attrs=Attribute(key='height',
path="./text()",
postprocess=lambda x: x.strip())),
Extractor(label='mini biography',
path="//a[@name='mini_bio']/following-sibling::div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
attrs=Attribute(key='mini biography',
multi=True,
path={
'bio': ".//text()",
'by': ".//a[@name='ba']//text()"
},
postprocess=lambda x: "%s::%s" % \
((x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
(x.get('by') or u'').strip() or u'Anonymous'))),
Extractor(label='spouse',
path="//div[h5='Spouse']/table/tr",
attrs=Attribute(key='spouse',
multi=True,
path={
'name': "./td[1]//text()",
'info': "./td[2]//text()"
},
postprocess=lambda x: ("%s::%s" % \
(x.get('name').strip(),
(x.get('info') or u'').strip())).strip(':'))),
Extractor(label='trade mark',
path="//div[h5='Trade Mark']/p",
attrs=Attribute(key='trade mark',
multi=True,
path=".//text()",
postprocess=lambda x: x.strip())),
Extractor(label='trivia',
path="//div[h5='Trivia']/p",
attrs=Attribute(key='trivia',
multi=True,
path=".//text()",
postprocess=lambda x: x.strip())),
Extractor(label='quotes',
path="//div[h5='Personal Quotes']/p",
attrs=Attribute(key='quotes',
multi=True,
path=".//text()",
postprocess=lambda x: x.strip())),
Extractor(label='salary',
path="//div[h5='Salary']/table/tr",
attrs=Attribute(key='salary history',
multi=True,
path={
'title': "./td[1]//text()",
'info': "./td[2]/text()",
},
postprocess=lambda x: "%s::%s" % \
(x.get('title').strip(),
x.get('info').strip()))),
Extractor(label='where now',
path="//div[h5='Where Are They Now']/p",
attrs=Attribute(key='where now',
multi=True,
path=".//text()",
postprocess=lambda x: x.strip())),
]
Extractor(
label='headshot',
path="//a[@name='headshot']",
attrs=Attribute(
key='headshot',
path="./img/@src"
)
),
Extractor(
label='birth info',
path="//table[@id='overviewTable']"
"//td[text()='Date of Birth']/following-sibling::td[1]",
attrs=_birth_attrs
),
Extractor(
label='death info',
path="//table[@id='overviewTable']"
"//td[text()='Date of Death']/following-sibling::td[1]",
attrs=_death_attrs
),
Extractor(
label='nick names',
path="//table[@id='overviewTable']"
"//td[text()='Nickenames']/following-sibling::td[1]",
attrs=Attribute(
key='nick names',
path="./text()",
joiner='|',
postprocess=lambda x: [n.strip().replace(' (', '::(', 1) for n in x.split('|')
if n.strip()]
)
),
Extractor(
label='birth name',
path="//table[@id='overviewTable']"
"//td[text()='Birth Name']/following-sibling::td[1]",
attrs=Attribute(
key='birth name',
path="./text()",
postprocess=lambda x: canonicalName(x.strip())
)
),
Extractor(
label='height',
path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
attrs=Attribute(
key='height',
path="./text()",
postprocess=lambda x: x.strip()
)
),
Extractor(
label='mini biography',
path="//a[@name='mini_bio']/following-sibling::"
"div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
attrs=Attribute(
key='mini biography',
multi=True,
path={
'bio': ".//text()",
'by': ".//a[@name='ba']//text()"
},
postprocess=lambda x: "%s::%s" % (
(x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
(x.get('by') or u'').strip() or u'Anonymous'
)
)
),
Extractor(
label='spouse',
path="//div[h5='Spouse']/table/tr",
attrs=Attribute(
key='spouse',
multi=True,
path={
'name': "./td[1]//text()",
'info': "./td[2]//text()"
},
postprocess=lambda x: ("%s::%s" % (
x.get('name').strip(),
(x.get('info') or u'').strip())).strip(':')
)
),
Extractor(
label='trade mark',
path="//div[h5='Trade Mark']/p",
attrs=Attribute(
key='trade mark',
multi=True,
path=".//text()",
postprocess=lambda x: x.strip()
)
),
Extractor(
label='trivia',
path="//div[h5='Trivia']/p",
attrs=Attribute(
key='trivia',
multi=True,
path=".//text()",
postprocess=lambda x: x.strip()
)
),
Extractor(
label='quotes',
path="//div[h5='Personal Quotes']/p",
attrs=Attribute(
key='quotes',
multi=True,
path=".//text()",
postprocess=lambda x: x.strip()
)
),
Extractor(
label='salary',
path="//div[h5='Salary']/table/tr",
attrs=Attribute(
key='salary history',
multi=True,
path={
'title': "./td[1]//text()",
'info': "./td[2]/text()",
},
postprocess=lambda x: "%s::%s" % (
x.get('title').strip(),
x.get('info').strip())
)
),
Extractor(
label='where now',
path="//div[h5='Where Are They Now']/p",
attrs=Attribute(
key='where now',
multi=True,
path=".//text()",
postprocess=lambda x: x.strip()
)
)
]
preprocessors = [
(re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'),
@ -329,7 +394,7 @@ class DOMHTMLBioParser(DOMParserBase):
class DOMHTMLResumeParser(DOMParserBase):
"""Parser for the "resume" page of a given person.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:
@ -406,13 +471,13 @@ class DOMHTMLResumeParser(DOMParserBase):
continue
if len(data[key][0]) == 3:
for item in data[key]:
item[:] = [x for x in item if not x == None]
item[:] = [x for x in item if not x is None]
continue
if len(data[key][0]) == 2:
new_key = {}
for item in data[key]:
if item[0] == None:
if item[0] is None:
continue
if ':' in item[0]:
if item[1].replace(item[0], '')[1:].strip() == '':
@ -422,15 +487,14 @@ class DOMHTMLResumeParser(DOMParserBase):
new_key[item[0]] = item[1]
data[key] = new_key
new_data = {}
new_data['resume'] = data
new_data = {'resume': data}
return new_data
class DOMHTMLOtherWorksParser(DOMParserBase):
"""Parser for the "other works" and "agent" pages of a given person.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:
@ -466,7 +530,7 @@ def _build_episode(link, title, minfo, role, roleA, roleAID):
minidx = minfo.find(' -')
# Sometimes, for some unknown reason, the role is left in minfo.
if minidx != -1:
slfRole = minfo[minidx+3:].lstrip()
slfRole = minfo[minidx + 3:].lstrip()
minfo = minfo[:minidx].rstrip()
if slfRole.endswith(')'):
commidx = slfRole.rfind('(')
@ -504,7 +568,7 @@ def _build_episode(link, title, minfo, role, roleA, roleAID):
class DOMHTMLSeriesParser(DOMParserBase):
"""Parser for the "by TV series" page of a given person.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:
@ -559,7 +623,7 @@ class DOMHTMLSeriesParser(DOMParserBase):
class DOMHTMLPersonGenresParser(DOMParserBase):
"""Parser for the "by genre" and "by keywords" pages of a given person.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:


@ -5,7 +5,7 @@ This module provides the HTMLSearchCharacterParser class (and the
search_character_parser instance), used to parse the results of a search
for a given character.
E.g., when searching for the name "Jesse James", the parsed page would be:
http://akas.imdb.com/find?s=ch;mx=20;q=Jesse+James
http://www.imdb.com/find?s=ch;mx=20;q=Jesse+James
Copyright 2007-2012 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>


@ -5,7 +5,7 @@ This module provides the HTMLSearchCompanyParser class (and the
search_company_parser instance), used to parse the results of a search
for a given company.
E.g., when searching for the name "Columbia Pictures", the parsed page would be:
http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures
http://www.imdb.com/find?s=co;mx=20;q=Columbia+Pictures
Copyright 2008-2012 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
@ -46,22 +46,29 @@ class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
_titleBuilder = lambda self, x: build_company_name(x)
_linkPrefix = '/company/co'
_attrs = [Attribute(key='data',
multi=True,
path={
'link': "./a[1]/@href",
'name': "./a[1]/text()",
'notes': "./text()[1]"
},
postprocess=lambda x: (
analyze_imdbid(x.get('link')),
analyze_company_name(x.get('name')+(x.get('notes')
or u''), stripNotes=True)
))]
extractors = [Extractor(label='search',
path="//td[@class='result_text']/a[starts-with(@href, " \
"'/company/co')]/..",
attrs=_attrs)]
_attrs = [
Attribute(
key='data',
multi=True,
path={
'link': "./a[1]/@href",
'name': "./a[1]/text()",
'notes': "./text()[1]"
},
postprocess=lambda x: (
analyze_imdbid(x.get('link')),
analyze_company_name(x.get('name') + (x.get('notes') or u''), stripNotes=True)
)
)
]
extractors = [
Extractor(
label='search',
path="//td[@class='result_text']/a[starts-with(@href, '/company/co')]/..",
attrs=_attrs
)
]
_OBJECTS = {


@ -5,7 +5,7 @@ This module provides the HTMLSearchKeywordParser class (and the
search_company_parser instance), used to parse the results of a search
for a given keyword.
E.g., when searching for the keyword "alabama", the parsed page would be:
http://akas.imdb.com/find?s=kw;mx=20;q=alabama
http://www.imdb.com/find?s=kw;mx=20;q=alabama
Copyright 2009 Davide Alberani <da@erlug.linux.it>


@ -6,7 +6,7 @@ search_movie_parser instance), used to parse the results of a search
for a given title.
E.g., for when searching for the title "the passion", the parsed
page would be:
http://akas.imdb.com/find?q=the+passion&tt=on&mx=20
http://www.imdb.com/find?q=the+passion&tt=on&mx=20
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
@ -67,7 +67,7 @@ class DOMBasicMovieParser(DOMParserBase):
data = []
else:
link = data.pop('link')
if (link and data):
if link and data:
data = [(link, data)]
else:
data = []


@ -5,7 +5,7 @@ This module provides the HTMLSearchPersonParser class (and the
search_person_parser instance), used to parse the results of a search
for a given person.
E.g., when searching for the name "Mel Gibson", the parsed page would be:
http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20
http://www.imdb.com/find?q=Mel+Gibson&nm=on&mx=20
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>


@ -4,8 +4,8 @@ parser.http.topBottomParser module (imdb package).
This module provides the classes (and the instances), used to parse the
lists of top 250 and bottom 100 movies.
E.g.:
http://akas.imdb.com/chart/top
http://akas.imdb.com/chart/bottom
http://www.imdb.com/chart/top
http://www.imdb.com/chart/bottom
Copyright 2009-2015 Davide Alberani <da@erlug.linux.it>
@ -31,7 +31,7 @@ from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid
class DOMHTMLTop250Parser(DOMParserBase):
"""Parser for the "top 250" page.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:
@ -42,17 +42,24 @@ class DOMHTMLTop250Parser(DOMParserBase):
ranktext = 'top 250 rank'
def _init(self):
self.extractors = [Extractor(label=self.label,
path="//div[@id='main']//div[1]//div//table//tbody//tr",
attrs=Attribute(key=None,
multi=True,
path={self.ranktext: "./td[2]//text()",
'rating': "./td[3]//strong//text()",
'title': "./td[2]//a//text()",
'year': "./td[2]//span//text()",
'movieID': "./td[2]//a/@href",
'votes': "./td[3]//strong/@title"
}))]
self.extractors = [
Extractor(
label=self.label,
path="//div[@id='main']//div[1]//div//table//tbody//tr",
attrs=Attribute(
key=None,
multi=True,
path={
self.ranktext: "./td[2]/text()",
'rating': "./td[3]//strong//text()",
'title': "./td[2]//a//text()",
'year': "./td[2]//span//text()",
'movieID': "./td[2]//a/@href",
'votes': "./td[3]//strong/@title"
}
)
)
]
def postprocess_data(self, data):
if not data or self.label not in data:
@ -73,9 +80,11 @@ class DOMHTMLTop250Parser(DOMParserBase):
if theID in seenIDs:
continue
seenIDs.append(theID)
minfo = analyze_title(d['title']+" "+d['year'])
try: minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
except: pass
minfo = analyze_title(d['title'] + ' ' + d['year'])
try:
minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
except:
pass
if 'votes' in d:
try:
votes = d['votes'].replace(' votes','')
@ -93,7 +102,7 @@ class DOMHTMLTop250Parser(DOMParserBase):
class DOMHTMLBottom100Parser(DOMHTMLTop250Parser):
"""Parser for the "bottom 100" page.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
the www.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:


@ -35,7 +35,9 @@ from imdb.Character import Character
# Year, imdbIndex and kind.
re_yearKind_index = re.compile(r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)')
re_yearKind_index = re.compile(
r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)'
)
# Match imdb ids in href tags
re_imdbid = re.compile(r'(title/tt|name/nm|character/ch|company/co)([0-9]+)')
@ -304,7 +306,7 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
elif title[-14:] == 'TV mini-series':
title = title[:-14] + ' (mini)'
if title and title.endswith(_defSep.rstrip()):
title = title[:-len(_defSep)+1]
title = title[:-len(_defSep) + 1]
# Try to understand where the movie title ends.
while True:
if year:
@ -320,18 +322,17 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
# Try to match paired parentheses; yes: sometimes there are
# parentheses inside comments...
nidx = title.rfind('(')
while (nidx != -1 and \
title[nidx:].count('(') != title[nidx:].count(')')):
while nidx != -1 and title[nidx:].count('(') != title[nidx:].count(')'):
nidx = title[:nidx].rfind('(')
# Unbalanced parentheses: stop here.
if nidx == -1: break
# The last item in parentheses seems to be a year: stop here.
first4 = title[nidx+1:nidx+5]
if (first4.isdigit() or first4 == '????') and \
title[nidx+5:nidx+6] in (')', '/'): break
first4 = title[nidx + 1:nidx + 5]
if (first4.isdigit() or first4 == '????') and title[nidx + 5:nidx + 6] in (')', '/'):
break
# The last item in parentheses is a known kind: stop here.
if title[nidx+1:-1] in ('TV', 'V', 'mini', 'VG', 'TV movie',
'TV series', 'short'): break
if title[nidx + 1:-1] in ('TV', 'V', 'mini', 'VG', 'TV movie', 'TV series', 'short'):
break
# Else, in parentheses there are some notes.
# XXX: should the notes in the role half be kept separated
# from the notes in the movie title half?
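The loop above walks '(' candidates right to left until the tail of the title has balanced parentheses, which is how nested comments survive the scan. Traced on a small illustrative title:

title = 'Title (x (y) z)'
nidx = title.rfind('(')  # finds the '(' before 'y'; tail '(y) z)' is unbalanced
while nidx != -1 and title[nidx:].count('(') != title[nidx:].count(')'):
    nidx = title[:nidx].rfind('(')  # retreat to the '(' before 'x'
print(title[nidx:])  # '(x (y) z)' -- the whole parenthesized note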
@ -471,8 +472,8 @@ class DOMParserBase(object):
if _gotError:
warnings.warn('falling back to "%s"' % mod)
break
except ImportError, e:
if idx+1 >= nrMods:
except ImportError as e:
if idx + 1 >= nrMods:
# Raise the exception, if we don't have any more
# options to try.
raise IMDbError('unable to use any parser in %s: %s' % \
@ -786,10 +787,10 @@ class Extractor(object):
def __repr__(self):
"""String representation of an Extractor object."""
r = '<Extractor id:%s (label=%s, path=%s, attrs=%s, group=%s, ' \
'group_key=%s group_key_normalize=%s)>' % (id(self),
self.label, self.path, repr(self.attrs), self.group,
self.group_key, self.group_key_normalize)
t = '<Extractor id:%s (label=%s, path=%s, attrs=%s, group=%s, group_key=%s' + \
', group_key_normalize=%s)>'
r = t % (id(self), self.label, self.path, repr(self.attrs), self.group,
self.group_key, self.group_key_normalize)
return r
@ -825,7 +826,7 @@ def _parse_ref(text, link, info):
yearK = re_yearKind_index.match(info)
if yearK and yearK.start() == 0:
text += ' %s' % info[:yearK.end()]
return (text.replace('\n', ' '), link)
return text.replace('\n', ' '), link
class GatherRefs(DOMParserBase):


@ -687,7 +687,7 @@ class IMDbSqlAccessSystem(IMDbBase):
elif isinstance(o, dict):
for value in o.values():
self._findRefs(value, trefs, nrefs)
return (trefs, nrefs)
return trefs, nrefs
def _extractRefs(self, o):
"""Scan for titles or names references in strings."""
@ -702,7 +702,7 @@ class IMDbSqlAccessSystem(IMDbBase):
"imdb.parser.sql.IMDbSqlAccessSystem; "
"if it's not a recursion limit exceeded and we're not "
"running in a Symbian environment, it's a bug:\n%s" % e)
return (trefs, nrefs)
return trefs, nrefs
def _changeAKAencoding(self, akanotes, akatitle):
"""Return akatitle in the correct charset, as specified in


@ -437,11 +437,13 @@ def ISNULL(x):
"""Emulate SQLObject's ISNULL."""
# XXX: Should we use null()? Can null() be a global instance?
# XXX: Is it safe to test None with the == operator, in this case?
return x == None
return x is None
def ISNOTNULL(x):
"""Emulate SQLObject's ISNOTNULL."""
return x != None
return x is not None
def CONTAINSSTRING(expr, pattern):
"""Emulate SQLObject's CONTAINSSTRING."""


@ -122,53 +122,80 @@ class DBTable(object):
# Default values to insert in some tables: {'column': (list, of, values, ...)}
kindTypeDefs = {'kind': ('movie', 'tv series', 'tv movie', 'video movie',
'tv mini series', 'video game', 'episode')}
companyTypeDefs = {'kind': ('distributors', 'production companies',
'special effects companies', 'miscellaneous companies')}
infoTypeDefs = {'info': ('runtimes', 'color info', 'genres', 'languages',
'certificates', 'sound mix', 'tech info', 'countries', 'taglines',
'keywords', 'alternate versions', 'crazy credits', 'goofs',
'soundtrack', 'quotes', 'release dates', 'trivia', 'locations',
'mini biography', 'birth notes', 'birth date', 'height',
'death date', 'spouse', 'other works', 'birth name',
'salary history', 'nick names', 'books', 'agent address',
'biographical movies', 'portrayed in', 'where now', 'trade mark',
'interviews', 'article', 'magazine cover photo', 'pictorial',
'death notes', 'LD disc format', 'LD year', 'LD digital sound',
'LD official retail price', 'LD frequency response', 'LD pressing plant',
'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date',
'LD production country', 'LD contrast', 'LD color rendition',
'LD picture format', 'LD video noise', 'LD video artifacts',
'LD release country', 'LD sharpness', 'LD dynamic range',
'LD audio noise', 'LD color information', 'LD group genre',
'LD quality program', 'LD close captions-teletext-ld-g',
'LD category', 'LD analog left', 'LD certification',
'LD audio quality', 'LD video quality', 'LD aspect ratio',
'LD analog right', 'LD additional information',
'LD number of chapter stops', 'LD dialogue intellegibility',
'LD disc size', 'LD master format', 'LD subtitles',
'LD status of availablility', 'LD quality of source',
'LD number of sides', 'LD video standard', 'LD supplement',
'LD original title', 'LD sound encoding', 'LD number', 'LD label',
'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay',
'novel', 'adaption', 'book', 'production process protocol',
'printed media reviews', 'essays', 'other literature', 'mpaa',
'plot', 'votes distribution', 'votes', 'rating',
'production dates', 'copyright holder', 'filming dates', 'budget',
'weekend gross', 'gross', 'opening weekend', 'rentals',
'admissions', 'studios', 'top 250 rank', 'bottom 10 rank')}
compCastTypeDefs = {'kind': ('cast', 'crew', 'complete', 'complete+verified')}
linkTypeDefs = {'link': ('follows', 'followed by', 'remake of', 'remade as',
'references', 'referenced in', 'spoofs', 'spoofed in',
'features', 'featured in', 'spin off from', 'spin off',
'version of', 'similar to', 'edited into',
'edited from', 'alternate language version of',
'unknown link')}
roleTypeDefs = {'role': ('actor', 'actress', 'producer', 'writer',
'cinematographer', 'composer', 'costume designer',
'director', 'editor', 'miscellaneous crew',
'production designer', 'guest')}
kindTypeDefs = {
'kind': (
'movie', 'tv series', 'tv movie', 'video movie',
'tv mini series', 'video game', 'episode', 'short', 'tv short'
)
}
companyTypeDefs = {
'kind': (
'distributors', 'production companies',
'special effects companies', 'miscellaneous companies'
)
}
infoTypeDefs = {
'info': (
'runtimes', 'color info', 'genres', 'languages',
'certificates', 'sound mix', 'tech info', 'countries', 'taglines',
'keywords', 'alternate versions', 'crazy credits', 'goofs',
'soundtrack', 'quotes', 'release dates', 'trivia', 'locations',
'mini biography', 'birth notes', 'birth date', 'height',
'death date', 'spouse', 'other works', 'birth name',
'salary history', 'nick names', 'books', 'agent address',
'biographical movies', 'portrayed in', 'where now', 'trade mark',
'interviews', 'article', 'magazine cover photo', 'pictorial',
'death notes', 'LD disc format', 'LD year', 'LD digital sound',
'LD official retail price', 'LD frequency response', 'LD pressing plant',
'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date',
'LD production country', 'LD contrast', 'LD color rendition',
'LD picture format', 'LD video noise', 'LD video artifacts',
'LD release country', 'LD sharpness', 'LD dynamic range',
'LD audio noise', 'LD color information', 'LD group genre',
'LD quality program', 'LD close captions-teletext-ld-g',
'LD category', 'LD analog left', 'LD certification',
'LD audio quality', 'LD video quality', 'LD aspect ratio',
'LD analog right', 'LD additional information',
'LD number of chapter stops', 'LD dialogue intellegibility',
'LD disc size', 'LD master format', 'LD subtitles',
'LD status of availablility', 'LD quality of source',
'LD number of sides', 'LD video standard', 'LD supplement',
'LD original title', 'LD sound encoding', 'LD number', 'LD label',
'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay',
'novel', 'adaption', 'book', 'production process protocol',
'printed media reviews', 'essays', 'other literature', 'mpaa',
'plot', 'votes distribution', 'votes', 'rating',
'production dates', 'copyright holder', 'filming dates', 'budget',
'weekend gross', 'gross', 'opening weekend', 'rentals',
'admissions', 'studios', 'top 250 rank', 'bottom 10 rank'
)
}
compCastTypeDefs = {
'kind': ('cast', 'crew', 'complete', 'complete+verified')
}
linkTypeDefs = {
'link': (
'follows', 'followed by', 'remake of', 'remade as',
'references', 'referenced in', 'spoofs', 'spoofed in',
'features', 'featured in', 'spin off from', 'spin off',
'version of', 'similar to', 'edited into',
'edited from', 'alternate language version of',
'unknown link'
)
}
roleTypeDefs = {
'role': (
'actor', 'actress', 'producer', 'writer',
'cinematographer', 'composer', 'costume designer',
'director', 'editor', 'miscellaneous crew',
'production designer', 'guest'
)
}
# Schema of tables in our database.
# XXX: Foreign keys can be used to create constrains between tables,
@ -186,7 +213,7 @@ DB_SCHEMA = [
# the alternateID attribute here will be ignored by SQLAlchemy.
DBCol('id', INTCOL, notNone=True, alternateID=True),
DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
DBCol('imdbIndex', STRINGCOL, length=12, default=None),
DBCol('imdbID', INTCOL, default=None, index='idx_imdb_id'),
DBCol('gender', STRINGCOL, length=1, default=None),
DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
@ -204,7 +231,7 @@ DB_SCHEMA = [
# from namePcodeNf.
DBCol('id', INTCOL, notNone=True, alternateID=True),
DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
DBCol('imdbIndex', STRINGCOL, length=12, default=None),
DBCol('imdbID', INTCOL, default=None),
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
index='idx_pcodenf'),
@ -218,7 +245,7 @@ DB_SCHEMA = [
# namePcodeSf is the soundex of the name plus the country code.
DBCol('id', INTCOL, notNone=True, alternateID=True),
DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
DBCol('countryCode', UNICODECOL, length=255, default=None),
DBCol('countryCode', STRINGCOL, length=255, default=None),
DBCol('imdbID', INTCOL, default=None),
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
index='idx_pcodenf'),
@ -237,7 +264,7 @@ DB_SCHEMA = [
DBCol('id', INTCOL, notNone=True, alternateID=True),
DBCol('title', UNICODECOL, notNone=True,
index='idx_title', indexLen=10),
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
DBCol('imdbIndex', STRINGCOL, length=12, default=None),
DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
DBCol('productionYear', INTCOL, default=None),
DBCol('imdbID', INTCOL, default=None, index="idx_imdb_id"),
@ -264,7 +291,7 @@ DB_SCHEMA = [
DBCol('personID', INTCOL, notNone=True, index='idx_person',
foreignKey='Name'),
DBCol('name', UNICODECOL, notNone=True),
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
DBCol('imdbIndex', STRINGCOL, length=12, default=None),
DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
index='idx_pcodecf'),
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
@ -291,7 +318,7 @@ DB_SCHEMA = [
DBCol('movieID', INTCOL, notNone=True, index='idx_movieid',
foreignKey='Title'),
DBCol('title', UNICODECOL, notNone=True),
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
DBCol('imdbIndex', STRINGCOL, length=12, default=None),
DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
DBCol('productionYear', INTCOL, default=None),
DBCol('phoneticCode', STRINGCOL, length=5, default=None,


@ -42,8 +42,22 @@ _utils_logger = logging.getLogger('imdbpy.utils')
# and year of release.
# XXX: probably L, C, D and M are far too much! ;-)
re_year_index = re.compile(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
re_extended_year_index = re.compile(r'\((TV episode|TV Series|TV mini-series|TV|Video|Video Game)? ?((?:[0-9\?]{4})(?:-[0-9\?]{4})?)(?:/([IVXLCDM]+)?)?\)')
re_remove_kind = re.compile(r'\((TV episode|TV Series|TV mini-series|TV|Video|Video Game)? ?')
re_m_episode = re.compile(r'\(TV Episode\)\s+-\s+', re.I)
re_m_series = re.compile(r'Season\s+\d+\s+\|\s+Episode\s+\d+\s+-', re.I)
re_m_imdbIndex = re.compile(r'\(([IVXLCDM]+)\)')
re_m_kind = re.compile(
r'\((TV episode|TV Series|TV mini-series|mini|TV|Video|Video Game|VG|Short|TV Movie|TV Short|V)\)',
re.I)
KIND_MAP = {
'tv': 'tv movie',
'tv episode': 'episode',
'v': 'video movie',
'video': 'video movie',
'vg': 'video game',
'mini': 'tv mini series',
'tv mini-series': 'tv mini series'
}
# Match only the imdbIndex (for name strings).
re_index = re.compile(r'^\(([IVXLCDM]+)\)$')
@ -283,13 +297,6 @@ def _split_series_episode(title):
# that means this is an episode title, as returned by
# the web server.
series_title = title[:second_quot]
##elif episode_or_year[-1:] == '}':
## # Title of the episode, as in the plain text data files.
## begin_eps = episode_or_year.find('{')
## if begin_eps == -1: return series_title, episode_or_year
## series_title = title[:second_quot+begin_eps].rstrip()
## # episode_or_year is returned with the {...}
## episode_or_year = episode_or_year[begin_eps:]
return series_title, episode_or_year
@ -383,65 +390,24 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
# tv mini series: 5,497
# video game: 5,490
# More up-to-date statistics: http://us.imdb.com/database_statistics
if title.endswith('(TV)'):
kind = u'tv movie'
title = title[:-4].rstrip()
elif title.endswith('(TV Movie)'):
kind = u'tv movie'
title = title[:-10].rstrip()
elif title.endswith('(V)'):
kind = u'video movie'
title = title[:-3].rstrip()
elif title.lower().endswith('(video)'):
kind = u'video movie'
title = title[:-7].rstrip()
elif title.endswith('(TV Short)'):
kind = u'tv short'
title = title[:-10].rstrip()
elif title.endswith('(TV Mini-Series)'):
kind = u'tv mini series'
title = title[:-16].rstrip()
elif title.endswith('(mini)'):
kind = u'tv mini series'
title = title[:-6].rstrip()
elif title.endswith('(VG)'):
kind = u'video game'
title = title[:-4].rstrip()
elif title.endswith('(Video Game)'):
kind = u'video game'
title = title[:-12].rstrip()
elif title.endswith('(TV Series)'):
epindex = title.find('(TV Episode) - ')
if epindex >= 0:
# It's an episode of a series.
kind = u'episode'
series_info = analyze_title(title[epindex + 15:])
result['episode of'] = series_info.get('title')
result['series year'] = series_info.get('year')
title = title[:epindex]
else:
kind = u'tv series'
title = title[:-11].rstrip()
epindex = re_m_episode.search(title)
if epindex:
# It's an episode of a series.
kind = 'episode'
series_title = title[epindex.end():]
series_title = re_m_series.sub('', series_title)
series_info = analyze_title(series_title)
result['episode of'] = series_info.get('title')
result['series year'] = series_info.get('year')
title = title[:epindex.start()].strip()
else:
detected_kind = re_m_kind.findall(title)
if detected_kind:
kind = detected_kind[-1].lower().replace('-', '')
kind = KIND_MAP.get(kind, kind)
title = re_m_kind.sub('', title).strip()
# Search for the year and the optional imdbIndex (a roman number).
yi = re_year_index.findall(title)
if not yi:
yi = re_extended_year_index.findall(title)
if yi:
yk, yiy, yii = yi[-1]
yi = [(yiy, yii)]
if yk == 'TV episode':
kind = u'episode'
elif yk in ('TV', 'TV Movie'):
kind = u'tv movie'
elif yk == 'TV Series':
kind = u'tv series'
elif yk == 'Video':
kind = u'video movie'
elif yk in ('TV mini-series', 'TV Mini-Series'):
kind = u'tv mini series'
elif yk == 'Video Game':
kind = u'video game'
title = re_remove_kind.sub('(', title)
if yi:
last_yi = yi[-1]
year = last_yi[0]
@ -450,7 +416,12 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
year = year[:-len(imdbIndex)-1]
i = title.rfind('(%s)' % last_yi[0])
if i != -1:
title = title[:i-1].rstrip()
title = title[:i - 1].rstrip()
if not imdbIndex:
detect_imdbIndex = re_m_imdbIndex.findall(title)
if detect_imdbIndex:
imdbIndex = detect_imdbIndex[-1]
title = re_m_imdbIndex.sub('', title).strip()
# This is a tv (mini) series: strip the '"' at the begin and at the end.
# XXX: strip('"') is not used for compatibility with Python 2.0.
if title and title[0] == title[-1] == '"':
@ -464,8 +435,6 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
title = canonicalTitle(title)
else:
title = normalizeTitle(title)
# 'kind' is one in ('movie', 'episode', 'tv series', 'tv mini series',
# 'tv movie', 'video movie', 'video game')
result['title'] = title
result['kind'] = kind or u'movie'
if year and year != '????':
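A rough trace of the new kind detection above, with the regex and map copied from this hunk and an illustrative sample title:

import re

re_m_kind = re.compile(
    r'\((TV episode|TV Series|TV mini-series|mini|TV|Video|Video Game|VG|Short|TV Movie|TV Short|V)\)',
    re.I)
KIND_MAP = {'tv': 'tv movie', 'tv episode': 'episode', 'v': 'video movie',
            'video': 'video movie', 'vg': 'video game',
            'mini': 'tv mini series', 'tv mini-series': 'tv mini series'}

title = 'Jesse James (VG)'
detected = re_m_kind.findall(title)
kind = detected[-1].lower().replace('-', '')   # 'vg'
kind = KIND_MAP.get(kind, kind)                # 'video game'
title = re_m_kind.sub('', title).strip()       # 'Jesse James'
print('%s -> %s' % (title, kind))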
@ -832,7 +801,7 @@ def date_and_notes(s):
"""Parse (birth|death) date and notes; returns a tuple in the
form (date, notes)."""
s = s.strip()
if not s: return (u'', u'')
if not s: return u'', u''
notes = u''
if s[0].isdigit() or s.split()[0].lower() in ('c.', 'january', 'february',
'march', 'april', 'may', 'june',
@ -990,7 +959,7 @@ def _tag4TON(ton, addAccessSystem=False, _containerOnly=False):
beginTag += extras
if ton.notes:
beginTag += u'<notes>%s</notes>' % _normalizeValue(ton.notes)
return (beginTag, u'</%s>' % tag)
return beginTag, u'</%s>' % tag
TAGS_TO_MODIFY = {
@ -1264,8 +1233,8 @@ class _Container(object):
self.__role = role
currentRole = property(_get_currentRole, _set_currentRole,
doc="The role of a Person in a Movie" + \
" or the interpreter of a Character in a Movie.")
doc="The role of a Person in a Movie"
" or the interpreter of a Character in a Movie.")
def _init(self, **kwds): pass
@ -1478,10 +1447,10 @@ class _Container(object):
except RuntimeError, e:
# Symbian/python 2.2 has a poor regexp implementation.
import warnings
warnings.warn('RuntimeError in '
"imdb.utils._Container.__getitem__; if it's not "
"a recursion limit exceeded and we're not running "
"in a Symbian environment, it's a bug:\n%s" % e)
warnings.warn("RuntimeError in imdb.utils._Container.__getitem__;"
" if it's not a recursion limit exceeded and we're"
" not running in a Symbian environment, it's a"
" bug:\n%s" % e)
return rawData
def __setitem__(self, key, item):