Update imdbpy libs to v5.0

Fixed invalid indexer id issues for TVRage shows.

Fixed issues for getting posters and backdrops for TVRage shows.

We now convert XML straight to a dict object for the Indexer APIs, improving the overall performance of the APIs.

Fixed issues with displaying genres properly for TVRage shows.
This commit is contained in:
echel0n 2014-05-28 22:40:12 -07:00
parent 764cf6e62e
commit 2dcd26e69c
30 changed files with 7446 additions and 453 deletions

View file

@ -6,7 +6,7 @@ a person from the IMDb database.
It can fetch data through different media (e.g.: the IMDb web pages, It can fetch data through different media (e.g.: the IMDb web pages,
a SQL database, etc.) a SQL database, etc.)
Copyright 2004-2012 Davide Alberani <da@erlug.linux.it> Copyright 2004-2014 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -25,7 +25,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
__all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company', __all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
'available_access_systems'] 'available_access_systems']
__version__ = VERSION = '4.9' __version__ = VERSION = '5.0'
# Import compatibility module (importing it is enough). # Import compatibility module (importing it is enough).
import _compat import _compat
@ -160,6 +160,7 @@ def IMDb(accessSystem=None, *arguments, **keywords):
kwds.update(keywords) kwds.update(keywords)
keywords = kwds keywords = kwds
except Exception, e: except Exception, e:
import logging
logging.getLogger('imdbpy').warn('Unable to read configuration' \ logging.getLogger('imdbpy').warn('Unable to read configuration' \
' file; complete error: %s' % e) ' file; complete error: %s' % e)
# It just LOOKS LIKE a bad habit: we tried to read config # It just LOOKS LIKE a bad habit: we tried to read config
@ -303,7 +304,7 @@ class IMDbBase:
# http://akas.imdb.com/keyword/%s/ # http://akas.imdb.com/keyword/%s/
imdbURL_keyword_main=imdbURL_base + 'keyword/%s/' imdbURL_keyword_main=imdbURL_base + 'keyword/%s/'
# http://akas.imdb.com/chart/top # http://akas.imdb.com/chart/top
imdbURL_top250=imdbURL_base + 'chart/top', imdbURL_top250=imdbURL_base + 'chart/top'
# http://akas.imdb.com/chart/bottom # http://akas.imdb.com/chart/bottom
imdbURL_bottom100=imdbURL_base + 'chart/bottom' imdbURL_bottom100=imdbURL_base + 'chart/bottom'
# http://akas.imdb.com/find?%s # http://akas.imdb.com/find?%s
@ -824,22 +825,23 @@ class IMDbBase:
# subclass, somewhere under the imdb.parser package. # subclass, somewhere under the imdb.parser package.
raise NotImplementedError('override this method') raise NotImplementedError('override this method')
def _searchIMDb(self, kind, ton): def _searchIMDb(self, kind, ton, title_kind=None):
"""Search the IMDb akas server for the given title or name.""" """Search the IMDb akas server for the given title or name."""
# The Exact Primary search system has gone AWOL, so we resort # The Exact Primary search system has gone AWOL, so we resort
# to the mobile search. :-/ # to the mobile search. :-/
if not ton: if not ton:
return None return None
ton = ton.strip('"')
aSystem = IMDb('mobile') aSystem = IMDb('mobile')
if kind == 'tt': if kind == 'tt':
searchFunct = aSystem.search_movie searchFunct = aSystem.search_movie
check = 'long imdb canonical title' check = 'long imdb title'
elif kind == 'nm': elif kind == 'nm':
searchFunct = aSystem.search_person searchFunct = aSystem.search_person
check = 'long imdb canonical name' check = 'long imdb name'
elif kind == 'char': elif kind == 'char':
searchFunct = aSystem.search_character searchFunct = aSystem.search_character
check = 'long imdb canonical name' check = 'long imdb name'
elif kind == 'co': elif kind == 'co':
# XXX: are [COUNTRY] codes included in the results? # XXX: are [COUNTRY] codes included in the results?
searchFunct = aSystem.search_company searchFunct = aSystem.search_company
@ -852,24 +854,42 @@ class IMDbBase:
# exact match. # exact match.
if len(searchRes) == 1: if len(searchRes) == 1:
return searchRes[0].getID() return searchRes[0].getID()
title_only_matches = []
for item in searchRes: for item in searchRes:
# Return the first perfect match. # Return the first perfect match.
if item[check] == ton: if item[check].strip('"') == ton:
return item.getID() # For titles do additional check for kind
if kind != 'tt' or title_kind == item['kind']:
return item.getID()
elif kind == 'tt':
title_only_matches.append(item.getID())
# imdbpy2sql.py could detected wrong type, so if no title and kind
# matches found - collect all results with title only match
# Return list of IDs if multiple matches (can happen when searching
# titles with no title_kind specified)
# Example: DB: Band of Brothers "tv series" vs "tv mini-series"
if title_only_matches:
if len(title_only_matches) == 1:
return title_only_matches[0]
else:
return title_only_matches
return None return None
def title2imdbID(self, title): def title2imdbID(self, title, kind=None):
"""Translate a movie title (in the plain text data files format) """Translate a movie title (in the plain text data files format)
to an imdbID. to an imdbID.
Try an Exact Primary Title search on IMDb; Try an Exact Primary Title search on IMDb;
return None if it's unable to get the imdbID.""" return None if it's unable to get the imdbID;
return self._searchIMDb('tt', title) Always specify kind: movie, tv series, video game etc. or search can
return list of IDs if multiple matches found
"""
return self._searchIMDb('tt', title, kind)
def name2imdbID(self, name): def name2imdbID(self, name):
"""Translate a person name in an imdbID. """Translate a person name in an imdbID.
Try an Exact Primary Name search on IMDb; Try an Exact Primary Name search on IMDb;
return None if it's unable to get the imdbID.""" return None if it's unable to get the imdbID."""
return self._searchIMDb('tt', name) return self._searchIMDb('nm', name)
def character2imdbID(self, name): def character2imdbID(self, name):
"""Translate a character name in an imdbID. """Translate a character name in an imdbID.
@ -896,7 +916,8 @@ class IMDbBase:
imdbID = aSystem.get_imdbMovieID(mop.movieID) imdbID = aSystem.get_imdbMovieID(mop.movieID)
else: else:
imdbID = aSystem.title2imdbID(build_title(mop, canonical=0, imdbID = aSystem.title2imdbID(build_title(mop, canonical=0,
ptdf=1)) ptdf=0, appendKind=False),
mop['kind'])
elif isinstance(mop, Person.Person): elif isinstance(mop, Person.Person):
if mop.personID is not None: if mop.personID is not None:
imdbID = aSystem.get_imdbPersonID(mop.personID) imdbID = aSystem.get_imdbPersonID(mop.personID)

View file

@ -29,7 +29,7 @@
[imdbpy] [imdbpy]
## Default. ## Default.
accessSystem = mobile accessSystem = http
## Optional (options common to every data access system): ## Optional (options common to every data access system):
# Activate adult searches (on, by default). # Activate adult searches (on, by default).
@ -37,7 +37,7 @@ accessSystem = mobile
# Number of results for searches (20 by default). # Number of results for searches (20 by default).
#results = 20 #results = 20
# Re-raise all caught exceptions (off, by default). # Re-raise all caught exceptions (off, by default).
reraiseExceptions = on #reraiseExceptions = off
## Optional (options common to http and mobile data access systems): ## Optional (options common to http and mobile data access systems):
# Proxy used to access the network. If it requires authentication, # Proxy used to access the network. If it requires authentication,
@ -69,7 +69,7 @@ reraiseExceptions = on
## Set the threshold for logging messages. ## Set the threshold for logging messages.
# Can be one of "debug", "info", "warning", "error", "critical" (default: # Can be one of "debug", "info", "warning", "error", "critical" (default:
# "warning"). # "warning").
loggingLevel = info #loggingLevel = debug
## Path to a configuration file for the logging facility; ## Path to a configuration file for the logging facility;
# see: http://docs.python.org/library/logging.html#configuring-logging # see: http://docs.python.org/library/logging.html#configuring-logging

View file

@ -64,8 +64,10 @@ LANG_ARTICLES = {
'English': ('the', 'a', 'an'), 'English': ('the', 'a', 'an'),
'Italian': ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'", 'Italian': ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'",
'uno'), 'uno'),
'Spanish': ('la', 'le', 'el', 'les', 'un', 'los', 'una', 'uno', 'unos', 'Spanish': ('la', 'lo', 'el', 'las', 'un', 'los', 'una', 'al', 'del',
'unas'), 'unos', 'unas', 'uno'),
'French': ('le', "l'", 'la', 'les', 'un', 'une', 'des', 'au', 'du', '\xc3\xa0 la',
'de la', 'aux'),
'Portuguese': ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'), 'Portuguese': ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'),
'Turkish': (), # Some languages doesn't have articles. 'Turkish': (), # Some languages doesn't have articles.
} }

View file

@ -1,4 +1,4 @@
#!/usr/bin/env python2 #!/usr/bin/env python
""" """
generatepot.py script. generatepot.py script.

1303
lib/imdb/locale/imdbpy-ar.po Normal file

File diff suppressed because it is too large Load diff

1303
lib/imdb/locale/imdbpy-bg.po Normal file

File diff suppressed because it is too large Load diff

1303
lib/imdb/locale/imdbpy-de.po Normal file

File diff suppressed because it is too large Load diff

1304
lib/imdb/locale/imdbpy-es.po Normal file

File diff suppressed because it is too large Load diff

1304
lib/imdb/locale/imdbpy-fr.po Normal file

File diff suppressed because it is too large Load diff

View file

@ -1,4 +1,4 @@
#!/usr/bin/env python2 #!/usr/bin/env python
# -*- coding: iso-8859-1 -*- # -*- coding: iso-8859-1 -*-
"""Generate binary message catalog from textual translation description. """Generate binary message catalog from textual translation description.

View file

@ -1,4 +1,4 @@
#!/usr/bin/env python2 #!/usr/bin/env python
""" """
rebuildmo.py script. rebuildmo.py script.

View file

@ -104,15 +104,24 @@ PY_VERSION = sys.version_info[:2]
# The cookies for the "adult" search. # The cookies for the "adult" search.
# Please don't mess with these account. # Please don't mess with these account.
# Old 'IMDbPY' account. # Old 'IMDbPY' account.
_old_cookie_id = 'boM2bYxz9MCsOnH9gZ0S9QHs12NWrNdApxsls1Vb5/NGrNdjcHx3dUas10UASoAjVEvhAbGagERgOpNkAPvxdbfKwaV2ikEj9SzXY1WPxABmDKQwdqzwRbM+12NSeJFGUEx3F8as10WwidLzVshDtxaPIbP13NdjVS9UZTYqgTVGrNcT9vyXU1' _IMDbPY_cookie_id = 'boM2bYxz9MCsOnH9gZ0S9QHs12NWrNdApxsls1Vb5/NGrNdjcHx3dUas10UASoAjVEvhAbGagERgOpNkAPvxdbfKwaV2ikEj9SzXY1WPxABmDKQwdqzwRbM+12NSeJFGUEx3F8as10WwidLzVshDtxaPIbP13NdjVS9UZTYqgTVGrNcT9vyXU1'
_old_cookie_uu = '3M3AXsquTU5Gur/Svik+ewflPm5Rk2ieY3BIPlLjyK3C0Dp9F8UoPgbTyKiGtZp4x1X+uAUGKD7BM2g+dVd8eqEzDErCoYvdcvGLvVLAen1y08hNQtALjVKAe+1hM8g9QbNonlG1/t4S82ieUsBbrSIQbq1yhV6tZ6ArvSbA7rgHc8n5AdReyAmDaJ5Wm/ee3VDoCnGj/LlBs2ieUZNorhHDKK5Q==' _IMDbPY_cookie_uu = '3M3AXsquTU5Gur/Svik+ewflPm5Rk2ieY3BIPlLjyK3C0Dp9F8UoPgbTyKiGtZp4x1X+uAUGKD7BM2g+dVd8eqEzDErCoYvdcvGLvVLAen1y08hNQtALjVKAe+1hM8g9QbNonlG1/t4S82ieUsBbrSIQbq1yhV6tZ6ArvSbA7rgHc8n5AdReyAmDaJ5Wm/ee3VDoCnGj/LlBs2ieUZNorhHDKK5Q=='
# New 'IMDbPYweb' account. # 'imdbpy2010' account.
_cookie_id = 'rH1jNAkjTlNXvHolvBVBsgaPICNZbNdjVjzFwzas9JRmusdjVoqBs/Hs12NR+1WFxEoR9bGKEDUg6sNlADqXwkas12N131Rwdb+UQNGKN8PWrNdjcdqBQVLq8mbGDHP3hqzxhbD692NQi9D0JjpBtRaPIbP1zNdjUOqENQYv1ADWrNcT9vyXU1' _imdbpy2010_cookie_id = 'QrCdxVi+L+WgqOLrQJJgBgRRXGInphxiBPU/YXSFDyExMFzCp6YcYgSVXyEUhS/xMID8wqemHGID4DlntwZ49vemP5UXsAxiJ4D6goSmHGIgNT9hMXBaRSF2vMS3phxB0bVfQiQlP1RxdrzhB6YcRHFASyIhQVowwXCKtDSlD2YhgRvxBsCKtGemHBKH9mxSI='
_cookie_uu = 'su4/m8cho4c6HP+W1qgq6wchOmhnF0w+lIWvHjRUPJ6nRA9sccEafjGADJ6hQGrMd4GKqLcz2X4z5+w+M4OIKnRn7FpENH7dxDQu3bQEHyx0ZEyeRFTPHfQEX03XF+yeN1dsPpcXaqjUZAw+lGRfXRQEfz3RIX9IgVEffdBAHw2wQXyf9xdMPrQELw0QNB8dsffsqcdQemjPB0w+moLcPh0JrKrHJ9hjBzdMPpcXTH7XRwwOk=' _imdbpy2010_cookie_uu = 'oiEo2yoJFCA2Zbn/o7Z1LAPIwotAu6QdALv3foDb1x5F/tdrFY63XkSfty4kntS8Y8jkHSDLt3406+d+JThEilPI0mtTaOQdA/t2/iErp22jaLdeVU5ya4PIREpj7HFdpzhEHadcIAngSER50IoHDpD6Bz4Qy3b+UIhE/hBbhz5Q63ceA2hEvhPo5B0FnrL9Q8jkWjDIbA0Au3d+AOtnXoCIRL4Q28c+UOtnXpP4RL4T6OQdA+6ijUCI5B0AW2d+UOtnXpPYRL4T6OQdA8jkTUOYlC0A=='
# old 'IMDbPYweb' account.
_old_IMDbPYweb_cookie_id = 'rH1jNAkjTlNXvHolvBVBsgaPICNZbNdjVjzFwzas9JRmusdjVoqBs/Hs12NR+1WFxEoR9bGKEDUg6sNlADqXwkas12N131Rwdb+UQNGKN8PWrNdjcdqBQVLq8mbGDHP3hqzxhbD692NQi9D0JjpBtRaPIbP1zNdjUOqENQYv1ADWrNcT9vyXU1'
_old_IMDbPYweb_cookie_uu = 'su4/m8cho4c6HP+W1qgq6wchOmhnF0w+lIWvHjRUPJ6nRA9sccEafjGADJ6hQGrMd4GKqLcz2X4z5+w+M4OIKnRn7FpENH7dxDQu3bQEHyx0ZEyeRFTPHfQEX03XF+yeN1dsPpcXaqjUZAw+lGRfXRQEfz3RIX9IgVEffdBAHw2wQXyf9xdMPrQELw0QNB8dsffsqcdQemjPB0w+moLcPh0JrKrHJ9hjBzdMPpcXTH7XRwwOk='
# old 'IMDbPYweb' account values (as of 2012-12-30)
_IMDbPYweb_cookie_id = 'BCYjtpb46Go0cMHAMewWZEauhwqPL7ASCPpPVNutu6BuayHZd0U6Dk3UAqVlEM8DHLDsSr02RGQn5ff3245-R4A130NAWJ_5yqXx7X-zJey8vQM8JKdv3rTUSEJznJQlojUW1Bije-Q0FXAixs4I0sePWhd_tA41i-9AF2q3lPmaksram6ilMhN9i3IPESW1PMbk'
_IMDbPYweb_cookie_uu = 'BCYttQjEMc-NyUdFUGxThidAnBo7wwalEzj4un9uzf2XoEjtqDhNfrH7bOSuwlRkMEQ11SNyTajl-b9Q-21m4HwYu0e3jXZrjYLXLYzFkrEroCDyUREqaTwPJPSjGtFmvlaVBZEZmsWpaxe18DT5KiygKyGPZKH78Xu4im6ba-Sd31WvbXHzP8KGXPpGjhhVuv7Dcv314HCWkE832Srf9ya-Uv0FdGAmYyLbIAXuxnvpYQd6oZ8-CYkSGLIqcKWdrf5S'
# 'IMDbPY2013' account
_IMDbPY2013_cookie_id = 'BCYmoyqSm2WglmOzG-SrFWSvVpxsTZOB0qEOOqmAwCBxCbaNgKOxd0DTKzUvt7t04Pya5gV2tUrpDmYxrc1Dr54DQj2UXI7QI35__M5-HI2KrbOI3PjDz6M-_U3HG8topMfN64R24tmBixoZhMYXVaEc556lf0Z4gQNJVYRANXvwytP5v1lpfeToRlu9aVJwN4kT'
_IMDbPY2013_cookie_uu = 'BCYquDS8Y2i8R1pJxS4nB77YrhjHHXeOea2Xl9KtZvE6RZKVfMvzTGU4Vl5-yxfPbgRSiFJasyf-hhPuVvXyaHlfeBjNlbFT8hz2HzFFkQ_SxKxq05J51gi7Fv4SaAws1M-i7zmQ1TRunfJqCVIYqPwIs2NO7s4_YDH2ZoISVGLgca8OY2K58HychOZB1oRWHVeAJNhLJMrCWJBuGRLCNnQK5X9tA0dPPntr2Ussy0ouul-N1GQz-8y5vda3JJ_C6xkwmHcA6JrOdOFO_HqMWjVSXuxGEdrXC919JM9H0vooVvKeVgAEJnTh2GiVlUJUoH3c'
# imdbpy2010 account. # Currently used account.
#_cookie_id = 'QrCdxVi+L+WgqOLrQJJgBgRRXGInphxiBPU/YXSFDyExMFzCp6YcYgSVXyEUhS/xMID8wqemHGID4DlntwZ49vemP5UXsAxiJ4D6goSmHGIgNT9hMXBaRSF2vMS3phxB0bVfQiQlP1RxdrzhB6YcRHFASyIhQVowwXCKtDSlD2YhgRvxBsCKtGemHBKH9mxSI=' _cookie_id = _IMDbPY2013_cookie_id
#_cookie_uu = 'oiEo2yoJFCA2Zbn/o7Z1LAPIwotAu6QdALv3foDb1x5F/tdrFY63XkSfty4kntS8Y8jkHSDLt3406+d+JThEilPI0mtTaOQdA/t2/iErp22jaLdeVU5ya4PIREpj7HFdpzhEHadcIAngSER50IoHDpD6Bz4Qy3b+UIhE/hBbhz5Q63ceA2hEvhPo5B0FnrL9Q8jkWjDIbA0Au3d+AOtnXoCIRL4Q28c+UOtnXpP4RL4T6OQdA+6ijUCI5B0AW2d+UOtnXpPYRL4T6OQdA8jkTUOYlC0A==' _cookie_uu = _IMDbPY2013_cookie_uu
class _FakeURLOpener(object): class _FakeURLOpener(object):
@ -141,9 +150,10 @@ class IMDbURLopener(FancyURLopener):
for header in ('User-Agent', 'User-agent', 'user-agent'): for header in ('User-Agent', 'User-agent', 'user-agent'):
self.del_header(header) self.del_header(header)
self.set_header('User-Agent', 'Mozilla/5.0') self.set_header('User-Agent', 'Mozilla/5.0')
self.set_header('Accept-Language', 'en-us,en;q=0.5')
# XXX: This class is used also to perform "Exact Primary # XXX: This class is used also to perform "Exact Primary
# [Title|Name]" searches, and so by default the cookie is set. # [Title|Name]" searches, and so by default the cookie is set.
c_header = 'id=%s; uu=%s' % (_cookie_id, _cookie_uu) c_header = 'uu=%s; id=%s' % (_cookie_uu, _cookie_id)
self.set_header('Cookie', c_header) self.set_header('Cookie', c_header)
def get_proxy(self): def get_proxy(self):
@ -199,12 +209,11 @@ class IMDbURLopener(FancyURLopener):
server_encode = uopener.info().getparam('charset') server_encode = uopener.info().getparam('charset')
# Otherwise, look at the content-type HTML meta tag. # Otherwise, look at the content-type HTML meta tag.
if server_encode is None and content: if server_encode is None and content:
first_bytes = content[:512] begin_h = content.find('text/html; charset=')
begin_h = first_bytes.find('text/html; charset=')
if begin_h != -1: if begin_h != -1:
end_h = first_bytes[19+begin_h:].find('"') end_h = content[19+begin_h:].find('"')
if end_h != -1: if end_h != -1:
server_encode = first_bytes[19+begin_h:19+begin_h+end_h] server_encode = content[19+begin_h:19+begin_h+end_h]
if server_encode: if server_encode:
try: try:
if lookup(server_encode): if lookup(server_encode):
@ -455,16 +464,16 @@ class IMDbHTTPAccessSystem(IMDbBase):
results is the maximum number of results to be retrieved.""" results is the maximum number of results to be retrieved."""
if isinstance(ton, unicode): if isinstance(ton, unicode):
try: try:
ton = ton.encode('iso8859-1') ton = ton.encode('utf-8')
except Exception, e: except Exception, e:
try: try:
ton = ton.encode('utf-8') ton = ton.encode('iso8859-1')
except Exception, e: except Exception, e:
pass pass
##params = 'q=%s&%s=on&mx=%s' % (quote_plus(ton), kind, str(results)) ##params = 'q=%s&%s=on&mx=%s' % (quote_plus(ton), kind, str(results))
params = 'q=%s;s=%s;mx=%s' % (quote_plus(ton), kind, str(results)) params = 'q=%s&s=%s&mx=%s' % (quote_plus(ton), kind, str(results))
if kind == 'ep': if kind == 'ep':
params = params.replace('s=ep;', 's=tt;ttype=ep;', 1) params = params.replace('s=ep&', 's=tt&ttype=ep&', 1)
cont = self._retrieve(self.urls['find'] % params) cont = self._retrieve(self.urls['find'] % params)
#print 'URL:', imdbURL_find % params #print 'URL:', imdbURL_find % params
if cont.find('Your search returned more than') == -1 or \ if cont.find('Your search returned more than') == -1 or \
@ -472,7 +481,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
return cont return cont
# The retrieved page contains no results, because too many # The retrieved page contains no results, because too many
# titles or names contain the string we're looking for. # titles or names contain the string we're looking for.
params = 'q=%s;ls=%s;lm=0' % (quote_plus(ton), kind) params = 'q=%s&ls=%s&lm=0' % (quote_plus(ton), kind)
size = 131072 + results * 512 size = 131072 + results * 512
return self._retrieve(self.urls['find'] % params, size=size) return self._retrieve(self.urls['find'] % params, size=size)
@ -587,6 +596,10 @@ class IMDbHTTPAccessSystem(IMDbBase):
cont = self._retrieve(self.urls['movie_main'] % movieID + 'recommendations') cont = self._retrieve(self.urls['movie_main'] % movieID + 'recommendations')
return self.mProxy.rec_parser.parse(cont) return self.mProxy.rec_parser.parse(cont)
def get_movie_critic_reviews(self, movieID):
cont = self._retrieve(self.urls['movie_main'] % movieID + 'criticreviews')
return self.mProxy.criticrev_parser.parse(cont)
def get_movie_external_reviews(self, movieID): def get_movie_external_reviews(self, movieID):
cont = self._retrieve(self.urls['movie_main'] % movieID + 'externalreviews') cont = self._retrieve(self.urls['movie_main'] % movieID + 'externalreviews')
return self.mProxy.externalrev_parser.parse(cont) return self.mProxy.externalrev_parser.parse(cont)
@ -754,7 +767,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
return self.pProxy.person_keywords_parser.parse(cont) return self.pProxy.person_keywords_parser.parse(cont)
def _search_character(self, name, results): def _search_character(self, name, results):
cont = self._get_search_content('char', name, results) cont = self._get_search_content('ch', name, results)
return self.scProxy.search_character_parser.parse(cont, results=results)['data'] return self.scProxy.search_character_parser.parse(cont, results=results)['data']
def get_character_main(self, characterID): def get_character_main(self, characterID):

View file

@ -9,7 +9,7 @@ pages would be:
plot summary: http://akas.imdb.com/title/tt0094226/plotsummary plot summary: http://akas.imdb.com/title/tt0094226/plotsummary
...and so on... ...and so on...
Copyright 2004-2012 Davide Alberani <da@erlug.linux.it> Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org> 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -531,9 +531,6 @@ class DOMHTMLMovieParser(DOMParserBase):
def _process_plotsummary(x): def _process_plotsummary(x):
"""Process a plot (contributed by Rdian06).""" """Process a plot (contributed by Rdian06)."""
xauthor = x.get('author') xauthor = x.get('author')
if xauthor:
xauthor = xauthor.replace('{', '<').replace('}', '>').replace('(',
'<').replace(')', '>').strip()
xplot = x.get('plot', u'').strip() xplot = x.get('plot', u'').strip()
if xauthor: if xauthor:
xplot += u'::%s' % xauthor xplot += u'::%s' % xauthor
@ -555,17 +552,20 @@ class DOMHTMLPlotParser(DOMParserBase):
# Notice that recently IMDb started to put the email of the # Notice that recently IMDb started to put the email of the
# author only in the link, that we're not collecting, here. # author only in the link, that we're not collecting, here.
extractors = [Extractor(label='plot', extractors = [Extractor(label='plot',
path="//p[@class='plotpar']", path="//ul[@class='zebraList']//p",
attrs=Attribute(key='plot', attrs=Attribute(key='plot',
multi=True, multi=True,
path={'plot': './text()', path={'plot': './text()[1]',
'author': './i/a/text()'}, 'author': './span/em/a/text()'},
postprocess=_process_plotsummary))] postprocess=_process_plotsummary))]
def _process_award(x): def _process_award(x):
award = {} award = {}
award['award'] = x.get('award').strip() _award = x.get('award')
if _award is not None:
_award = _award.strip()
award['award'] = _award
if not award['award']: if not award['award']:
return {} return {}
award['year'] = x.get('year').strip() award['year'] = x.get('year').strip()
@ -709,10 +709,16 @@ class DOMHTMLTaglinesParser(DOMParserBase):
result = tparser.parse(taglines_html_string) result = tparser.parse(taglines_html_string)
""" """
extractors = [Extractor(label='taglines', extractors = [Extractor(label='taglines',
path="//div[@id='tn15content']/p", path='//*[contains(concat(" ", normalize-space(@class), " "), " soda ")]',
attrs=Attribute(key='taglines', multi=True, attrs=Attribute(key='taglines',
multi=True,
path="./text()"))] path="./text()"))]
def postprocess_data(self, data):
if 'taglines' in data:
data['taglines'] = [tagline.strip() for tagline in data['taglines']]
return data
class DOMHTMLKeywordsParser(DOMParserBase): class DOMHTMLKeywordsParser(DOMParserBase):
"""Parser for the "keywords" page of a given movie. """Parser for the "keywords" page of a given movie.
@ -785,9 +791,9 @@ class DOMHTMLSoundtrackParser(DOMHTMLAlternateVersionsParser):
] ]
def postprocess_data(self, data): def postprocess_data(self, data):
if 'soundtrack' in data: if 'alternate versions' in data:
nd = [] nd = []
for x in data['soundtrack']: for x in data['alternate versions']:
ds = x.split('\n') ds = x.split('\n')
title = ds[0] title = ds[0]
if title[0] == '"' and title[-1] == '"': if title[0] == '"' and title[-1] == '"':
@ -846,6 +852,13 @@ class DOMHTMLCrazyCreditsParser(DOMParserBase):
x.replace('\n', ' ').replace(' ', ' ')))] x.replace('\n', ' ').replace(' ', ' ')))]
def _process_goof(x):
if x['spoiler_category']:
return x['spoiler_category'].strip() + ': SPOILER: ' + x['text'].strip()
else:
return x['category'].strip() + ': ' + x['text'].strip()
class DOMHTMLGoofsParser(DOMParserBase): class DOMHTMLGoofsParser(DOMParserBase):
"""Parser for the "goofs" page of a given movie. """Parser for the "goofs" page of a given movie.
The page should be provided as a string, as taken from The page should be provided as a string, as taken from
@ -858,9 +871,14 @@ class DOMHTMLGoofsParser(DOMParserBase):
""" """
_defGetRefs = True _defGetRefs = True
extractors = [Extractor(label='goofs', path="//ul[@class='trivia']/li", extractors = [Extractor(label='goofs', path="//div[@class='soda odd']",
attrs=Attribute(key='goofs', multi=True, path=".//text()", attrs=Attribute(key='goofs', multi=True,
postprocess=lambda x: (x or u'').strip()))] path={
'text':"./text()",
'category':'./preceding-sibling::h4[1]/text()',
'spoiler_category': './h4/text()'
},
postprocess=_process_goof))]
class DOMHTMLQuotesParser(DOMParserBase): class DOMHTMLQuotesParser(DOMParserBase):
@ -876,9 +894,16 @@ class DOMHTMLQuotesParser(DOMParserBase):
_defGetRefs = True _defGetRefs = True
extractors = [ extractors = [
Extractor(label='quotes', Extractor(label='quotes_odd',
path="//div[@class='_imdbpy']", path="//div[@class='quote soda odd']",
attrs=Attribute(key='quotes', attrs=Attribute(key='quotes_odd',
multi=True,
path=".//text()",
postprocess=lambda x: x.strip().replace(' \n',
'::').replace('::\n', '::').replace('\n', ' '))),
Extractor(label='quotes_even',
path="//div[@class='quote soda even']",
attrs=Attribute(key='quotes_even',
multi=True, multi=True,
path=".//text()", path=".//text()",
postprocess=lambda x: x.strip().replace(' \n', postprocess=lambda x: x.strip().replace(' \n',
@ -886,27 +911,23 @@ class DOMHTMLQuotesParser(DOMParserBase):
] ]
preprocessors = [ preprocessors = [
(re.compile('(<a name="?qt[0-9]{7}"?></a>)', re.I), (re.compile('<a href="#" class="hidesoda hidden">Hide options</a><br>', re.I), '')
r'\1<div class="_imdbpy">'), ]
(re.compile('<hr width="30%">', re.I), '</div>'),
(re.compile('<hr/>', re.I), '</div>'),
(re.compile('<script.*?</script>', re.I|re.S), ''),
# For BeautifulSoup.
(re.compile('<!-- sid: t-channel : MIDDLE_CENTER -->', re.I), '</div>')
]
def preprocess_dom(self, dom): def preprocess_dom(self, dom):
# Remove "link this quote" links. # Remove "link this quote" links.
for qLink in self.xpath(dom, "//p[@class='linksoda']"): for qLink in self.xpath(dom, "//span[@class='linksoda']"):
qLink.drop_tree()
for qLink in self.xpath(dom, "//div[@class='sharesoda_pre']"):
qLink.drop_tree() qLink.drop_tree()
return dom return dom
def postprocess_data(self, data): def postprocess_data(self, data):
if 'quotes' not in data: quotes = data.get('quotes_odd', []) + data.get('quotes_even', [])
if not quotes:
return {} return {}
for idx, quote in enumerate(data['quotes']): quotes = [q.split('::') for q in quotes]
data['quotes'][idx] = quote.split('::') return {'quotes': quotes}
return data
class DOMHTMLReleaseinfoParser(DOMParserBase): class DOMHTMLReleaseinfoParser(DOMParserBase):
@ -920,13 +941,13 @@ class DOMHTMLReleaseinfoParser(DOMParserBase):
result = rdparser.parse(releaseinfo_html_string) result = rdparser.parse(releaseinfo_html_string)
""" """
extractors = [Extractor(label='release dates', extractors = [Extractor(label='release dates',
path="//th[@class='xxxx']/../../tr", path="//table[@id='release_dates']//tr",
attrs=Attribute(key='release dates', multi=True, attrs=Attribute(key='release dates', multi=True,
path={'country': ".//td[1]//text()", path={'country': ".//td[1]//text()",
'date': ".//td[2]//text()", 'date': ".//td[2]//text()",
'notes': ".//td[3]//text()"})), 'notes': ".//td[3]//text()"})),
Extractor(label='akas', Extractor(label='akas',
path="//div[@class='_imdbpy_akas']/table/tr", path="//table[@id='akas']//tr",
attrs=Attribute(key='akas', multi=True, attrs=Attribute(key='akas', multi=True,
path={'title': "./td[1]/text()", path={'title': "./td[1]/text()",
'countries': "./td[2]/text()"}))] 'countries': "./td[2]/text()"}))]
@ -961,7 +982,7 @@ class DOMHTMLReleaseinfoParser(DOMParserBase):
title = (aka.get('title') or '').strip() title = (aka.get('title') or '').strip()
if not title: if not title:
continue continue
countries = (aka.get('countries') or '').split('/') countries = (aka.get('countries') or '').split(',')
if not countries: if not countries:
nakas.append(title) nakas.append(title)
else: else:
@ -1135,7 +1156,28 @@ def _normalize_href(href):
href = '%s%s' % (imdbURL_base, href) href = '%s%s' % (imdbURL_base, href)
return href return href
class DOMHTMLCriticReviewsParser(DOMParserBase):
"""Parser for the "critic reviews" pages of a given movie.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:
osparser = DOMHTMLCriticReviewsParser()
result = osparser.parse(officialsites_html_string)
"""
kind = 'critic reviews'
extractors = [
Extractor(label='metascore',
path="//div[@class='metascore_wrap']/div/span",
attrs=Attribute(key='metascore',
path=".//text()")),
Extractor(label='metacritic url',
path="//div[@class='article']/div[@class='see-more']/a",
attrs=Attribute(key='metacritic url',
path="./@href")) ]
class DOMHTMLOfficialsitesParser(DOMParserBase): class DOMHTMLOfficialsitesParser(DOMParserBase):
"""Parser for the "official sites", "external reviews", "newsgroup """Parser for the "official sites", "external reviews", "newsgroup
reviews", "miscellaneous links", "sound clips", "video clips" and reviews", "miscellaneous links", "sound clips", "video clips" and
@ -1471,6 +1513,14 @@ class DOMHTMLSeasonEpisodesParser(DOMParserBase):
try: selected_season = int(selected_season) try: selected_season = int(selected_season)
except: pass except: pass
nd = {selected_season: {}} nd = {selected_season: {}}
if 'episode -1' in data:
counter = 1
for episode in data['episode -1']:
while 'episode %d' % counter in data:
counter += 1
k = 'episode %d' % counter
data[k] = [episode]
del data['episode -1']
for episode_nr, episode in data.iteritems(): for episode_nr, episode in data.iteritems():
if not (episode and episode[0] and if not (episode and episode[0] and
episode_nr.startswith('episode ')): episode_nr.startswith('episode ')):
@ -1860,6 +1910,8 @@ _OBJECTS = {
'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None), 'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None),
'ratings_parser': ((DOMHTMLRatingsParser,), None), 'ratings_parser': ((DOMHTMLRatingsParser,), None),
'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None), 'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
'criticrev_parser': ((DOMHTMLCriticReviewsParser,),
{'kind': 'critic reviews'}),
'externalrev_parser': ((DOMHTMLOfficialsitesParser,), 'externalrev_parser': ((DOMHTMLOfficialsitesParser,),
{'kind': 'external reviews'}), {'kind': 'external reviews'}),
'newsgrouprev_parser': ((DOMHTMLOfficialsitesParser,), 'newsgrouprev_parser': ((DOMHTMLOfficialsitesParser,),

View file

@ -8,7 +8,7 @@ E.g., for "Mel Gibson" the referred pages would be:
biography: http://akas.imdb.com/name/nm0000154/bio biography: http://akas.imdb.com/name/nm0000154/bio
...and so on... ...and so on...
Copyright 2004-20101 Davide Alberani <da@erlug.linux.it> Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org> 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -60,6 +60,7 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
result = cparser.parse(categorized_html_string) result = cparser.parse(categorized_html_string)
""" """
_containsObjects = True _containsObjects = True
_name_imdb_index = re.compile(r'\([IVXLCDM]+\)')
_birth_attrs = [Attribute(key='birth date', _birth_attrs = [Attribute(key='birth date',
path='.//time[@itemprop="birthDate"]/@datetime'), path='.//time[@itemprop="birthDate"]/@datetime'),
@ -100,6 +101,10 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
path=".//text()", path=".//text()",
postprocess=lambda x: analyze_name(x, postprocess=lambda x: analyze_name(x,
canonical=1))), canonical=1))),
Extractor(label='name_index',
path="//h1[@class='header']/span[1]",
attrs=Attribute(key='name_index',
path="./text()")),
Extractor(label='birth info', Extractor(label='birth info',
path="//div[h4='Born:']", path="//div[h4='Born:']",
@ -110,7 +115,7 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
attrs=_death_attrs), attrs=_death_attrs),
Extractor(label='headshot', Extractor(label='headshot',
path="//td[@id='img_primary']/a", path="//td[@id='img_primary']/div[@class='image']/a",
attrs=Attribute(key='headshot', attrs=Attribute(key='headshot',
path="./img/@src")), path="./img/@src")),
@ -152,6 +157,11 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
for what in 'birth date', 'death date': for what in 'birth date', 'death date':
if what in data and not data[what]: if what in data and not data[what]:
del data[what] del data[what]
name_index = (data.get('name_index') or '').strip()
if name_index:
if self._name_imdb_index.match(name_index):
data['imdbIndex'] = name_index[1:-1]
del data['name_index']
# XXX: the code below is for backwards compatibility # XXX: the code below is for backwards compatibility
# probably could be removed # probably could be removed
for key in data.keys(): for key in data.keys():
@ -220,13 +230,13 @@ class DOMHTMLBioParser(DOMParserBase):
attrs=Attribute(key='headshot', attrs=Attribute(key='headshot',
path="./img/@src")), path="./img/@src")),
Extractor(label='birth info', Extractor(label='birth info',
path="//div[h5='Date of Birth']", path="//table[@id='overviewTable']//td[text()='Date of Birth']/following-sibling::td[1]",
attrs=_birth_attrs), attrs=_birth_attrs),
Extractor(label='death info', Extractor(label='death info',
path="//div[h5='Date of Death']", path="//table[@id='overviewTable']//td[text()='Date of Death']/following-sibling::td[1]",
attrs=_death_attrs), attrs=_death_attrs),
Extractor(label='nick names', Extractor(label='nick names',
path="//div[h5='Nickname']", path="//table[@id='overviewTable']//td[text()='Nickenames']/following-sibling::td[1]",
attrs=Attribute(key='nick names', attrs=Attribute(key='nick names',
path="./text()", path="./text()",
joiner='|', joiner='|',
@ -234,25 +244,25 @@ class DOMHTMLBioParser(DOMParserBase):
'::(', 1) for n in x.split('|') '::(', 1) for n in x.split('|')
if n.strip()])), if n.strip()])),
Extractor(label='birth name', Extractor(label='birth name',
path="//div[h5='Birth Name']", path="//table[@id='overviewTable']//td[text()='Birth Name']/following-sibling::td[1]",
attrs=Attribute(key='birth name', attrs=Attribute(key='birth name',
path="./text()", path="./text()",
postprocess=lambda x: canonicalName(x.strip()))), postprocess=lambda x: canonicalName(x.strip()))),
Extractor(label='height', Extractor(label='height',
path="//div[h5='Height']", path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
attrs=Attribute(key='height', attrs=Attribute(key='height',
path="./text()", path="./text()",
postprocess=lambda x: x.strip())), postprocess=lambda x: x.strip())),
Extractor(label='mini biography', Extractor(label='mini biography',
path="//div[h5='Mini Biography']", path="//a[@name='mini_bio']/following-sibling::div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
attrs=Attribute(key='mini biography', attrs=Attribute(key='mini biography',
multi=True, multi=True,
path={ path={
'bio': "./p//text()", 'bio': ".//text()",
'by': "./b/following-sibling::a/text()" 'by': ".//a[@name='ba']//text()"
}, },
postprocess=lambda x: "%s::%s" % \ postprocess=lambda x: "%s::%s" % \
(x.get('bio').strip(), ((x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
(x.get('by') or u'').strip() or u'Anonymous'))), (x.get('by') or u'').strip() or u'Anonymous'))),
Extractor(label='spouse', Extractor(label='spouse',
path="//div[h5='Spouse']/table/tr", path="//div[h5='Spouse']/table/tr",

View file

@ -5,9 +5,9 @@ This module provides the HTMLSearchCharacterParser class (and the
search_character_parser instance), used to parse the results of a search search_character_parser instance), used to parse the results of a search
for a given character. for a given character.
E.g., when searching for the name "Jesse James", the parsed page would be: E.g., when searching for the name "Jesse James", the parsed page would be:
http://akas.imdb.com/find?s=Characters;mx=20;q=Jesse+James http://akas.imdb.com/find?s=ch;mx=20;q=Jesse+James
Copyright 2007-2009 Davide Alberani <da@erlug.linux.it> Copyright 2007-2012 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org> 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -42,7 +42,7 @@ class DOMBasicCharacterParser(DOMBasicMovieParser):
class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser): class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser):
_BaseParser = DOMBasicCharacterParser _BaseParser = DOMBasicCharacterParser
_notDirectHitTitle = '<title>imdb search' _notDirectHitTitle = '<title>find - imdb'
_titleBuilder = lambda self, x: build_name(x, canonical=False) _titleBuilder = lambda self, x: build_name(x, canonical=False)
_linkPrefix = '/character/ch' _linkPrefix = '/character/ch'
@ -57,7 +57,7 @@ class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser):
{'name': x.get('name')} {'name': x.get('name')}
))] ))]
extractors = [Extractor(label='search', extractors = [Extractor(label='search',
path="//td[3]/a[starts-with(@href, " \ path="//td[@class='result_text']/a[starts-with(@href, " \
"'/character/ch')]/..", "'/character/ch')]/..",
attrs=_attrs)] attrs=_attrs)]

View file

@ -7,7 +7,7 @@ for a given company.
E.g., when searching for the name "Columbia Pictures", the parsed page would be: E.g., when searching for the name "Columbia Pictures", the parsed page would be:
http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures
Copyright 2008-2009 Davide Alberani <da@erlug.linux.it> Copyright 2008-2012 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org> 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -42,7 +42,7 @@ class DOMBasicCompanyParser(DOMBasicMovieParser):
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser): class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
_BaseParser = DOMBasicCompanyParser _BaseParser = DOMBasicCompanyParser
_notDirectHitTitle = '<title>imdb company' _notDirectHitTitle = '<title>find - imdb'
_titleBuilder = lambda self, x: build_company_name(x) _titleBuilder = lambda self, x: build_company_name(x)
_linkPrefix = '/company/co' _linkPrefix = '/company/co'
@ -59,7 +59,7 @@ class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
or u''), stripNotes=True) or u''), stripNotes=True)
))] ))]
extractors = [Extractor(label='search', extractors = [Extractor(label='search',
path="//td[3]/a[starts-with(@href, " \ path="//td[@class='result_text']/a[starts-with(@href, " \
"'/company/co')]/..", "'/company/co')]/..",
attrs=_attrs)] attrs=_attrs)]

View file

@ -8,7 +8,7 @@ E.g., for when searching for the title "the passion", the parsed
page would be: page would be:
http://akas.imdb.com/find?q=the+passion&tt=on&mx=20 http://akas.imdb.com/find?q=the+passion&tt=on&mx=20
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it> Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org> 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -77,7 +77,7 @@ class DOMBasicMovieParser(DOMParserBase):
def custom_analyze_title(title): def custom_analyze_title(title):
"""Remove garbage notes after the (year), (year/imdbIndex) or (year) (TV)""" """Remove garbage notes after the (year), (year/imdbIndex) or (year) (TV)"""
# XXX: very crappy. :-( # XXX: very crappy. :-(
nt = title.split(' ')[0] nt = title.split(' aka ')[0]
if nt: if nt:
title = nt title = nt
if not title: if not title:
@ -92,7 +92,7 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
"new search system" is used, for movies.""" "new search system" is used, for movies."""
_BaseParser = DOMBasicMovieParser _BaseParser = DOMBasicMovieParser
_notDirectHitTitle = '<title>imdb title' _notDirectHitTitle = '<title>find - imdb</title>'
_titleBuilder = lambda self, x: build_title(x) _titleBuilder = lambda self, x: build_title(x)
_linkPrefix = '/title/tt' _linkPrefix = '/title/tt'
@ -101,8 +101,7 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
path={ path={
'link': "./a[1]/@href", 'link': "./a[1]/@href",
'info': ".//text()", 'info': ".//text()",
#'akas': ".//div[@class='_imdbpyAKA']//text()" 'akas': "./i//text()"
'akas': ".//p[@class='find-aka']//text()"
}, },
postprocess=lambda x: ( postprocess=lambda x: (
analyze_imdbid(x.get('link') or u''), analyze_imdbid(x.get('link') or u''),
@ -110,7 +109,7 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
x.get('akas') x.get('akas')
))] ))]
extractors = [Extractor(label='search', extractors = [Extractor(label='search',
path="//td[3]/a[starts-with(@href, '/title/tt')]/..", path="//td[@class='result_text']",
attrs=_attrs)] attrs=_attrs)]
def _init(self): def _init(self):
self.url = u'' self.url = u''
@ -119,14 +118,11 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
self.url = u'' self.url = u''
def preprocess_string(self, html_string): def preprocess_string(self, html_string):
if self._notDirectHitTitle in html_string[:1024].lower(): if self._notDirectHitTitle in html_string[:10240].lower():
if self._linkPrefix == '/title/tt': if self._linkPrefix == '/title/tt':
# Only for movies. # Only for movies.
# XXX (HTU): does this still apply?
html_string = html_string.replace('(TV mini-series)', '(mini)') html_string = html_string.replace('(TV mini-series)', '(mini)')
html_string = html_string.replace('<p class="find-aka">',
'<p class="find-aka">::')
#html_string = _reAKAStitles.sub(
# r'<div class="_imdbpyAKA">\1::</div>\2', html_string)
return html_string return html_string
# Direct hit! # Direct hit!
dbme = self._BaseParser(useModule=self._useModule) dbme = self._BaseParser(useModule=self._useModule)
@ -141,7 +137,7 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
title = self._titleBuilder(res[0][1]) title = self._titleBuilder(res[0][1])
if not (link and title): return u'' if not (link and title): return u''
link = link.replace('http://pro.imdb.com', '') link = link.replace('http://pro.imdb.com', '')
new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link, new_html = '<td class="result_text"><a href="%s">%s</a></td>' % (link,
title) title)
return new_html return new_html
@ -161,11 +157,14 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
if not datum[0] and datum[1]: if not datum[0] and datum[1]:
continue continue
if datum[2] is not None: if datum[2] is not None:
akas = filter(None, datum[2].split('::')) #akas = filter(None, datum[2].split('::'))
if self._linkPrefix == '/title/tt': if self._linkPrefix == '/title/tt':
akas = [a.replace('" - ', '::').rstrip() for a in akas] # XXX (HTU): couldn't find a result with multiple akas
akas = [a.replace('aka "', '', 1).replace('aka "', aka = datum[2]
'', 1).lstrip() for a in akas] akas = [aka[1:-1]] # remove the quotes
#akas = [a.replace('" - ', '::').rstrip() for a in akas]
#akas = [a.replace('aka "', '', 1).replace('aka "',
#'', 1).lstrip() for a in akas]
datum[1]['akas'] = akas datum[1]['akas'] = akas
data['data'][idx] = (datum[0], datum[1]) data['data'][idx] = (datum[0], datum[1])
else: else:

View file

@ -7,7 +7,7 @@ for a given person.
E.g., when searching for the name "Mel Gibson", the parsed page would be: E.g., when searching for the name "Mel Gibson", the parsed page would be:
http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20 http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it> Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org> 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -55,7 +55,7 @@ class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
"""Parse the html page that the IMDb web server shows when the """Parse the html page that the IMDb web server shows when the
"new search system" is used, for persons.""" "new search system" is used, for persons."""
_BaseParser = DOMBasicPersonParser _BaseParser = DOMBasicPersonParser
_notDirectHitTitle = '<title>imdb name' _notDirectHitTitle = '<title>find - imdb'
_titleBuilder = lambda self, x: build_name(x, canonical=True) _titleBuilder = lambda self, x: build_name(x, canonical=True)
_linkPrefix = '/name/nm' _linkPrefix = '/name/nm'
@ -74,11 +74,11 @@ class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
canonical=1), x.get('akas') canonical=1), x.get('akas')
))] ))]
extractors = [Extractor(label='search', extractors = [Extractor(label='search',
path="//td[3]/a[starts-with(@href, '/name/nm')]/..", path="//td[@class='result_text']/a[starts-with(@href, '/name/nm')]/..",
attrs=_attrs)] attrs=_attrs)]
def preprocess_string(self, html_string): def preprocess_string(self, html_string):
if self._notDirectHitTitle in html_string[:1024].lower(): if self._notDirectHitTitle in html_string[:10240].lower():
html_string = _reAKASp.sub( html_string = _reAKASp.sub(
r'\1<div class="_imdbpyAKA">\2::</div>\3', r'\1<div class="_imdbpyAKA">\2::</div>\3',
html_string) html_string)

View file

@ -340,7 +340,7 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
title = title[:nidx].rstrip() title = title[:nidx].rstrip()
if year: if year:
year = year.strip() year = year.strip()
if title[-1] == ')': if title[-1:] == ')':
fpIdx = title.rfind('(') fpIdx = title.rfind('(')
if fpIdx != -1: if fpIdx != -1:
if notes: notes = '%s %s' % (title[fpIdx:], notes) if notes: notes = '%s %s' % (title[fpIdx:], notes)

View file

@ -6,7 +6,7 @@ IMDb's data for mobile systems.
the imdb.IMDb function will return an instance of this class when the imdb.IMDb function will return an instance of this class when
called with the 'accessSystem' argument set to "mobile". called with the 'accessSystem' argument set to "mobile".
Copyright 2005-2011 Davide Alberani <da@erlug.linux.it> Copyright 2005-2012 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -193,7 +193,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
title) title)
return res return res
tl = title[0].lower() tl = title[0].lower()
if not tl.startswith('imdb title'): if not tl.startswith('find - imdb'):
# a direct hit! # a direct hit!
title = _unHtml(title[0]) title = _unHtml(title[0])
mid = None mid = None
@ -211,7 +211,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
# XXX: this results*3 prevents some recursion errors, but... # XXX: this results*3 prevents some recursion errors, but...
# it's not exactly understandable (i.e.: why 'results' is # it's not exactly understandable (i.e.: why 'results' is
# not enough to get all the results?) # not enough to get all the results?)
lis = _findBetween(cont, 'td valign="top">', '</td>', lis = _findBetween(cont, 'td class="result_text">', '</td>',
maxRes=results*3) maxRes=results*3)
for li in lis: for li in lis:
akas = re_makas.findall(li) akas = re_makas.findall(li)
@ -492,7 +492,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
self._mobile_logger.warn('no title tag searching for name %s', name) self._mobile_logger.warn('no title tag searching for name %s', name)
return res return res
nl = name[0].lower() nl = name[0].lower()
if not nl.startswith('imdb name'): if not nl.startswith('find - imdb'):
# a direct hit! # a direct hit!
name = _unHtml(name[0]) name = _unHtml(name[0])
name = name.replace('- Filmography by type' , '').strip() name = name.replace('- Filmography by type' , '').strip()
@ -506,7 +506,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return res return res
res[:] = [(str(pid[0]), analyze_name(name, canonical=1))] res[:] = [(str(pid[0]), analyze_name(name, canonical=1))]
else: else:
lis = _findBetween(cont, 'td valign="top">', '</td>', lis = _findBetween(cont, 'td class="result_text">', '</td>',
maxRes=results*3) maxRes=results*3)
for li in lis: for li in lis:
akas = _findBetween(li, '<em>"', '"</em>') akas = _findBetween(li, '<em>"', '"</em>')
@ -771,7 +771,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return {'data': d} return {'data': d}
def _search_character(self, name, results): def _search_character(self, name, results):
cont = subXMLRefs(self._get_search_content('char', name, results)) cont = subXMLRefs(self._get_search_content('ch', name, results))
name = _findBetween(cont, '<title>', '</title>', maxRes=1) name = _findBetween(cont, '<title>', '</title>', maxRes=1)
res = [] res = []
if not name: if not name:
@ -779,8 +779,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
name) name)
return res return res
nl = name[0].lower() nl = name[0].lower()
if not (nl.startswith('imdb search') or nl.startswith('imdb search') \ if not nl.startswith('find - imdb'):
or nl.startswith('imdb character')):
# a direct hit! # a direct hit!
name = _unHtml(name[0]).replace('(Character)', '').strip() name = _unHtml(name[0]).replace('(Character)', '').strip()
pid = None pid = None
@ -793,23 +792,18 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return res return res
res[:] = [(str(pid[0]), analyze_name(name))] res[:] = [(str(pid[0]), analyze_name(name))]
else: else:
sects = _findBetween(cont, '<b>Popular Characters</b>', '</table>', lis = _findBetween(cont, '<td class="result_text"',
maxRes=results*3) ['<small', '</td>', '<br'])
sects += _findBetween(cont, '<b>Characters', '</table>', for li in lis:
maxRes=results*3) li = '<%s' % li
for sect in sects: pid = re_imdbID.findall(li)
lis = _findBetween(sect, '<a href="/character/', pname = _unHtml(li)
['<small', '</td>', '<br']) if not (pid and pname):
for li in lis: self._mobile_logger.debug('no name/characterID' \
li = '<%s' % li ' parsing %s searching for' \
pid = re_imdbID.findall(li) ' character %s', li, name)
pname = _unHtml(li) continue
if not (pid and pname): res.append((str(pid[0]), analyze_name(pname)))
self._mobile_logger.debug('no name/characterID' \
' parsing %s searching for' \
' character %s', li, name)
continue
res.append((str(pid[0]), analyze_name(pname)))
return res return res
def get_character_main(self, characterID): def get_character_main(self, characterID):

View file

@ -7,7 +7,7 @@ the SQLObject _AND_ SQLAlchemy Object Relational Managers is available.
the imdb.IMDb function will return an instance of this class when the imdb.IMDb function will return an instance of this class when
called with the 'accessSystem' argument set to "sql", "database" or "db". called with the 'accessSystem' argument set to "sql", "database" or "db".
Copyright 2005-2010 Davide Alberani <da@erlug.linux.it> Copyright 2005-2012 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -452,7 +452,12 @@ def get_movie_data(movieID, kindDict, fromAka=0, _table=None):
else: else:
if not fromAka: Table = Title if not fromAka: Table = Title
else: Table = AkaTitle else: Table = AkaTitle
m = Table.get(movieID) try:
m = Table.get(movieID)
except Exception, e:
_aux_logger.warn('Unable to fetch information for movieID %s: %s', movieID, e)
mdict = {}
return mdict
mdict = {'title': m.title, 'kind': kindDict[m.kindID], mdict = {'title': m.title, 'kind': kindDict[m.kindID],
'year': m.productionYear, 'imdbIndex': m.imdbIndex, 'year': m.productionYear, 'imdbIndex': m.imdbIndex,
'season': m.seasonNr, 'episode': m.episodeNr} 'season': m.seasonNr, 'episode': m.episodeNr}
@ -825,14 +830,14 @@ class IMDbSqlAccessSystem(IMDbBase):
imdbID = movie.imdbID imdbID = movie.imdbID
if imdbID is not None: return '%07d' % imdbID if imdbID is not None: return '%07d' % imdbID
m_dict = get_movie_data(movie.id, self._kind) m_dict = get_movie_data(movie.id, self._kind)
titline = build_title(m_dict, ptdf=1) titline = build_title(m_dict, ptdf=0)
imdbID = self.title2imdbID(titline) imdbID = self.title2imdbID(titline, m_dict['kind'])
# If the imdbID was retrieved from the web and was not in the # If the imdbID was retrieved from the web and was not in the
# database, update the database (ignoring errors, because it's # database, update the database (ignoring errors, because it's
# possible that the current user has not update privileges). # possible that the current user has not update privileges).
# There're times when I think I'm a genius; this one of # There're times when I think I'm a genius; this one of
# those times... <g> # those times... <g>
if imdbID is not None: if imdbID is not None and not isinstance(imdbID, list):
try: movie.imdbID = int(imdbID) try: movie.imdbID = int(imdbID)
except: pass except: pass
return imdbID return imdbID
@ -847,9 +852,9 @@ class IMDbSqlAccessSystem(IMDbBase):
imdbID = person.imdbID imdbID = person.imdbID
if imdbID is not None: return '%07d' % imdbID if imdbID is not None: return '%07d' % imdbID
n_dict = {'name': person.name, 'imdbIndex': person.imdbIndex} n_dict = {'name': person.name, 'imdbIndex': person.imdbIndex}
namline = build_name(n_dict, canonical=1) namline = build_name(n_dict, canonical=False)
imdbID = self.name2imdbID(namline) imdbID = self.name2imdbID(namline)
if imdbID is not None: if imdbID is not None and not isinstance(imdbID, list):
try: person.imdbID = int(imdbID) try: person.imdbID = int(imdbID)
except: pass except: pass
return imdbID return imdbID
@ -864,9 +869,9 @@ class IMDbSqlAccessSystem(IMDbBase):
imdbID = character.imdbID imdbID = character.imdbID
if imdbID is not None: return '%07d' % imdbID if imdbID is not None: return '%07d' % imdbID
n_dict = {'name': character.name, 'imdbIndex': character.imdbIndex} n_dict = {'name': character.name, 'imdbIndex': character.imdbIndex}
namline = build_name(n_dict, canonical=1) namline = build_name(n_dict, canonical=False)
imdbID = self.character2imdbID(namline) imdbID = self.character2imdbID(namline)
if imdbID is not None: if imdbID is not None and not isinstance(imdbID, list):
try: character.imdbID = int(imdbID) try: character.imdbID = int(imdbID)
except: pass except: pass
return imdbID return imdbID
@ -883,7 +888,7 @@ class IMDbSqlAccessSystem(IMDbBase):
n_dict = {'name': company.name, 'country': company.countryCode} n_dict = {'name': company.name, 'country': company.countryCode}
namline = build_company_name(n_dict) namline = build_company_name(n_dict)
imdbID = self.company2imdbID(namline) imdbID = self.company2imdbID(namline)
if imdbID is not None: if imdbID is not None and not isinstance(imdbID, list):
try: company.imdbID = int(imdbID) try: company.imdbID = int(imdbID)
except: pass except: pass
return imdbID return imdbID
@ -1116,8 +1121,9 @@ class IMDbSqlAccessSystem(IMDbBase):
if mlinks: if mlinks:
for ml in mlinks: for ml in mlinks:
lmovieData = get_movie_data(ml[0], self._kind) lmovieData = get_movie_data(ml[0], self._kind)
m = Movie(movieID=ml[0], data=lmovieData, accessSystem='sql') if lmovieData:
ml[0] = m m = Movie(movieID=ml[0], data=lmovieData, accessSystem='sql')
ml[0] = m
res['connections'] = {} res['connections'] = {}
mlinks[:] = _groupListBy(mlinks, 1) mlinks[:] = _groupListBy(mlinks, 1)
for group in mlinks: for group in mlinks:

View file

@ -466,6 +466,7 @@ class _AlchemyConnection(object):
def setConnection(uri, tables, encoding='utf8', debug=False): def setConnection(uri, tables, encoding='utf8', debug=False):
"""Set connection for every table.""" """Set connection for every table."""
params = {'encoding': encoding}
# FIXME: why on earth MySQL requires an additional parameter, # FIXME: why on earth MySQL requires an additional parameter,
# is well beyond my understanding... # is well beyond my understanding...
if uri.startswith('mysql'): if uri.startswith('mysql'):
@ -474,7 +475,11 @@ def setConnection(uri, tables, encoding='utf8', debug=False):
else: else:
uri += '?' uri += '?'
uri += 'charset=%s' % encoding uri += 'charset=%s' % encoding
params = {'encoding': encoding}
# On some server configurations, we will need to explicitly enable
# loading data from local files
params['local_infile'] = 1
if debug: if debug:
params['echo'] = True params['echo'] = True
if uri.startswith('ibm_db'): if uri.startswith('ibm_db'):

Binary file not shown.

View file

@ -182,6 +182,10 @@ def setConnection(uri, tables, encoding='utf8', debug=False):
kw['use_unicode'] = 1 kw['use_unicode'] = 1
#kw['sqlobject_encoding'] = encoding #kw['sqlobject_encoding'] = encoding
kw['charset'] = encoding kw['charset'] = encoding
# On some server configurations, we will need to explicitly enable
# loading data from local files
kw['local_infile'] = 1
conn = connectionForURI(uri, **kw) conn = connectionForURI(uri, **kw)
conn.debug = debug conn.debug = debug
# XXX: doesn't work and a work-around was put in imdbpy2sql.py; # XXX: doesn't work and a work-around was put in imdbpy2sql.py;

View file

@ -3,7 +3,7 @@ utils module (imdb package).
This module provides basic utilities for the imdb package. This module provides basic utilities for the imdb package.
Copyright 2004-2012 Davide Alberani <da@erlug.linux.it> Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2009 H. Turgut Uyar <uyar@tekir.org> 2009 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -189,10 +189,9 @@ _unicodeArticles = linguistics.toUnicode(_articles)
articlesDicts = linguistics.articlesDictsForLang(None) articlesDicts = linguistics.articlesDictsForLang(None)
spArticles = linguistics.spArticlesForLang(None) spArticles = linguistics.spArticlesForLang(None)
def canonicalTitle(title, lang=None): def canonicalTitle(title, lang=None, imdbIndex=None):
"""Return the title in the canonic format 'Movie Title, The'; """Return the title in the canonic format 'Movie Title, The';
beware that it doesn't handle long imdb titles, but only the beware that it doesn't handle long imdb titles.
title portion, without year[/imdbIndex] or special markup.
The 'lang' argument can be used to specify the language of the title. The 'lang' argument can be used to specify the language of the title.
""" """
isUnicode = isinstance(title, unicode) isUnicode = isinstance(title, unicode)
@ -203,15 +202,19 @@ def canonicalTitle(title, lang=None):
except IndexError: except IndexError:
pass pass
if isUnicode: if isUnicode:
_format = u'%s, %s' _format = u'%s%s, %s'
else: else:
_format = '%s, %s' _format = '%s%s, %s'
ltitle = title.lower() ltitle = title.lower()
if imdbIndex:
imdbIndex = ' (%s)' % imdbIndex
else:
imdbIndex = ''
spArticles = linguistics.spArticlesForLang(lang) spArticles = linguistics.spArticlesForLang(lang)
for article in spArticles[isUnicode]: for article in spArticles[isUnicode]:
if ltitle.startswith(article): if ltitle.startswith(article):
lart = len(article) lart = len(article)
title = _format % (title[lart:], title[:lart]) title = _format % (title[lart:], imdbIndex, title[:lart])
if article[-1] == ' ': if article[-1] == ' ':
title = title[:-1] title = title[:-1]
break break
@ -383,18 +386,42 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
if title.endswith('(TV)'): if title.endswith('(TV)'):
kind = u'tv movie' kind = u'tv movie'
title = title[:-4].rstrip() title = title[:-4].rstrip()
elif title.endswith('(TV Movie)'):
kind = u'tv movie'
title = title[:-10].rstrip()
elif title.endswith('(V)'): elif title.endswith('(V)'):
kind = u'video movie' kind = u'video movie'
title = title[:-3].rstrip() title = title[:-3].rstrip()
elif title.endswith('(video)'): elif title.lower().endswith('(video)'):
kind = u'video movie' kind = u'video movie'
title = title[:-7].rstrip() title = title[:-7].rstrip()
elif title.endswith('(TV Short)'):
kind = u'tv short'
title = title[:-10].rstrip()
elif title.endswith('(TV Mini-Series)'):
kind = u'tv mini series'
title = title[:-16].rstrip()
elif title.endswith('(mini)'): elif title.endswith('(mini)'):
kind = u'tv mini series' kind = u'tv mini series'
title = title[:-6].rstrip() title = title[:-6].rstrip()
elif title.endswith('(VG)'): elif title.endswith('(VG)'):
kind = u'video game' kind = u'video game'
title = title[:-4].rstrip() title = title[:-4].rstrip()
elif title.endswith('(Video Game)'):
kind = u'video game'
title = title[:-12].rstrip()
elif title.endswith('(TV Series)'):
epindex = title.find('(TV Episode) - ')
if epindex >= 0:
# It's an episode of a series.
kind = u'episode'
series_info = analyze_title(title[epindex + 15:])
result['episode of'] = series_info.get('title')
result['series year'] = series_info.get('year')
title = title[:epindex]
else:
kind = u'tv series'
title = title[:-11].rstrip()
# Search for the year and the optional imdbIndex (a roman number). # Search for the year and the optional imdbIndex (a roman number).
yi = re_year_index.findall(title) yi = re_year_index.findall(title)
if not yi: if not yi:
@ -430,9 +457,6 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
if not kind: if not kind:
kind = u'tv series' kind = u'tv series'
title = title[1:-1].strip() title = title[1:-1].strip()
elif title.endswith('(TV series)'):
kind = u'tv series'
title = title[:-11].rstrip()
if not title: if not title:
raise IMDbParserError('invalid title: "%s"' % original_t) raise IMDbParserError('invalid title: "%s"' % original_t)
if canonical is not None: if canonical is not None:
@ -489,7 +513,7 @@ def _convertTime(title, fromPTDFtoWEB=1, _emptyString=u''):
def build_title(title_dict, canonical=None, canonicalSeries=None, def build_title(title_dict, canonical=None, canonicalSeries=None,
canonicalEpisode=None, ptdf=0, lang=None, _doYear=1, canonicalEpisode=None, ptdf=0, lang=None, _doYear=1,
_emptyString=u''): _emptyString=u'', appendKind=True):
"""Given a dictionary that represents a "long" IMDb title, """Given a dictionary that represents a "long" IMDb title,
return a string. return a string.
@ -511,6 +535,11 @@ def build_title(title_dict, canonical=None, canonicalSeries=None,
doYear = 0 doYear = 0
if ptdf: if ptdf:
doYear = 1 doYear = 1
# XXX: for results coming from the new search page.
if not isinstance(episode_of, (dict, _Container)):
episode_of = {'title': episode_of, 'kind': 'tv series'}
if 'series year' in title_dict:
episode_of['year'] = title_dict['series year']
pre_title = build_title(episode_of, canonical=canonicalSeries, pre_title = build_title(episode_of, canonical=canonicalSeries,
ptdf=0, _doYear=doYear, ptdf=0, _doYear=doYear,
_emptyString=_emptyString) _emptyString=_emptyString)
@ -545,12 +574,14 @@ def build_title(title_dict, canonical=None, canonicalSeries=None,
episode_title += '.%s' % episode episode_title += '.%s' % episode
episode_title += ')' episode_title += ')'
episode_title = '{%s}' % episode_title episode_title = '{%s}' % episode_title
return '%s %s' % (pre_title, episode_title) return _emptyString + '%s %s' % (_emptyString + pre_title,
_emptyString + episode_title)
title = title_dict.get('title', '') title = title_dict.get('title', '')
imdbIndex = title_dict.get('imdbIndex', '')
if not title: return _emptyString if not title: return _emptyString
if canonical is not None: if canonical is not None:
if canonical: if canonical:
title = canonicalTitle(title, lang=lang) title = canonicalTitle(title, lang=lang, imdbIndex=imdbIndex)
else: else:
title = normalizeTitle(title, lang=lang) title = normalizeTitle(title, lang=lang)
if pre_title: if pre_title:
@ -558,15 +589,20 @@ def build_title(title_dict, canonical=None, canonicalSeries=None,
if kind in (u'tv series', u'tv mini series'): if kind in (u'tv series', u'tv mini series'):
title = '"%s"' % title title = '"%s"' % title
if _doYear: if _doYear:
imdbIndex = title_dict.get('imdbIndex') year = title_dict.get('year') or '????'
year = title_dict.get('year') or u'????'
if isinstance(_emptyString, str): if isinstance(_emptyString, str):
year = str(year) year = str(year)
title += ' (%s' % year imdbIndex = title_dict.get('imdbIndex')
if imdbIndex: if not ptdf:
title += '/%s' % imdbIndex if imdbIndex and (canonical is None or canonical):
title += ')' title += ' (%s)' % imdbIndex
if kind: title += ' (%s)' % year
else:
title += ' (%s' % year
if imdbIndex and (canonical is None or canonical):
title += '/%s' % imdbIndex
title += ')'
if appendKind and kind:
if kind == 'tv movie': if kind == 'tv movie':
title += ' (TV)' title += ' (TV)'
elif kind == 'video movie': elif kind == 'video movie':

View file

@ -11,6 +11,7 @@ __author__ = "dbr/Ben"
__version__ = "1.9" __version__ = "1.9"
import os import os
import re
import time import time
import getpass import getpass
import StringIO import StringIO
@ -18,8 +19,10 @@ import tempfile
import warnings import warnings
import logging import logging
import zipfile import zipfile
import datetime as dt
import requests import requests
import cachecontrol import cachecontrol
import xmltodict
try: try:
import xml.etree.cElementTree as ElementTree import xml.etree.cElementTree as ElementTree
@ -31,6 +34,7 @@ try:
except ImportError: except ImportError:
gzip = None gzip = None
from lib.dateutil.parser import parse
from cachecontrol import caches from cachecontrol import caches
from tvdb_ui import BaseUI, ConsoleUI from tvdb_ui import BaseUI, ConsoleUI
@ -560,44 +564,71 @@ class Tvdb:
except requests.Timeout, e: except requests.Timeout, e:
raise tvdb_error("Connection timed out " + str(e.message) + " while loading URL " + str(url)) raise tvdb_error("Connection timed out " + str(e.message) + " while loading URL " + str(url))
if 'application/zip' in resp.headers.get("Content-Type", '') and resp.ok: def process(path, key, value):
try: key = key.lower()
# TODO: The zip contains actors.xml and banners.xml, which are currently ignored [GH-20]
log().debug("We recived a zip file unpacking now ...")
zipdata = StringIO.StringIO()
zipdata.write(resp.content)
myzipfile = zipfile.ZipFile(zipdata)
return myzipfile.read('%s.xml' % language)
except zipfile.BadZipfile:
raise tvdb_error("Bad zip file received from thetvdb.com, could not read it")
return resp.content if resp.ok else None # clean up value and do type changes
if value:
try:
# convert to integer if needed
if value.isdigit():
value = int(value)
except:
pass
if key in ['banner', 'fanart', 'poster']:
value = self.config['url_artworkPrefix'] % (value)
else:
value = self._cleanData(value)
try:
if key == 'firstaired' and value in "0000-00-00":
new_value = str(dt.date.fromordinal(1))
new_value = re.sub("([-]0{2}){1,}", "", new_value)
fixDate = parse(new_value, fuzzy=True).date()
value = fixDate.strftime("%Y-%m-%d")
elif key == 'firstaired':
value = parse(value, fuzzy=True).date()
value = value.strftime("%Y-%m-%d")
except:
pass
value = self._cleanData(value)
return (key, value)
if resp.ok:
if 'application/zip' in resp.headers.get("Content-Type", ''):
try:
# TODO: The zip contains actors.xml and banners.xml, which are currently ignored [GH-20]
log().debug("We recived a zip file unpacking now ...")
zipdata = StringIO.StringIO()
zipdata.write(resp.content)
myzipfile = zipfile.ZipFile(zipdata)
return xmltodict.parse(myzipfile.read('%s.xml' % language), postprocessor=process)
except zipfile.BadZipfile:
raise tvdb_error("Bad zip file received from thetvdb.com, could not read it")
else:
return xmltodict.parse(resp.text.strip(), postprocessor=process)
def _getetsrc(self, url, params=None, language=None):
    """Load a URL (using caching) and return its parsed content.

    ``_loadUrl`` returns the response already parsed by xmltodict into a
    mapping.  This method unwraps that mapping by returning the value
    stored under its first top-level key.

    Raises:
        tvdb_error: if the response could not be loaded, parsed or
            unwrapped (this includes ``_loadUrl`` returning None).  A
            tvdb_error raised by ``_loadUrl`` itself is passed through
            unchanged so its more specific message is preserved.
    """
    try:
        src = self._loadUrl(url, params=params, language=language)
        # Keep only the value under the first top-level key.
        src = [src[item] for item in src][0]
    except tvdb_error:
        # Already a descriptive API error (e.g. a connection timeout
        # reported by _loadUrl); do not relabel it as an XML problem.
        raise
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are not converted into a tvdb_error.
        errormsg = "There was an error with the XML retrieved from thetvdb.com:"

        if self.config['cache_enabled']:
            errormsg += "\nFirst try emptying the cache folder at..\n%s" % (
                self.config['cache_location']
            )

        errormsg += "\nIf this does not resolve the issue, please try again later. If the error persists, report a bug on"
        errormsg += "\nhttp://dbr.lighthouseapp.com/projects/13342-tvdb_api/overview\n"
        raise tvdb_error(errormsg)

    return src
def _setItem(self, sid, seas, ep, attrib, value): def _setItem(self, sid, seas, ep, attrib, value):
"""Creates a new episode, creating Show(), Season() and """Creates a new episode, creating Show(), Season() and
@ -649,9 +680,8 @@ class Tvdb:
log().debug("Searching for show %s" % series) log().debug("Searching for show %s" % series)
self.config['params_getSeries']['seriesname'] = series self.config['params_getSeries']['seriesname'] = series
seriesEt = self._getetsrc(self.config['url_getSeries'], self.config['params_getSeries']) seriesEt = self._getetsrc(self.config['url_getSeries'], self.config['params_getSeries'])
allSeries = list(dict((s.tag.lower(), s.text) for s in x.getchildren()) for x in seriesEt)
return allSeries return [seriesEt[item] for item in seriesEt][0]
def _getSeries(self, series): def _getSeries(self, series):
"""This searches TheTVDB.com for the series name, """This searches TheTVDB.com for the series name,
@ -798,24 +828,13 @@ class Tvdb:
self.config['url_seriesInfo'] % (sid, getShowInLanguage) self.config['url_seriesInfo'] % (sid, getShowInLanguage)
) )
if seriesInfoEt is None: return False # check and make sure we have data to process and that it contains a series name
for curInfo in seriesInfoEt.findall("Series")[0]: if seriesInfoEt is None or 'seriesname' not in seriesInfoEt['series']:
tag = curInfo.tag.lower() return False
value = curInfo.text
if tag == 'seriesname' and value is None: for k, v in seriesInfoEt['series'].items():
return False self._setShowData(sid, k, v)
if value is not None:
if tag == 'id':
value = int(value)
if tag in ['banner', 'fanart', 'poster']:
value = self.config['url_artworkPrefix'] % (value)
else:
value = self._cleanData(value)
self._setShowData(sid, tag, value)
if seriesSearch: if seriesSearch:
return True return True
@ -837,63 +856,40 @@ class Tvdb:
epsEt = self._getetsrc(url, language=language) epsEt = self._getetsrc(url, language=language)
for cur_ep in epsEt.findall("Episode"): for cur_ep in epsEt["episode"]:
if self.config['dvdorder']: if self.config['dvdorder']:
log().debug('Using DVD ordering.') log().debug('Using DVD ordering.')
use_dvd = cur_ep.find('DVD_season').text != None and cur_ep.find('DVD_episodenumber').text != None use_dvd = cur_ep['dvd_season'] != None and cur_ep['dvd_episodenumber'] != None
else: else:
use_dvd = False use_dvd = False
if use_dvd: if use_dvd:
elem_seasnum, elem_epno = cur_ep.find('DVD_season'), cur_ep.find('DVD_episodenumber') seasnum, epno = cur_ep['dvd_season'], cur_ep['dvd_episodenumber']
else: else:
elem_seasnum, elem_epno = cur_ep.find('SeasonNumber'), cur_ep.find('EpisodeNumber') seasnum, epno = cur_ep['seasonnumber'], cur_ep['episodenumber']
if elem_seasnum is None or elem_epno is None:
if seasnum is None or epno is None:
log().warning("An episode has incomplete season/episode number (season: %r, episode: %r)" % ( log().warning("An episode has incomplete season/episode number (season: %r, episode: %r)" % (
elem_seasnum, elem_epno)) seasnum, epno))
log().debug(
" ".join(
"%r is %r" % (child.tag, child.text) for child in cur_ep.getchildren()))
# TODO: Should this happen?
continue # Skip to next episode continue # Skip to next episode
# float() is because https://github.com/dbr/tvnamer/issues/95 - should probably be fixed in TVDB data # float() is because https://github.com/dbr/tvnamer/issues/95 - should probably be fixed in TVDB data
seas_no = int(float(elem_seasnum.text)) seas_no = int(float(seasnum))
ep_no = int(float(elem_epno.text)) ep_no = int(float(epno))
useDVD = False for k,v in cur_ep.items():
k = k.lower()
if (self.config['dvdorder']): if v is not None:
log().debug('DVD Order? Yes') if k == 'id':
useDVD = (cur_ep.find('DVD_season').text != None and cur_ep.find('DVD_episodenumber').text != None) v = int(v)
else:
log().debug('DVD Order? No')
if (useDVD): if k == 'filename':
log().debug('Use DVD Order? Yes') v = self.config['url_artworkPrefix'] % (v)
seas_no = int(cur_ep.find('DVD_season').text)
ep_no = int(float(cur_ep.find('DVD_episodenumber').text))
else:
log().debug('Use DVD Order? No')
seas_no = int(cur_ep.find('SeasonNumber').text)
ep_no = int(cur_ep.find('EpisodeNumber').text)
for cur_item in cur_ep.getchildren():
tag = cur_item.tag.lower()
value = cur_item.text
if value is not None:
if tag == 'id':
value = int(value)
if tag == 'filename':
value = self.config['url_artworkPrefix'] % (value)
else: else:
value = self._cleanData(value) v = self._cleanData(v)
self._setItem(sid, seas_no, ep_no, tag, value)
self._setItem(sid, seas_no, ep_no, k, v)
return True return True

View file

@ -1,4 +1,4 @@
#!/usr/bin/env python2 # !/usr/bin/env python2
#encoding:utf-8 #encoding:utf-8
#author:echel0n #author:echel0n
#project:tvrage_api #project:tvrage_api
@ -24,6 +24,7 @@ import logging
import datetime as dt import datetime as dt
import requests import requests
import cachecontrol import cachecontrol
import xmltodict
try: try:
import xml.etree.cElementTree as ElementTree import xml.etree.cElementTree as ElementTree
@ -35,11 +36,13 @@ from cachecontrol import caches
from tvrage_ui import BaseUI from tvrage_ui import BaseUI
from tvrage_exceptions import (tvrage_error, tvrage_userabort, tvrage_shownotfound, from tvrage_exceptions import (tvrage_error, tvrage_userabort, tvrage_shownotfound,
tvrage_seasonnotfound, tvrage_episodenotfound, tvrage_attributenotfound) tvrage_seasonnotfound, tvrage_episodenotfound, tvrage_attributenotfound)
def log(): def log():
return logging.getLogger("tvrage_api") return logging.getLogger("tvrage_api")
def retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=None): def retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=None):
"""Retry calling the decorated function using an exponential backoff. """Retry calling the decorated function using an exponential backoff.
@ -83,6 +86,7 @@ def retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=None):
return deco_retry return deco_retry
class ShowContainer(dict): class ShowContainer(dict):
"""Simple dict that holds a series of Show instances """Simple dict that holds a series of Show instances
""" """
@ -105,13 +109,14 @@ class ShowContainer(dict):
_lastgc = time.time() _lastgc = time.time()
del tbd del tbd
super(ShowContainer, self).__setitem__(key, value) super(ShowContainer, self).__setitem__(key, value)
class Show(dict): class Show(dict):
"""Holds a dict of seasons, and show data. """Holds a dict of seasons, and show data.
""" """
def __init__(self): def __init__(self):
dict.__init__(self) dict.__init__(self)
self.data = {} self.data = {}
@ -157,7 +162,7 @@ class Show(dict):
raise tvrage_episodenotfound("Could not find any episodes that aired on %s" % date) raise tvrage_episodenotfound("Could not find any episodes that aired on %s" % date)
return ret return ret
def search(self, term = None, key = None): def search(self, term=None, key=None):
""" """
Search all episodes in show. Can search all data, or a specific key (for Search all episodes in show. Can search all data, or a specific key (for
example, episodename) example, episodename)
@ -173,7 +178,7 @@ class Show(dict):
""" """
results = [] results = []
for cur_season in self.values(): for cur_season in self.values():
searchresult = cur_season.search(term = term, key = key) searchresult = cur_season.search(term=term, key=key)
if len(searchresult) != 0: if len(searchresult) != 0:
results.extend(searchresult) results.extend(searchresult)
@ -181,7 +186,7 @@ class Show(dict):
class Season(dict): class Season(dict):
def __init__(self, show = None): def __init__(self, show=None):
"""The show attribute points to the parent show """The show attribute points to the parent show
""" """
self.show = show self.show = show
@ -202,13 +207,13 @@ class Season(dict):
else: else:
return dict.__getitem__(self, episode_number) return dict.__getitem__(self, episode_number)
def search(self, term = None, key = None): def search(self, term=None, key=None):
"""Search all episodes in season, returns a list of matching Episode """Search all episodes in season, returns a list of matching Episode
instances. instances.
""" """
results = [] results = []
for ep in self.values(): for ep in self.values():
searchresult = ep.search(term = term, key = key) searchresult = ep.search(term=term, key=key)
if searchresult is not None: if searchresult is not None:
results.append( results.append(
searchresult searchresult
@ -217,7 +222,7 @@ class Season(dict):
class Episode(dict): class Episode(dict):
def __init__(self, season = None): def __init__(self, season=None):
"""The season attribute points to the parent season """The season attribute points to the parent season
""" """
self.season = season self.season = season
@ -242,7 +247,7 @@ class Episode(dict):
except KeyError: except KeyError:
raise tvrage_attributenotfound("Cannot find attribute %s" % (repr(key))) raise tvrage_attributenotfound("Cannot find attribute %s" % (repr(key)))
def search(self, term = None, key = None): def search(self, term=None, key=None):
"""Search episode data for term, if it matches, return the Episode (self). """Search episode data for term, if it matches, return the Episode (self).
The key parameter can be used to limit the search to a specific element, The key parameter can be used to limit the search to a specific element,
for example, episodename. for example, episodename.
@ -258,25 +263,27 @@ class Episode(dict):
if key is not None and cur_key != key: if key is not None and cur_key != key:
# Do not search this key # Do not search this key
continue continue
if cur_value.find( unicode(term).lower() ) > -1: if cur_value.find(unicode(term).lower()) > -1:
return self return self
class TVRage: class TVRage:
"""Create easy-to-use interface to name of season/episode name""" """Create easy-to-use interface to name of season/episode name"""
def __init__(self, def __init__(self,
interactive = False, interactive=False,
select_first = False, select_first=False,
debug = False, debug=False,
cache = True, cache=True,
banners = False, banners=False,
actors = False, actors=False,
custom_ui = None, custom_ui=None,
language = None, language=None,
search_all_languages = False, search_all_languages=False,
apikey = None, apikey=None,
forceConnect=False, forceConnect=False,
useZip=False, useZip=False,
dvdorder=False): dvdorder=False):
""" """
cache (True/False/str/unicode/urllib2 opener): cache (True/False/str/unicode/urllib2 opener):
@ -294,18 +301,18 @@ class TVRage:
return an exception immediately. return an exception immediately.
""" """
self.shows = ShowContainer() # Holds all Show classes self.shows = ShowContainer() # Holds all Show classes
self.corrections = {} # Holds show-name to show_id mapping self.corrections = {} # Holds show-name to show_id mapping
self.sess = requests.session() # HTTP Session self.sess = requests.session() # HTTP Session
self.config = {} self.config = {}
if apikey is not None: if apikey is not None:
self.config['apikey'] = apikey self.config['apikey'] = apikey
else: else:
self.config['apikey'] = "Uhewg1Rr0o62fvZvUIZt" # tvdb_api's API key self.config['apikey'] = "Uhewg1Rr0o62fvZvUIZt" # tvdb_api's API key
self.config['debug_enabled'] = debug # show debugging messages self.config['debug_enabled'] = debug # show debugging messages
self.config['custom_ui'] = custom_ui self.config['custom_ui'] = custom_ui
@ -322,8 +329,8 @@ class TVRage:
if self.config['debug_enabled']: if self.config['debug_enabled']:
warnings.warn("The debug argument to tvrage_api.__init__ will be removed in the next version. " warnings.warn("The debug argument to tvrage_api.__init__ will be removed in the next version. "
"To enable debug messages, use the following code before importing: " "To enable debug messages, use the following code before importing: "
"import logging; logging.basicConfig(level=logging.DEBUG)") "import logging; logging.basicConfig(level=logging.DEBUG)")
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
@ -331,8 +338,8 @@ class TVRage:
# Hard-coded here as it is relatively static, and saves another HTTP request, as # Hard-coded here as it is relatively static, and saves another HTTP request, as
# recommended on http://tvrage.com/wiki/index.php/API:languages.xml # recommended on http://tvrage.com/wiki/index.php/API:languages.xml
self.config['valid_languages'] = [ self.config['valid_languages'] = [
"da", "fi", "nl", "de", "it", "es", "fr","pl", "hu","el","tr", "da", "fi", "nl", "de", "it", "es", "fr", "pl", "hu", "el", "tr",
"ru","he","ja","pt","zh","cs","sl", "hr","ko","en","sv","no" "ru", "he", "ja", "pt", "zh", "cs", "sl", "hr", "ko", "en", "sv", "no"
] ]
# tvrage.com should be based around numeric language codes, # tvrage.com should be based around numeric language codes,
@ -340,9 +347,9 @@ class TVRage:
# requires the language ID, thus this mapping is required (mainly # requires the language ID, thus this mapping is required (mainly
# for usage in tvrage_ui - internally tvrage_api will use the language abbreviations) # for usage in tvrage_ui - internally tvrage_api will use the language abbreviations)
self.config['langabbv_to_id'] = {'el': 20, 'en': 7, 'zh': 27, self.config['langabbv_to_id'] = {'el': 20, 'en': 7, 'zh': 27,
'it': 15, 'cs': 28, 'es': 16, 'ru': 22, 'nl': 13, 'pt': 26, 'no': 9, 'it': 15, 'cs': 28, 'es': 16, 'ru': 22, 'nl': 13, 'pt': 26, 'no': 9,
'tr': 21, 'pl': 18, 'fr': 17, 'hr': 31, 'de': 14, 'da': 10, 'fi': 11, 'tr': 21, 'pl': 18, 'fr': 17, 'hr': 31, 'de': 14, 'da': 10, 'fi': 11,
'hu': 19, 'ja': 25, 'he': 24, 'ko': 32, 'sv': 8, 'sl': 30} 'hu': 19, 'ja': 25, 'he': 24, 'ko': 32, 'sv': 8, 'sl': 30}
if language is None: if language is None:
self.config['language'] = 'en' self.config['language'] = 'en'
@ -390,9 +397,9 @@ class TVRage:
# get response from TVRage # get response from TVRage
if self.config['cache_enabled']: if self.config['cache_enabled']:
resp = self.sess.get(url, cache_auto=True, params=params) resp = self.sess.get(url.strip(), cache_auto=True, params=params)
else: else:
resp = requests.get(url, params=params) resp = requests.get(url.strip(), params=params)
except requests.HTTPError, e: except requests.HTTPError, e:
raise tvrage_error("HTTP error " + str(e.errno) + " while loading URL " + str(url)) raise tvrage_error("HTTP error " + str(e.errno) + " while loading URL " + str(url))
@ -403,81 +410,84 @@ class TVRage:
except requests.Timeout, e: except requests.Timeout, e:
raise tvrage_error("Connection timed out " + str(e.message) + " while loading URL " + str(url)) raise tvrage_error("Connection timed out " + str(e.message) + " while loading URL " + str(url))
return resp.content if resp.ok else None def remap_keys(path, key, value):
name_map = {
'showid': 'id',
'showname': 'seriesname',
'name': 'seriesname',
'summary': 'overview',
'started': 'firstaired',
'genres': 'genre',
'airtime': 'airs_time',
'airday': 'airs_dayofweek',
'image': 'fanart',
'epnum': 'absolute_number',
'title': 'episodename',
'airdate': 'firstaired',
'screencap': 'filename',
'seasonnum': 'episodenumber'
}
try:
key = name_map[key.lower()]
except (ValueError, TypeError, KeyError):
key.lower()
# clean up value and do type changes
if value:
if isinstance(value, dict):
if key == 'network':
value = value['#text']
if key == 'genre':
value = value['genre']
if not isinstance(value, list):
value = [value]
value = '|' + '|'.join(value) + '|'
try:
# convert to integer if needed
if value.isdigit():
value = int(value)
except:
pass
try:
if key == 'firstaired' and value in "0000-00-00":
new_value = str(dt.date.fromordinal(1))
new_value = re.sub("([-]0{2}){1,}", "", new_value)
fixDate = parse(new_value, fuzzy=True).date()
value = fixDate.strftime("%Y-%m-%d")
elif key == 'firstaired':
value = parse(value, fuzzy=True).date()
value = value.strftime("%Y-%m-%d")
except:
pass
value = self._cleanData(value)
return (key, value)
if resp.ok:
return xmltodict.parse(resp.text.strip(), postprocessor=remap_keys)
def _getetsrc(self, url, params=None):
    """Load a URL (using caching) and return its parsed content.

    ``_loadUrl`` returns the response already parsed by xmltodict into a
    mapping.  This method unwraps that mapping by returning the value
    stored under its first top-level key.

    Raises:
        tvrage_error: if the response could not be loaded, parsed or
            unwrapped (this includes ``_loadUrl`` returning None).  A
            tvrage_error raised by ``_loadUrl`` itself is passed through
            unchanged so its more specific message is preserved.
    """
    try:
        src = self._loadUrl(url, params)
        # Keep only the value under the first top-level key.
        src = [src[item] for item in src][0]
    except tvrage_error:
        # Already a descriptive API error (e.g. a connection timeout
        # reported by _loadUrl); do not relabel it as an XML problem.
        raise
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are not converted into a tvrage_error.
        errormsg = "There was an error with the XML retrieved from tvrage.com"

        if self.config['cache_enabled']:
            errormsg += "\nFirst try emptying the cache folder at..\n%s" % (
                self.config['cache_location']
            )

        errormsg += "\nIf this does not resolve the issue, please try again later. If the error persists, report a bug on\n"
        raise tvrage_error(errormsg)

    return src
def _setItem(self, sid, seas, ep, attrib, value): def _setItem(self, sid, seas, ep, attrib, value):
"""Creates a new episode, creating Show(), Season() and """Creates a new episode, creating Show(), Season() and
@ -497,9 +507,9 @@ class TVRage:
if sid not in self.shows: if sid not in self.shows:
self.shows[sid] = Show() self.shows[sid] = Show()
if seas not in self.shows[sid]: if seas not in self.shows[sid]:
self.shows[sid][seas] = Season(show = self.shows[sid]) self.shows[sid][seas] = Season(show=self.shows[sid])
if ep not in self.shows[sid][seas]: if ep not in self.shows[sid][seas]:
self.shows[sid][seas][ep] = Episode(season = self.shows[sid][seas]) self.shows[sid][seas][ep] = Episode(season=self.shows[sid][seas])
self.shows[sid][seas][ep][attrib] = value self.shows[sid][seas][ep][attrib] = value
def _setShowData(self, sid, key, value): def _setShowData(self, sid, key, value):
@ -529,9 +539,8 @@ class TVRage:
log().debug("Searching for show %s" % series) log().debug("Searching for show %s" % series)
self.config['params_getSeries']['show'] = series self.config['params_getSeries']['show'] = series
seriesEt = self._getetsrc(self.config['url_getSeries'], self.config['params_getSeries']) seriesEt = self._getetsrc(self.config['url_getSeries'], self.config['params_getSeries'])
allSeries = list(dict((s.tag.lower(),s.text) for s in x.getchildren()) for x in seriesEt)
return allSeries return [seriesEt[item] for item in seriesEt][0]
def _getSeries(self, series): def _getSeries(self, series):
"""This searches tvrage.com for the series name, """This searches tvrage.com for the series name,
@ -547,10 +556,10 @@ class TVRage:
if self.config['custom_ui'] is not None: if self.config['custom_ui'] is not None:
log().debug("Using custom UI %s" % (repr(self.config['custom_ui']))) log().debug("Using custom UI %s" % (repr(self.config['custom_ui'])))
ui = self.config['custom_ui'](config = self.config) ui = self.config['custom_ui'](config=self.config)
else: else:
log().debug('Auto-selecting first search result using BaseUI') log().debug('Auto-selecting first search result using BaseUI')
ui = BaseUI(config = self.config) ui = BaseUI(config=self.config)
return ui.selectSeries(allSeries) return ui.selectSeries(allSeries)
@ -568,62 +577,49 @@ class TVRage:
self.config['params_seriesInfo'] self.config['params_seriesInfo']
) )
if seriesInfoEt is None: return False # check and make sure we have data to process and that it contains a series name
for curInfo in seriesInfoEt: if seriesInfoEt is None or 'seriesname' not in seriesInfoEt:
tag = curInfo.tag.lower() return False
value = curInfo.text
if tag == 'seriesname' and value is None: for k, v in seriesInfoEt.items():
return False self._setShowData(sid, k, v)
if tag == 'id': # series search ends here
value = int(value) if seriesSearch:
return True
if value is not None:
value = self._cleanData(value)
self._setShowData(sid, tag, value)
if seriesSearch: return True
try:
# Parse genre data
log().debug('Getting genres of %s' % (sid))
for genre in seriesInfoEt.find('genres'):
tag = genre.tag.lower()
value = genre.text
if value is not None:
value = self._cleanData(value)
self._setShowData(sid, tag, value)
except Exception:
log().debug('No genres for %s' % (sid))
# Parse episode data # Parse episode data
log().debug('Getting all episodes of %s' % (sid)) log().debug('Getting all episodes of %s' % (sid))
self.config['params_epInfo']['sid'] = sid self.config['params_epInfo']['sid'] = sid
epsEt = self._getetsrc(self.config['url_epInfo'], self.config['params_epInfo']) epsEt = self._getetsrc(self.config['url_epInfo'], self.config['params_epInfo'])
for cur_list in epsEt.findall("Episodelist"):
for cur_seas in cur_list:
try:
seas_no = int(cur_seas.attrib['no'])
for cur_ep in cur_seas:
ep_no = int(cur_ep.find('episodenumber').text)
self._setItem(sid, seas_no, ep_no, 'seasonnumber', seas_no)
for cur_item in cur_ep:
tag = cur_item.tag.lower()
value = cur_item.text for season in epsEt['Episodelist']['Season']:
if value is not None: episodes = season['episode']
if tag == 'id': if not isinstance(episodes, list):
value = int(value) episodes = [episodes]
value = self._cleanData(value) for episode in episodes:
seas_no = int(season['@no'])
ep_no = int(episode['episodenumber'])
self._setItem(sid, seas_no, ep_no, 'seasonnumber', seas_no)
self._setItem(sid, seas_no, ep_no, tag, value) for k,v in episode.items():
except: try:
continue k = k.lower()
if v is not None:
if k == 'link':
v = v.rsplit('/', 1)[1]
k = 'id'
if k == 'id':
v = int(v)
v = self._cleanData(v)
self._setItem(sid, seas_no, ep_no, k, v)
except:
continue
return True return True
def _nameToSid(self, name): def _nameToSid(self, name):
@ -632,7 +628,7 @@ class TVRage:
the correct SID. the correct SID.
""" """
if name in self.corrections: if name in self.corrections:
log().debug('Correcting %s to %s' % (name, self.corrections[name]) ) log().debug('Correcting %s to %s' % (name, self.corrections[name]))
return self.corrections[name] return self.corrections[name]
else: else:
log().debug('Getting show %s' % (name)) log().debug('Getting show %s' % (name))
@ -673,11 +669,13 @@ def main():
grabs an episode name interactively. grabs an episode name interactively.
""" """
import logging import logging
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
tvrage_instance = TVRage(cache=False) tvrage_instance = TVRage(cache=False)
print tvrage_instance['Lost']['seriesname'] print tvrage_instance['Lost']['seriesname']
print tvrage_instance['Lost'][1][4]['episodename'] print tvrage_instance['Lost'][1][4]['episodename']
if __name__ == '__main__': if __name__ == '__main__':
main() main()

359
lib/xmltodict.py Normal file
View file

@ -0,0 +1,359 @@
#!/usr/bin/env python
"Makes working with XML feel like you are working with JSON"

from xml.parsers import expat
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl

# StringIO: prefer the C implementation, then the pure-Python module, and
# finally the ``io`` module when neither of the former can be imported.
try:  # pragma no cover
    from cStringIO import StringIO
except ImportError:  # pragma no cover
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO

# OrderedDict: fall back to the ``ordereddict`` package, and to a plain
# ``dict`` when no ordered mapping is available at all.
try:  # pragma no cover
    from collections import OrderedDict
except ImportError:  # pragma no cover
    try:
        from ordereddict import OrderedDict
    except ImportError:
        OrderedDict = dict

# Aliases for the string types; ``str`` is used when ``basestring`` /
# ``unicode`` are not defined by the running interpreter.
try:  # pragma no cover
    _basestring = basestring
except NameError:  # pragma no cover
    _basestring = str
try:  # pragma no cover
    _unicode = unicode
except NameError:  # pragma no cover
    _unicode = str

__author__ = 'Martin Blech'
__version__ = '0.9.0'
__license__ = 'MIT'
class ParsingInterrupted(Exception):
    """Raised to abort parsing when an item callback returns a false value."""
    pass
class _DictSAXHandler(object):
    """Event handler that assembles nested mappings from XML parse events.

    The handler exposes ``startElement``, ``endElement`` and ``characters``
    callbacks.  While events arrive it keeps the element currently being
    built in ``self.item`` / ``self.data`` and the enclosing elements on
    ``self.stack``; once the outermost element is closed, ``self.item``
    holds the finished structure.
    """

    def __init__(self,
                 item_depth=0,
                 item_callback=lambda *args: True,
                 xml_attribs=True,
                 attr_prefix='@',
                 cdata_key='#text',
                 force_cdata=False,
                 cdata_separator='',
                 postprocessor=None,
                 dict_constructor=OrderedDict,
                 strip_whitespace=True,
                 namespace_separator=':',
                 namespaces=None):
        """Store the conversion options and reset the parsing state.

        item_depth / item_callback: when an element closes at exactly
            ``item_depth``, ``item_callback(path, item)`` is invoked; a
            false return value raises ParsingInterrupted.
        xml_attribs: keep element attributes (keys prefixed with
            ``attr_prefix``) when true, drop them otherwise.
        cdata_key: key under which text is stored for elements that are
            represented as a mapping.
        force_cdata: represent text-only elements as a mapping too.
        cdata_separator: string inserted between consecutive text chunks.
        postprocessor: optional ``f(path, key, data)`` returning a
            ``(key, data)`` pair, or None to drop the entry.
        dict_constructor: factory used for every mapping that is created.
        strip_whitespace: strip element text; text that is empty after
            stripping is treated as missing.
        namespace_separator / namespaces: used by ``_build_name`` to
            shorten namespace-qualified names.
        """
        self.path = []
        self.stack = []
        self.data = None
        self.item = None
        self.item_depth = item_depth
        self.xml_attribs = xml_attribs
        self.item_callback = item_callback
        self.attr_prefix = attr_prefix
        self.cdata_key = cdata_key
        self.force_cdata = force_cdata
        self.cdata_separator = cdata_separator
        self.postprocessor = postprocessor
        self.dict_constructor = dict_constructor
        self.strip_whitespace = strip_whitespace
        self.namespace_separator = namespace_separator
        self.namespaces = namespaces

    def _build_name(self, full_name):
        """Return ``full_name`` with its namespace part mapped via ``namespaces``.

        Names are returned untouched when no namespace mapping is configured
        or when the name contains no separator.  A namespace that maps to an
        empty value is dropped from the name altogether.
        """
        if not self.namespaces:
            return full_name
        # Split on the LAST separator: the namespace itself may contain it.
        i = full_name.rfind(self.namespace_separator)
        if i == -1:
            return full_name
        namespace, name = full_name[:i], full_name[i+1:]
        short_namespace = self.namespaces.get(namespace, namespace)
        if not short_namespace:
            return name
        else:
            return self.namespace_separator.join((short_namespace, name))

    def _attrs_to_dict(self, attrs):
        """Return ``attrs`` as a mapping.

        A dict is returned as is; any other value is treated as a flat
        ``[name, value, name, value, ...]`` sequence and paired up.
        """
        if isinstance(attrs, dict):
            return attrs
        return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))

    def startElement(self, full_name, attrs):
        """Handle an opening tag: push the current state, start a new item."""
        name = self._build_name(full_name)
        attrs = self._attrs_to_dict(attrs)
        self.path.append((name, attrs or None))
        # Elements at or above item_depth are tracked in ``path`` only.
        if len(self.path) > self.item_depth:
            self.stack.append((self.item, self.data))
            if self.xml_attribs:
                attrs = self.dict_constructor(
                    (self.attr_prefix+key, value)
                    for (key, value) in attrs.items())
            else:
                attrs = None
            # An element without attributes starts out as None so that a
            # text-only element collapses to its plain text value.
            self.item = attrs or None
            self.data = None

    def endElement(self, full_name):
        """Handle a closing tag: attach the finished item to its parent.

        Raises ParsingInterrupted when the element closes at ``item_depth``
        and ``item_callback`` returns a false value.
        """
        name = self._build_name(full_name)
        if len(self.path) == self.item_depth:
            item = self.item
            if item is None:
                item = self.data
            should_continue = self.item_callback(self.path, item)
            if not should_continue:
                raise ParsingInterrupted()
        if self.stack:
            # Swap the finished element out and restore the parent state.
            item, data = self.item, self.data
            self.item, self.data = self.stack.pop()
            if self.strip_whitespace and data is not None:
                data = data.strip() or None
            if data and self.force_cdata and item is None:
                item = self.dict_constructor()
            if item is not None:
                # Element has attributes/children: text goes under cdata_key.
                if data:
                    self.push_data(item, self.cdata_key, data)
                self.item = self.push_data(self.item, name, item)
            else:
                self.item = self.push_data(self.item, name, data)
        else:
            self.item = self.data = None
        self.path.pop()

    def characters(self, data):
        """Accumulate text, joining successive chunks with ``cdata_separator``."""
        if not self.data:
            self.data = data
        else:
            self.data += self.cdata_separator + data

    def push_data(self, item, key, data):
        """Store ``data`` under ``key`` in ``item`` and return ``item``.

        The postprocessor (if any) may rewrite the pair, or drop it by
        returning None.  A new mapping is created when ``item`` is None.  A
        key that is already present is turned into (or extended as) a list.
        """
        if self.postprocessor is not None:
            result = self.postprocessor(self.path, key, data)
            if result is None:
                return item
            key, data = result
        if item is None:
            item = self.dict_constructor()
        try:
            value = item[key]
            if isinstance(value, list):
                value.append(data)
            else:
                item[key] = [value, data]
        except KeyError:
            item[key] = data
        return item
def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
          namespace_separator=':', **kwargs):
    """Convert an XML document into a dictionary.

    `xml_input` is either a string or a file-like object.

    When `xml_attribs` is `True`, element attributes are stored in the
    dictionary next to the child elements, with `@` prepended to their names
    to avoid collisions.  When it is `False` they are ignored.

    Simple example::

        >>> import xmltodict
        >>> doc = xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>
        ... \"\"\")
        >>> doc['a']['@prop']
        u'x'
        >>> doc['a']['b']
        [u'1', u'2']

    With `item_depth` equal to `0` (the default) the dictionary for the root
    element is returned.  With any other value, `item_callback` is called for
    every item found at that depth and `None` is returned at the end
    (streaming mode).

    The callback receives two arguments: the `path` from the document root to
    the item (a list of name/attributes pairs) and the `item` itself.  When
    the callback returns a false-ish value, parsing stops with the
    :class:`ParsingInterrupted` exception.

    Streaming example::

        >>> def handle(path, item):
        ...     print 'path:%s item:%s' % (path, item)
        ...     return True
        ...
        >>> xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>\"\"\", item_depth=2, item_callback=handle)
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2

    The optional `postprocessor` is a function taking `path`, `key` and
    `value` as positional arguments and returning a new `(key, value)` pair,
    in which both members may have been changed.  Usage example::

        >>> def postprocessor(path, key, value):
        ...     try:
        ...         return key + ':int', int(value)
        ...     except (ValueError, TypeError):
        ...         return key, value
        >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
        ...                 postprocessor=postprocessor)
        OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))])

    A different `expat` implementation (such as `defusedexpat`) can be
    supplied through the `expat` parameter.  E.g:

        >>> import defusedexpat
        >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
        OrderedDict([(u'a', u'hello')])

    """
    # The handler always gets the caller's separator, even when the parser
    # itself is told further down not to split namespaces.
    handler = _DictSAXHandler(namespace_separator=namespace_separator,
                              **kwargs)
    if isinstance(xml_input, _unicode):
        # The parser is fed an encoded form of text input; utf-8 unless the
        # caller asked for something else.
        encoding = encoding or 'utf-8'
        xml_input = xml_input.encode(encoding)
    parser_separator = namespace_separator if process_namespaces else None
    parser = expat.ParserCreate(encoding, parser_separator)
    try:
        parser.ordered_attributes = True
    except AttributeError:
        # Jython's expat does not support ordered_attributes
        pass
    parser.buffer_text = True
    parser.StartElementHandler = handler.startElement
    parser.EndElementHandler = handler.endElement
    parser.CharacterDataHandler = handler.characters
    try:
        # Try the input as a file-like object first ...
        parser.ParseFile(xml_input)
    except (TypeError, AttributeError):
        # ... and fall back to treating it as a complete string.
        parser.Parse(xml_input, True)
    return handler.item
def _emit(key, value, content_handler,
attr_prefix='@',
cdata_key='#text',
depth=0,
preprocessor=None,
pretty=False,
newl='\n',
indent='\t'):
if preprocessor is not None:
result = preprocessor(key, value)
if result is None:
return
key, value = result
if not isinstance(value, (list, tuple)):
value = [value]
if depth == 0 and len(value) > 1:
raise ValueError('document with multiple roots')
for v in value:
if v is None:
v = OrderedDict()
elif not isinstance(v, dict):
v = _unicode(v)
if isinstance(v, _basestring):
v = OrderedDict(((cdata_key, v),))
cdata = None
attrs = OrderedDict()
children = []
for ik, iv in v.items():
if ik == cdata_key:
cdata = iv
continue
if ik.startswith(attr_prefix):
attrs[ik[len(attr_prefix):]] = iv
continue
children.append((ik, iv))
if pretty:
content_handler.ignorableWhitespace(depth * indent)
content_handler.startElement(key, AttributesImpl(attrs))
if pretty and children:
content_handler.ignorableWhitespace(newl)
for child_key, child_value in children:
_emit(child_key, child_value, content_handler,
attr_prefix, cdata_key, depth+1, preprocessor,
pretty, newl, indent)
if cdata is not None:
content_handler.characters(cdata)
if pretty and children:
content_handler.ignorableWhitespace(depth * indent)
content_handler.endElement(key)
if pretty and depth:
content_handler.ignorableWhitespace(newl)
def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
            **kwargs):
    """Emit an XML document for the given `input_dict` (reverse of `parse`).

    The resulting XML document is returned as a string, but if `output` (a
    file-like object) is specified, it is written there instead and nothing
    is returned.

    Dictionary keys prefixed with `attr_prefix` (default=`'@'`) are
    interpreted as XML node attributes, whereas keys equal to `cdata_key`
    (default=`'#text'`) are treated as character data.

    The `pretty` parameter (default=`False`) enables pretty-printing. In this
    mode, lines are terminated with `'\\n'` and indented with `'\\t'`, but
    this can be customized with the `newl` and `indent` parameters.

    When `full_document` is false, the XML declaration is not written.

    Raises `ValueError` when `input_dict` does not contain exactly one
    top-level key.
    """
    roots = list(input_dict.items())
    if len(roots) != 1:
        # Fail with a readable message, and before anything is written,
        # instead of an opaque tuple-unpacking error.
        raise ValueError('Document must have exactly one root.')
    ((key, value),) = roots
    must_return = output is None
    if must_return:
        output = StringIO()
    content_handler = XMLGenerator(output, encoding)
    if full_document:
        content_handler.startDocument()
    _emit(key, value, content_handler, **kwargs)
    if full_document:
        content_handler.endDocument()
    if must_return:
        value = output.getvalue()
        try:  # pragma no cover
            # Byte output is decoded; text output has no decode() and is
            # returned unchanged.
            value = value.decode(encoding)
        except AttributeError:  # pragma no cover
            pass
        return value
if __name__ == '__main__':  # pragma: no cover
    # Command-line use: read XML from stdin and write every item found at
    # the depth given as the only argument to stdout, as marshalled
    # (path, item) tuples.
    import marshal
    import sys

    (depth_arg,) = sys.argv[1:]
    item_depth = int(depth_arg)

    def handle_item(path, item):
        marshal.dump((path, item), sys.stdout)
        return True

    try:
        root = parse(sys.stdin,
                     item_depth=item_depth,
                     item_callback=handle_item,
                     dict_constructor=dict)
        # Depth 0 is not streamed through the callback, so dump the whole
        # document once parsing is over.
        if item_depth == 0:
            handle_item([], root)
    except KeyboardInterrupt:
        pass

View file

@ -782,14 +782,10 @@ class GenericMetadata():
# Try and get posters and fanart from TMDB # Try and get posters and fanart from TMDB
if image_url is None: if image_url is None:
for show_name in set(allPossibleShowNames(show_obj)): if image_type in ('poster', 'poster_thumb'):
if image_type in ('poster', 'poster_thumb'): image_url = self._retrieve_show_images_from_tmdb(show_obj, poster=True)
image_url = self._retrieve_show_images_from_tmdb(show_obj, poster=True) elif image_type == 'fanart':
elif image_type == 'fanart': image_url = self._retrieve_show_images_from_tmdb(show_obj, backdrop=True)
image_url = self._retrieve_show_images_from_tmdb(show_obj, backdrop=True)
if image_url:
break
if image_url: if image_url:
image_data = metadata_helpers.getShowImage(image_url, which) image_data = metadata_helpers.getShowImage(image_url, which)
@ -965,8 +961,6 @@ class GenericMetadata():
return (indexer_id, name, indexer) return (indexer_id, name, indexer)
def _retrieve_show_images_from_tmdb(self, show, backdrop=False, poster=False): def _retrieve_show_images_from_tmdb(self, show, backdrop=False, poster=False):
tmdb_id = None
# get TMDB configuration info # get TMDB configuration info
tmdb = TMDB(sickbeard.TMDB_API_KEY) tmdb = TMDB(sickbeard.TMDB_API_KEY)
config = tmdb.Configuration() config = tmdb.Configuration()
@ -981,27 +975,14 @@ class GenericMetadata():
try: try:
search = tmdb.Search() search = tmdb.Search()
for result in search.collection({'query': show.name}) + search.tv({'query': show.name}): for show_name in set(allPossibleShowNames(show)):
tmdb_id = result['id'] for result in search.collection({'query': show_name})['results'] + search.tv({'query': show_name})['results']:
external_ids = tmdb.TV(tmdb_id).external_ids() if backdrop and result['backdrop_path']:
if show.indexerid in [external_ids['tvdb_id'], external_ids['tvrage_id']]: return "{0}{1}{2}".format(base_url, max_size, result['backdrop_path'])
break elif poster and result['poster_path']:
return "{0}{1}{2}".format(base_url, max_size, result['poster_path'])
if tmdb_id: except Exception, e:
images = tmdb.Collections(tmdb_id).images()
if len(images) > 0:
# get backdrop urls
if backdrop:
rel_path = images['backdrops'][0]['file_path']
url = "{0}{1}{2}".format(base_url, max_size, rel_path)
return url
# get poster urls
if poster:
rel_path = images['posters'][0]['file_path']
url = "{0}{1}{2}".format(base_url, max_size, rel_path)
return url
except:
pass pass
logger.log(u"Could not find any posters or background for " + show.name, logger.DEBUG) logger.log(u"Could not find any posters or background for " + show.name, logger.DEBUG)

View file

@ -829,7 +829,7 @@ class TVShow(object):
self.airs = myEp["airs_dayofweek"] + " " + myEp["airs_time"] self.airs = myEp["airs_dayofweek"] + " " + myEp["airs_time"]
if getattr(myEp, 'firstaired', None) is not None: if getattr(myEp, 'firstaired', None) is not None:
self.startyear = int(myEp["firstaired"].split('-')[0]) self.startyear = int(str(myEp["firstaired"]).split('-')[0])
self.status = getattr(myEp, 'status', '') self.status = getattr(myEp, 'status', '')
@ -855,7 +855,6 @@ class TVShow(object):
i = imdb.IMDb() i = imdb.IMDb()
imdbTv = i.get_movie(str(re.sub("[^0-9]", "", self.imdbid))) imdbTv = i.get_movie(str(re.sub("[^0-9]", "", self.imdbid)))
test = imdbTv.keys()
for key in filter(lambda x: x.replace('_', ' ') in imdbTv.keys(), imdb_info.keys()): for key in filter(lambda x: x.replace('_', ' ') in imdbTv.keys(), imdb_info.keys()):
# Store only the first value for string type # Store only the first value for string type
if type(imdb_info[key]) == type('') and type(imdbTv.get(key)) == type([]): if type(imdb_info[key]) == type('') and type(imdbTv.get(key)) == type([]):
@ -1556,7 +1555,7 @@ class TVEpisode(object):
self.deleteEpisode() self.deleteEpisode()
return False return False
if myEp["absolute_number"] == None or myEp["absolute_number"] == "": if getattr(myEp, 'absolute_number', None) is None:
logger.log(u"This episode (" + self.show.name + " - " + str(season) + "x" + str( logger.log(u"This episode (" + self.show.name + " - " + str(season) + "x" + str(
episode) + ") has no absolute number on " + sickbeard.indexerApi( episode) + ") has no absolute number on " + sickbeard.indexerApi(
self.indexer).name self.indexer).name
@ -1564,7 +1563,7 @@ class TVEpisode(object):
else: else:
logger.log( logger.log(
str(self.show.indexerid) + ": The absolute_number for " + str(season) + "x" + str(episode) + " is : " + str(self.show.indexerid) + ": The absolute_number for " + str(season) + "x" + str(episode) + " is : " +
myEp["absolute_number"], logger.DEBUG) str(myEp["absolute_number"]), logger.DEBUG)
self.absolute_number = int(myEp["absolute_number"]) self.absolute_number = int(myEp["absolute_number"])
self.name = getattr(myEp, 'episodename', "") self.name = getattr(myEp, 'episodename', "")
@ -1603,8 +1602,9 @@ class TVEpisode(object):
u"The show dir is missing, not bothering to change the episode statuses since it'd probably be invalid") u"The show dir is missing, not bothering to change the episode statuses since it'd probably be invalid")
return return
logger.log(str(self.show.indexerid) + u": Setting status for " + str(season) + "x" + str( if self.location:
episode) + " based on status " + str(self.status) + " and existence of " + self.location, logger.DEBUG) logger.log(str(self.show.indexerid) + u": Setting status for " + str(season) + "x" + str(
episode) + " based on status " + str(self.status) + " and existence of " + self.location, logger.DEBUG)
if not ek.ek(os.path.isfile, self.location): if not ek.ek(os.path.isfile, self.location):