From 5519fd7e13d1bb5e0e282942ff010ac397e5bbde Mon Sep 17 00:00:00 2001 From: JackDandy Date: Thu, 14 Jan 2016 08:54:24 +0000 Subject: [PATCH] Update IMDb 5.0 to 5.1dev20160106 --- CHANGES.md | 1 + lib/imdb/__init__.py | 4 +- lib/imdb/imdbpy.cfg | 4 +- lib/imdb/locale/imdbpy-de.po | 155 ++++++++++++------------ lib/imdb/locale/imdbpy-fr.po | 13 +- lib/imdb/parser/http/__init__.py | 4 + lib/imdb/parser/http/movieParser.py | 10 +- lib/imdb/parser/http/personParser.py | 106 +++++++++++++++- lib/imdb/parser/http/topBottomParser.py | 25 ++-- lib/imdb/parser/http/utils.py | 6 - lib/imdb/utils.py | 15 ++- 11 files changed, 228 insertions(+), 115 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 88758ca8..5809513d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -14,6 +14,7 @@ * Update dateutil library 2.4.2 (083f666) to 2.4.2 (d4baf97) * Update Hachoir library 1.3.4 (r1383) to 1.3.4 (r1435) * Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d) +* Update IMDb 5.0 to 5.1dev20160106 * Update PNotify library 2.0.1 to 2.1.0 * Update profilehooks 1.4 to 1.8.2.dev0 (ee3f1a8) * Update Requests library 2.7.0 (5d6d1bc) to 2.9.1 (a1c9b84) diff --git a/lib/imdb/__init__.py b/lib/imdb/__init__.py index 0cdc9650..32019cb5 100644 --- a/lib/imdb/__init__.py +++ b/lib/imdb/__init__.py @@ -6,7 +6,7 @@ a person from the IMDb database. It can fetch data through different media (e.g.: the IMDb web pages, a SQL database, etc.) -Copyright 2004-2014 Davide Alberani +Copyright 2004-2015 Davide Alberani This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,7 +25,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA __all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company', 'available_access_systems'] -__version__ = VERSION = '5.0' +__version__ = VERSION = '5.1dev20160106' # Import compatibility module (importing it is enough). import _compat diff --git a/lib/imdb/imdbpy.cfg b/lib/imdb/imdbpy.cfg index 68b30538..b67fbbe5 100644 --- a/lib/imdb/imdbpy.cfg +++ b/lib/imdb/imdbpy.cfg @@ -29,7 +29,7 @@ [imdbpy] ## Default. -accessSystem = httpThin +#accessSystem = http ## Optional (options common to every data access system): # Activate adult searches (on, by default). @@ -69,7 +69,7 @@ accessSystem = httpThin ## Set the threshold for logging messages. # Can be one of "debug", "info", "warning", "error", "critical" (default: # "warning"). -loggingLevel = debug +#loggingLevel = debug ## Path to a configuration file for the logging facility; # see: http://docs.python.org/library/logging.html#configuring-logging diff --git a/lib/imdb/locale/imdbpy-de.po b/lib/imdb/locale/imdbpy-de.po index c0585b92..ee3112b1 100644 --- a/lib/imdb/locale/imdbpy-de.po +++ b/lib/imdb/locale/imdbpy-de.po @@ -1,12 +1,13 @@ # Gettext message file for imdbpy # Translators: -# Ioan, 2013 +# Nils Welzk, 2013 +# Raphael, 2014 msgid "" msgstr "" "Project-Id-Version: IMDbPY\n" "POT-Creation-Date: 2010-03-18 14:35+0000\n" -"PO-Revision-Date: 2013-11-20 11:07+0000\n" -"Last-Translator: Ioan\n" +"PO-Revision-Date: 2014-10-21 15:24+0000\n" +"Last-Translator: Raphael\n" "Language-Team: German (http://www.transifex.com/projects/p/imdbpy/language/de/)\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" @@ -20,11 +21,11 @@ msgstr "" # Default: Actor msgid "actor" -msgstr "" +msgstr "Schauspieler" # Default: Actress msgid "actress" -msgstr "" +msgstr "Schauspielerin" # Default: Adaption msgid "adaption" @@ -32,7 +33,7 @@ msgstr "" # Default: Additional information msgid "additional-information" -msgstr "" +msgstr "zusätzliche Information" # Default: Admissions msgid "admissions" @@ -48,7 +49,7 @@ msgstr "" # Default: Akas msgid "akas" -msgstr "" +msgstr "Pseudonüme" # Default: Akas from release info msgid "akas-from-release-info" @@ -56,7 +57,7 @@ msgstr "" # Default: All products msgid "all-products" -msgstr "" +msgstr "Alle Produkte" # Default: Alternate language version of msgid "alternate-language-version-of" @@ -68,7 +69,7 @@ msgstr "" # Default: Amazon reviews msgid "amazon-reviews" -msgstr "" +msgstr "Amazon Rezensionen" # Default: Analog left msgid "analog-left" @@ -100,7 +101,7 @@ msgstr "" # Default: Art director msgid "art-director" -msgstr "" +msgstr "Art Director" # Default: Article msgid "article" @@ -112,7 +113,7 @@ msgstr "" # Default: Aspect ratio msgid "aspect-ratio" -msgstr "" +msgstr "Seitenverhältnis" # Default: Assigner msgid "assigner" @@ -132,7 +133,7 @@ msgstr "" # Default: Audio quality msgid "audio-quality" -msgstr "" +msgstr "Audio Qualität" # Default: Award msgid "award" @@ -188,7 +189,7 @@ msgstr "Kosten" # Default: Business msgid "business" -msgstr "" +msgstr "Geschäft" # Default: By arrangement with msgid "by-arrangement-with" @@ -220,7 +221,7 @@ msgstr "" # Default: Cast msgid "cast" -msgstr "" +msgstr "Besetzung" # Default: Casting department msgid "casting-department" @@ -236,23 +237,23 @@ msgstr "" # Default: Category msgid "category" -msgstr "" +msgstr "Kategorie" # Default: Certificate msgid "certificate" -msgstr "" +msgstr "Zertifikat" # Default: Certificates msgid "certificates" -msgstr "" +msgstr "Zertifikate" # Default: Certification msgid "certification" -msgstr "" +msgstr "Bescheinigung" # Default: Channel msgid "channel" -msgstr "" +msgstr "Kanal" # Default: Character msgid "character" @@ -372,7 +373,7 @@ msgstr "" # Default: Description msgid "description" -msgstr "" +msgstr "Beschreibung" # Default: Dialogue intellegibility msgid "dialogue-intellegibility" @@ -396,7 +397,7 @@ msgstr "" # Default: Distributors msgid "distributors" -msgstr "" +msgstr "Händler" # Default: Dvd msgid "dvd" @@ -452,7 +453,7 @@ msgstr "Episoden" # Default: Episodes rating msgid "episodes-rating" -msgstr "" +msgstr "Episoden Bewertung" # Default: Essays msgid "essays" @@ -464,7 +465,7 @@ msgstr "" # Default: Faqs msgid "faqs" -msgstr "" +msgstr "FAQs" # Default: Feature msgid "feature" @@ -488,19 +489,19 @@ msgstr "" # Default: Filmography msgid "filmography" -msgstr "" +msgstr "Filmografie" # Default: Followed by msgid "followed-by" -msgstr "" +msgstr "gefolgt von" # Default: Follows msgid "follows" -msgstr "" +msgstr "folgt" # Default: For msgid "for" -msgstr "" +msgstr "für" # Default: Frequency response msgid "frequency-response" @@ -508,7 +509,7 @@ msgstr "" # Default: From msgid "from" -msgstr "" +msgstr "von" # Default: Full article link msgid "full-article-link" @@ -524,7 +525,7 @@ msgstr "" # Default: Genres msgid "genres" -msgstr "" +msgstr "Genres" # Default: Goofs msgid "goofs" @@ -540,7 +541,7 @@ msgstr "" # Default: Headshot msgid "headshot" -msgstr "" +msgstr "Portrait" # Default: Height msgid "height" @@ -556,15 +557,15 @@ msgstr "" # Default: Interview msgid "interview" -msgstr "" +msgstr "Interview" # Default: Interviews msgid "interviews" -msgstr "" +msgstr "Interviews" # Default: Introduction msgid "introduction" -msgstr "" +msgstr "Vorstellung" # Default: Item msgid "item" @@ -596,7 +597,7 @@ msgstr "Sprachen" # Default: Laserdisc msgid "laserdisc" -msgstr "" +msgstr "Laserdisc" # Default: Laserdisc title msgid "laserdisc-title" @@ -624,7 +625,7 @@ msgstr "Literatur" # Default: Locations msgid "locations" -msgstr "" +msgstr "Standorte" # Default: Long imdb canonical name msgid "long-imdb-canonical-name" @@ -708,11 +709,11 @@ msgstr "" # Default: Nick names msgid "nick-names" -msgstr "" +msgstr "Spitznamen" # Default: Notes msgid "notes" -msgstr "" +msgstr "Anmerkungen" # Default: Novel msgid "novel" @@ -720,7 +721,7 @@ msgstr "" # Default: Number msgid "number" -msgstr "" +msgstr "Zahl" # Default: Number of chapter stops msgid "number-of-chapter-stops" @@ -800,7 +801,7 @@ msgstr "" # Default: Plot msgid "plot" -msgstr "Inhalt" +msgstr "Handlung" # Default: Plot outline msgid "plot-outline" @@ -824,7 +825,7 @@ msgstr "" # Default: Producer msgid "producer" -msgstr "" +msgstr "Produzent" # Default: Production companies msgid "production-companies" @@ -864,15 +865,15 @@ msgstr "" # Default: Quote msgid "quote" -msgstr "" +msgstr "Zitat" # Default: Quotes msgid "quotes" -msgstr "" +msgstr "Zitate" # Default: Rating msgid "rating" -msgstr "" +msgstr "Bewertung" # Default: Recommendations msgid "recommendations" @@ -896,11 +897,11 @@ msgstr "" # Default: Release date msgid "release-date" -msgstr "" +msgstr "Veröffentlichungsdatum" # Default: Release dates msgid "release-dates" -msgstr "" +msgstr "Veröffentlichungstermine" # Default: Remade as msgid "remade-as" @@ -908,27 +909,27 @@ msgstr "" # Default: Remake of msgid "remake-of" -msgstr "" +msgstr "Remake von" # Default: Rentals msgid "rentals" -msgstr "" +msgstr "Leigebühr" # Default: Result msgid "result" -msgstr "" +msgstr "Ergebnis" # Default: Review msgid "review" -msgstr "" +msgstr "Kritik" # Default: Review author msgid "review-author" -msgstr "" +msgstr "Kritik Autor" # Default: Review kind msgid "review-kind" -msgstr "" +msgstr "Kritik Art" # Default: Runtime msgid "runtime" @@ -1096,7 +1097,7 @@ msgstr "" # Default: Soundtrack msgid "soundtrack" -msgstr "" +msgstr "Soundtrack" # Default: Spaciality msgid "spaciality" @@ -1116,43 +1117,43 @@ msgstr "" # Default: Spin off msgid "spin-off" -msgstr "" +msgstr "Nebenprodukt" # Default: Spin off from msgid "spin-off-from" -msgstr "" +msgstr "Nebenprodukt von" # Default: Spoofed in msgid "spoofed-in" -msgstr "" +msgstr "Parodiert in" # Default: Spoofs msgid "spoofs" -msgstr "" +msgstr "Parodie" # Default: Spouse msgid "spouse" -msgstr "" +msgstr "Gattin" # Default: Status of availablility msgid "status-of-availablility" -msgstr "" +msgstr "Verfügbarkeitsstatus" # Default: Studio msgid "studio" -msgstr "" +msgstr "Studio" # Default: Studios msgid "studios" -msgstr "" +msgstr "Studios" # Default: Stunt performer msgid "stunt-performer" -msgstr "" +msgstr "Stunt-Darsteller" # Default: Stunts msgid "stunts" -msgstr "" +msgstr "Stunts" # Default: Subtitles msgid "subtitles" @@ -1160,19 +1161,19 @@ msgstr "Untertitel" # Default: Supplement msgid "supplement" -msgstr "" +msgstr "Ergänzung" # Default: Supplements msgid "supplements" -msgstr "" +msgstr "Ergänzungen" # Default: Synopsis msgid "synopsis" -msgstr "" +msgstr "Zusammenfassung" # Default: Taglines msgid "taglines" -msgstr "" +msgstr "Slogan" # Default: Tech info msgid "tech-info" @@ -1188,7 +1189,7 @@ msgstr "Zeit" # Default: Title msgid "title" -msgstr "" +msgstr "Titel" # Default: Titles in this product msgid "titles-in-this-product" @@ -1200,11 +1201,11 @@ msgstr "" # Default: Top 250 rank msgid "top-250-rank" -msgstr "" +msgstr "Top 250 platzierung" # Default: Trade mark msgid "trade-mark" -msgstr "" +msgstr "Warenzeichen" # Default: Transportation department msgid "transportation-department" @@ -1212,7 +1213,7 @@ msgstr "" # Default: Trivia msgid "trivia" -msgstr "" +msgstr "Nichtigkeiten" # Default: Tv msgid "tv" @@ -1220,7 +1221,7 @@ msgstr "TV" # Default: Under license from msgid "under-license-from" -msgstr "" +msgstr "lizensiert von" # Default: Unknown link msgid "unknown-link" @@ -1256,19 +1257,19 @@ msgstr "" # Default: Video quality msgid "video-quality" -msgstr "" +msgstr "Video Qualität" # Default: Video standard msgid "video-standard" -msgstr "" +msgstr "Video Standart" # Default: Visual effects msgid "visual-effects" -msgstr "" +msgstr "Visuelle Effekte" # Default: Votes msgid "votes" -msgstr "" +msgstr "Stimmen" # Default: Votes distribution msgid "votes-distribution" @@ -1284,11 +1285,11 @@ msgstr "" # Default: With msgid "with" -msgstr "" +msgstr "mit" # Default: Writer msgid "writer" -msgstr "Schreiber" +msgstr "Autor" # Default: Written by msgid "written-by" diff --git a/lib/imdb/locale/imdbpy-fr.po b/lib/imdb/locale/imdbpy-fr.po index c4509c99..f40012c1 100644 --- a/lib/imdb/locale/imdbpy-fr.po +++ b/lib/imdb/locale/imdbpy-fr.po @@ -1,13 +1,14 @@ # Gettext message file for imdbpy # Translators: -# RainDropR , 2013 -# Stéphane Aulery, 2012 +# lukophron, 2014 +# Rajaa Gutknecht , 2013 +# lkppo, 2012 msgid "" msgstr "" "Project-Id-Version: IMDbPY\n" "POT-Creation-Date: 2010-03-18 14:35+0000\n" -"PO-Revision-Date: 2013-11-20 11:07+0000\n" -"Last-Translator: RainDropR \n" +"PO-Revision-Date: 2014-10-08 02:52+0000\n" +"Last-Translator: lukophron\n" "Language-Team: French (http://www.transifex.com/projects/p/imdbpy/language/fr/)\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" @@ -33,11 +34,11 @@ msgstr "adaptation" # Default: Additional information msgid "additional-information" -msgstr "" +msgstr "information-additionnelle" # Default: Admissions msgid "admissions" -msgstr "" +msgstr "admissions" # Default: Agent address msgid "agent-address" diff --git a/lib/imdb/parser/http/__init__.py b/lib/imdb/parser/http/__init__.py index 16f8518d..a3001a08 100644 --- a/lib/imdb/parser/http/__init__.py +++ b/lib/imdb/parser/http/__init__.py @@ -726,6 +726,10 @@ class IMDbHTTPAccessSystem(IMDbBase): cont = self._retrieve(self.urls['person_main'] % personID + 'bio') return self.pProxy.bio_parser.parse(cont, getRefs=self._getRefs) + def get_person_resume(self, personID): + cont = self._retrieve(self.urls['person_main'] % personID + 'resume') + return self.pProxy.resume_parser.parse(cont, getRefs=self._getRefs) + def get_person_awards(self, personID): cont = self._retrieve(self.urls['person_main'] % personID + 'awards') return self.pProxy.person_awards_parser.parse(cont) diff --git a/lib/imdb/parser/http/movieParser.py b/lib/imdb/parser/http/movieParser.py index a203b0d9..174d73f8 100644 --- a/lib/imdb/parser/http/movieParser.py +++ b/lib/imdb/parser/http/movieParser.py @@ -226,7 +226,7 @@ class DOMHTMLMovieParser(DOMParserBase): Attribute(key="countries", path="./h5[starts-with(text(), " \ "'Countr')]/../div[@class='info-content']//text()", - postprocess=makeSplitter('|')), + postprocess=makeSplitter('|')), Attribute(key="language", path="./h5[starts-with(text(), " \ "'Language')]/..//text()", @@ -234,7 +234,7 @@ class DOMHTMLMovieParser(DOMParserBase): Attribute(key='color info', path="./h5[starts-with(text(), " \ "'Color')]/..//text()", - postprocess=makeSplitter('Color:')), + postprocess=makeSplitter('|')), Attribute(key='sound mix', path="./h5[starts-with(text(), " \ "'Sound Mix')]/..//text()", @@ -462,6 +462,8 @@ class DOMHTMLMovieParser(DOMParserBase): del data['other akas'] if nakas: data['akas'] = nakas + if 'color info' in data: + data['color info'] = [x.replace('Color:', '', 1) for x in data['color info']] if 'runtimes' in data: data['runtimes'] = [x.replace(' min', u'') for x in data['runtimes']] @@ -1177,7 +1179,7 @@ class DOMHTMLCriticReviewsParser(DOMParserBase): path="//div[@class='article']/div[@class='see-more']/a", attrs=Attribute(key='metacritic url', path="./@href")) ] - + class DOMHTMLOfficialsitesParser(DOMParserBase): """Parser for the "official sites", "external reviews", "newsgroup reviews", "miscellaneous links", "sound clips", "video clips" and @@ -1534,7 +1536,7 @@ class DOMHTMLSeasonEpisodesParser(DOMParserBase): '').strip() episode_title = episode.get('title', '').strip() episode_plot = episode.get('plot', '') - if not (episode_nr and episode_id and episode_title): + if not (episode_nr is not None and episode_id and episode_title): continue ep_obj = Movie(movieID=episode_id, title=episode_title, accessSystem=self._as, modFunct=self._modFunct) diff --git a/lib/imdb/parser/http/personParser.py b/lib/imdb/parser/http/personParser.py index 9261a4da..64b1916d 100644 --- a/lib/imdb/parser/http/personParser.py +++ b/lib/imdb/parser/http/personParser.py @@ -204,7 +204,7 @@ class DOMHTMLBioParser(DOMParserBase): _birth_attrs = [Attribute(key='birth date', path={ 'day': "./a[starts-with(@href, " \ - "'/date/')]/text()", + "'/search/name?birth_monthday=')]/text()", 'year': "./a[starts-with(@href, " \ "'/search/name?birth_year=')]/text()" }, @@ -215,7 +215,7 @@ class DOMHTMLBioParser(DOMParserBase): _death_attrs = [Attribute(key='death date', path={ 'day': "./a[starts-with(@href, " \ - "'/date/')]/text()", + "'/search/name?death_monthday=')]/text()", 'year': "./a[starts-with(@href, " \ "'/search/name?death_date=')]/text()" }, @@ -326,6 +326,107 @@ class DOMHTMLBioParser(DOMParserBase): return data +class DOMHTMLResumeParser(DOMParserBase): + """Parser for the "resume" page of a given person. + The page should be provided as a string, as taken from + the akas.imdb.com server. The final result will be a + dictionary, with a key for every relevant section. + + Example: + resumeparser = DOMHTMLResumeParser() + result = resumeparser.parse(resume_html_string) + """ + _defGetRefs = True + + extractors = [ + Extractor(label='info', + group="//div[@class='section_box']", + group_key="./h3/text()", + group_key_normalize=lambda x: x.lower().replace(' ', '_'), + path="./ul[@class='resume_section_multi_list']//li", + attrs=Attribute(key=None, + multi=True, + path={ + 'title': ".//b//text()", + 'desc': ".//text()", + }, + postprocess=lambda x: (x.get('title'), x.get('desc').strip().replace('\n', ' ')))), + Extractor(label='other_info', + group="//div[@class='section_box']", + group_key="./h3/text()", + group_key_normalize=lambda x: x.lower().replace(' ', '_'), + path="./ul[@class='_imdbpy']//li", + attrs=Attribute(key=None, + multi=True, + path=".//text()", + postprocess=lambda x: x.strip().replace('\n', ' '))), + Extractor(label='credits', + group="//div[@class='section_box']", + group_key="./h3/text()", + group_key_normalize=lambda x: x.lower().replace(' ', '_'), + path="./table[@class='credits']//tr", + attrs=Attribute(key=None, + multi=True, + path={ + '0':".//td[1]//text()", + '1':".//td[2]//text()", + '2':".//td[3]//text()", + }, + postprocess=lambda x: [x.get('0'),x.get('1'),x.get('2')])), + Extractor(label='mini_info', + path="//div[@class='center']", + attrs=Attribute(key='mini_info', + path=".//text()", + postprocess=lambda x: x.strip().replace('\n', ' '))), + Extractor(label='name', + path="//div[@class='center']/h1[@id='preview_user_name']", + attrs=Attribute(key='name', + path=".//text()", + postprocess=lambda x: x.strip().replace('\n', ' '))), + Extractor(label='resume_bio', + path="//div[@id='resume_rendered_html']//p", + attrs=Attribute(key='resume_bio', + multi=True, + path=".//text()")), + + ] + + preprocessors = [ + (re.compile('(
    )', re.I), r'
      \1'), + ] + + def postprocess_data(self, data): + + for key in data.keys(): + if data[key] == '': + del data[key] + if key in ('mini_info', 'name', 'resume_bio'): + if key == 'resume_bio': + data[key] = "".join(data[key]).strip() + continue + if len(data[key][0]) == 3: + for item in data[key]: + item[:] = [x for x in item if not x == None] + continue + + if len(data[key][0]) == 2: + new_key = {} + for item in data[key]: + if item[0] == None: + continue + if ':' in item[0]: + if item[1].replace(item[0], '')[1:].strip() == '': + continue + new_key[item[0].strip().replace(':', '')] = item[1].replace(item[0], '')[1:].strip() + else: + new_key[item[0]] = item[1] + data[key] = new_key + + new_data = {} + new_data['resume'] = data + return new_data + + class DOMHTMLOtherWorksParser(DOMParserBase): """Parser for the "other works" and "agent" pages of a given person. The page should be provided as a string, as taken from @@ -502,6 +603,7 @@ from movieParser import DOMHTMLNewsParser _OBJECTS = { 'maindetails_parser': ((DOMHTMLMaindetailsParser,), None), 'bio_parser': ((DOMHTMLBioParser,), None), + 'resume_parser': ((DOMHTMLResumeParser,), None), 'otherworks_parser': ((DOMHTMLOtherWorksParser,), None), #'agent_parser': ((DOMHTMLOtherWorksParser,), {'kind': 'agent'}), 'person_officialsites_parser': ((DOMHTMLOfficialsitesParser,), None), diff --git a/lib/imdb/parser/http/topBottomParser.py b/lib/imdb/parser/http/topBottomParser.py index f0f29509..1b8bb9f0 100644 --- a/lib/imdb/parser/http/topBottomParser.py +++ b/lib/imdb/parser/http/topBottomParser.py @@ -7,7 +7,7 @@ E.g.: http://akas.imdb.com/chart/top http://akas.imdb.com/chart/bottom -Copyright 2009 Davide Alberani +Copyright 2009-2015 Davide Alberani This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -43,14 +43,15 @@ class DOMHTMLTop250Parser(DOMParserBase): def _init(self): self.extractors = [Extractor(label=self.label, - path="//div[@id='main']//table//tr", + path="//div[@id='main']//div[1]//div//table//tbody//tr", attrs=Attribute(key=None, multi=True, - path={self.ranktext: "./td[1]//text()", - 'rating': "./td[2]//text()", - 'title': "./td[3]//text()", - 'movieID': "./td[3]//a/@href", - 'votes': "./td[4]//text()" + path={self.ranktext: "./td[2]//text()", + 'rating': "./td[3]//strong//text()", + 'title': "./td[2]//a//text()", + 'year': "./td[2]//span//text()", + 'movieID': "./td[2]//a/@href", + 'votes': "./td[3]//strong/@title" }))] def postprocess_data(self, data): @@ -72,12 +73,16 @@ class DOMHTMLTop250Parser(DOMParserBase): if theID in seenIDs: continue seenIDs.append(theID) - minfo = analyze_title(d['title']) + minfo = analyze_title(d['title']+" "+d['year']) try: minfo[self.ranktext] = int(d[self.ranktext].replace('.', '')) except: pass if 'votes' in d: - try: minfo['votes'] = int(d['votes'].replace(',', '')) - except: pass + try: + votes = d['votes'].replace(' votes','') + votes = votes.split(' based on ')[1] + minfo['votes'] = int(votes.replace(',', '')) + except: + pass if 'rating' in d: try: minfo['rating'] = float(d['rating']) except: pass diff --git a/lib/imdb/parser/http/utils.py b/lib/imdb/parser/http/utils.py index 031a4d3a..8b4e17e3 100644 --- a/lib/imdb/parser/http/utils.py +++ b/lib/imdb/parser/http/utils.py @@ -441,12 +441,6 @@ class DOMParserBase(object): self._useModule = useModule nrMods = len(useModule) _gotError = False - - # Force warnings.warn() to omit the source code line in the message - formatwarning_orig = warnings.formatwarning - warnings.formatwarning = lambda message, category, filename, lineno, line=None: \ - formatwarning_orig(message, category, filename, lineno, line='') - for idx, mod in enumerate(useModule): mod = mod.strip().lower() try: diff --git a/lib/imdb/utils.py b/lib/imdb/utils.py index f468efd4..a506722d 100644 --- a/lib/imdb/utils.py +++ b/lib/imdb/utils.py @@ -639,11 +639,14 @@ def analyze_company_name(name, stripNotes=False): o_name = name name = name.strip() country = None - if name.endswith(']'): - idx = name.rfind('[') - if idx != -1: - country = name[idx:] - name = name[:idx].rstrip() + if name.startswith('['): + name = re.sub('[!@#$\(\)\[\]]', '', name) + else: + if name.endswith(']'): + idx = name.rfind('[') + if idx != -1: + country = name[idx:] + name = name[:idx].rstrip() if not name: raise IMDbParserError('invalid name: "%s"' % o_name) result = {'name': name} @@ -957,7 +960,7 @@ def _tag4TON(ton, addAccessSystem=False, _containerOnly=False): crl = [crl] for cr in crl: crTag = cr.__class__.__name__.lower() - crValue = cr['long imdb name'] + crValue = cr.get('long imdb name') or u'' crValue = _normalizeValue(crValue) crID = cr.getID() if crID is not None: