#!/usr/bin/env python2
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

from __future__ import unicode_literals

__version__ = '0.7.dev0'
__all__ = ['Guess', 'Language',
           'guess_file_info', 'guess_video_info',
           'guess_movie_info', 'guess_episode_info']


# Do python3 detection before importing any other module, to be sure that
# it will then always be available
# with code from http://lucumr.pocoo.org/2011/1/22/forwards-compatible-python/
import sys
if sys.version_info[0] >= 3:
    PY3 = True
    unicode_text_type = str
    native_text_type = str
    base_text_type = str

    def u(x):
        """Return x converted to text (``str``)."""
        return str(x)

    def s(x):
        """Return x unchanged; python3 native strings are already text."""
        return x

    class UnicodeMixin(object):
        """Mixin whose ``__str__`` delegates to the class' ``__unicode__``."""

        def __str__(self):
            return self.__unicode__()

    import binascii

    def to_hex(x):
        """Return the hexadecimal representation of the bytes x, as text."""
        return binascii.hexlify(x).decode('utf-8')

else:
    PY3 = False
    # fix imports for python2: with unicode_literals the names above are
    # unicode objects, so convert them to native strings. A generator
    # expression is used so that no loop variable leaks into the module
    # namespace.
    __all__ = list(str(name) for name in __all__)
    unicode_text_type = unicode
    native_text_type = str
    base_text_type = basestring

    def u(x):
        """Return x as unicode; byte strings are decoded as UTF-8."""
        if isinstance(x, str):
            return x.decode('utf-8')
        return unicode(x)

    def s(x):
        """Return x with unicode strings encoded to UTF-8 byte strings.

        Lists, tuples and dicts are converted recursively (dicts on both
        keys and values); any other object is returned unchanged.
        """
        if isinstance(x, unicode):
            return x.encode('utf-8')
        if isinstance(x, list):
            return [s(y) for y in x]
        if isinstance(x, tuple):
            return tuple(s(y) for y in x)
        if isinstance(x, dict):
            return dict((s(key), s(value)) for key, value in x.items())
        return x

    class UnicodeMixin(object):
        """Mixin whose ``__str__`` is the UTF-8 encoding of ``unicode(self)``."""

        def __str__(self):
            return unicode(self).encode('utf-8')

    def to_hex(x):
        """Return the hexadecimal representation of the byte string x."""
        return x.encode('hex')


from guessit.guess import Guess, merge_all
from guessit.language import Language
from guessit.matcher import IterativeMatcher
from guessit.textutils import clean_string
import logging

log = logging.getLogger(__name__)


class NullHandler(logging.Handler):
    """Logging handler that discards every record it receives."""

    def emit(self, record):
        pass

# let's be a nicely behaving library
h = NullHandler()
log.addHandler(h)


def _guess_filename(filename, filetype):
    """Return the guess for filename, arbitrating title/language conflicts.

    The filename is first matched normally. If a language or subtitle
    language was found, it is matched a second time with language and
    country detection disabled; when both runs disagree on the title, a few
    heuristics decide which of the two guesses is returned.

    filename -- the filename to analyze
    filetype -- the file type, passed as is to IterativeMatcher
    """
    def find_nodes(tree, props):
        """Yields all nodes containing any of the given props."""
        if isinstance(props, base_text_type):
            props = [props]
        for node in tree.nodes():
            if any(prop in node.guess for prop in props):
                yield node

    def warning(title):
        """Log both candidate guesses under the given title and return m."""
        log.warning('%s, guesses: %s - %s',
                    title, m.nice_string(), m2.nice_string())
        return m

    mtree = IterativeMatcher(filename, filetype=filetype)

    # if there are multiple possible years found, we assume the first one is
    # part of the title, reparse the tree taking this into account
    years = set(n.value for n in find_nodes(mtree.match_tree, 'year'))
    if len(years) >= 2:
        mtree = IterativeMatcher(filename, filetype=filetype,
                                 opts=['skip_first_year'])

    m = mtree.matched()

    if 'language' not in m and 'subtitleLanguage' not in m:
        return m

    # if we found some language, make sure we didn't cut a title or sth...
    mtree2 = IterativeMatcher(filename, filetype=filetype,
                              opts=['nolanguage', 'nocountry'])
    m2 = mtree2.matched()

    if m.get('title') is None:
        return m

    # without a title in the language-free guess there is nothing to compare
    # against (and the lookups below would fail), so keep the first guess
    if m2.get('title') is None:
        return m

    if m.get('title') != m2.get('title'):
        title = next(find_nodes(mtree.match_tree, 'title'))
        title2 = next(find_nodes(mtree2.match_tree, 'title'))

        langs = list(find_nodes(mtree.match_tree,
                                ['language', 'subtitleLanguage']))
        if not langs:
            return warning('A weird error happened with language detection')

        # find the language that is likely more relevant
        for lng in langs:
            if lng.value in title2.value:
                # if the language was detected as part of a potential title,
                # look at this one in particular
                lang = lng
                break
        else:
            # pick the first one if we don't have a better choice
            lang = langs[0]

        # language code are rarely part of a title, and those
        # should be handled by the Language exceptions anyway
        if len(lang.value) <= 3:
            return m

        # if filetype is subtitle and the language appears last, just before
        # the extension, then it is likely a subtitle language
        parts = clean_string(title.root.value).split()
        try:
            if (m['type'] in ['moviesubtitle', 'episodesubtitle'] and
                    parts.index(lang.value) == len(parts) - 2):
                return m
        except ValueError:
            # the language text is not one of the cleaned-up words
            pass

        # if the language was in the middle of the other potential title,
        # keep the other title (eg: The Italian Job), except if it is at the
        # very beginning, in which case we consider it an error
        if m2['title'].startswith(lang.value):
            return m
        elif lang.value in title2.value:
            return m2

        # if a node is in an explicit group, then the correct title is
        # probably the other one
        if title.root.node_at(title.node_idx[:2]).is_explicit():
            return m2
        elif title2.root.node_at(title2.node_idx[:2]).is_explicit():
            return m

        return warning('Not sure of the title because of the language position')

    return m


def guess_file_info(filename, filetype, info=None):
    """info can contain the names of the various plugins, such as 'filename' to
    detect filename info, or 'hash_md5' to get the md5 hash of the file.

    >>> guess_file_info('tests/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1'])
    {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'}

    info may be a single name or a list of names and defaults to
    ['filename']. Besides 'filename', 'hash_mpc' and 'hash_ed2k', any
    'hash_<name>' is computed with the <name> constructor of hashlib.
    Hashes that cannot be computed are logged as warnings and left out.
    The individual guesses are merged with merge_all and the merged guess
    is returned.
    """
    result = []
    hashers = []

    # Force unicode as soon as possible
    filename = u(filename)

    if info is None:
        info = ['filename']

    if isinstance(info, base_text_type):
        info = [info]

    for infotype in info:
        if infotype == 'filename':
            result.append(_guess_filename(filename, filetype))

        elif infotype == 'hash_mpc':
            from guessit.hash_mpc import hash_file
            try:
                result.append(Guess({'hash_mpc': hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                log.warning('Could not compute MPC-style hash because: %s', e)

        elif infotype == 'hash_ed2k':
            from guessit.hash_ed2k import hash_file
            try:
                result.append(Guess({'hash_ed2k': hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                log.warning('Could not compute ed2k hash because: %s', e)

        elif infotype.startswith('hash_'):
            import hashlib
            hashname = infotype[5:]
            try:
                hasher = getattr(hashlib, hashname)()
            except (AttributeError, TypeError, ValueError):
                # AttributeError: no such name in hashlib; TypeError: the
                # name exists but is not a no-argument constructor (e.g.
                # 'new'); ValueError: constructor refused by the platform
                log.warning("Could not compute %s hash because it is not "
                            "available from python's hashlib module",
                            hashname)
            else:
                hashers.append((infotype, hasher))

        else:
            log.warning('Invalid infotype: %s', infotype)

    # do all the hashes now, but on a single pass
    if hashers:
        try:
            blocksize = 8192
            with open(filename, 'rb') as f:
                for chunk in iter(lambda: f.read(blocksize), b''):
                    # feed every requested hasher, including the ones of an
                    # infotype that was requested more than once, so that no
                    # digest is ever reported for a hasher that saw no data
                    for _, hasher in hashers:
                        hasher.update(chunk)

            for infotype, hasher in hashers:
                result.append(Guess({infotype: hasher.hexdigest()},
                                    confidence=1.0))
        except Exception as e:
            log.warning('Could not compute hash because: %s', e)

    result = merge_all(result)

    # last minute adjustments

    # if country is in the guessed properties, make it part of the filename
    if 'series' in result and 'country' in result:
        result['series'] += ' (%s)' % result['country'].alpha2.upper()

    return result


def guess_video_info(filename, info=None):
    """Return guess_file_info(filename, 'autodetect', info)."""
    return guess_file_info(filename, 'autodetect', info)


def guess_movie_info(filename, info=None):
    """Return guess_file_info(filename, 'movie', info)."""
    return guess_file_info(filename, 'movie', info)


def guess_episode_info(filename, info=None):
    """Return guess_file_info(filename, 'episode', info)."""
    return guess_file_info(filename, 'episode', info)