# Author: Nic Wolfe # URL: http://code.google.com/p/sickbeard/ # # This file is part of SickRage. # # SickRage is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # SickRage is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with SickRage. If not, see . from __future__ import with_statement import time import re import datetime import os.path import regexes import shelve import sickbeard from sickbeard import logger, helpers, scene_numbering, common, exceptions, scene_exceptions, encodingKludge as ek from contextlib import closing from dateutil import parser from sickbeard.exceptions import ex class NameParser(object): NORMAL_REGEX = 0 SPORTS_REGEX = 1 ANIME_REGEX = 2 def __init__(self, file_name=True, showObj=None, useIndexers=False, convert=False, naming_pattern=False): self.file_name = file_name self.showObj = showObj self.useIndexers = useIndexers self.convert = convert self.naming_pattern = naming_pattern self.regexModes = [self.NORMAL_REGEX, self.SPORTS_REGEX, self.ANIME_REGEX] if self.showObj and not (self.showObj.is_anime and self.showObj.is_sports): self.regexModes = [self.NORMAL_REGEX] elif self.showObj and self.showObj.is_anime: self.regexModes = [self.ANIME_REGEX] elif self.showObj and self.showObj.is_sports: self.regexModes = [self.SPORTS_REGEX] def clean_series_name(self, series_name): """Cleans up series name by removing any . and _ characters, along with any trailing hyphens. Is basically equivalent to replacing all _ and . with a space, but handles decimal numbers in string, for example: >>> cleanRegexedSeriesName("an.example.1.0.test") 'an example 1.0 test' >>> cleanRegexedSeriesName("an_example_1.0_test") 'an example 1.0 test' Stolen from dbr's tvnamer """ series_name = re.sub("(\D)\.(?!\s)(\D)", "\\1 \\2", series_name) series_name = re.sub("(\d)\.(\d{4})", "\\1 \\2", series_name) # if it ends in a year then don't keep the dot series_name = re.sub("(\D)\.(?!\s)", "\\1 ", series_name) series_name = re.sub("\.(?!\s)(\D)", " \\1", series_name) series_name = series_name.replace("_", " ") series_name = re.sub("-$", "", series_name) series_name = re.sub("^\[.*\]", "", series_name) return series_name.strip() def _compile_regexes(self, regexMode): if regexMode == self.SPORTS_REGEX: logger.log(u"Using SPORTS regexs", logger.DEBUG) uncompiled_regex = [regexes.sports_regexs] elif regexMode == self.ANIME_REGEX: logger.log(u"Using ANIME regexs", logger.DEBUG) uncompiled_regex = [regexes.anime_regexes] else: logger.log(u"Using NORMAL reqgexs", logger.DEBUG) uncompiled_regex = [regexes.normal_regexes] self.compiled_regexes = [] for regexItem in uncompiled_regex: for i, (cur_pattern_name, cur_pattern) in enumerate(regexItem): try: cur_regex = re.compile(cur_pattern, re.VERBOSE | re.IGNORECASE) except re.error, errormsg: logger.log(u"WARNING: Invalid episode_pattern, %s. %s" % (errormsg, cur_pattern)) else: cur_pattern_name = str(i) + "_" + cur_pattern_name self.compiled_regexes.append((regexMode, cur_pattern_name, cur_regex)) def _parse_string(self, name): if not name: return matches = [] doneSearch = False bestResult = None for regexMode in self.regexModes: if doneSearch: break self._compile_regexes(regexMode) for (cur_regexMode, cur_regex_name, cur_regex) in self.compiled_regexes: if doneSearch: break match = cur_regex.match(name) if not match: continue regex_num = int(re.match('^\d{1,2}', cur_regex_name).group(0)) result = ParseResult(name) result.which_regex = [cur_regex_name] result.score = 0 - regex_num named_groups = match.groupdict().keys() if 'series_name' in named_groups: result.series_name = match.group('series_name') if result.series_name: result.series_name = self.clean_series_name(result.series_name) if not self.naming_pattern: if self.showObj and self.showObj.name.lower() == result.series_name.lower(): result.show = self.showObj else: result.show = helpers.get_show_by_name(result.series_name, useIndexer=self.useIndexers) if not result.show: break result.score += 1 if 'season_num' in named_groups: tmp_season = int(match.group('season_num')) if not (cur_regex_name == 'bare' and tmp_season in (19, 20)): result.season_number = tmp_season result.score += 1 if 'ep_num' in named_groups: ep_num = self._convert_number(match.group('ep_num')) if 'extra_ep_num' in named_groups and match.group('extra_ep_num'): result.episode_numbers = range(ep_num, self._convert_number(match.group('extra_ep_num')) + 1) result.score += 1 else: result.episode_numbers = [ep_num] result.score += 1 if 'ep_ab_num' in named_groups: ep_ab_num = self._convert_number(match.group('ep_ab_num')) if 'extra_ab_ep_num' in named_groups and match.group('extra_ab_ep_num'): result.ab_episode_numbers = range(ep_ab_num, self._convert_number(match.group('extra_ab_ep_num')) + 1) result.score += 1 else: result.ab_episode_numbers = [ep_ab_num] result.score += 1 if 'sports_event_id' in named_groups: sports_event_id = match.group('sports_event_id') if sports_event_id: result.sports_event_id = int(match.group('sports_event_id')) result.score += 1 if 'sports_event_name' in named_groups: result.sports_event_name = match.group('sports_event_name') if result.sports_event_name: result.sports_event_name = self.clean_series_name(result.sports_event_name) result.score += 1 if 'sports_air_date' in named_groups: sports_air_date = match.group('sports_air_date') if sports_air_date: try: result.sports_air_date = parser.parse(sports_air_date, fuzzy=True).date() result.score += 1 except: pass if 'air_year' in named_groups and 'air_month' in named_groups and 'air_day' in named_groups: year = int(match.group('air_year')) month = int(match.group('air_month')) day = int(match.group('air_day')) try: dtStr = '%s-%s-%s' % (year, month, day) result.air_date = datetime.datetime.strptime(dtStr, "%Y-%m-%d").date() result.score += 1 except: pass if 'extra_info' in named_groups: tmp_extra_info = match.group('extra_info') # Show.S04.Special or Show.S05.Part.2.Extras is almost certainly not every episode in the season if not (tmp_extra_info and 'season_only' in cur_regex_name and re.search( r'([. _-]|^)(special|extra)s?\w*([. _-]|$)', tmp_extra_info, re.I)): result.extra_info = tmp_extra_info result.score += 1 if 'release_group' in named_groups: result.release_group = match.group('release_group') result.score += 1 doneSearch = True if result.show else False matches.append(result) time.sleep(0.05) if len(matches): # pick best match with highest score based on placement bestResult = max(sorted(matches, reverse=True, key=lambda x: x.which_regex), key=lambda x: x.score) # get quality bestResult.quality = common.Quality.nameQuality(name, bestResult.show.is_anime if bestResult.show else False) # scene convert result bestResult = bestResult.convert() if self.convert and not self.naming_pattern else bestResult return bestResult def _combine_results(self, first, second, attr): # if the first doesn't exist then return the second or nothing if not first: if not second: return None else: return getattr(second, attr) # if the second doesn't exist then return the first if not second: return getattr(first, attr) a = getattr(first, attr) b = getattr(second, attr) # if a is good use it if a != None or (type(a) == list and len(a)): return a # if not use b (if b isn't set it'll just be default) else: return b def _unicodify(self, obj, encoding="utf-8"): if isinstance(obj, basestring): if not isinstance(obj, unicode): obj = unicode(obj, encoding, 'replace') return obj def _convert_number(self, org_number): """ Convert org_number into an integer org_number: integer or representation of a number: string or unicode Try force converting to int first, on error try converting from Roman numerals returns integer or 0 """ try: # try forcing to int if org_number: number = int(org_number) else: number = 0 except: # on error try converting from Roman numerals roman_to_int_map = (('M', 1000), ('CM', 900), ('D', 500), ('CD', 400), ('C', 100), ('XC', 90), ('L', 50), ('XL', 40), ('X', 10), ('IX', 9), ('V', 5), ('IV', 4), ('I', 1) ) roman_numeral = str(org_number).upper() number = 0 index = 0 for numeral, integer in roman_to_int_map: while roman_numeral[index:index + len(numeral)] == numeral: number += integer index += len(numeral) return number def parse(self, name, cache_result=True): name = self._unicodify(name) if self.naming_pattern: cache_result = False cached = NameParserCache().get(name) if cached: return cached # break it into parts if there are any (dirname, file name, extension) dir_name, file_name = os.path.split(name) ext_match = re.match('(.*)\.\w{3,4}$', file_name) if ext_match and self.file_name: base_file_name = ext_match.group(1) else: base_file_name = file_name # set up a result to use final_result = ParseResult(name) # try parsing the file name file_name_result = self._parse_string(base_file_name) # use only the direct parent dir dir_name = os.path.basename(dir_name) # parse the dirname for extra info if needed dir_name_result = self._parse_string(dir_name) # build the ParseResult object final_result.air_date = self._combine_results(file_name_result, dir_name_result, 'air_date') # anime absolute numbers final_result.ab_episode_numbers = self._combine_results(file_name_result, dir_name_result, 'ab_episode_numbers') # sports final_result.sports_event_id = self._combine_results(file_name_result, dir_name_result, 'sports_event_id') final_result.sports_event_name = self._combine_results(file_name_result, dir_name_result, 'sports_event_name') final_result.sports_air_date = self._combine_results(file_name_result, dir_name_result, 'sports_air_date') if not final_result.air_date and not final_result.sports_air_date: final_result.season_number = self._combine_results(file_name_result, dir_name_result, 'season_number') final_result.episode_numbers = self._combine_results(file_name_result, dir_name_result, 'episode_numbers') # if the dirname has a release group/show name I believe it over the filename final_result.series_name = self._combine_results(dir_name_result, file_name_result, 'series_name') final_result.extra_info = self._combine_results(dir_name_result, file_name_result, 'extra_info') final_result.release_group = self._combine_results(dir_name_result, file_name_result, 'release_group') final_result.which_regex = [] if final_result == file_name_result: final_result.which_regex = file_name_result.which_regex elif final_result == dir_name_result: final_result.which_regex = dir_name_result.which_regex else: if file_name_result: final_result.which_regex += file_name_result.which_regex if dir_name_result: final_result.which_regex += dir_name_result.which_regex final_result.show = self._combine_results(file_name_result, dir_name_result, 'show') final_result.quality = self._combine_results(file_name_result, dir_name_result, 'quality') if not final_result.show: raise InvalidShowException( "Unable to parse " + name.encode(sickbeard.SYS_ENCODING, 'xmlcharrefreplace')) # if there's no useful info in it then raise an exception if final_result.season_number == None and not final_result.episode_numbers and final_result.air_date == None and final_result.sports_air_date == None and not final_result.ab_episode_numbers and not final_result.series_name: raise InvalidNameException("Unable to parse " + name.encode(sickbeard.SYS_ENCODING, 'xmlcharrefreplace')) if cache_result: NameParserCache().add(name, final_result) logger.log(u"Parsed " + name + " into " + str(final_result).decode('utf-8', 'xmlcharrefreplace'), logger.DEBUG) return final_result class ParseResult(object): def __init__(self, original_name, series_name=None, sports_event_id=None, sports_event_name=None, sports_air_date=None, season_number=None, episode_numbers=None, extra_info=None, release_group=None, air_date=None, ab_episode_numbers=None, show=None, score=None, quality=None ): self.original_name = original_name self.series_name = series_name self.season_number = season_number if not episode_numbers: self.episode_numbers = [] else: self.episode_numbers = episode_numbers if not ab_episode_numbers: self.ab_episode_numbers = [] else: self.ab_episode_numbers = ab_episode_numbers if not quality: self.quality = common.Quality.UNKNOWN else: self.quality = quality self.extra_info = extra_info self.release_group = release_group self.air_date = air_date self.sports_event_id = sports_event_id self.sports_event_name = sports_event_name self.sports_air_date = sports_air_date self.which_regex = [] self.show = show self.score = score def __eq__(self, other): if not other: return False if self.series_name != other.series_name: return False if self.season_number != other.season_number: return False if self.episode_numbers != other.episode_numbers: return False if self.extra_info != other.extra_info: return False if self.release_group != other.release_group: return False if self.air_date != other.air_date: return False if self.sports_event_id != other.sports_event_id: return False if self.sports_event_name != other.sports_event_name: return False if self.sports_air_date != other.sports_air_date: return False if self.ab_episode_numbers != other.ab_episode_numbers: return False if self.show != other.show: return False if self.score != other.score: return False if self.quality != other.quality: return False return True def __str__(self): if self.series_name != None: to_return = self.series_name + u' - ' else: to_return = u'' if self.season_number != None: to_return += 'S' + str(self.season_number) if self.episode_numbers and len(self.episode_numbers): for e in self.episode_numbers: to_return += 'E' + str(e) if self.is_air_by_date: to_return += str(self.air_date) if self.is_sports: to_return += str(self.sports_event_name) to_return += str(self.sports_event_id) to_return += str(self.sports_air_date) if self.ab_episode_numbers: to_return += ' absolute_numbers: ' + str(self.ab_episode_numbers) if self.extra_info: to_return += ' - ' + self.extra_info if self.release_group: to_return += ' (' + self.release_group + ')' to_return += ' [ABD: ' + str(self.is_air_by_date) + ']' to_return += ' [SPORTS: ' + str(self.is_sports) + ']' to_return += ' [ANIME: ' + str(self.is_anime) + ']' to_return += ' [whichReg: ' + str(self.which_regex) + ']' return to_return.encode('utf-8') def convert(self): if not self.show: return self # can't convert with out a show object if self.is_air_by_date or self.is_sports: # scene numbering does not apply to air-by-date or sports shows return self new_episode_numbers = [] new_season_numbers = [] new_absolute_numbers = [] if self.show.is_anime and len(self.ab_episode_numbers): scene_season = scene_exceptions.get_scene_exception_by_name(self.series_name)[1] for epAbsNo in self.ab_episode_numbers: ab = scene_numbering.get_indexer_absolute_numbering(self.show.indexerid, self.show.indexer, epAbsNo, True, scene_season) if ab: try: (s, e) = helpers.get_all_episodes_from_absolute_number(self.show, None, [ab]) except exceptions.EpisodeNotFoundByAbsoluteNumberException: logger.log(str(self.show.indexerid) + ": Indexer object absolute number " + str( ab) + " is incomplete, skipping this episode") return self else: new_absolute_numbers.append(ab) new_episode_numbers.extend(e) new_season_numbers.append(s) elif self.season_number and len(self.episode_numbers): for epNo in self.episode_numbers: (s, e) = scene_numbering.get_indexer_numbering(self.show.indexerid, self.show.indexer, self.season_number, epNo) if self.show.is_anime: a = helpers.get_absolute_number_from_season_and_episode(self.show, s, e) if a: new_absolute_numbers.append(a) new_episode_numbers.append(e) new_season_numbers.append(s) # need to do a quick sanity check heregex. It's possible that we now have episodes # from more than one season (by tvdb numbering), and this is just too much # for sickbeard, so we'd need to flag it. new_season_numbers = list(set(new_season_numbers)) # remove duplicates if len(new_season_numbers) > 1: raise InvalidNameException("Scene numbering results episodes from " "seasons %s, (i.e. more than one) and " "sickrage does not support this. " "Sorry." % (str(new_season_numbers))) # I guess it's possible that we'd have duplicate episodes too, so lets # eliminate them new_episode_numbers = list(set(new_episode_numbers)) new_episode_numbers.sort() # maybe even duplicate absolute numbers so why not do them as well new_absolute_numbers = list(set(new_absolute_numbers)) new_absolute_numbers.sort() if len(new_absolute_numbers): self.ab_episode_numbers = new_absolute_numbers if len(new_season_numbers) and len(new_episode_numbers): self.episode_numbers = new_episode_numbers self.season_number = new_season_numbers[0] logger.log(u"Converted parsed result " + self.original_name + " into " + str(self).decode('utf-8', 'xmlcharrefreplace'), logger.DEBUG) return self @property def is_air_by_date(self): if self.season_number == None and len(self.episode_numbers) == 0 and self.air_date: return True return False @property def is_sports(self): if self.season_number == None and len(self.episode_numbers) == 0 and self.sports_air_date: return True return False @property def is_anime(self): if len(self.ab_episode_numbers): return True return False class NameParserCache: def __init__(self): self.npc_cache_size = 200 self.db_name = ek.ek(os.path.join, sickbeard.CACHE_DIR, 'name_parser_cache') def add(self, name, parse_result): name = name.encode('utf-8', 'ignore') try : with closing(shelve.open(self.db_name, writeback=True)) as npc: npc[str(name)] = parse_result while len(npc.items()) > self.npc_cache_size: del npc.keys()[0] except Exception as e: logger.log(u"NameParser cache error: " + ex(e), logger.ERROR) logger.log(u"NameParser cache corrupted, please delete " + self.db_name, logger.ERROR) def get(self, name): name = name.encode('utf-8', 'ignore') try: with closing(shelve.open(self.db_name, writeback=True)) as npc: parse_result = npc.get(str(name), None) except Exception as e: logger.log(u"NameParser cache error: " + ex(e), logger.ERROR) logger.log(u"NameParser cache corrupted, please delete " + self.db_name, logger.ERROR) parse_result = None if parse_result: logger.log("Using cached parse result for: " + name, logger.DEBUG) return parse_result class InvalidNameException(Exception): "The given release name is not valid" class InvalidShowException(Exception): "The given show name is not valid"