diff --git a/CHANGES.md b/CHANGES.md index 2890a364..0f1d30df 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,7 @@ ### 0.x.x (2015-xx-xx xx:xx:xx UTC) * Add requirements file for pip (port from midgetspy/sick-beard) +* Remove unused libraries fuzzywuzzy and pysrt [develop changelog] diff --git a/lib/fuzzywuzzy/StringMatcher.py b/lib/fuzzywuzzy/StringMatcher.py deleted file mode 100644 index 9dccfe7e..00000000 --- a/lib/fuzzywuzzy/StringMatcher.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 -""" -StringMatcher.py - -ported from python-Levenshtein -[https://github.com/miohtama/python-Levenshtein] -""" - -from Levenshtein import * -from warnings import warn - -class StringMatcher: - """A SequenceMatcher-like class built on the top of Levenshtein""" - - def _reset_cache(self): - self._ratio = self._distance = None - self._opcodes = self._editops = self._matching_blocks = None - - def __init__(self, isjunk=None, seq1='', seq2=''): - if isjunk: - warn("isjunk not NOT implemented, it will be ignored") - self._str1, self._str2 = seq1, seq2 - self._reset_cache() - - def set_seqs(self, seq1, seq2): - self._str1, self._str2 = seq1, seq2 - self._reset_cache() - - def set_seq1(self, seq1): - self._str1 = seq1 - self._reset_cache() - - def set_seq2(self, seq2): - self._str2 = seq2 - self._reset_cache() - - def get_opcodes(self): - if not self._opcodes: - if self._editops: - self._opcodes = opcodes(self._editops, self._str1, self._str2) - else: - self._opcodes = opcodes(self._str1, self._str2) - return self._opcodes - - def get_editops(self): - if not self._editops: - if self._opcodes: - self._editops = editops(self._opcodes, self._str1, self._str2) - else: - self._editops = editops(self._str1, self._str2) - return self._editops - - def get_matching_blocks(self): - if not self._matching_blocks: - self._matching_blocks = matching_blocks(self.get_opcodes(), - self._str1, self._str2) - return self._matching_blocks - - def ratio(self): - if not self._ratio: - self._ratio = ratio(self._str1, self._str2) - return self._ratio - - def quick_ratio(self): - # This is usually quick enough :o) - if not self._ratio: - self._ratio = ratio(self._str1, self._str2) - return self._ratio - - def real_quick_ratio(self): - len1, len2 = len(self._str1), len(self._str2) - return 2.0 * min(len1, len2) / (len1 + len2) - - def distance(self): - if not self._distance: - self._distance = distance(self._str1, self._str2) - return self._distance \ No newline at end of file diff --git a/lib/fuzzywuzzy/__init__.py b/lib/fuzzywuzzy/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/lib/fuzzywuzzy/fuzz.py b/lib/fuzzywuzzy/fuzz.py deleted file mode 100644 index 26274b9a..00000000 --- a/lib/fuzzywuzzy/fuzz.py +++ /dev/null @@ -1,263 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 -""" -fuzz.py - -Copyright (c) 2011 Adam Cohen - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -""" -from __future__ import unicode_literals - -try: - from StringMatcher import StringMatcher as SequenceMatcher -except: - from difflib import SequenceMatcher - -from . import utils - - -########################### -# Basic Scoring Functions # -########################### - - -def ratio(s1, s2): - - if s1 is None: - raise TypeError("s1 is None") - if s2 is None: - raise TypeError("s2 is None") - s1, s2 = utils.make_type_consistent(s1, s2) - if len(s1) == 0 or len(s2) == 0: - return 0 - - m = SequenceMatcher(None, s1, s2) - return utils.intr(100 * m.ratio()) - - -# todo: skip duplicate indexes for a little more speed -def partial_ratio(s1, s2): - - if s1 is None: - raise TypeError("s1 is None") - if s2 is None: - raise TypeError("s2 is None") - s1, s2 = utils.make_type_consistent(s1, s2) - if len(s1) == 0 or len(s2) == 0: - return 0 - - if len(s1) <= len(s2): - shorter = s1 - longer = s2 - else: - shorter = s2 - longer = s1 - - m = SequenceMatcher(None, shorter, longer) - blocks = m.get_matching_blocks() - - # each block represents a sequence of matching characters in a string - # of the form (idx_1, idx_2, len) - # the best partial match will block align with at least one of those blocks - # e.g. shorter = "abcd", longer = XXXbcdeEEE - # block = (1,3,3) - # best score === ratio("abcd", "Xbcd") - scores = [] - for block in blocks: - long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0 - long_end = long_start + len(shorter) - long_substr = longer[long_start:long_end] - - m2 = SequenceMatcher(None, shorter, long_substr) - r = m2.ratio() - if r > .995: - return 100 - else: - scores.append(r) - - return int(100 * max(scores)) - - -############################## -# Advanced Scoring Functions # -############################## - -# Sorted Token -# find all alphanumeric tokens in the string -# sort those tokens and take ratio of resulting joined strings -# controls for unordered string elements -def _token_sort(s1, s2, partial=True, force_ascii=True): - - if s1 is None: - raise TypeError("s1 is None") - if s2 is None: - raise TypeError("s2 is None") - - # pull tokens - tokens1 = utils.full_process(s1, force_ascii=force_ascii).split() - tokens2 = utils.full_process(s2, force_ascii=force_ascii).split() - - # sort tokens and join - sorted1 = " ".join(sorted(tokens1)) - sorted2 = " ".join(sorted(tokens2)) - - sorted1 = sorted1.strip() - sorted2 = sorted2.strip() - - if partial: - return partial_ratio(sorted1, sorted2) - else: - return ratio(sorted1, sorted2) - - -def token_sort_ratio(s1, s2, force_ascii=True): - return _token_sort(s1, s2, partial=False, force_ascii=force_ascii) - - -def partial_token_sort_ratio(s1, s2, force_ascii=True): - return _token_sort(s1, s2, partial=True, force_ascii=force_ascii) - - -# Token Set -# find all alphanumeric tokens in each string...treat them as a set -# construct two strings of the form -# -# take ratios of those two strings -# controls for unordered partial matches -def _token_set(s1, s2, partial=True, force_ascii=True): - - if s1 is None: - raise TypeError("s1 is None") - if s2 is None: - raise TypeError("s2 is None") - - p1 = utils.full_process(s1, force_ascii=force_ascii) - p2 = utils.full_process(s2, force_ascii=force_ascii) - - if not utils.validate_string(p1): - return 0 - if not utils.validate_string(p2): - return 0 - - # pull tokens - tokens1 = set(utils.full_process(p1).split()) - tokens2 = set(utils.full_process(p2).split()) - - intersection = tokens1.intersection(tokens2) - diff1to2 = tokens1.difference(tokens2) - diff2to1 = tokens2.difference(tokens1) - - sorted_sect = " ".join(sorted(intersection)) - sorted_1to2 = " ".join(sorted(diff1to2)) - sorted_2to1 = " ".join(sorted(diff2to1)) - - combined_1to2 = sorted_sect + " " + sorted_1to2 - combined_2to1 = sorted_sect + " " + sorted_2to1 - - # strip - sorted_sect = sorted_sect.strip() - combined_1to2 = combined_1to2.strip() - combined_2to1 = combined_2to1.strip() - - pairwise = [ - ratio(sorted_sect, combined_1to2), - ratio(sorted_sect, combined_2to1), - ratio(combined_1to2, combined_2to1) - ] - return max(pairwise) - - -def token_set_ratio(s1, s2, force_ascii=True): - return _token_set(s1, s2, partial=False, force_ascii=force_ascii) - - -def partial_token_set_ratio(s1, s2, force_ascii=True): - return _token_set(s1, s2, partial=True, force_ascii=force_ascii) - - -# TODO: numerics - -################### -# Combination API # -################### - -# q is for quick -def QRatio(s1, s2, force_ascii=True): - - p1 = utils.full_process(s1, force_ascii=force_ascii) - p2 = utils.full_process(s2, force_ascii=force_ascii) - - if not utils.validate_string(p1): - return 0 - if not utils.validate_string(p2): - return 0 - - return ratio(p1, p2) - - -def UQRatio(s1, s2): - return QRatio(s1, s2, force_ascii=False) - - -# w is for weighted -def WRatio(s1, s2, force_ascii=True): - - p1 = utils.full_process(s1, force_ascii=force_ascii) - p2 = utils.full_process(s2, force_ascii=force_ascii) - - if not utils.validate_string(p1): - return 0 - if not utils.validate_string(p2): - return 0 - - # should we look at partials? - try_partial = True - unbase_scale = .95 - partial_scale = .90 - - base = ratio(p1, p2) - len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2)) - - # if strings are similar length, don't use partials - if len_ratio < 1.5: - try_partial = False - - # if one string is much much shorter than the other - if len_ratio > 8: - partial_scale = .6 - - if try_partial: - partial = partial_ratio(p1, p2) * partial_scale - ptsor = partial_token_sort_ratio(p1, p2, force_ascii=force_ascii) \ - * unbase_scale * partial_scale - ptser = partial_token_set_ratio(p1, p2, force_ascii=force_ascii) \ - * unbase_scale * partial_scale - - return int(max(base, partial, ptsor, ptser)) - else: - tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale - tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale - - return int(max(base, tsor, tser)) - - -def UWRatio(s1, s2): - return WRatio(s1, s2, force_ascii=False) diff --git a/lib/fuzzywuzzy/process.py b/lib/fuzzywuzzy/process.py deleted file mode 100644 index 7571664e..00000000 --- a/lib/fuzzywuzzy/process.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 -""" -process.py - -Copyright (c) 2011 Adam Cohen - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -""" -import itertools - -from . import fuzz -from . import utils - - -def extract(query, choices, processor=None, scorer=None, limit=5): - """Find best matches in a list of choices, return a list of tuples - containing the match and it's score. - - Arguments: - query -- an object representing the thing we want to find - choices -- a list of objects we are attempting to extract - values from - scorer -- f(OBJ, QUERY) --> INT. We will return the objects - with the highest score by default, we use - score.WRatio() and both OBJ and QUERY should be - strings - processor -- f(OBJ_A) --> OBJ_B, where the output is an input - to scorer for example, "processor = lambda x: - x[0]" would return the first element in a - collection x (of, say, strings) this would then - be used in the scoring collection by default, we - use utils.full_process() - - """ - if choices is None or len(choices) == 0: - return [] - - # default, turn whatever the choice is into a workable string - if processor is None: - processor = lambda x: utils.full_process(x) - - # default: wratio - if scorer is None: - scorer = fuzz.WRatio - - sl = list() - - for choice in choices: - processed = processor(choice) - score = scorer(query, processed) - tuple = (choice, score) - sl.append(tuple) - - sl.sort(key=lambda i: i[1], reverse=True) - return sl[:limit] - - -def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5): - """Find best matches above a score in a list of choices, return a - list of tuples containing the match and it's score. - - Convenience method which returns the choices with best scores, see - extract() for full arguments list - - Optional parameter: score_cutoff. - If the choice has a score of less than or equal to score_cutoff - it will not be included on result list - - """ - - best_list = extract(query, choices, processor, scorer, limit) - if len(best_list) > 0: - return list(itertools.takewhile(lambda x: x[1] > score_cutoff, best_list)) - else: - return [] - - -def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0): - """Find the best match above a score in a list of choices, return a - tuple containing the match and it's score if it's above the treshold - or None. - - Convenience method which returns the single best choice, see - extract() for full arguments list - - Optional parameter: score_cutoff. - If the best choice has a score of less than or equal to - score_cutoff we will return none (intuition: not a good enough - match) - - """ - - best_list = extract(query, choices, processor, scorer, limit=1) - if len(best_list) > 0: - best = best_list[0] - if best[1] > score_cutoff: - return best - else: - return None - else: - return None diff --git a/lib/fuzzywuzzy/string_processing.py b/lib/fuzzywuzzy/string_processing.py deleted file mode 100644 index 7c706d98..00000000 --- a/lib/fuzzywuzzy/string_processing.py +++ /dev/null @@ -1,41 +0,0 @@ -from __future__ import unicode_literals -import re - - -class StringProcessor(object): - """ - This class defines method to process strings in the most - efficient way. Ideally all the methods below use unicode strings - for both input and output. - """ - - @classmethod - def replace_non_letters_non_numbers_with_whitespace(cls, a_string): - """ - This function replaces any sequence of non letters and non - numbers with a single white space. - """ - regex = re.compile(r"(?ui)\W") - return regex.sub(" ", a_string) - - @classmethod - def strip(cls, a_string): - """ - This function strips leading and trailing white space. - """ - - return a_string.strip() - - @classmethod - def to_lower_case(cls, a_string): - """ - This function returns the lower-cased version of the string given. - """ - return a_string.lower() - - @classmethod - def to_upper_case(cls, a_string): - """ - This function returns the upper-cased version of the string given. - """ - return a_string.upper() diff --git a/lib/fuzzywuzzy/utils.py b/lib/fuzzywuzzy/utils.py deleted file mode 100644 index 2d3ae3e4..00000000 --- a/lib/fuzzywuzzy/utils.py +++ /dev/null @@ -1,76 +0,0 @@ -from __future__ import unicode_literals -import sys - -from fuzzywuzzy.string_processing import StringProcessor - - -PY3 = sys.version_info[0] == 3 - - -def validate_string(s): - try: - if len(s) > 0: - return True - else: - return False - except: - return False - -bad_chars = str('') # ascii dammit! -for i in range(128, 256): - bad_chars += chr(i) -if PY3: - translation_table = dict((ord(c), None) for c in bad_chars) - - -def asciionly(s): - if PY3: - return s.translate(translation_table) - else: - return s.translate(None, bad_chars) - - -def asciidammit(s): - if type(s) is str: - return asciionly(s) - elif type(s) is unicode: - return asciionly(s.encode('ascii', 'ignore')) - else: - return asciidammit(unicode(s)) - - -def make_type_consistent(s1, s2): - if isinstance(s1, str) and isinstance(s2, str): - return s1, s2 - - elif isinstance(s1, unicode) and isinstance(s2, unicode): - return s1, s2 - - else: - return unicode(s1), unicode(s2) - - -def full_process(s, force_ascii=False): - """Process string by - -- removing all but letters and numbers - -- trim whitespace - -- force to lower case - if force_ascii == True, force convert to ascii""" - - if s is None: - return "" - - if force_ascii: - s = asciidammit(s) - # Keep only Letters and Numbres (see Unicode docs). - string_out = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s) - # Force into lowercase. - string_out = StringProcessor.to_lower_case(string_out) - # Remove leading and trailing whitespaces. - string_out = StringProcessor.strip(string_out) - return string_out - - -def intr(n): - '''Returns a correctly rounded integer''' - return int(round(n)) diff --git a/lib/pysrt/__init__.py b/lib/pysrt/__init__.py deleted file mode 100644 index 34e96717..00000000 --- a/lib/pysrt/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -from pysrt.srttime import SubRipTime -from pysrt.srtitem import SubRipItem -from pysrt.srtfile import SubRipFile -from pysrt.srtexc import Error, InvalidItem, InvalidTimeString -from pysrt.version import VERSION, VERSION_STRING - -__all__ = [ - 'SubRipFile', 'SubRipItem', 'SubRipFile', 'SUPPORT_UTF_32_LE', - 'SUPPORT_UTF_32_BE', 'InvalidItem', 'InvalidTimeString' -] - -ERROR_PASS = SubRipFile.ERROR_PASS -ERROR_LOG = SubRipFile.ERROR_LOG -ERROR_RAISE = SubRipFile.ERROR_RAISE - -open = SubRipFile.open -stream = SubRipFile.stream -from_string = SubRipFile.from_string diff --git a/lib/pysrt/commands.py b/lib/pysrt/commands.py deleted file mode 100644 index 557c663d..00000000 --- a/lib/pysrt/commands.py +++ /dev/null @@ -1,218 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# pylint: disable-all - -import os -import re -import sys -import codecs -import shutil -import argparse -from textwrap import dedent - -from chardet import detect -from pysrt import SubRipFile, SubRipTime, VERSION_STRING - -def underline(string): - return "\033[4m%s\033[0m" % string - - -class TimeAwareArgumentParser(argparse.ArgumentParser): - - RE_TIME_REPRESENTATION = re.compile(r'^\-?(\d+[hms]{0,2}){1,4}$') - - def parse_args(self, args=None, namespace=None): - time_index = -1 - for index, arg in enumerate(args): - match = self.RE_TIME_REPRESENTATION.match(arg) - if match: - time_index = index - break - - if time_index >= 0: - args.insert(time_index, '--') - - return super(TimeAwareArgumentParser, self).parse_args(args, namespace) - - -class SubRipShifter(object): - - BACKUP_EXTENSION = '.bak' - RE_TIME_STRING = re.compile(r'(\d+)([hms]{0,2})') - UNIT_RATIOS = { - 'ms': 1, - '': SubRipTime.SECONDS_RATIO, - 's': SubRipTime.SECONDS_RATIO, - 'm': SubRipTime.MINUTES_RATIO, - 'h': SubRipTime.HOURS_RATIO, - } - DESCRIPTION = dedent("""\ - Srt subtitle editor - - It can either shift, split or change the frame rate. - """) - TIMESTAMP_HELP = "A timestamp in the form: [-][Hh][Mm]S[s][MSms]" - SHIFT_EPILOG = dedent("""\ - - Examples: - 1 minute and 12 seconds foreward (in place): - $ srt -i shift 1m12s movie.srt - - half a second foreward: - $ srt shift 500ms movie.srt > othername.srt - - 1 second and half backward: - $ srt -i shift -1s500ms movie.srt - - 3 seconds backward: - $ srt -i shift -3 movie.srt - """) - RATE_EPILOG = dedent("""\ - - Examples: - Convert 23.9fps subtitles to 25fps: - $ srt -i rate 23.9 25 movie.srt - """) - LIMITS_HELP = "Each parts duration in the form: [Hh][Mm]S[s][MSms]" - SPLIT_EPILOG = dedent("""\ - - Examples: - For a movie in 2 parts with the first part 48 minutes and 18 seconds long: - $ srt split 48m18s movie.srt - => creates movie.1.srt and movie.2.srt - - For a movie in 3 parts of 20 minutes each: - $ srt split 20m 20m movie.srt - => creates movie.1.srt, movie.2.srt and movie.3.srt - """) - FRAME_RATE_HELP = "A frame rate in fps (commonly 23.9 or 25)" - ENCODING_HELP = dedent("""\ - Change file encoding. Useful for players accepting only latin1 subtitles. - List of supported encodings: http://docs.python.org/library/codecs.html#standard-encodings - """) - BREAK_EPILOG = dedent("""\ - Break lines longer than defined length - """) - LENGTH_HELP = "Maximum number of characters per line" - - def __init__(self): - self.output_file_path = None - - def build_parser(self): - parser = TimeAwareArgumentParser(description=self.DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('-i', '--in-place', action='store_true', dest='in_place', - help="Edit file in-place, saving a backup as file.bak (do not works for the split command)") - parser.add_argument('-e', '--output-encoding', metavar=underline('encoding'), action='store', dest='output_encoding', - type=self.parse_encoding, help=self.ENCODING_HELP) - parser.add_argument('-v', '--version', action='version', version='%%(prog)s %s' % VERSION_STRING) - subparsers = parser.add_subparsers(title='commands') - - shift_parser = subparsers.add_parser('shift', help="Shift subtitles by specified time offset", epilog=self.SHIFT_EPILOG, formatter_class=argparse.RawTextHelpFormatter) - shift_parser.add_argument('time_offset', action='store', metavar=underline('offset'), - type=self.parse_time, help=self.TIMESTAMP_HELP) - shift_parser.set_defaults(action=self.shift) - - rate_parser = subparsers.add_parser('rate', help="Convert subtitles from a frame rate to another", epilog=self.RATE_EPILOG, formatter_class=argparse.RawTextHelpFormatter) - rate_parser.add_argument('initial', action='store', type=float, help=self.FRAME_RATE_HELP) - rate_parser.add_argument('final', action='store', type=float, help=self.FRAME_RATE_HELP) - rate_parser.set_defaults(action=self.rate) - - split_parser = subparsers.add_parser('split', help="Split a file in multiple parts", epilog=self.SPLIT_EPILOG, formatter_class=argparse.RawTextHelpFormatter) - split_parser.add_argument('limits', action='store', nargs='+', type=self.parse_time, help=self.LIMITS_HELP) - split_parser.set_defaults(action=self.split) - - break_parser = subparsers.add_parser('break', help="Break long lines", epilog=self.BREAK_EPILOG, formatter_class=argparse.RawTextHelpFormatter) - break_parser.add_argument('length', action='store', type=int, help=self.LENGTH_HELP) - break_parser.set_defaults(action=self.break_lines) - - parser.add_argument('file', action='store') - - return parser - - def run(self, args): - self.arguments = self.build_parser().parse_args(args) - if self.arguments.in_place: - self.create_backup() - self.arguments.action() - - def parse_time(self, time_string): - negative = time_string.startswith('-') - if negative: - time_string = time_string[1:] - ordinal = sum(int(value) * self.UNIT_RATIOS[unit] for value, unit - in self.RE_TIME_STRING.findall(time_string)) - return -ordinal if negative else ordinal - - def parse_encoding(self, encoding_name): - try: - codecs.lookup(encoding_name) - except LookupError as error: - raise argparse.ArgumentTypeError(error.message) - return encoding_name - - def shift(self): - self.input_file.shift(milliseconds=self.arguments.time_offset) - self.input_file.write_into(self.output_file) - - def rate(self): - ratio = self.arguments.final / self.arguments.initial - self.input_file.shift(ratio=ratio) - self.input_file.write_into(self.output_file) - - def split(self): - limits = [0] + self.arguments.limits + [self.input_file[-1].end.ordinal + 1] - base_name, extension = os.path.splitext(self.arguments.file) - for index, (start, end) in enumerate(zip(limits[:-1], limits[1:])): - file_name = '%s.%s%s' % (base_name, index + 1, extension) - part_file = self.input_file.slice(ends_after=start, starts_before=end) - part_file.shift(milliseconds=-start) - part_file.clean_indexes() - part_file.save(path=file_name, encoding=self.output_encoding) - - def create_backup(self): - backup_file = self.arguments.file + self.BACKUP_EXTENSION - if not os.path.exists(backup_file): - shutil.copy2(self.arguments.file, backup_file) - self.output_file_path = self.arguments.file - self.arguments.file = backup_file - - def break_lines(self): - split_re = re.compile(r'(.{,%i})(?:\s+|$)' % self.arguments.length) - for item in self.input_file: - item.text = '\n'.join(split_re.split(item.text)[1::2]) - self.input_file.write_into(self.output_file) - - @property - def output_encoding(self): - return self.arguments.output_encoding or self.input_file.encoding - - @property - def input_file(self): - if not hasattr(self, '_source_file'): - with open(self.arguments.file, 'rb') as f: - content = f.read() - encoding = detect(content).get('encoding') - encoding = self.normalize_encoding(encoding) - - self._source_file = SubRipFile.open(self.arguments.file, - encoding=encoding, error_handling=SubRipFile.ERROR_LOG) - return self._source_file - - @property - def output_file(self): - if not hasattr(self, '_output_file'): - if self.output_file_path: - self._output_file = codecs.open(self.output_file_path, 'w+', encoding=self.output_encoding) - else: - self._output_file = sys.stdout - return self._output_file - - def normalize_encoding(self, encoding): - return encoding.lower().replace('-', '_') - - -def main(): - SubRipShifter().run(sys.argv[1:]) - -if __name__ == '__main__': - main() diff --git a/lib/pysrt/comparablemixin.py b/lib/pysrt/comparablemixin.py deleted file mode 100644 index 3ae70b07..00000000 --- a/lib/pysrt/comparablemixin.py +++ /dev/null @@ -1,26 +0,0 @@ -class ComparableMixin(object): - def _compare(self, other, method): - try: - return method(self._cmpkey(), other._cmpkey()) - except (AttributeError, TypeError): - # _cmpkey not implemented, or return different type, - # so I can't compare with "other". - return NotImplemented - - def __lt__(self, other): - return self._compare(other, lambda s, o: s < o) - - def __le__(self, other): - return self._compare(other, lambda s, o: s <= o) - - def __eq__(self, other): - return self._compare(other, lambda s, o: s == o) - - def __ge__(self, other): - return self._compare(other, lambda s, o: s >= o) - - def __gt__(self, other): - return self._compare(other, lambda s, o: s > o) - - def __ne__(self, other): - return self._compare(other, lambda s, o: s != o) diff --git a/lib/pysrt/compat.py b/lib/pysrt/compat.py deleted file mode 100644 index 653cf320..00000000 --- a/lib/pysrt/compat.py +++ /dev/null @@ -1,24 +0,0 @@ - -import sys - -# Syntax sugar. -_ver = sys.version_info - -#: Python 2.x? -is_py2 = (_ver[0] == 2) - -#: Python 3.x? -is_py3 = (_ver[0] == 3) - -from io import open as io_open - -if is_py2: - builtin_str = str - basestring = basestring - str = unicode - open = io_open -elif is_py3: - builtin_str = str - basestring = (str, bytes) - str = str - open = open diff --git a/lib/pysrt/srtexc.py b/lib/pysrt/srtexc.py deleted file mode 100644 index 971b4709..00000000 --- a/lib/pysrt/srtexc.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Exception classes -""" - - -class Error(Exception): - """ - Pysrt's base exception - """ - pass - - -class InvalidTimeString(Error): - """ - Raised when parser fail on bad formated time strings - """ - pass - - -class InvalidItem(Error): - """ - Raised when parser fail to parse a sub title item - """ - pass - - -class InvalidIndex(InvalidItem): - """ - Raised when parser fail to parse a sub title index - """ - pass diff --git a/lib/pysrt/srtfile.py b/lib/pysrt/srtfile.py deleted file mode 100644 index 350e4b4d..00000000 --- a/lib/pysrt/srtfile.py +++ /dev/null @@ -1,312 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import sys -import codecs - -try: - from collections import UserList -except ImportError: - from UserList import UserList - -from itertools import chain -from copy import copy - -from pysrt.srtexc import Error -from pysrt.srtitem import SubRipItem -from pysrt.compat import str - -BOMS = ((codecs.BOM_UTF32_LE, 'utf_32_le'), - (codecs.BOM_UTF32_BE, 'utf_32_be'), - (codecs.BOM_UTF16_LE, 'utf_16_le'), - (codecs.BOM_UTF16_BE, 'utf_16_be'), - (codecs.BOM_UTF8, 'utf_8')) -CODECS_BOMS = dict((codec, str(bom, codec)) for bom, codec in BOMS) -BIGGER_BOM = max(len(bom) for bom, encoding in BOMS) - - -class SubRipFile(UserList, object): - """ - SubRip file descriptor. - - Provide a pure Python mapping on all metadata. - - SubRipFile(items, eol, path, encoding) - - items -> list of SubRipItem. Default to []. - eol -> str: end of line character. Default to linesep used in opened file - if any else to os.linesep. - path -> str: path where file will be saved. To open an existant file see - SubRipFile.open. - encoding -> str: encoding used at file save. Default to utf-8. - """ - ERROR_PASS = 0 - ERROR_LOG = 1 - ERROR_RAISE = 2 - - DEFAULT_ENCODING = 'utf_8' - - def __init__(self, items=None, eol=None, path=None, encoding='utf-8'): - UserList.__init__(self, items or []) - self._eol = eol - self.path = path - self.encoding = encoding - - def _get_eol(self): - return self._eol or os.linesep - - def _set_eol(self, eol): - self._eol = self._eol or eol - - eol = property(_get_eol, _set_eol) - - def slice(self, starts_before=None, starts_after=None, ends_before=None, - ends_after=None): - """ - slice([starts_before][, starts_after][, ends_before][, ends_after]) \ --> SubRipFile clone - - All arguments are optional, and should be coercible to SubRipTime - object. - - It reduce the set of subtitles to those that match match given time - constraints. - - The returned set is a clone, but still contains references to original - subtitles. So if you shift this returned set, subs contained in the - original SubRipFile instance will be altered too. - - Example: - >>> subs.slice(ends_after={'seconds': 20}).shift(seconds=2) - """ - clone = copy(self) - - if starts_before: - clone.data = (i for i in clone.data if i.start < starts_before) - if starts_after: - clone.data = (i for i in clone.data if i.start > starts_after) - if ends_before: - clone.data = (i for i in clone.data if i.end < ends_before) - if ends_after: - clone.data = (i for i in clone.data if i.end > ends_after) - - clone.data = list(clone.data) - return clone - - def at(self, timestamp=None, **kwargs): - """ - at(timestamp) -> SubRipFile clone - - timestamp argument should be coercible to SubRipFile object. - - A specialization of slice. Return all subtiles visible at the - timestamp mark. - - Example: - >>> subs.at((0, 0, 20, 0)).shift(seconds=2) - >>> subs.at(seconds=20).shift(seconds=2) - """ - time = timestamp or kwargs - return self.slice(starts_before=time, ends_after=time) - - def shift(self, *args, **kwargs): - """shift(hours, minutes, seconds, milliseconds, ratio) - - Shift `start` and `end` attributes of each items of file either by - applying a ratio or by adding an offset. - - `ratio` should be either an int or a float. - Example to convert subtitles from 23.9 fps to 25 fps: - >>> subs.shift(ratio=25/23.9) - - All "time" arguments are optional and have a default value of 0. - Example to delay all subs from 2 seconds and half - >>> subs.shift(seconds=2, milliseconds=500) - """ - for item in self: - item.shift(*args, **kwargs) - - def clean_indexes(self): - """ - clean_indexes() - - Sort subs and reset their index attribute. Should be called after - destructive operations like split or such. - """ - self.sort() - for index, item in enumerate(self): - item.index = index + 1 - - @property - def text(self): - return '\n'.join(i.text for i in self) - - @classmethod - def open(cls, path='', encoding=None, error_handling=ERROR_PASS): - """ - open([path, [encoding]]) - - If you do not provide any encoding, it can be detected if the file - contain a bit order mark, unless it is set to utf-8 as default. - """ - new_file = cls(path=path, encoding=encoding) - source_file = cls._open_unicode_file(path, claimed_encoding=encoding) - new_file.read(source_file, error_handling=error_handling) - source_file.close() - return new_file - - @classmethod - def from_string(cls, source, **kwargs): - """ - from_string(source, **kwargs) -> SubRipFile - - `source` -> a unicode instance or at least a str instance encoded with - `sys.getdefaultencoding()` - """ - error_handling = kwargs.pop('error_handling', None) - new_file = cls(**kwargs) - new_file.read(source.splitlines(True), error_handling=error_handling) - return new_file - - def read(self, source_file, error_handling=ERROR_PASS): - """ - read(source_file, [error_handling]) - - This method parse subtitles contained in `source_file` and append them - to the current instance. - - `source_file` -> Any iterable that yield unicode strings, like a file - opened with `codecs.open()` or an array of unicode. - """ - self.eol = self._guess_eol(source_file) - self.extend(self.stream(source_file, error_handling=error_handling)) - return self - - @classmethod - def stream(cls, source_file, error_handling=ERROR_PASS): - """ - stream(source_file, [error_handling]) - - This method yield SubRipItem instances a soon as they have been parsed - without storing them. It is a kind of SAX parser for .srt files. - - `source_file` -> Any iterable that yield unicode strings, like a file - opened with `codecs.open()` or an array of unicode. - - Example: - >>> import pysrt - >>> import codecs - >>> file = codecs.open('movie.srt', encoding='utf-8') - >>> for sub in pysrt.stream(file): - ... sub.text += "\nHello !" - ... print unicode(sub) - """ - string_buffer = [] - for index, line in enumerate(chain(source_file, '\n')): - if line.strip(): - string_buffer.append(line) - else: - source = string_buffer - string_buffer = [] - if source and all(source): - try: - yield SubRipItem.from_lines(source) - except Error as error: - error.args += (''.join(source), ) - cls._handle_error(error, error_handling, index) - - def save(self, path=None, encoding=None, eol=None): - """ - save([path][, encoding][, eol]) - - Use initial path if no other provided. - Use initial encoding if no other provided. - Use initial eol if no other provided. - """ - path = path or self.path - encoding = encoding or self.encoding - - save_file = codecs.open(path, 'w+', encoding=encoding) - self.write_into(save_file, eol=eol) - save_file.close() - - def write_into(self, output_file, eol=None): - """ - write_into(output_file [, eol]) - - Serialize current state into `output_file`. - - `output_file` -> Any instance that respond to `write()`, typically a - file object - """ - output_eol = eol or self.eol - - for item in self: - string_repr = str(item) - if output_eol != '\n': - string_repr = string_repr.replace('\n', output_eol) - output_file.write(string_repr) - # Only add trailing eol if it's not already present. - # It was kept in the SubRipItem's text before but it really - # belongs here. Existing applications might give us subtitles - # which already contain a trailing eol though. - if not string_repr.endswith(2 * output_eol): - output_file.write(output_eol) - - @classmethod - def _guess_eol(cls, string_iterable): - first_line = cls._get_first_line(string_iterable) - for eol in ('\r\n', '\r', '\n'): - if first_line.endswith(eol): - return eol - return os.linesep - - @classmethod - def _get_first_line(cls, string_iterable): - if hasattr(string_iterable, 'tell'): - previous_position = string_iterable.tell() - - try: - first_line = next(iter(string_iterable)) - except StopIteration: - return '' - if hasattr(string_iterable, 'seek'): - string_iterable.seek(previous_position) - - return first_line - - @classmethod - def _detect_encoding(cls, path): - file_descriptor = open(path, 'rb') - first_chars = file_descriptor.read(BIGGER_BOM) - file_descriptor.close() - - for bom, encoding in BOMS: - if first_chars.startswith(bom): - return encoding - - # TODO: maybe a chardet integration - return cls.DEFAULT_ENCODING - - @classmethod - def _open_unicode_file(cls, path, claimed_encoding=None): - encoding = claimed_encoding or cls._detect_encoding(path) - source_file = codecs.open(path, 'rU', encoding=encoding) - - # get rid of BOM if any - possible_bom = CODECS_BOMS.get(encoding, None) - if possible_bom: - file_bom = source_file.read(len(possible_bom)) - if not file_bom == possible_bom: - source_file.seek(0) # if not rewind - return source_file - - @classmethod - def _handle_error(cls, error, error_handling, index): - if error_handling == cls.ERROR_RAISE: - error.args = (index, ) + error.args - raise error - if error_handling == cls.ERROR_LOG: - name = type(error).__name__ - sys.stderr.write('PySRT-%s(line %s): \n' % (name, index)) - sys.stderr.write(error.args[0].encode('ascii', 'replace')) - sys.stderr.write('\n') diff --git a/lib/pysrt/srtitem.py b/lib/pysrt/srtitem.py deleted file mode 100644 index 4101716b..00000000 --- a/lib/pysrt/srtitem.py +++ /dev/null @@ -1,76 +0,0 @@ -# -*- coding: utf-8 -*- -""" -SubRip's subtitle parser -""" -from pysrt.srtexc import InvalidItem, InvalidIndex -from pysrt.srttime import SubRipTime -from pysrt.comparablemixin import ComparableMixin -from pysrt.compat import str - -class SubRipItem(ComparableMixin): - """ - SubRipItem(index, start, end, text, position) - - index -> int: index of item in file. 0 by default. - start, end -> SubRipTime or coercible. - text -> unicode: text content for item. - position -> unicode: raw srt/vtt "display coordinates" string - """ - ITEM_PATTERN = '%s\n%s --> %s%s\n%s\n' - TIMESTAMP_SEPARATOR = '-->' - - def __init__(self, index=0, start=None, end=None, text='', position=''): - try: - self.index = int(index) - except (TypeError, ValueError): # try to cast as int, but it's not mandatory - self.index = index - - self.start = SubRipTime.coerce(start or 0) - self.end = SubRipTime.coerce(end or 0) - self.position = str(position) - self.text = str(text) - - def __str__(self): - position = ' %s' % self.position if self.position.strip() else '' - return self.ITEM_PATTERN % (self.index, self.start, self.end, - position, self.text) - - def _cmpkey(self): - return (self.start, self.end) - - def shift(self, *args, **kwargs): - """ - shift(hours, minutes, seconds, milliseconds, ratio) - - Add given values to start and end attributes. - All arguments are optional and have a default value of 0. - """ - self.start.shift(*args, **kwargs) - self.end.shift(*args, **kwargs) - - @classmethod - def from_string(cls, source): - return cls.from_lines(source.splitlines(True)) - - @classmethod - def from_lines(cls, lines): - if len(lines) < 2: - raise InvalidItem() - lines = [l.rstrip() for l in lines] - index = None - if cls.TIMESTAMP_SEPARATOR not in lines[0]: - index = lines.pop(0) - start, end, position = cls.split_timestamps(lines[0]) - body = '\n'.join(lines[1:]) - return cls(index, start, end, body, position) - - @classmethod - def split_timestamps(cls, line): - timestamps = line.split(cls.TIMESTAMP_SEPARATOR) - if len(timestamps) != 2: - raise InvalidItem() - start, end_and_position = timestamps - end_and_position = end_and_position.lstrip().split(' ', 1) - end = end_and_position[0] - position = end_and_position[1] if len(end_and_position) > 1 else '' - return (s.strip() for s in (start, end, position)) diff --git a/lib/pysrt/srttime.py b/lib/pysrt/srttime.py deleted file mode 100644 index 95c578f8..00000000 --- a/lib/pysrt/srttime.py +++ /dev/null @@ -1,176 +0,0 @@ -# -*- coding: utf-8 -*- -""" -SubRip's time format parser: HH:MM:SS,mmm -""" -import re -from datetime import time - -from pysrt.srtexc import InvalidTimeString -from pysrt.comparablemixin import ComparableMixin -from pysrt.compat import str, basestring - -class TimeItemDescriptor(object): - # pylint: disable-msg=R0903 - def __init__(self, ratio, super_ratio=0): - self.ratio = int(ratio) - self.super_ratio = int(super_ratio) - - def _get_ordinal(self, instance): - if self.super_ratio: - return instance.ordinal % self.super_ratio - return instance.ordinal - - def __get__(self, instance, klass): - if instance is None: - raise AttributeError - return self._get_ordinal(instance) // self.ratio - - def __set__(self, instance, value): - part = self._get_ordinal(instance) - instance.ordinal % self.ratio - instance.ordinal += value * self.ratio - part - - -class SubRipTime(ComparableMixin): - TIME_PATTERN = '%02d:%02d:%02d,%03d' - TIME_REPR = 'SubRipTime(%d, %d, %d, %d)' - RE_TIME_SEP = re.compile(r'\:|\.|\,') - RE_INTEGER = re.compile(r'^(\d+)') - SECONDS_RATIO = 1000 - MINUTES_RATIO = SECONDS_RATIO * 60 - HOURS_RATIO = MINUTES_RATIO * 60 - - hours = TimeItemDescriptor(HOURS_RATIO) - minutes = TimeItemDescriptor(MINUTES_RATIO, HOURS_RATIO) - seconds = TimeItemDescriptor(SECONDS_RATIO, MINUTES_RATIO) - milliseconds = TimeItemDescriptor(1, SECONDS_RATIO) - - def __init__(self, hours=0, minutes=0, seconds=0, milliseconds=0): - """ - SubRipTime(hours, minutes, seconds, milliseconds) - - All arguments are optional and have a default value of 0. - """ - super(SubRipTime, self).__init__() - self.ordinal = hours * self.HOURS_RATIO \ - + minutes * self.MINUTES_RATIO \ - + seconds * self.SECONDS_RATIO \ - + milliseconds - - def __repr__(self): - return self.TIME_REPR % tuple(self) - - def __str__(self): - if self.ordinal < 0: - # Represent negative times as zero - return str(SubRipTime.from_ordinal(0)) - return self.TIME_PATTERN % tuple(self) - - def _compare(self, other, method): - return super(SubRipTime, self)._compare(self.coerce(other), method) - - def _cmpkey(self): - return self.ordinal - - def __add__(self, other): - return self.from_ordinal(self.ordinal + self.coerce(other).ordinal) - - def __iadd__(self, other): - self.ordinal += self.coerce(other).ordinal - return self - - def __sub__(self, other): - return self.from_ordinal(self.ordinal - self.coerce(other).ordinal) - - def __isub__(self, other): - self.ordinal -= self.coerce(other).ordinal - return self - - def __mul__(self, ratio): - return self.from_ordinal(int(round(self.ordinal * ratio))) - - def __imul__(self, ratio): - self.ordinal = int(round(self.ordinal * ratio)) - return self - - @classmethod - def coerce(cls, other): - """ - Coerce many types to SubRipTime instance. - Supported types: - - str/unicode - - int/long - - datetime.time - - any iterable - - dict - """ - if isinstance(other, SubRipTime): - return other - if isinstance(other, basestring): - return cls.from_string(other) - if isinstance(other, int): - return cls.from_ordinal(other) - if isinstance(other, time): - return cls.from_time(other) - try: - return cls(**other) - except TypeError: - return cls(*other) - - def __iter__(self): - yield self.hours - yield self.minutes - yield self.seconds - yield self.milliseconds - - def shift(self, *args, **kwargs): - """ - shift(hours, minutes, seconds, milliseconds) - - All arguments are optional and have a default value of 0. - """ - if 'ratio' in kwargs: - self *= kwargs.pop('ratio') - self += self.__class__(*args, **kwargs) - - @classmethod - def from_ordinal(cls, ordinal): - """ - int -> SubRipTime corresponding to a total count of milliseconds - """ - return cls(milliseconds=int(ordinal)) - - @classmethod - def from_string(cls, source): - """ - str/unicode(HH:MM:SS,mmm) -> SubRipTime corresponding to serial - raise InvalidTimeString - """ - items = cls.RE_TIME_SEP.split(source) - if len(items) != 4: - raise InvalidTimeString - return cls(*(cls.parse_int(i) for i in items)) - - @classmethod - def parse_int(cls, digits): - try: - return int(digits) - except ValueError: - match = cls.RE_INTEGER.match(digits) - if match: - return int(match.group()) - return 0 - - @classmethod - def from_time(cls, source): - """ - datetime.time -> SubRipTime corresponding to time object - """ - return cls(hours=source.hour, minutes=source.minute, - seconds=source.second, milliseconds=source.microsecond // 1000) - - def to_time(self): - """ - Convert SubRipTime instance into a pure datetime.time object - """ - return time(self.hours, self.minutes, self.seconds, - self.milliseconds * 1000) diff --git a/lib/pysrt/version.py b/lib/pysrt/version.py deleted file mode 100644 index f04e34e8..00000000 --- a/lib/pysrt/version.py +++ /dev/null @@ -1,2 +0,0 @@ -VERSION = (1, 0, 1) -VERSION_STRING = '.'.join(str(i) for i in VERSION)