Remove unused libraries fuzzywuzzy and pysrt

This commit is contained in:
adam 2015-02-20 21:30:23 +08:00
parent 89990d1085
commit c4908144e1
16 changed files with 1 additions and 1460 deletions

View file

@ -1,6 +1,7 @@
### 0.x.x (2015-xx-xx xx:xx:xx UTC)
* Add requirements file for pip (port from midgetspy/sick-beard)
* Remove unused libraries fuzzywuzzy and pysrt
[develop changelog]

View file

@ -1,78 +0,0 @@
#!/usr/bin/env python
# encoding: utf-8
"""
StringMatcher.py
ported from python-Levenshtein
[https://github.com/miohtama/python-Levenshtein]
"""
from Levenshtein import *
from warnings import warn
class StringMatcher:
"""A SequenceMatcher-like class built on the top of Levenshtein"""
def _reset_cache(self):
self._ratio = self._distance = None
self._opcodes = self._editops = self._matching_blocks = None
def __init__(self, isjunk=None, seq1='', seq2=''):
if isjunk:
warn("isjunk not NOT implemented, it will be ignored")
self._str1, self._str2 = seq1, seq2
self._reset_cache()
def set_seqs(self, seq1, seq2):
self._str1, self._str2 = seq1, seq2
self._reset_cache()
def set_seq1(self, seq1):
self._str1 = seq1
self._reset_cache()
def set_seq2(self, seq2):
self._str2 = seq2
self._reset_cache()
def get_opcodes(self):
if not self._opcodes:
if self._editops:
self._opcodes = opcodes(self._editops, self._str1, self._str2)
else:
self._opcodes = opcodes(self._str1, self._str2)
return self._opcodes
def get_editops(self):
if not self._editops:
if self._opcodes:
self._editops = editops(self._opcodes, self._str1, self._str2)
else:
self._editops = editops(self._str1, self._str2)
return self._editops
def get_matching_blocks(self):
if not self._matching_blocks:
self._matching_blocks = matching_blocks(self.get_opcodes(),
self._str1, self._str2)
return self._matching_blocks
def ratio(self):
if not self._ratio:
self._ratio = ratio(self._str1, self._str2)
return self._ratio
def quick_ratio(self):
# This is usually quick enough :o)
if not self._ratio:
self._ratio = ratio(self._str1, self._str2)
return self._ratio
def real_quick_ratio(self):
len1, len2 = len(self._str1), len(self._str2)
return 2.0 * min(len1, len2) / (len1 + len2)
def distance(self):
if not self._distance:
self._distance = distance(self._str1, self._str2)
return self._distance

View file

@ -1,263 +0,0 @@
#!/usr/bin/env python
# encoding: utf-8
"""
fuzz.py
Copyright (c) 2011 Adam Cohen
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
from __future__ import unicode_literals
try:
from StringMatcher import StringMatcher as SequenceMatcher
except:
from difflib import SequenceMatcher
from . import utils
###########################
# Basic Scoring Functions #
###########################
def ratio(s1, s2):
if s1 is None:
raise TypeError("s1 is None")
if s2 is None:
raise TypeError("s2 is None")
s1, s2 = utils.make_type_consistent(s1, s2)
if len(s1) == 0 or len(s2) == 0:
return 0
m = SequenceMatcher(None, s1, s2)
return utils.intr(100 * m.ratio())
# todo: skip duplicate indexes for a little more speed
def partial_ratio(s1, s2):
if s1 is None:
raise TypeError("s1 is None")
if s2 is None:
raise TypeError("s2 is None")
s1, s2 = utils.make_type_consistent(s1, s2)
if len(s1) == 0 or len(s2) == 0:
return 0
if len(s1) <= len(s2):
shorter = s1
longer = s2
else:
shorter = s2
longer = s1
m = SequenceMatcher(None, shorter, longer)
blocks = m.get_matching_blocks()
# each block represents a sequence of matching characters in a string
# of the form (idx_1, idx_2, len)
# the best partial match will block align with at least one of those blocks
# e.g. shorter = "abcd", longer = XXXbcdeEEE
# block = (1,3,3)
# best score === ratio("abcd", "Xbcd")
scores = []
for block in blocks:
long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
long_end = long_start + len(shorter)
long_substr = longer[long_start:long_end]
m2 = SequenceMatcher(None, shorter, long_substr)
r = m2.ratio()
if r > .995:
return 100
else:
scores.append(r)
return int(100 * max(scores))
##############################
# Advanced Scoring Functions #
##############################
# Sorted Token
# find all alphanumeric tokens in the string
# sort those tokens and take ratio of resulting joined strings
# controls for unordered string elements
def _token_sort(s1, s2, partial=True, force_ascii=True):
if s1 is None:
raise TypeError("s1 is None")
if s2 is None:
raise TypeError("s2 is None")
# pull tokens
tokens1 = utils.full_process(s1, force_ascii=force_ascii).split()
tokens2 = utils.full_process(s2, force_ascii=force_ascii).split()
# sort tokens and join
sorted1 = " ".join(sorted(tokens1))
sorted2 = " ".join(sorted(tokens2))
sorted1 = sorted1.strip()
sorted2 = sorted2.strip()
if partial:
return partial_ratio(sorted1, sorted2)
else:
return ratio(sorted1, sorted2)
def token_sort_ratio(s1, s2, force_ascii=True):
return _token_sort(s1, s2, partial=False, force_ascii=force_ascii)
def partial_token_sort_ratio(s1, s2, force_ascii=True):
return _token_sort(s1, s2, partial=True, force_ascii=force_ascii)
# Token Set
# find all alphanumeric tokens in each string...treat them as a set
# construct two strings of the form
# <sorted_intersection><sorted_remainder>
# take ratios of those two strings
# controls for unordered partial matches
def _token_set(s1, s2, partial=True, force_ascii=True):
if s1 is None:
raise TypeError("s1 is None")
if s2 is None:
raise TypeError("s2 is None")
p1 = utils.full_process(s1, force_ascii=force_ascii)
p2 = utils.full_process(s2, force_ascii=force_ascii)
if not utils.validate_string(p1):
return 0
if not utils.validate_string(p2):
return 0
# pull tokens
tokens1 = set(utils.full_process(p1).split())
tokens2 = set(utils.full_process(p2).split())
intersection = tokens1.intersection(tokens2)
diff1to2 = tokens1.difference(tokens2)
diff2to1 = tokens2.difference(tokens1)
sorted_sect = " ".join(sorted(intersection))
sorted_1to2 = " ".join(sorted(diff1to2))
sorted_2to1 = " ".join(sorted(diff2to1))
combined_1to2 = sorted_sect + " " + sorted_1to2
combined_2to1 = sorted_sect + " " + sorted_2to1
# strip
sorted_sect = sorted_sect.strip()
combined_1to2 = combined_1to2.strip()
combined_2to1 = combined_2to1.strip()
pairwise = [
ratio(sorted_sect, combined_1to2),
ratio(sorted_sect, combined_2to1),
ratio(combined_1to2, combined_2to1)
]
return max(pairwise)
def token_set_ratio(s1, s2, force_ascii=True):
return _token_set(s1, s2, partial=False, force_ascii=force_ascii)
def partial_token_set_ratio(s1, s2, force_ascii=True):
return _token_set(s1, s2, partial=True, force_ascii=force_ascii)
# TODO: numerics
###################
# Combination API #
###################
# q is for quick
def QRatio(s1, s2, force_ascii=True):
p1 = utils.full_process(s1, force_ascii=force_ascii)
p2 = utils.full_process(s2, force_ascii=force_ascii)
if not utils.validate_string(p1):
return 0
if not utils.validate_string(p2):
return 0
return ratio(p1, p2)
def UQRatio(s1, s2):
return QRatio(s1, s2, force_ascii=False)
# w is for weighted
def WRatio(s1, s2, force_ascii=True):
p1 = utils.full_process(s1, force_ascii=force_ascii)
p2 = utils.full_process(s2, force_ascii=force_ascii)
if not utils.validate_string(p1):
return 0
if not utils.validate_string(p2):
return 0
# should we look at partials?
try_partial = True
unbase_scale = .95
partial_scale = .90
base = ratio(p1, p2)
len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))
# if strings are similar length, don't use partials
if len_ratio < 1.5:
try_partial = False
# if one string is much much shorter than the other
if len_ratio > 8:
partial_scale = .6
if try_partial:
partial = partial_ratio(p1, p2) * partial_scale
ptsor = partial_token_sort_ratio(p1, p2, force_ascii=force_ascii) \
* unbase_scale * partial_scale
ptser = partial_token_set_ratio(p1, p2, force_ascii=force_ascii) \
* unbase_scale * partial_scale
return int(max(base, partial, ptsor, ptser))
else:
tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
return int(max(base, tsor, tser))
def UWRatio(s1, s2):
return WRatio(s1, s2, force_ascii=False)

View file

@ -1,119 +0,0 @@
#!/usr/bin/env python
# encoding: utf-8
"""
process.py
Copyright (c) 2011 Adam Cohen
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
import itertools
from . import fuzz
from . import utils
def extract(query, choices, processor=None, scorer=None, limit=5):
"""Find best matches in a list of choices, return a list of tuples
containing the match and it's score.
Arguments:
query -- an object representing the thing we want to find
choices -- a list of objects we are attempting to extract
values from
scorer -- f(OBJ, QUERY) --> INT. We will return the objects
with the highest score by default, we use
score.WRatio() and both OBJ and QUERY should be
strings
processor -- f(OBJ_A) --> OBJ_B, where the output is an input
to scorer for example, "processor = lambda x:
x[0]" would return the first element in a
collection x (of, say, strings) this would then
be used in the scoring collection by default, we
use utils.full_process()
"""
if choices is None or len(choices) == 0:
return []
# default, turn whatever the choice is into a workable string
if processor is None:
processor = lambda x: utils.full_process(x)
# default: wratio
if scorer is None:
scorer = fuzz.WRatio
sl = list()
for choice in choices:
processed = processor(choice)
score = scorer(query, processed)
tuple = (choice, score)
sl.append(tuple)
sl.sort(key=lambda i: i[1], reverse=True)
return sl[:limit]
def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5):
"""Find best matches above a score in a list of choices, return a
list of tuples containing the match and it's score.
Convenience method which returns the choices with best scores, see
extract() for full arguments list
Optional parameter: score_cutoff.
If the choice has a score of less than or equal to score_cutoff
it will not be included on result list
"""
best_list = extract(query, choices, processor, scorer, limit)
if len(best_list) > 0:
return list(itertools.takewhile(lambda x: x[1] > score_cutoff, best_list))
else:
return []
def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0):
"""Find the best match above a score in a list of choices, return a
tuple containing the match and it's score if it's above the treshold
or None.
Convenience method which returns the single best choice, see
extract() for full arguments list
Optional parameter: score_cutoff.
If the best choice has a score of less than or equal to
score_cutoff we will return none (intuition: not a good enough
match)
"""
best_list = extract(query, choices, processor, scorer, limit=1)
if len(best_list) > 0:
best = best_list[0]
if best[1] > score_cutoff:
return best
else:
return None
else:
return None

View file

@ -1,41 +0,0 @@
from __future__ import unicode_literals
import re
class StringProcessor(object):
"""
This class defines method to process strings in the most
efficient way. Ideally all the methods below use unicode strings
for both input and output.
"""
@classmethod
def replace_non_letters_non_numbers_with_whitespace(cls, a_string):
"""
This function replaces any sequence of non letters and non
numbers with a single white space.
"""
regex = re.compile(r"(?ui)\W")
return regex.sub(" ", a_string)
@classmethod
def strip(cls, a_string):
"""
This function strips leading and trailing white space.
"""
return a_string.strip()
@classmethod
def to_lower_case(cls, a_string):
"""
This function returns the lower-cased version of the string given.
"""
return a_string.lower()
@classmethod
def to_upper_case(cls, a_string):
"""
This function returns the upper-cased version of the string given.
"""
return a_string.upper()

View file

@ -1,76 +0,0 @@
from __future__ import unicode_literals
import sys
from fuzzywuzzy.string_processing import StringProcessor
PY3 = sys.version_info[0] == 3
def validate_string(s):
try:
if len(s) > 0:
return True
else:
return False
except:
return False
bad_chars = str('') # ascii dammit!
for i in range(128, 256):
bad_chars += chr(i)
if PY3:
translation_table = dict((ord(c), None) for c in bad_chars)
def asciionly(s):
if PY3:
return s.translate(translation_table)
else:
return s.translate(None, bad_chars)
def asciidammit(s):
if type(s) is str:
return asciionly(s)
elif type(s) is unicode:
return asciionly(s.encode('ascii', 'ignore'))
else:
return asciidammit(unicode(s))
def make_type_consistent(s1, s2):
if isinstance(s1, str) and isinstance(s2, str):
return s1, s2
elif isinstance(s1, unicode) and isinstance(s2, unicode):
return s1, s2
else:
return unicode(s1), unicode(s2)
def full_process(s, force_ascii=False):
"""Process string by
-- removing all but letters and numbers
-- trim whitespace
-- force to lower case
if force_ascii == True, force convert to ascii"""
if s is None:
return ""
if force_ascii:
s = asciidammit(s)
# Keep only Letters and Numbres (see Unicode docs).
string_out = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s)
# Force into lowercase.
string_out = StringProcessor.to_lower_case(string_out)
# Remove leading and trailing whitespaces.
string_out = StringProcessor.strip(string_out)
return string_out
def intr(n):
'''Returns a correctly rounded integer'''
return int(round(n))

View file

@ -1,18 +0,0 @@
from pysrt.srttime import SubRipTime
from pysrt.srtitem import SubRipItem
from pysrt.srtfile import SubRipFile
from pysrt.srtexc import Error, InvalidItem, InvalidTimeString
from pysrt.version import VERSION, VERSION_STRING
__all__ = [
'SubRipFile', 'SubRipItem', 'SubRipFile', 'SUPPORT_UTF_32_LE',
'SUPPORT_UTF_32_BE', 'InvalidItem', 'InvalidTimeString'
]
ERROR_PASS = SubRipFile.ERROR_PASS
ERROR_LOG = SubRipFile.ERROR_LOG
ERROR_RAISE = SubRipFile.ERROR_RAISE
open = SubRipFile.open
stream = SubRipFile.stream
from_string = SubRipFile.from_string

View file

@ -1,218 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pylint: disable-all
import os
import re
import sys
import codecs
import shutil
import argparse
from textwrap import dedent
from chardet import detect
from pysrt import SubRipFile, SubRipTime, VERSION_STRING
def underline(string):
return "\033[4m%s\033[0m" % string
class TimeAwareArgumentParser(argparse.ArgumentParser):
RE_TIME_REPRESENTATION = re.compile(r'^\-?(\d+[hms]{0,2}){1,4}$')
def parse_args(self, args=None, namespace=None):
time_index = -1
for index, arg in enumerate(args):
match = self.RE_TIME_REPRESENTATION.match(arg)
if match:
time_index = index
break
if time_index >= 0:
args.insert(time_index, '--')
return super(TimeAwareArgumentParser, self).parse_args(args, namespace)
class SubRipShifter(object):
BACKUP_EXTENSION = '.bak'
RE_TIME_STRING = re.compile(r'(\d+)([hms]{0,2})')
UNIT_RATIOS = {
'ms': 1,
'': SubRipTime.SECONDS_RATIO,
's': SubRipTime.SECONDS_RATIO,
'm': SubRipTime.MINUTES_RATIO,
'h': SubRipTime.HOURS_RATIO,
}
DESCRIPTION = dedent("""\
Srt subtitle editor
It can either shift, split or change the frame rate.
""")
TIMESTAMP_HELP = "A timestamp in the form: [-][Hh][Mm]S[s][MSms]"
SHIFT_EPILOG = dedent("""\
Examples:
1 minute and 12 seconds foreward (in place):
$ srt -i shift 1m12s movie.srt
half a second foreward:
$ srt shift 500ms movie.srt > othername.srt
1 second and half backward:
$ srt -i shift -1s500ms movie.srt
3 seconds backward:
$ srt -i shift -3 movie.srt
""")
RATE_EPILOG = dedent("""\
Examples:
Convert 23.9fps subtitles to 25fps:
$ srt -i rate 23.9 25 movie.srt
""")
LIMITS_HELP = "Each parts duration in the form: [Hh][Mm]S[s][MSms]"
SPLIT_EPILOG = dedent("""\
Examples:
For a movie in 2 parts with the first part 48 minutes and 18 seconds long:
$ srt split 48m18s movie.srt
=> creates movie.1.srt and movie.2.srt
For a movie in 3 parts of 20 minutes each:
$ srt split 20m 20m movie.srt
=> creates movie.1.srt, movie.2.srt and movie.3.srt
""")
FRAME_RATE_HELP = "A frame rate in fps (commonly 23.9 or 25)"
ENCODING_HELP = dedent("""\
Change file encoding. Useful for players accepting only latin1 subtitles.
List of supported encodings: http://docs.python.org/library/codecs.html#standard-encodings
""")
BREAK_EPILOG = dedent("""\
Break lines longer than defined length
""")
LENGTH_HELP = "Maximum number of characters per line"
def __init__(self):
self.output_file_path = None
def build_parser(self):
parser = TimeAwareArgumentParser(description=self.DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('-i', '--in-place', action='store_true', dest='in_place',
help="Edit file in-place, saving a backup as file.bak (do not works for the split command)")
parser.add_argument('-e', '--output-encoding', metavar=underline('encoding'), action='store', dest='output_encoding',
type=self.parse_encoding, help=self.ENCODING_HELP)
parser.add_argument('-v', '--version', action='version', version='%%(prog)s %s' % VERSION_STRING)
subparsers = parser.add_subparsers(title='commands')
shift_parser = subparsers.add_parser('shift', help="Shift subtitles by specified time offset", epilog=self.SHIFT_EPILOG, formatter_class=argparse.RawTextHelpFormatter)
shift_parser.add_argument('time_offset', action='store', metavar=underline('offset'),
type=self.parse_time, help=self.TIMESTAMP_HELP)
shift_parser.set_defaults(action=self.shift)
rate_parser = subparsers.add_parser('rate', help="Convert subtitles from a frame rate to another", epilog=self.RATE_EPILOG, formatter_class=argparse.RawTextHelpFormatter)
rate_parser.add_argument('initial', action='store', type=float, help=self.FRAME_RATE_HELP)
rate_parser.add_argument('final', action='store', type=float, help=self.FRAME_RATE_HELP)
rate_parser.set_defaults(action=self.rate)
split_parser = subparsers.add_parser('split', help="Split a file in multiple parts", epilog=self.SPLIT_EPILOG, formatter_class=argparse.RawTextHelpFormatter)
split_parser.add_argument('limits', action='store', nargs='+', type=self.parse_time, help=self.LIMITS_HELP)
split_parser.set_defaults(action=self.split)
break_parser = subparsers.add_parser('break', help="Break long lines", epilog=self.BREAK_EPILOG, formatter_class=argparse.RawTextHelpFormatter)
break_parser.add_argument('length', action='store', type=int, help=self.LENGTH_HELP)
break_parser.set_defaults(action=self.break_lines)
parser.add_argument('file', action='store')
return parser
def run(self, args):
self.arguments = self.build_parser().parse_args(args)
if self.arguments.in_place:
self.create_backup()
self.arguments.action()
def parse_time(self, time_string):
negative = time_string.startswith('-')
if negative:
time_string = time_string[1:]
ordinal = sum(int(value) * self.UNIT_RATIOS[unit] for value, unit
in self.RE_TIME_STRING.findall(time_string))
return -ordinal if negative else ordinal
def parse_encoding(self, encoding_name):
try:
codecs.lookup(encoding_name)
except LookupError as error:
raise argparse.ArgumentTypeError(error.message)
return encoding_name
def shift(self):
self.input_file.shift(milliseconds=self.arguments.time_offset)
self.input_file.write_into(self.output_file)
def rate(self):
ratio = self.arguments.final / self.arguments.initial
self.input_file.shift(ratio=ratio)
self.input_file.write_into(self.output_file)
def split(self):
limits = [0] + self.arguments.limits + [self.input_file[-1].end.ordinal + 1]
base_name, extension = os.path.splitext(self.arguments.file)
for index, (start, end) in enumerate(zip(limits[:-1], limits[1:])):
file_name = '%s.%s%s' % (base_name, index + 1, extension)
part_file = self.input_file.slice(ends_after=start, starts_before=end)
part_file.shift(milliseconds=-start)
part_file.clean_indexes()
part_file.save(path=file_name, encoding=self.output_encoding)
def create_backup(self):
backup_file = self.arguments.file + self.BACKUP_EXTENSION
if not os.path.exists(backup_file):
shutil.copy2(self.arguments.file, backup_file)
self.output_file_path = self.arguments.file
self.arguments.file = backup_file
def break_lines(self):
split_re = re.compile(r'(.{,%i})(?:\s+|$)' % self.arguments.length)
for item in self.input_file:
item.text = '\n'.join(split_re.split(item.text)[1::2])
self.input_file.write_into(self.output_file)
@property
def output_encoding(self):
return self.arguments.output_encoding or self.input_file.encoding
@property
def input_file(self):
if not hasattr(self, '_source_file'):
with open(self.arguments.file, 'rb') as f:
content = f.read()
encoding = detect(content).get('encoding')
encoding = self.normalize_encoding(encoding)
self._source_file = SubRipFile.open(self.arguments.file,
encoding=encoding, error_handling=SubRipFile.ERROR_LOG)
return self._source_file
@property
def output_file(self):
if not hasattr(self, '_output_file'):
if self.output_file_path:
self._output_file = codecs.open(self.output_file_path, 'w+', encoding=self.output_encoding)
else:
self._output_file = sys.stdout
return self._output_file
def normalize_encoding(self, encoding):
return encoding.lower().replace('-', '_')
def main():
SubRipShifter().run(sys.argv[1:])
if __name__ == '__main__':
main()

View file

@ -1,26 +0,0 @@
class ComparableMixin(object):
def _compare(self, other, method):
try:
return method(self._cmpkey(), other._cmpkey())
except (AttributeError, TypeError):
# _cmpkey not implemented, or return different type,
# so I can't compare with "other".
return NotImplemented
def __lt__(self, other):
return self._compare(other, lambda s, o: s < o)
def __le__(self, other):
return self._compare(other, lambda s, o: s <= o)
def __eq__(self, other):
return self._compare(other, lambda s, o: s == o)
def __ge__(self, other):
return self._compare(other, lambda s, o: s >= o)
def __gt__(self, other):
return self._compare(other, lambda s, o: s > o)
def __ne__(self, other):
return self._compare(other, lambda s, o: s != o)

View file

@ -1,24 +0,0 @@
import sys
# Syntax sugar.
_ver = sys.version_info
#: Python 2.x?
is_py2 = (_ver[0] == 2)
#: Python 3.x?
is_py3 = (_ver[0] == 3)
from io import open as io_open
if is_py2:
builtin_str = str
basestring = basestring
str = unicode
open = io_open
elif is_py3:
builtin_str = str
basestring = (str, bytes)
str = str
open = open

View file

@ -1,31 +0,0 @@
"""
Exception classes
"""
class Error(Exception):
"""
Pysrt's base exception
"""
pass
class InvalidTimeString(Error):
"""
Raised when parser fail on bad formated time strings
"""
pass
class InvalidItem(Error):
"""
Raised when parser fail to parse a sub title item
"""
pass
class InvalidIndex(InvalidItem):
"""
Raised when parser fail to parse a sub title index
"""
pass

View file

@ -1,312 +0,0 @@
# -*- coding: utf-8 -*-
import os
import sys
import codecs
try:
from collections import UserList
except ImportError:
from UserList import UserList
from itertools import chain
from copy import copy
from pysrt.srtexc import Error
from pysrt.srtitem import SubRipItem
from pysrt.compat import str
BOMS = ((codecs.BOM_UTF32_LE, 'utf_32_le'),
(codecs.BOM_UTF32_BE, 'utf_32_be'),
(codecs.BOM_UTF16_LE, 'utf_16_le'),
(codecs.BOM_UTF16_BE, 'utf_16_be'),
(codecs.BOM_UTF8, 'utf_8'))
CODECS_BOMS = dict((codec, str(bom, codec)) for bom, codec in BOMS)
BIGGER_BOM = max(len(bom) for bom, encoding in BOMS)
class SubRipFile(UserList, object):
"""
SubRip file descriptor.
Provide a pure Python mapping on all metadata.
SubRipFile(items, eol, path, encoding)
items -> list of SubRipItem. Default to [].
eol -> str: end of line character. Default to linesep used in opened file
if any else to os.linesep.
path -> str: path where file will be saved. To open an existant file see
SubRipFile.open.
encoding -> str: encoding used at file save. Default to utf-8.
"""
ERROR_PASS = 0
ERROR_LOG = 1
ERROR_RAISE = 2
DEFAULT_ENCODING = 'utf_8'
def __init__(self, items=None, eol=None, path=None, encoding='utf-8'):
UserList.__init__(self, items or [])
self._eol = eol
self.path = path
self.encoding = encoding
def _get_eol(self):
return self._eol or os.linesep
def _set_eol(self, eol):
self._eol = self._eol or eol
eol = property(_get_eol, _set_eol)
def slice(self, starts_before=None, starts_after=None, ends_before=None,
ends_after=None):
"""
slice([starts_before][, starts_after][, ends_before][, ends_after]) \
-> SubRipFile clone
All arguments are optional, and should be coercible to SubRipTime
object.
It reduce the set of subtitles to those that match match given time
constraints.
The returned set is a clone, but still contains references to original
subtitles. So if you shift this returned set, subs contained in the
original SubRipFile instance will be altered too.
Example:
>>> subs.slice(ends_after={'seconds': 20}).shift(seconds=2)
"""
clone = copy(self)
if starts_before:
clone.data = (i for i in clone.data if i.start < starts_before)
if starts_after:
clone.data = (i for i in clone.data if i.start > starts_after)
if ends_before:
clone.data = (i for i in clone.data if i.end < ends_before)
if ends_after:
clone.data = (i for i in clone.data if i.end > ends_after)
clone.data = list(clone.data)
return clone
def at(self, timestamp=None, **kwargs):
"""
at(timestamp) -> SubRipFile clone
timestamp argument should be coercible to SubRipFile object.
A specialization of slice. Return all subtiles visible at the
timestamp mark.
Example:
>>> subs.at((0, 0, 20, 0)).shift(seconds=2)
>>> subs.at(seconds=20).shift(seconds=2)
"""
time = timestamp or kwargs
return self.slice(starts_before=time, ends_after=time)
def shift(self, *args, **kwargs):
"""shift(hours, minutes, seconds, milliseconds, ratio)
Shift `start` and `end` attributes of each items of file either by
applying a ratio or by adding an offset.
`ratio` should be either an int or a float.
Example to convert subtitles from 23.9 fps to 25 fps:
>>> subs.shift(ratio=25/23.9)
All "time" arguments are optional and have a default value of 0.
Example to delay all subs from 2 seconds and half
>>> subs.shift(seconds=2, milliseconds=500)
"""
for item in self:
item.shift(*args, **kwargs)
def clean_indexes(self):
"""
clean_indexes()
Sort subs and reset their index attribute. Should be called after
destructive operations like split or such.
"""
self.sort()
for index, item in enumerate(self):
item.index = index + 1
@property
def text(self):
return '\n'.join(i.text for i in self)
@classmethod
def open(cls, path='', encoding=None, error_handling=ERROR_PASS):
"""
open([path, [encoding]])
If you do not provide any encoding, it can be detected if the file
contain a bit order mark, unless it is set to utf-8 as default.
"""
new_file = cls(path=path, encoding=encoding)
source_file = cls._open_unicode_file(path, claimed_encoding=encoding)
new_file.read(source_file, error_handling=error_handling)
source_file.close()
return new_file
@classmethod
def from_string(cls, source, **kwargs):
"""
from_string(source, **kwargs) -> SubRipFile
`source` -> a unicode instance or at least a str instance encoded with
`sys.getdefaultencoding()`
"""
error_handling = kwargs.pop('error_handling', None)
new_file = cls(**kwargs)
new_file.read(source.splitlines(True), error_handling=error_handling)
return new_file
def read(self, source_file, error_handling=ERROR_PASS):
"""
read(source_file, [error_handling])
This method parse subtitles contained in `source_file` and append them
to the current instance.
`source_file` -> Any iterable that yield unicode strings, like a file
opened with `codecs.open()` or an array of unicode.
"""
self.eol = self._guess_eol(source_file)
self.extend(self.stream(source_file, error_handling=error_handling))
return self
@classmethod
def stream(cls, source_file, error_handling=ERROR_PASS):
"""
stream(source_file, [error_handling])
This method yield SubRipItem instances a soon as they have been parsed
without storing them. It is a kind of SAX parser for .srt files.
`source_file` -> Any iterable that yield unicode strings, like a file
opened with `codecs.open()` or an array of unicode.
Example:
>>> import pysrt
>>> import codecs
>>> file = codecs.open('movie.srt', encoding='utf-8')
>>> for sub in pysrt.stream(file):
... sub.text += "\nHello !"
... print unicode(sub)
"""
string_buffer = []
for index, line in enumerate(chain(source_file, '\n')):
if line.strip():
string_buffer.append(line)
else:
source = string_buffer
string_buffer = []
if source and all(source):
try:
yield SubRipItem.from_lines(source)
except Error as error:
error.args += (''.join(source), )
cls._handle_error(error, error_handling, index)
def save(self, path=None, encoding=None, eol=None):
"""
save([path][, encoding][, eol])
Use initial path if no other provided.
Use initial encoding if no other provided.
Use initial eol if no other provided.
"""
path = path or self.path
encoding = encoding or self.encoding
save_file = codecs.open(path, 'w+', encoding=encoding)
self.write_into(save_file, eol=eol)
save_file.close()
def write_into(self, output_file, eol=None):
"""
write_into(output_file [, eol])
Serialize current state into `output_file`.
`output_file` -> Any instance that respond to `write()`, typically a
file object
"""
output_eol = eol or self.eol
for item in self:
string_repr = str(item)
if output_eol != '\n':
string_repr = string_repr.replace('\n', output_eol)
output_file.write(string_repr)
# Only add trailing eol if it's not already present.
# It was kept in the SubRipItem's text before but it really
# belongs here. Existing applications might give us subtitles
# which already contain a trailing eol though.
if not string_repr.endswith(2 * output_eol):
output_file.write(output_eol)
@classmethod
def _guess_eol(cls, string_iterable):
first_line = cls._get_first_line(string_iterable)
for eol in ('\r\n', '\r', '\n'):
if first_line.endswith(eol):
return eol
return os.linesep
@classmethod
def _get_first_line(cls, string_iterable):
if hasattr(string_iterable, 'tell'):
previous_position = string_iterable.tell()
try:
first_line = next(iter(string_iterable))
except StopIteration:
return ''
if hasattr(string_iterable, 'seek'):
string_iterable.seek(previous_position)
return first_line
@classmethod
def _detect_encoding(cls, path):
file_descriptor = open(path, 'rb')
first_chars = file_descriptor.read(BIGGER_BOM)
file_descriptor.close()
for bom, encoding in BOMS:
if first_chars.startswith(bom):
return encoding
# TODO: maybe a chardet integration
return cls.DEFAULT_ENCODING
@classmethod
def _open_unicode_file(cls, path, claimed_encoding=None):
encoding = claimed_encoding or cls._detect_encoding(path)
source_file = codecs.open(path, 'rU', encoding=encoding)
# get rid of BOM if any
possible_bom = CODECS_BOMS.get(encoding, None)
if possible_bom:
file_bom = source_file.read(len(possible_bom))
if not file_bom == possible_bom:
source_file.seek(0) # if not rewind
return source_file
@classmethod
def _handle_error(cls, error, error_handling, index):
if error_handling == cls.ERROR_RAISE:
error.args = (index, ) + error.args
raise error
if error_handling == cls.ERROR_LOG:
name = type(error).__name__
sys.stderr.write('PySRT-%s(line %s): \n' % (name, index))
sys.stderr.write(error.args[0].encode('ascii', 'replace'))
sys.stderr.write('\n')

View file

@ -1,76 +0,0 @@
# -*- coding: utf-8 -*-
"""
SubRip's subtitle parser
"""
from pysrt.srtexc import InvalidItem, InvalidIndex
from pysrt.srttime import SubRipTime
from pysrt.comparablemixin import ComparableMixin
from pysrt.compat import str
class SubRipItem(ComparableMixin):
"""
SubRipItem(index, start, end, text, position)
index -> int: index of item in file. 0 by default.
start, end -> SubRipTime or coercible.
text -> unicode: text content for item.
position -> unicode: raw srt/vtt "display coordinates" string
"""
ITEM_PATTERN = '%s\n%s --> %s%s\n%s\n'
TIMESTAMP_SEPARATOR = '-->'
def __init__(self, index=0, start=None, end=None, text='', position=''):
try:
self.index = int(index)
except (TypeError, ValueError): # try to cast as int, but it's not mandatory
self.index = index
self.start = SubRipTime.coerce(start or 0)
self.end = SubRipTime.coerce(end or 0)
self.position = str(position)
self.text = str(text)
def __str__(self):
position = ' %s' % self.position if self.position.strip() else ''
return self.ITEM_PATTERN % (self.index, self.start, self.end,
position, self.text)
def _cmpkey(self):
return (self.start, self.end)
def shift(self, *args, **kwargs):
"""
shift(hours, minutes, seconds, milliseconds, ratio)
Add given values to start and end attributes.
All arguments are optional and have a default value of 0.
"""
self.start.shift(*args, **kwargs)
self.end.shift(*args, **kwargs)
@classmethod
def from_string(cls, source):
return cls.from_lines(source.splitlines(True))
@classmethod
def from_lines(cls, lines):
if len(lines) < 2:
raise InvalidItem()
lines = [l.rstrip() for l in lines]
index = None
if cls.TIMESTAMP_SEPARATOR not in lines[0]:
index = lines.pop(0)
start, end, position = cls.split_timestamps(lines[0])
body = '\n'.join(lines[1:])
return cls(index, start, end, body, position)
@classmethod
def split_timestamps(cls, line):
timestamps = line.split(cls.TIMESTAMP_SEPARATOR)
if len(timestamps) != 2:
raise InvalidItem()
start, end_and_position = timestamps
end_and_position = end_and_position.lstrip().split(' ', 1)
end = end_and_position[0]
position = end_and_position[1] if len(end_and_position) > 1 else ''
return (s.strip() for s in (start, end, position))

View file

@ -1,176 +0,0 @@
# -*- coding: utf-8 -*-
"""
SubRip's time format parser: HH:MM:SS,mmm
"""
import re
from datetime import time
from pysrt.srtexc import InvalidTimeString
from pysrt.comparablemixin import ComparableMixin
from pysrt.compat import str, basestring
class TimeItemDescriptor(object):
# pylint: disable-msg=R0903
def __init__(self, ratio, super_ratio=0):
self.ratio = int(ratio)
self.super_ratio = int(super_ratio)
def _get_ordinal(self, instance):
if self.super_ratio:
return instance.ordinal % self.super_ratio
return instance.ordinal
def __get__(self, instance, klass):
if instance is None:
raise AttributeError
return self._get_ordinal(instance) // self.ratio
def __set__(self, instance, value):
part = self._get_ordinal(instance) - instance.ordinal % self.ratio
instance.ordinal += value * self.ratio - part
class SubRipTime(ComparableMixin):
TIME_PATTERN = '%02d:%02d:%02d,%03d'
TIME_REPR = 'SubRipTime(%d, %d, %d, %d)'
RE_TIME_SEP = re.compile(r'\:|\.|\,')
RE_INTEGER = re.compile(r'^(\d+)')
SECONDS_RATIO = 1000
MINUTES_RATIO = SECONDS_RATIO * 60
HOURS_RATIO = MINUTES_RATIO * 60
hours = TimeItemDescriptor(HOURS_RATIO)
minutes = TimeItemDescriptor(MINUTES_RATIO, HOURS_RATIO)
seconds = TimeItemDescriptor(SECONDS_RATIO, MINUTES_RATIO)
milliseconds = TimeItemDescriptor(1, SECONDS_RATIO)
def __init__(self, hours=0, minutes=0, seconds=0, milliseconds=0):
"""
SubRipTime(hours, minutes, seconds, milliseconds)
All arguments are optional and have a default value of 0.
"""
super(SubRipTime, self).__init__()
self.ordinal = hours * self.HOURS_RATIO \
+ minutes * self.MINUTES_RATIO \
+ seconds * self.SECONDS_RATIO \
+ milliseconds
def __repr__(self):
return self.TIME_REPR % tuple(self)
def __str__(self):
if self.ordinal < 0:
# Represent negative times as zero
return str(SubRipTime.from_ordinal(0))
return self.TIME_PATTERN % tuple(self)
def _compare(self, other, method):
return super(SubRipTime, self)._compare(self.coerce(other), method)
def _cmpkey(self):
return self.ordinal
def __add__(self, other):
return self.from_ordinal(self.ordinal + self.coerce(other).ordinal)
def __iadd__(self, other):
self.ordinal += self.coerce(other).ordinal
return self
def __sub__(self, other):
return self.from_ordinal(self.ordinal - self.coerce(other).ordinal)
def __isub__(self, other):
self.ordinal -= self.coerce(other).ordinal
return self
def __mul__(self, ratio):
return self.from_ordinal(int(round(self.ordinal * ratio)))
def __imul__(self, ratio):
self.ordinal = int(round(self.ordinal * ratio))
return self
@classmethod
def coerce(cls, other):
"""
Coerce many types to SubRipTime instance.
Supported types:
- str/unicode
- int/long
- datetime.time
- any iterable
- dict
"""
if isinstance(other, SubRipTime):
return other
if isinstance(other, basestring):
return cls.from_string(other)
if isinstance(other, int):
return cls.from_ordinal(other)
if isinstance(other, time):
return cls.from_time(other)
try:
return cls(**other)
except TypeError:
return cls(*other)
def __iter__(self):
yield self.hours
yield self.minutes
yield self.seconds
yield self.milliseconds
def shift(self, *args, **kwargs):
"""
shift(hours, minutes, seconds, milliseconds)
All arguments are optional and have a default value of 0.
"""
if 'ratio' in kwargs:
self *= kwargs.pop('ratio')
self += self.__class__(*args, **kwargs)
@classmethod
def from_ordinal(cls, ordinal):
"""
int -> SubRipTime corresponding to a total count of milliseconds
"""
return cls(milliseconds=int(ordinal))
@classmethod
def from_string(cls, source):
"""
str/unicode(HH:MM:SS,mmm) -> SubRipTime corresponding to serial
raise InvalidTimeString
"""
items = cls.RE_TIME_SEP.split(source)
if len(items) != 4:
raise InvalidTimeString
return cls(*(cls.parse_int(i) for i in items))
@classmethod
def parse_int(cls, digits):
try:
return int(digits)
except ValueError:
match = cls.RE_INTEGER.match(digits)
if match:
return int(match.group())
return 0
@classmethod
def from_time(cls, source):
"""
datetime.time -> SubRipTime corresponding to time object
"""
return cls(hours=source.hour, minutes=source.minute,
seconds=source.second, milliseconds=source.microsecond // 1000)
def to_time(self):
"""
Convert SubRipTime instance into a pure datetime.time object
"""
return time(self.hours, self.minutes, self.seconds,
self.milliseconds * 1000)

View file

@ -1,2 +0,0 @@
VERSION = (1, 0, 1)
VERSION_STRING = '.'.join(str(i) for i in VERSION)