mirror of
https://github.com/SickGear/SickGear.git
synced 2025-01-22 09:33:37 +00:00
Merge pull request #211 from adam111316/feature/RemoveUnusedLibraries
Remove unused libraries fuzzywuzzy and pysrt
This commit is contained in:
commit
d6bcc32fb8
16 changed files with 1 additions and 1460 deletions
|
@ -1,6 +1,7 @@
|
||||||
### 0.x.x (2015-xx-xx xx:xx:xx UTC)
|
### 0.x.x (2015-xx-xx xx:xx:xx UTC)
|
||||||
|
|
||||||
* Add requirements file for pip (port from midgetspy/sick-beard)
|
* Add requirements file for pip (port from midgetspy/sick-beard)
|
||||||
|
* Remove unused libraries fuzzywuzzy and pysrt
|
||||||
|
|
||||||
[develop changelog]
|
[develop changelog]
|
||||||
|
|
||||||
|
|
|
@ -1,78 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# encoding: utf-8
|
|
||||||
"""
|
|
||||||
StringMatcher.py
|
|
||||||
|
|
||||||
ported from python-Levenshtein
|
|
||||||
[https://github.com/miohtama/python-Levenshtein]
|
|
||||||
"""
|
|
||||||
|
|
||||||
from Levenshtein import *
|
|
||||||
from warnings import warn
|
|
||||||
|
|
||||||
class StringMatcher:
    """A SequenceMatcher-like class built on top of Levenshtein.

    Results are cached per pair of strings and invalidated whenever either
    string is replaced. The helpers ``opcodes``, ``editops``,
    ``matching_blocks``, ``ratio`` and ``distance`` come from the module-level
    ``from Levenshtein import *``.
    """

    def _reset_cache(self):
        """Forget every cached result."""
        self._ratio = self._distance = None
        self._opcodes = self._editops = self._matching_blocks = None

    def __init__(self, isjunk=None, seq1='', seq2=''):
        """Store the two strings to compare.

        ``isjunk`` is accepted only for signature compatibility with
        ``difflib.SequenceMatcher``; a truthy value triggers a warning and is
        otherwise ignored.
        """
        if isjunk:
            warn("isjunk is NOT implemented, it will be ignored")
        self._str1, self._str2 = seq1, seq2
        self._reset_cache()

    def set_seqs(self, seq1, seq2):
        """Replace both strings and drop cached results."""
        self._str1, self._str2 = seq1, seq2
        self._reset_cache()

    def set_seq1(self, seq1):
        """Replace the first string and drop cached results."""
        self._str1 = seq1
        self._reset_cache()

    def set_seq2(self, seq2):
        """Replace the second string and drop cached results."""
        self._str2 = seq2
        self._reset_cache()

    def get_opcodes(self):
        """Return the opcodes, derived from cached editops when available."""
        if self._opcodes is None:
            if self._editops:
                self._opcodes = opcodes(self._editops, self._str1, self._str2)
            else:
                self._opcodes = opcodes(self._str1, self._str2)
        return self._opcodes

    def get_editops(self):
        """Return the editops, derived from cached opcodes when available."""
        if self._editops is None:
            if self._opcodes:
                self._editops = editops(self._opcodes, self._str1, self._str2)
            else:
                self._editops = editops(self._str1, self._str2)
        return self._editops

    def get_matching_blocks(self):
        """Return the matching blocks computed from the opcodes."""
        if self._matching_blocks is None:
            self._matching_blocks = matching_blocks(self.get_opcodes(),
                                                    self._str1, self._str2)
        return self._matching_blocks

    def ratio(self):
        """Return the cached similarity ratio, computing it on first use."""
        # Compare against None: a ratio of 0.0 is a valid cached value.
        if self._ratio is None:
            self._ratio = ratio(self._str1, self._str2)
        return self._ratio

    def quick_ratio(self):
        """Return the same value as ratio(); it is usually quick enough."""
        if self._ratio is None:
            self._ratio = ratio(self._str1, self._str2)
        return self._ratio

    def real_quick_ratio(self):
        """Return a length-only upper bound on the ratio.

        Two empty strings yield 1.0 instead of dividing by zero.
        """
        len1, len2 = len(self._str1), len(self._str2)
        total = len1 + len2
        if not total:
            return 1.0
        return 2.0 * min(len1, len2) / total

    def distance(self):
        """Return the cached edit distance, computing it on first use."""
        # Compare against None: a distance of 0 is a valid cached value.
        if self._distance is None:
            self._distance = distance(self._str1, self._str2)
        return self._distance
|
|
|
@ -1,263 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# encoding: utf-8
|
|
||||||
"""
|
|
||||||
fuzz.py
|
|
||||||
|
|
||||||
Copyright (c) 2011 Adam Cohen
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining
|
|
||||||
a copy of this software and associated documentation files (the
|
|
||||||
"Software"), to deal in the Software without restriction, including
|
|
||||||
without limitation the rights to use, copy, modify, merge, publish,
|
|
||||||
distribute, sublicense, and/or sell copies of the Software, and to
|
|
||||||
permit persons to whom the Software is furnished to do so, subject to
|
|
||||||
the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be
|
|
||||||
included in all copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
||||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
||||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
||||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
||||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
# Prefer the Levenshtein-backed matcher; fall back to the standard library
# when it (or the Levenshtein module it imports) is not installed.
try:
    from StringMatcher import StringMatcher as SequenceMatcher
except ImportError:
    from difflib import SequenceMatcher
|
|
||||||
|
|
||||||
from . import utils
|
|
||||||
|
|
||||||
|
|
||||||
###########################
|
|
||||||
# Basic Scoring Functions #
|
|
||||||
###########################
|
|
||||||
|
|
||||||
|
|
||||||
def ratio(s1, s2):
    """Return the similarity of two strings as an integer percentage.

    Raises TypeError when either argument is None. Returns 0 when either
    string is empty once both have been made type-consistent.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    left, right = utils.make_type_consistent(s1, s2)
    if not left or not right:
        return 0

    matcher = SequenceMatcher(None, left, right)
    return utils.intr(100 * matcher.ratio())
|
|
||||||
|
|
||||||
|
|
||||||
# todo: skip duplicate indexes for a little more speed
|
|
||||||
def partial_ratio(s1, s2):
    """Return the best score of the shorter string against windows of the longer.

    Raises TypeError when either argument is None. Returns 0 when either
    string is empty. The result is rounded with ``utils.intr`` so that it is
    consistent with ``ratio``.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    s1, s2 = utils.make_type_consistent(s1, s2)
    if not s1 or not s2:
        return 0

    if len(s1) <= len(s2):
        shorter, longer = s1, s2
    else:
        shorter, longer = s2, s1

    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    # Each block is a run of matching characters of the form
    # (idx_1, idx_2, len). The best partial match aligns with at least one
    # of those blocks, e.g. shorter = "abcd", longer = "XXXbcdeEEE",
    # block = (1, 3, 3), best score == ratio("abcd", "Xbcd").
    scores = []
    for block in blocks:
        # Align the window so the block lines up; clamp at the string start.
        long_start = max(block[1] - block[0], 0)
        long_substr = longer[long_start:long_start + len(shorter)]

        r = SequenceMatcher(None, shorter, long_substr).ratio()
        if r > .995:
            return 100
        scores.append(r)

    return utils.intr(100 * max(scores))
|
|
||||||
|
|
||||||
|
|
||||||
##############################
|
|
||||||
# Advanced Scoring Functions #
|
|
||||||
##############################
|
|
||||||
|
|
||||||
# Sorted Token
|
|
||||||
# find all alphanumeric tokens in the string
|
|
||||||
# sort those tokens and take ratio of resulting joined strings
|
|
||||||
# controls for unordered string elements
|
|
||||||
def _token_sort(s1, s2, partial=True, force_ascii=True):
    """Score two strings after sorting their tokens alphabetically.

    Each input is passed through ``utils.full_process``, split on whitespace,
    sorted and re-joined. The results are compared with ``partial_ratio``
    when ``partial`` is true, otherwise with ``ratio``. Raises TypeError when
    either argument is None.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    def _sorted_tokens(text):
        words = utils.full_process(text, force_ascii=force_ascii).split()
        return " ".join(sorted(words)).strip()

    first = _sorted_tokens(s1)
    second = _sorted_tokens(s2)

    scorer = partial_ratio if partial else ratio
    return scorer(first, second)
|
|
||||||
|
|
||||||
|
|
||||||
def token_sort_ratio(s1, s2, force_ascii=True):
    """Return the full (non-partial) token-sort score of two strings."""
    return _token_sort(s1, s2, force_ascii=force_ascii, partial=False)
|
|
||||||
|
|
||||||
|
|
||||||
def partial_token_sort_ratio(s1, s2, force_ascii=True):
    """Return the partial token-sort score of two strings."""
    return _token_sort(s1, s2, force_ascii=force_ascii, partial=True)
|
|
||||||
|
|
||||||
|
|
||||||
# Token Set
|
|
||||||
# find all alphanumeric tokens in each string...treat them as a set
|
|
||||||
# construct two strings of the form
|
|
||||||
# <sorted_intersection><sorted_remainder>
|
|
||||||
# take ratios of those two strings
|
|
||||||
# controls for unordered partial matches
|
|
||||||
def _token_set(s1, s2, partial=True, force_ascii=True):
    """Score two strings by comparing their token sets.

    Builds three strings -- the sorted intersection, and the intersection
    followed by each side's sorted remainder -- and returns the best pairwise
    score. ``partial`` selects ``partial_ratio`` instead of ``ratio`` as the
    pairwise scorer. Raises TypeError when either argument is None; returns 0
    when either processed string is empty.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # p1/p2 are already processed, so they can be tokenised directly.
    tokens1 = set(p1.split())
    tokens2 = set(p2.split())

    sorted_sect = " ".join(sorted(tokens1.intersection(tokens2)))
    sorted_1to2 = " ".join(sorted(tokens1.difference(tokens2)))
    sorted_2to1 = " ".join(sorted(tokens2.difference(tokens1)))

    # strip() removes the separator left over when one part is empty.
    combined_1to2 = (sorted_sect + " " + sorted_1to2).strip()
    combined_2to1 = (sorted_sect + " " + sorted_2to1).strip()
    sorted_sect = sorted_sect.strip()

    # Honour the ``partial`` flag when picking the scorer.
    scorer = partial_ratio if partial else ratio
    pairwise = [
        scorer(sorted_sect, combined_1to2),
        scorer(sorted_sect, combined_2to1),
        scorer(combined_1to2, combined_2to1),
    ]
    return max(pairwise)
|
|
||||||
|
|
||||||
|
|
||||||
def token_set_ratio(s1, s2, force_ascii=True):
    """Return the token-set score of two strings, requesting partial=False."""
    return _token_set(s1, s2, force_ascii=force_ascii, partial=False)
|
|
||||||
|
|
||||||
|
|
||||||
def partial_token_set_ratio(s1, s2, force_ascii=True):
    """Return the token-set score of two strings, requesting partial=True."""
    return _token_set(s1, s2, force_ascii=force_ascii, partial=True)
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: numerics
|
|
||||||
|
|
||||||
###################
|
|
||||||
# Combination API #
|
|
||||||
###################
|
|
||||||
|
|
||||||
# q is for quick
|
|
||||||
def QRatio(s1, s2, force_ascii=True):
    """Quick ratio: process both strings, then score them with ``ratio``.

    Returns 0 when either processed string fails ``utils.validate_string``.
    """
    first = utils.full_process(s1, force_ascii=force_ascii)
    second = utils.full_process(s2, force_ascii=force_ascii)

    for candidate in (first, second):
        if not utils.validate_string(candidate):
            return 0

    return ratio(first, second)
|
|
||||||
|
|
||||||
|
|
||||||
def UQRatio(s1, s2):
    """Return ``QRatio`` of the two strings without forcing them to ASCII."""
    keep_unicode = False
    return QRatio(s1, s2, force_ascii=keep_unicode)
|
|
||||||
|
|
||||||
|
|
||||||
# w is for weighted
|
|
||||||
def WRatio(s1, s2, force_ascii=True):
    """Weighted ratio: the best of several scorers, each scaled by a weight.

    Returns 0 when either processed string fails ``utils.validate_string``.
    Partial scorers are used only when one string is at least 1.5 times as
    long as the other, and are weighted lower (.6 instead of .90) when the
    length ratio exceeds 8.
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    unbase_scale = .95
    base = ratio(p1, p2)
    len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))

    if len_ratio < 1.5:
        # Similar lengths: compare whole strings only.
        candidates = [
            base,
            token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale,
            token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale,
        ]
    else:
        # One string is much shorter: trust partial matches less.
        partial_scale = .6 if len_ratio > 8 else .90
        candidates = [
            base,
            partial_ratio(p1, p2) * partial_scale,
            partial_token_sort_ratio(p1, p2, force_ascii=force_ascii)
            * unbase_scale * partial_scale,
            partial_token_set_ratio(p1, p2, force_ascii=force_ascii)
            * unbase_scale * partial_scale,
        ]

    return int(max(candidates))
|
|
||||||
|
|
||||||
|
|
||||||
def UWRatio(s1, s2):
    """Return ``WRatio`` of the two strings without forcing them to ASCII."""
    keep_unicode = False
    return WRatio(s1, s2, force_ascii=keep_unicode)
|
|
|
@ -1,119 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# encoding: utf-8
|
|
||||||
"""
|
|
||||||
process.py
|
|
||||||
|
|
||||||
Copyright (c) 2011 Adam Cohen
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining
|
|
||||||
a copy of this software and associated documentation files (the
|
|
||||||
"Software"), to deal in the Software without restriction, including
|
|
||||||
without limitation the rights to use, copy, modify, merge, publish,
|
|
||||||
distribute, sublicense, and/or sell copies of the Software, and to
|
|
||||||
permit persons to whom the Software is furnished to do so, subject to
|
|
||||||
the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be
|
|
||||||
included in all copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
||||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
||||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
||||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
||||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
"""
|
|
||||||
import itertools
|
|
||||||
|
|
||||||
from . import fuzz
|
|
||||||
from . import utils
|
|
||||||
|
|
||||||
|
|
||||||
def extract(query, choices, processor=None, scorer=None, limit=5):
    """Find the best matches in ``choices``.

    Returns a list of ``(choice, score)`` tuples sorted by descending score
    and truncated to ``limit`` entries. Choices with equal scores keep their
    original relative order.

    Arguments:
        query     -- the thing we want to find
        choices   -- an iterable of candidate objects (lists, generators,
                     ...); ``None`` or an empty iterable yields ``[]``
        processor -- f(CHOICE) --> OBJ, applied to every choice before it is
                     scored; defaults to ``utils.full_process``
        scorer    -- called as scorer(query, processed_choice) and must
                     return a sortable score; defaults to ``fuzz.WRatio``
        limit     -- maximum number of results to return
    """
    if choices is None:
        return []

    # default, turn whatever the choice is into a workable string
    if processor is None:
        processor = utils.full_process

    # default: wratio
    if scorer is None:
        scorer = fuzz.WRatio

    scored = [(choice, scorer(query, processor(choice))) for choice in choices]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:limit]
|
|
||||||
|
|
||||||
|
|
||||||
def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5):
    """Return the best matches whose score is strictly above ``score_cutoff``.

    Convenience wrapper around extract(); see extract() for the full
    argument list. The result is a list of (choice, score) tuples; an empty
    list is returned when nothing qualifies.
    """
    ranked = extract(query, choices, processor, scorer, limit)
    if not ranked:
        return []

    def above_cutoff(pair):
        return pair[1] > score_cutoff

    # ranked is sorted by descending score, so stop at the first failure.
    return list(itertools.takewhile(above_cutoff, ranked))
|
|
||||||
|
|
||||||
|
|
||||||
def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0):
    """Return the single best (choice, score) tuple, or None.

    Convenience wrapper around extract(); see extract() for the full
    argument list. None is returned when there are no choices or when the
    best score is less than or equal to ``score_cutoff`` (intuition: not a
    good enough match).
    """
    ranked = extract(query, choices, processor, scorer, limit=1)
    if len(ranked) == 0:
        return None

    top = ranked[0]
    return top if top[1] > score_cutoff else None
|
|
|
@ -1,41 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
class StringProcessor(object):
    """
    This class defines methods to process strings. Ideally all the methods
    below use unicode strings for both input and output.
    """

    # Compiled once at class creation instead of on every call.
    _NON_WORD_RE = re.compile(r"(?ui)\W")

    @classmethod
    def replace_non_letters_non_numbers_with_whitespace(cls, a_string):
        """
        Replace every non-word character (anything the regex ``\\W``
        matches) with a single space. Each character is replaced
        individually, so a run of N such characters becomes N spaces.
        """
        return cls._NON_WORD_RE.sub(" ", a_string)

    @classmethod
    def strip(cls, a_string):
        """
        Return the string with leading and trailing white space removed.
        """
        return a_string.strip()

    @classmethod
    def to_lower_case(cls, a_string):
        """
        Return the lower-cased version of the string given.
        """
        return a_string.lower()

    @classmethod
    def to_upper_case(cls, a_string):
        """
        Return the upper-cased version of the string given.
        """
        return a_string.upper()
|
|
|
@ -1,76 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from fuzzywuzzy.string_processing import StringProcessor
|
|
||||||
|
|
||||||
|
|
||||||
PY3 = sys.version_info[0] == 3
|
|
||||||
|
|
||||||
|
|
||||||
def validate_string(s):
    """Return True when ``s`` has a length greater than zero.

    Objects that do not support ``len()`` (``None``, numbers, ...) are
    reported as invalid rather than raising.
    """
    try:
        return len(s) > 0
    except TypeError:
        # s has no length at all, so it cannot be a usable string.
        return False
|
|
||||||
|
|
||||||
# Characters with code points 128-255, built as a native ``str``; these are
# the characters that asciionly() strips.
bad_chars = str('').join(chr(code) for code in range(128, 256))
if PY3:
    # Mapping each code point to None makes translate() delete it.
    translation_table = dict((ord(char), None) for char in bad_chars)
|
|
||||||
|
|
||||||
|
|
||||||
def asciionly(s):
    """Return ``s`` with every character listed in ``bad_chars`` removed."""
    if not PY3:
        # Python 2 byte strings take the characters to delete directly.
        return s.translate(None, bad_chars)
    return s.translate(translation_table)
|
|
||||||
|
|
||||||
|
|
||||||
def asciidammit(s):
    """Coerce ``s`` to text and strip the characters asciionly() removes.

    A native ``str`` is filtered as-is. Anything else is first converted to
    text: on Python 3 bytes are decoded as ASCII (dropping undecodable
    bytes) and other objects go through ``str()``; on Python 2 the original
    ``unicode`` based path is kept.
    """
    if type(s) is str:
        return asciionly(s)
    if PY3:
        # ``unicode`` does not exist on Python 3, so branch before using it.
        if isinstance(s, bytes):
            return asciionly(s.decode('ascii', 'ignore'))
        return asciidammit(str(s))
    if type(s) is unicode:
        return asciionly(s.encode('ascii', 'ignore'))
    return asciidammit(unicode(s))
|
|
||||||
|
|
||||||
|
|
||||||
def make_type_consistent(s1, s2):
    """Return the two values as a pair of strings of the same type.

    If both are ``str`` or both are the text type, they are returned
    unchanged; otherwise both are converted with the text type (``unicode``
    on Python 2, ``str`` on Python 3).
    """
    try:
        text_type = unicode
    except NameError:
        # Python 3: ``unicode`` is gone and ``str`` is the text type.
        text_type = str

    if isinstance(s1, str) and isinstance(s2, str):
        return s1, s2

    if isinstance(s1, text_type) and isinstance(s2, text_type):
        return s1, s2

    # NOTE(review): on Python 3 a bytes argument is converted via str(),
    # which yields its repr ("b'...'") -- confirm whether callers pass bytes.
    return text_type(s1), text_type(s2)
|
|
||||||
|
|
||||||
|
|
||||||
def full_process(s, force_ascii=False):
    """Normalise a string for comparison.

    Steps, in order:
        -- optionally convert to ASCII (when force_ascii is true)
        -- replace non-word characters with spaces
        -- force to lower case
        -- trim leading and trailing whitespace

    ``None`` is returned as the empty string.
    """
    if s is None:
        return ""

    text = asciidammit(s) if force_ascii else s
    # Keep only letters and numbers (see Unicode docs).
    text = StringProcessor.replace_non_letters_non_numbers_with_whitespace(text)
    lowered = StringProcessor.to_lower_case(text)
    return StringProcessor.strip(lowered)
|
|
||||||
|
|
||||||
|
|
||||||
def intr(n):
    '''Round n with the built-in round() and return the result as an int'''
    rounded = round(n)
    return int(rounded)
|
|
|
@ -1,18 +0,0 @@
|
||||||
from pysrt.srttime import SubRipTime
|
|
||||||
from pysrt.srtitem import SubRipItem
|
|
||||||
from pysrt.srtfile import SubRipFile
|
|
||||||
from pysrt.srtexc import Error, InvalidItem, InvalidTimeString
|
|
||||||
from pysrt.version import VERSION, VERSION_STRING
|
|
||||||
|
|
||||||
# Names exported by ``from pysrt import *``. Every entry must exist in this
# module, otherwise the star-import raises AttributeError.
__all__ = [
    'SubRipFile', 'SubRipItem', 'SubRipTime',
    'InvalidItem', 'InvalidTimeString'
]

# Module-level shortcuts for the error-handling modes of SubRipFile.
ERROR_PASS = SubRipFile.ERROR_PASS
ERROR_LOG = SubRipFile.ERROR_LOG
ERROR_RAISE = SubRipFile.ERROR_RAISE

# Module-level shortcuts for the SubRipFile constructors. ``open`` shadows
# the builtin within this module's namespace.
open = SubRipFile.open
stream = SubRipFile.stream
from_string = SubRipFile.from_string
|
|
|
@ -1,218 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
# pylint: disable-all
|
|
||||||
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import codecs
|
|
||||||
import shutil
|
|
||||||
import argparse
|
|
||||||
from textwrap import dedent
|
|
||||||
|
|
||||||
from chardet import detect
|
|
||||||
from pysrt import SubRipFile, SubRipTime, VERSION_STRING
|
|
||||||
|
|
||||||
def underline(string):
    """Wrap ``string`` in the ANSI escape codes 4 (underline) and 0 (reset)."""
    decorated = "\033[4m%s\033[0m" % string
    return decorated
|
|
||||||
|
|
||||||
|
|
||||||
class TimeAwareArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that lets time offsets such as ``-1s500ms`` be positionals.

    A negative offset starts with ``-`` and would otherwise be parsed as an
    option, so ``--`` is inserted in front of the first argument that looks
    like a time representation.
    """

    RE_TIME_REPRESENTATION = re.compile(r'^\-?(\d+[hms]{0,2}){1,4}$')

    def parse_args(self, args=None, namespace=None):
        """Parse ``args`` (default: ``sys.argv[1:]``) after guarding the time.

        The caller's list is copied, never modified.
        """
        args = list(sys.argv[1:] if args is None else args)

        for index, arg in enumerate(args):
            if self.RE_TIME_REPRESENTATION.match(arg):
                # Everything from the time offset onwards is positional.
                args.insert(index, '--')
                break

        return super(TimeAwareArgumentParser, self).parse_args(args, namespace)
|
|
||||||
|
|
||||||
|
|
||||||
class SubRipShifter(object):
    """Command-line front end: shift, re-time, split or re-wrap an srt file."""

    BACKUP_EXTENSION = '.bak'
    RE_TIME_STRING = re.compile(r'(\d+)([hms]{0,2})')
    # Multiplier applied to a value for each unit suffix; a bare number
    # uses the same ratio as seconds.
    UNIT_RATIOS = {
        'ms': 1,
        '': SubRipTime.SECONDS_RATIO,
        's': SubRipTime.SECONDS_RATIO,
        'm': SubRipTime.MINUTES_RATIO,
        'h': SubRipTime.HOURS_RATIO,
    }
    DESCRIPTION = dedent("""\
        Srt subtitle editor

        It can either shift, split or change the frame rate.
    """)
    TIMESTAMP_HELP = "A timestamp in the form: [-][Hh][Mm]S[s][MSms]"
    SHIFT_EPILOG = dedent("""\

        Examples:
            1 minute and 12 seconds foreward (in place):
                $ srt -i shift 1m12s movie.srt

            half a second foreward:
                $ srt shift 500ms movie.srt > othername.srt

            1 second and half backward:
                $ srt -i shift -1s500ms movie.srt

            3 seconds backward:
                $ srt -i shift -3 movie.srt
    """)
    RATE_EPILOG = dedent("""\

        Examples:
            Convert 23.9fps subtitles to 25fps:
                $ srt -i rate 23.9 25 movie.srt
    """)
    LIMITS_HELP = "Each parts duration in the form: [Hh][Mm]S[s][MSms]"
    SPLIT_EPILOG = dedent("""\

        Examples:
            For a movie in 2 parts with the first part 48 minutes and 18 seconds long:
                $ srt split 48m18s movie.srt
                => creates movie.1.srt and movie.2.srt

            For a movie in 3 parts of 20 minutes each:
                $ srt split 20m 20m movie.srt
                => creates movie.1.srt, movie.2.srt and movie.3.srt
    """)
    FRAME_RATE_HELP = "A frame rate in fps (commonly 23.9 or 25)"
    ENCODING_HELP = dedent("""\
        Change file encoding. Useful for players accepting only latin1 subtitles.
        List of supported encodings: http://docs.python.org/library/codecs.html#standard-encodings
    """)
    BREAK_EPILOG = dedent("""\
        Break lines longer than defined length
    """)
    LENGTH_HELP = "Maximum number of characters per line"

    def __init__(self):
        # Set by create_backup() for in-place edits; None means stdout.
        self.output_file_path = None

    def build_parser(self):
        """Build the argument parser with its four sub-commands."""
        parser = TimeAwareArgumentParser(description=self.DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter)
        parser.add_argument('-i', '--in-place', action='store_true', dest='in_place',
            help="Edit file in-place, saving a backup as file.bak (do not works for the split command)")
        parser.add_argument('-e', '--output-encoding', metavar=underline('encoding'), action='store', dest='output_encoding',
            type=self.parse_encoding, help=self.ENCODING_HELP)
        parser.add_argument('-v', '--version', action='version', version='%%(prog)s %s' % VERSION_STRING)
        subparsers = parser.add_subparsers(title='commands')

        shift_parser = subparsers.add_parser('shift', help="Shift subtitles by specified time offset", epilog=self.SHIFT_EPILOG, formatter_class=argparse.RawTextHelpFormatter)
        shift_parser.add_argument('time_offset', action='store', metavar=underline('offset'),
            type=self.parse_time, help=self.TIMESTAMP_HELP)
        shift_parser.set_defaults(action=self.shift)

        rate_parser = subparsers.add_parser('rate', help="Convert subtitles from a frame rate to another", epilog=self.RATE_EPILOG, formatter_class=argparse.RawTextHelpFormatter)
        rate_parser.add_argument('initial', action='store', type=float, help=self.FRAME_RATE_HELP)
        rate_parser.add_argument('final', action='store', type=float, help=self.FRAME_RATE_HELP)
        rate_parser.set_defaults(action=self.rate)

        split_parser = subparsers.add_parser('split', help="Split a file in multiple parts", epilog=self.SPLIT_EPILOG, formatter_class=argparse.RawTextHelpFormatter)
        split_parser.add_argument('limits', action='store', nargs='+', type=self.parse_time, help=self.LIMITS_HELP)
        split_parser.set_defaults(action=self.split)

        break_parser = subparsers.add_parser('break', help="Break long lines", epilog=self.BREAK_EPILOG, formatter_class=argparse.RawTextHelpFormatter)
        break_parser.add_argument('length', action='store', type=int, help=self.LENGTH_HELP)
        break_parser.set_defaults(action=self.break_lines)

        parser.add_argument('file', action='store')

        return parser

    def run(self, args):
        """Parse ``args``, run the selected command and close the output file."""
        self.arguments = self.build_parser().parse_args(args)
        if self.arguments.in_place:
            self.create_backup()
        try:
            self.arguments.action()
        finally:
            # Close the file opened by the output_file property so in-place
            # results are flushed; never close stdout.
            opened = getattr(self, '_output_file', None)
            if opened is not None and opened is not sys.stdout:
                opened.close()

    def parse_time(self, time_string):
        """Convert a string such as ``-1m12s`` into a signed ordinal.

        Each ``<digits><unit>`` pair is multiplied by the matching entry of
        UNIT_RATIOS and the results are summed.
        """
        negative = time_string.startswith('-')
        if negative:
            time_string = time_string[1:]
        ordinal = sum(int(value) * self.UNIT_RATIOS[unit] for value, unit
                      in self.RE_TIME_STRING.findall(time_string))
        return -ordinal if negative else ordinal

    def parse_encoding(self, encoding_name):
        """Return ``encoding_name`` if the codecs module knows it.

        Raises argparse.ArgumentTypeError for an unknown encoding.
        """
        try:
            codecs.lookup(encoding_name)
        except LookupError as error:
            # str(error) works on Python 2 and 3; ``.message`` is 2-only.
            raise argparse.ArgumentTypeError(str(error))
        return encoding_name

    def shift(self):
        """Shift every subtitle by the parsed offset and write the result."""
        self.input_file.shift(milliseconds=self.arguments.time_offset)
        self.input_file.write_into(self.output_file)

    def rate(self):
        """Rescale timings by final/initial frame rate and write the result."""
        ratio = self.arguments.final / self.arguments.initial
        self.input_file.shift(ratio=ratio)
        self.input_file.write_into(self.output_file)

    def split(self):
        """Write one ``<name>.<n><ext>`` file per interval between limits."""
        limits = [0] + self.arguments.limits + [self.input_file[-1].end.ordinal + 1]
        base_name, extension = os.path.splitext(self.arguments.file)
        for index, (start, end) in enumerate(zip(limits[:-1], limits[1:])):
            file_name = '%s.%s%s' % (base_name, index + 1, extension)
            part_file = self.input_file.slice(ends_after=start, starts_before=end)
            # Make each part start counting from zero.
            part_file.shift(milliseconds=-start)
            part_file.clean_indexes()
            part_file.save(path=file_name, encoding=self.output_encoding)

    def create_backup(self):
        """Copy the input to ``<file>.bak`` (once) and swap the paths.

        Afterwards the backup is read and the original path is written.
        """
        backup_file = self.arguments.file + self.BACKUP_EXTENSION
        if not os.path.exists(backup_file):
            shutil.copy2(self.arguments.file, backup_file)
        self.output_file_path = self.arguments.file
        self.arguments.file = backup_file

    def break_lines(self):
        """Re-wrap each item's text to at most ``length`` characters per line."""
        split_re = re.compile(r'(.{,%i})(?:\s+|$)' % self.arguments.length)
        for item in self.input_file:
            # Odd indexes of the split result are the captured chunks.
            item.text = '\n'.join(split_re.split(item.text)[1::2])
        self.input_file.write_into(self.output_file)

    @property
    def output_encoding(self):
        """The requested output encoding, else the input file's encoding."""
        return self.arguments.output_encoding or self.input_file.encoding

    @property
    def input_file(self):
        """The input file, opened once with the chardet-detected encoding."""
        if not hasattr(self, '_source_file'):
            with open(self.arguments.file, 'rb') as f:
                content = f.read()
                encoding = detect(content).get('encoding')
                encoding = self.normalize_encoding(encoding)

            self._source_file = SubRipFile.open(self.arguments.file,
                encoding=encoding, error_handling=SubRipFile.ERROR_LOG)
        return self._source_file

    @property
    def output_file(self):
        """The output stream: the in-place target if set, otherwise stdout."""
        if not hasattr(self, '_output_file'):
            if self.output_file_path:
                self._output_file = codecs.open(self.output_file_path, 'w+', encoding=self.output_encoding)
            else:
                self._output_file = sys.stdout
        return self._output_file

    def normalize_encoding(self, encoding):
        """Lower-case an encoding name and replace ``-`` with ``_``.

        A falsy value (no encoding detected) is returned unchanged.
        """
        if not encoding:
            # NOTE(review): assumes SubRipFile.open accepts encoding=None
            # and picks its own default -- confirm against pysrt.
            return encoding
        return encoding.lower().replace('-', '_')
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run SubRipShifter on the process's command-line arguments."""
    shifter = SubRipShifter()
    shifter.run(sys.argv[1:])


if __name__ == '__main__':
    main()
|
|
|
@ -1,26 +0,0 @@
|
||||||
class ComparableMixin(object):
    """Derive all six rich comparisons from a ``_cmpkey()`` method.

    Subclasses provide ``_cmpkey()``; two objects are compared by comparing
    the keys it returns.
    """

    def _compare(self, other, method):
        """Apply *method* to both comparison keys.

        Returns ``NotImplemented`` when either object lacks ``_cmpkey`` or
        the two keys cannot be compared with each other.
        """
        try:
            left = self._cmpkey()
            right = other._cmpkey()
            outcome = method(left, right)
        except (AttributeError, TypeError):
            return NotImplemented
        return outcome

    def __lt__(self, other):
        return self._compare(other, lambda mine, theirs: mine < theirs)

    def __le__(self, other):
        return self._compare(other, lambda mine, theirs: mine <= theirs)

    def __eq__(self, other):
        return self._compare(other, lambda mine, theirs: mine == theirs)

    def __ge__(self, other):
        return self._compare(other, lambda mine, theirs: mine >= theirs)

    def __gt__(self, other):
        return self._compare(other, lambda mine, theirs: mine > theirs)

    def __ne__(self, other):
        return self._compare(other, lambda mine, theirs: mine != theirs)
|
|
|
@ -1,24 +0,0 @@
|
||||||
|
|
||||||
import sys
|
|
||||||
|
|
||||||
# Interpreter version, captured once so both flags are derived from it.
_ver = sys.version_info

#: True when running under Python 2.x.
is_py2 = _ver[0] == 2

#: True when running under Python 3.x.
is_py3 = _ver[0] == 3
|
|
||||||
|
|
||||||
from io import open as io_open
|
|
||||||
|
|
||||||
# Publish one set of names (builtin_str, basestring, str, open) with the same
# meaning on both major versions.  Statement order matters: builtin_str must
# capture the native `str` before `str` itself is rebound.
if is_py2:
    builtin_str = str
    # Re-export the builtin so it can be imported from this module.
    basestring = basestring
    # From here on `str` names the unicode text type.
    str = unicode
    # io.open (imported above as io_open) replaces the builtin open.
    open = io_open
elif is_py3:
    builtin_str = str
    # Python 3 has no basestring; both text and bytes are accepted instead.
    basestring = (str, bytes)
    # The two self-assignments copy the builtins into this module's namespace.
    str = str
    open = open
|
|
|
@ -1,31 +0,0 @@
|
||||||
"""
|
|
||||||
Exception classes
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
class Error(Exception):
    """Base class for every exception raised by pysrt."""
|
|
||||||
|
|
||||||
|
|
||||||
class InvalidTimeString(Error):
    """Raised when the parser fails on a badly formatted time string."""
|
|
||||||
|
|
||||||
|
|
||||||
class InvalidItem(Error):
    """Raised when the parser fails to parse a subtitle item."""
|
|
||||||
|
|
||||||
|
|
||||||
class InvalidIndex(InvalidItem):
    """Raised when the parser fails to parse a subtitle index."""
|
|
|
@ -1,312 +0,0 @@
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import codecs
|
|
||||||
|
|
||||||
try:
|
|
||||||
from collections import UserList
|
|
||||||
except ImportError:
|
|
||||||
from UserList import UserList
|
|
||||||
|
|
||||||
from itertools import chain
|
|
||||||
from copy import copy
|
|
||||||
|
|
||||||
from pysrt.srtexc import Error
|
|
||||||
from pysrt.srtitem import SubRipItem
|
|
||||||
from pysrt.compat import str
|
|
||||||
|
|
||||||
# Byte-order marks paired with the codec each one announces.  The 4-byte
# UTF-32 marks are listed before the UTF-16 ones because BOM_UTF32_LE begins
# with the same bytes as BOM_UTF16_LE, and detection takes the first prefix
# that matches.
BOMS = (
    (codecs.BOM_UTF32_LE, 'utf_32_le'),
    (codecs.BOM_UTF32_BE, 'utf_32_be'),
    (codecs.BOM_UTF16_LE, 'utf_16_le'),
    (codecs.BOM_UTF16_BE, 'utf_16_be'),
    (codecs.BOM_UTF8, 'utf_8'),
)
# Codec name -> the BOM as decoded text.
CODECS_BOMS = dict((name, mark.decode(name)) for mark, name in BOMS)
# Length in bytes of the longest BOM.
BIGGER_BOM = max(len(mark) for mark, _ in BOMS)
|
|
||||||
|
|
||||||
|
|
||||||
class SubRipFile(UserList, object):
    """
    SubRip file descriptor.

    Provide a pure Python mapping on all metadata.

    SubRipFile(items, eol, path, encoding)

    items -> list of SubRipItem. Default to [].
    eol -> str: end of line character. Default to linesep used in opened file
        if any else to os.linesep.
    path -> str: path where file will be saved. To open an existing file see
        SubRipFile.open.
    encoding -> str: encoding used at file save. Default to utf-8.
    """
    ERROR_PASS = 0
    ERROR_LOG = 1
    ERROR_RAISE = 2

    DEFAULT_ENCODING = 'utf_8'

    def __init__(self, items=None, eol=None, path=None, encoding='utf-8'):
        UserList.__init__(self, items or [])
        self._eol = eol
        self.path = path
        self.encoding = encoding

    def _get_eol(self):
        return self._eol or os.linesep

    def _set_eol(self, eol):
        # The first non-empty value wins; later assignments are ignored.
        self._eol = self._eol or eol

    eol = property(_get_eol, _set_eol)

    def slice(self, starts_before=None, starts_after=None, ends_before=None,
              ends_after=None):
        """
        slice([starts_before][, starts_after][, ends_before][, ends_after])
        -> SubRipFile clone

        All arguments are optional, and should be coercible to SubRipTime
        object.

        It reduces the set of subtitles to those that match the given time
        constraints.

        The returned set is a clone, but still contains references to original
        subtitles. So if you shift this returned set, subs contained in the
        original SubRipFile instance will be altered too.

        Example:
            >>> subs.slice(ends_after={'seconds': 20}).shift(seconds=2)
        """
        clone = copy(self)

        # Each filter wraps the previous one lazily; the chain is evaluated
        # once by the list() call below.
        if starts_before:
            clone.data = (i for i in clone.data if i.start < starts_before)
        if starts_after:
            clone.data = (i for i in clone.data if i.start > starts_after)
        if ends_before:
            clone.data = (i for i in clone.data if i.end < ends_before)
        if ends_after:
            clone.data = (i for i in clone.data if i.end > ends_after)

        clone.data = list(clone.data)
        return clone

    def at(self, timestamp=None, **kwargs):
        """
        at(timestamp) -> SubRipFile clone

        timestamp argument should be coercible to SubRipTime object.

        A specialization of slice. Return all subtitles visible at the
        timestamp mark.

        Example:
            >>> subs.at((0, 0, 20, 0)).shift(seconds=2)
            >>> subs.at(seconds=20).shift(seconds=2)
        """
        time = timestamp or kwargs
        return self.slice(starts_before=time, ends_after=time)

    def shift(self, *args, **kwargs):
        """shift(hours, minutes, seconds, milliseconds, ratio)

        Shift `start` and `end` attributes of each items of file either by
        applying a ratio or by adding an offset.

        `ratio` should be either an int or a float.
        Example to convert subtitles from 23.9 fps to 25 fps:
            >>> subs.shift(ratio=25/23.9)

        All "time" arguments are optional and have a default value of 0.
        Example to delay all subs from 2 seconds and half
            >>> subs.shift(seconds=2, milliseconds=500)
        """
        for item in self:
            item.shift(*args, **kwargs)

    def clean_indexes(self):
        """
        clean_indexes()

        Sort subs and reset their index attribute. Should be called after
        destructive operations like split or such.
        """
        self.sort()
        for index, item in enumerate(self):
            item.index = index + 1

    @property
    def text(self):
        return '\n'.join(i.text for i in self)

    @classmethod
    def open(cls, path='', encoding=None, error_handling=ERROR_PASS):
        """
        open([path, [encoding]])

        If you do not provide any encoding, it can be detected if the file
        contains a byte order mark, otherwise utf-8 is used as default.
        """
        new_file = cls(path=path, encoding=encoding)
        source_file = cls._open_unicode_file(path, claimed_encoding=encoding)
        try:
            new_file.read(source_file, error_handling=error_handling)
        finally:
            # Close the handle even when parsing raises (ERROR_RAISE mode).
            source_file.close()
        return new_file

    @classmethod
    def from_string(cls, source, **kwargs):
        """
        from_string(source, **kwargs) -> SubRipFile

        `source` -> a unicode instance or at least a str instance encoded with
        `sys.getdefaultencoding()`
        """
        error_handling = kwargs.pop('error_handling', cls.ERROR_PASS)
        new_file = cls(**kwargs)
        new_file.read(source.splitlines(True), error_handling=error_handling)
        return new_file

    def read(self, source_file, error_handling=ERROR_PASS):
        """
        read(source_file, [error_handling])

        This method parses subtitles contained in `source_file` and appends
        them to the current instance.

        `source_file` -> Any iterable that yield unicode strings, like a file
            opened with `codecs.open()` or an array of unicode.
        """
        self.eol = self._guess_eol(source_file)
        self.extend(self.stream(source_file, error_handling=error_handling))
        return self

    @classmethod
    def stream(cls, source_file, error_handling=ERROR_PASS):
        """
        stream(source_file, [error_handling])

        This method yields SubRipItem instances as soon as they have been
        parsed without storing them. It is a kind of SAX parser for .srt
        files.

        `source_file` -> Any iterable that yield unicode strings, like a file
            opened with `codecs.open()` or an array of unicode.

        Example:
            >>> import pysrt
            >>> import codecs
            >>> file = codecs.open('movie.srt', encoding='utf-8')
            >>> for sub in pysrt.stream(file):
            ...     sub.text += " Hello !"
            ...     print(sub)
        """
        string_buffer = []
        # The extra '\n' guarantees a final blank line so the last buffered
        # item is flushed.
        for index, line in enumerate(chain(source_file, '\n')):
            if line.strip():
                string_buffer.append(line)
            else:
                source = string_buffer
                string_buffer = []
                if source and all(source):
                    try:
                        yield SubRipItem.from_lines(source)
                    except Error as error:
                        # Attach the offending text for the error handler.
                        error.args += (''.join(source), )
                        cls._handle_error(error, error_handling, index)

    def save(self, path=None, encoding=None, eol=None):
        """
        save([path][, encoding][, eol])

        Use initial path if no other provided.
        Use initial encoding if no other provided.
        Use initial eol if no other provided.
        """
        path = path or self.path
        encoding = encoding or self.encoding

        save_file = codecs.open(path, 'w+', encoding=encoding)
        try:
            self.write_into(save_file, eol=eol)
        finally:
            # Do not leak the handle when serialization fails part-way.
            save_file.close()

    def write_into(self, output_file, eol=None):
        """
        write_into(output_file [, eol])

        Serialize current state into `output_file`.

        `output_file` -> Any instance that respond to `write()`, typically a
        file object
        """
        output_eol = eol or self.eol

        for item in self:
            string_repr = str(item)
            if output_eol != '\n':
                string_repr = string_repr.replace('\n', output_eol)
            output_file.write(string_repr)
            # Only add trailing eol if it's not already present.
            # It was kept in the SubRipItem's text before but it really
            # belongs here. Existing applications might give us subtitles
            # which already contain a trailing eol though.
            if not string_repr.endswith(2 * output_eol):
                output_file.write(output_eol)

    @classmethod
    def _guess_eol(cls, string_iterable):
        first_line = cls._get_first_line(string_iterable)
        # '\r\n' is tested before its two single-character suffixes.
        for eol in ('\r\n', '\r', '\n'):
            if first_line.endswith(eol):
                return eol
        return os.linesep

    @classmethod
    def _get_first_line(cls, string_iterable):
        # Remember the position only when the object can report one; an
        # object with seek() but no tell() is left where the read put it.
        previous_position = None
        if hasattr(string_iterable, 'tell'):
            previous_position = string_iterable.tell()

        try:
            first_line = next(iter(string_iterable))
        except StopIteration:
            return ''
        if previous_position is not None and hasattr(string_iterable, 'seek'):
            string_iterable.seek(previous_position)

        return first_line

    @classmethod
    def _detect_encoding(cls, path):
        with open(path, 'rb') as file_descriptor:
            first_chars = file_descriptor.read(BIGGER_BOM)

        for bom, encoding in BOMS:
            if first_chars.startswith(bom):
                return encoding

        # TODO: maybe a chardet integration
        return cls.DEFAULT_ENCODING

    @classmethod
    def _open_unicode_file(cls, path, claimed_encoding=None):
        encoding = claimed_encoding or cls._detect_encoding(path)
        # Plain 'r': the 'U' flag is rejected by Python 3.11+, and
        # codecs.open reads the file in binary mode regardless.
        source_file = codecs.open(path, 'r', encoding=encoding)

        # get rid of BOM if any
        possible_bom = CODECS_BOMS.get(encoding, None)
        if possible_bom:
            file_bom = source_file.read(len(possible_bom))
            if not file_bom == possible_bom:
                source_file.seek(0)  # if not rewind
        return source_file

    @classmethod
    def _handle_error(cls, error, error_handling, index):
        if error_handling == cls.ERROR_RAISE:
            error.args = (index, ) + error.args
            raise error
        if error_handling == cls.ERROR_LOG:
            name = type(error).__name__
            sys.stderr.write('PySRT-%s(line %s): \n' % (name, index))
            # Replace non-ASCII characters with '?', then decode back to
            # text: a text-mode stderr rejects bytes on Python 3.
            message = error.args[0].encode('ascii', 'replace').decode('ascii')
            sys.stderr.write(message)
            sys.stderr.write('\n')
|
|
|
@ -1,76 +0,0 @@
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
|
||||||
SubRip's subtitle parser
|
|
||||||
"""
|
|
||||||
from pysrt.srtexc import InvalidItem, InvalidIndex
|
|
||||||
from pysrt.srttime import SubRipTime
|
|
||||||
from pysrt.comparablemixin import ComparableMixin
|
|
||||||
from pysrt.compat import str
|
|
||||||
|
|
||||||
class SubRipItem(ComparableMixin):
    """
    SubRipItem(index, start, end, text, position)

    index -> int: index of item in file. 0 by default.
    start, end -> SubRipTime or coercible.
    text -> unicode: text content for item.
    position -> unicode: raw srt/vtt "display coordinates" string
    """
    ITEM_PATTERN = '%s\n%s --> %s%s\n%s\n'
    TIMESTAMP_SEPARATOR = '-->'

    def __init__(self, index=0, start=None, end=None, text='', position=''):
        # The index is stored as an int when possible, otherwise unchanged.
        try:
            numeric_index = int(index)
        except (TypeError, ValueError):
            numeric_index = index
        self.index = numeric_index

        self.start = SubRipTime.coerce(start or 0)
        self.end = SubRipTime.coerce(end or 0)
        self.position = str(position)
        self.text = str(text)

    def __str__(self):
        # The position is emitted, space-prefixed, only when non-blank.
        if self.position.strip():
            position = ' %s' % self.position
        else:
            position = ''
        fields = (self.index, self.start, self.end, position, self.text)
        return self.ITEM_PATTERN % fields

    def _cmpkey(self):
        """Items compare by start time, then end time."""
        return (self.start, self.end)

    def shift(self, *args, **kwargs):
        """
        shift(hours, minutes, seconds, milliseconds, ratio)

        Apply the same shift to both the start and end attributes.
        All arguments are optional and have a default value of 0.
        """
        for boundary in (self.start, self.end):
            boundary.shift(*args, **kwargs)

    @classmethod
    def from_string(cls, source):
        """Build an item from one block of subtitle text."""
        return cls.from_lines(source.splitlines(True))

    @classmethod
    def from_lines(cls, lines):
        """Build an item from its lines; the leading index line is optional.

        Raises InvalidItem when fewer than two lines are given.
        """
        if len(lines) < 2:
            raise InvalidItem()
        stripped = [line.rstrip() for line in lines]
        # A first line without the '-->' separator is taken as the index.
        has_index = cls.TIMESTAMP_SEPARATOR not in stripped[0]
        index = stripped.pop(0) if has_index else None
        start, end, position = cls.split_timestamps(stripped[0])
        body = '\n'.join(stripped[1:])
        return cls(index, start, end, body, position)

    @classmethod
    def split_timestamps(cls, line):
        """Split a timing line into stripped (start, end, position) strings.

        Raises InvalidItem unless the line holds exactly one separator.
        """
        pieces = line.split(cls.TIMESTAMP_SEPARATOR)
        if len(pieces) != 2:
            raise InvalidItem()
        start, remainder = pieces
        # Everything after the first space following the end time is the
        # position string.
        end, _, position = remainder.lstrip().partition(' ')
        return (part.strip() for part in (start, end, position))
|
|
|
@ -1,176 +0,0 @@
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
|
||||||
SubRip's time format parser: HH:MM:SS,mmm
|
|
||||||
"""
|
|
||||||
import re
|
|
||||||
from datetime import time
|
|
||||||
|
|
||||||
from pysrt.srtexc import InvalidTimeString
|
|
||||||
from pysrt.comparablemixin import ComparableMixin
|
|
||||||
from pysrt.compat import str, basestring
|
|
||||||
|
|
||||||
class TimeItemDescriptor(object):
    """Descriptor exposing one unit of an instance's ``ordinal`` value.

    ``ratio`` is the size of the unit.  ``super_ratio``, when non-zero, is
    the size of the next larger unit, used to wrap the value so it only
    counts units within one of the larger ones.
    """
    # pylint: disable-msg=R0903
    def __init__(self, ratio, super_ratio=0):
        self.ratio = int(ratio)
        self.super_ratio = int(super_ratio)

    def _get_ordinal(self, instance):
        """Return the ordinal, reduced modulo ``super_ratio`` when it is set."""
        total = instance.ordinal
        return total % self.super_ratio if self.super_ratio else total

    def __get__(self, instance, klass):
        # Access through the class itself is not supported.
        if instance is None:
            raise AttributeError
        return self._get_ordinal(instance) // self.ratio

    def __set__(self, instance, value):
        # `current` is what this unit contributes to the ordinal right now;
        # replace that contribution with the new value.
        current = self._get_ordinal(instance) - instance.ordinal % self.ratio
        instance.ordinal += value * self.ratio - current
|
|
||||||
|
|
||||||
|
|
||||||
class SubRipTime(ComparableMixin):
    """A subtitle timestamp stored as a count of milliseconds (``ordinal``).

    The ``hours``, ``minutes``, ``seconds`` and ``milliseconds`` attributes
    are views computed from ``ordinal`` by TimeItemDescriptor.
    """
    TIME_PATTERN = '%02d:%02d:%02d,%03d'
    TIME_REPR = 'SubRipTime(%d, %d, %d, %d)'
    RE_TIME_SEP = re.compile(r'\:|\.|\,')
    RE_INTEGER = re.compile(r'^(\d+)')
    SECONDS_RATIO = 1000
    MINUTES_RATIO = SECONDS_RATIO * 60
    HOURS_RATIO = MINUTES_RATIO * 60

    hours = TimeItemDescriptor(HOURS_RATIO)
    minutes = TimeItemDescriptor(MINUTES_RATIO, HOURS_RATIO)
    seconds = TimeItemDescriptor(SECONDS_RATIO, MINUTES_RATIO)
    milliseconds = TimeItemDescriptor(1, SECONDS_RATIO)

    def __init__(self, hours=0, minutes=0, seconds=0, milliseconds=0):
        """
        SubRipTime(hours, minutes, seconds, milliseconds)

        All arguments are optional and have a default value of 0.
        """
        super(SubRipTime, self).__init__()
        self.ordinal = hours * self.HOURS_RATIO \
            + minutes * self.MINUTES_RATIO \
            + seconds * self.SECONDS_RATIO \
            + milliseconds

    def __repr__(self):
        return self.TIME_REPR % tuple(self)

    def __str__(self):
        if self.ordinal < 0:
            # Represent negative times as zero
            return str(SubRipTime.from_ordinal(0))
        return self.TIME_PATTERN % tuple(self)

    def _compare(self, other, method):
        """Compare against anything ``coerce`` accepts.

        Returns ``NotImplemented`` when *other* cannot be coerced (for
        example ``None`` or a float), so that ``==`` and ``!=`` fall back
        to Python's default behaviour instead of raising TypeError.
        """
        try:
            other = self.coerce(other)
        except TypeError:
            return NotImplemented
        return super(SubRipTime, self)._compare(other, method)

    def _cmpkey(self):
        return self.ordinal

    def __add__(self, other):
        return self.from_ordinal(self.ordinal + self.coerce(other).ordinal)

    def __iadd__(self, other):
        self.ordinal += self.coerce(other).ordinal
        return self

    def __sub__(self, other):
        return self.from_ordinal(self.ordinal - self.coerce(other).ordinal)

    def __isub__(self, other):
        self.ordinal -= self.coerce(other).ordinal
        return self

    def __mul__(self, ratio):
        return self.from_ordinal(int(round(self.ordinal * ratio)))

    def __imul__(self, ratio):
        self.ordinal = int(round(self.ordinal * ratio))
        return self

    @classmethod
    def coerce(cls, other):
        """
        Coerce many types to SubRipTime instance.
        Supported types:
          - str/unicode
          - int (taken as a count of milliseconds)
          - datetime.time
          - any iterable
          - dict
        """
        if isinstance(other, SubRipTime):
            return other
        if isinstance(other, basestring):
            return cls.from_string(other)
        if isinstance(other, int):
            return cls.from_ordinal(other)
        if isinstance(other, time):
            return cls.from_time(other)
        # Try a mapping of keyword arguments first, then a positional
        # sequence; a TypeError from the second attempt propagates.
        try:
            return cls(**other)
        except TypeError:
            return cls(*other)

    def __iter__(self):
        yield self.hours
        yield self.minutes
        yield self.seconds
        yield self.milliseconds

    def shift(self, *args, **kwargs):
        """
        shift(hours, minutes, seconds, milliseconds)

        All arguments are optional and have a default value of 0.
        A `ratio` keyword, when given, is applied before the offset.
        """
        if 'ratio' in kwargs:
            # __imul__ mutates in place and returns self.
            self *= kwargs.pop('ratio')
        self += self.__class__(*args, **kwargs)

    @classmethod
    def from_ordinal(cls, ordinal):
        """
        int -> SubRipTime corresponding to a total count of milliseconds
        """
        return cls(milliseconds=int(ordinal))

    @classmethod
    def from_string(cls, source):
        """
        str/unicode(HH:MM:SS,mmm) -> SubRipTime corresponding to serial
        raise InvalidTimeString
        """
        items = cls.RE_TIME_SEP.split(source)
        if len(items) != 4:
            raise InvalidTimeString
        return cls(*(cls.parse_int(i) for i in items))

    @classmethod
    def parse_int(cls, digits):
        """Parse *digits* as an int, falling back to its leading digits, else 0."""
        try:
            return int(digits)
        except ValueError:
            match = cls.RE_INTEGER.match(digits)
            if match:
                return int(match.group())
            return 0

    @classmethod
    def from_time(cls, source):
        """
        datetime.time -> SubRipTime corresponding to time object
        """
        return cls(hours=source.hour, minutes=source.minute,
                   seconds=source.second,
                   milliseconds=source.microsecond // 1000)

    def to_time(self):
        """
        Convert SubRipTime instance into a pure datetime.time object
        """
        return time(self.hours, self.minutes, self.seconds,
                    self.milliseconds * 1000)
|
|
|
@ -1,2 +0,0 @@
|
||||||
# Release number as a tuple, and the same value rendered in dotted form.
VERSION = (1, 0, 1)
VERSION_STRING = '.'.join(map(str, VERSION))
|
|
Loading…
Reference in a new issue