mirror of
https://github.com/SickGear/SickGear.git
synced 2025-01-21 17:13:42 +00:00
Update chardet packages to 2.3.0 (9e419e9).
This commit is contained in:
parent
eecb03ef59
commit
44e08b857c
5 changed files with 26 additions and 35 deletions
|
@ -16,6 +16,7 @@
|
|||
* Remove "Manage Torrents"
|
||||
* Update Beautiful Soup 4.3.2 to 4.4.0 (r390)
|
||||
* Update dateutil library to 2.4.2 (083f666)
|
||||
* Update chardet packages to 2.3.0 (26982c5)
|
||||
* Update Hachoir library 1.3.3 to 1.3.4 (r1383)
|
||||
* Change configure quiet option in Hachoir to suppress warnings (add ref:hacks.txt)
|
||||
* Add parse media content to determine quality before making final assumptions during re-scan, update, pp
|
||||
|
|
|
@ -28,7 +28,6 @@
|
|||
|
||||
import logging
|
||||
import re
|
||||
from io import BytesIO
|
||||
|
||||
from .enums import ProbingState
|
||||
|
||||
|
@ -79,16 +78,16 @@ class CharSetProber(object):
|
|||
|
||||
This filter applies to all scripts which do not use English characters.
|
||||
"""
|
||||
filtered = BytesIO()
|
||||
filtered = bytearray()
|
||||
|
||||
# This regular expression filters out only words that have at least one
|
||||
# international character. The word may include one marker character at
|
||||
# the end.
|
||||
words = re.findall(
|
||||
b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?', buf)
|
||||
words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
|
||||
buf)
|
||||
|
||||
for word in words:
|
||||
filtered.write(word[:-1])
|
||||
filtered.extend(word[:-1])
|
||||
|
||||
# If the last character in the word is a marker, replace it with a
|
||||
# space as markers shouldn't affect our analysis (they are used
|
||||
|
@ -97,9 +96,9 @@ class CharSetProber(object):
|
|||
last_char = word[-1:]
|
||||
if not last_char.isalpha() and last_char < b'\x80':
|
||||
last_char = b' '
|
||||
filtered.write(last_char)
|
||||
filtered.extend(last_char)
|
||||
|
||||
return filtered.getvalue()
|
||||
return filtered
|
||||
|
||||
@staticmethod
|
||||
def filter_with_english_letters(buf):
|
||||
|
@ -113,7 +112,7 @@ class CharSetProber(object):
|
|||
characters and extended ASCII characters, but is currently only used by
|
||||
``Latin1Prober``.
|
||||
"""
|
||||
filtered = BytesIO()
|
||||
filtered = bytearray()
|
||||
in_tag = False
|
||||
prev = 0
|
||||
|
||||
|
@ -132,15 +131,15 @@ class CharSetProber(object):
|
|||
if curr > prev and not in_tag:
|
||||
# Keep everything after last non-extended-ASCII,
|
||||
# non-alphabetic character
|
||||
filtered.write(buf[prev:curr])
|
||||
filtered.extend(buf[prev:curr])
|
||||
# Output a space to delimit the stretch we kept
|
||||
filtered.write(b' ')
|
||||
filtered.extend(b' ')
|
||||
prev = curr + 1
|
||||
|
||||
# If we're not in a tag...
|
||||
if not in_tag:
|
||||
# Keep everything after last non-extended-ASCII, non-alphabetic
|
||||
# character
|
||||
filtered.write(buf[prev:])
|
||||
filtered.extend(buf[prev:])
|
||||
|
||||
return filtered.getvalue()
|
||||
return filtered
|
||||
|
|
|
@ -1,16 +1,11 @@
|
|||
"""
|
||||
All of the Enums that are used throughout the chardet package.
|
||||
|
||||
:author: Dan Blanchard (dblanchard@ets.org)
|
||||
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||
"""
|
||||
|
||||
try:
|
||||
from enum import IntEnum
|
||||
except ImportError:
|
||||
from enum34 import IntEnum
|
||||
|
||||
|
||||
class InputState(IntEnum):
|
||||
class InputState(object):
|
||||
"""
|
||||
This enum represents the different states a universal detector can be in.
|
||||
"""
|
||||
|
@ -19,7 +14,7 @@ class InputState(IntEnum):
|
|||
high_byte = 2
|
||||
|
||||
|
||||
class LanguageFilter(IntEnum):
|
||||
class LanguageFilter(object):
|
||||
"""
|
||||
This enum represents the different language filters we can apply to a
|
||||
``UniversalDetector``.
|
||||
|
@ -34,7 +29,7 @@ class LanguageFilter(IntEnum):
|
|||
cjk = chinese | japanese | korean
|
||||
|
||||
|
||||
class ProbingState(IntEnum):
|
||||
class ProbingState(object):
|
||||
"""
|
||||
This enum represents the different states a prober can be in.
|
||||
"""
|
||||
|
@ -43,7 +38,7 @@ class ProbingState(IntEnum):
|
|||
not_me = 2
|
||||
|
||||
|
||||
class MachineState(IntEnum):
|
||||
class MachineState(object):
|
||||
"""
|
||||
This enum represents the different states a state machine can be in.
|
||||
"""
|
||||
|
|
|
@ -33,7 +33,7 @@ from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
|
|||
Ibm866Model, Ibm855Model)
|
||||
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
|
||||
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
|
||||
from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
|
||||
# from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
|
||||
from .langthaimodel import TIS620ThaiModel
|
||||
from .langhebrewmodel import Win1255HebrewModel
|
||||
from .hebrewprober import HebrewProber
|
||||
|
|
|
@ -122,12 +122,10 @@ class UniversalDetector(object):
|
|||
if byte_str.startswith(codecs.BOM_UTF8):
|
||||
# EF BB BF UTF-8 with BOM
|
||||
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
|
||||
elif byte_str.startswith(codecs.BOM_UTF32_LE):
|
||||
elif byte_str.startswith(codecs.BOM_UTF32_LE) or byte_str.startswith(codecs.BOM_UTF32_BE):
|
||||
# FF FE 00 00 UTF-32, little-endian BOM
|
||||
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
|
||||
elif byte_str.startswith(codecs.BOM_UTF32_BE):
|
||||
# 00 00 FE FF UTF-32, big-endian BOM
|
||||
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
|
||||
self.result = {'encoding': "UTF-32", 'confidence': 1.0}
|
||||
elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
|
||||
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||
self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
|
||||
|
@ -136,12 +134,10 @@ class UniversalDetector(object):
|
|||
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||
self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
|
||||
'confidence': 1.0}
|
||||
elif byte_str.startswith(codecs.BOM_LE):
|
||||
elif byte_str.startswith(codecs.BOM_LE) or byte_str.startswith(codecs.BOM_BE):
|
||||
# FF FE UTF-16, little endian BOM
|
||||
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
|
||||
elif byte_str.startswith(codecs.BOM_BE):
|
||||
# FE FF UTF-16, big endian BOM
|
||||
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}
|
||||
self.result = {'encoding': "UTF-16", 'confidence': 1.0}
|
||||
|
||||
self._got_data = True
|
||||
if self.result['encoding'] is not None:
|
||||
|
@ -207,7 +203,7 @@ class UniversalDetector(object):
|
|||
return
|
||||
self.done = True
|
||||
|
||||
if self._input_state == InputState.pure_ascii:
|
||||
if self._input_state in (InputState.pure_ascii, InputState.esc_ascii):
|
||||
self.result = {'encoding': 'ascii', 'confidence': 1.0}
|
||||
return self.result
|
||||
|
||||
|
@ -229,7 +225,7 @@ class UniversalDetector(object):
|
|||
|
||||
if self.logger.getEffectiveLevel() == logging.DEBUG:
|
||||
self.logger.debug('no probers hit minimum threshhold')
|
||||
for prober in self._charset_probers[0].mProbers:
|
||||
for prober in self._charset_probers[0].probers:
|
||||
if not prober:
|
||||
continue
|
||||
self.logger.debug('%s confidence = %s', prober.charset_name,
|
||||
|
|
Loading…
Reference in a new issue