mirror of
https://github.com/SickGear/SickGear.git
synced 2025-01-21 17:13:42 +00:00
Merge pull request #514 from JackDandy/feature/UpdateChardet
Update chardet packages to 2.3.0
This commit is contained in:
commit
6752b47593
5 changed files with 26 additions and 35 deletions
|
@ -16,6 +16,7 @@
|
||||||
* Remove "Manage Torrents"
|
* Remove "Manage Torrents"
|
||||||
* Update Beautiful Soup 4.3.2 to 4.4.0 (r390)
|
* Update Beautiful Soup 4.3.2 to 4.4.0 (r390)
|
||||||
* Update dateutil library to 2.4.2 (083f666)
|
* Update dateutil library to 2.4.2 (083f666)
|
||||||
|
* Update chardet packages to 2.3.0 (26982c5)
|
||||||
* Update Hachoir library 1.3.3 to 1.3.4 (r1383)
|
* Update Hachoir library 1.3.3 to 1.3.4 (r1383)
|
||||||
* Change configure quiet option in Hachoir to suppress warnings (add ref:hacks.txt)
|
* Change configure quiet option in Hachoir to suppress warnings (add ref:hacks.txt)
|
||||||
* Add parse media content to determine quality before making final assumptions during re-scan, update, pp
|
* Add parse media content to determine quality before making final assumptions during re-scan, update, pp
|
||||||
|
|
|
@ -28,7 +28,6 @@
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
from .enums import ProbingState
|
from .enums import ProbingState
|
||||||
|
|
||||||
|
@ -79,16 +78,16 @@ class CharSetProber(object):
|
||||||
|
|
||||||
This filter applies to all scripts which do not use English characters.
|
This filter applies to all scripts which do not use English characters.
|
||||||
"""
|
"""
|
||||||
filtered = BytesIO()
|
filtered = bytearray()
|
||||||
|
|
||||||
# This regex expression filters out only words that have at-least one
|
# This regex expression filters out only words that have at-least one
|
||||||
# international character. The word may include one marker character at
|
# international character. The word may include one marker character at
|
||||||
# the end.
|
# the end.
|
||||||
words = re.findall(
|
words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
|
||||||
b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?', buf)
|
buf)
|
||||||
|
|
||||||
for word in words:
|
for word in words:
|
||||||
filtered.write(word[:-1])
|
filtered.extend(word[:-1])
|
||||||
|
|
||||||
# If the last character in the word is a marker, replace it with a
|
# If the last character in the word is a marker, replace it with a
|
||||||
# space as markers shouldn't affect our analysis (they are used
|
# space as markers shouldn't affect our analysis (they are used
|
||||||
|
@ -97,9 +96,9 @@ class CharSetProber(object):
|
||||||
last_char = word[-1:]
|
last_char = word[-1:]
|
||||||
if not last_char.isalpha() and last_char < b'\x80':
|
if not last_char.isalpha() and last_char < b'\x80':
|
||||||
last_char = b' '
|
last_char = b' '
|
||||||
filtered.write(last_char)
|
filtered.extend(last_char)
|
||||||
|
|
||||||
return filtered.getvalue()
|
return filtered
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def filter_with_english_letters(buf):
|
def filter_with_english_letters(buf):
|
||||||
|
@ -113,7 +112,7 @@ class CharSetProber(object):
|
||||||
characters and extended ASCII characters, but is currently only used by
|
characters and extended ASCII characters, but is currently only used by
|
||||||
``Latin1Prober``.
|
``Latin1Prober``.
|
||||||
"""
|
"""
|
||||||
filtered = BytesIO()
|
filtered = bytearray()
|
||||||
in_tag = False
|
in_tag = False
|
||||||
prev = 0
|
prev = 0
|
||||||
|
|
||||||
|
@ -132,15 +131,15 @@ class CharSetProber(object):
|
||||||
if curr > prev and not in_tag:
|
if curr > prev and not in_tag:
|
||||||
# Keep everything after last non-extended-ASCII,
|
# Keep everything after last non-extended-ASCII,
|
||||||
# non-alphabetic character
|
# non-alphabetic character
|
||||||
filtered.write(buf[prev:curr])
|
filtered.extend(buf[prev:curr])
|
||||||
# Output a space to delimit stretch we kept
|
# Output a space to delimit stretch we kept
|
||||||
filtered.write(b' ')
|
filtered.extend(b' ')
|
||||||
prev = curr + 1
|
prev = curr + 1
|
||||||
|
|
||||||
# If we're not in a tag...
|
# If we're not in a tag...
|
||||||
if not in_tag:
|
if not in_tag:
|
||||||
# Keep everything after last non-extended-ASCII, non-alphabetic
|
# Keep everything after last non-extended-ASCII, non-alphabetic
|
||||||
# character
|
# character
|
||||||
filtered.write(buf[prev:])
|
filtered.extend(buf[prev:])
|
||||||
|
|
||||||
return filtered.getvalue()
|
return filtered
|
||||||
|
|
|
@ -1,16 +1,11 @@
|
||||||
"""
|
"""
|
||||||
All of the Enums that are used throughout the chardet package.
|
All of the Enums that are used throughout the chardet package.
|
||||||
|
|
||||||
:author: Dan Blanchard (dblanchard@ets.org)
|
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
|
||||||
from enum import IntEnum
|
|
||||||
except ImportError:
|
|
||||||
from enum34 import IntEnum
|
|
||||||
|
|
||||||
|
class InputState(object):
|
||||||
class InputState(IntEnum):
|
|
||||||
"""
|
"""
|
||||||
This enum represents the different states a universal detector can be in.
|
This enum represents the different states a universal detector can be in.
|
||||||
"""
|
"""
|
||||||
|
@ -19,7 +14,7 @@ class InputState(IntEnum):
|
||||||
high_byte = 2
|
high_byte = 2
|
||||||
|
|
||||||
|
|
||||||
class LanguageFilter(IntEnum):
|
class LanguageFilter(object):
|
||||||
"""
|
"""
|
||||||
This enum represents the different language filters we can apply to a
|
This enum represents the different language filters we can apply to a
|
||||||
``UniversalDetector``.
|
``UniversalDetector``.
|
||||||
|
@ -34,7 +29,7 @@ class LanguageFilter(IntEnum):
|
||||||
cjk = chinese | japanese | korean
|
cjk = chinese | japanese | korean
|
||||||
|
|
||||||
|
|
||||||
class ProbingState(IntEnum):
|
class ProbingState(object):
|
||||||
"""
|
"""
|
||||||
This enum represents the different states a prober can be in.
|
This enum represents the different states a prober can be in.
|
||||||
"""
|
"""
|
||||||
|
@ -43,7 +38,7 @@ class ProbingState(IntEnum):
|
||||||
not_me = 2
|
not_me = 2
|
||||||
|
|
||||||
|
|
||||||
class MachineState(IntEnum):
|
class MachineState(object):
|
||||||
"""
|
"""
|
||||||
This enum represents the different states a state machine can be in.
|
This enum represents the different states a state machine can be in.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -33,7 +33,7 @@ from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
|
||||||
Ibm866Model, Ibm855Model)
|
Ibm866Model, Ibm855Model)
|
||||||
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
|
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
|
||||||
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
|
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
|
||||||
from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
|
# from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
|
||||||
from .langthaimodel import TIS620ThaiModel
|
from .langthaimodel import TIS620ThaiModel
|
||||||
from .langhebrewmodel import Win1255HebrewModel
|
from .langhebrewmodel import Win1255HebrewModel
|
||||||
from .hebrewprober import HebrewProber
|
from .hebrewprober import HebrewProber
|
||||||
|
@ -63,9 +63,9 @@ class SBCSGroupProber(CharSetGroupProber):
|
||||||
]
|
]
|
||||||
hebrew_prober = HebrewProber()
|
hebrew_prober = HebrewProber()
|
||||||
logical_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel,
|
logical_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel,
|
||||||
False, hebrew_prober)
|
False, hebrew_prober)
|
||||||
visual_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel, True,
|
visual_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel, True,
|
||||||
hebrew_prober)
|
hebrew_prober)
|
||||||
hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)
|
hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)
|
||||||
self.probers.extend([hebrew_prober, logical_hebrew_prober,
|
self.probers.extend([hebrew_prober, logical_hebrew_prober,
|
||||||
visual_hebrew_prober])
|
visual_hebrew_prober])
|
||||||
|
|
|
@ -122,12 +122,10 @@ class UniversalDetector(object):
|
||||||
if byte_str.startswith(codecs.BOM_UTF8):
|
if byte_str.startswith(codecs.BOM_UTF8):
|
||||||
# EF BB BF UTF-8 with BOM
|
# EF BB BF UTF-8 with BOM
|
||||||
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
|
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
|
||||||
elif byte_str.startswith(codecs.BOM_UTF32_LE):
|
elif byte_str.startswith(codecs.BOM_UTF32_LE) or byte_str.startswith(codecs.BOM_UTF32_BE):
|
||||||
# FF FE 00 00 UTF-32, little-endian BOM
|
# FF FE 00 00 UTF-32, little-endian BOM
|
||||||
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
|
|
||||||
elif byte_str.startswith(codecs.BOM_UTF32_BE):
|
|
||||||
# 00 00 FE FF UTF-32, big-endian BOM
|
# 00 00 FE FF UTF-32, big-endian BOM
|
||||||
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
|
self.result = {'encoding': "UTF-32", 'confidence': 1.0}
|
||||||
elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
|
elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
|
||||||
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||||
self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
|
self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
|
||||||
|
@ -136,12 +134,10 @@ class UniversalDetector(object):
|
||||||
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||||
self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
|
self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
|
||||||
'confidence': 1.0}
|
'confidence': 1.0}
|
||||||
elif byte_str.startswith(codecs.BOM_LE):
|
elif byte_str.startswith(codecs.BOM_LE) or byte_str.startswith(codecs.BOM_BE):
|
||||||
# FF FE UTF-16, little endian BOM
|
# FF FE UTF-16, little endian BOM
|
||||||
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
|
|
||||||
elif byte_str.startswith(codecs.BOM_BE):
|
|
||||||
# FE FF UTF-16, big endian BOM
|
# FE FF UTF-16, big endian BOM
|
||||||
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}
|
self.result = {'encoding': "UTF-16", 'confidence': 1.0}
|
||||||
|
|
||||||
self._got_data = True
|
self._got_data = True
|
||||||
if self.result['encoding'] is not None:
|
if self.result['encoding'] is not None:
|
||||||
|
@ -207,7 +203,7 @@ class UniversalDetector(object):
|
||||||
return
|
return
|
||||||
self.done = True
|
self.done = True
|
||||||
|
|
||||||
if self._input_state == InputState.pure_ascii:
|
if self._input_state in (InputState.pure_ascii, InputState.esc_ascii):
|
||||||
self.result = {'encoding': 'ascii', 'confidence': 1.0}
|
self.result = {'encoding': 'ascii', 'confidence': 1.0}
|
||||||
return self.result
|
return self.result
|
||||||
|
|
||||||
|
@ -229,7 +225,7 @@ class UniversalDetector(object):
|
||||||
|
|
||||||
if self.logger.getEffectiveLevel() == logging.DEBUG:
|
if self.logger.getEffectiveLevel() == logging.DEBUG:
|
||||||
self.logger.debug('no probers hit minimum threshhold')
|
self.logger.debug('no probers hit minimum threshhold')
|
||||||
for prober in self._charset_probers[0].mProbers:
|
for prober in self._charset_probers[0].probers:
|
||||||
if not prober:
|
if not prober:
|
||||||
continue
|
continue
|
||||||
self.logger.debug('%s confidence = %s', prober.charset_name,
|
self.logger.debug('%s confidence = %s', prober.charset_name,
|
||||||
|
|
Loading…
Reference in a new issue