Merge pull request #514 from JackDandy/feature/UpdateChardet

Update chardet packages to 2.3.0
This commit is contained in:
JackDandy 2015-10-20 03:25:52 +01:00
commit 6752b47593
5 changed files with 26 additions and 35 deletions

View file

@ -16,6 +16,7 @@
* Remove "Manage Torrents" * Remove "Manage Torrents"
* Update Beautiful Soup 4.3.2 to 4.4.0 (r390) * Update Beautiful Soup 4.3.2 to 4.4.0 (r390)
* Update dateutil library to 2.4.2 (083f666) * Update dateutil library to 2.4.2 (083f666)
* Update chardet packages to 2.3.0 (26982c5)
* Update Hachoir library 1.3.3 to 1.3.4 (r1383) * Update Hachoir library 1.3.3 to 1.3.4 (r1383)
* Change configure quiet option in Hachoir to suppress warnings (add ref:hacks.txt) * Change configure quiet option in Hachoir to suppress warnings (add ref:hacks.txt)
* Add parse media content to determine quality before making final assumptions during re-scan, update, pp * Add parse media content to determine quality before making final assumptions during re-scan, update, pp

View file

@ -28,7 +28,6 @@
import logging import logging
import re import re
from io import BytesIO
from .enums import ProbingState from .enums import ProbingState
@ -79,16 +78,16 @@ class CharSetProber(object):
This filter applies to all scripts which do not use English characters. This filter applies to all scripts which do not use English characters.
""" """
filtered = BytesIO() filtered = bytearray()
# This regex expression filters out only words that have at-least one # This regex expression filters out only words that have at-least one
# international character. The word may include one marker character at # international character. The word may include one marker character at
# the end. # the end.
words = re.findall( words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?', buf) buf)
for word in words: for word in words:
filtered.write(word[:-1]) filtered.extend(word[:-1])
# If the last character in the word is a marker, replace it with a # If the last character in the word is a marker, replace it with a
# space as markers shouldn't affect our analysis (they are used # space as markers shouldn't affect our analysis (they are used
@ -97,9 +96,9 @@ class CharSetProber(object):
last_char = word[-1:] last_char = word[-1:]
if not last_char.isalpha() and last_char < b'\x80': if not last_char.isalpha() and last_char < b'\x80':
last_char = b' ' last_char = b' '
filtered.write(last_char) filtered.extend(last_char)
return filtered.getvalue() return filtered
@staticmethod @staticmethod
def filter_with_english_letters(buf): def filter_with_english_letters(buf):
@ -113,7 +112,7 @@ class CharSetProber(object):
characters and extended ASCII characters, but is currently only used by characters and extended ASCII characters, but is currently only used by
``Latin1Prober``. ``Latin1Prober``.
""" """
filtered = BytesIO() filtered = bytearray()
in_tag = False in_tag = False
prev = 0 prev = 0
@ -132,15 +131,15 @@ class CharSetProber(object):
if curr > prev and not in_tag: if curr > prev and not in_tag:
# Keep everything after last non-extended-ASCII, # Keep everything after last non-extended-ASCII,
# non-alphabetic character # non-alphabetic character
filtered.write(buf[prev:curr]) filtered.extend(buf[prev:curr])
# Output a space to delimit stretch we kept # Output a space to delimit stretch we kept
filtered.write(b' ') filtered.extend(b' ')
prev = curr + 1 prev = curr + 1
# If we're not in a tag... # If we're not in a tag...
if not in_tag: if not in_tag:
# Keep everything after last non-extended-ASCII, non-alphabetic # Keep everything after last non-extended-ASCII, non-alphabetic
# character # character
filtered.write(buf[prev:]) filtered.extend(buf[prev:])
return filtered.getvalue() return filtered

View file

@ -1,16 +1,11 @@
""" """
All of the Enums that are used throughout the chardet package. All of the Enums that are used throughout the chardet package.
:author: Dan Blanchard (dblanchard@ets.org) :author: Dan Blanchard (dan.blanchard@gmail.com)
""" """
try:
from enum import IntEnum
except ImportError:
from enum34 import IntEnum
class InputState(object):
class InputState(IntEnum):
""" """
This enum represents the different states a universal detector can be in. This enum represents the different states a universal detector can be in.
""" """
@ -19,7 +14,7 @@ class InputState(IntEnum):
high_byte = 2 high_byte = 2
class LanguageFilter(IntEnum): class LanguageFilter(object):
""" """
This enum represents the different language filters we can apply to a This enum represents the different language filters we can apply to a
``UniversalDetector``. ``UniversalDetector``.
@ -34,7 +29,7 @@ class LanguageFilter(IntEnum):
cjk = chinese | japanese | korean cjk = chinese | japanese | korean
class ProbingState(IntEnum): class ProbingState(object):
""" """
This enum represents the different states a prober can be in. This enum represents the different states a prober can be in.
""" """
@ -43,7 +38,7 @@ class ProbingState(IntEnum):
not_me = 2 not_me = 2
class MachineState(IntEnum): class MachineState(object):
""" """
This enum represents the different states a state machine can be in. This enum represents the different states a state machine can be in.
""" """

View file

@ -33,7 +33,7 @@ from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
Ibm866Model, Ibm855Model) Ibm866Model, Ibm855Model)
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel # from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
from .langthaimodel import TIS620ThaiModel from .langthaimodel import TIS620ThaiModel
from .langhebrewmodel import Win1255HebrewModel from .langhebrewmodel import Win1255HebrewModel
from .hebrewprober import HebrewProber from .hebrewprober import HebrewProber
@ -63,9 +63,9 @@ class SBCSGroupProber(CharSetGroupProber):
] ]
hebrew_prober = HebrewProber() hebrew_prober = HebrewProber()
logical_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel, logical_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel,
False, hebrew_prober) False, hebrew_prober)
visual_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel, True, visual_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel, True,
hebrew_prober) hebrew_prober)
hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober) hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)
self.probers.extend([hebrew_prober, logical_hebrew_prober, self.probers.extend([hebrew_prober, logical_hebrew_prober,
visual_hebrew_prober]) visual_hebrew_prober])

View file

@ -122,12 +122,10 @@ class UniversalDetector(object):
if byte_str.startswith(codecs.BOM_UTF8): if byte_str.startswith(codecs.BOM_UTF8):
# EF BB BF UTF-8 with BOM # EF BB BF UTF-8 with BOM
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0} self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
elif byte_str.startswith(codecs.BOM_UTF32_LE): elif byte_str.startswith(codecs.BOM_UTF32_LE) or byte_str.startswith(codecs.BOM_UTF32_BE):
# FF FE 00 00 UTF-32, little-endian BOM # FF FE 00 00 UTF-32, little-endian BOM
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
elif byte_str.startswith(codecs.BOM_UTF32_BE):
# 00 00 FE FF UTF-32, big-endian BOM # 00 00 FE FF UTF-32, big-endian BOM
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0} self.result = {'encoding': "UTF-32", 'confidence': 1.0}
elif byte_str.startswith(b'\xFE\xFF\x00\x00'): elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
# FE FF 00 00 UCS-4, unusual octet order BOM (3412) # FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {'encoding': "X-ISO-10646-UCS-4-3412", self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
@ -136,12 +134,10 @@ class UniversalDetector(object):
# 00 00 FF FE UCS-4, unusual octet order BOM (2143) # 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {'encoding': "X-ISO-10646-UCS-4-2143", self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
'confidence': 1.0} 'confidence': 1.0}
elif byte_str.startswith(codecs.BOM_LE): elif byte_str.startswith(codecs.BOM_LE) or byte_str.startswith(codecs.BOM_BE):
# FF FE UTF-16, little endian BOM # FF FE UTF-16, little endian BOM
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
elif byte_str.startswith(codecs.BOM_BE):
# FE FF UTF-16, big endian BOM # FE FF UTF-16, big endian BOM
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0} self.result = {'encoding': "UTF-16", 'confidence': 1.0}
self._got_data = True self._got_data = True
if self.result['encoding'] is not None: if self.result['encoding'] is not None:
@ -207,7 +203,7 @@ class UniversalDetector(object):
return return
self.done = True self.done = True
if self._input_state == InputState.pure_ascii: if self._input_state in (InputState.pure_ascii, InputState.esc_ascii):
self.result = {'encoding': 'ascii', 'confidence': 1.0} self.result = {'encoding': 'ascii', 'confidence': 1.0}
return self.result return self.result
@ -229,7 +225,7 @@ class UniversalDetector(object):
if self.logger.getEffectiveLevel() == logging.DEBUG: if self.logger.getEffectiveLevel() == logging.DEBUG:
self.logger.debug('no probers hit minimum threshhold') self.logger.debug('no probers hit minimum threshhold')
for prober in self._charset_probers[0].mProbers: for prober in self._charset_probers[0].probers:
if not prober: if not prober:
continue continue
self.logger.debug('%s confidence = %s', prober.charset_name, self.logger.debug('%s confidence = %s', prober.charset_name,