diff --git a/CHANGES.md b/CHANGES.md index ff3f362c..b62ec6c9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -16,6 +16,7 @@ * Remove "Manage Torrents" * Update Beautiful Soup 4.3.2 to 4.4.0 (r390) * Update dateutil library to 2.4.2 (083f666) +* Update chardet packages to 2.3.0 (26982c5) * Update Hachoir library 1.3.3 to 1.3.4 (r1383) * Change configure quiet option in Hachoir to suppress warnings (add ref:hacks.txt) * Add parse media content to determine quality before making final assumptions during re-scan, update, pp diff --git a/lib/chardet/charsetprober.py b/lib/chardet/charsetprober.py index 3967fc16..92dc57a1 100644 --- a/lib/chardet/charsetprober.py +++ b/lib/chardet/charsetprober.py @@ -28,7 +28,6 @@ import logging import re -from io import BytesIO from .enums import ProbingState @@ -79,16 +78,16 @@ class CharSetProber(object): This filter applies to all scripts which do not use English characters. """ - filtered = BytesIO() + filtered = bytearray() # This regex expression filters out only words that have at-least one # international character. The word may include one marker character at # the end. - words = re.findall( - b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?', buf) + words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?', + buf) for word in words: - filtered.write(word[:-1]) + filtered.extend(word[:-1]) # If the last character in the word is a marker, replace it with a # space as markers shouldn't affect our analysis (they are used @@ -97,9 +96,9 @@ class CharSetProber(object): last_char = word[-1:] if not last_char.isalpha() and last_char < b'\x80': last_char = b' ' - filtered.write(last_char) + filtered.extend(last_char) - return filtered.getvalue() + return filtered @staticmethod def filter_with_english_letters(buf): @@ -113,7 +112,7 @@ class CharSetProber(object): characters and extended ASCII characters, but is currently only used by ``Latin1Prober``. """ - filtered = BytesIO() + filtered = bytearray() in_tag = False prev = 0 @@ -132,15 +131,15 @@ class CharSetProber(object): if curr > prev and not in_tag: # Keep everything after last non-extended-ASCII, # non-alphabetic character - filtered.write(buf[prev:curr]) + filtered.extend(buf[prev:curr]) # Output a space to delimit stretch we kept - filtered.write(b' ') + filtered.extend(b' ') prev = curr + 1 # If we're not in a tag... if not in_tag: # Keep everything after last non-extended-ASCII, non-alphabetic # character - filtered.write(buf[prev:]) + filtered.extend(buf[prev:]) - return filtered.getvalue() + return filtered diff --git a/lib/chardet/enums.py b/lib/chardet/enums.py index ef5f0f13..f1fe20e8 100644 --- a/lib/chardet/enums.py +++ b/lib/chardet/enums.py @@ -1,16 +1,11 @@ """ All of the Enums that are used throughout the chardet package. -:author: Dan Blanchard (dblanchard@ets.org) +:author: Dan Blanchard (dan.blanchard@gmail.com) """ -try: - from enum import IntEnum -except ImportError: - from enum34 import IntEnum - -class InputState(IntEnum): +class InputState(object): """ This enum represents the different states a universal detector can be in. """ @@ -19,7 +14,7 @@ class InputState(IntEnum): high_byte = 2 -class LanguageFilter(IntEnum): +class LanguageFilter(object): """ This enum represents the different language filters we can apply to a ``UniversalDetector``. @@ -34,7 +29,7 @@ class LanguageFilter(IntEnum): cjk = chinese | japanese | korean -class ProbingState(IntEnum): +class ProbingState(object): """ This enum represents the different states a prober can be in. """ @@ -43,7 +38,7 @@ class ProbingState(IntEnum): not_me = 2 -class MachineState(IntEnum): +class MachineState(object): """ This enum represents the different states a state machine can be in. """ diff --git a/lib/chardet/sbcsgroupprober.py b/lib/chardet/sbcsgroupprober.py index 448a41d0..29bcc2ad 100644 --- a/lib/chardet/sbcsgroupprober.py +++ b/lib/chardet/sbcsgroupprober.py @@ -33,7 +33,7 @@ from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel, Ibm866Model, Ibm855Model) from .langgreekmodel import Latin7GreekModel, Win1253GreekModel from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel -from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel +# from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel from .langthaimodel import TIS620ThaiModel from .langhebrewmodel import Win1255HebrewModel from .hebrewprober import HebrewProber @@ -63,9 +63,9 @@ class SBCSGroupProber(CharSetGroupProber): ] hebrew_prober = HebrewProber() logical_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel, - False, hebrew_prober) + False, hebrew_prober) visual_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel, True, - hebrew_prober) + hebrew_prober) hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober) self.probers.extend([hebrew_prober, logical_hebrew_prober, visual_hebrew_prober]) diff --git a/lib/chardet/universaldetector.py b/lib/chardet/universaldetector.py index b843bc39..22cc7e7b 100644 --- a/lib/chardet/universaldetector.py +++ b/lib/chardet/universaldetector.py @@ -122,12 +122,10 @@ class UniversalDetector(object): if byte_str.startswith(codecs.BOM_UTF8): # EF BB BF UTF-8 with BOM self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0} - elif byte_str.startswith(codecs.BOM_UTF32_LE): + elif byte_str.startswith(codecs.BOM_UTF32_LE) or byte_str.startswith(codecs.BOM_UTF32_BE): # FF FE 00 00 UTF-32, little-endian BOM - self.result = {'encoding': "UTF-32LE", 'confidence': 1.0} - elif byte_str.startswith(codecs.BOM_UTF32_BE): # 00 00 FE FF UTF-32, big-endian BOM - self.result = {'encoding': "UTF-32BE", 'confidence': 1.0} + self.result = {'encoding': "UTF-32", 'confidence': 1.0} elif byte_str.startswith(b'\xFE\xFF\x00\x00'): # FE FF 00 00 UCS-4, unusual octet order BOM (3412) self.result = {'encoding': "X-ISO-10646-UCS-4-3412", @@ -136,12 +134,10 @@ class UniversalDetector(object): # 00 00 FF FE UCS-4, unusual octet order BOM (2143) self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0} - elif byte_str.startswith(codecs.BOM_LE): + elif byte_str.startswith(codecs.BOM_LE) or byte_str.startswith(codecs.BOM_BE): # FF FE UTF-16, little endian BOM - self.result = {'encoding': "UTF-16LE", 'confidence': 1.0} - elif byte_str.startswith(codecs.BOM_BE): # FE FF UTF-16, big endian BOM - self.result = {'encoding': "UTF-16BE", 'confidence': 1.0} + self.result = {'encoding': "UTF-16", 'confidence': 1.0} self._got_data = True if self.result['encoding'] is not None: @@ -207,7 +203,7 @@ class UniversalDetector(object): return self.done = True - if self._input_state == InputState.pure_ascii: + if self._input_state in (InputState.pure_ascii, InputState.esc_ascii): self.result = {'encoding': 'ascii', 'confidence': 1.0} return self.result @@ -229,7 +225,7 @@ class UniversalDetector(object): if self.logger.getEffectiveLevel() == logging.DEBUG: self.logger.debug('no probers hit minimum threshhold') - for prober in self._charset_probers[0].mProbers: + for prober in self._charset_probers[0].probers: if not prober: continue self.logger.debug('%s confidence = %s', prober.charset_name,