Merge pull request #514 from JackDandy/feature/UpdateChardet

Update chardet packages to 2.3.0
This commit is contained in:
JackDandy 2015-10-20 03:25:52 +01:00
commit 6752b47593
5 changed files with 26 additions and 35 deletions

View file

@ -16,6 +16,7 @@
* Remove "Manage Torrents"
* Update Beautiful Soup 4.3.2 to 4.4.0 (r390)
* Update dateutil library to 2.4.2 (083f666)
* Update chardet packages to 2.3.0 (26982c5)
* Update Hachoir library 1.3.3 to 1.3.4 (r1383)
* Change configure quiet option in Hachoir to suppress warnings (add ref:hacks.txt)
* Add parsing of media content to determine quality before making final assumptions during re-scan, update, and pp

View file

@ -28,7 +28,6 @@
import logging
import re
from io import BytesIO
from .enums import ProbingState
@ -79,16 +78,16 @@ class CharSetProber(object):
This filter applies to all scripts which do not use English characters.
"""
filtered = BytesIO()
filtered = bytearray()
# This regular expression extracts only the words that contain at least
# one international character. A word may include one marker character at
# the end.
words = re.findall(
b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?', buf)
words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
buf)
for word in words:
filtered.write(word[:-1])
filtered.extend(word[:-1])
# If the last character in the word is a marker, replace it with a
# space as markers shouldn't affect our analysis (they are used
@ -97,9 +96,9 @@ class CharSetProber(object):
last_char = word[-1:]
if not last_char.isalpha() and last_char < b'\x80':
last_char = b' '
filtered.write(last_char)
filtered.extend(last_char)
return filtered.getvalue()
return filtered
@staticmethod
def filter_with_english_letters(buf):
@ -113,7 +112,7 @@ class CharSetProber(object):
characters and extended ASCII characters, but is currently only used by
``Latin1Prober``.
"""
filtered = BytesIO()
filtered = bytearray()
in_tag = False
prev = 0
@ -132,15 +131,15 @@ class CharSetProber(object):
if curr > prev and not in_tag:
# Keep everything after last non-extended-ASCII,
# non-alphabetic character
filtered.write(buf[prev:curr])
filtered.extend(buf[prev:curr])
# Output a space to delimit stretch we kept
filtered.write(b' ')
filtered.extend(b' ')
prev = curr + 1
# If we're not in a tag...
if not in_tag:
# Keep everything after last non-extended-ASCII, non-alphabetic
# character
filtered.write(buf[prev:])
filtered.extend(buf[prev:])
return filtered.getvalue()
return filtered

View file

@ -1,16 +1,11 @@
"""
All of the Enums that are used throughout the chardet package.
:author: Dan Blanchard (dblanchard@ets.org)
:author: Dan Blanchard (dan.blanchard@gmail.com)
"""
try:
from enum import IntEnum
except ImportError:
from enum34 import IntEnum
class InputState(IntEnum):
class InputState(object):
"""
This enum represents the different states a universal detector can be in.
"""
@ -19,7 +14,7 @@ class InputState(IntEnum):
high_byte = 2
class LanguageFilter(IntEnum):
class LanguageFilter(object):
"""
This enum represents the different language filters we can apply to a
``UniversalDetector``.
@ -34,7 +29,7 @@ class LanguageFilter(IntEnum):
cjk = chinese | japanese | korean
class ProbingState(IntEnum):
class ProbingState(object):
"""
This enum represents the different states a prober can be in.
"""
@ -43,7 +38,7 @@ class ProbingState(IntEnum):
not_me = 2
class MachineState(IntEnum):
class MachineState(object):
"""
This enum represents the different states a state machine can be in.
"""

View file

@ -33,7 +33,7 @@ from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
Ibm866Model, Ibm855Model)
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
# from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
from .langthaimodel import TIS620ThaiModel
from .langhebrewmodel import Win1255HebrewModel
from .hebrewprober import HebrewProber

View file

@ -122,12 +122,10 @@ class UniversalDetector(object):
if byte_str.startswith(codecs.BOM_UTF8):
# EF BB BF UTF-8 with BOM
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
elif byte_str.startswith(codecs.BOM_UTF32_LE):
elif byte_str.startswith(codecs.BOM_UTF32_LE) or byte_str.startswith(codecs.BOM_UTF32_BE):
# FF FE 00 00 (little-endian) or 00 00 FE FF (big-endian) UTF-32 BOM
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
elif byte_str.startswith(codecs.BOM_UTF32_BE):
# 00 00 FE FF UTF-32, big-endian BOM
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
self.result = {'encoding': "UTF-32", 'confidence': 1.0}
elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
@ -136,12 +134,10 @@ class UniversalDetector(object):
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
'confidence': 1.0}
elif byte_str.startswith(codecs.BOM_LE):
elif byte_str.startswith(codecs.BOM_LE) or byte_str.startswith(codecs.BOM_BE):
# FF FE (little-endian) or FE FF (big-endian) UTF-16 BOM
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
elif byte_str.startswith(codecs.BOM_BE):
# FE FF UTF-16, big endian BOM
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}
self.result = {'encoding': "UTF-16", 'confidence': 1.0}
self._got_data = True
if self.result['encoding'] is not None:
@ -207,7 +203,7 @@ class UniversalDetector(object):
return
self.done = True
if self._input_state == InputState.pure_ascii:
if self._input_state in (InputState.pure_ascii, InputState.esc_ascii):
self.result = {'encoding': 'ascii', 'confidence': 1.0}
return self.result
@ -229,7 +225,7 @@ class UniversalDetector(object):
if self.logger.getEffectiveLevel() == logging.DEBUG:
self.logger.debug('no probers hit minimum threshhold')
for prober in self._charset_probers[0].mProbers:
for prober in self._charset_probers[0].probers:
if not prober:
continue
self.logger.debug('%s confidence = %s', prober.charset_name,