Merge pull request #977 from JackDandy/feature/UpdateChardet

Update chardet packages 2.3.0 (d7fae98) to 3.0.4 (9b8c5c2).
This commit is contained in:
JackDandy 2017-08-26 00:29:26 +01:00 committed by GitHub
commit cf383de226
35 changed files with 486 additions and 329 deletions

View file

@ -13,6 +13,7 @@
* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439)
* Update cachecontrol library 0.11.5 to 0.12.3 (db54c40)
* Update Certifi 2015.11.20.1 (385476b) to 2017.07.27 (f808089)
* Update chardet packages 2.3.0 (d7fae98) to 3.0.4 (9b8c5c2)
* Update dateutil library 2.4.2 (d4baf97) to 2.6.1 (2f3a160)
* Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb)
* Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72)

View file

@ -16,17 +16,24 @@
######################### END LICENSE BLOCK #########################
from .compat import PY2, PY3, bin_type as _bin_type
from .compat import PY2, PY3
from .universaldetector import UniversalDetector
from .version import __version__, VERSION
def detect(byte_str):
if not isinstance(byte_str, _bin_type):
raise TypeError('Expected object of {0} type, got: {1}'
''.format(_bin_type, type(byte_str)))
"""
Detect the encoding of the given byte string.
u = UniversalDetector()
u.feed(byte_str)
u.close()
return u.result
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{0}'.format(type(byte_str)))
else:
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
return detector.close()

View file

@ -41,3 +41,7 @@ class Big5Prober(MultiByteCharSetProber):
@property
def charset_name(self):
return "Big5"
@property
def language(self):
return "Chinese"

View file

@ -35,7 +35,6 @@ from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE,
BIG5_TYPICAL_DISTRIBUTION_RATIO)
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO)
from .compat import wrap_ord
class CharDistributionAnalysis(object):
@ -123,9 +122,9 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
# first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char = wrap_ord(byte_str[0])
first_char = byte_str[0]
if first_char >= 0xC4:
return 94 * (first_char - 0xC4) + wrap_ord(byte_str[1]) - 0xA1
return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
else:
return -1
@ -142,9 +141,9 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char = wrap_ord(byte_str[0])
first_char = byte_str[0]
if first_char >= 0xB0:
return 94 * (first_char - 0xB0) + wrap_ord(byte_str[1]) - 0xA1
return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
else:
return -1
@ -161,7 +160,7 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0xB0) and (second_char >= 0xA1):
return 94 * (first_char - 0xB0) + second_char - 0xA1
else:
@ -180,7 +179,7 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
# first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
first_char, second_char = byte_str[0], byte_str[1]
if first_char >= 0xA4:
if second_char >= 0xA1:
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
@ -202,7 +201,7 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0x81) and (first_char <= 0x9F):
order = 188 * (first_char - 0x81)
elif (first_char >= 0xE0) and (first_char <= 0xEF):
@ -227,8 +226,8 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
# first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
char = wrap_ord(byte_str[0])
char = byte_str[0]
if char >= 0xA0:
return 94 * (char - 0xA1) + wrap_ord(byte_str[1]) - 0xa1
return 94 * (char - 0xA1) + byte_str[1] - 0xa1
else:
return -1

View file

@ -54,6 +54,14 @@ class CharSetGroupProber(CharSetProber):
return None
return self._best_guess_prober.charset_name
@property
def language(self):
if not self._best_guess_prober:
self.get_confidence()
if not self._best_guess_prober:
return None
return self._best_guess_prober.language
def feed(self, byte_str):
for prober in self.probers:
if not prober:
@ -63,22 +71,22 @@ class CharSetGroupProber(CharSetProber):
state = prober.feed(byte_str)
if not state:
continue
if state == ProbingState.found_it:
if state == ProbingState.FOUND_IT:
self._best_guess_prober = prober
return self.state
elif state == ProbingState.not_me:
elif state == ProbingState.NOT_ME:
prober.active = False
self._active_num -= 1
if self._active_num <= 0:
self._state = ProbingState.not_me
self._state = ProbingState.NOT_ME
return self.state
return self.state
def get_confidence(self):
state = self.state
if state == ProbingState.found_it:
if state == ProbingState.FOUND_IT:
return 0.99
elif state == ProbingState.not_me:
elif state == ProbingState.NOT_ME:
return 0.01
best_conf = 0.0
self._best_guess_prober = None
@ -89,7 +97,7 @@ class CharSetGroupProber(CharSetProber):
self.logger.debug('%s not active', prober.charset_name)
continue
conf = prober.get_confidence()
self.logger.debug('%s confidence = %s', prober.charset_name, conf)
self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
if best_conf < conf:
best_conf = conf
self._best_guess_prober = prober

View file

@ -42,7 +42,7 @@ class CharSetProber(object):
self.logger = logging.getLogger(__name__)
def reset(self):
self._state = ProbingState.detecting
self._state = ProbingState.DETECTING
@property
def charset_name(self):

View file

@ -17,15 +17,12 @@ from __future__ import absolute_import, print_function, unicode_literals
import argparse
import sys
from io import open
from chardet import __version__
from chardet.compat import PY2
from chardet.universaldetector import UniversalDetector
def description_of(lines, name='stdin'):
"""
Return a string describing the probable encoding of a file or
@ -38,7 +35,11 @@ def description_of(lines, name='stdin'):
"""
u = UniversalDetector()
for line in lines:
line = bytearray(line)
u.feed(line)
# shortcut out of the loop to save reading further - particularly useful if we read a BOM.
if u.done:
break
u.close()
result = u.result
if PY2:

View file

@ -28,7 +28,6 @@
import logging
from .enums import MachineState
from .compat import wrap_ord
class CodingStateMachine(object):
@ -62,13 +61,13 @@ class CodingStateMachine(object):
self.reset()
def reset(self):
self._curr_state = MachineState.start
self._curr_state = MachineState.START
def next_state(self, c):
# for each byte we get its class
# if it is first byte, we also get byte length
byte_class = self._model['class_table'][wrap_ord(c)]
if self._curr_state == MachineState.start:
byte_class = self._model['class_table'][c]
if self._curr_state == MachineState.START:
self._curr_byte_pos = 0
self._curr_char_len = self._model['char_len_table'][byte_class]
# from byte's class and state_table, we get its next state
@ -83,3 +82,7 @@ class CodingStateMachine(object):
def get_coding_state_machine(self):
return self._model['name']
@property
def language(self):
return self._model['language']

View file

@ -27,17 +27,8 @@ if sys.version_info < (3, 0):
PY3 = False
base_str = (str, unicode)
text_type = unicode
bin_type = str
else:
PY2 = False
PY3 = True
base_str = (bytes, str)
text_type = str
bin_type = (bytes, bytearray)
def wrap_ord(a):
if PY2 and isinstance(a, base_str):
return ord(a)
else:
return a

View file

@ -43,3 +43,7 @@ class CP949Prober(MultiByteCharSetProber):
@property
def charset_name(self):
return "CP949"
@property
def language(self):
return "Korean"

View file

@ -9,9 +9,9 @@ class InputState(object):
"""
This enum represents the different states a universal detector can be in.
"""
pure_ascii = 0
esc_ascii = 1
high_byte = 2
PURE_ASCII = 0
ESC_ASCII = 1
HIGH_BYTE = 2
class LanguageFilter(object):
@ -19,29 +19,58 @@ class LanguageFilter(object):
This enum represents the different language filters we can apply to a
``UniversalDetector``.
"""
chinese_simplified = 0x01
chinese_traditional = 0x02
japanese = 0x04
korean = 0x08
non_cjk = 0x10
all = 0x1F
chinese = chinese_simplified | chinese_traditional
cjk = chinese | japanese | korean
CHINESE_SIMPLIFIED = 0x01
CHINESE_TRADITIONAL = 0x02
JAPANESE = 0x04
KOREAN = 0x08
NON_CJK = 0x10
ALL = 0x1F
CHINESE = CHINESE_SIMPLIFIED | CHINESE_TRADITIONAL
CJK = CHINESE | JAPANESE | KOREAN
class ProbingState(object):
"""
This enum represents the different states a prober can be in.
"""
detecting = 0
found_it = 1
not_me = 2
DETECTING = 0
FOUND_IT = 1
NOT_ME = 2
class MachineState(object):
"""
This enum represents the different states a state machine can be in.
"""
start = 0
error = 1
its_me = 2
START = 0
ERROR = 1
ITS_ME = 2
class SequenceLikelihood(object):
"""
This enum represents the likelihood of a character following the previous one.
"""
NEGATIVE = 0
UNLIKELY = 1
LIKELY = 2
POSITIVE = 3
@classmethod
def get_num_categories(cls):
""":returns: The number of likelihood categories in the enum."""
return 4
class CharacterCategory(object):
"""
This enum represents the different categories language models for
``SingleByteCharsetProber`` put characters into.
Anything less than CONTROL is considered a letter.
"""
UNDEFINED = 255
LINE_BREAK = 254
SYMBOL = 253
DIGIT = 252
CONTROL = 251

View file

@ -27,7 +27,6 @@
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .compat import wrap_ord
from .enums import LanguageFilter, ProbingState, MachineState
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
ISO2022KR_SM_MODEL)
@ -43,15 +42,16 @@ class EscCharSetProber(CharSetProber):
def __init__(self, lang_filter=None):
super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
self.coding_sm = []
if self.lang_filter & LanguageFilter.chinese_simplified:
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL))
if self.lang_filter & LanguageFilter.japanese:
if self.lang_filter & LanguageFilter.JAPANESE:
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
if self.lang_filter & LanguageFilter.korean:
if self.lang_filter & LanguageFilter.KOREAN:
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
self.active_sm_count = None
self._detected_charset = None
self._detected_language = None
self._state = None
self.reset()
@ -64,11 +64,16 @@ class EscCharSetProber(CharSetProber):
coding_sm.reset()
self.active_sm_count = len(self.coding_sm)
self._detected_charset = None
self._detected_language = None
@property
def charset_name(self):
return self._detected_charset
@property
def language(self):
return self._detected_language
def get_confidence(self):
if self._detected_charset:
return 0.99
@ -80,16 +85,17 @@ class EscCharSetProber(CharSetProber):
for coding_sm in self.coding_sm:
if not coding_sm or not coding_sm.active:
continue
coding_state = coding_sm.next_state(wrap_ord(c))
if coding_state == MachineState.error:
coding_state = coding_sm.next_state(c)
if coding_state == MachineState.ERROR:
coding_sm.active = False
self.active_sm_count -= 1
if self.active_sm_count <= 0:
self._state = ProbingState.not_me
self._state = ProbingState.NOT_ME
return self.state
elif coding_state == MachineState.its_me:
self._state = ProbingState.found_it
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
self._detected_charset = coding_sm.get_coding_state_machine()
self._detected_language = coding_sm.language
return self.state
return self.state

View file

@ -63,12 +63,12 @@ HZ_CLS = (
)
HZ_ST = (
MachineState.start,MachineState.error, 3,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,# 00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 08-0f
MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start, 4,MachineState.error,# 10-17
5,MachineState.error, 6,MachineState.error, 5, 5, 4,MachineState.error,# 18-1f
4,MachineState.error, 4, 4, 4,MachineState.error, 4,MachineState.error,# 20-27
4,MachineState.its_me,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 28-2f
MachineState.START,MachineState.ERROR, 3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START, 4,MachineState.ERROR,# 10-17
5,MachineState.ERROR, 6,MachineState.ERROR, 5, 5, 4,MachineState.ERROR,# 18-1f
4,MachineState.ERROR, 4, 4, 4,MachineState.ERROR, 4,MachineState.ERROR,# 20-27
4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f
)
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
@ -77,7 +77,8 @@ HZ_SM_MODEL = {'class_table': HZ_CLS,
'class_factor': 6,
'state_table': HZ_ST,
'char_len_table': HZ_CHAR_LEN_TABLE,
'name': "HZ-GB-2312"}
'name': "HZ-GB-2312",
'language': 'Chinese'}
ISO2022CN_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
@ -115,14 +116,14 @@ ISO2022CN_CLS = (
)
ISO2022CN_ST = (
MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 00-07
MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 08-0f
MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 10-17
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error, 4,MachineState.error,# 18-1f
MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 20-27
5, 6,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 28-2f
MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 30-37
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.start,# 38-3f
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,# 18-1f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27
5, 6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f
)
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
@ -131,7 +132,8 @@ ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS,
'class_factor': 9,
'state_table': ISO2022CN_ST,
'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
'name': "ISO-2022-CN"}
'name': "ISO-2022-CN",
'language': 'Chinese'}
ISO2022JP_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
@ -169,15 +171,15 @@ ISO2022JP_CLS = (
)
ISO2022JP_ST = (
MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 00-07
MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 08-0f
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 10-17
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,# 18-1f
MachineState.error, 5,MachineState.error,MachineState.error,MachineState.error, 4,MachineState.error,MachineState.error,# 20-27
MachineState.error,MachineState.error,MachineState.error, 6,MachineState.its_me,MachineState.error,MachineState.its_me,MachineState.error,# 28-2f
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,# 30-37
MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 38-3f
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.start,MachineState.start,# 40-47
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f
MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 20-27
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47
)
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
@ -186,7 +188,8 @@ ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS,
'class_factor': 10,
'state_table': ISO2022JP_ST,
'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
'name': "ISO-2022-JP"}
'name': "ISO-2022-JP",
'language': 'Japanese'}
ISO2022KR_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
@ -224,11 +227,11 @@ ISO2022KR_CLS = (
)
ISO2022KR_ST = (
MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,# 00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 08-0f
MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error, 4,MachineState.error,MachineState.error,# 10-17
MachineState.error,MachineState.error,MachineState.error,MachineState.error, 5,MachineState.error,MachineState.error,MachineState.error,# 18-1f
MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 20-27
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 10-17
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27
)
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
@ -237,6 +240,7 @@ ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS,
'class_factor': 6,
'state_table': ISO2022KR_ST,
'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
'name': "ISO-2022-KR"}
'name': "ISO-2022-KR",
'language': 'Korean'}

View file

@ -49,19 +49,23 @@ class EUCJPProber(MultiByteCharSetProber):
def charset_name(self):
return "EUC-JP"
@property
def language(self):
return "Japanese"
def feed(self, byte_str):
for i in range(len(byte_str)):
# PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.error:
self.logger.debug('%s prober hit error at byte %s',
self.charset_name, i)
self._state = ProbingState.not_me
if coding_state == MachineState.ERROR:
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.its_me:
self._state = ProbingState.found_it
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif coding_state == MachineState.start:
elif coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen()
if i == 0:
self._last_char[1] = byte_str[0]
@ -75,10 +79,10 @@ class EUCJPProber(MultiByteCharSetProber):
self._last_char[0] = byte_str[-1]
if self.state == ProbingState.detecting:
if self.state == ProbingState.DETECTING:
if (self.context_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
self._state = ProbingState.found_it
self._state = ProbingState.FOUND_IT
return self.state

View file

@ -41,3 +41,7 @@ class EUCKRProber(MultiByteCharSetProber):
@property
def charset_name(self):
return "EUC-KR"
@property
def language(self):
return "Korean"

View file

@ -44,7 +44,7 @@
EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
# Char to FreqOrder table
EUCTW_TABLE_SIZE = 8102
EUCTW_TABLE_SIZE = 5376
EUCTW_CHAR_TO_FREQ_ORDER = (
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742

View file

@ -40,3 +40,7 @@ class EUCTWProber(MultiByteCharSetProber):
@property
def charset_name(self):
return "EUC-TW"
@property
def language(self):
return "Taiwan"

View file

@ -40,3 +40,7 @@ class GB2312Prober(MultiByteCharSetProber):
@property
def charset_name(self):
return "GB2312"
@property
def language(self):
return "Chinese"

View file

@ -27,7 +27,6 @@
from .charsetprober import CharSetProber
from .enums import ProbingState
from .compat import wrap_ord
# This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers
@ -177,7 +176,7 @@ class HebrewProber(CharSetProber):
self._visual_prober = visualProber
def is_final(self, c):
return wrap_ord(c) in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
self.FINAL_PE, self.FINAL_TSADI]
def is_non_final(self, c):
@ -191,7 +190,7 @@ class HebrewProber(CharSetProber):
# for example legally end with a Non-Final Pe or Kaf. However, the
# benefit of these letters as Non-Final letters outweighs the damage
# since these words are quite rare.
return wrap_ord(c) in [self.NORMAL_KAF, self.NORMAL_MEM,
return c in [self.NORMAL_KAF, self.NORMAL_MEM,
self.NORMAL_NUN, self.NORMAL_PE]
def feed(self, byte_str):
@ -221,9 +220,9 @@ class HebrewProber(CharSetProber):
# We automatically filter out all 7-bit characters (replace them with
# spaces) so the word boundary detection works properly. [MAP]
if self.state == ProbingState.not_me:
if self.state == ProbingState.NOT_ME:
# Both model probers say it's not them. No reason to continue.
return ProbingState.not_me
return ProbingState.NOT_ME
byte_str = self.filter_high_byte_only(byte_str)
@ -250,8 +249,8 @@ class HebrewProber(CharSetProber):
self._prev = cur
# Forever detecting, till the end or until both model probers return
# ProbingState.not_me (handled above)
return ProbingState.detecting
# ProbingState.NOT_ME (handled above)
return ProbingState.DETECTING
@property
def charset_name(self):
@ -280,10 +279,14 @@ class HebrewProber(CharSetProber):
# Logical.
return self.LOGICAL_HEBREW_NAME
@property
def language(self):
return 'Hebrew'
@property
def state(self):
# Remain active as long as any of the model probers are active.
if (self._logical_prober.state == ProbingState.not_me) and \
(self._visual_prober.state == ProbingState.not_me):
return ProbingState.not_me
return ProbingState.detecting
if (self._logical_prober.state == ProbingState.NOT_ME) and \
(self._visual_prober.state == ProbingState.NOT_ME):
return ProbingState.NOT_ME
return ProbingState.DETECTING

View file

@ -25,7 +25,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .compat import wrap_ord
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
jp2CharContext = (
@ -194,7 +193,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
if not byte_str:
return -1, 1
# find out current char's byte length
first_char = wrap_ord(byte_str[0])
first_char = byte_str[0]
if (0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC):
char_len = 2
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
@ -204,7 +203,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
# return its order if it is hiragana
if len(byte_str) > 1:
second_char = wrap_ord(byte_str[1])
second_char = byte_str[1]
if (first_char == 202) and (0x9F <= second_char <= 0xF1):
return second_char - 0x9F, char_len
@ -215,7 +214,7 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis):
if not byte_str:
return -1, 1
# find out current char's byte length
first_char = wrap_ord(byte_str[0])
first_char = byte_str[0]
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
char_len = 2
elif first_char == 0x8F:
@ -225,7 +224,7 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis):
# return its order if it is hiragana
if len(byte_str) > 1:
second_char = wrap_ord(byte_str[1])
second_char = byte_str[1]
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
return second_char - 0xA1, char_len

View file

@ -214,7 +214,8 @@ Latin5BulgarianModel = {
'precedence_matrix': BulgarianLangModel,
'typical_positive_ratio': 0.969392,
'keep_english_letter': False,
'charset_name': "ISO-8859-5"
'charset_name': "ISO-8859-5",
'language': 'Bulgarian',
}
Win1251BulgarianModel = {
@ -222,8 +223,6 @@ Win1251BulgarianModel = {
'precedence_matrix': BulgarianLangModel,
'typical_positive_ratio': 0.969392,
'keep_english_letter': False,
'charset_name': "windows-1251"
'charset_name': "windows-1251",
'language': 'Bulgarian',
}

View file

@ -283,7 +283,8 @@ Koi8rModel = {
'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601,
'keep_english_letter': False,
'charset_name': "KOI8-R"
'charset_name': "KOI8-R",
'language': 'Russian',
}
Win1251CyrillicModel = {
@ -291,7 +292,8 @@ Win1251CyrillicModel = {
'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601,
'keep_english_letter': False,
'charset_name': "windows-1251"
'charset_name': "windows-1251",
'language': 'Russian',
}
Latin5CyrillicModel = {
@ -299,7 +301,8 @@ Latin5CyrillicModel = {
'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601,
'keep_english_letter': False,
'charset_name': "ISO-8859-5"
'charset_name': "ISO-8859-5",
'language': 'Russian',
}
MacCyrillicModel = {
@ -307,7 +310,8 @@ MacCyrillicModel = {
'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601,
'keep_english_letter': False,
'charset_name': "MacCyrillic"
'charset_name': "MacCyrillic",
'language': 'Russian',
}
Ibm866Model = {
@ -315,7 +319,8 @@ Ibm866Model = {
'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601,
'keep_english_letter': False,
'charset_name': "IBM866"
'charset_name': "IBM866",
'language': 'Russian',
}
Ibm855Model = {
@ -323,7 +328,6 @@ Ibm855Model = {
'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601,
'keep_english_letter': False,
'charset_name': "IBM855"
'charset_name': "IBM855",
'language': 'Russian',
}

View file

@ -211,7 +211,8 @@ Latin7GreekModel = {
'precedence_matrix': GreekLangModel,
'typical_positive_ratio': 0.982851,
'keep_english_letter': False,
'charset_name': "ISO-8859-7"
'charset_name': "ISO-8859-7",
'language': 'Greek',
}
Win1253GreekModel = {
@ -219,7 +220,6 @@ Win1253GreekModel = {
'precedence_matrix': GreekLangModel,
'typical_positive_ratio': 0.982851,
'keep_english_letter': False,
'charset_name': "windows-1253"
'charset_name': "windows-1253",
'language': 'Greek',
}

View file

@ -195,7 +195,6 @@ Win1255HebrewModel = {
'precedence_matrix': HEBREW_LANG_MODEL,
'typical_positive_ratio': 0.984004,
'keep_english_letter': False,
'charset_name': "windows-1255"
'charset_name': "windows-1255",
'language': 'Hebrew',
}

View file

@ -211,7 +211,8 @@ Latin2HungarianModel = {
'precedence_matrix': HungarianLangModel,
'typical_positive_ratio': 0.947368,
'keep_english_letter': True,
'charset_name': "ISO-8859-2"
'charset_name': "ISO-8859-2",
'language': 'Hungarian',
}
Win1250HungarianModel = {
@ -219,7 +220,6 @@ Win1250HungarianModel = {
'precedence_matrix': HungarianLangModel,
'typical_positive_ratio': 0.947368,
'keep_english_letter': True,
'charset_name': "windows-1250"
'charset_name': "windows-1250",
'language': 'Hungarian',
}

View file

@ -194,7 +194,6 @@ TIS620ThaiModel = {
'precedence_matrix': ThaiLangModel,
'typical_positive_ratio': 0.926386,
'keep_english_letter': False,
'charset_name': "TIS-620"
'charset_name': "TIS-620",
'language': 'Thai',
}

View file

@ -188,5 +188,6 @@ Latin5TurkishModel = {
'precedence_matrix': TurkishLangModel,
'typical_positive_ratio': 0.970290,
'keep_english_letter': True,
'charset_name': "ISO-8859-9"
'charset_name': "ISO-8859-9",
'language': 'Turkish',
}

View file

@ -27,7 +27,6 @@
######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber
from .compat import wrap_ord
from .enums import ProbingState
FREQ_CAT_NUM = 4
@ -108,16 +107,20 @@ class Latin1Prober(CharSetProber):
@property
def charset_name(self):
return "windows-1252"
return "ISO-8859-1"
@property
def language(self):
return ""
def feed(self, byte_str):
byte_str = self.filter_with_english_letters(byte_str)
for c in byte_str:
char_class = Latin1_CharToClass[wrap_ord(c)]
char_class = Latin1_CharToClass[c]
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
+ char_class]
if freq == 0:
self._state = ProbingState.not_me
self._state = ProbingState.NOT_ME
break
self._freq_counter[freq] += 1
self._last_char_class = char_class
@ -125,7 +128,7 @@ class Latin1Prober(CharSetProber):
return self.state
def get_confidence(self):
if self.state == ProbingState.not_me:
if self.state == ProbingState.NOT_ME:
return 0.01
total = sum(self._freq_counter)

View file

@ -52,20 +52,24 @@ class MultiByteCharSetProber(CharSetProber):
@property
def charset_name(self):
pass
raise NotImplementedError
@property
def language(self):
raise NotImplementedError
def feed(self, byte_str):
for i in range(len(byte_str)):
coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.error:
self.logger.debug('%s prober hit error at byte %s',
self.charset_name, i)
self._state = ProbingState.not_me
if coding_state == MachineState.ERROR:
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.its_me:
self._state = ProbingState.found_it
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif coding_state == MachineState.start:
elif coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen()
if i == 0:
self._last_char[1] = byte_str[0]
@ -76,10 +80,10 @@ class MultiByteCharSetProber(CharSetProber):
self._last_char[0] = byte_str[-1]
if self.state == ProbingState.detecting:
if self.state == ProbingState.DETECTING:
if (self.distribution_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
self._state = ProbingState.found_it
self._state = ProbingState.FOUND_IT
return self.state

View file

@ -65,9 +65,9 @@ BIG5_CLS = (
)
BIG5_ST = (
MachineState.error,MachineState.start,MachineState.start, 3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07
MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,#08-0f
MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start#10-17
MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17
)
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
@ -101,13 +101,13 @@ CP949_CLS = (
CP949_ST = (
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
MachineState.error,MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start, 4, 5,MachineState.error, 6, # MachineState.start
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, # MachineState.error
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me, # MachineState.its_me
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start, # 3
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, # 4
MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, # 5
MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start, # 6
MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START, 4, 5,MachineState.ERROR, 6, # MachineState.START
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, # MachineState.ERROR
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # MachineState.ITS_ME
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 3
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 4
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6
)
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
@ -156,11 +156,11 @@ EUCJP_CLS = (
)
EUCJP_ST = (
3, 4, 3, 5,MachineState.start,MachineState.error,MachineState.error,MachineState.error,#00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f
MachineState.its_me,MachineState.its_me,MachineState.start,MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error,#10-17
MachineState.error,MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error, 3,MachineState.error,#18-1f
3,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start#20-27
3, 4, 3, 5,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f
3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27
)
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
@ -209,8 +209,8 @@ EUCKR_CLS = (
)
EUCKR_ST = (
MachineState.error,MachineState.start, 3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start #08-0f
MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f
)
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
@ -259,12 +259,12 @@ EUCTW_CLS = (
)
EUCTW_ST = (
MachineState.error,MachineState.error,MachineState.start, 3, 3, 3, 4,MachineState.error,#00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,#08-0f
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.start,MachineState.error,#10-17
MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f
5,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.error,MachineState.start,MachineState.start,#20-27
MachineState.start,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start #28-2f
MachineState.ERROR,MachineState.ERROR,MachineState.START, 3, 3, 3, 4,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.ERROR,#10-17
MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27
MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
)
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
@ -313,12 +313,12 @@ GB2312_CLS = (
)
GB2312_ST = (
MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, 3,MachineState.error,#00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,#08-0f
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,#10-17
4,MachineState.error,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f
MachineState.error,MachineState.error, 5,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,#20-27
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start #28-2f
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, 3,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,#10-17
4,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
)
# To be accurate, the length of class 6 can be either 2 or 4.
@ -374,9 +374,9 @@ SJIS_CLS = (
SJIS_ST = (
MachineState.error,MachineState.start,MachineState.start, 3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f
MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start #10-17
MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17
)
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
@ -425,13 +425,13 @@ UCS2BE_CLS = (
)
UCS2BE_ST = (
5, 7, 7,MachineState.error, 4, 3,MachineState.error,MachineState.error,#00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f
MachineState.its_me,MachineState.its_me, 6, 6, 6, 6,MachineState.error,MachineState.error,#10-17
6, 6, 6, 6, 6,MachineState.its_me, 6, 6,#18-1f
6, 6, 6, 6, 5, 7, 7,MachineState.error,#20-27
5, 8, 6, 6,MachineState.error, 6, 6, 6,#28-2f
6, 6, 6, 6,MachineState.error,MachineState.error,MachineState.start,MachineState.start #30-37
5, 7, 7,MachineState.ERROR, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME, 6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,#10-17
6, 6, 6, 6, 6,MachineState.ITS_ME, 6, 6,#18-1f
6, 6, 6, 6, 5, 7, 7,MachineState.ERROR,#20-27
5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f
6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37
)
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
@ -480,13 +480,13 @@ UCS2LE_CLS = (
)
UCS2LE_ST = (
6, 6, 7, 6, 4, 3,MachineState.error,MachineState.error,#00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f
MachineState.its_me,MachineState.its_me, 5, 5, 5,MachineState.error,MachineState.its_me,MachineState.error,#10-17
5, 5, 5,MachineState.error, 5,MachineState.error, 6, 6,#18-1f
7, 6, 8, 8, 5, 5, 5,MachineState.error,#20-27
5, 5, 5,MachineState.error,MachineState.error,MachineState.error, 5, 5,#28-2f
5, 5, 5,MachineState.error, 5,MachineState.error,MachineState.start,MachineState.start #30-37
6, 6, 7, 6, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME, 5, 5, 5,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#10-17
5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR, 6, 6,#18-1f
7, 6, 8, 8, 5, 5, 5,MachineState.ERROR,#20-27
5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f
5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37
)
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
@ -535,32 +535,32 @@ UTF8_CLS = (
)
UTF8_ST = (
MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error, 12, 10,#00-07
MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 12, 10,#00-07
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#10-17
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#20-27
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#28-2f
MachineState.error,MachineState.error, 5, 5, 5, 5,MachineState.error,MachineState.error,#30-37
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#38-3f
MachineState.error,MachineState.error,MachineState.error, 5, 5, 5,MachineState.error,MachineState.error,#40-47
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#48-4f
MachineState.error,MachineState.error, 7, 7, 7, 7,MachineState.error,MachineState.error,#50-57
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#58-5f
MachineState.error,MachineState.error,MachineState.error,MachineState.error, 7, 7,MachineState.error,MachineState.error,#60-67
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#68-6f
MachineState.error,MachineState.error, 9, 9, 9, 9,MachineState.error,MachineState.error,#70-77
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#78-7f
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, 9,MachineState.error,MachineState.error,#80-87
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#88-8f
MachineState.error,MachineState.error, 12, 12, 12, 12,MachineState.error,MachineState.error,#90-97
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#98-9f
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, 12,MachineState.error,MachineState.error,#a0-a7
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#a8-af
MachineState.error,MachineState.error, 12, 12, 12,MachineState.error,MachineState.error,MachineState.error,#b0-b7
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#b8-bf
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,#c0-c7
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error #c8-cf
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#20-27
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#28-2f
MachineState.ERROR,MachineState.ERROR, 5, 5, 5, 5,MachineState.ERROR,MachineState.ERROR,#30-37
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#38-3f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5, 5,MachineState.ERROR,MachineState.ERROR,#40-47
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#48-4f
MachineState.ERROR,MachineState.ERROR, 7, 7, 7, 7,MachineState.ERROR,MachineState.ERROR,#50-57
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#58-5f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 7, 7,MachineState.ERROR,MachineState.ERROR,#60-67
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#68-6f
MachineState.ERROR,MachineState.ERROR, 9, 9, 9, 9,MachineState.ERROR,MachineState.ERROR,#70-77
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#78-7f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 9,MachineState.ERROR,MachineState.ERROR,#80-87
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#88-8f
MachineState.ERROR,MachineState.ERROR, 12, 12, 12, 12,MachineState.ERROR,MachineState.ERROR,#90-97
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#98-9f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 12,MachineState.ERROR,MachineState.ERROR,#a0-a7
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#a8-af
MachineState.ERROR,MachineState.ERROR, 12, 12, 12,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b0-b7
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b8-bf
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf
)
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)

View file

@ -27,18 +27,14 @@
######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber
from .compat import wrap_ord
from .enums import ProbingState
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
class SingleByteCharSetProber(CharSetProber):
SAMPLE_SIZE = 64
SB_ENOUGH_REL_THRESHOLD = 1024
SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
POSITIVE_SHORTCUT_THRESHOLD = 0.95
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
SYMBOL_CAT_ORDER = 250
NUMBER_OF_SEQ_CAT = 4
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
def __init__(self, model, reversed=False, name_prober=None):
super(SingleByteCharSetProber, self).__init__()
@ -58,7 +54,7 @@ class SingleByteCharSetProber(CharSetProber):
super(SingleByteCharSetProber, self).reset()
# char order of last character
self._last_order = 255
self._seq_counters = [0] * self.NUMBER_OF_SEQ_CAT
self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
self._total_seqs = 0
self._total_char = 0
# characters that fall in our sampling range
@ -71,15 +67,29 @@ class SingleByteCharSetProber(CharSetProber):
else:
return self._model['charset_name']
@property
def language(self):
if self._name_prober:
return self._name_prober.language
else:
return self._model.get('language')
def feed(self, byte_str):
if not self._model['keep_english_letter']:
byte_str = self.filter_international_words(byte_str)
num_bytes = len(byte_str)
if not num_bytes:
if not byte_str:
return self.state
for c in byte_str:
order = self._model['char_to_order_map'][wrap_ord(c)]
if order < self.SYMBOL_CAT_ORDER:
char_to_order_map = self._model['char_to_order_map']
for i, c in enumerate(byte_str):
# XXX: Order is in range 1-64, so one would think we want 0-63 here,
# but that leads to 27 more test failures than before.
order = char_to_order_map[c]
# XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
# CharacterCategory.SYMBOL is actually 253, so we use CONTROL
# to make it closer to the original intent. The only difference
# is whether or not we count digits and control characters for
# _total_char purposes.
if order < CharacterCategory.CONTROL:
self._total_char += 1
if order < self.SAMPLE_SIZE:
self._freq_char += 1
@ -94,27 +104,28 @@ class SingleByteCharSetProber(CharSetProber):
self._seq_counters[model] += 1
self._last_order = order
if self.state == ProbingState.detecting:
charset_name = self._model['charset_name']
if self.state == ProbingState.DETECTING:
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
cf = self.get_confidence()
if cf > self.POSITIVE_SHORTCUT_THRESHOLD:
confidence = self.get_confidence()
if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
self.logger.debug('%s confidence = %s, we have a winner',
self._model['charset_name'], cf)
self._state = ProbingState.found_it
elif cf < self.NEGATIVE_SHORTCUT_THRESHOLD:
charset_name, confidence)
self._state = ProbingState.FOUND_IT
elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
self.logger.debug('%s confidence = %s, below negative '
'shortcut threshold %s',
self._model['charset_name'], cf,
'shortcut threshhold %s', charset_name,
confidence,
self.NEGATIVE_SHORTCUT_THRESHOLD)
self._state = ProbingState.not_me
self._state = ProbingState.NOT_ME
return self.state
def get_confidence(self):
r = 0.01
if self._total_seqs > 0:
r = ((1.0 * self._seq_counters[self.POSITIVE_CAT]) / self._total_seqs
/ self._model['typical_positive_ratio'])
r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
self._total_seqs / self._model['typical_positive_ratio'])
r = r * self._freq_char / self._total_char
if r >= 1.0:
r = 0.99

View file

@ -49,18 +49,22 @@ class SJISProber(MultiByteCharSetProber):
def charset_name(self):
return self.context_analyzer.charset_name
@property
def language(self):
return "Japanese"
def feed(self, byte_str):
for i in range(len(byte_str)):
coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.error:
self.logger.debug('%s prober hit error at byte %s',
self.charset_name, i)
self._state = ProbingState.not_me
if coding_state == MachineState.ERROR:
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.its_me:
self._state = ProbingState.found_it
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif coding_state == MachineState.start:
elif coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen()
if i == 0:
self._last_char[1] = byte_str[0]
@ -75,10 +79,10 @@ class SJISProber(MultiByteCharSetProber):
self._last_char[0] = byte_str[-1]
if self.state == ProbingState.detecting:
if self.state == ProbingState.DETECTING:
if (self.context_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
self._state = ProbingState.found_it
self._state = ProbingState.FOUND_IT
return self.state

View file

@ -40,6 +40,7 @@ import codecs
import logging
import re
from .charsetgroupprober import CharSetGroupProber
from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober
@ -67,8 +68,17 @@ class UniversalDetector(object):
MINIMUM_THRESHOLD = 0.20
HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
ESC_DETECTOR = re.compile(b'(\033|~{)')
WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
'iso-8859-2': 'Windows-1250',
'iso-8859-5': 'Windows-1251',
'iso-8859-6': 'Windows-1256',
'iso-8859-7': 'Windows-1253',
'iso-8859-8': 'Windows-1255',
'iso-8859-9': 'Windows-1254',
'iso-8859-13': 'Windows-1257'}
def __init__(self, lang_filter=LanguageFilter.all):
def __init__(self, lang_filter=LanguageFilter.ALL):
self._esc_charset_prober = None
self._charset_probers = []
self.result = None
@ -78,6 +88,7 @@ class UniversalDetector(object):
self._last_char = None
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
self._has_win_bytes = None
self.reset()
def reset(self):
@ -86,10 +97,11 @@ class UniversalDetector(object):
initial states. This is called by ``__init__``, so you only need to
call this directly in between analyses of different documents.
"""
self.result = {'encoding': None, 'confidence': 0.0}
self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
self.done = False
self._got_data = False
self._input_state = InputState.pure_ascii
self._has_win_bytes = False
self._input_state = InputState.PURE_ASCII
self._last_char = b''
if self._esc_charset_prober:
self._esc_charset_prober.reset()
@ -116,28 +128,40 @@ class UniversalDetector(object):
if not len(byte_str):
return
if not isinstance(byte_str, bytearray):
byte_str = bytearray(byte_str)
# First check for known BOMs, since these are guaranteed to be correct
if not self._got_data:
# If the data starts with BOM, we know it is UTF
if byte_str.startswith(codecs.BOM_UTF8):
# EF BB BF UTF-8 with BOM
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
elif byte_str.startswith(codecs.BOM_UTF32_LE) or byte_str.startswith(codecs.BOM_UTF32_BE):
self.result = {'encoding': "UTF-8-SIG",
'confidence': 1.0,
'language': ''}
elif byte_str.startswith((codecs.BOM_UTF32_LE,
codecs.BOM_UTF32_BE)):
# FF FE 00 00 UTF-32, little-endian BOM
# 00 00 FE FF UTF-32, big-endian BOM
self.result = {'encoding': "UTF-32", 'confidence': 1.0}
self.result = {'encoding': "UTF-32",
'confidence': 1.0,
'language': ''}
elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
'confidence': 1.0}
'confidence': 1.0,
'language': ''}
elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
'confidence': 1.0}
elif byte_str.startswith(codecs.BOM_LE) or byte_str.startswith(codecs.BOM_BE):
'confidence': 1.0,
'language': ''}
elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
# FF FE UTF-16, little endian BOM
# FE FF UTF-16, big endian BOM
self.result = {'encoding': "UTF-16", 'confidence': 1.0}
self.result = {'encoding': "UTF-16",
'confidence': 1.0,
'language': ''}
self._got_data = True
if self.result['encoding'] is not None:
@ -146,12 +170,12 @@ class UniversalDetector(object):
# If none of those matched and we've only seen ASCII so far, check
# for high bytes and escape sequences
if self._input_state == InputState.pure_ascii:
if self._input_state == InputState.PURE_ASCII:
if self.HIGH_BYTE_DETECTOR.search(byte_str):
self._input_state = InputState.high_byte
elif self._input_state == InputState.pure_ascii and \
self._input_state = InputState.HIGH_BYTE
elif self._input_state == InputState.PURE_ASCII and \
self.ESC_DETECTOR.search(self._last_char + byte_str):
self._input_state = InputState.esc_ascii
self._input_state = InputState.ESC_ASCII
self._last_char = byte_str[-1:]
@ -159,14 +183,16 @@ class UniversalDetector(object):
# uses a simple state machine to check for known escape sequences in
# HZ and ISO-2022 encodings, since those are the only encodings that
# use such sequences.
if self._input_state == InputState.esc_ascii:
if self._input_state == InputState.ESC_ASCII:
if not self._esc_charset_prober:
self._esc_charset_prober = EscCharSetProber(self.lang_filter)
if self._esc_charset_prober.feed(byte_str) == ProbingState.found_it:
if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
self.result = {'encoding':
self._esc_charset_prober.charset_name,
'confidence':
self._esc_charset_prober.get_confidence()}
self._esc_charset_prober.get_confidence(),
'language':
self._esc_charset_prober.language}
self.done = True
# If we've seen high bytes (i.e., those with values greater than 127),
# we need to do more complicated checks using all our multi-byte and
@ -174,59 +200,87 @@ class UniversalDetector(object):
# use character bigram distributions to determine the encoding, whereas
# the multi-byte probers use a combination of character unigram and
# bigram distributions.
elif self._input_state == InputState.high_byte:
elif self._input_state == InputState.HIGH_BYTE:
if not self._charset_probers:
self._charset_probers = [MBCSGroupProber(self.lang_filter)]
# If we're checking non-CJK encodings, use single-byte prober
if self.lang_filter & LanguageFilter.non_cjk:
if self.lang_filter & LanguageFilter.NON_CJK:
self._charset_probers.append(SBCSGroupProber())
self._charset_probers.append(Latin1Prober())
for prober in self._charset_probers:
if prober.feed(byte_str) == ProbingState.found_it:
if prober.feed(byte_str) == ProbingState.FOUND_IT:
self.result = {'encoding': prober.charset_name,
'confidence': prober.get_confidence()}
'confidence': prober.get_confidence(),
'language': prober.language}
self.done = True
break
if self.WIN_BYTE_DETECTOR.search(byte_str):
self._has_win_bytes = True
def close(self):
    """
    Stop analyzing the current document and come up with a final
    prediction.

    If the detector is already done, the stored result is returned
    unchanged.  Otherwise the detector is marked done and ``result`` is
    filled in from the input state seen so far: pure ASCII input yields
    ``ascii`` with full confidence, high-byte input yields the most
    confident prober provided it beats ``MINIMUM_THRESHOLD``.  In every
    other case ``result`` is left as it was.

    :returns:  The ``result`` attribute, a ``dict`` with the keys
               `encoding`, `confidence`, and `language`.
    """
    # Don't bother with checks if we're already done
    if self.done:
        return self.result
    self.done = True

    if not self._got_data:
        self.logger.debug('no data received!')

    # Default to ASCII if it is all we've seen so far
    elif self._input_state == InputState.PURE_ASCII:
        self.result = {'encoding': 'ascii',
                       'confidence': 1.0,
                       'language': ''}

    # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
    elif self._input_state == InputState.HIGH_BYTE:
        max_prober_confidence = 0.0
        max_prober = None
        for prober in self._charset_probers:
            if not prober:
                continue
            prober_confidence = prober.get_confidence()
            if prober_confidence > max_prober_confidence:
                max_prober_confidence = prober_confidence
                max_prober = prober
        if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
            charset_name = max_prober.charset_name
            lower_charset_name = charset_name.lower()
            confidence = max_prober.get_confidence()
            # Use Windows encoding name instead of ISO-8859 if we saw any
            # extra Windows-specific bytes
            if (lower_charset_name.startswith('iso-8859')
                    and self._has_win_bytes):
                charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
                                                    charset_name)
            self.result = {'encoding': charset_name,
                           'confidence': confidence,
                           'language': max_prober.language}

    # Log all prober confidences if none met MINIMUM_THRESHOLD.  The gate
    # is "<=" so that levels more verbose than DEBUG also get the report.
    if self.logger.getEffectiveLevel() <= logging.DEBUG:
        if self.result['encoding'] is None:
            self.logger.debug('no probers hit minimum threshold')
            for group_prober in self._charset_probers:
                if not group_prober:
                    continue
                if isinstance(group_prober, CharSetGroupProber):
                    for prober in group_prober.probers:
                        self.logger.debug('%s %s confidence = %s',
                                          prober.charset_name,
                                          prober.language,
                                          prober.get_confidence())
                else:
                    # A stand-alone prober: report it directly rather
                    # than whatever ``prober`` was last bound to.
                    self.logger.debug('%s %s confidence = %s',
                                      group_prober.charset_name,
                                      group_prober.language,
                                      group_prober.get_confidence())
    return self.result

View file

@ -50,22 +50,26 @@ class UTF8Prober(CharSetProber):
def charset_name(self):
return "utf-8"
@property
def language(self):
    """Return the language reported by this prober: an empty string."""
    return ""
def feed(self, byte_str):
    """
    Run ``byte_str`` through the coding state machine and update the
    probing state.

    Each byte is passed to ``self.coding_sm.next_state``.  An ``ERROR``
    state marks the prober ``NOT_ME`` and an ``ITS_ME`` state marks it
    ``FOUND_IT``; either one stops the scan.  Every return to ``START``
    after a character of two or more bytes increments
    ``self._num_mb_chars``.

    :param byte_str: The bytes to examine.
    :returns: ``self.state`` after the bytes have been processed.
    """
    for c in byte_str:
        coding_state = self.coding_sm.next_state(c)
        if coding_state == MachineState.ERROR:
            self._state = ProbingState.NOT_ME
            break
        elif coding_state == MachineState.ITS_ME:
            self._state = ProbingState.FOUND_IT
            break
        elif coding_state == MachineState.START:
            if self.coding_sm.get_current_charlen() >= 2:
                self._num_mb_chars += 1

    # Still undecided: settle early once confidence is high enough.
    if self.state == ProbingState.DETECTING:
        if self.get_confidence() > self.SHORTCUT_THRESHOLD:
            self._state = ProbingState.FOUND_IT

    return self.state

View file

@ -2,8 +2,8 @@
This module exists only to simplify retrieving the version number of chardet
from within setup.py and from chardet subpackages.
:author: Dan Blanchard (dblanchard@ets.org)
:author: Dan Blanchard (dan.blanchard@gmail.com)
"""
# Package version string; the single place the version number is defined.
__version__ = "3.0.4"
# The version split on '.' into a list of strings.
VERSION = __version__.split('.')