Merge pull request #977 from JackDandy/feature/UpdateChardet

Update chardet packages 2.3.0 (d7fae98) to 3.0.4 (9b8c5c2).
This commit is contained in:
JackDandy 2017-08-26 00:29:26 +01:00 committed by GitHub
commit cf383de226
35 changed files with 486 additions and 329 deletions

View file

@ -13,6 +13,7 @@
* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439) * Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439)
* Update cachecontrol library 0.11.5 to 0.12.3 (db54c40) * Update cachecontrol library 0.11.5 to 0.12.3 (db54c40)
* Update Certifi 2015.11.20.1 (385476b) to 2017.07.27 (f808089) * Update Certifi 2015.11.20.1 (385476b) to 2017.07.27 (f808089)
* Update chardet packages 2.3.0 (d7fae98) to 3.0.4 (9b8c5c2)
* Update dateutil library 2.4.2 (d4baf97) to 2.6.1 (2f3a160) * Update dateutil library 2.4.2 (d4baf97) to 2.6.1 (2f3a160)
* Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb) * Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb)
* Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72) * Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72)

View file

@ -16,17 +16,24 @@
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from .compat import PY2, PY3, bin_type as _bin_type from .compat import PY2, PY3
from .universaldetector import UniversalDetector from .universaldetector import UniversalDetector
from .version import __version__, VERSION from .version import __version__, VERSION
def detect(byte_str): def detect(byte_str):
if not isinstance(byte_str, _bin_type): """
raise TypeError('Expected object of {0} type, got: {1}' Detect the encoding of the given byte string.
''.format(_bin_type, type(byte_str)))
u = UniversalDetector() :param byte_str: The byte sequence to examine.
u.feed(byte_str) :type byte_str: ``bytes`` or ``bytearray``
u.close() """
return u.result if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{0}'.format(type(byte_str)))
else:
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
return detector.close()

View file

@ -41,3 +41,7 @@ class Big5Prober(MultiByteCharSetProber):
@property @property
def charset_name(self): def charset_name(self):
return "Big5" return "Big5"
@property
def language(self):
return "Chinese"

View file

@ -35,7 +35,6 @@ from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE,
BIG5_TYPICAL_DISTRIBUTION_RATIO) BIG5_TYPICAL_DISTRIBUTION_RATIO)
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE, from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO) JIS_TYPICAL_DISTRIBUTION_RATIO)
from .compat import wrap_ord
class CharDistributionAnalysis(object): class CharDistributionAnalysis(object):
@ -123,9 +122,9 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
# first byte range: 0xc4 -- 0xfe # first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
first_char = wrap_ord(byte_str[0]) first_char = byte_str[0]
if first_char >= 0xC4: if first_char >= 0xC4:
return 94 * (first_char - 0xC4) + wrap_ord(byte_str[1]) - 0xA1 return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
else: else:
return -1 return -1
@ -142,9 +141,9 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
# first byte range: 0xb0 -- 0xfe # first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
first_char = wrap_ord(byte_str[0]) first_char = byte_str[0]
if first_char >= 0xB0: if first_char >= 0xB0:
return 94 * (first_char - 0xB0) + wrap_ord(byte_str[1]) - 0xA1 return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
else: else:
return -1 return -1
@ -161,7 +160,7 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
# first byte range: 0xb0 -- 0xfe # first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1]) first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0xB0) and (second_char >= 0xA1): if (first_char >= 0xB0) and (second_char >= 0xA1):
return 94 * (first_char - 0xB0) + second_char - 0xA1 return 94 * (first_char - 0xB0) + second_char - 0xA1
else: else:
@ -180,7 +179,7 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
# first byte range: 0xa4 -- 0xfe # first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1]) first_char, second_char = byte_str[0], byte_str[1]
if first_char >= 0xA4: if first_char >= 0xA4:
if second_char >= 0xA1: if second_char >= 0xA1:
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63 return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
@ -202,7 +201,7 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe # second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1]) first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0x81) and (first_char <= 0x9F): if (first_char >= 0x81) and (first_char <= 0x9F):
order = 188 * (first_char - 0x81) order = 188 * (first_char - 0x81)
elif (first_char >= 0xE0) and (first_char <= 0xEF): elif (first_char >= 0xE0) and (first_char <= 0xEF):
@ -227,8 +226,8 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
# first byte range: 0xa0 -- 0xfe # first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
char = wrap_ord(byte_str[0]) char = byte_str[0]
if char >= 0xA0: if char >= 0xA0:
return 94 * (char - 0xA1) + wrap_ord(byte_str[1]) - 0xa1 return 94 * (char - 0xA1) + byte_str[1] - 0xa1
else: else:
return -1 return -1

View file

@ -54,6 +54,14 @@ class CharSetGroupProber(CharSetProber):
return None return None
return self._best_guess_prober.charset_name return self._best_guess_prober.charset_name
@property
def language(self):
if not self._best_guess_prober:
self.get_confidence()
if not self._best_guess_prober:
return None
return self._best_guess_prober.language
def feed(self, byte_str): def feed(self, byte_str):
for prober in self.probers: for prober in self.probers:
if not prober: if not prober:
@ -63,22 +71,22 @@ class CharSetGroupProber(CharSetProber):
state = prober.feed(byte_str) state = prober.feed(byte_str)
if not state: if not state:
continue continue
if state == ProbingState.found_it: if state == ProbingState.FOUND_IT:
self._best_guess_prober = prober self._best_guess_prober = prober
return self.state return self.state
elif state == ProbingState.not_me: elif state == ProbingState.NOT_ME:
prober.active = False prober.active = False
self._active_num -= 1 self._active_num -= 1
if self._active_num <= 0: if self._active_num <= 0:
self._state = ProbingState.not_me self._state = ProbingState.NOT_ME
return self.state return self.state
return self.state return self.state
def get_confidence(self): def get_confidence(self):
state = self.state state = self.state
if state == ProbingState.found_it: if state == ProbingState.FOUND_IT:
return 0.99 return 0.99
elif state == ProbingState.not_me: elif state == ProbingState.NOT_ME:
return 0.01 return 0.01
best_conf = 0.0 best_conf = 0.0
self._best_guess_prober = None self._best_guess_prober = None
@ -89,7 +97,7 @@ class CharSetGroupProber(CharSetProber):
self.logger.debug('%s not active', prober.charset_name) self.logger.debug('%s not active', prober.charset_name)
continue continue
conf = prober.get_confidence() conf = prober.get_confidence()
self.logger.debug('%s confidence = %s', prober.charset_name, conf) self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
if best_conf < conf: if best_conf < conf:
best_conf = conf best_conf = conf
self._best_guess_prober = prober self._best_guess_prober = prober

View file

@ -42,7 +42,7 @@ class CharSetProber(object):
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
def reset(self): def reset(self):
self._state = ProbingState.detecting self._state = ProbingState.DETECTING
@property @property
def charset_name(self): def charset_name(self):

View file

@ -17,15 +17,12 @@ from __future__ import absolute_import, print_function, unicode_literals
import argparse import argparse
import sys import sys
from io import open
from chardet import __version__ from chardet import __version__
from chardet.compat import PY2 from chardet.compat import PY2
from chardet.universaldetector import UniversalDetector from chardet.universaldetector import UniversalDetector
def description_of(lines, name='stdin'): def description_of(lines, name='stdin'):
""" """
Return a string describing the probable encoding of a file or Return a string describing the probable encoding of a file or
@ -38,7 +35,11 @@ def description_of(lines, name='stdin'):
""" """
u = UniversalDetector() u = UniversalDetector()
for line in lines: for line in lines:
line = bytearray(line)
u.feed(line) u.feed(line)
# shortcut out of the loop to save reading further - particularly useful if we read a BOM.
if u.done:
break
u.close() u.close()
result = u.result result = u.result
if PY2: if PY2:

View file

@ -28,7 +28,6 @@
import logging import logging
from .enums import MachineState from .enums import MachineState
from .compat import wrap_ord
class CodingStateMachine(object): class CodingStateMachine(object):
@ -62,13 +61,13 @@ class CodingStateMachine(object):
self.reset() self.reset()
def reset(self): def reset(self):
self._curr_state = MachineState.start self._curr_state = MachineState.START
def next_state(self, c): def next_state(self, c):
# for each byte we get its class # for each byte we get its class
# if it is first byte, we also get byte length # if it is first byte, we also get byte length
byte_class = self._model['class_table'][wrap_ord(c)] byte_class = self._model['class_table'][c]
if self._curr_state == MachineState.start: if self._curr_state == MachineState.START:
self._curr_byte_pos = 0 self._curr_byte_pos = 0
self._curr_char_len = self._model['char_len_table'][byte_class] self._curr_char_len = self._model['char_len_table'][byte_class]
# from byte's class and state_table, we get its next state # from byte's class and state_table, we get its next state
@ -83,3 +82,7 @@ class CodingStateMachine(object):
def get_coding_state_machine(self): def get_coding_state_machine(self):
return self._model['name'] return self._model['name']
@property
def language(self):
return self._model['language']

View file

@ -27,17 +27,8 @@ if sys.version_info < (3, 0):
PY3 = False PY3 = False
base_str = (str, unicode) base_str = (str, unicode)
text_type = unicode text_type = unicode
bin_type = str
else: else:
PY2 = False PY2 = False
PY3 = True PY3 = True
base_str = (bytes, str) base_str = (bytes, str)
text_type = str text_type = str
bin_type = (bytes, bytearray)
def wrap_ord(a):
if PY2 and isinstance(a, base_str):
return ord(a)
else:
return a

View file

@ -43,3 +43,7 @@ class CP949Prober(MultiByteCharSetProber):
@property @property
def charset_name(self): def charset_name(self):
return "CP949" return "CP949"
@property
def language(self):
return "Korean"

View file

@ -9,9 +9,9 @@ class InputState(object):
""" """
This enum represents the different states a universal detector can be in. This enum represents the different states a universal detector can be in.
""" """
pure_ascii = 0 PURE_ASCII = 0
esc_ascii = 1 ESC_ASCII = 1
high_byte = 2 HIGH_BYTE = 2
class LanguageFilter(object): class LanguageFilter(object):
@ -19,29 +19,58 @@ class LanguageFilter(object):
This enum represents the different language filters we can apply to a This enum represents the different language filters we can apply to a
``UniversalDetector``. ``UniversalDetector``.
""" """
chinese_simplified = 0x01 CHINESE_SIMPLIFIED = 0x01
chinese_traditional = 0x02 CHINESE_TRADITIONAL = 0x02
japanese = 0x04 JAPANESE = 0x04
korean = 0x08 KOREAN = 0x08
non_cjk = 0x10 NON_CJK = 0x10
all = 0x1F ALL = 0x1F
chinese = chinese_simplified | chinese_traditional CHINESE = CHINESE_SIMPLIFIED | CHINESE_TRADITIONAL
cjk = chinese | japanese | korean CJK = CHINESE | JAPANESE | KOREAN
class ProbingState(object): class ProbingState(object):
""" """
This enum represents the different states a prober can be in. This enum represents the different states a prober can be in.
""" """
detecting = 0 DETECTING = 0
found_it = 1 FOUND_IT = 1
not_me = 2 NOT_ME = 2
class MachineState(object): class MachineState(object):
""" """
This enum represents the different states a state machine can be in. This enum represents the different states a state machine can be in.
""" """
start = 0 START = 0
error = 1 ERROR = 1
its_me = 2 ITS_ME = 2
class SequenceLikelihood(object):
"""
This enum represents the likelihood of a character following the previous one.
"""
NEGATIVE = 0
UNLIKELY = 1
LIKELY = 2
POSITIVE = 3
@classmethod
def get_num_categories(cls):
""":returns: The number of likelihood categories in the enum."""
return 4
class CharacterCategory(object):
"""
This enum represents the different categories language models for
``SingleByteCharsetProber`` put characters into.
Anything less than CONTROL is considered a letter.
"""
UNDEFINED = 255
LINE_BREAK = 254
SYMBOL = 253
DIGIT = 252
CONTROL = 251

View file

@ -27,7 +27,6 @@
from .charsetprober import CharSetProber from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from .compat import wrap_ord
from .enums import LanguageFilter, ProbingState, MachineState from .enums import LanguageFilter, ProbingState, MachineState
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL, from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
ISO2022KR_SM_MODEL) ISO2022KR_SM_MODEL)
@ -43,15 +42,16 @@ class EscCharSetProber(CharSetProber):
def __init__(self, lang_filter=None): def __init__(self, lang_filter=None):
super(EscCharSetProber, self).__init__(lang_filter=lang_filter) super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
self.coding_sm = [] self.coding_sm = []
if self.lang_filter & LanguageFilter.chinese_simplified: if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL)) self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL)) self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL))
if self.lang_filter & LanguageFilter.japanese: if self.lang_filter & LanguageFilter.JAPANESE:
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL)) self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
if self.lang_filter & LanguageFilter.korean: if self.lang_filter & LanguageFilter.KOREAN:
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL)) self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
self.active_sm_count = None self.active_sm_count = None
self._detected_charset = None self._detected_charset = None
self._detected_language = None
self._state = None self._state = None
self.reset() self.reset()
@ -64,11 +64,16 @@ class EscCharSetProber(CharSetProber):
coding_sm.reset() coding_sm.reset()
self.active_sm_count = len(self.coding_sm) self.active_sm_count = len(self.coding_sm)
self._detected_charset = None self._detected_charset = None
self._detected_language = None
@property @property
def charset_name(self): def charset_name(self):
return self._detected_charset return self._detected_charset
@property
def language(self):
return self._detected_language
def get_confidence(self): def get_confidence(self):
if self._detected_charset: if self._detected_charset:
return 0.99 return 0.99
@ -80,16 +85,17 @@ class EscCharSetProber(CharSetProber):
for coding_sm in self.coding_sm: for coding_sm in self.coding_sm:
if not coding_sm or not coding_sm.active: if not coding_sm or not coding_sm.active:
continue continue
coding_state = coding_sm.next_state(wrap_ord(c)) coding_state = coding_sm.next_state(c)
if coding_state == MachineState.error: if coding_state == MachineState.ERROR:
coding_sm.active = False coding_sm.active = False
self.active_sm_count -= 1 self.active_sm_count -= 1
if self.active_sm_count <= 0: if self.active_sm_count <= 0:
self._state = ProbingState.not_me self._state = ProbingState.NOT_ME
return self.state return self.state
elif coding_state == MachineState.its_me: elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.found_it self._state = ProbingState.FOUND_IT
self._detected_charset = coding_sm.get_coding_state_machine() self._detected_charset = coding_sm.get_coding_state_machine()
self._detected_language = coding_sm.language
return self.state return self.state
return self.state return self.state

View file

@ -63,12 +63,12 @@ HZ_CLS = (
) )
HZ_ST = ( HZ_ST = (
MachineState.start,MachineState.error, 3,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,# 00-07 MachineState.START,MachineState.ERROR, 3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 08-0f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start, 4,MachineState.error,# 10-17 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START, 4,MachineState.ERROR,# 10-17
5,MachineState.error, 6,MachineState.error, 5, 5, 4,MachineState.error,# 18-1f 5,MachineState.ERROR, 6,MachineState.ERROR, 5, 5, 4,MachineState.ERROR,# 18-1f
4,MachineState.error, 4, 4, 4,MachineState.error, 4,MachineState.error,# 20-27 4,MachineState.ERROR, 4, 4, 4,MachineState.ERROR, 4,MachineState.ERROR,# 20-27
4,MachineState.its_me,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 28-2f 4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f
) )
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
@ -77,7 +77,8 @@ HZ_SM_MODEL = {'class_table': HZ_CLS,
'class_factor': 6, 'class_factor': 6,
'state_table': HZ_ST, 'state_table': HZ_ST,
'char_len_table': HZ_CHAR_LEN_TABLE, 'char_len_table': HZ_CHAR_LEN_TABLE,
'name': "HZ-GB-2312"} 'name': "HZ-GB-2312",
'language': 'Chinese'}
ISO2022CN_CLS = ( ISO2022CN_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07 2,0,0,0,0,0,0,0, # 00 - 07
@ -115,14 +116,14 @@ ISO2022CN_CLS = (
) )
ISO2022CN_ST = ( ISO2022CN_ST = (
MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 00-07 MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 08-0f MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 10-17 MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error, 4,MachineState.error,# 18-1f MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,# 18-1f
MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 20-27 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27
5, 6,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 28-2f 5, 6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f
MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 30-37 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.start,# 38-3f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f
) )
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0) ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
@ -131,7 +132,8 @@ ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS,
'class_factor': 9, 'class_factor': 9,
'state_table': ISO2022CN_ST, 'state_table': ISO2022CN_ST,
'char_len_table': ISO2022CN_CHAR_LEN_TABLE, 'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
'name': "ISO-2022-CN"} 'name': "ISO-2022-CN",
'language': 'Chinese'}
ISO2022JP_CLS = ( ISO2022JP_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07 2,0,0,0,0,0,0,0, # 00 - 07
@ -169,15 +171,15 @@ ISO2022JP_CLS = (
) )
ISO2022JP_ST = ( ISO2022JP_ST = (
MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 00-07 MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 08-0f MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 10-17 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,# 18-1f MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f
MachineState.error, 5,MachineState.error,MachineState.error,MachineState.error, 4,MachineState.error,MachineState.error,# 20-27 MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 20-27
MachineState.error,MachineState.error,MachineState.error, 6,MachineState.its_me,MachineState.error,MachineState.its_me,MachineState.error,# 28-2f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,# 30-37 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37
MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 38-3f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.start,MachineState.start,# 40-47 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47
) )
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
@ -186,7 +188,8 @@ ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS,
'class_factor': 10, 'class_factor': 10,
'state_table': ISO2022JP_ST, 'state_table': ISO2022JP_ST,
'char_len_table': ISO2022JP_CHAR_LEN_TABLE, 'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
'name': "ISO-2022-JP"} 'name': "ISO-2022-JP",
'language': 'Japanese'}
ISO2022KR_CLS = ( ISO2022KR_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07 2,0,0,0,0,0,0,0, # 00 - 07
@ -224,11 +227,11 @@ ISO2022KR_CLS = (
) )
ISO2022KR_ST = ( ISO2022KR_ST = (
MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,# 00-07 MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 08-0f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error, 4,MachineState.error,MachineState.error,# 10-17 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 10-17
MachineState.error,MachineState.error,MachineState.error,MachineState.error, 5,MachineState.error,MachineState.error,MachineState.error,# 18-1f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f
MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 20-27 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27
) )
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
@ -237,6 +240,7 @@ ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS,
'class_factor': 6, 'class_factor': 6,
'state_table': ISO2022KR_ST, 'state_table': ISO2022KR_ST,
'char_len_table': ISO2022KR_CHAR_LEN_TABLE, 'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
'name': "ISO-2022-KR"} 'name': "ISO-2022-KR",
'language': 'Korean'}

View file

@ -49,19 +49,23 @@ class EUCJPProber(MultiByteCharSetProber):
def charset_name(self): def charset_name(self):
return "EUC-JP" return "EUC-JP"
@property
def language(self):
return "Japanese"
def feed(self, byte_str): def feed(self, byte_str):
for i in range(len(byte_str)): for i in range(len(byte_str)):
# PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte # PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
coding_state = self.coding_sm.next_state(byte_str[i]) coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.error: if coding_state == MachineState.ERROR:
self.logger.debug('%s prober hit error at byte %s', self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, i) self.charset_name, self.language, i)
self._state = ProbingState.not_me self._state = ProbingState.NOT_ME
break break
elif coding_state == MachineState.its_me: elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.found_it self._state = ProbingState.FOUND_IT
break break
elif coding_state == MachineState.start: elif coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen() char_len = self.coding_sm.get_current_charlen()
if i == 0: if i == 0:
self._last_char[1] = byte_str[0] self._last_char[1] = byte_str[0]
@ -75,10 +79,10 @@ class EUCJPProber(MultiByteCharSetProber):
self._last_char[0] = byte_str[-1] self._last_char[0] = byte_str[-1]
if self.state == ProbingState.detecting: if self.state == ProbingState.DETECTING:
if (self.context_analyzer.got_enough_data() and if (self.context_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)): (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
self._state = ProbingState.found_it self._state = ProbingState.FOUND_IT
return self.state return self.state

View file

@ -41,3 +41,7 @@ class EUCKRProber(MultiByteCharSetProber):
@property @property
def charset_name(self): def charset_name(self):
return "EUC-KR" return "EUC-KR"
@property
def language(self):
return "Korean"

View file

@ -44,7 +44,7 @@
EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75 EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
# Char to FreqOrder table , # Char to FreqOrder table ,
EUCTW_TABLE_SIZE = 8102 EUCTW_TABLE_SIZE = 5376
EUCTW_CHAR_TO_FREQ_ORDER = ( EUCTW_CHAR_TO_FREQ_ORDER = (
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742 1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742

View file

@ -40,3 +40,7 @@ class EUCTWProber(MultiByteCharSetProber):
@property @property
def charset_name(self): def charset_name(self):
return "EUC-TW" return "EUC-TW"
@property
def language(self):
return "Taiwan"

View file

@ -40,3 +40,7 @@ class GB2312Prober(MultiByteCharSetProber):
@property @property
def charset_name(self): def charset_name(self):
return "GB2312" return "GB2312"
@property
def language(self):
return "Chinese"

View file

@ -27,7 +27,6 @@
from .charsetprober import CharSetProber from .charsetprober import CharSetProber
from .enums import ProbingState from .enums import ProbingState
from .compat import wrap_ord
# This prober doesn't actually recognize a language or a charset. # This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers # It is a helper prober for the use of the Hebrew model probers
@ -177,8 +176,8 @@ class HebrewProber(CharSetProber):
self._visual_prober = visualProber self._visual_prober = visualProber
def is_final(self, c): def is_final(self, c):
return wrap_ord(c) in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN, return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
self.FINAL_PE, self.FINAL_TSADI] self.FINAL_PE, self.FINAL_TSADI]
def is_non_final(self, c): def is_non_final(self, c):
# The normal Tsadi is not a good Non-Final letter due to words like # The normal Tsadi is not a good Non-Final letter due to words like
@ -191,8 +190,8 @@ class HebrewProber(CharSetProber):
# for example legally end with a Non-Final Pe or Kaf. However, the # for example legally end with a Non-Final Pe or Kaf. However, the
# benefit of these letters as Non-Final letters outweighs the damage # benefit of these letters as Non-Final letters outweighs the damage
# since these words are quite rare. # since these words are quite rare.
return wrap_ord(c) in [self.NORMAL_KAF, self.NORMAL_MEM, return c in [self.NORMAL_KAF, self.NORMAL_MEM,
self.NORMAL_NUN, self.NORMAL_PE] self.NORMAL_NUN, self.NORMAL_PE]
def feed(self, byte_str): def feed(self, byte_str):
# Final letter analysis for logical-visual decision. # Final letter analysis for logical-visual decision.
@ -221,9 +220,9 @@ class HebrewProber(CharSetProber):
# We automatically filter out all 7-bit characters (replace them with # We automatically filter out all 7-bit characters (replace them with
# spaces) so the word boundary detection works properly. [MAP] # spaces) so the word boundary detection works properly. [MAP]
if self.state == ProbingState.not_me: if self.state == ProbingState.NOT_ME:
# Both model probers say it's not them. No reason to continue. # Both model probers say it's not them. No reason to continue.
return ProbingState.not_me return ProbingState.NOT_ME
byte_str = self.filter_high_byte_only(byte_str) byte_str = self.filter_high_byte_only(byte_str)
@ -250,8 +249,8 @@ class HebrewProber(CharSetProber):
self._prev = cur self._prev = cur
# Forever detecting, till the end or until both model probers return # Forever detecting, till the end or until both model probers return
# ProbingState.not_me (handled above) # ProbingState.NOT_ME (handled above)
return ProbingState.detecting return ProbingState.DETECTING
@property @property
def charset_name(self): def charset_name(self):
@ -280,10 +279,14 @@ class HebrewProber(CharSetProber):
# Logical. # Logical.
return self.LOGICAL_HEBREW_NAME return self.LOGICAL_HEBREW_NAME
@property
def language(self):
return 'Hebrew'
@property @property
def state(self): def state(self):
# Remain active as long as any of the model probers are active. # Remain active as long as any of the model probers are active.
if (self._logical_prober.state == ProbingState.not_me) and \ if (self._logical_prober.state == ProbingState.NOT_ME) and \
(self._visual_prober.state == ProbingState.not_me): (self._visual_prober.state == ProbingState.NOT_ME):
return ProbingState.not_me return ProbingState.NOT_ME
return ProbingState.detecting return ProbingState.DETECTING

View file

@ -25,7 +25,6 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from .compat import wrap_ord
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category # This is hiragana 2-char sequence table, the number in each cell represents its frequency category
jp2CharContext = ( jp2CharContext = (
@ -194,7 +193,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
if not byte_str: if not byte_str:
return -1, 1 return -1, 1
# find out current char's byte length # find out current char's byte length
first_char = wrap_ord(byte_str[0]) first_char = byte_str[0]
if (0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC): if (0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC):
char_len = 2 char_len = 2
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC): if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
@ -204,7 +203,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
# return its order if it is hiragana # return its order if it is hiragana
if len(byte_str) > 1: if len(byte_str) > 1:
second_char = wrap_ord(byte_str[1]) second_char = byte_str[1]
if (first_char == 202) and (0x9F <= second_char <= 0xF1): if (first_char == 202) and (0x9F <= second_char <= 0xF1):
return second_char - 0x9F, char_len return second_char - 0x9F, char_len
@ -215,7 +214,7 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis):
if not byte_str: if not byte_str:
return -1, 1 return -1, 1
# find out current char's byte length # find out current char's byte length
first_char = wrap_ord(byte_str[0]) first_char = byte_str[0]
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE): if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
char_len = 2 char_len = 2
elif first_char == 0x8F: elif first_char == 0x8F:
@ -225,7 +224,7 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis):
# return its order if it is hiragana # return its order if it is hiragana
if len(byte_str) > 1: if len(byte_str) > 1:
second_char = wrap_ord(byte_str[1]) second_char = byte_str[1]
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3): if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
return second_char - 0xA1, char_len return second_char - 0xA1, char_len

View file

@ -214,7 +214,8 @@ Latin5BulgarianModel = {
'precedence_matrix': BulgarianLangModel, 'precedence_matrix': BulgarianLangModel,
'typical_positive_ratio': 0.969392, 'typical_positive_ratio': 0.969392,
'keep_english_letter': False, 'keep_english_letter': False,
'charset_name': "ISO-8859-5" 'charset_name': "ISO-8859-5",
'language': 'Bulgarian',
} }
Win1251BulgarianModel = { Win1251BulgarianModel = {
@ -222,8 +223,6 @@ Win1251BulgarianModel = {
'precedence_matrix': BulgarianLangModel, 'precedence_matrix': BulgarianLangModel,
'typical_positive_ratio': 0.969392, 'typical_positive_ratio': 0.969392,
'keep_english_letter': False, 'keep_english_letter': False,
'charset_name': "windows-1251" 'charset_name': "windows-1251",
'language': 'Bulgarian',
} }

View file

@ -283,7 +283,8 @@ Koi8rModel = {
'precedence_matrix': RussianLangModel, 'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601, 'typical_positive_ratio': 0.976601,
'keep_english_letter': False, 'keep_english_letter': False,
'charset_name': "KOI8-R" 'charset_name': "KOI8-R",
'language': 'Russian',
} }
Win1251CyrillicModel = { Win1251CyrillicModel = {
@ -291,7 +292,8 @@ Win1251CyrillicModel = {
'precedence_matrix': RussianLangModel, 'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601, 'typical_positive_ratio': 0.976601,
'keep_english_letter': False, 'keep_english_letter': False,
'charset_name': "windows-1251" 'charset_name': "windows-1251",
'language': 'Russian',
} }
Latin5CyrillicModel = { Latin5CyrillicModel = {
@ -299,7 +301,8 @@ Latin5CyrillicModel = {
'precedence_matrix': RussianLangModel, 'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601, 'typical_positive_ratio': 0.976601,
'keep_english_letter': False, 'keep_english_letter': False,
'charset_name': "ISO-8859-5" 'charset_name': "ISO-8859-5",
'language': 'Russian',
} }
MacCyrillicModel = { MacCyrillicModel = {
@ -307,7 +310,8 @@ MacCyrillicModel = {
'precedence_matrix': RussianLangModel, 'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601, 'typical_positive_ratio': 0.976601,
'keep_english_letter': False, 'keep_english_letter': False,
'charset_name': "MacCyrillic" 'charset_name': "MacCyrillic",
'language': 'Russian',
} }
Ibm866Model = { Ibm866Model = {
@ -315,7 +319,8 @@ Ibm866Model = {
'precedence_matrix': RussianLangModel, 'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601, 'typical_positive_ratio': 0.976601,
'keep_english_letter': False, 'keep_english_letter': False,
'charset_name': "IBM866" 'charset_name': "IBM866",
'language': 'Russian',
} }
Ibm855Model = { Ibm855Model = {
@ -323,7 +328,6 @@ Ibm855Model = {
'precedence_matrix': RussianLangModel, 'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601, 'typical_positive_ratio': 0.976601,
'keep_english_letter': False, 'keep_english_letter': False,
'charset_name': "IBM855" 'charset_name': "IBM855",
'language': 'Russian',
} }

View file

@ -211,7 +211,8 @@ Latin7GreekModel = {
'precedence_matrix': GreekLangModel, 'precedence_matrix': GreekLangModel,
'typical_positive_ratio': 0.982851, 'typical_positive_ratio': 0.982851,
'keep_english_letter': False, 'keep_english_letter': False,
'charset_name': "ISO-8859-7" 'charset_name': "ISO-8859-7",
'language': 'Greek',
} }
Win1253GreekModel = { Win1253GreekModel = {
@ -219,7 +220,6 @@ Win1253GreekModel = {
'precedence_matrix': GreekLangModel, 'precedence_matrix': GreekLangModel,
'typical_positive_ratio': 0.982851, 'typical_positive_ratio': 0.982851,
'keep_english_letter': False, 'keep_english_letter': False,
'charset_name': "windows-1253" 'charset_name': "windows-1253",
'language': 'Greek',
} }

View file

@ -195,7 +195,6 @@ Win1255HebrewModel = {
'precedence_matrix': HEBREW_LANG_MODEL, 'precedence_matrix': HEBREW_LANG_MODEL,
'typical_positive_ratio': 0.984004, 'typical_positive_ratio': 0.984004,
'keep_english_letter': False, 'keep_english_letter': False,
'charset_name': "windows-1255" 'charset_name': "windows-1255",
'language': 'Hebrew',
} }

View file

@ -211,7 +211,8 @@ Latin2HungarianModel = {
'precedence_matrix': HungarianLangModel, 'precedence_matrix': HungarianLangModel,
'typical_positive_ratio': 0.947368, 'typical_positive_ratio': 0.947368,
'keep_english_letter': True, 'keep_english_letter': True,
'charset_name': "ISO-8859-2" 'charset_name': "ISO-8859-2",
'language': 'Hungarian',
} }
Win1250HungarianModel = { Win1250HungarianModel = {
@ -219,7 +220,6 @@ Win1250HungarianModel = {
'precedence_matrix': HungarianLangModel, 'precedence_matrix': HungarianLangModel,
'typical_positive_ratio': 0.947368, 'typical_positive_ratio': 0.947368,
'keep_english_letter': True, 'keep_english_letter': True,
'charset_name': "windows-1250" 'charset_name': "windows-1250",
'language': 'Hungarian',
} }

View file

@ -194,7 +194,6 @@ TIS620ThaiModel = {
'precedence_matrix': ThaiLangModel, 'precedence_matrix': ThaiLangModel,
'typical_positive_ratio': 0.926386, 'typical_positive_ratio': 0.926386,
'keep_english_letter': False, 'keep_english_letter': False,
'charset_name': "TIS-620" 'charset_name': "TIS-620",
'language': 'Thai',
} }

View file

@ -188,5 +188,6 @@ Latin5TurkishModel = {
'precedence_matrix': TurkishLangModel, 'precedence_matrix': TurkishLangModel,
'typical_positive_ratio': 0.970290, 'typical_positive_ratio': 0.970290,
'keep_english_letter': True, 'keep_english_letter': True,
'charset_name': "ISO-8859-9" 'charset_name': "ISO-8859-9",
'language': 'Turkish',
} }

View file

@ -27,7 +27,6 @@
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber from .charsetprober import CharSetProber
from .compat import wrap_ord
from .enums import ProbingState from .enums import ProbingState
FREQ_CAT_NUM = 4 FREQ_CAT_NUM = 4
@ -108,16 +107,20 @@ class Latin1Prober(CharSetProber):
@property @property
def charset_name(self): def charset_name(self):
return "windows-1252" return "ISO-8859-1"
@property
def language(self):
return ""
def feed(self, byte_str): def feed(self, byte_str):
byte_str = self.filter_with_english_letters(byte_str) byte_str = self.filter_with_english_letters(byte_str)
for c in byte_str: for c in byte_str:
char_class = Latin1_CharToClass[wrap_ord(c)] char_class = Latin1_CharToClass[c]
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM) freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
+ char_class] + char_class]
if freq == 0: if freq == 0:
self._state = ProbingState.not_me self._state = ProbingState.NOT_ME
break break
self._freq_counter[freq] += 1 self._freq_counter[freq] += 1
self._last_char_class = char_class self._last_char_class = char_class
@ -125,7 +128,7 @@ class Latin1Prober(CharSetProber):
return self.state return self.state
def get_confidence(self): def get_confidence(self):
if self.state == ProbingState.not_me: if self.state == ProbingState.NOT_ME:
return 0.01 return 0.01
total = sum(self._freq_counter) total = sum(self._freq_counter)

View file

@ -52,34 +52,38 @@ class MultiByteCharSetProber(CharSetProber):
@property @property
def charset_name(self): def charset_name(self):
pass raise NotImplementedError
@property
def language(self):
raise NotImplementedError
def feed(self, byte_str): def feed(self, byte_str):
for i in range(len(byte_str)): for i in range(len(byte_str)):
coding_state = self.coding_sm.next_state(byte_str[i]) coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.error: if coding_state == MachineState.ERROR:
self.logger.debug('%s prober hit error at byte %s', self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, i) self.charset_name, self.language, i)
self._state = ProbingState.not_me self._state = ProbingState.NOT_ME
break break
elif coding_state == MachineState.its_me: elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.found_it self._state = ProbingState.FOUND_IT
break break
elif coding_state == MachineState.start: elif coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen() char_len = self.coding_sm.get_current_charlen()
if i == 0: if i == 0:
self._last_char[1] = byte_str[0] self._last_char[1] = byte_str[0]
self.distribution_analyzer.feed(self._last_char, char_len) self.distribution_analyzer.feed(self._last_char, char_len)
else: else:
self.distribution_analyzer.feed(byte_str[i - 1:i + 1], self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
char_len) char_len)
self._last_char[0] = byte_str[-1] self._last_char[0] = byte_str[-1]
if self.state == ProbingState.detecting: if self.state == ProbingState.DETECTING:
if (self.distribution_analyzer.got_enough_data() and if (self.distribution_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)): (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
self._state = ProbingState.found_it self._state = ProbingState.FOUND_IT
return self.state return self.state

View file

@ -65,9 +65,9 @@ BIG5_CLS = (
) )
BIG5_ST = ( BIG5_ST = (
MachineState.error,MachineState.start,MachineState.start, 3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07 MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,#08-0f MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f
MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start#10-17 MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17
) )
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0) BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
@ -101,13 +101,13 @@ CP949_CLS = (
CP949_ST = ( CP949_ST = (
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state = #cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
MachineState.error,MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start, 4, 5,MachineState.error, 6, # MachineState.start MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START, 4, 5,MachineState.ERROR, 6, # MachineState.START
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, # MachineState.error MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, # MachineState.ERROR
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me, # MachineState.its_me MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # MachineState.ITS_ME
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start, # 3 MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 3
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, # 4 MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 4
MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, # 5 MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5
MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start, # 6 MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6
) )
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2) CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
@ -156,11 +156,11 @@ EUCJP_CLS = (
) )
EUCJP_ST = ( EUCJP_ST = (
3, 4, 3, 5,MachineState.start,MachineState.error,MachineState.error,MachineState.error,#00-07 3, 4, 3, 5,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.its_me,MachineState.its_me,MachineState.start,MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error,#10-17 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17
MachineState.error,MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error, 3,MachineState.error,#18-1f MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f
3,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start#20-27 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27
) )
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0) EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
@ -209,8 +209,8 @@ EUCKR_CLS = (
) )
EUCKR_ST = ( EUCKR_ST = (
MachineState.error,MachineState.start, 3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07 MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start #08-0f MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f
) )
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0) EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
@ -259,12 +259,12 @@ EUCTW_CLS = (
) )
EUCTW_ST = ( EUCTW_ST = (
MachineState.error,MachineState.error,MachineState.start, 3, 3, 3, 4,MachineState.error,#00-07 MachineState.ERROR,MachineState.ERROR,MachineState.START, 3, 3, 3, 4,MachineState.ERROR,#00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,#08-0f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.start,MachineState.error,#10-17 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.ERROR,#10-17
MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
5,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.error,MachineState.start,MachineState.start,#20-27 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27
MachineState.start,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start #28-2f MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
) )
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3) EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
@ -313,12 +313,12 @@ GB2312_CLS = (
) )
GB2312_ST = ( GB2312_ST = (
MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, 3,MachineState.error,#00-07 MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, 3,MachineState.ERROR,#00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,#08-0f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,#10-17 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,#10-17
4,MachineState.error,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f 4,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
MachineState.error,MachineState.error, 5,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,#20-27 MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start #28-2f MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
) )
# To be accurate, the length of class 6 can be either 2 or 4. # To be accurate, the length of class 6 can be either 2 or 4.
@ -374,9 +374,9 @@ SJIS_CLS = (
SJIS_ST = ( SJIS_ST = (
MachineState.error,MachineState.start,MachineState.start, 3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07 MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start #10-17 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17
) )
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0) SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
@ -425,13 +425,13 @@ UCS2BE_CLS = (
) )
UCS2BE_ST = ( UCS2BE_ST = (
5, 7, 7,MachineState.error, 4, 3,MachineState.error,MachineState.error,#00-07 5, 7, 7,MachineState.ERROR, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.its_me,MachineState.its_me, 6, 6, 6, 6,MachineState.error,MachineState.error,#10-17 MachineState.ITS_ME,MachineState.ITS_ME, 6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,#10-17
6, 6, 6, 6, 6,MachineState.its_me, 6, 6,#18-1f 6, 6, 6, 6, 6,MachineState.ITS_ME, 6, 6,#18-1f
6, 6, 6, 6, 5, 7, 7,MachineState.error,#20-27 6, 6, 6, 6, 5, 7, 7,MachineState.ERROR,#20-27
5, 8, 6, 6,MachineState.error, 6, 6, 6,#28-2f 5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f
6, 6, 6, 6,MachineState.error,MachineState.error,MachineState.start,MachineState.start #30-37 6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37
) )
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2) UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
@ -480,13 +480,13 @@ UCS2LE_CLS = (
) )
UCS2LE_ST = ( UCS2LE_ST = (
6, 6, 7, 6, 4, 3,MachineState.error,MachineState.error,#00-07 6, 6, 7, 6, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.its_me,MachineState.its_me, 5, 5, 5,MachineState.error,MachineState.its_me,MachineState.error,#10-17 MachineState.ITS_ME,MachineState.ITS_ME, 5, 5, 5,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#10-17
5, 5, 5,MachineState.error, 5,MachineState.error, 6, 6,#18-1f 5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR, 6, 6,#18-1f
7, 6, 8, 8, 5, 5, 5,MachineState.error,#20-27 7, 6, 8, 8, 5, 5, 5,MachineState.ERROR,#20-27
5, 5, 5,MachineState.error,MachineState.error,MachineState.error, 5, 5,#28-2f 5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f
5, 5, 5,MachineState.error, 5,MachineState.error,MachineState.start,MachineState.start #30-37 5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37
) )
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2) UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
@ -535,32 +535,32 @@ UTF8_CLS = (
) )
UTF8_ST = ( UTF8_ST = (
MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error, 12, 10,#00-07 MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 12, 10,#00-07
9, 11, 8, 7, 6, 5, 4, 3,#08-0f 9, 11, 8, 7, 6, 5, 4, 3,#08-0f
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#10-17 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#20-27 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#20-27
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#28-2f MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#28-2f
MachineState.error,MachineState.error, 5, 5, 5, 5,MachineState.error,MachineState.error,#30-37 MachineState.ERROR,MachineState.ERROR, 5, 5, 5, 5,MachineState.ERROR,MachineState.ERROR,#30-37
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#38-3f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#38-3f
MachineState.error,MachineState.error,MachineState.error, 5, 5, 5,MachineState.error,MachineState.error,#40-47 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5, 5,MachineState.ERROR,MachineState.ERROR,#40-47
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#48-4f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#48-4f
MachineState.error,MachineState.error, 7, 7, 7, 7,MachineState.error,MachineState.error,#50-57 MachineState.ERROR,MachineState.ERROR, 7, 7, 7, 7,MachineState.ERROR,MachineState.ERROR,#50-57
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#58-5f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#58-5f
MachineState.error,MachineState.error,MachineState.error,MachineState.error, 7, 7,MachineState.error,MachineState.error,#60-67 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 7, 7,MachineState.ERROR,MachineState.ERROR,#60-67
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#68-6f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#68-6f
MachineState.error,MachineState.error, 9, 9, 9, 9,MachineState.error,MachineState.error,#70-77 MachineState.ERROR,MachineState.ERROR, 9, 9, 9, 9,MachineState.ERROR,MachineState.ERROR,#70-77
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#78-7f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#78-7f
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, 9,MachineState.error,MachineState.error,#80-87 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 9,MachineState.ERROR,MachineState.ERROR,#80-87
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#88-8f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#88-8f
MachineState.error,MachineState.error, 12, 12, 12, 12,MachineState.error,MachineState.error,#90-97 MachineState.ERROR,MachineState.ERROR, 12, 12, 12, 12,MachineState.ERROR,MachineState.ERROR,#90-97
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#98-9f MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#98-9f
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, 12,MachineState.error,MachineState.error,#a0-a7 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 12,MachineState.ERROR,MachineState.ERROR,#a0-a7
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#a8-af MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#a8-af
MachineState.error,MachineState.error, 12, 12, 12,MachineState.error,MachineState.error,MachineState.error,#b0-b7 MachineState.ERROR,MachineState.ERROR, 12, 12, 12,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b0-b7
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#b8-bf MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b8-bf
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,#c0-c7 MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error #c8-cf MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf
) )
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6) UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)

View file

@ -27,18 +27,14 @@
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber from .charsetprober import CharSetProber
from .compat import wrap_ord from .enums import CharacterCategory, ProbingState, SequenceLikelihood
from .enums import ProbingState
class SingleByteCharSetProber(CharSetProber): class SingleByteCharSetProber(CharSetProber):
SAMPLE_SIZE = 64 SAMPLE_SIZE = 64
SB_ENOUGH_REL_THRESHOLD = 1024 SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
POSITIVE_SHORTCUT_THRESHOLD = 0.95 POSITIVE_SHORTCUT_THRESHOLD = 0.95
NEGATIVE_SHORTCUT_THRESHOLD = 0.05 NEGATIVE_SHORTCUT_THRESHOLD = 0.05
SYMBOL_CAT_ORDER = 250
NUMBER_OF_SEQ_CAT = 4
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
def __init__(self, model, reversed=False, name_prober=None): def __init__(self, model, reversed=False, name_prober=None):
super(SingleByteCharSetProber, self).__init__() super(SingleByteCharSetProber, self).__init__()
@ -58,7 +54,7 @@ class SingleByteCharSetProber(CharSetProber):
super(SingleByteCharSetProber, self).reset() super(SingleByteCharSetProber, self).reset()
# char order of last character # char order of last character
self._last_order = 255 self._last_order = 255
self._seq_counters = [0] * self.NUMBER_OF_SEQ_CAT self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
self._total_seqs = 0 self._total_seqs = 0
self._total_char = 0 self._total_char = 0
# characters that fall in our sampling range # characters that fall in our sampling range
@ -71,15 +67,29 @@ class SingleByteCharSetProber(CharSetProber):
else: else:
return self._model['charset_name'] return self._model['charset_name']
@property
def language(self):
if self._name_prober:
return self._name_prober.language
else:
return self._model.get('language')
def feed(self, byte_str): def feed(self, byte_str):
if not self._model['keep_english_letter']: if not self._model['keep_english_letter']:
byte_str = self.filter_international_words(byte_str) byte_str = self.filter_international_words(byte_str)
num_bytes = len(byte_str) if not byte_str:
if not num_bytes:
return self.state return self.state
for c in byte_str: char_to_order_map = self._model['char_to_order_map']
order = self._model['char_to_order_map'][wrap_ord(c)] for i, c in enumerate(byte_str):
if order < self.SYMBOL_CAT_ORDER: # XXX: Order is in range 1-64, so one would think we want 0-63 here,
# but that leads to 27 more test failures than before.
order = char_to_order_map[c]
# XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
# CharacterCategory.SYMBOL is actually 253, so we use CONTROL
# to make it closer to the original intent. The only difference
# is whether or not we count digits and control characters for
# _total_char purposes.
if order < CharacterCategory.CONTROL:
self._total_char += 1 self._total_char += 1
if order < self.SAMPLE_SIZE: if order < self.SAMPLE_SIZE:
self._freq_char += 1 self._freq_char += 1
@ -94,27 +104,28 @@ class SingleByteCharSetProber(CharSetProber):
self._seq_counters[model] += 1 self._seq_counters[model] += 1
self._last_order = order self._last_order = order
if self.state == ProbingState.detecting: charset_name = self._model['charset_name']
if self.state == ProbingState.DETECTING:
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD: if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
cf = self.get_confidence() confidence = self.get_confidence()
if cf > self.POSITIVE_SHORTCUT_THRESHOLD: if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
self.logger.debug('%s confidence = %s, we have a winner', self.logger.debug('%s confidence = %s, we have a winner',
self._model['charset_name'], cf) charset_name, confidence)
self._state = ProbingState.found_it self._state = ProbingState.FOUND_IT
elif cf < self.NEGATIVE_SHORTCUT_THRESHOLD: elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
self.logger.debug('%s confidence = %s, below negative ' self.logger.debug('%s confidence = %s, below negative '
'shortcut threshold %s', 'shortcut threshhold %s', charset_name,
self._model['charset_name'], cf, confidence,
self.NEGATIVE_SHORTCUT_THRESHOLD) self.NEGATIVE_SHORTCUT_THRESHOLD)
self._state = ProbingState.not_me self._state = ProbingState.NOT_ME
return self.state return self.state
def get_confidence(self): def get_confidence(self):
r = 0.01 r = 0.01
if self._total_seqs > 0: if self._total_seqs > 0:
r = ((1.0 * self._seq_counters[self.POSITIVE_CAT]) / self._total_seqs r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
/ self._model['typical_positive_ratio']) self._total_seqs / self._model['typical_positive_ratio'])
r = r * self._freq_char / self._total_char r = r * self._freq_char / self._total_char
if r >= 1.0: if r >= 1.0:
r = 0.99 r = 0.99

View file

@ -49,36 +49,40 @@ class SJISProber(MultiByteCharSetProber):
def charset_name(self): def charset_name(self):
return self.context_analyzer.charset_name return self.context_analyzer.charset_name
@property
def language(self):
return "Japanese"
def feed(self, byte_str): def feed(self, byte_str):
for i in range(len(byte_str)): for i in range(len(byte_str)):
coding_state = self.coding_sm.next_state(byte_str[i]) coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.error: if coding_state == MachineState.ERROR:
self.logger.debug('%s prober hit error at byte %s', self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, i) self.charset_name, self.language, i)
self._state = ProbingState.not_me self._state = ProbingState.NOT_ME
break break
elif coding_state == MachineState.its_me: elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.found_it self._state = ProbingState.FOUND_IT
break break
elif coding_state == MachineState.start: elif coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen() char_len = self.coding_sm.get_current_charlen()
if i == 0: if i == 0:
self._last_char[1] = byte_str[0] self._last_char[1] = byte_str[0]
self.context_analyzer.feed(self._last_char[2 - char_len:], self.context_analyzer.feed(self._last_char[2 - char_len:],
char_len) char_len)
self.distribution_analyzer.feed(self._last_char, char_len) self.distribution_analyzer.feed(self._last_char, char_len)
else: else:
self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3 self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3
- char_len], char_len) - char_len], char_len)
self.distribution_analyzer.feed(byte_str[i - 1:i + 1], self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
char_len) char_len)
self._last_char[0] = byte_str[-1] self._last_char[0] = byte_str[-1]
if self.state == ProbingState.detecting: if self.state == ProbingState.DETECTING:
if (self.context_analyzer.got_enough_data() and if (self.context_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)): (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
self._state = ProbingState.found_it self._state = ProbingState.FOUND_IT
return self.state return self.state

View file

@ -40,6 +40,7 @@ import codecs
import logging import logging
import re import re
from .charsetgroupprober import CharSetGroupProber
from .enums import InputState, LanguageFilter, ProbingState from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober from .latin1prober import Latin1Prober
@ -67,8 +68,17 @@ class UniversalDetector(object):
MINIMUM_THRESHOLD = 0.20 MINIMUM_THRESHOLD = 0.20
HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]') HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
ESC_DETECTOR = re.compile(b'(\033|~{)') ESC_DETECTOR = re.compile(b'(\033|~{)')
WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
'iso-8859-2': 'Windows-1250',
'iso-8859-5': 'Windows-1251',
'iso-8859-6': 'Windows-1256',
'iso-8859-7': 'Windows-1253',
'iso-8859-8': 'Windows-1255',
'iso-8859-9': 'Windows-1254',
'iso-8859-13': 'Windows-1257'}
def __init__(self, lang_filter=LanguageFilter.all): def __init__(self, lang_filter=LanguageFilter.ALL):
self._esc_charset_prober = None self._esc_charset_prober = None
self._charset_probers = [] self._charset_probers = []
self.result = None self.result = None
@ -78,6 +88,7 @@ class UniversalDetector(object):
self._last_char = None self._last_char = None
self.lang_filter = lang_filter self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
self._has_win_bytes = None
self.reset() self.reset()
def reset(self): def reset(self):
@ -86,10 +97,11 @@ class UniversalDetector(object):
initial states. This is called by ``__init__``, so you only need to initial states. This is called by ``__init__``, so you only need to
call this directly in between analyses of different documents. call this directly in between analyses of different documents.
""" """
self.result = {'encoding': None, 'confidence': 0.0} self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
self.done = False self.done = False
self._got_data = False self._got_data = False
self._input_state = InputState.pure_ascii self._has_win_bytes = False
self._input_state = InputState.PURE_ASCII
self._last_char = b'' self._last_char = b''
if self._esc_charset_prober: if self._esc_charset_prober:
self._esc_charset_prober.reset() self._esc_charset_prober.reset()
@ -116,28 +128,40 @@ class UniversalDetector(object):
if not len(byte_str): if not len(byte_str):
return return
if not isinstance(byte_str, bytearray):
byte_str = bytearray(byte_str)
# First check for known BOMs, since these are guaranteed to be correct # First check for known BOMs, since these are guaranteed to be correct
if not self._got_data: if not self._got_data:
# If the data starts with BOM, we know it is UTF # If the data starts with BOM, we know it is UTF
if byte_str.startswith(codecs.BOM_UTF8): if byte_str.startswith(codecs.BOM_UTF8):
# EF BB BF UTF-8 with BOM # EF BB BF UTF-8 with BOM
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0} self.result = {'encoding': "UTF-8-SIG",
elif byte_str.startswith(codecs.BOM_UTF32_LE) or byte_str.startswith(codecs.BOM_UTF32_BE): 'confidence': 1.0,
'language': ''}
elif byte_str.startswith((codecs.BOM_UTF32_LE,
codecs.BOM_UTF32_BE)):
# FF FE 00 00 UTF-32, little-endian BOM # FF FE 00 00 UTF-32, little-endian BOM
# 00 00 FE FF UTF-32, big-endian BOM # 00 00 FE FF UTF-32, big-endian BOM
self.result = {'encoding': "UTF-32", 'confidence': 1.0} self.result = {'encoding': "UTF-32",
'confidence': 1.0,
'language': ''}
elif byte_str.startswith(b'\xFE\xFF\x00\x00'): elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
# FE FF 00 00 UCS-4, unusual octet order BOM (3412) # FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {'encoding': "X-ISO-10646-UCS-4-3412", self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
'confidence': 1.0} 'confidence': 1.0,
'language': ''}
elif byte_str.startswith(b'\x00\x00\xFF\xFE'): elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
# 00 00 FF FE UCS-4, unusual octet order BOM (2143) # 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {'encoding': "X-ISO-10646-UCS-4-2143", self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
'confidence': 1.0} 'confidence': 1.0,
elif byte_str.startswith(codecs.BOM_LE) or byte_str.startswith(codecs.BOM_BE): 'language': ''}
elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
# FF FE UTF-16, little endian BOM # FF FE UTF-16, little endian BOM
# FE FF UTF-16, big endian BOM # FE FF UTF-16, big endian BOM
self.result = {'encoding': "UTF-16", 'confidence': 1.0} self.result = {'encoding': "UTF-16",
'confidence': 1.0,
'language': ''}
self._got_data = True self._got_data = True
if self.result['encoding'] is not None: if self.result['encoding'] is not None:
@ -146,12 +170,12 @@ class UniversalDetector(object):
# If none of those matched and we've only see ASCII so far, check # If none of those matched and we've only see ASCII so far, check
# for high bytes and escape sequences # for high bytes and escape sequences
if self._input_state == InputState.pure_ascii: if self._input_state == InputState.PURE_ASCII:
if self.HIGH_BYTE_DETECTOR.search(byte_str): if self.HIGH_BYTE_DETECTOR.search(byte_str):
self._input_state = InputState.high_byte self._input_state = InputState.HIGH_BYTE
elif self._input_state == InputState.pure_ascii and \ elif self._input_state == InputState.PURE_ASCII and \
self.ESC_DETECTOR.search(self._last_char + byte_str): self.ESC_DETECTOR.search(self._last_char + byte_str):
self._input_state = InputState.esc_ascii self._input_state = InputState.ESC_ASCII
self._last_char = byte_str[-1:] self._last_char = byte_str[-1:]
@ -159,14 +183,16 @@ class UniversalDetector(object):
# uses a simple state machine to check for known escape sequences in # uses a simple state machine to check for known escape sequences in
# HZ and ISO-2022 encodings, since those are the only encodings that # HZ and ISO-2022 encodings, since those are the only encodings that
# use such sequences. # use such sequences.
if self._input_state == InputState.esc_ascii: if self._input_state == InputState.ESC_ASCII:
if not self._esc_charset_prober: if not self._esc_charset_prober:
self._esc_charset_prober = EscCharSetProber(self.lang_filter) self._esc_charset_prober = EscCharSetProber(self.lang_filter)
if self._esc_charset_prober.feed(byte_str) == ProbingState.found_it: if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
self.result = {'encoding': self.result = {'encoding':
self._esc_charset_prober.charset_name, self._esc_charset_prober.charset_name,
'confidence': 'confidence':
self._esc_charset_prober.get_confidence()} self._esc_charset_prober.get_confidence(),
'language':
self._esc_charset_prober.language}
self.done = True self.done = True
# If we've seen high bytes (i.e., those with values greater than 127), # If we've seen high bytes (i.e., those with values greater than 127),
# we need to do more complicated checks using all our multi-byte and # we need to do more complicated checks using all our multi-byte and
@ -174,59 +200,87 @@ class UniversalDetector(object):
# use character bigram distributions to determine the encoding, whereas # use character bigram distributions to determine the encoding, whereas
# the multi-byte probers use a combination of character unigram and # the multi-byte probers use a combination of character unigram and
# bigram distributions. # bigram distributions.
elif self._input_state == InputState.high_byte: elif self._input_state == InputState.HIGH_BYTE:
if not self._charset_probers: if not self._charset_probers:
self._charset_probers = [MBCSGroupProber(self.lang_filter)] self._charset_probers = [MBCSGroupProber(self.lang_filter)]
# If we're checking non-CJK encodings, use single-byte prober # If we're checking non-CJK encodings, use single-byte prober
if self.lang_filter & LanguageFilter.non_cjk: if self.lang_filter & LanguageFilter.NON_CJK:
self._charset_probers.append(SBCSGroupProber()) self._charset_probers.append(SBCSGroupProber())
self._charset_probers.append(Latin1Prober()) self._charset_probers.append(Latin1Prober())
for prober in self._charset_probers: for prober in self._charset_probers:
if prober.feed(byte_str) == ProbingState.found_it: if prober.feed(byte_str) == ProbingState.FOUND_IT:
self.result = {'encoding': prober.charset_name, self.result = {'encoding': prober.charset_name,
'confidence': prober.get_confidence()} 'confidence': prober.get_confidence(),
'language': prober.language}
self.done = True self.done = True
break break
if self.WIN_BYTE_DETECTOR.search(byte_str):
self._has_win_bytes = True
def close(self): def close(self):
""" """
Stop analyzing the current document and come up with a final Stop analyzing the current document and come up with a final
prediction. prediction.
:returns: The ``result`` attribute if a prediction was made, otherwise :returns: The ``result`` attribute, a ``dict`` with the keys
``None``. `encoding`, `confidence`, and `language`.
""" """
# Don't bother with checks if we're already done
if self.done: if self.done:
return self.result return self.result
if not self._got_data:
self.logger.debug('no data received!')
return
self.done = True self.done = True
if self._input_state in (InputState.pure_ascii, InputState.esc_ascii): if not self._got_data:
self.result = {'encoding': 'ascii', 'confidence': 1.0} self.logger.debug('no data received!')
return self.result
if self._input_state == InputState.high_byte: # Default to ASCII if it is all we've seen so far
proberConfidence = None elif self._input_state == InputState.PURE_ASCII:
self.result = {'encoding': 'ascii',
'confidence': 1.0,
'language': ''}
# If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
elif self._input_state == InputState.HIGH_BYTE:
prober_confidence = None
max_prober_confidence = 0.0 max_prober_confidence = 0.0
max_prober = None max_prober = None
for prober in self._charset_probers: for prober in self._charset_probers:
if not prober: if not prober:
continue continue
proberConfidence = prober.get_confidence() prober_confidence = prober.get_confidence()
if proberConfidence > max_prober_confidence: if prober_confidence > max_prober_confidence:
max_prober_confidence = proberConfidence max_prober_confidence = prober_confidence
max_prober = prober max_prober = prober
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD): if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
self.result = {'encoding': max_prober.charset_name, charset_name = max_prober.charset_name
'confidence': max_prober.get_confidence()} lower_charset_name = max_prober.charset_name.lower()
return self.result confidence = max_prober.get_confidence()
# Use Windows encoding name instead of ISO-8859 if we saw any
# extra Windows-specific bytes
if lower_charset_name.startswith('iso-8859'):
if self._has_win_bytes:
charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
charset_name)
self.result = {'encoding': charset_name,
'confidence': confidence,
'language': max_prober.language}
# Log all prober confidences if none met MINIMUM_THRESHOLD
if self.logger.getEffectiveLevel() == logging.DEBUG: if self.logger.getEffectiveLevel() == logging.DEBUG:
self.logger.debug('no probers hit minimum threshold') if self.result['encoding'] is None:
for prober in self._charset_probers[0].probers: self.logger.debug('no probers hit minimum threshold')
if not prober: for group_prober in self._charset_probers:
continue if not group_prober:
self.logger.debug('%s confidence = %s', prober.charset_name, continue
prober.get_confidence()) if isinstance(group_prober, CharSetGroupProber):
for prober in group_prober.probers:
self.logger.debug('%s %s confidence = %s',
prober.charset_name,
prober.language,
prober.get_confidence())
else:
self.logger.debug('%s %s confidence = %s',
prober.charset_name,
prober.language,
prober.get_confidence())
return self.result

View file

@ -50,22 +50,26 @@ class UTF8Prober(CharSetProber):
def charset_name(self): def charset_name(self):
return "utf-8" return "utf-8"
@property
def language(self):
return ""
def feed(self, byte_str): def feed(self, byte_str):
for c in byte_str: for c in byte_str:
coding_state = self.coding_sm.next_state(c) coding_state = self.coding_sm.next_state(c)
if coding_state == MachineState.error: if coding_state == MachineState.ERROR:
self._state = ProbingState.not_me self._state = ProbingState.NOT_ME
break break
elif coding_state == MachineState.its_me: elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.found_it self._state = ProbingState.FOUND_IT
break break
elif coding_state == MachineState.start: elif coding_state == MachineState.START:
if self.coding_sm.get_current_charlen() >= 2: if self.coding_sm.get_current_charlen() >= 2:
self._num_mb_chars += 1 self._num_mb_chars += 1
if self.state == ProbingState.detecting: if self.state == ProbingState.DETECTING:
if self.get_confidence() > self.SHORTCUT_THRESHOLD: if self.get_confidence() > self.SHORTCUT_THRESHOLD:
self._state = ProbingState.found_it self._state = ProbingState.FOUND_IT
return self.state return self.state

View file

@ -2,8 +2,8 @@
This module exists only to simplify retrieving the version number of chardet This module exists only to simplify retrieving the version number of chardet
from within setup.py and from chardet subpackages. from within setup.py and from chardet subpackages.
:author: Dan Blanchard (dblanchard@ets.org) :author: Dan Blanchard (dan.blanchard@gmail.com)
""" """
__version__ = "2.3.0" __version__ = "3.0.4"
VERSION = __version__.split('.') VERSION = __version__.split('.')