mirror of
https://github.com/SickGear/SickGear.git
synced 2025-01-22 09:33:37 +00:00
Merge pull request #977 from JackDandy/feature/UpdateChardet
Update chardet packages 2.3.0 (d7fae98) to 3.0.4 (9b8c5c2).
This commit is contained in:
commit
cf383de226
35 changed files with 486 additions and 329 deletions
|
@ -13,6 +13,7 @@
|
||||||
* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439)
|
* Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439)
|
||||||
* Update cachecontrol library 0.11.5 to 0.12.3 (db54c40)
|
* Update cachecontrol library 0.11.5 to 0.12.3 (db54c40)
|
||||||
* Update Certifi 2015.11.20.1 (385476b) to 2017.07.27 (f808089)
|
* Update Certifi 2015.11.20.1 (385476b) to 2017.07.27 (f808089)
|
||||||
|
* Update chardet packages 2.3.0 (d7fae98) to 3.0.4 (9b8c5c2)
|
||||||
* Update dateutil library 2.4.2 (d4baf97) to 2.6.1 (2f3a160)
|
* Update dateutil library 2.4.2 (d4baf97) to 2.6.1 (2f3a160)
|
||||||
* Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb)
|
* Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb)
|
||||||
* Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72)
|
* Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72)
|
||||||
|
|
|
@ -16,17 +16,24 @@
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
|
||||||
from .compat import PY2, PY3, bin_type as _bin_type
|
from .compat import PY2, PY3
|
||||||
from .universaldetector import UniversalDetector
|
from .universaldetector import UniversalDetector
|
||||||
from .version import __version__, VERSION
|
from .version import __version__, VERSION
|
||||||
|
|
||||||
|
|
||||||
def detect(byte_str):
|
def detect(byte_str):
|
||||||
if not isinstance(byte_str, _bin_type):
|
"""
|
||||||
raise TypeError('Expected object of {0} type, got: {1}'
|
Detect the encoding of the given byte string.
|
||||||
''.format(_bin_type, type(byte_str)))
|
|
||||||
|
|
||||||
u = UniversalDetector()
|
:param byte_str: The byte sequence to examine.
|
||||||
u.feed(byte_str)
|
:type byte_str: ``bytes`` or ``bytearray``
|
||||||
u.close()
|
"""
|
||||||
return u.result
|
if not isinstance(byte_str, bytearray):
|
||||||
|
if not isinstance(byte_str, bytes):
|
||||||
|
raise TypeError('Expected object of type bytes or bytearray, got: '
|
||||||
|
'{0}'.format(type(byte_str)))
|
||||||
|
else:
|
||||||
|
byte_str = bytearray(byte_str)
|
||||||
|
detector = UniversalDetector()
|
||||||
|
detector.feed(byte_str)
|
||||||
|
return detector.close()
|
||||||
|
|
|
@ -41,3 +41,7 @@ class Big5Prober(MultiByteCharSetProber):
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
return "Big5"
|
return "Big5"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
return "Chinese"
|
||||||
|
|
|
@ -35,7 +35,6 @@ from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE,
|
||||||
BIG5_TYPICAL_DISTRIBUTION_RATIO)
|
BIG5_TYPICAL_DISTRIBUTION_RATIO)
|
||||||
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
|
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
|
||||||
JIS_TYPICAL_DISTRIBUTION_RATIO)
|
JIS_TYPICAL_DISTRIBUTION_RATIO)
|
||||||
from .compat import wrap_ord
|
|
||||||
|
|
||||||
|
|
||||||
class CharDistributionAnalysis(object):
|
class CharDistributionAnalysis(object):
|
||||||
|
@ -123,9 +122,9 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
||||||
# first byte range: 0xc4 -- 0xfe
|
# first byte range: 0xc4 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
first_char = wrap_ord(byte_str[0])
|
first_char = byte_str[0]
|
||||||
if first_char >= 0xC4:
|
if first_char >= 0xC4:
|
||||||
return 94 * (first_char - 0xC4) + wrap_ord(byte_str[1]) - 0xA1
|
return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
|
||||||
else:
|
else:
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
@ -142,9 +141,9 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
||||||
# first byte range: 0xb0 -- 0xfe
|
# first byte range: 0xb0 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
first_char = wrap_ord(byte_str[0])
|
first_char = byte_str[0]
|
||||||
if first_char >= 0xB0:
|
if first_char >= 0xB0:
|
||||||
return 94 * (first_char - 0xB0) + wrap_ord(byte_str[1]) - 0xA1
|
return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
|
||||||
else:
|
else:
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
@ -161,7 +160,7 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
||||||
# first byte range: 0xb0 -- 0xfe
|
# first byte range: 0xb0 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
|
first_char, second_char = byte_str[0], byte_str[1]
|
||||||
if (first_char >= 0xB0) and (second_char >= 0xA1):
|
if (first_char >= 0xB0) and (second_char >= 0xA1):
|
||||||
return 94 * (first_char - 0xB0) + second_char - 0xA1
|
return 94 * (first_char - 0xB0) + second_char - 0xA1
|
||||||
else:
|
else:
|
||||||
|
@ -180,7 +179,7 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
|
||||||
# first byte range: 0xa4 -- 0xfe
|
# first byte range: 0xa4 -- 0xfe
|
||||||
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
|
first_char, second_char = byte_str[0], byte_str[1]
|
||||||
if first_char >= 0xA4:
|
if first_char >= 0xA4:
|
||||||
if second_char >= 0xA1:
|
if second_char >= 0xA1:
|
||||||
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
|
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
|
||||||
|
@ -202,7 +201,7 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
|
||||||
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
||||||
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
|
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
|
first_char, second_char = byte_str[0], byte_str[1]
|
||||||
if (first_char >= 0x81) and (first_char <= 0x9F):
|
if (first_char >= 0x81) and (first_char <= 0x9F):
|
||||||
order = 188 * (first_char - 0x81)
|
order = 188 * (first_char - 0x81)
|
||||||
elif (first_char >= 0xE0) and (first_char <= 0xEF):
|
elif (first_char >= 0xE0) and (first_char <= 0xEF):
|
||||||
|
@ -227,8 +226,8 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
||||||
# first byte range: 0xa0 -- 0xfe
|
# first byte range: 0xa0 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
char = wrap_ord(byte_str[0])
|
char = byte_str[0]
|
||||||
if char >= 0xA0:
|
if char >= 0xA0:
|
||||||
return 94 * (char - 0xA1) + wrap_ord(byte_str[1]) - 0xa1
|
return 94 * (char - 0xA1) + byte_str[1] - 0xa1
|
||||||
else:
|
else:
|
||||||
return -1
|
return -1
|
||||||
|
|
|
@ -54,6 +54,14 @@ class CharSetGroupProber(CharSetProber):
|
||||||
return None
|
return None
|
||||||
return self._best_guess_prober.charset_name
|
return self._best_guess_prober.charset_name
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
if not self._best_guess_prober:
|
||||||
|
self.get_confidence()
|
||||||
|
if not self._best_guess_prober:
|
||||||
|
return None
|
||||||
|
return self._best_guess_prober.language
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str):
|
||||||
for prober in self.probers:
|
for prober in self.probers:
|
||||||
if not prober:
|
if not prober:
|
||||||
|
@ -63,22 +71,22 @@ class CharSetGroupProber(CharSetProber):
|
||||||
state = prober.feed(byte_str)
|
state = prober.feed(byte_str)
|
||||||
if not state:
|
if not state:
|
||||||
continue
|
continue
|
||||||
if state == ProbingState.found_it:
|
if state == ProbingState.FOUND_IT:
|
||||||
self._best_guess_prober = prober
|
self._best_guess_prober = prober
|
||||||
return self.state
|
return self.state
|
||||||
elif state == ProbingState.not_me:
|
elif state == ProbingState.NOT_ME:
|
||||||
prober.active = False
|
prober.active = False
|
||||||
self._active_num -= 1
|
self._active_num -= 1
|
||||||
if self._active_num <= 0:
|
if self._active_num <= 0:
|
||||||
self._state = ProbingState.not_me
|
self._state = ProbingState.NOT_ME
|
||||||
return self.state
|
return self.state
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self):
|
||||||
state = self.state
|
state = self.state
|
||||||
if state == ProbingState.found_it:
|
if state == ProbingState.FOUND_IT:
|
||||||
return 0.99
|
return 0.99
|
||||||
elif state == ProbingState.not_me:
|
elif state == ProbingState.NOT_ME:
|
||||||
return 0.01
|
return 0.01
|
||||||
best_conf = 0.0
|
best_conf = 0.0
|
||||||
self._best_guess_prober = None
|
self._best_guess_prober = None
|
||||||
|
@ -89,7 +97,7 @@ class CharSetGroupProber(CharSetProber):
|
||||||
self.logger.debug('%s not active', prober.charset_name)
|
self.logger.debug('%s not active', prober.charset_name)
|
||||||
continue
|
continue
|
||||||
conf = prober.get_confidence()
|
conf = prober.get_confidence()
|
||||||
self.logger.debug('%s confidence = %s', prober.charset_name, conf)
|
self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
|
||||||
if best_conf < conf:
|
if best_conf < conf:
|
||||||
best_conf = conf
|
best_conf = conf
|
||||||
self._best_guess_prober = prober
|
self._best_guess_prober = prober
|
||||||
|
|
|
@ -42,7 +42,7 @@ class CharSetProber(object):
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self._state = ProbingState.detecting
|
self._state = ProbingState.DETECTING
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
|
|
|
@ -17,15 +17,12 @@ from __future__ import absolute_import, print_function, unicode_literals
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
from io import open
|
|
||||||
|
|
||||||
from chardet import __version__
|
from chardet import __version__
|
||||||
from chardet.compat import PY2
|
from chardet.compat import PY2
|
||||||
from chardet.universaldetector import UniversalDetector
|
from chardet.universaldetector import UniversalDetector
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def description_of(lines, name='stdin'):
|
def description_of(lines, name='stdin'):
|
||||||
"""
|
"""
|
||||||
Return a string describing the probable encoding of a file or
|
Return a string describing the probable encoding of a file or
|
||||||
|
@ -38,7 +35,11 @@ def description_of(lines, name='stdin'):
|
||||||
"""
|
"""
|
||||||
u = UniversalDetector()
|
u = UniversalDetector()
|
||||||
for line in lines:
|
for line in lines:
|
||||||
|
line = bytearray(line)
|
||||||
u.feed(line)
|
u.feed(line)
|
||||||
|
# shortcut out of the loop to save reading further - particularly useful if we read a BOM.
|
||||||
|
if u.done:
|
||||||
|
break
|
||||||
u.close()
|
u.close()
|
||||||
result = u.result
|
result = u.result
|
||||||
if PY2:
|
if PY2:
|
||||||
|
|
|
@ -28,7 +28,6 @@
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from .enums import MachineState
|
from .enums import MachineState
|
||||||
from .compat import wrap_ord
|
|
||||||
|
|
||||||
|
|
||||||
class CodingStateMachine(object):
|
class CodingStateMachine(object):
|
||||||
|
@ -62,13 +61,13 @@ class CodingStateMachine(object):
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self._curr_state = MachineState.start
|
self._curr_state = MachineState.START
|
||||||
|
|
||||||
def next_state(self, c):
|
def next_state(self, c):
|
||||||
# for each byte we get its class
|
# for each byte we get its class
|
||||||
# if it is first byte, we also get byte length
|
# if it is first byte, we also get byte length
|
||||||
byte_class = self._model['class_table'][wrap_ord(c)]
|
byte_class = self._model['class_table'][c]
|
||||||
if self._curr_state == MachineState.start:
|
if self._curr_state == MachineState.START:
|
||||||
self._curr_byte_pos = 0
|
self._curr_byte_pos = 0
|
||||||
self._curr_char_len = self._model['char_len_table'][byte_class]
|
self._curr_char_len = self._model['char_len_table'][byte_class]
|
||||||
# from byte's class and state_table, we get its next state
|
# from byte's class and state_table, we get its next state
|
||||||
|
@ -83,3 +82,7 @@ class CodingStateMachine(object):
|
||||||
|
|
||||||
def get_coding_state_machine(self):
|
def get_coding_state_machine(self):
|
||||||
return self._model['name']
|
return self._model['name']
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
return self._model['language']
|
||||||
|
|
|
@ -27,17 +27,8 @@ if sys.version_info < (3, 0):
|
||||||
PY3 = False
|
PY3 = False
|
||||||
base_str = (str, unicode)
|
base_str = (str, unicode)
|
||||||
text_type = unicode
|
text_type = unicode
|
||||||
bin_type = str
|
|
||||||
else:
|
else:
|
||||||
PY2 = False
|
PY2 = False
|
||||||
PY3 = True
|
PY3 = True
|
||||||
base_str = (bytes, str)
|
base_str = (bytes, str)
|
||||||
text_type = str
|
text_type = str
|
||||||
bin_type = (bytes, bytearray)
|
|
||||||
|
|
||||||
|
|
||||||
def wrap_ord(a):
|
|
||||||
if PY2 and isinstance(a, base_str):
|
|
||||||
return ord(a)
|
|
||||||
else:
|
|
||||||
return a
|
|
||||||
|
|
|
@ -43,3 +43,7 @@ class CP949Prober(MultiByteCharSetProber):
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
return "CP949"
|
return "CP949"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
return "Korean"
|
||||||
|
|
|
@ -9,9 +9,9 @@ class InputState(object):
|
||||||
"""
|
"""
|
||||||
This enum represents the different states a universal detector can be in.
|
This enum represents the different states a universal detector can be in.
|
||||||
"""
|
"""
|
||||||
pure_ascii = 0
|
PURE_ASCII = 0
|
||||||
esc_ascii = 1
|
ESC_ASCII = 1
|
||||||
high_byte = 2
|
HIGH_BYTE = 2
|
||||||
|
|
||||||
|
|
||||||
class LanguageFilter(object):
|
class LanguageFilter(object):
|
||||||
|
@ -19,29 +19,58 @@ class LanguageFilter(object):
|
||||||
This enum represents the different language filters we can apply to a
|
This enum represents the different language filters we can apply to a
|
||||||
``UniversalDetector``.
|
``UniversalDetector``.
|
||||||
"""
|
"""
|
||||||
chinese_simplified = 0x01
|
CHINESE_SIMPLIFIED = 0x01
|
||||||
chinese_traditional = 0x02
|
CHINESE_TRADITIONAL = 0x02
|
||||||
japanese = 0x04
|
JAPANESE = 0x04
|
||||||
korean = 0x08
|
KOREAN = 0x08
|
||||||
non_cjk = 0x10
|
NON_CJK = 0x10
|
||||||
all = 0x1F
|
ALL = 0x1F
|
||||||
chinese = chinese_simplified | chinese_traditional
|
CHINESE = CHINESE_SIMPLIFIED | CHINESE_TRADITIONAL
|
||||||
cjk = chinese | japanese | korean
|
CJK = CHINESE | JAPANESE | KOREAN
|
||||||
|
|
||||||
|
|
||||||
class ProbingState(object):
|
class ProbingState(object):
|
||||||
"""
|
"""
|
||||||
This enum represents the different states a prober can be in.
|
This enum represents the different states a prober can be in.
|
||||||
"""
|
"""
|
||||||
detecting = 0
|
DETECTING = 0
|
||||||
found_it = 1
|
FOUND_IT = 1
|
||||||
not_me = 2
|
NOT_ME = 2
|
||||||
|
|
||||||
|
|
||||||
class MachineState(object):
|
class MachineState(object):
|
||||||
"""
|
"""
|
||||||
This enum represents the different states a state machine can be in.
|
This enum represents the different states a state machine can be in.
|
||||||
"""
|
"""
|
||||||
start = 0
|
START = 0
|
||||||
error = 1
|
ERROR = 1
|
||||||
its_me = 2
|
ITS_ME = 2
|
||||||
|
|
||||||
|
|
||||||
|
class SequenceLikelihood(object):
|
||||||
|
"""
|
||||||
|
This enum represents the likelihood of a character following the previous one.
|
||||||
|
"""
|
||||||
|
NEGATIVE = 0
|
||||||
|
UNLIKELY = 1
|
||||||
|
LIKELY = 2
|
||||||
|
POSITIVE = 3
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_num_categories(cls):
|
||||||
|
""":returns: The number of likelihood categories in the enum."""
|
||||||
|
return 4
|
||||||
|
|
||||||
|
|
||||||
|
class CharacterCategory(object):
|
||||||
|
"""
|
||||||
|
This enum represents the different categories language models for
|
||||||
|
``SingleByteCharsetProber`` put characters into.
|
||||||
|
|
||||||
|
Anything less than CONTROL is considered a letter.
|
||||||
|
"""
|
||||||
|
UNDEFINED = 255
|
||||||
|
LINE_BREAK = 254
|
||||||
|
SYMBOL = 253
|
||||||
|
DIGIT = 252
|
||||||
|
CONTROL = 251
|
||||||
|
|
|
@ -27,7 +27,6 @@
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .codingstatemachine import CodingStateMachine
|
from .codingstatemachine import CodingStateMachine
|
||||||
from .compat import wrap_ord
|
|
||||||
from .enums import LanguageFilter, ProbingState, MachineState
|
from .enums import LanguageFilter, ProbingState, MachineState
|
||||||
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
|
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
|
||||||
ISO2022KR_SM_MODEL)
|
ISO2022KR_SM_MODEL)
|
||||||
|
@ -43,15 +42,16 @@ class EscCharSetProber(CharSetProber):
|
||||||
def __init__(self, lang_filter=None):
|
def __init__(self, lang_filter=None):
|
||||||
super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
|
super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
|
||||||
self.coding_sm = []
|
self.coding_sm = []
|
||||||
if self.lang_filter & LanguageFilter.chinese_simplified:
|
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
|
||||||
self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
|
self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
|
||||||
self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL))
|
self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL))
|
||||||
if self.lang_filter & LanguageFilter.japanese:
|
if self.lang_filter & LanguageFilter.JAPANESE:
|
||||||
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
|
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
|
||||||
if self.lang_filter & LanguageFilter.korean:
|
if self.lang_filter & LanguageFilter.KOREAN:
|
||||||
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
|
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
|
||||||
self.active_sm_count = None
|
self.active_sm_count = None
|
||||||
self._detected_charset = None
|
self._detected_charset = None
|
||||||
|
self._detected_language = None
|
||||||
self._state = None
|
self._state = None
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
|
@ -64,11 +64,16 @@ class EscCharSetProber(CharSetProber):
|
||||||
coding_sm.reset()
|
coding_sm.reset()
|
||||||
self.active_sm_count = len(self.coding_sm)
|
self.active_sm_count = len(self.coding_sm)
|
||||||
self._detected_charset = None
|
self._detected_charset = None
|
||||||
|
self._detected_language = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
return self._detected_charset
|
return self._detected_charset
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
return self._detected_language
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self):
|
||||||
if self._detected_charset:
|
if self._detected_charset:
|
||||||
return 0.99
|
return 0.99
|
||||||
|
@ -80,16 +85,17 @@ class EscCharSetProber(CharSetProber):
|
||||||
for coding_sm in self.coding_sm:
|
for coding_sm in self.coding_sm:
|
||||||
if not coding_sm or not coding_sm.active:
|
if not coding_sm or not coding_sm.active:
|
||||||
continue
|
continue
|
||||||
coding_state = coding_sm.next_state(wrap_ord(c))
|
coding_state = coding_sm.next_state(c)
|
||||||
if coding_state == MachineState.error:
|
if coding_state == MachineState.ERROR:
|
||||||
coding_sm.active = False
|
coding_sm.active = False
|
||||||
self.active_sm_count -= 1
|
self.active_sm_count -= 1
|
||||||
if self.active_sm_count <= 0:
|
if self.active_sm_count <= 0:
|
||||||
self._state = ProbingState.not_me
|
self._state = ProbingState.NOT_ME
|
||||||
return self.state
|
return self.state
|
||||||
elif coding_state == MachineState.its_me:
|
elif coding_state == MachineState.ITS_ME:
|
||||||
self._state = ProbingState.found_it
|
self._state = ProbingState.FOUND_IT
|
||||||
self._detected_charset = coding_sm.get_coding_state_machine()
|
self._detected_charset = coding_sm.get_coding_state_machine()
|
||||||
|
self._detected_language = coding_sm.language
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
|
@ -63,12 +63,12 @@ HZ_CLS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
HZ_ST = (
|
HZ_ST = (
|
||||||
MachineState.start,MachineState.error, 3,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,# 00-07
|
MachineState.START,MachineState.ERROR, 3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 08-0f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
|
||||||
MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start, 4,MachineState.error,# 10-17
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START, 4,MachineState.ERROR,# 10-17
|
||||||
5,MachineState.error, 6,MachineState.error, 5, 5, 4,MachineState.error,# 18-1f
|
5,MachineState.ERROR, 6,MachineState.ERROR, 5, 5, 4,MachineState.ERROR,# 18-1f
|
||||||
4,MachineState.error, 4, 4, 4,MachineState.error, 4,MachineState.error,# 20-27
|
4,MachineState.ERROR, 4, 4, 4,MachineState.ERROR, 4,MachineState.ERROR,# 20-27
|
||||||
4,MachineState.its_me,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 28-2f
|
4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f
|
||||||
)
|
)
|
||||||
|
|
||||||
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
||||||
|
@ -77,7 +77,8 @@ HZ_SM_MODEL = {'class_table': HZ_CLS,
|
||||||
'class_factor': 6,
|
'class_factor': 6,
|
||||||
'state_table': HZ_ST,
|
'state_table': HZ_ST,
|
||||||
'char_len_table': HZ_CHAR_LEN_TABLE,
|
'char_len_table': HZ_CHAR_LEN_TABLE,
|
||||||
'name': "HZ-GB-2312"}
|
'name': "HZ-GB-2312",
|
||||||
|
'language': 'Chinese'}
|
||||||
|
|
||||||
ISO2022CN_CLS = (
|
ISO2022CN_CLS = (
|
||||||
2,0,0,0,0,0,0,0, # 00 - 07
|
2,0,0,0,0,0,0,0, # 00 - 07
|
||||||
|
@ -115,14 +116,14 @@ ISO2022CN_CLS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
ISO2022CN_ST = (
|
ISO2022CN_ST = (
|
||||||
MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 00-07
|
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
|
||||||
MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 08-0f
|
MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
|
||||||
MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 10-17
|
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
|
||||||
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error, 4,MachineState.error,# 18-1f
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,# 18-1f
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 20-27
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27
|
||||||
5, 6,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 28-2f
|
5, 6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 30-37
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.start,# 38-3f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f
|
||||||
)
|
)
|
||||||
|
|
||||||
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
|
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||||
|
@ -131,7 +132,8 @@ ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS,
|
||||||
'class_factor': 9,
|
'class_factor': 9,
|
||||||
'state_table': ISO2022CN_ST,
|
'state_table': ISO2022CN_ST,
|
||||||
'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
|
'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
|
||||||
'name': "ISO-2022-CN"}
|
'name': "ISO-2022-CN",
|
||||||
|
'language': 'Chinese'}
|
||||||
|
|
||||||
ISO2022JP_CLS = (
|
ISO2022JP_CLS = (
|
||||||
2,0,0,0,0,0,0,0, # 00 - 07
|
2,0,0,0,0,0,0,0, # 00 - 07
|
||||||
|
@ -169,15 +171,15 @@ ISO2022JP_CLS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
ISO2022JP_ST = (
|
ISO2022JP_ST = (
|
||||||
MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 00-07
|
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
|
||||||
MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 08-0f
|
MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 10-17
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
|
||||||
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,# 18-1f
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f
|
||||||
MachineState.error, 5,MachineState.error,MachineState.error,MachineState.error, 4,MachineState.error,MachineState.error,# 20-27
|
MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 20-27
|
||||||
MachineState.error,MachineState.error,MachineState.error, 6,MachineState.its_me,MachineState.error,MachineState.its_me,MachineState.error,# 28-2f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,# 30-37
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 38-3f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.start,MachineState.start,# 40-47
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47
|
||||||
)
|
)
|
||||||
|
|
||||||
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||||
|
@ -186,7 +188,8 @@ ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS,
|
||||||
'class_factor': 10,
|
'class_factor': 10,
|
||||||
'state_table': ISO2022JP_ST,
|
'state_table': ISO2022JP_ST,
|
||||||
'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
|
'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
|
||||||
'name': "ISO-2022-JP"}
|
'name': "ISO-2022-JP",
|
||||||
|
'language': 'Japanese'}
|
||||||
|
|
||||||
ISO2022KR_CLS = (
|
ISO2022KR_CLS = (
|
||||||
2,0,0,0,0,0,0,0, # 00 - 07
|
2,0,0,0,0,0,0,0, # 00 - 07
|
||||||
|
@ -224,11 +227,11 @@ ISO2022KR_CLS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
ISO2022KR_ST = (
|
ISO2022KR_ST = (
|
||||||
MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,# 00-07
|
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 08-0f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
|
||||||
MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error, 4,MachineState.error,MachineState.error,# 10-17
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 10-17
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error, 5,MachineState.error,MachineState.error,MachineState.error,# 18-1f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 20-27
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27
|
||||||
)
|
)
|
||||||
|
|
||||||
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
||||||
|
@ -237,6 +240,7 @@ ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS,
|
||||||
'class_factor': 6,
|
'class_factor': 6,
|
||||||
'state_table': ISO2022KR_ST,
|
'state_table': ISO2022KR_ST,
|
||||||
'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
|
'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
|
||||||
'name': "ISO-2022-KR"}
|
'name': "ISO-2022-KR",
|
||||||
|
'language': 'Korean'}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -49,19 +49,23 @@ class EUCJPProber(MultiByteCharSetProber):
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
return "EUC-JP"
|
return "EUC-JP"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
return "Japanese"
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str):
|
||||||
for i in range(len(byte_str)):
|
for i in range(len(byte_str)):
|
||||||
# PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
|
# PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
|
||||||
coding_state = self.coding_sm.next_state(byte_str[i])
|
coding_state = self.coding_sm.next_state(byte_str[i])
|
||||||
if coding_state == MachineState.error:
|
if coding_state == MachineState.ERROR:
|
||||||
self.logger.debug('%s prober hit error at byte %s',
|
self.logger.debug('%s %s prober hit error at byte %s',
|
||||||
self.charset_name, i)
|
self.charset_name, self.language, i)
|
||||||
self._state = ProbingState.not_me
|
self._state = ProbingState.NOT_ME
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.its_me:
|
elif coding_state == MachineState.ITS_ME:
|
||||||
self._state = ProbingState.found_it
|
self._state = ProbingState.FOUND_IT
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.start:
|
elif coding_state == MachineState.START:
|
||||||
char_len = self.coding_sm.get_current_charlen()
|
char_len = self.coding_sm.get_current_charlen()
|
||||||
if i == 0:
|
if i == 0:
|
||||||
self._last_char[1] = byte_str[0]
|
self._last_char[1] = byte_str[0]
|
||||||
|
@ -75,10 +79,10 @@ class EUCJPProber(MultiByteCharSetProber):
|
||||||
|
|
||||||
self._last_char[0] = byte_str[-1]
|
self._last_char[0] = byte_str[-1]
|
||||||
|
|
||||||
if self.state == ProbingState.detecting:
|
if self.state == ProbingState.DETECTING:
|
||||||
if (self.context_analyzer.got_enough_data() and
|
if (self.context_analyzer.got_enough_data() and
|
||||||
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
||||||
self._state = ProbingState.found_it
|
self._state = ProbingState.FOUND_IT
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
|
|
|
@ -41,3 +41,7 @@ class EUCKRProber(MultiByteCharSetProber):
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
return "EUC-KR"
|
return "EUC-KR"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
return "Korean"
|
||||||
|
|
|
@ -44,7 +44,7 @@
|
||||||
EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
||||||
|
|
||||||
# Char to FreqOrder table ,
|
# Char to FreqOrder table ,
|
||||||
EUCTW_TABLE_SIZE = 8102
|
EUCTW_TABLE_SIZE = 5376
|
||||||
|
|
||||||
EUCTW_CHAR_TO_FREQ_ORDER = (
|
EUCTW_CHAR_TO_FREQ_ORDER = (
|
||||||
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
|
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
|
||||||
|
|
|
@ -40,3 +40,7 @@ class EUCTWProber(MultiByteCharSetProber):
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
return "EUC-TW"
|
return "EUC-TW"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
return "Taiwan"
|
||||||
|
|
|
@ -40,3 +40,7 @@ class GB2312Prober(MultiByteCharSetProber):
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
return "GB2312"
|
return "GB2312"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
return "Chinese"
|
||||||
|
|
|
@ -27,7 +27,6 @@
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .enums import ProbingState
|
from .enums import ProbingState
|
||||||
from .compat import wrap_ord
|
|
||||||
|
|
||||||
# This prober doesn't actually recognize a language or a charset.
|
# This prober doesn't actually recognize a language or a charset.
|
||||||
# It is a helper prober for the use of the Hebrew model probers
|
# It is a helper prober for the use of the Hebrew model probers
|
||||||
|
@ -177,8 +176,8 @@ class HebrewProber(CharSetProber):
|
||||||
self._visual_prober = visualProber
|
self._visual_prober = visualProber
|
||||||
|
|
||||||
def is_final(self, c):
|
def is_final(self, c):
|
||||||
return wrap_ord(c) in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
|
return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
|
||||||
self.FINAL_PE, self.FINAL_TSADI]
|
self.FINAL_PE, self.FINAL_TSADI]
|
||||||
|
|
||||||
def is_non_final(self, c):
|
def is_non_final(self, c):
|
||||||
# The normal Tsadi is not a good Non-Final letter due to words like
|
# The normal Tsadi is not a good Non-Final letter due to words like
|
||||||
|
@ -191,8 +190,8 @@ class HebrewProber(CharSetProber):
|
||||||
# for example legally end with a Non-Final Pe or Kaf. However, the
|
# for example legally end with a Non-Final Pe or Kaf. However, the
|
||||||
# benefit of these letters as Non-Final letters outweighs the damage
|
# benefit of these letters as Non-Final letters outweighs the damage
|
||||||
# since these words are quite rare.
|
# since these words are quite rare.
|
||||||
return wrap_ord(c) in [self.NORMAL_KAF, self.NORMAL_MEM,
|
return c in [self.NORMAL_KAF, self.NORMAL_MEM,
|
||||||
self.NORMAL_NUN, self.NORMAL_PE]
|
self.NORMAL_NUN, self.NORMAL_PE]
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str):
|
||||||
# Final letter analysis for logical-visual decision.
|
# Final letter analysis for logical-visual decision.
|
||||||
|
@ -221,9 +220,9 @@ class HebrewProber(CharSetProber):
|
||||||
# We automatically filter out all 7-bit characters (replace them with
|
# We automatically filter out all 7-bit characters (replace them with
|
||||||
# spaces) so the word boundary detection works properly. [MAP]
|
# spaces) so the word boundary detection works properly. [MAP]
|
||||||
|
|
||||||
if self.state == ProbingState.not_me:
|
if self.state == ProbingState.NOT_ME:
|
||||||
# Both model probers say it's not them. No reason to continue.
|
# Both model probers say it's not them. No reason to continue.
|
||||||
return ProbingState.not_me
|
return ProbingState.NOT_ME
|
||||||
|
|
||||||
byte_str = self.filter_high_byte_only(byte_str)
|
byte_str = self.filter_high_byte_only(byte_str)
|
||||||
|
|
||||||
|
@ -250,8 +249,8 @@ class HebrewProber(CharSetProber):
|
||||||
self._prev = cur
|
self._prev = cur
|
||||||
|
|
||||||
# Forever detecting, till the end or until both model probers return
|
# Forever detecting, till the end or until both model probers return
|
||||||
# ProbingState.not_me (handled above)
|
# ProbingState.NOT_ME (handled above)
|
||||||
return ProbingState.detecting
|
return ProbingState.DETECTING
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
|
@ -280,10 +279,14 @@ class HebrewProber(CharSetProber):
|
||||||
# Logical.
|
# Logical.
|
||||||
return self.LOGICAL_HEBREW_NAME
|
return self.LOGICAL_HEBREW_NAME
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
return 'Hebrew'
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def state(self):
|
def state(self):
|
||||||
# Remain active as long as any of the model probers are active.
|
# Remain active as long as any of the model probers are active.
|
||||||
if (self._logical_prober.state == ProbingState.not_me) and \
|
if (self._logical_prober.state == ProbingState.NOT_ME) and \
|
||||||
(self._visual_prober.state == ProbingState.not_me):
|
(self._visual_prober.state == ProbingState.NOT_ME):
|
||||||
return ProbingState.not_me
|
return ProbingState.NOT_ME
|
||||||
return ProbingState.detecting
|
return ProbingState.DETECTING
|
||||||
|
|
|
@ -25,7 +25,6 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .compat import wrap_ord
|
|
||||||
|
|
||||||
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||||
jp2CharContext = (
|
jp2CharContext = (
|
||||||
|
@ -194,7 +193,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||||
if not byte_str:
|
if not byte_str:
|
||||||
return -1, 1
|
return -1, 1
|
||||||
# find out current char's byte length
|
# find out current char's byte length
|
||||||
first_char = wrap_ord(byte_str[0])
|
first_char = byte_str[0]
|
||||||
if (0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC):
|
if (0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC):
|
||||||
char_len = 2
|
char_len = 2
|
||||||
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
|
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
|
||||||
|
@ -204,7 +203,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||||
|
|
||||||
# return its order if it is hiragana
|
# return its order if it is hiragana
|
||||||
if len(byte_str) > 1:
|
if len(byte_str) > 1:
|
||||||
second_char = wrap_ord(byte_str[1])
|
second_char = byte_str[1]
|
||||||
if (first_char == 202) and (0x9F <= second_char <= 0xF1):
|
if (first_char == 202) and (0x9F <= second_char <= 0xF1):
|
||||||
return second_char - 0x9F, char_len
|
return second_char - 0x9F, char_len
|
||||||
|
|
||||||
|
@ -215,7 +214,7 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
||||||
if not byte_str:
|
if not byte_str:
|
||||||
return -1, 1
|
return -1, 1
|
||||||
# find out current char's byte length
|
# find out current char's byte length
|
||||||
first_char = wrap_ord(byte_str[0])
|
first_char = byte_str[0]
|
||||||
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
|
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
|
||||||
char_len = 2
|
char_len = 2
|
||||||
elif first_char == 0x8F:
|
elif first_char == 0x8F:
|
||||||
|
@ -225,7 +224,7 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
||||||
|
|
||||||
# return its order if it is hiragana
|
# return its order if it is hiragana
|
||||||
if len(byte_str) > 1:
|
if len(byte_str) > 1:
|
||||||
second_char = wrap_ord(byte_str[1])
|
second_char = byte_str[1]
|
||||||
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
|
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
|
||||||
return second_char - 0xA1, char_len
|
return second_char - 0xA1, char_len
|
||||||
|
|
||||||
|
|
|
@ -214,7 +214,8 @@ Latin5BulgarianModel = {
|
||||||
'precedence_matrix': BulgarianLangModel,
|
'precedence_matrix': BulgarianLangModel,
|
||||||
'typical_positive_ratio': 0.969392,
|
'typical_positive_ratio': 0.969392,
|
||||||
'keep_english_letter': False,
|
'keep_english_letter': False,
|
||||||
'charset_name': "ISO-8859-5"
|
'charset_name': "ISO-8859-5",
|
||||||
|
'language': 'Bulgairan',
|
||||||
}
|
}
|
||||||
|
|
||||||
Win1251BulgarianModel = {
|
Win1251BulgarianModel = {
|
||||||
|
@ -222,8 +223,6 @@ Win1251BulgarianModel = {
|
||||||
'precedence_matrix': BulgarianLangModel,
|
'precedence_matrix': BulgarianLangModel,
|
||||||
'typical_positive_ratio': 0.969392,
|
'typical_positive_ratio': 0.969392,
|
||||||
'keep_english_letter': False,
|
'keep_english_letter': False,
|
||||||
'charset_name': "windows-1251"
|
'charset_name': "windows-1251",
|
||||||
|
'language': 'Bulgarian',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -283,7 +283,8 @@ Koi8rModel = {
|
||||||
'precedence_matrix': RussianLangModel,
|
'precedence_matrix': RussianLangModel,
|
||||||
'typical_positive_ratio': 0.976601,
|
'typical_positive_ratio': 0.976601,
|
||||||
'keep_english_letter': False,
|
'keep_english_letter': False,
|
||||||
'charset_name': "KOI8-R"
|
'charset_name': "KOI8-R",
|
||||||
|
'language': 'Russian',
|
||||||
}
|
}
|
||||||
|
|
||||||
Win1251CyrillicModel = {
|
Win1251CyrillicModel = {
|
||||||
|
@ -291,7 +292,8 @@ Win1251CyrillicModel = {
|
||||||
'precedence_matrix': RussianLangModel,
|
'precedence_matrix': RussianLangModel,
|
||||||
'typical_positive_ratio': 0.976601,
|
'typical_positive_ratio': 0.976601,
|
||||||
'keep_english_letter': False,
|
'keep_english_letter': False,
|
||||||
'charset_name': "windows-1251"
|
'charset_name': "windows-1251",
|
||||||
|
'language': 'Russian',
|
||||||
}
|
}
|
||||||
|
|
||||||
Latin5CyrillicModel = {
|
Latin5CyrillicModel = {
|
||||||
|
@ -299,7 +301,8 @@ Latin5CyrillicModel = {
|
||||||
'precedence_matrix': RussianLangModel,
|
'precedence_matrix': RussianLangModel,
|
||||||
'typical_positive_ratio': 0.976601,
|
'typical_positive_ratio': 0.976601,
|
||||||
'keep_english_letter': False,
|
'keep_english_letter': False,
|
||||||
'charset_name': "ISO-8859-5"
|
'charset_name': "ISO-8859-5",
|
||||||
|
'language': 'Russian',
|
||||||
}
|
}
|
||||||
|
|
||||||
MacCyrillicModel = {
|
MacCyrillicModel = {
|
||||||
|
@ -307,7 +310,8 @@ MacCyrillicModel = {
|
||||||
'precedence_matrix': RussianLangModel,
|
'precedence_matrix': RussianLangModel,
|
||||||
'typical_positive_ratio': 0.976601,
|
'typical_positive_ratio': 0.976601,
|
||||||
'keep_english_letter': False,
|
'keep_english_letter': False,
|
||||||
'charset_name': "MacCyrillic"
|
'charset_name': "MacCyrillic",
|
||||||
|
'language': 'Russian',
|
||||||
}
|
}
|
||||||
|
|
||||||
Ibm866Model = {
|
Ibm866Model = {
|
||||||
|
@ -315,7 +319,8 @@ Ibm866Model = {
|
||||||
'precedence_matrix': RussianLangModel,
|
'precedence_matrix': RussianLangModel,
|
||||||
'typical_positive_ratio': 0.976601,
|
'typical_positive_ratio': 0.976601,
|
||||||
'keep_english_letter': False,
|
'keep_english_letter': False,
|
||||||
'charset_name': "IBM866"
|
'charset_name': "IBM866",
|
||||||
|
'language': 'Russian',
|
||||||
}
|
}
|
||||||
|
|
||||||
Ibm855Model = {
|
Ibm855Model = {
|
||||||
|
@ -323,7 +328,6 @@ Ibm855Model = {
|
||||||
'precedence_matrix': RussianLangModel,
|
'precedence_matrix': RussianLangModel,
|
||||||
'typical_positive_ratio': 0.976601,
|
'typical_positive_ratio': 0.976601,
|
||||||
'keep_english_letter': False,
|
'keep_english_letter': False,
|
||||||
'charset_name': "IBM855"
|
'charset_name': "IBM855",
|
||||||
|
'language': 'Russian',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -211,7 +211,8 @@ Latin7GreekModel = {
|
||||||
'precedence_matrix': GreekLangModel,
|
'precedence_matrix': GreekLangModel,
|
||||||
'typical_positive_ratio': 0.982851,
|
'typical_positive_ratio': 0.982851,
|
||||||
'keep_english_letter': False,
|
'keep_english_letter': False,
|
||||||
'charset_name': "ISO-8859-7"
|
'charset_name': "ISO-8859-7",
|
||||||
|
'language': 'Greek',
|
||||||
}
|
}
|
||||||
|
|
||||||
Win1253GreekModel = {
|
Win1253GreekModel = {
|
||||||
|
@ -219,7 +220,6 @@ Win1253GreekModel = {
|
||||||
'precedence_matrix': GreekLangModel,
|
'precedence_matrix': GreekLangModel,
|
||||||
'typical_positive_ratio': 0.982851,
|
'typical_positive_ratio': 0.982851,
|
||||||
'keep_english_letter': False,
|
'keep_english_letter': False,
|
||||||
'charset_name': "windows-1253"
|
'charset_name': "windows-1253",
|
||||||
|
'language': 'Greek',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -195,7 +195,6 @@ Win1255HebrewModel = {
|
||||||
'precedence_matrix': HEBREW_LANG_MODEL,
|
'precedence_matrix': HEBREW_LANG_MODEL,
|
||||||
'typical_positive_ratio': 0.984004,
|
'typical_positive_ratio': 0.984004,
|
||||||
'keep_english_letter': False,
|
'keep_english_letter': False,
|
||||||
'charset_name': "windows-1255"
|
'charset_name': "windows-1255",
|
||||||
|
'language': 'Hebrew',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -211,7 +211,8 @@ Latin2HungarianModel = {
|
||||||
'precedence_matrix': HungarianLangModel,
|
'precedence_matrix': HungarianLangModel,
|
||||||
'typical_positive_ratio': 0.947368,
|
'typical_positive_ratio': 0.947368,
|
||||||
'keep_english_letter': True,
|
'keep_english_letter': True,
|
||||||
'charset_name': "ISO-8859-2"
|
'charset_name': "ISO-8859-2",
|
||||||
|
'language': 'Hungarian',
|
||||||
}
|
}
|
||||||
|
|
||||||
Win1250HungarianModel = {
|
Win1250HungarianModel = {
|
||||||
|
@ -219,7 +220,6 @@ Win1250HungarianModel = {
|
||||||
'precedence_matrix': HungarianLangModel,
|
'precedence_matrix': HungarianLangModel,
|
||||||
'typical_positive_ratio': 0.947368,
|
'typical_positive_ratio': 0.947368,
|
||||||
'keep_english_letter': True,
|
'keep_english_letter': True,
|
||||||
'charset_name': "windows-1250"
|
'charset_name': "windows-1250",
|
||||||
|
'language': 'Hungarian',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -194,7 +194,6 @@ TIS620ThaiModel = {
|
||||||
'precedence_matrix': ThaiLangModel,
|
'precedence_matrix': ThaiLangModel,
|
||||||
'typical_positive_ratio': 0.926386,
|
'typical_positive_ratio': 0.926386,
|
||||||
'keep_english_letter': False,
|
'keep_english_letter': False,
|
||||||
'charset_name': "TIS-620"
|
'charset_name': "TIS-620",
|
||||||
|
'language': 'Thai',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -188,5 +188,6 @@ Latin5TurkishModel = {
|
||||||
'precedence_matrix': TurkishLangModel,
|
'precedence_matrix': TurkishLangModel,
|
||||||
'typical_positive_ratio': 0.970290,
|
'typical_positive_ratio': 0.970290,
|
||||||
'keep_english_letter': True,
|
'keep_english_letter': True,
|
||||||
'charset_name': "ISO-8859-9"
|
'charset_name': "ISO-8859-9",
|
||||||
|
'language': 'Turkish',
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,7 +27,6 @@
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .compat import wrap_ord
|
|
||||||
from .enums import ProbingState
|
from .enums import ProbingState
|
||||||
|
|
||||||
FREQ_CAT_NUM = 4
|
FREQ_CAT_NUM = 4
|
||||||
|
@ -108,16 +107,20 @@ class Latin1Prober(CharSetProber):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
return "windows-1252"
|
return "ISO-8859-1"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
return ""
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str):
|
||||||
byte_str = self.filter_with_english_letters(byte_str)
|
byte_str = self.filter_with_english_letters(byte_str)
|
||||||
for c in byte_str:
|
for c in byte_str:
|
||||||
char_class = Latin1_CharToClass[wrap_ord(c)]
|
char_class = Latin1_CharToClass[c]
|
||||||
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
|
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
|
||||||
+ char_class]
|
+ char_class]
|
||||||
if freq == 0:
|
if freq == 0:
|
||||||
self._state = ProbingState.not_me
|
self._state = ProbingState.NOT_ME
|
||||||
break
|
break
|
||||||
self._freq_counter[freq] += 1
|
self._freq_counter[freq] += 1
|
||||||
self._last_char_class = char_class
|
self._last_char_class = char_class
|
||||||
|
@ -125,7 +128,7 @@ class Latin1Prober(CharSetProber):
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self):
|
||||||
if self.state == ProbingState.not_me:
|
if self.state == ProbingState.NOT_ME:
|
||||||
return 0.01
|
return 0.01
|
||||||
|
|
||||||
total = sum(self._freq_counter)
|
total = sum(self._freq_counter)
|
||||||
|
|
|
@ -52,34 +52,38 @@ class MultiByteCharSetProber(CharSetProber):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
pass
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str):
|
||||||
for i in range(len(byte_str)):
|
for i in range(len(byte_str)):
|
||||||
coding_state = self.coding_sm.next_state(byte_str[i])
|
coding_state = self.coding_sm.next_state(byte_str[i])
|
||||||
if coding_state == MachineState.error:
|
if coding_state == MachineState.ERROR:
|
||||||
self.logger.debug('%s prober hit error at byte %s',
|
self.logger.debug('%s %s prober hit error at byte %s',
|
||||||
self.charset_name, i)
|
self.charset_name, self.language, i)
|
||||||
self._state = ProbingState.not_me
|
self._state = ProbingState.NOT_ME
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.its_me:
|
elif coding_state == MachineState.ITS_ME:
|
||||||
self._state = ProbingState.found_it
|
self._state = ProbingState.FOUND_IT
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.start:
|
elif coding_state == MachineState.START:
|
||||||
char_len = self.coding_sm.get_current_charlen()
|
char_len = self.coding_sm.get_current_charlen()
|
||||||
if i == 0:
|
if i == 0:
|
||||||
self._last_char[1] = byte_str[0]
|
self._last_char[1] = byte_str[0]
|
||||||
self.distribution_analyzer.feed(self._last_char, char_len)
|
self.distribution_analyzer.feed(self._last_char, char_len)
|
||||||
else:
|
else:
|
||||||
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
||||||
char_len)
|
char_len)
|
||||||
|
|
||||||
self._last_char[0] = byte_str[-1]
|
self._last_char[0] = byte_str[-1]
|
||||||
|
|
||||||
if self.state == ProbingState.detecting:
|
if self.state == ProbingState.DETECTING:
|
||||||
if (self.distribution_analyzer.got_enough_data() and
|
if (self.distribution_analyzer.got_enough_data() and
|
||||||
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
||||||
self._state = ProbingState.found_it
|
self._state = ProbingState.FOUND_IT
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
|
|
|
@ -65,9 +65,9 @@ BIG5_CLS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
BIG5_ST = (
|
BIG5_ST = (
|
||||||
MachineState.error,MachineState.start,MachineState.start, 3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07
|
MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||||
MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,#08-0f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f
|
||||||
MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start#10-17
|
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17
|
||||||
)
|
)
|
||||||
|
|
||||||
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
|
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
|
||||||
|
@ -101,13 +101,13 @@ CP949_CLS = (
|
||||||
|
|
||||||
CP949_ST = (
|
CP949_ST = (
|
||||||
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
|
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
|
||||||
MachineState.error,MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start, 4, 5,MachineState.error, 6, # MachineState.start
|
MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START, 4, 5,MachineState.ERROR, 6, # MachineState.START
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, # MachineState.error
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, # MachineState.ERROR
|
||||||
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me, # MachineState.its_me
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # MachineState.ITS_ME
|
||||||
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start, # 3
|
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 3
|
||||||
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, # 4
|
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 4
|
||||||
MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, # 5
|
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5
|
||||||
MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start, # 6
|
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6
|
||||||
)
|
)
|
||||||
|
|
||||||
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
|
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
|
||||||
|
@ -156,11 +156,11 @@ EUCJP_CLS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
EUCJP_ST = (
|
EUCJP_ST = (
|
||||||
3, 4, 3, 5,MachineState.start,MachineState.error,MachineState.error,MachineState.error,#00-07
|
3, 4, 3, 5,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||||
MachineState.its_me,MachineState.its_me,MachineState.start,MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error,#10-17
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17
|
||||||
MachineState.error,MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error, 3,MachineState.error,#18-1f
|
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f
|
||||||
3,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start#20-27
|
3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27
|
||||||
)
|
)
|
||||||
|
|
||||||
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
|
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
|
||||||
|
@ -209,8 +209,8 @@ EUCKR_CLS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
EUCKR_ST = (
|
EUCKR_ST = (
|
||||||
MachineState.error,MachineState.start, 3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07
|
MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||||
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start #08-0f
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f
|
||||||
)
|
)
|
||||||
|
|
||||||
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
|
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
|
||||||
|
@ -259,12 +259,12 @@ EUCTW_CLS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
EUCTW_ST = (
|
EUCTW_ST = (
|
||||||
MachineState.error,MachineState.error,MachineState.start, 3, 3, 3, 4,MachineState.error,#00-07
|
MachineState.ERROR,MachineState.ERROR,MachineState.START, 3, 3, 3, 4,MachineState.ERROR,#00-07
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,#08-0f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||||
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.start,MachineState.error,#10-17
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.ERROR,#10-17
|
||||||
MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f
|
MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
|
||||||
5,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.error,MachineState.start,MachineState.start,#20-27
|
5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27
|
||||||
MachineState.start,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start #28-2f
|
MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
|
||||||
)
|
)
|
||||||
|
|
||||||
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
|
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
|
||||||
|
@ -313,12 +313,12 @@ GB2312_CLS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
GB2312_ST = (
|
GB2312_ST = (
|
||||||
MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, 3,MachineState.error,#00-07
|
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, 3,MachineState.ERROR,#00-07
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,#08-0f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||||
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,#10-17
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,#10-17
|
||||||
4,MachineState.error,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f
|
4,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
|
||||||
MachineState.error,MachineState.error, 5,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,#20-27
|
MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27
|
||||||
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start #28-2f
|
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
|
||||||
)
|
)
|
||||||
|
|
||||||
# To be accurate, the length of class 6 can be either 2 or 4.
|
# To be accurate, the length of class 6 can be either 2 or 4.
|
||||||
|
@ -374,9 +374,9 @@ SJIS_CLS = (
|
||||||
|
|
||||||
|
|
||||||
SJIS_ST = (
|
SJIS_ST = (
|
||||||
MachineState.error,MachineState.start,MachineState.start, 3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07
|
MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||||
MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start #10-17
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17
|
||||||
)
|
)
|
||||||
|
|
||||||
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
|
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
|
||||||
|
@ -425,13 +425,13 @@ UCS2BE_CLS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
UCS2BE_ST = (
|
UCS2BE_ST = (
|
||||||
5, 7, 7,MachineState.error, 4, 3,MachineState.error,MachineState.error,#00-07
|
5, 7, 7,MachineState.ERROR, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||||
MachineState.its_me,MachineState.its_me, 6, 6, 6, 6,MachineState.error,MachineState.error,#10-17
|
MachineState.ITS_ME,MachineState.ITS_ME, 6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,#10-17
|
||||||
6, 6, 6, 6, 6,MachineState.its_me, 6, 6,#18-1f
|
6, 6, 6, 6, 6,MachineState.ITS_ME, 6, 6,#18-1f
|
||||||
6, 6, 6, 6, 5, 7, 7,MachineState.error,#20-27
|
6, 6, 6, 6, 5, 7, 7,MachineState.ERROR,#20-27
|
||||||
5, 8, 6, 6,MachineState.error, 6, 6, 6,#28-2f
|
5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f
|
||||||
6, 6, 6, 6,MachineState.error,MachineState.error,MachineState.start,MachineState.start #30-37
|
6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37
|
||||||
)
|
)
|
||||||
|
|
||||||
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
|
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
|
||||||
|
@ -480,13 +480,13 @@ UCS2LE_CLS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
UCS2LE_ST = (
|
UCS2LE_ST = (
|
||||||
6, 6, 7, 6, 4, 3,MachineState.error,MachineState.error,#00-07
|
6, 6, 7, 6, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||||
MachineState.its_me,MachineState.its_me, 5, 5, 5,MachineState.error,MachineState.its_me,MachineState.error,#10-17
|
MachineState.ITS_ME,MachineState.ITS_ME, 5, 5, 5,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#10-17
|
||||||
5, 5, 5,MachineState.error, 5,MachineState.error, 6, 6,#18-1f
|
5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR, 6, 6,#18-1f
|
||||||
7, 6, 8, 8, 5, 5, 5,MachineState.error,#20-27
|
7, 6, 8, 8, 5, 5, 5,MachineState.ERROR,#20-27
|
||||||
5, 5, 5,MachineState.error,MachineState.error,MachineState.error, 5, 5,#28-2f
|
5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f
|
||||||
5, 5, 5,MachineState.error, 5,MachineState.error,MachineState.start,MachineState.start #30-37
|
5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37
|
||||||
)
|
)
|
||||||
|
|
||||||
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
|
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
|
||||||
|
@ -535,32 +535,32 @@ UTF8_CLS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
UTF8_ST = (
|
UTF8_ST = (
|
||||||
MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error, 12, 10,#00-07
|
MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 12, 10,#00-07
|
||||||
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
|
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#10-17
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
|
||||||
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#20-27
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#20-27
|
||||||
MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#28-2f
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#28-2f
|
||||||
MachineState.error,MachineState.error, 5, 5, 5, 5,MachineState.error,MachineState.error,#30-37
|
MachineState.ERROR,MachineState.ERROR, 5, 5, 5, 5,MachineState.ERROR,MachineState.ERROR,#30-37
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#38-3f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#38-3f
|
||||||
MachineState.error,MachineState.error,MachineState.error, 5, 5, 5,MachineState.error,MachineState.error,#40-47
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5, 5,MachineState.ERROR,MachineState.ERROR,#40-47
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#48-4f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#48-4f
|
||||||
MachineState.error,MachineState.error, 7, 7, 7, 7,MachineState.error,MachineState.error,#50-57
|
MachineState.ERROR,MachineState.ERROR, 7, 7, 7, 7,MachineState.ERROR,MachineState.ERROR,#50-57
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#58-5f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#58-5f
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error, 7, 7,MachineState.error,MachineState.error,#60-67
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 7, 7,MachineState.ERROR,MachineState.ERROR,#60-67
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#68-6f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#68-6f
|
||||||
MachineState.error,MachineState.error, 9, 9, 9, 9,MachineState.error,MachineState.error,#70-77
|
MachineState.ERROR,MachineState.ERROR, 9, 9, 9, 9,MachineState.ERROR,MachineState.ERROR,#70-77
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#78-7f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#78-7f
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, 9,MachineState.error,MachineState.error,#80-87
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 9,MachineState.ERROR,MachineState.ERROR,#80-87
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#88-8f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#88-8f
|
||||||
MachineState.error,MachineState.error, 12, 12, 12, 12,MachineState.error,MachineState.error,#90-97
|
MachineState.ERROR,MachineState.ERROR, 12, 12, 12, 12,MachineState.ERROR,MachineState.ERROR,#90-97
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#98-9f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#98-9f
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, 12,MachineState.error,MachineState.error,#a0-a7
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 12,MachineState.ERROR,MachineState.ERROR,#a0-a7
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#a8-af
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#a8-af
|
||||||
MachineState.error,MachineState.error, 12, 12, 12,MachineState.error,MachineState.error,MachineState.error,#b0-b7
|
MachineState.ERROR,MachineState.ERROR, 12, 12, 12,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b0-b7
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#b8-bf
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b8-bf
|
||||||
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,#c0-c7
|
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7
|
||||||
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error #c8-cf
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf
|
||||||
)
|
)
|
||||||
|
|
||||||
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
|
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
|
||||||
|
|
|
@ -27,18 +27,14 @@
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .compat import wrap_ord
|
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
|
||||||
from .enums import ProbingState
|
|
||||||
|
|
||||||
|
|
||||||
class SingleByteCharSetProber(CharSetProber):
|
class SingleByteCharSetProber(CharSetProber):
|
||||||
SAMPLE_SIZE = 64
|
SAMPLE_SIZE = 64
|
||||||
SB_ENOUGH_REL_THRESHOLD = 1024
|
SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
|
||||||
POSITIVE_SHORTCUT_THRESHOLD = 0.95
|
POSITIVE_SHORTCUT_THRESHOLD = 0.95
|
||||||
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
|
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
|
||||||
SYMBOL_CAT_ORDER = 250
|
|
||||||
NUMBER_OF_SEQ_CAT = 4
|
|
||||||
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
|
|
||||||
|
|
||||||
def __init__(self, model, reversed=False, name_prober=None):
|
def __init__(self, model, reversed=False, name_prober=None):
|
||||||
super(SingleByteCharSetProber, self).__init__()
|
super(SingleByteCharSetProber, self).__init__()
|
||||||
|
@ -58,7 +54,7 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
super(SingleByteCharSetProber, self).reset()
|
super(SingleByteCharSetProber, self).reset()
|
||||||
# char order of last character
|
# char order of last character
|
||||||
self._last_order = 255
|
self._last_order = 255
|
||||||
self._seq_counters = [0] * self.NUMBER_OF_SEQ_CAT
|
self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
|
||||||
self._total_seqs = 0
|
self._total_seqs = 0
|
||||||
self._total_char = 0
|
self._total_char = 0
|
||||||
# characters that fall in our sampling range
|
# characters that fall in our sampling range
|
||||||
|
@ -71,15 +67,29 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
else:
|
else:
|
||||||
return self._model['charset_name']
|
return self._model['charset_name']
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
if self._name_prober:
|
||||||
|
return self._name_prober.language
|
||||||
|
else:
|
||||||
|
return self._model.get('language')
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str):
|
||||||
if not self._model['keep_english_letter']:
|
if not self._model['keep_english_letter']:
|
||||||
byte_str = self.filter_international_words(byte_str)
|
byte_str = self.filter_international_words(byte_str)
|
||||||
num_bytes = len(byte_str)
|
if not byte_str:
|
||||||
if not num_bytes:
|
|
||||||
return self.state
|
return self.state
|
||||||
for c in byte_str:
|
char_to_order_map = self._model['char_to_order_map']
|
||||||
order = self._model['char_to_order_map'][wrap_ord(c)]
|
for i, c in enumerate(byte_str):
|
||||||
if order < self.SYMBOL_CAT_ORDER:
|
# XXX: Order is in range 1-64, so one would think we want 0-63 here,
|
||||||
|
# but that leads to 27 more test failures than before.
|
||||||
|
order = char_to_order_map[c]
|
||||||
|
# XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
|
||||||
|
# CharacterCategory.SYMBOL is actually 253, so we use CONTROL
|
||||||
|
# to make it closer to the original intent. The only difference
|
||||||
|
# is whether or not we count digits and control characters for
|
||||||
|
# _total_char purposes.
|
||||||
|
if order < CharacterCategory.CONTROL:
|
||||||
self._total_char += 1
|
self._total_char += 1
|
||||||
if order < self.SAMPLE_SIZE:
|
if order < self.SAMPLE_SIZE:
|
||||||
self._freq_char += 1
|
self._freq_char += 1
|
||||||
|
@ -94,27 +104,28 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
self._seq_counters[model] += 1
|
self._seq_counters[model] += 1
|
||||||
self._last_order = order
|
self._last_order = order
|
||||||
|
|
||||||
if self.state == ProbingState.detecting:
|
charset_name = self._model['charset_name']
|
||||||
|
if self.state == ProbingState.DETECTING:
|
||||||
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
|
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
|
||||||
cf = self.get_confidence()
|
confidence = self.get_confidence()
|
||||||
if cf > self.POSITIVE_SHORTCUT_THRESHOLD:
|
if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
|
||||||
self.logger.debug('%s confidence = %s, we have a winner',
|
self.logger.debug('%s confidence = %s, we have a winner',
|
||||||
self._model['charset_name'], cf)
|
charset_name, confidence)
|
||||||
self._state = ProbingState.found_it
|
self._state = ProbingState.FOUND_IT
|
||||||
elif cf < self.NEGATIVE_SHORTCUT_THRESHOLD:
|
elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
|
||||||
self.logger.debug('%s confidence = %s, below negative '
|
self.logger.debug('%s confidence = %s, below negative '
|
||||||
'shortcut threshold %s',
|
'shortcut threshhold %s', charset_name,
|
||||||
self._model['charset_name'], cf,
|
confidence,
|
||||||
self.NEGATIVE_SHORTCUT_THRESHOLD)
|
self.NEGATIVE_SHORTCUT_THRESHOLD)
|
||||||
self._state = ProbingState.not_me
|
self._state = ProbingState.NOT_ME
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self):
|
||||||
r = 0.01
|
r = 0.01
|
||||||
if self._total_seqs > 0:
|
if self._total_seqs > 0:
|
||||||
r = ((1.0 * self._seq_counters[self.POSITIVE_CAT]) / self._total_seqs
|
r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
|
||||||
/ self._model['typical_positive_ratio'])
|
self._total_seqs / self._model['typical_positive_ratio'])
|
||||||
r = r * self._freq_char / self._total_char
|
r = r * self._freq_char / self._total_char
|
||||||
if r >= 1.0:
|
if r >= 1.0:
|
||||||
r = 0.99
|
r = 0.99
|
||||||
|
|
|
@ -49,36 +49,40 @@ class SJISProber(MultiByteCharSetProber):
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
return self.context_analyzer.charset_name
|
return self.context_analyzer.charset_name
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
return "Japanese"
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str):
|
||||||
for i in range(len(byte_str)):
|
for i in range(len(byte_str)):
|
||||||
coding_state = self.coding_sm.next_state(byte_str[i])
|
coding_state = self.coding_sm.next_state(byte_str[i])
|
||||||
if coding_state == MachineState.error:
|
if coding_state == MachineState.ERROR:
|
||||||
self.logger.debug('%s prober hit error at byte %s',
|
self.logger.debug('%s %s prober hit error at byte %s',
|
||||||
self.charset_name, i)
|
self.charset_name, self.language, i)
|
||||||
self._state = ProbingState.not_me
|
self._state = ProbingState.NOT_ME
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.its_me:
|
elif coding_state == MachineState.ITS_ME:
|
||||||
self._state = ProbingState.found_it
|
self._state = ProbingState.FOUND_IT
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.start:
|
elif coding_state == MachineState.START:
|
||||||
char_len = self.coding_sm.get_current_charlen()
|
char_len = self.coding_sm.get_current_charlen()
|
||||||
if i == 0:
|
if i == 0:
|
||||||
self._last_char[1] = byte_str[0]
|
self._last_char[1] = byte_str[0]
|
||||||
self.context_analyzer.feed(self._last_char[2 - char_len:],
|
self.context_analyzer.feed(self._last_char[2 - char_len:],
|
||||||
char_len)
|
char_len)
|
||||||
self.distribution_analyzer.feed(self._last_char, char_len)
|
self.distribution_analyzer.feed(self._last_char, char_len)
|
||||||
else:
|
else:
|
||||||
self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3
|
self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3
|
||||||
- char_len], char_len)
|
- char_len], char_len)
|
||||||
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
||||||
char_len)
|
char_len)
|
||||||
|
|
||||||
self._last_char[0] = byte_str[-1]
|
self._last_char[0] = byte_str[-1]
|
||||||
|
|
||||||
if self.state == ProbingState.detecting:
|
if self.state == ProbingState.DETECTING:
|
||||||
if (self.context_analyzer.got_enough_data() and
|
if (self.context_analyzer.got_enough_data() and
|
||||||
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
||||||
self._state = ProbingState.found_it
|
self._state = ProbingState.FOUND_IT
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
|
|
|
@ -40,6 +40,7 @@ import codecs
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from .charsetgroupprober import CharSetGroupProber
|
||||||
from .enums import InputState, LanguageFilter, ProbingState
|
from .enums import InputState, LanguageFilter, ProbingState
|
||||||
from .escprober import EscCharSetProber
|
from .escprober import EscCharSetProber
|
||||||
from .latin1prober import Latin1Prober
|
from .latin1prober import Latin1Prober
|
||||||
|
@ -67,8 +68,17 @@ class UniversalDetector(object):
|
||||||
MINIMUM_THRESHOLD = 0.20
|
MINIMUM_THRESHOLD = 0.20
|
||||||
HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
|
HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
|
||||||
ESC_DETECTOR = re.compile(b'(\033|~{)')
|
ESC_DETECTOR = re.compile(b'(\033|~{)')
|
||||||
|
WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
|
||||||
|
ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
|
||||||
|
'iso-8859-2': 'Windows-1250',
|
||||||
|
'iso-8859-5': 'Windows-1251',
|
||||||
|
'iso-8859-6': 'Windows-1256',
|
||||||
|
'iso-8859-7': 'Windows-1253',
|
||||||
|
'iso-8859-8': 'Windows-1255',
|
||||||
|
'iso-8859-9': 'Windows-1254',
|
||||||
|
'iso-8859-13': 'Windows-1257'}
|
||||||
|
|
||||||
def __init__(self, lang_filter=LanguageFilter.all):
|
def __init__(self, lang_filter=LanguageFilter.ALL):
|
||||||
self._esc_charset_prober = None
|
self._esc_charset_prober = None
|
||||||
self._charset_probers = []
|
self._charset_probers = []
|
||||||
self.result = None
|
self.result = None
|
||||||
|
@ -78,6 +88,7 @@ class UniversalDetector(object):
|
||||||
self._last_char = None
|
self._last_char = None
|
||||||
self.lang_filter = lang_filter
|
self.lang_filter = lang_filter
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
|
self._has_win_bytes = None
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
|
@ -86,10 +97,11 @@ class UniversalDetector(object):
|
||||||
initial states. This is called by ``__init__``, so you only need to
|
initial states. This is called by ``__init__``, so you only need to
|
||||||
call this directly in between analyses of different documents.
|
call this directly in between analyses of different documents.
|
||||||
"""
|
"""
|
||||||
self.result = {'encoding': None, 'confidence': 0.0}
|
self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
|
||||||
self.done = False
|
self.done = False
|
||||||
self._got_data = False
|
self._got_data = False
|
||||||
self._input_state = InputState.pure_ascii
|
self._has_win_bytes = False
|
||||||
|
self._input_state = InputState.PURE_ASCII
|
||||||
self._last_char = b''
|
self._last_char = b''
|
||||||
if self._esc_charset_prober:
|
if self._esc_charset_prober:
|
||||||
self._esc_charset_prober.reset()
|
self._esc_charset_prober.reset()
|
||||||
|
@ -116,28 +128,40 @@ class UniversalDetector(object):
|
||||||
if not len(byte_str):
|
if not len(byte_str):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if not isinstance(byte_str, bytearray):
|
||||||
|
byte_str = bytearray(byte_str)
|
||||||
|
|
||||||
# First check for known BOMs, since these are guaranteed to be correct
|
# First check for known BOMs, since these are guaranteed to be correct
|
||||||
if not self._got_data:
|
if not self._got_data:
|
||||||
# If the data starts with BOM, we know it is UTF
|
# If the data starts with BOM, we know it is UTF
|
||||||
if byte_str.startswith(codecs.BOM_UTF8):
|
if byte_str.startswith(codecs.BOM_UTF8):
|
||||||
# EF BB BF UTF-8 with BOM
|
# EF BB BF UTF-8 with BOM
|
||||||
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
|
self.result = {'encoding': "UTF-8-SIG",
|
||||||
elif byte_str.startswith(codecs.BOM_UTF32_LE) or byte_str.startswith(codecs.BOM_UTF32_BE):
|
'confidence': 1.0,
|
||||||
|
'language': ''}
|
||||||
|
elif byte_str.startswith((codecs.BOM_UTF32_LE,
|
||||||
|
codecs.BOM_UTF32_BE)):
|
||||||
# FF FE 00 00 UTF-32, little-endian BOM
|
# FF FE 00 00 UTF-32, little-endian BOM
|
||||||
# 00 00 FE FF UTF-32, big-endian BOM
|
# 00 00 FE FF UTF-32, big-endian BOM
|
||||||
self.result = {'encoding': "UTF-32", 'confidence': 1.0}
|
self.result = {'encoding': "UTF-32",
|
||||||
|
'confidence': 1.0,
|
||||||
|
'language': ''}
|
||||||
elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
|
elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
|
||||||
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||||
self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
|
self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
|
||||||
'confidence': 1.0}
|
'confidence': 1.0,
|
||||||
|
'language': ''}
|
||||||
elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
|
elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
|
||||||
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||||
self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
|
self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
|
||||||
'confidence': 1.0}
|
'confidence': 1.0,
|
||||||
elif byte_str.startswith(codecs.BOM_LE) or byte_str.startswith(codecs.BOM_BE):
|
'language': ''}
|
||||||
|
elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
|
||||||
# FF FE UTF-16, little endian BOM
|
# FF FE UTF-16, little endian BOM
|
||||||
# FE FF UTF-16, big endian BOM
|
# FE FF UTF-16, big endian BOM
|
||||||
self.result = {'encoding': "UTF-16", 'confidence': 1.0}
|
self.result = {'encoding': "UTF-16",
|
||||||
|
'confidence': 1.0,
|
||||||
|
'language': ''}
|
||||||
|
|
||||||
self._got_data = True
|
self._got_data = True
|
||||||
if self.result['encoding'] is not None:
|
if self.result['encoding'] is not None:
|
||||||
|
@ -146,12 +170,12 @@ class UniversalDetector(object):
|
||||||
|
|
||||||
# If none of those matched and we've only see ASCII so far, check
|
# If none of those matched and we've only see ASCII so far, check
|
||||||
# for high bytes and escape sequences
|
# for high bytes and escape sequences
|
||||||
if self._input_state == InputState.pure_ascii:
|
if self._input_state == InputState.PURE_ASCII:
|
||||||
if self.HIGH_BYTE_DETECTOR.search(byte_str):
|
if self.HIGH_BYTE_DETECTOR.search(byte_str):
|
||||||
self._input_state = InputState.high_byte
|
self._input_state = InputState.HIGH_BYTE
|
||||||
elif self._input_state == InputState.pure_ascii and \
|
elif self._input_state == InputState.PURE_ASCII and \
|
||||||
self.ESC_DETECTOR.search(self._last_char + byte_str):
|
self.ESC_DETECTOR.search(self._last_char + byte_str):
|
||||||
self._input_state = InputState.esc_ascii
|
self._input_state = InputState.ESC_ASCII
|
||||||
|
|
||||||
self._last_char = byte_str[-1:]
|
self._last_char = byte_str[-1:]
|
||||||
|
|
||||||
|
@ -159,14 +183,16 @@ class UniversalDetector(object):
|
||||||
# uses a simple state machine to check for known escape sequences in
|
# uses a simple state machine to check for known escape sequences in
|
||||||
# HZ and ISO-2022 encodings, since those are the only encodings that
|
# HZ and ISO-2022 encodings, since those are the only encodings that
|
||||||
# use such sequences.
|
# use such sequences.
|
||||||
if self._input_state == InputState.esc_ascii:
|
if self._input_state == InputState.ESC_ASCII:
|
||||||
if not self._esc_charset_prober:
|
if not self._esc_charset_prober:
|
||||||
self._esc_charset_prober = EscCharSetProber(self.lang_filter)
|
self._esc_charset_prober = EscCharSetProber(self.lang_filter)
|
||||||
if self._esc_charset_prober.feed(byte_str) == ProbingState.found_it:
|
if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
|
||||||
self.result = {'encoding':
|
self.result = {'encoding':
|
||||||
self._esc_charset_prober.charset_name,
|
self._esc_charset_prober.charset_name,
|
||||||
'confidence':
|
'confidence':
|
||||||
self._esc_charset_prober.get_confidence()}
|
self._esc_charset_prober.get_confidence(),
|
||||||
|
'language':
|
||||||
|
self._esc_charset_prober.language}
|
||||||
self.done = True
|
self.done = True
|
||||||
# If we've seen high bytes (i.e., those with values greater than 127),
|
# If we've seen high bytes (i.e., those with values greater than 127),
|
||||||
# we need to do more complicated checks using all our multi-byte and
|
# we need to do more complicated checks using all our multi-byte and
|
||||||
|
@ -174,59 +200,87 @@ class UniversalDetector(object):
|
||||||
# use character bigram distributions to determine the encoding, whereas
|
# use character bigram distributions to determine the encoding, whereas
|
||||||
# the multi-byte probers use a combination of character unigram and
|
# the multi-byte probers use a combination of character unigram and
|
||||||
# bigram distributions.
|
# bigram distributions.
|
||||||
elif self._input_state == InputState.high_byte:
|
elif self._input_state == InputState.HIGH_BYTE:
|
||||||
if not self._charset_probers:
|
if not self._charset_probers:
|
||||||
self._charset_probers = [MBCSGroupProber(self.lang_filter)]
|
self._charset_probers = [MBCSGroupProber(self.lang_filter)]
|
||||||
# If we're checking non-CJK encodings, use single-byte prober
|
# If we're checking non-CJK encodings, use single-byte prober
|
||||||
if self.lang_filter & LanguageFilter.non_cjk:
|
if self.lang_filter & LanguageFilter.NON_CJK:
|
||||||
self._charset_probers.append(SBCSGroupProber())
|
self._charset_probers.append(SBCSGroupProber())
|
||||||
self._charset_probers.append(Latin1Prober())
|
self._charset_probers.append(Latin1Prober())
|
||||||
for prober in self._charset_probers:
|
for prober in self._charset_probers:
|
||||||
if prober.feed(byte_str) == ProbingState.found_it:
|
if prober.feed(byte_str) == ProbingState.FOUND_IT:
|
||||||
self.result = {'encoding': prober.charset_name,
|
self.result = {'encoding': prober.charset_name,
|
||||||
'confidence': prober.get_confidence()}
|
'confidence': prober.get_confidence(),
|
||||||
|
'language': prober.language}
|
||||||
self.done = True
|
self.done = True
|
||||||
break
|
break
|
||||||
|
if self.WIN_BYTE_DETECTOR.search(byte_str):
|
||||||
|
self._has_win_bytes = True
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
"""
|
"""
|
||||||
Stop analyzing the current document and come up with a final
|
Stop analyzing the current document and come up with a final
|
||||||
prediction.
|
prediction.
|
||||||
|
|
||||||
:returns: The ``result`` attribute if a prediction was made, otherwise
|
:returns: The ``result`` attribute, a ``dict`` with the keys
|
||||||
``None``.
|
`encoding`, `confidence`, and `language`.
|
||||||
"""
|
"""
|
||||||
|
# Don't bother with checks if we're already done
|
||||||
if self.done:
|
if self.done:
|
||||||
return self.result
|
return self.result
|
||||||
if not self._got_data:
|
|
||||||
self.logger.debug('no data received!')
|
|
||||||
return
|
|
||||||
self.done = True
|
self.done = True
|
||||||
|
|
||||||
if self._input_state in (InputState.pure_ascii, InputState.esc_ascii):
|
if not self._got_data:
|
||||||
self.result = {'encoding': 'ascii', 'confidence': 1.0}
|
self.logger.debug('no data received!')
|
||||||
return self.result
|
|
||||||
|
|
||||||
if self._input_state == InputState.high_byte:
|
# Default to ASCII if it is all we've seen so far
|
||||||
proberConfidence = None
|
elif self._input_state == InputState.PURE_ASCII:
|
||||||
|
self.result = {'encoding': 'ascii',
|
||||||
|
'confidence': 1.0,
|
||||||
|
'language': ''}
|
||||||
|
|
||||||
|
# If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
|
||||||
|
elif self._input_state == InputState.HIGH_BYTE:
|
||||||
|
prober_confidence = None
|
||||||
max_prober_confidence = 0.0
|
max_prober_confidence = 0.0
|
||||||
max_prober = None
|
max_prober = None
|
||||||
for prober in self._charset_probers:
|
for prober in self._charset_probers:
|
||||||
if not prober:
|
if not prober:
|
||||||
continue
|
continue
|
||||||
proberConfidence = prober.get_confidence()
|
prober_confidence = prober.get_confidence()
|
||||||
if proberConfidence > max_prober_confidence:
|
if prober_confidence > max_prober_confidence:
|
||||||
max_prober_confidence = proberConfidence
|
max_prober_confidence = prober_confidence
|
||||||
max_prober = prober
|
max_prober = prober
|
||||||
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
|
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
|
||||||
self.result = {'encoding': max_prober.charset_name,
|
charset_name = max_prober.charset_name
|
||||||
'confidence': max_prober.get_confidence()}
|
lower_charset_name = max_prober.charset_name.lower()
|
||||||
return self.result
|
confidence = max_prober.get_confidence()
|
||||||
|
# Use Windows encoding name instead of ISO-8859 if we saw any
|
||||||
|
# extra Windows-specific bytes
|
||||||
|
if lower_charset_name.startswith('iso-8859'):
|
||||||
|
if self._has_win_bytes:
|
||||||
|
charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
|
||||||
|
charset_name)
|
||||||
|
self.result = {'encoding': charset_name,
|
||||||
|
'confidence': confidence,
|
||||||
|
'language': max_prober.language}
|
||||||
|
|
||||||
|
# Log all prober confidences if none met MINIMUM_THRESHOLD
|
||||||
if self.logger.getEffectiveLevel() == logging.DEBUG:
|
if self.logger.getEffectiveLevel() == logging.DEBUG:
|
||||||
self.logger.debug('no probers hit minimum threshold')
|
if self.result['encoding'] is None:
|
||||||
for prober in self._charset_probers[0].probers:
|
self.logger.debug('no probers hit minimum threshold')
|
||||||
if not prober:
|
for group_prober in self._charset_probers:
|
||||||
continue
|
if not group_prober:
|
||||||
self.logger.debug('%s confidence = %s', prober.charset_name,
|
continue
|
||||||
prober.get_confidence())
|
if isinstance(group_prober, CharSetGroupProber):
|
||||||
|
for prober in group_prober.probers:
|
||||||
|
self.logger.debug('%s %s confidence = %s',
|
||||||
|
prober.charset_name,
|
||||||
|
prober.language,
|
||||||
|
prober.get_confidence())
|
||||||
|
else:
|
||||||
|
self.logger.debug('%s %s confidence = %s',
|
||||||
|
prober.charset_name,
|
||||||
|
prober.language,
|
||||||
|
prober.get_confidence())
|
||||||
|
return self.result
|
||||||
|
|
|
@ -50,22 +50,26 @@ class UTF8Prober(CharSetProber):
|
||||||
def charset_name(self):
|
def charset_name(self):
|
||||||
return "utf-8"
|
return "utf-8"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self):
|
||||||
|
return ""
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str):
|
||||||
for c in byte_str:
|
for c in byte_str:
|
||||||
coding_state = self.coding_sm.next_state(c)
|
coding_state = self.coding_sm.next_state(c)
|
||||||
if coding_state == MachineState.error:
|
if coding_state == MachineState.ERROR:
|
||||||
self._state = ProbingState.not_me
|
self._state = ProbingState.NOT_ME
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.its_me:
|
elif coding_state == MachineState.ITS_ME:
|
||||||
self._state = ProbingState.found_it
|
self._state = ProbingState.FOUND_IT
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.start:
|
elif coding_state == MachineState.START:
|
||||||
if self.coding_sm.get_current_charlen() >= 2:
|
if self.coding_sm.get_current_charlen() >= 2:
|
||||||
self._num_mb_chars += 1
|
self._num_mb_chars += 1
|
||||||
|
|
||||||
if self.state == ProbingState.detecting:
|
if self.state == ProbingState.DETECTING:
|
||||||
if self.get_confidence() > self.SHORTCUT_THRESHOLD:
|
if self.get_confidence() > self.SHORTCUT_THRESHOLD:
|
||||||
self._state = ProbingState.found_it
|
self._state = ProbingState.FOUND_IT
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
This module exists only to simplify retrieving the version number of chardet
|
This module exists only to simplify retrieving the version number of chardet
|
||||||
from within setup.py and from chardet subpackages.
|
from within setup.py and from chardet subpackages.
|
||||||
|
|
||||||
:author: Dan Blanchard (dblanchard@ets.org)
|
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__version__ = "2.3.0"
|
__version__ = "3.0.4"
|
||||||
VERSION = __version__.split('.')
|
VERSION = __version__.split('.')
|
||||||
|
|
Loading…
Reference in a new issue