diff --git a/CHANGES.md b/CHANGES.md index 6b67dc8c..0af4d142 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -13,6 +13,7 @@ * Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439) * Update cachecontrol library 0.11.5 to 0.12.3 (db54c40) * Update Certifi 2015.11.20.1 (385476b) to 2017.07.27 (f808089) +* Update chardet packages 2.3.0 (d7fae98) to 3.0.4 (9b8c5c2) * Update dateutil library 2.4.2 (d4baf97) to 2.6.1 (2f3a160) * Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb) * Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72) diff --git a/lib/chardet/__init__.py b/lib/chardet/__init__.py index 25242b7f..45bf7e68 100644 --- a/lib/chardet/__init__.py +++ b/lib/chardet/__init__.py @@ -16,17 +16,24 @@ ######################### END LICENSE BLOCK ######################### -from .compat import PY2, PY3, bin_type as _bin_type +from .compat import PY2, PY3 from .universaldetector import UniversalDetector from .version import __version__, VERSION def detect(byte_str): - if not isinstance(byte_str, _bin_type): - raise TypeError('Expected object of {0} type, got: {1}' - ''.format(_bin_type, type(byte_str))) + """ + Detect the encoding of the given byte string. - u = UniversalDetector() - u.feed(byte_str) - u.close() - return u.result + :param byte_str: The byte sequence to examine. 
+ :type byte_str: ``bytes`` or ``bytearray`` + """ + if not isinstance(byte_str, bytearray): + if not isinstance(byte_str, bytes): + raise TypeError('Expected object of type bytes or bytearray, got: ' + '{0}'.format(type(byte_str))) + else: + byte_str = bytearray(byte_str) + detector = UniversalDetector() + detector.feed(byte_str) + return detector.close() diff --git a/lib/chardet/big5prober.py b/lib/chardet/big5prober.py index df1a8ec6..5b1227a5 100644 --- a/lib/chardet/big5prober.py +++ b/lib/chardet/big5prober.py @@ -41,3 +41,7 @@ class Big5Prober(MultiByteCharSetProber): @property def charset_name(self): return "Big5" + + @property + def language(self): + return "Chinese" diff --git a/lib/chardet/chardistribution.py b/lib/chardet/chardistribution.py index 874184cd..e5509a01 100644 --- a/lib/chardet/chardistribution.py +++ b/lib/chardet/chardistribution.py @@ -35,7 +35,6 @@ from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO) from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO) -from .compat import wrap_ord class CharDistributionAnalysis(object): @@ -123,9 +122,9 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis): # first byte range: 0xc4 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. State machine has done that - first_char = wrap_ord(byte_str[0]) + first_char = byte_str[0] if first_char >= 0xC4: - return 94 * (first_char - 0xC4) + wrap_ord(byte_str[1]) - 0xA1 + return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1 else: return -1 @@ -142,9 +141,9 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis): # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. 
State machine has done that - first_char = wrap_ord(byte_str[0]) + first_char = byte_str[0] if first_char >= 0xB0: - return 94 * (first_char - 0xB0) + wrap_ord(byte_str[1]) - 0xA1 + return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1 else: return -1 @@ -161,7 +160,7 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis): # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. State machine has done that - first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1]) + first_char, second_char = byte_str[0], byte_str[1] if (first_char >= 0xB0) and (second_char >= 0xA1): return 94 * (first_char - 0xB0) + second_char - 0xA1 else: @@ -180,7 +179,7 @@ class Big5DistributionAnalysis(CharDistributionAnalysis): # first byte range: 0xa4 -- 0xfe # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe # no validation needed here. State machine has done that - first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1]) + first_char, second_char = byte_str[0], byte_str[1] if first_char >= 0xA4: if second_char >= 0xA1: return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63 @@ -202,7 +201,7 @@ class SJISDistributionAnalysis(CharDistributionAnalysis): # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe # no validation needed here. State machine has done that - first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1]) + first_char, second_char = byte_str[0], byte_str[1] if (first_char >= 0x81) and (first_char <= 0x9F): order = 188 * (first_char - 0x81) elif (first_char >= 0xE0) and (first_char <= 0xEF): @@ -227,8 +226,8 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis): # first byte range: 0xa0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. 
State machine has done that - char = wrap_ord(byte_str[0]) + char = byte_str[0] if char >= 0xA0: - return 94 * (char - 0xA1) + wrap_ord(byte_str[1]) - 0xa1 + return 94 * (char - 0xA1) + byte_str[1] - 0xa1 else: return -1 diff --git a/lib/chardet/charsetgroupprober.py b/lib/chardet/charsetgroupprober.py index 0ef04439..1720ddc9 100644 --- a/lib/chardet/charsetgroupprober.py +++ b/lib/chardet/charsetgroupprober.py @@ -54,6 +54,14 @@ class CharSetGroupProber(CharSetProber): return None return self._best_guess_prober.charset_name + @property + def language(self): + if not self._best_guess_prober: + self.get_confidence() + if not self._best_guess_prober: + return None + return self._best_guess_prober.language + def feed(self, byte_str): for prober in self.probers: if not prober: @@ -63,22 +71,22 @@ class CharSetGroupProber(CharSetProber): state = prober.feed(byte_str) if not state: continue - if state == ProbingState.found_it: + if state == ProbingState.FOUND_IT: self._best_guess_prober = prober return self.state - elif state == ProbingState.not_me: + elif state == ProbingState.NOT_ME: prober.active = False self._active_num -= 1 if self._active_num <= 0: - self._state = ProbingState.not_me + self._state = ProbingState.NOT_ME return self.state return self.state def get_confidence(self): state = self.state - if state == ProbingState.found_it: + if state == ProbingState.FOUND_IT: return 0.99 - elif state == ProbingState.not_me: + elif state == ProbingState.NOT_ME: return 0.01 best_conf = 0.0 self._best_guess_prober = None @@ -89,7 +97,7 @@ class CharSetGroupProber(CharSetProber): self.logger.debug('%s not active', prober.charset_name) continue conf = prober.get_confidence() - self.logger.debug('%s confidence = %s', prober.charset_name, conf) + self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf) if best_conf < conf: best_conf = conf self._best_guess_prober = prober diff --git a/lib/chardet/charsetprober.py b/lib/chardet/charsetprober.py 
index 92dc57a1..1fc27464 100644 --- a/lib/chardet/charsetprober.py +++ b/lib/chardet/charsetprober.py @@ -42,7 +42,7 @@ class CharSetProber(object): self.logger = logging.getLogger(__name__) def reset(self): - self._state = ProbingState.detecting + self._state = ProbingState.DETECTING @property def charset_name(self): diff --git a/lib/chardet/cli/chardetect.py b/lib/chardet/cli/chardetect.py index 3eeeca85..e5f86fe9 100644 --- a/lib/chardet/cli/chardetect.py +++ b/lib/chardet/cli/chardetect.py @@ -17,15 +17,12 @@ from __future__ import absolute_import, print_function, unicode_literals import argparse import sys -from io import open from chardet import __version__ from chardet.compat import PY2 from chardet.universaldetector import UniversalDetector - - def description_of(lines, name='stdin'): """ Return a string describing the probable encoding of a file or @@ -38,7 +35,11 @@ def description_of(lines, name='stdin'): """ u = UniversalDetector() for line in lines: + line = bytearray(line) u.feed(line) + # shortcut out of the loop to save reading further - particularly useful if we read a BOM. 
+ if u.done: + break u.close() result = u.result if PY2: diff --git a/lib/chardet/codingstatemachine.py b/lib/chardet/codingstatemachine.py index 4fa5bba7..c562e1dc 100644 --- a/lib/chardet/codingstatemachine.py +++ b/lib/chardet/codingstatemachine.py @@ -28,7 +28,6 @@ import logging from .enums import MachineState -from .compat import wrap_ord class CodingStateMachine(object): @@ -62,13 +61,13 @@ class CodingStateMachine(object): self.reset() def reset(self): - self._curr_state = MachineState.start + self._curr_state = MachineState.START def next_state(self, c): # for each byte we get its class # if it is first byte, we also get byte length - byte_class = self._model['class_table'][wrap_ord(c)] - if self._curr_state == MachineState.start: + byte_class = self._model['class_table'][c] + if self._curr_state == MachineState.START: self._curr_byte_pos = 0 self._curr_char_len = self._model['char_len_table'][byte_class] # from byte's class and state_table, we get its next state @@ -83,3 +82,7 @@ class CodingStateMachine(object): def get_coding_state_machine(self): return self._model['name'] + + @property + def language(self): + return self._model['language'] diff --git a/lib/chardet/compat.py b/lib/chardet/compat.py index 01598f66..fa100a32 100644 --- a/lib/chardet/compat.py +++ b/lib/chardet/compat.py @@ -27,17 +27,8 @@ if sys.version_info < (3, 0): PY3 = False base_str = (str, unicode) text_type = unicode - bin_type = str else: PY2 = False PY3 = True base_str = (bytes, str) text_type = str - bin_type = (bytes, bytearray) - - -def wrap_ord(a): - if PY2 and isinstance(a, base_str): - return ord(a) - else: - return a diff --git a/lib/chardet/cp949prober.py b/lib/chardet/cp949prober.py index aa0e4462..de0ceab0 100644 --- a/lib/chardet/cp949prober.py +++ b/lib/chardet/cp949prober.py @@ -43,3 +43,7 @@ class CP949Prober(MultiByteCharSetProber): @property def charset_name(self): return "CP949" + + @property + def language(self): + return "Korean" diff --git 
a/lib/chardet/enums.py b/lib/chardet/enums.py index f1fe20e8..c8e60013 100644 --- a/lib/chardet/enums.py +++ b/lib/chardet/enums.py @@ -9,9 +9,9 @@ class InputState(object): """ This enum represents the different states a universal detector can be in. """ - pure_ascii = 0 - esc_ascii = 1 - high_byte = 2 + PURE_ASCII = 0 + ESC_ASCII = 1 + HIGH_BYTE = 2 class LanguageFilter(object): @@ -19,29 +19,58 @@ class LanguageFilter(object): This enum represents the different language filters we can apply to a ``UniversalDetector``. """ - chinese_simplified = 0x01 - chinese_traditional = 0x02 - japanese = 0x04 - korean = 0x08 - non_cjk = 0x10 - all = 0x1F - chinese = chinese_simplified | chinese_traditional - cjk = chinese | japanese | korean + CHINESE_SIMPLIFIED = 0x01 + CHINESE_TRADITIONAL = 0x02 + JAPANESE = 0x04 + KOREAN = 0x08 + NON_CJK = 0x10 + ALL = 0x1F + CHINESE = CHINESE_SIMPLIFIED | CHINESE_TRADITIONAL + CJK = CHINESE | JAPANESE | KOREAN class ProbingState(object): """ This enum represents the different states a prober can be in. """ - detecting = 0 - found_it = 1 - not_me = 2 + DETECTING = 0 + FOUND_IT = 1 + NOT_ME = 2 class MachineState(object): """ This enum represents the different states a state machine can be in. """ - start = 0 - error = 1 - its_me = 2 + START = 0 + ERROR = 1 + ITS_ME = 2 + + +class SequenceLikelihood(object): + """ + This enum represents the likelihood of a character following the previous one. + """ + NEGATIVE = 0 + UNLIKELY = 1 + LIKELY = 2 + POSITIVE = 3 + + @classmethod + def get_num_categories(cls): + """:returns: The number of likelihood categories in the enum.""" + return 4 + + +class CharacterCategory(object): + """ + This enum represents the different categories language models for + ``SingleByteCharsetProber`` put characters into. + + Anything less than CONTROL is considered a letter. 
+ """ + UNDEFINED = 255 + LINE_BREAK = 254 + SYMBOL = 253 + DIGIT = 252 + CONTROL = 251 diff --git a/lib/chardet/escprober.py b/lib/chardet/escprober.py index 54fa98b6..c52060d0 100644 --- a/lib/chardet/escprober.py +++ b/lib/chardet/escprober.py @@ -27,7 +27,6 @@ from .charsetprober import CharSetProber from .codingstatemachine import CodingStateMachine -from .compat import wrap_ord from .enums import LanguageFilter, ProbingState, MachineState from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL, ISO2022KR_SM_MODEL) @@ -43,15 +42,16 @@ class EscCharSetProber(CharSetProber): def __init__(self, lang_filter=None): super(EscCharSetProber, self).__init__(lang_filter=lang_filter) self.coding_sm = [] - if self.lang_filter & LanguageFilter.chinese_simplified: + if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED: self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL)) self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL)) - if self.lang_filter & LanguageFilter.japanese: + if self.lang_filter & LanguageFilter.JAPANESE: self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL)) - if self.lang_filter & LanguageFilter.korean: + if self.lang_filter & LanguageFilter.KOREAN: self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL)) self.active_sm_count = None self._detected_charset = None + self._detected_language = None self._state = None self.reset() @@ -64,11 +64,16 @@ class EscCharSetProber(CharSetProber): coding_sm.reset() self.active_sm_count = len(self.coding_sm) self._detected_charset = None + self._detected_language = None @property def charset_name(self): return self._detected_charset + @property + def language(self): + return self._detected_language + def get_confidence(self): if self._detected_charset: return 0.99 @@ -80,16 +85,17 @@ class EscCharSetProber(CharSetProber): for coding_sm in self.coding_sm: if not coding_sm or not coding_sm.active: continue - coding_state = coding_sm.next_state(wrap_ord(c)) - if coding_state == 
MachineState.error: + coding_state = coding_sm.next_state(c) + if coding_state == MachineState.ERROR: coding_sm.active = False self.active_sm_count -= 1 if self.active_sm_count <= 0: - self._state = ProbingState.not_me + self._state = ProbingState.NOT_ME return self.state - elif coding_state == MachineState.its_me: - self._state = ProbingState.found_it + elif coding_state == MachineState.ITS_ME: + self._state = ProbingState.FOUND_IT self._detected_charset = coding_sm.get_coding_state_machine() + self._detected_language = coding_sm.language return self.state return self.state diff --git a/lib/chardet/escsm.py b/lib/chardet/escsm.py index 38107772..b8377045 100644 --- a/lib/chardet/escsm.py +++ b/lib/chardet/escsm.py @@ -63,12 +63,12 @@ HZ_CLS = ( ) HZ_ST = ( -MachineState.start,MachineState.error, 3,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,# 00-07 -MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 08-0f -MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start, 4,MachineState.error,# 10-17 - 5,MachineState.error, 6,MachineState.error, 5, 5, 4,MachineState.error,# 18-1f - 4,MachineState.error, 4, 4, 4,MachineState.error, 4,MachineState.error,# 20-27 - 4,MachineState.its_me,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 28-2f +MachineState.START,MachineState.ERROR, 3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07 +MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f +MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START, 4,MachineState.ERROR,# 10-17 + 
5,MachineState.ERROR, 6,MachineState.ERROR, 5, 5, 4,MachineState.ERROR,# 18-1f + 4,MachineState.ERROR, 4, 4, 4,MachineState.ERROR, 4,MachineState.ERROR,# 20-27 + 4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f ) HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) @@ -77,7 +77,8 @@ HZ_SM_MODEL = {'class_table': HZ_CLS, 'class_factor': 6, 'state_table': HZ_ST, 'char_len_table': HZ_CHAR_LEN_TABLE, - 'name': "HZ-GB-2312"} + 'name': "HZ-GB-2312", + 'language': 'Chinese'} ISO2022CN_CLS = ( 2,0,0,0,0,0,0,0, # 00 - 07 @@ -115,14 +116,14 @@ ISO2022CN_CLS = ( ) ISO2022CN_ST = ( -MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 00-07 -MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 08-0f -MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 10-17 -MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error, 4,MachineState.error,# 18-1f -MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 20-27 - 5, 6,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 28-2f -MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 30-37 -MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.start,# 38-3f +MachineState.START, 
3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07 +MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f +MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17 +MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,# 18-1f +MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27 + 5, 6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f +MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37 +MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f ) ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0) @@ -131,7 +132,8 @@ ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS, 'class_factor': 9, 'state_table': ISO2022CN_ST, 'char_len_table': ISO2022CN_CHAR_LEN_TABLE, - 'name': "ISO-2022-CN"} + 'name': "ISO-2022-CN", + 'language': 'Chinese'} ISO2022JP_CLS = ( 2,0,0,0,0,0,0,0, # 00 - 07 @@ -169,15 +171,15 @@ ISO2022JP_CLS = ( ) ISO2022JP_ST = ( -MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 00-07 -MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 08-0f 
-MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 10-17 -MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,# 18-1f -MachineState.error, 5,MachineState.error,MachineState.error,MachineState.error, 4,MachineState.error,MachineState.error,# 20-27 -MachineState.error,MachineState.error,MachineState.error, 6,MachineState.its_me,MachineState.error,MachineState.its_me,MachineState.error,# 28-2f -MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,# 30-37 -MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 38-3f -MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.start,MachineState.start,# 40-47 +MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07 +MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f +MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17 +MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f +MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 20-27 +MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 
6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f +MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37 +MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f +MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47 ) ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) @@ -186,7 +188,8 @@ ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS, 'class_factor': 10, 'state_table': ISO2022JP_ST, 'char_len_table': ISO2022JP_CHAR_LEN_TABLE, - 'name': "ISO-2022-JP"} + 'name': "ISO-2022-JP", + 'language': 'Japanese'} ISO2022KR_CLS = ( 2,0,0,0,0,0,0,0, # 00 - 07 @@ -224,11 +227,11 @@ ISO2022KR_CLS = ( ) ISO2022KR_ST = ( -MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,# 00-07 -MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 08-0f -MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error, 4,MachineState.error,MachineState.error,# 10-17 -MachineState.error,MachineState.error,MachineState.error,MachineState.error, 5,MachineState.error,MachineState.error,MachineState.error,# 18-1f -MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 20-27 +MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07 
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f +MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 10-17 +MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f +MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27 ) ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) @@ -237,6 +240,7 @@ ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS, 'class_factor': 6, 'state_table': ISO2022KR_ST, 'char_len_table': ISO2022KR_CHAR_LEN_TABLE, - 'name': "ISO-2022-KR"} + 'name': "ISO-2022-KR", + 'language': 'Korean'} diff --git a/lib/chardet/eucjpprober.py b/lib/chardet/eucjpprober.py index 12857832..a81ee1e2 100644 --- a/lib/chardet/eucjpprober.py +++ b/lib/chardet/eucjpprober.py @@ -49,19 +49,23 @@ class EUCJPProber(MultiByteCharSetProber): def charset_name(self): return "EUC-JP" + @property + def language(self): + return "Japanese" + def feed(self, byte_str): for i in range(len(byte_str)): # PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte coding_state = self.coding_sm.next_state(byte_str[i]) - if coding_state == MachineState.error: - self.logger.debug('%s prober hit error at byte %s', - self.charset_name, i) - self._state = ProbingState.not_me + if coding_state == MachineState.ERROR: + self.logger.debug('%s %s prober hit error at byte %s', + self.charset_name, self.language, i) + self._state = ProbingState.NOT_ME break - elif coding_state == MachineState.its_me: - self._state = ProbingState.found_it + elif coding_state == MachineState.ITS_ME: + self._state = ProbingState.FOUND_IT break - elif coding_state == MachineState.start: + elif coding_state == 
MachineState.START: char_len = self.coding_sm.get_current_charlen() if i == 0: self._last_char[1] = byte_str[0] @@ -75,10 +79,10 @@ class EUCJPProber(MultiByteCharSetProber): self._last_char[0] = byte_str[-1] - if self.state == ProbingState.detecting: + if self.state == ProbingState.DETECTING: if (self.context_analyzer.got_enough_data() and (self.get_confidence() > self.SHORTCUT_THRESHOLD)): - self._state = ProbingState.found_it + self._state = ProbingState.FOUND_IT return self.state diff --git a/lib/chardet/euckrprober.py b/lib/chardet/euckrprober.py index 4e4ae23f..99d5b154 100644 --- a/lib/chardet/euckrprober.py +++ b/lib/chardet/euckrprober.py @@ -41,3 +41,7 @@ class EUCKRProber(MultiByteCharSetProber): @property def charset_name(self): return "EUC-KR" + + @property + def language(self): + return "Korean" diff --git a/lib/chardet/euctwfreq.py b/lib/chardet/euctwfreq.py index b3f1b16f..5195275e 100644 --- a/lib/chardet/euctwfreq.py +++ b/lib/chardet/euctwfreq.py @@ -44,7 +44,7 @@ EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75 # Char to FreqOrder table , -EUCTW_TABLE_SIZE = 8102 +EUCTW_TABLE_SIZE = 5376 EUCTW_CHAR_TO_FREQ_ORDER = ( 1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742 diff --git a/lib/chardet/euctwprober.py b/lib/chardet/euctwprober.py index 4ee4b368..7dbc136e 100644 --- a/lib/chardet/euctwprober.py +++ b/lib/chardet/euctwprober.py @@ -40,3 +40,7 @@ class EUCTWProber(MultiByteCharSetProber): @property def charset_name(self): return "EUC-TW" + + @property + def language(self): + return "Taiwan" diff --git a/lib/chardet/gb2312prober.py b/lib/chardet/gb2312prober.py index a2f9055a..7cae6b51 100644 --- a/lib/chardet/gb2312prober.py +++ b/lib/chardet/gb2312prober.py @@ -40,3 +40,7 @@ class GB2312Prober(MultiByteCharSetProber): @property def charset_name(self): return "GB2312" + + @property + def language(self): + return "Chinese" diff --git a/lib/chardet/hebrewprober.py b/lib/chardet/hebrewprober.py index 9e44994f..10b81224 100644 
--- a/lib/chardet/hebrewprober.py +++ b/lib/chardet/hebrewprober.py @@ -27,7 +27,6 @@ from .charsetprober import CharSetProber from .enums import ProbingState -from .compat import wrap_ord # This prober doesn't actually recognize a language or a charset. # It is a helper prober for the use of the Hebrew model probers @@ -177,8 +176,8 @@ class HebrewProber(CharSetProber): self._visual_prober = visualProber def is_final(self, c): - return wrap_ord(c) in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN, - self.FINAL_PE, self.FINAL_TSADI] + return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN, + self.FINAL_PE, self.FINAL_TSADI] def is_non_final(self, c): # The normal Tsadi is not a good Non-Final letter due to words like @@ -191,8 +190,8 @@ class HebrewProber(CharSetProber): # for example legally end with a Non-Final Pe or Kaf. However, the # benefit of these letters as Non-Final letters outweighs the damage # since these words are quite rare. - return wrap_ord(c) in [self.NORMAL_KAF, self.NORMAL_MEM, - self.NORMAL_NUN, self.NORMAL_PE] + return c in [self.NORMAL_KAF, self.NORMAL_MEM, + self.NORMAL_NUN, self.NORMAL_PE] def feed(self, byte_str): # Final letter analysis for logical-visual decision. @@ -221,9 +220,9 @@ class HebrewProber(CharSetProber): # We automatically filter out all 7-bit characters (replace them with # spaces) so the word boundary detection works properly. [MAP] - if self.state == ProbingState.not_me: + if self.state == ProbingState.NOT_ME: # Both model probers say it's not them. No reason to continue. 
- return ProbingState.not_me + return ProbingState.NOT_ME byte_str = self.filter_high_byte_only(byte_str) @@ -250,8 +249,8 @@ class HebrewProber(CharSetProber): self._prev = cur # Forever detecting, till the end or until both model probers return - # ProbingState.not_me (handled above) - return ProbingState.detecting + # ProbingState.NOT_ME (handled above) + return ProbingState.DETECTING @property def charset_name(self): @@ -280,10 +279,14 @@ class HebrewProber(CharSetProber): # Logical. return self.LOGICAL_HEBREW_NAME + @property + def language(self): + return 'Hebrew' + @property def state(self): # Remain active as long as any of the model probers are active. - if (self._logical_prober.state == ProbingState.not_me) and \ - (self._visual_prober.state == ProbingState.not_me): - return ProbingState.not_me - return ProbingState.detecting + if (self._logical_prober.state == ProbingState.NOT_ME) and \ + (self._visual_prober.state == ProbingState.NOT_ME): + return ProbingState.NOT_ME + return ProbingState.DETECTING diff --git a/lib/chardet/jpcntx.py b/lib/chardet/jpcntx.py index 87d6672b..624d5349 100644 --- a/lib/chardet/jpcntx.py +++ b/lib/chardet/jpcntx.py @@ -25,7 +25,6 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from .compat import wrap_ord # This is hiragana 2-char sequence table, the number in each cell represents its frequency category jp2CharContext = ( @@ -194,7 +193,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis): if not byte_str: return -1, 1 # find out current char's byte length - first_char = wrap_ord(byte_str[0]) + first_char = byte_str[0] if (0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC): char_len = 2 if (first_char == 0x87) or (0xFA <= first_char <= 0xFC): @@ -204,7 +203,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis): # return its order if it is hiragana if len(byte_str) > 1: - second_char = wrap_ord(byte_str[1]) + second_char = byte_str[1] if (first_char == 202) and (0x9F <= 
second_char <= 0xF1): return second_char - 0x9F, char_len @@ -215,7 +214,7 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis): if not byte_str: return -1, 1 # find out current char's byte length - first_char = wrap_ord(byte_str[0]) + first_char = byte_str[0] if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE): char_len = 2 elif first_char == 0x8F: @@ -225,7 +224,7 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis): # return its order if it is hiragana if len(byte_str) > 1: - second_char = wrap_ord(byte_str[1]) + second_char = byte_str[1] if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3): return second_char - 0xA1, char_len diff --git a/lib/chardet/langbulgarianmodel.py b/lib/chardet/langbulgarianmodel.py index 64a9f324..eb6f19a3 100644 --- a/lib/chardet/langbulgarianmodel.py +++ b/lib/chardet/langbulgarianmodel.py @@ -214,7 +214,8 @@ Latin5BulgarianModel = { 'precedence_matrix': BulgarianLangModel, 'typical_positive_ratio': 0.969392, 'keep_english_letter': False, - 'charset_name': "ISO-8859-5" + 'charset_name': "ISO-8859-5", + 'language': 'Bulgarian', } Win1251BulgarianModel = { @@ -222,8 +223,6 @@ Win1251BulgarianModel = { 'precedence_matrix': BulgarianLangModel, 'typical_positive_ratio': 0.969392, 'keep_english_letter': False, - 'charset_name': "windows-1251" + 'charset_name': "windows-1251", + 'language': 'Bulgarian', } - - - diff --git a/lib/chardet/langcyrillicmodel.py b/lib/chardet/langcyrillicmodel.py index bb1853d0..bdbad703 100644 --- a/lib/chardet/langcyrillicmodel.py +++ b/lib/chardet/langcyrillicmodel.py @@ -283,7 +283,8 @@ Koi8rModel = { 'precedence_matrix': RussianLangModel, 'typical_positive_ratio': 0.976601, 'keep_english_letter': False, - 'charset_name': "KOI8-R" + 'charset_name': "KOI8-R", + 'language': 'Russian', } Win1251CyrillicModel = { @@ -291,7 +292,8 @@ Win1251CyrillicModel = { 'precedence_matrix': RussianLangModel, 'typical_positive_ratio': 0.976601, 'keep_english_letter': False, - 'charset_name': "windows-1251" + 
'charset_name': "windows-1251", + 'language': 'Russian', } Latin5CyrillicModel = { @@ -299,7 +301,8 @@ Latin5CyrillicModel = { 'precedence_matrix': RussianLangModel, 'typical_positive_ratio': 0.976601, 'keep_english_letter': False, - 'charset_name': "ISO-8859-5" + 'charset_name': "ISO-8859-5", + 'language': 'Russian', } MacCyrillicModel = { @@ -307,7 +310,8 @@ MacCyrillicModel = { 'precedence_matrix': RussianLangModel, 'typical_positive_ratio': 0.976601, 'keep_english_letter': False, - 'charset_name': "MacCyrillic" + 'charset_name': "MacCyrillic", + 'language': 'Russian', } Ibm866Model = { @@ -315,7 +319,8 @@ Ibm866Model = { 'precedence_matrix': RussianLangModel, 'typical_positive_ratio': 0.976601, 'keep_english_letter': False, - 'charset_name': "IBM866" + 'charset_name': "IBM866", + 'language': 'Russian', } Ibm855Model = { @@ -323,7 +328,6 @@ Ibm855Model = { 'precedence_matrix': RussianLangModel, 'typical_positive_ratio': 0.976601, 'keep_english_letter': False, - 'charset_name': "IBM855" + 'charset_name': "IBM855", + 'language': 'Russian', } - - diff --git a/lib/chardet/langgreekmodel.py b/lib/chardet/langgreekmodel.py index c84cf5eb..73541cc5 100644 --- a/lib/chardet/langgreekmodel.py +++ b/lib/chardet/langgreekmodel.py @@ -211,7 +211,8 @@ Latin7GreekModel = { 'precedence_matrix': GreekLangModel, 'typical_positive_ratio': 0.982851, 'keep_english_letter': False, - 'charset_name': "ISO-8859-7" + 'charset_name': "ISO-8859-7", + 'language': 'Greek', } Win1253GreekModel = { @@ -219,7 +220,6 @@ Win1253GreekModel = { 'precedence_matrix': GreekLangModel, 'typical_positive_ratio': 0.982851, 'keep_english_letter': False, - 'charset_name': "windows-1253" + 'charset_name': "windows-1253", + 'language': 'Greek', } - - diff --git a/lib/chardet/langhebrewmodel.py b/lib/chardet/langhebrewmodel.py index cba129fd..07029b6b 100644 --- a/lib/chardet/langhebrewmodel.py +++ b/lib/chardet/langhebrewmodel.py @@ -195,7 +195,6 @@ Win1255HebrewModel = { 'precedence_matrix': 
HEBREW_LANG_MODEL, 'typical_positive_ratio': 0.984004, 'keep_english_letter': False, - 'charset_name': "windows-1255" + 'charset_name': "windows-1255", + 'language': 'Hebrew', } - - diff --git a/lib/chardet/langhungarianmodel.py b/lib/chardet/langhungarianmodel.py index 9b501721..6de87b72 100644 --- a/lib/chardet/langhungarianmodel.py +++ b/lib/chardet/langhungarianmodel.py @@ -211,7 +211,8 @@ Latin2HungarianModel = { 'precedence_matrix': HungarianLangModel, 'typical_positive_ratio': 0.947368, 'keep_english_letter': True, - 'charset_name': "ISO-8859-2" + 'charset_name': "ISO-8859-2", + 'language': 'Hungarian', } Win1250HungarianModel = { @@ -219,7 +220,6 @@ Win1250HungarianModel = { 'precedence_matrix': HungarianLangModel, 'typical_positive_ratio': 0.947368, 'keep_english_letter': True, - 'charset_name': "windows-1250" + 'charset_name': "windows-1250", + 'language': 'Hungarian', } - - diff --git a/lib/chardet/langthaimodel.py b/lib/chardet/langthaimodel.py index faa05a0b..fdb33135 100644 --- a/lib/chardet/langthaimodel.py +++ b/lib/chardet/langthaimodel.py @@ -194,7 +194,6 @@ TIS620ThaiModel = { 'precedence_matrix': ThaiLangModel, 'typical_positive_ratio': 0.926386, 'keep_english_letter': False, - 'charset_name': "TIS-620" + 'charset_name': "TIS-620", + 'language': 'Thai', } - - diff --git a/lib/chardet/langturkishmodel.py b/lib/chardet/langturkishmodel.py index c1177028..64ec9bd8 100644 --- a/lib/chardet/langturkishmodel.py +++ b/lib/chardet/langturkishmodel.py @@ -188,5 +188,6 @@ Latin5TurkishModel = { 'precedence_matrix': TurkishLangModel, 'typical_positive_ratio': 0.970290, 'keep_english_letter': True, - 'charset_name': "ISO-8859-9" + 'charset_name': "ISO-8859-9", + 'language': 'Turkish', } diff --git a/lib/chardet/latin1prober.py b/lib/chardet/latin1prober.py index a8871469..7c37520b 100644 --- a/lib/chardet/latin1prober.py +++ b/lib/chardet/latin1prober.py @@ -27,7 +27,6 @@ ######################### END LICENSE BLOCK ######################### from 
.charsetprober import CharSetProber -from .compat import wrap_ord from .enums import ProbingState FREQ_CAT_NUM = 4 @@ -108,16 +107,20 @@ class Latin1Prober(CharSetProber): @property def charset_name(self): - return "windows-1252" + return "ISO-8859-1" + + @property + def language(self): + return "" def feed(self, byte_str): byte_str = self.filter_with_english_letters(byte_str) for c in byte_str: - char_class = Latin1_CharToClass[wrap_ord(c)] + char_class = Latin1_CharToClass[c] freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM) + char_class] if freq == 0: - self._state = ProbingState.not_me + self._state = ProbingState.NOT_ME break self._freq_counter[freq] += 1 self._last_char_class = char_class @@ -125,7 +128,7 @@ class Latin1Prober(CharSetProber): return self.state def get_confidence(self): - if self.state == ProbingState.not_me: + if self.state == ProbingState.NOT_ME: return 0.01 total = sum(self._freq_counter) diff --git a/lib/chardet/mbcharsetprober.py b/lib/chardet/mbcharsetprober.py index b1a1eebd..46091543 100644 --- a/lib/chardet/mbcharsetprober.py +++ b/lib/chardet/mbcharsetprober.py @@ -52,34 +52,38 @@ class MultiByteCharSetProber(CharSetProber): @property def charset_name(self): - pass + raise NotImplementedError + + @property + def language(self): + raise NotImplementedError def feed(self, byte_str): for i in range(len(byte_str)): coding_state = self.coding_sm.next_state(byte_str[i]) - if coding_state == MachineState.error: - self.logger.debug('%s prober hit error at byte %s', - self.charset_name, i) - self._state = ProbingState.not_me + if coding_state == MachineState.ERROR: + self.logger.debug('%s %s prober hit error at byte %s', + self.charset_name, self.language, i) + self._state = ProbingState.NOT_ME break - elif coding_state == MachineState.its_me: - self._state = ProbingState.found_it + elif coding_state == MachineState.ITS_ME: + self._state = ProbingState.FOUND_IT break - elif coding_state == MachineState.start: + elif coding_state == 
MachineState.START: char_len = self.coding_sm.get_current_charlen() if i == 0: self._last_char[1] = byte_str[0] self.distribution_analyzer.feed(self._last_char, char_len) else: self.distribution_analyzer.feed(byte_str[i - 1:i + 1], - char_len) + char_len) self._last_char[0] = byte_str[-1] - if self.state == ProbingState.detecting: + if self.state == ProbingState.DETECTING: if (self.distribution_analyzer.got_enough_data() and (self.get_confidence() > self.SHORTCUT_THRESHOLD)): - self._state = ProbingState.found_it + self._state = ProbingState.FOUND_IT return self.state diff --git a/lib/chardet/mbcssm.py b/lib/chardet/mbcssm.py index c28f43e5..d68f6f6c 100644 --- a/lib/chardet/mbcssm.py +++ b/lib/chardet/mbcssm.py @@ -65,9 +65,9 @@ BIG5_CLS = ( ) BIG5_ST = ( - MachineState.error,MachineState.start,MachineState.start, 3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07 - MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,#08-0f - MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start#10-17 + MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07 + MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f + MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17 ) BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0) @@ -101,13 +101,13 @@ CP949_CLS = ( CP949_ST = ( #cls= 0 1 2 3 4 5 6 7 8 9 # previous state = - MachineState.error,MachineState.start, 3,MachineState.error,MachineState.start,MachineState.start, 4, 5,MachineState.error, 6, # MachineState.start - 
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, # MachineState.error - MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me, # MachineState.its_me - MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start, # 3 - MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, # 4 - MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, # 5 - MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start, # 6 + MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START, 4, 5,MachineState.ERROR, 6, # MachineState.START + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, # MachineState.ERROR + MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # MachineState.ITS_ME + 
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 3 + MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 4 + MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5 + MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6 ) CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2) @@ -156,11 +156,11 @@ EUCJP_CLS = ( ) EUCJP_ST = ( - 3, 4, 3, 5,MachineState.start,MachineState.error,MachineState.error,MachineState.error,#00-07 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f - MachineState.its_me,MachineState.its_me,MachineState.start,MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error,#10-17 - MachineState.error,MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error, 3,MachineState.error,#18-1f - 3,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start#20-27 + 3, 4, 3, 5,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f + 
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17 + MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f + 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27 ) EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0) @@ -209,8 +209,8 @@ EUCKR_CLS = ( ) EUCKR_ST = ( - MachineState.error,MachineState.start, 3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07 - MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start #08-0f + MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07 + MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f ) EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0) @@ -259,12 +259,12 @@ EUCTW_CLS = ( ) EUCTW_ST = ( - MachineState.error,MachineState.error,MachineState.start, 3, 3, 3, 4,MachineState.error,#00-07 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,#08-0f - MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.start,MachineState.error,#10-17 - MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f - 5,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.error,MachineState.start,MachineState.start,#20-27 - 
MachineState.start,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start #28-2f + MachineState.ERROR,MachineState.ERROR,MachineState.START, 3, 3, 3, 4,MachineState.ERROR,#00-07 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f + MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.ERROR,#10-17 + MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f + 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27 + MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f ) EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3) @@ -313,12 +313,12 @@ GB2312_CLS = ( ) GB2312_ST = ( - MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, 3,MachineState.error,#00-07 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,#08-0f - MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,#10-17 - 4,MachineState.error,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f - MachineState.error,MachineState.error, 5,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,#20-27 - 
MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start #28-2f + MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, 3,MachineState.ERROR,#00-07 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f + MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,#10-17 + 4,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f + MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27 + MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f ) # To be accurate, the length of class 6 can be either 2 or 4. 
@@ -374,9 +374,9 @@ SJIS_CLS = ( SJIS_ST = ( - MachineState.error,MachineState.start,MachineState.start, 3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f - MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start #10-17 + MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f + MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17 ) SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0) @@ -425,13 +425,13 @@ UCS2BE_CLS = ( ) UCS2BE_ST = ( - 5, 7, 7,MachineState.error, 4, 3,MachineState.error,MachineState.error,#00-07 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f - MachineState.its_me,MachineState.its_me, 6, 6, 6, 6,MachineState.error,MachineState.error,#10-17 - 6, 6, 6, 6, 6,MachineState.its_me, 6, 6,#18-1f - 6, 6, 6, 6, 5, 7, 7,MachineState.error,#20-27 - 5, 8, 6, 6,MachineState.error, 6, 6, 6,#28-2f - 6, 6, 6, 6,MachineState.error,MachineState.error,MachineState.start,MachineState.start #30-37 + 5, 7, 7,MachineState.ERROR, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f + MachineState.ITS_ME,MachineState.ITS_ME, 6, 6, 6, 
6,MachineState.ERROR,MachineState.ERROR,#10-17 + 6, 6, 6, 6, 6,MachineState.ITS_ME, 6, 6,#18-1f + 6, 6, 6, 6, 5, 7, 7,MachineState.ERROR,#20-27 + 5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f + 6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37 ) UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2) @@ -480,13 +480,13 @@ UCS2LE_CLS = ( ) UCS2LE_ST = ( - 6, 6, 7, 6, 4, 3,MachineState.error,MachineState.error,#00-07 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f - MachineState.its_me,MachineState.its_me, 5, 5, 5,MachineState.error,MachineState.its_me,MachineState.error,#10-17 - 5, 5, 5,MachineState.error, 5,MachineState.error, 6, 6,#18-1f - 7, 6, 8, 8, 5, 5, 5,MachineState.error,#20-27 - 5, 5, 5,MachineState.error,MachineState.error,MachineState.error, 5, 5,#28-2f - 5, 5, 5,MachineState.error, 5,MachineState.error,MachineState.start,MachineState.start #30-37 + 6, 6, 7, 6, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f + MachineState.ITS_ME,MachineState.ITS_ME, 5, 5, 5,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#10-17 + 5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR, 6, 6,#18-1f + 7, 6, 8, 8, 5, 5, 5,MachineState.ERROR,#20-27 + 5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f + 5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37 ) UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2) @@ -535,32 +535,32 @@ UTF8_CLS = ( ) UTF8_ST = ( - MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error, 12, 10,#00-07 + MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 12, 
10,#00-07 9, 11, 8, 7, 6, 5, 4, 3,#08-0f - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#10-17 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f - MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#20-27 - MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#28-2f - MachineState.error,MachineState.error, 5, 5, 5, 5,MachineState.error,MachineState.error,#30-37 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#38-3f - MachineState.error,MachineState.error,MachineState.error, 5, 5, 5,MachineState.error,MachineState.error,#40-47 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#48-4f - MachineState.error,MachineState.error, 7, 7, 7, 7,MachineState.error,MachineState.error,#50-57 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#58-5f - MachineState.error,MachineState.error,MachineState.error,MachineState.error, 7, 7,MachineState.error,MachineState.error,#60-67 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#68-6f - MachineState.error,MachineState.error, 9, 9, 9, 9,MachineState.error,MachineState.error,#70-77 - 
MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#78-7f - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, 9,MachineState.error,MachineState.error,#80-87 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#88-8f - MachineState.error,MachineState.error, 12, 12, 12, 12,MachineState.error,MachineState.error,#90-97 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#98-9f - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, 12,MachineState.error,MachineState.error,#a0-a7 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#a8-af - MachineState.error,MachineState.error, 12, 12, 12,MachineState.error,MachineState.error,MachineState.error,#b0-b7 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#b8-bf - MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,#c0-c7 - MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error #c8-cf + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f + 
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#20-27 + MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#28-2f + MachineState.ERROR,MachineState.ERROR, 5, 5, 5, 5,MachineState.ERROR,MachineState.ERROR,#30-37 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#38-3f + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5, 5,MachineState.ERROR,MachineState.ERROR,#40-47 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#48-4f + MachineState.ERROR,MachineState.ERROR, 7, 7, 7, 7,MachineState.ERROR,MachineState.ERROR,#50-57 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#58-5f + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 7, 7,MachineState.ERROR,MachineState.ERROR,#60-67 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#68-6f + MachineState.ERROR,MachineState.ERROR, 9, 9, 9, 9,MachineState.ERROR,MachineState.ERROR,#70-77 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#78-7f + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 9,MachineState.ERROR,MachineState.ERROR,#80-87 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#88-8f + 
MachineState.ERROR,MachineState.ERROR, 12, 12, 12, 12,MachineState.ERROR,MachineState.ERROR,#90-97 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#98-9f + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 12,MachineState.ERROR,MachineState.ERROR,#a0-a7 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#a8-af + MachineState.ERROR,MachineState.ERROR, 12, 12, 12,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b0-b7 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b8-bf + MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7 + MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf ) UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6) diff --git a/lib/chardet/sbcharsetprober.py b/lib/chardet/sbcharsetprober.py index 92802605..66e0dfcc 100644 --- a/lib/chardet/sbcharsetprober.py +++ b/lib/chardet/sbcharsetprober.py @@ -27,18 +27,14 @@ ######################### END LICENSE BLOCK ######################### from .charsetprober import CharSetProber -from .compat import wrap_ord -from .enums import ProbingState +from .enums import CharacterCategory, ProbingState, SequenceLikelihood class SingleByteCharSetProber(CharSetProber): SAMPLE_SIZE = 64 - SB_ENOUGH_REL_THRESHOLD = 1024 + SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2 POSITIVE_SHORTCUT_THRESHOLD = 0.95 NEGATIVE_SHORTCUT_THRESHOLD = 0.05 - SYMBOL_CAT_ORDER = 250 - NUMBER_OF_SEQ_CAT = 4 - POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1 
def __init__(self, model, reversed=False, name_prober=None): super(SingleByteCharSetProber, self).__init__() @@ -58,7 +54,7 @@ class SingleByteCharSetProber(CharSetProber): super(SingleByteCharSetProber, self).reset() # char order of last character self._last_order = 255 - self._seq_counters = [0] * self.NUMBER_OF_SEQ_CAT + self._seq_counters = [0] * SequenceLikelihood.get_num_categories() self._total_seqs = 0 self._total_char = 0 # characters that fall in our sampling range @@ -71,15 +67,29 @@ class SingleByteCharSetProber(CharSetProber): else: return self._model['charset_name'] + @property + def language(self): + if self._name_prober: + return self._name_prober.language + else: + return self._model.get('language') + def feed(self, byte_str): if not self._model['keep_english_letter']: byte_str = self.filter_international_words(byte_str) - num_bytes = len(byte_str) - if not num_bytes: + if not byte_str: return self.state - for c in byte_str: - order = self._model['char_to_order_map'][wrap_ord(c)] - if order < self.SYMBOL_CAT_ORDER: + char_to_order_map = self._model['char_to_order_map'] + for i, c in enumerate(byte_str): + # XXX: Order is in range 1-64, so one would think we want 0-63 here, + # but that leads to 27 more test failures than before. + order = char_to_order_map[c] + # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but + # CharacterCategory.SYMBOL is actually 253, so we use CONTROL + # to make it closer to the original intent. The only difference + # is whether or not we count digits and control characters for + # _total_char purposes. 
+ if order < CharacterCategory.CONTROL: self._total_char += 1 if order < self.SAMPLE_SIZE: self._freq_char += 1 @@ -94,27 +104,28 @@ class SingleByteCharSetProber(CharSetProber): self._seq_counters[model] += 1 self._last_order = order - if self.state == ProbingState.detecting: + charset_name = self._model['charset_name'] + if self.state == ProbingState.DETECTING: if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD: - cf = self.get_confidence() - if cf > self.POSITIVE_SHORTCUT_THRESHOLD: + confidence = self.get_confidence() + if confidence > self.POSITIVE_SHORTCUT_THRESHOLD: self.logger.debug('%s confidence = %s, we have a winner', - self._model['charset_name'], cf) - self._state = ProbingState.found_it - elif cf < self.NEGATIVE_SHORTCUT_THRESHOLD: + charset_name, confidence) + self._state = ProbingState.FOUND_IT + elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD: self.logger.debug('%s confidence = %s, below negative ' - 'shortcut threshold %s', - self._model['charset_name'], cf, + 'shortcut threshhold %s', charset_name, + confidence, self.NEGATIVE_SHORTCUT_THRESHOLD) - self._state = ProbingState.not_me + self._state = ProbingState.NOT_ME return self.state def get_confidence(self): r = 0.01 if self._total_seqs > 0: - r = ((1.0 * self._seq_counters[self.POSITIVE_CAT]) / self._total_seqs - / self._model['typical_positive_ratio']) + r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) / + self._total_seqs / self._model['typical_positive_ratio']) r = r * self._freq_char / self._total_char if r >= 1.0: r = 0.99 diff --git a/lib/chardet/sjisprober.py b/lib/chardet/sjisprober.py index 4a6002c1..683add02 100644 --- a/lib/chardet/sjisprober.py +++ b/lib/chardet/sjisprober.py @@ -49,36 +49,40 @@ class SJISProber(MultiByteCharSetProber): def charset_name(self): return self.context_analyzer.charset_name + @property + def language(self): + return "Japanese" + def feed(self, byte_str): for i in range(len(byte_str)): coding_state = self.coding_sm.next_state(byte_str[i]) - 
if coding_state == MachineState.error: - self.logger.debug('%s prober hit error at byte %s', - self.charset_name, i) - self._state = ProbingState.not_me + if coding_state == MachineState.ERROR: + self.logger.debug('%s %s prober hit error at byte %s', + self.charset_name, self.language, i) + self._state = ProbingState.NOT_ME break - elif coding_state == MachineState.its_me: - self._state = ProbingState.found_it + elif coding_state == MachineState.ITS_ME: + self._state = ProbingState.FOUND_IT break - elif coding_state == MachineState.start: + elif coding_state == MachineState.START: char_len = self.coding_sm.get_current_charlen() if i == 0: self._last_char[1] = byte_str[0] self.context_analyzer.feed(self._last_char[2 - char_len:], - char_len) + char_len) self.distribution_analyzer.feed(self._last_char, char_len) else: self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3 - - char_len], char_len) + - char_len], char_len) self.distribution_analyzer.feed(byte_str[i - 1:i + 1], - char_len) + char_len) self._last_char[0] = byte_str[-1] - if self.state == ProbingState.detecting: + if self.state == ProbingState.DETECTING: if (self.context_analyzer.got_enough_data() and (self.get_confidence() > self.SHORTCUT_THRESHOLD)): - self._state = ProbingState.found_it + self._state = ProbingState.FOUND_IT return self.state diff --git a/lib/chardet/universaldetector.py b/lib/chardet/universaldetector.py index be73a00b..8a6de3b4 100644 --- a/lib/chardet/universaldetector.py +++ b/lib/chardet/universaldetector.py @@ -40,6 +40,7 @@ import codecs import logging import re +from .charsetgroupprober import CharSetGroupProber from .enums import InputState, LanguageFilter, ProbingState from .escprober import EscCharSetProber from .latin1prober import Latin1Prober @@ -67,8 +68,17 @@ class UniversalDetector(object): MINIMUM_THRESHOLD = 0.20 HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]') ESC_DETECTOR = re.compile(b'(\033|~{)') + WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]') + ISO_WIN_MAP = 
{'iso-8859-1': 'Windows-1252', + 'iso-8859-2': 'Windows-1250', + 'iso-8859-5': 'Windows-1251', + 'iso-8859-6': 'Windows-1256', + 'iso-8859-7': 'Windows-1253', + 'iso-8859-8': 'Windows-1255', + 'iso-8859-9': 'Windows-1254', + 'iso-8859-13': 'Windows-1257'} - def __init__(self, lang_filter=LanguageFilter.all): + def __init__(self, lang_filter=LanguageFilter.ALL): self._esc_charset_prober = None self._charset_probers = [] self.result = None @@ -78,6 +88,7 @@ class UniversalDetector(object): self._last_char = None self.lang_filter = lang_filter self.logger = logging.getLogger(__name__) + self._has_win_bytes = None self.reset() def reset(self): @@ -86,10 +97,11 @@ class UniversalDetector(object): initial states. This is called by ``__init__``, so you only need to call this directly in between analyses of different documents. """ - self.result = {'encoding': None, 'confidence': 0.0} + self.result = {'encoding': None, 'confidence': 0.0, 'language': None} self.done = False self._got_data = False - self._input_state = InputState.pure_ascii + self._has_win_bytes = False + self._input_state = InputState.PURE_ASCII self._last_char = b'' if self._esc_charset_prober: self._esc_charset_prober.reset() @@ -116,28 +128,40 @@ class UniversalDetector(object): if not len(byte_str): return + if not isinstance(byte_str, bytearray): + byte_str = bytearray(byte_str) + # First check for known BOMs, since these are guaranteed to be correct if not self._got_data: # If the data starts with BOM, we know it is UTF if byte_str.startswith(codecs.BOM_UTF8): # EF BB BF UTF-8 with BOM - self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0} - elif byte_str.startswith(codecs.BOM_UTF32_LE) or byte_str.startswith(codecs.BOM_UTF32_BE): + self.result = {'encoding': "UTF-8-SIG", + 'confidence': 1.0, + 'language': ''} + elif byte_str.startswith((codecs.BOM_UTF32_LE, + codecs.BOM_UTF32_BE)): # FF FE 00 00 UTF-32, little-endian BOM # 00 00 FE FF UTF-32, big-endian BOM - self.result = {'encoding': 
"UTF-32", 'confidence': 1.0} + self.result = {'encoding': "UTF-32", + 'confidence': 1.0, + 'language': ''} elif byte_str.startswith(b'\xFE\xFF\x00\x00'): # FE FF 00 00 UCS-4, unusual octet order BOM (3412) self.result = {'encoding': "X-ISO-10646-UCS-4-3412", - 'confidence': 1.0} + 'confidence': 1.0, + 'language': ''} elif byte_str.startswith(b'\x00\x00\xFF\xFE'): # 00 00 FF FE UCS-4, unusual octet order BOM (2143) self.result = {'encoding': "X-ISO-10646-UCS-4-2143", - 'confidence': 1.0} - elif byte_str.startswith(codecs.BOM_LE) or byte_str.startswith(codecs.BOM_BE): + 'confidence': 1.0, + 'language': ''} + elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)): # FF FE UTF-16, little endian BOM # FE FF UTF-16, big endian BOM - self.result = {'encoding': "UTF-16", 'confidence': 1.0} + self.result = {'encoding': "UTF-16", + 'confidence': 1.0, + 'language': ''} self._got_data = True if self.result['encoding'] is not None: @@ -146,12 +170,12 @@ class UniversalDetector(object): # If none of those matched and we've only see ASCII so far, check # for high bytes and escape sequences - if self._input_state == InputState.pure_ascii: + if self._input_state == InputState.PURE_ASCII: if self.HIGH_BYTE_DETECTOR.search(byte_str): - self._input_state = InputState.high_byte - elif self._input_state == InputState.pure_ascii and \ + self._input_state = InputState.HIGH_BYTE + elif self._input_state == InputState.PURE_ASCII and \ self.ESC_DETECTOR.search(self._last_char + byte_str): - self._input_state = InputState.esc_ascii + self._input_state = InputState.ESC_ASCII self._last_char = byte_str[-1:] @@ -159,14 +183,16 @@ class UniversalDetector(object): # uses a simple state machine to check for known escape sequences in # HZ and ISO-2022 encodings, since those are the only encodings that # use such sequences. 
- if self._input_state == InputState.esc_ascii: + if self._input_state == InputState.ESC_ASCII: if not self._esc_charset_prober: self._esc_charset_prober = EscCharSetProber(self.lang_filter) - if self._esc_charset_prober.feed(byte_str) == ProbingState.found_it: + if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT: self.result = {'encoding': self._esc_charset_prober.charset_name, 'confidence': - self._esc_charset_prober.get_confidence()} + self._esc_charset_prober.get_confidence(), + 'language': + self._esc_charset_prober.language} self.done = True # If we've seen high bytes (i.e., those with values greater than 127), # we need to do more complicated checks using all our multi-byte and @@ -174,59 +200,87 @@ class UniversalDetector(object): # use character bigram distributions to determine the encoding, whereas # the multi-byte probers use a combination of character unigram and # bigram distributions. - elif self._input_state == InputState.high_byte: + elif self._input_state == InputState.HIGH_BYTE: if not self._charset_probers: self._charset_probers = [MBCSGroupProber(self.lang_filter)] # If we're checking non-CJK encodings, use single-byte prober - if self.lang_filter & LanguageFilter.non_cjk: + if self.lang_filter & LanguageFilter.NON_CJK: self._charset_probers.append(SBCSGroupProber()) self._charset_probers.append(Latin1Prober()) for prober in self._charset_probers: - if prober.feed(byte_str) == ProbingState.found_it: + if prober.feed(byte_str) == ProbingState.FOUND_IT: self.result = {'encoding': prober.charset_name, - 'confidence': prober.get_confidence()} + 'confidence': prober.get_confidence(), + 'language': prober.language} self.done = True break + if self.WIN_BYTE_DETECTOR.search(byte_str): + self._has_win_bytes = True def close(self): """ Stop analyzing the current document and come up with a final prediction. - :returns: The ``result`` attribute if a prediction was made, otherwise - ``None``. 
+ :returns: The ``result`` attribute, a ``dict`` with the keys + `encoding`, `confidence`, and `language`. """ + # Don't bother with checks if we're already done if self.done: return self.result - if not self._got_data: - self.logger.debug('no data received!') - return self.done = True - if self._input_state in (InputState.pure_ascii, InputState.esc_ascii): - self.result = {'encoding': 'ascii', 'confidence': 1.0} - return self.result + if not self._got_data: + self.logger.debug('no data received!') - if self._input_state == InputState.high_byte: - proberConfidence = None + # Default to ASCII if it is all we've seen so far + elif self._input_state == InputState.PURE_ASCII: + self.result = {'encoding': 'ascii', + 'confidence': 1.0, + 'language': ''} + + # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD + elif self._input_state == InputState.HIGH_BYTE: + prober_confidence = None max_prober_confidence = 0.0 max_prober = None for prober in self._charset_probers: if not prober: continue - proberConfidence = prober.get_confidence() - if proberConfidence > max_prober_confidence: - max_prober_confidence = proberConfidence + prober_confidence = prober.get_confidence() + if prober_confidence > max_prober_confidence: + max_prober_confidence = prober_confidence max_prober = prober if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD): - self.result = {'encoding': max_prober.charset_name, - 'confidence': max_prober.get_confidence()} - return self.result + charset_name = max_prober.charset_name + lower_charset_name = max_prober.charset_name.lower() + confidence = max_prober.get_confidence() + # Use Windows encoding name instead of ISO-8859 if we saw any + # extra Windows-specific bytes + if lower_charset_name.startswith('iso-8859'): + if self._has_win_bytes: + charset_name = self.ISO_WIN_MAP.get(lower_charset_name, + charset_name) + self.result = {'encoding': charset_name, + 'confidence': confidence, + 'language': max_prober.language} + # Log 
all prober confidences if none met MINIMUM_THRESHOLD if self.logger.getEffectiveLevel() == logging.DEBUG: - self.logger.debug('no probers hit minimum threshold') - for prober in self._charset_probers[0].probers: - if not prober: - continue - self.logger.debug('%s confidence = %s', prober.charset_name, - prober.get_confidence()) + if self.result['encoding'] is None: + self.logger.debug('no probers hit minimum threshold') + for group_prober in self._charset_probers: + if not group_prober: + continue + if isinstance(group_prober, CharSetGroupProber): + for prober in group_prober.probers: + self.logger.debug('%s %s confidence = %s', + prober.charset_name, + prober.language, + prober.get_confidence()) + else: + self.logger.debug('%s %s confidence = %s', + prober.charset_name, + prober.language, + prober.get_confidence()) + return self.result diff --git a/lib/chardet/utf8prober.py b/lib/chardet/utf8prober.py index 670ef436..45732679 100644 --- a/lib/chardet/utf8prober.py +++ b/lib/chardet/utf8prober.py @@ -50,22 +50,26 @@ class UTF8Prober(CharSetProber): def charset_name(self): return "utf-8" + @property + def language(self): + return "" + def feed(self, byte_str): for c in byte_str: coding_state = self.coding_sm.next_state(c) - if coding_state == MachineState.error: - self._state = ProbingState.not_me + if coding_state == MachineState.ERROR: + self._state = ProbingState.NOT_ME break - elif coding_state == MachineState.its_me: - self._state = ProbingState.found_it + elif coding_state == MachineState.ITS_ME: + self._state = ProbingState.FOUND_IT break - elif coding_state == MachineState.start: + elif coding_state == MachineState.START: if self.coding_sm.get_current_charlen() >= 2: self._num_mb_chars += 1 - if self.state == ProbingState.detecting: + if self.state == ProbingState.DETECTING: if self.get_confidence() > self.SHORTCUT_THRESHOLD: - self._state = ProbingState.found_it + self._state = ProbingState.FOUND_IT return self.state diff --git a/lib/chardet/version.py 
b/lib/chardet/version.py index a76856ea..f24d042f 100644 --- a/lib/chardet/version.py +++ b/lib/chardet/version.py @@ -2,8 +2,8 @@ This module exists only to simplify retrieving the version number of chardet from within setup.py and from chardet subpackages. -:author: Dan Blanchard (dblanchard@ets.org) +:author: Dan Blanchard (dan.blanchard@gmail.com) """ -__version__ = "2.3.0" +__version__ = "3.0.4" VERSION = __version__.split('.')