Update chardet packages 2.3.0 (d7fae98) to 3.0.4 (9b8c5c2).

2025-04-13 22:11:26 +00:00 · 2017-08-26 00:19:44 +01:00 · 2017-08-26 00:19:44 +01:00 · 0b6b9388bc
commit 0b6b9388bc
parent 1c613d951c
35 changed files with 486 additions and 329 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -13,6 +13,7 @@
 * Update Beautiful Soup 4.4.0 (r397) to 4.5.3 (r439)
 * Update cachecontrol library 0.11.5 to 0.12.3 (db54c40)
 * Update Certifi 2015.11.20.1 (385476b) to 2017.07.27 (f808089)
+* Update chardet packages 2.3.0 (d7fae98) to 3.0.4 (9b8c5c2)
 * Update dateutil library 2.4.2 (d4baf97) to 2.6.1 (2f3a160)
 * Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb)
 * Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72)
--- a/lib/chardet/__init__.py
+++ b/lib/chardet/__init__.py
@ -16,17 +16,24 @@
 ######################### END LICENSE BLOCK #########################


-from .compat import PY2, PY3, bin_type as _bin_type
+from .compat import PY2, PY3
 from .universaldetector import UniversalDetector
 from .version import __version__, VERSION


 def detect(byte_str):
-    if not isinstance(byte_str, _bin_type):
-        raise TypeError('Expected object of {0} type, got: {1}'
-                        ''.format(_bin_type, type(byte_str)))
+    """
+    Detect the encoding of the given byte string.

-    u = UniversalDetector()
-    u.feed(byte_str)
-    u.close()
-    return u.result
+    :param byte_str:     The byte sequence to examine.
+    :type byte_str:      ``bytes`` or ``bytearray``
+    """
+    if not isinstance(byte_str, bytearray):
+        if not isinstance(byte_str, bytes):
+            raise TypeError('Expected object of type bytes or bytearray, got: '
+                            '{0}'.format(type(byte_str)))
+        else:
+            byte_str = bytearray(byte_str)
+    detector = UniversalDetector()
+    detector.feed(byte_str)
+    return detector.close()
--- a/lib/chardet/big5prober.py
+++ b/lib/chardet/big5prober.py
@ -41,3 +41,7 @@ class Big5Prober(MultiByteCharSetProber):
    @property
    def charset_name(self):
        return "Big5"
+
+    @property
+    def language(self):
+        return "Chinese"
--- a/lib/chardet/chardistribution.py
+++ b/lib/chardet/chardistribution.py
@ -35,7 +35,6 @@ from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE,
                       BIG5_TYPICAL_DISTRIBUTION_RATIO)
 from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
                      JIS_TYPICAL_DISTRIBUTION_RATIO)
-from .compat import wrap_ord


 class CharDistributionAnalysis(object):
@ -123,9 +122,9 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
        #   first  byte range: 0xc4 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
-        first_char = wrap_ord(byte_str[0])
+        first_char = byte_str[0]
        if first_char >= 0xC4:
-            return 94 * (first_char - 0xC4) + wrap_ord(byte_str[1]) - 0xA1
+            return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
        else:
            return -1

@ -142,9 +141,9 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
        #   first  byte range: 0xb0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
-        first_char = wrap_ord(byte_str[0])
+        first_char = byte_str[0]
        if first_char >= 0xB0:
-            return 94 * (first_char - 0xB0) + wrap_ord(byte_str[1]) - 0xA1
+            return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
        else:
            return -1

@ -161,7 +160,7 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
        #  first  byte range: 0xb0 -- 0xfe
        #  second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
-        first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
+        first_char, second_char = byte_str[0], byte_str[1]
        if (first_char >= 0xB0) and (second_char >= 0xA1):
            return 94 * (first_char - 0xB0) + second_char - 0xA1
        else:
@ -180,7 +179,7 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
        #   first  byte range: 0xa4 -- 0xfe
        #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
-        first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
+        first_char, second_char = byte_str[0], byte_str[1]
        if first_char >= 0xA4:
            if second_char >= 0xA1:
                return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
@ -202,7 +201,7 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
        #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
        #   second byte range: 0x40 -- 0x7e,  0x81 -- oxfe
        # no validation needed here. State machine has done that
-        first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
+        first_char, second_char = byte_str[0], byte_str[1]
        if (first_char >= 0x81) and (first_char <= 0x9F):
            order = 188 * (first_char - 0x81)
        elif (first_char >= 0xE0) and (first_char <= 0xEF):
@ -227,8 +226,8 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
        #   first  byte range: 0xa0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
-        char = wrap_ord(byte_str[0])
+        char = byte_str[0]
        if char >= 0xA0:
-            return 94 * (char - 0xA1) + wrap_ord(byte_str[1]) - 0xa1
+            return 94 * (char - 0xA1) + byte_str[1] - 0xa1
        else:
            return -1
--- a/lib/chardet/charsetgroupprober.py
+++ b/lib/chardet/charsetgroupprober.py
@ -54,6 +54,14 @@ class CharSetGroupProber(CharSetProber):
                return None
        return self._best_guess_prober.charset_name

+    @property
+    def language(self):
+        if not self._best_guess_prober:
+            self.get_confidence()
+            if not self._best_guess_prober:
+                return None
+        return self._best_guess_prober.language
+
    def feed(self, byte_str):
        for prober in self.probers:
            if not prober:
@ -63,22 +71,22 @@ class CharSetGroupProber(CharSetProber):
            state = prober.feed(byte_str)
            if not state:
                continue
-            if state == ProbingState.found_it:
+            if state == ProbingState.FOUND_IT:
                self._best_guess_prober = prober
                return self.state
-            elif state == ProbingState.not_me:
+            elif state == ProbingState.NOT_ME:
                prober.active = False
                self._active_num -= 1
                if self._active_num <= 0:
-                    self._state = ProbingState.not_me
+                    self._state = ProbingState.NOT_ME
                    return self.state
        return self.state

    def get_confidence(self):
        state = self.state
-        if state == ProbingState.found_it:
+        if state == ProbingState.FOUND_IT:
            return 0.99
-        elif state == ProbingState.not_me:
+        elif state == ProbingState.NOT_ME:
            return 0.01
        best_conf = 0.0
        self._best_guess_prober = None
@ -89,7 +97,7 @@ class CharSetGroupProber(CharSetProber):
                self.logger.debug('%s not active', prober.charset_name)
                continue
            conf = prober.get_confidence()
-            self.logger.debug('%s confidence = %s', prober.charset_name, conf)
+            self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
            if best_conf < conf:
                best_conf = conf
                self._best_guess_prober = prober
--- a/lib/chardet/charsetprober.py
+++ b/lib/chardet/charsetprober.py
@ -42,7 +42,7 @@ class CharSetProber(object):
        self.logger = logging.getLogger(__name__)

    def reset(self):
-        self._state = ProbingState.detecting
+        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
--- a/lib/chardet/cli/chardetect.py
+++ b/lib/chardet/cli/chardetect.py
@ -17,15 +17,12 @@ from __future__ import absolute_import, print_function, unicode_literals

 import argparse
 import sys
-from io import open

 from chardet import __version__
 from chardet.compat import PY2
 from chardet.universaldetector import UniversalDetector


-
-
 def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
@ -38,7 +35,11 @@ def description_of(lines, name='stdin'):
    """
    u = UniversalDetector()
    for line in lines:
+        line = bytearray(line)
        u.feed(line)
+        # shortcut out of the loop to save reading further - particularly useful if we read a BOM.
+        if u.done:
+            break
    u.close()
    result = u.result
    if PY2:
--- a/lib/chardet/codingstatemachine.py
+++ b/lib/chardet/codingstatemachine.py
@ -28,7 +28,6 @@
 import logging

 from .enums import MachineState
-from .compat import wrap_ord


 class CodingStateMachine(object):
@ -62,13 +61,13 @@ class CodingStateMachine(object):
        self.reset()

    def reset(self):
-        self._curr_state = MachineState.start
+        self._curr_state = MachineState.START

    def next_state(self, c):
        # for each byte we get its class
        # if it is first byte, we also get byte length
-        byte_class = self._model['class_table'][wrap_ord(c)]
-        if self._curr_state == MachineState.start:
+        byte_class = self._model['class_table'][c]
+        if self._curr_state == MachineState.START:
            self._curr_byte_pos = 0
            self._curr_char_len = self._model['char_len_table'][byte_class]
        # from byte's class and state_table, we get its next state
@ -83,3 +82,7 @@ class CodingStateMachine(object):

    def get_coding_state_machine(self):
        return self._model['name']
+
+    @property
+    def language(self):
+        return self._model['language']
--- a/lib/chardet/compat.py
+++ b/lib/chardet/compat.py
@ -27,17 +27,8 @@ if sys.version_info < (3, 0):
    PY3 = False
    base_str = (str, unicode)
    text_type = unicode
-    bin_type = str
 else:
    PY2 = False
    PY3 = True
    base_str = (bytes, str)
    text_type = str
-    bin_type = (bytes, bytearray)
-
-
-def wrap_ord(a):
-    if PY2 and isinstance(a, base_str):
-        return ord(a)
-    else:
-        return a
--- a/lib/chardet/cp949prober.py
+++ b/lib/chardet/cp949prober.py
@ -43,3 +43,7 @@ class CP949Prober(MultiByteCharSetProber):
    @property
    def charset_name(self):
        return "CP949"
+
+    @property
+    def language(self):
+        return "Korean"
--- a/lib/chardet/enums.py
+++ b/lib/chardet/enums.py
@ -9,9 +9,9 @@ class InputState(object):
    """
    This enum represents the different states a universal detector can be in.
    """
-    pure_ascii = 0
-    esc_ascii = 1
-    high_byte = 2
+    PURE_ASCII = 0
+    ESC_ASCII = 1
+    HIGH_BYTE = 2


 class LanguageFilter(object):
@ -19,29 +19,58 @@ class LanguageFilter(object):
    This enum represents the different language filters we can apply to a
    ``UniversalDetector``.
    """
-    chinese_simplified = 0x01
-    chinese_traditional = 0x02
-    japanese = 0x04
-    korean = 0x08
-    non_cjk = 0x10
-    all = 0x1F
-    chinese = chinese_simplified | chinese_traditional
-    cjk = chinese | japanese | korean
+    CHINESE_SIMPLIFIED = 0x01
+    CHINESE_TRADITIONAL = 0x02
+    JAPANESE = 0x04
+    KOREAN = 0x08
+    NON_CJK = 0x10
+    ALL = 0x1F
+    CHINESE = CHINESE_SIMPLIFIED | CHINESE_TRADITIONAL
+    CJK = CHINESE | JAPANESE | KOREAN


 class ProbingState(object):
    """
    This enum represents the different states a prober can be in.
    """
-    detecting = 0
-    found_it = 1
-    not_me = 2
+    DETECTING = 0
+    FOUND_IT = 1
+    NOT_ME = 2


 class MachineState(object):
    """
    This enum represents the different states a state machine can be in.
    """
-    start = 0
-    error = 1
-    its_me = 2
+    START = 0
+    ERROR = 1
+    ITS_ME = 2
+
+
+class SequenceLikelihood(object):
+    """
+    This enum represents the likelihood of a character following the previous one.
+    """
+    NEGATIVE = 0
+    UNLIKELY = 1
+    LIKELY = 2
+    POSITIVE = 3
+
+    @classmethod
+    def get_num_categories(cls):
+        """:returns: The number of likelihood categories in the enum."""
+        return 4
+
+
+class CharacterCategory(object):
+    """
+    This enum represents the different categories language models for
+    ``SingleByteCharsetProber`` put characters into.
+
+    Anything less than CONTROL is considered a letter.
+    """
+    UNDEFINED = 255
+    LINE_BREAK = 254
+    SYMBOL = 253
+    DIGIT = 252
+    CONTROL = 251
--- a/lib/chardet/escprober.py
+++ b/lib/chardet/escprober.py
@ -27,7 +27,6 @@

 from .charsetprober import CharSetProber
 from .codingstatemachine import CodingStateMachine
-from .compat import wrap_ord
 from .enums import LanguageFilter, ProbingState, MachineState
 from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
                    ISO2022KR_SM_MODEL)
@ -43,15 +42,16 @@ class EscCharSetProber(CharSetProber):
    def __init__(self, lang_filter=None):
        super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
        self.coding_sm = []
-        if self.lang_filter & LanguageFilter.chinese_simplified:
+        if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
            self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
            self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL))
-        if self.lang_filter & LanguageFilter.japanese:
+        if self.lang_filter & LanguageFilter.JAPANESE:
            self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
-        if self.lang_filter & LanguageFilter.korean:
+        if self.lang_filter & LanguageFilter.KOREAN:
            self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
        self.active_sm_count = None
        self._detected_charset = None
+        self._detected_language = None
        self._state = None
        self.reset()

@ -64,11 +64,16 @@ class EscCharSetProber(CharSetProber):
            coding_sm.reset()
        self.active_sm_count = len(self.coding_sm)
        self._detected_charset = None
+        self._detected_language = None

    @property
    def charset_name(self):
        return self._detected_charset

+    @property
+    def language(self):
+        return self._detected_language
+
    def get_confidence(self):
        if self._detected_charset:
            return 0.99
@ -80,16 +85,17 @@ class EscCharSetProber(CharSetProber):
            for coding_sm in self.coding_sm:
                if not coding_sm or not coding_sm.active:
                    continue
-                coding_state = coding_sm.next_state(wrap_ord(c))
-                if coding_state == MachineState.error:
+                coding_state = coding_sm.next_state(c)
+                if coding_state == MachineState.ERROR:
                    coding_sm.active = False
                    self.active_sm_count -= 1
                    if self.active_sm_count <= 0:
-                        self._state = ProbingState.not_me
+                        self._state = ProbingState.NOT_ME
                        return self.state
-                elif coding_state == MachineState.its_me:
-                    self._state = ProbingState.found_it
+                elif coding_state == MachineState.ITS_ME:
+                    self._state = ProbingState.FOUND_IT
                    self._detected_charset = coding_sm.get_coding_state_machine()
+                    self._detected_language = coding_sm.language
                    return self.state

        return self.state
--- a/lib/chardet/escsm.py
+++ b/lib/chardet/escsm.py
@ -63,12 +63,12 @@ HZ_CLS = (
 )

 HZ_ST = (
-MachineState.start,MachineState.error,     3,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,# 00-07
-MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 08-0f
-MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start,     4,MachineState.error,# 10-17
-     5,MachineState.error,     6,MachineState.error,     5,     5,     4,MachineState.error,# 18-1f
-     4,MachineState.error,     4,     4,     4,MachineState.error,     4,MachineState.error,# 20-27
-     4,MachineState.its_me,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 28-2f
+MachineState.START,MachineState.ERROR,     3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
+MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,     4,MachineState.ERROR,# 10-17
+     5,MachineState.ERROR,     6,MachineState.ERROR,     5,     5,     4,MachineState.ERROR,# 18-1f
+     4,MachineState.ERROR,     4,     4,     4,MachineState.ERROR,     4,MachineState.ERROR,# 20-27
+     4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f
 )

 HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
@ -77,7 +77,8 @@ HZ_SM_MODEL = {'class_table': HZ_CLS,
               'class_factor': 6,
               'state_table': HZ_ST,
               'char_len_table': HZ_CHAR_LEN_TABLE,
-               'name': "HZ-GB-2312"}
+               'name': "HZ-GB-2312",
+               'language': 'Chinese'}

 ISO2022CN_CLS = (
 2,0,0,0,0,0,0,0,  # 00 - 07
@ -115,14 +116,14 @@ ISO2022CN_CLS = (
 )

 ISO2022CN_ST = (
-MachineState.start,     3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 00-07
-MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 08-0f
-MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 10-17
-MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,     4,MachineState.error,# 18-1f
-MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 20-27
-     5,     6,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 28-2f
-MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 30-37
-MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.start,# 38-3f
+MachineState.START,     3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
+MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
+MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
+MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,     4,MachineState.ERROR,# 18-1f
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27
+     5,     6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f
 )

 ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
@ -131,7 +132,8 @@ ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS,
                      'class_factor': 9,
                      'state_table': ISO2022CN_ST,
                      'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
-                      'name': "ISO-2022-CN"}
+                      'name': "ISO-2022-CN",
+                      'language': 'Chinese'}

 ISO2022JP_CLS = (
 2,0,0,0,0,0,0,0,  # 00 - 07
@ -169,15 +171,15 @@ ISO2022JP_CLS = (
 )

 ISO2022JP_ST = (
-MachineState.start,     3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 00-07
-MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 08-0f
-MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 10-17
-MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,# 18-1f
-MachineState.error,     5,MachineState.error,MachineState.error,MachineState.error,     4,MachineState.error,MachineState.error,# 20-27
-MachineState.error,MachineState.error,MachineState.error,     6,MachineState.its_me,MachineState.error,MachineState.its_me,MachineState.error,# 28-2f
-MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,# 30-37
-MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,MachineState.error,# 38-3f
-MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,MachineState.start,MachineState.start,# 40-47
+MachineState.START,     3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
+MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
+MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f
+MachineState.ERROR,     5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,     4,MachineState.ERROR,MachineState.ERROR,# 20-27
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,     6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47
 )

 ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
@ -186,7 +188,8 @@ ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS,
                      'class_factor': 10,
                      'state_table': ISO2022JP_ST,
                      'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
-                      'name': "ISO-2022-JP"}
+                      'name': "ISO-2022-JP",
+                      'language': 'Japanese'}

 ISO2022KR_CLS = (
 2,0,0,0,0,0,0,0,  # 00 - 07
@ -224,11 +227,11 @@ ISO2022KR_CLS = (
 )

 ISO2022KR_ST = (
-MachineState.start,     3,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,# 00-07
-MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,# 08-0f
-MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.error,     4,MachineState.error,MachineState.error,# 10-17
-MachineState.error,MachineState.error,MachineState.error,MachineState.error,     5,MachineState.error,MachineState.error,MachineState.error,# 18-1f
-MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.start,MachineState.start,MachineState.start,MachineState.start,# 20-27
+MachineState.START,     3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
+MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,     4,MachineState.ERROR,MachineState.ERROR,# 10-17
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,     5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f
+MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27
 )

 ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
@ -237,6 +240,7 @@ ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS,
                      'class_factor': 6,
                      'state_table': ISO2022KR_ST,
                      'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
-                      'name': "ISO-2022-KR"}
+                      'name': "ISO-2022-KR",
+                      'language': 'Korean'}


--- a/lib/chardet/eucjpprober.py
+++ b/lib/chardet/eucjpprober.py
@ -49,19 +49,23 @@ class EUCJPProber(MultiByteCharSetProber):
    def charset_name(self):
        return "EUC-JP"

+    @property
+    def language(self):
+        return "Japanese"
+
    def feed(self, byte_str):
        for i in range(len(byte_str)):
            # PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
            coding_state = self.coding_sm.next_state(byte_str[i])
-            if coding_state == MachineState.error:
-                self.logger.debug('%s prober hit error at byte %s',
-                                  self.charset_name, i)
-                self._state = ProbingState.not_me
+            if coding_state == MachineState.ERROR:
+                self.logger.debug('%s %s prober hit error at byte %s',
+                                  self.charset_name, self.language, i)
+                self._state = ProbingState.NOT_ME
                break
-            elif coding_state == MachineState.its_me:
-                self._state = ProbingState.found_it
+            elif coding_state == MachineState.ITS_ME:
+                self._state = ProbingState.FOUND_IT
                break
-            elif coding_state == MachineState.start:
+            elif coding_state == MachineState.START:
                char_len = self.coding_sm.get_current_charlen()
                if i == 0:
                    self._last_char[1] = byte_str[0]
@ -75,10 +79,10 @@ class EUCJPProber(MultiByteCharSetProber):

        self._last_char[0] = byte_str[-1]

-        if self.state == ProbingState.detecting:
+        if self.state == ProbingState.DETECTING:
            if (self.context_analyzer.got_enough_data() and
               (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
-                self._state = ProbingState.found_it
+                self._state = ProbingState.FOUND_IT

        return self.state

--- a/lib/chardet/euckrprober.py
+++ b/lib/chardet/euckrprober.py
@ -41,3 +41,7 @@ class EUCKRProber(MultiByteCharSetProber):
    @property
    def charset_name(self):
        return "EUC-KR"
+
+    @property
+    def language(self):
+        return "Korean"
--- a/lib/chardet/euctwfreq.py
+++ b/lib/chardet/euctwfreq.py
@ -44,7 +44,7 @@
 EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75

 # Char to FreqOrder table ,
-EUCTW_TABLE_SIZE = 8102
+EUCTW_TABLE_SIZE = 5376

 EUCTW_CHAR_TO_FREQ_ORDER = (
   1,1800,1506, 255,1431, 198,   9,  82,   6,7310, 177, 202,3615,1256,2808, 110,  # 2742
--- a/lib/chardet/euctwprober.py
+++ b/lib/chardet/euctwprober.py
@ -40,3 +40,7 @@ class EUCTWProber(MultiByteCharSetProber):
    @property
    def charset_name(self):
        return "EUC-TW"
+
+    @property
+    def language(self):
+        return "Taiwan"
--- a/lib/chardet/gb2312prober.py
+++ b/lib/chardet/gb2312prober.py
@ -40,3 +40,7 @@ class GB2312Prober(MultiByteCharSetProber):
    @property
    def charset_name(self):
        return "GB2312"
+
+    @property
+    def language(self):
+        return "Chinese"
--- a/lib/chardet/hebrewprober.py
+++ b/lib/chardet/hebrewprober.py
@ -27,7 +27,6 @@

 from .charsetprober import CharSetProber
 from .enums import ProbingState
-from .compat import wrap_ord

 # This prober doesn't actually recognize a language or a charset.
 # It is a helper prober for the use of the Hebrew model probers
@ -177,8 +176,8 @@ class HebrewProber(CharSetProber):
        self._visual_prober = visualProber

    def is_final(self, c):
-        return wrap_ord(c) in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
-                               self.FINAL_PE, self.FINAL_TSADI]
+        return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
+                     self.FINAL_PE, self.FINAL_TSADI]

    def is_non_final(self, c):
        # The normal Tsadi is not a good Non-Final letter due to words like
@ -191,8 +190,8 @@ class HebrewProber(CharSetProber):
        # for example legally end with a Non-Final Pe or Kaf. However, the
        # benefit of these letters as Non-Final letters outweighs the damage
        # since these words are quite rare.
-        return wrap_ord(c) in [self.NORMAL_KAF, self.NORMAL_MEM,
-                               self.NORMAL_NUN, self.NORMAL_PE]
+        return c in [self.NORMAL_KAF, self.NORMAL_MEM,
+                     self.NORMAL_NUN, self.NORMAL_PE]

    def feed(self, byte_str):
        # Final letter analysis for logical-visual decision.
@ -221,9 +220,9 @@ class HebrewProber(CharSetProber):
        # We automatically filter out all 7-bit characters (replace them with
        # spaces) so the word boundary detection works properly. [MAP]

-        if self.state == ProbingState.not_me:
+        if self.state == ProbingState.NOT_ME:
            # Both model probers say it's not them. No reason to continue.
-            return ProbingState.not_me
+            return ProbingState.NOT_ME

        byte_str = self.filter_high_byte_only(byte_str)

@ -250,8 +249,8 @@ class HebrewProber(CharSetProber):
            self._prev = cur

        # Forever detecting, till the end or until both model probers return
-        # ProbingState.not_me (handled above)
-        return ProbingState.detecting
+        # ProbingState.NOT_ME (handled above)
+        return ProbingState.DETECTING

    @property
    def charset_name(self):
@ -280,10 +279,14 @@ class HebrewProber(CharSetProber):
        # Logical.
        return self.LOGICAL_HEBREW_NAME

+    @property
+    def language(self):
+        return 'Hebrew'
+
    @property
    def state(self):
        # Remain active as long as any of the model probers are active.
-        if (self._logical_prober.state == ProbingState.not_me) and \
-           (self._visual_prober.state == ProbingState.not_me):
-            return ProbingState.not_me
-        return ProbingState.detecting
+        if (self._logical_prober.state == ProbingState.NOT_ME) and \
+           (self._visual_prober.state == ProbingState.NOT_ME):
+            return ProbingState.NOT_ME
+        return ProbingState.DETECTING
--- a/lib/chardet/jpcntx.py
+++ b/lib/chardet/jpcntx.py
@ -25,7 +25,6 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from .compat import wrap_ord

 # This is hiragana 2-char sequence table, the number in each cell represents its frequency category
 jp2CharContext = (
@ -194,7 +193,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
        if not byte_str:
            return -1, 1
        # find out current char's byte length
-        first_char = wrap_ord(byte_str[0])
+        first_char = byte_str[0]
        if (0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC):
            char_len = 2
            if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
@ -204,7 +203,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):

        # return its order if it is hiragana
        if len(byte_str) > 1:
-            second_char = wrap_ord(byte_str[1])
+            second_char = byte_str[1]
            if (first_char == 202) and (0x9F <= second_char <= 0xF1):
                return second_char - 0x9F, char_len

@ -215,7 +214,7 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis):
        if not byte_str:
            return -1, 1
        # find out current char's byte length
-        first_char = wrap_ord(byte_str[0])
+        first_char = byte_str[0]
        if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
            char_len = 2
        elif first_char == 0x8F:
@ -225,7 +224,7 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis):

        # return its order if it is hiragana
        if len(byte_str) > 1:
-            second_char = wrap_ord(byte_str[1])
+            second_char = byte_str[1]
            if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
                return second_char - 0xA1, char_len

--- a/lib/chardet/langbulgarianmodel.py
+++ b/lib/chardet/langbulgarianmodel.py
@ -214,7 +214,8 @@ Latin5BulgarianModel = {
  'precedence_matrix': BulgarianLangModel,
  'typical_positive_ratio': 0.969392,
  'keep_english_letter': False,
-  'charset_name': "ISO-8859-5"
+  'charset_name': "ISO-8859-5",
+  'language': 'Bulgarian',
 }

 Win1251BulgarianModel = {
@ -222,8 +223,6 @@ Win1251BulgarianModel = {
  'precedence_matrix': BulgarianLangModel,
  'typical_positive_ratio': 0.969392,
  'keep_english_letter': False,
-  'charset_name': "windows-1251"
+  'charset_name': "windows-1251",
+  'language': 'Bulgarian',
 }
-
-
-
--- a/lib/chardet/langcyrillicmodel.py
+++ b/lib/chardet/langcyrillicmodel.py
@ -283,7 +283,8 @@ Koi8rModel = {
  'precedence_matrix': RussianLangModel,
  'typical_positive_ratio': 0.976601,
  'keep_english_letter': False,
-  'charset_name': "KOI8-R"
+  'charset_name': "KOI8-R",
+  'language': 'Russian',
 }

 Win1251CyrillicModel = {
@ -291,7 +292,8 @@ Win1251CyrillicModel = {
  'precedence_matrix': RussianLangModel,
  'typical_positive_ratio': 0.976601,
  'keep_english_letter': False,
-  'charset_name': "windows-1251"
+  'charset_name': "windows-1251",
+  'language': 'Russian',
 }

 Latin5CyrillicModel = {
@ -299,7 +301,8 @@ Latin5CyrillicModel = {
  'precedence_matrix': RussianLangModel,
  'typical_positive_ratio': 0.976601,
  'keep_english_letter': False,
-  'charset_name': "ISO-8859-5"
+  'charset_name': "ISO-8859-5",
+  'language': 'Russian',
 }

 MacCyrillicModel = {
@ -307,7 +310,8 @@ MacCyrillicModel = {
  'precedence_matrix': RussianLangModel,
  'typical_positive_ratio': 0.976601,
  'keep_english_letter': False,
-  'charset_name': "MacCyrillic"
+  'charset_name': "MacCyrillic",
+  'language': 'Russian',
 }

 Ibm866Model = {
@ -315,7 +319,8 @@ Ibm866Model = {
  'precedence_matrix': RussianLangModel,
  'typical_positive_ratio': 0.976601,
  'keep_english_letter': False,
-  'charset_name': "IBM866"
+  'charset_name': "IBM866",
+  'language': 'Russian',
 }

 Ibm855Model = {
@ -323,7 +328,6 @@ Ibm855Model = {
  'precedence_matrix': RussianLangModel,
  'typical_positive_ratio': 0.976601,
  'keep_english_letter': False,
-  'charset_name': "IBM855"
+  'charset_name': "IBM855",
+  'language': 'Russian',
 }
-
-
--- a/lib/chardet/langgreekmodel.py
+++ b/lib/chardet/langgreekmodel.py
@ -211,7 +211,8 @@ Latin7GreekModel = {
  'precedence_matrix': GreekLangModel,
  'typical_positive_ratio': 0.982851,
  'keep_english_letter': False,
-  'charset_name': "ISO-8859-7"
+  'charset_name': "ISO-8859-7",
+  'language': 'Greek',
 }

 Win1253GreekModel = {
@ -219,7 +220,6 @@ Win1253GreekModel = {
  'precedence_matrix': GreekLangModel,
  'typical_positive_ratio': 0.982851,
  'keep_english_letter': False,
-  'charset_name': "windows-1253"
+  'charset_name': "windows-1253",
+  'language': 'Greek',
 }
-
-
--- a/lib/chardet/langhebrewmodel.py
+++ b/lib/chardet/langhebrewmodel.py
@ -195,7 +195,6 @@ Win1255HebrewModel = {
  'precedence_matrix': HEBREW_LANG_MODEL,
  'typical_positive_ratio': 0.984004,
  'keep_english_letter': False,
-  'charset_name': "windows-1255"
+  'charset_name': "windows-1255",
+  'language': 'Hebrew',
 }
-
-
--- a/lib/chardet/langhungarianmodel.py
+++ b/lib/chardet/langhungarianmodel.py
@ -211,7 +211,8 @@ Latin2HungarianModel = {
  'precedence_matrix': HungarianLangModel,
  'typical_positive_ratio': 0.947368,
  'keep_english_letter': True,
-  'charset_name': "ISO-8859-2"
+  'charset_name': "ISO-8859-2",
+  'language': 'Hungarian',
 }

 Win1250HungarianModel = {
@ -219,7 +220,6 @@ Win1250HungarianModel = {
  'precedence_matrix': HungarianLangModel,
  'typical_positive_ratio': 0.947368,
  'keep_english_letter': True,
-  'charset_name': "windows-1250"
+  'charset_name': "windows-1250",
+  'language': 'Hungarian',
 }
-
-
--- a/lib/chardet/langthaimodel.py
+++ b/lib/chardet/langthaimodel.py
@ -194,7 +194,6 @@ TIS620ThaiModel = {
  'precedence_matrix': ThaiLangModel,
  'typical_positive_ratio': 0.926386,
  'keep_english_letter': False,
-  'charset_name': "TIS-620"
+  'charset_name': "TIS-620",
+  'language': 'Thai',
 }
-
-
--- a/lib/chardet/langturkishmodel.py
+++ b/lib/chardet/langturkishmodel.py
@ -188,5 +188,6 @@ Latin5TurkishModel = {
  'precedence_matrix': TurkishLangModel,
  'typical_positive_ratio': 0.970290,
  'keep_english_letter': True,
-  'charset_name': "ISO-8859-9"
+  'charset_name': "ISO-8859-9",
+  'language': 'Turkish',
 }
--- a/lib/chardet/latin1prober.py
+++ b/lib/chardet/latin1prober.py
@ -27,7 +27,6 @@
 ######################### END LICENSE BLOCK #########################

 from .charsetprober import CharSetProber
-from .compat import wrap_ord
 from .enums import ProbingState

 FREQ_CAT_NUM = 4
@ -108,16 +107,20 @@ class Latin1Prober(CharSetProber):

    @property
    def charset_name(self):
-        return "windows-1252"
+        return "ISO-8859-1"
+
+    @property
+    def language(self):
+        return ""

    def feed(self, byte_str):
        byte_str = self.filter_with_english_letters(byte_str)
        for c in byte_str:
-            char_class = Latin1_CharToClass[wrap_ord(c)]
+            char_class = Latin1_CharToClass[c]
            freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
                                    + char_class]
            if freq == 0:
-                self._state = ProbingState.not_me
+                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[freq] += 1
            self._last_char_class = char_class
@ -125,7 +128,7 @@ class Latin1Prober(CharSetProber):
        return self.state

    def get_confidence(self):
-        if self.state == ProbingState.not_me:
+        if self.state == ProbingState.NOT_ME:
            return 0.01

        total = sum(self._freq_counter)
--- a/lib/chardet/mbcharsetprober.py
+++ b/lib/chardet/mbcharsetprober.py
@ -52,34 +52,38 @@ class MultiByteCharSetProber(CharSetProber):

    @property
    def charset_name(self):
-        pass
+        raise NotImplementedError
+
+    @property
+    def language(self):
+        raise NotImplementedError

    def feed(self, byte_str):
        for i in range(len(byte_str)):
            coding_state = self.coding_sm.next_state(byte_str[i])
-            if coding_state == MachineState.error:
-                self.logger.debug('%s prober hit error at byte %s',
-                                  self.charset_name, i)
-                self._state = ProbingState.not_me
+            if coding_state == MachineState.ERROR:
+                self.logger.debug('%s %s prober hit error at byte %s',
+                                  self.charset_name, self.language, i)
+                self._state = ProbingState.NOT_ME
                break
-            elif coding_state == MachineState.its_me:
-                self._state = ProbingState.found_it
+            elif coding_state == MachineState.ITS_ME:
+                self._state = ProbingState.FOUND_IT
                break
-            elif coding_state == MachineState.start:
+            elif coding_state == MachineState.START:
                char_len = self.coding_sm.get_current_charlen()
                if i == 0:
                    self._last_char[1] = byte_str[0]
                    self.distribution_analyzer.feed(self._last_char, char_len)
                else:
                    self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
-                                                     char_len)
+                                                    char_len)

        self._last_char[0] = byte_str[-1]

-        if self.state == ProbingState.detecting:
+        if self.state == ProbingState.DETECTING:
            if (self.distribution_analyzer.got_enough_data() and
                    (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
-                self._state = ProbingState.found_it
+                self._state = ProbingState.FOUND_IT

        return self.state

--- a/lib/chardet/mbcssm.py
+++ b/lib/chardet/mbcssm.py
@ -65,9 +65,9 @@ BIG5_CLS = (
 )

 BIG5_ST = (
-    MachineState.error,MachineState.start,MachineState.start,     3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07
-    MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,#08-0f
-    MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start#10-17
+    MachineState.ERROR,MachineState.START,MachineState.START,     3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
+    MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f
+    MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17
 )

 BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
@ -101,13 +101,13 @@ CP949_CLS  = (

 CP949_ST = (
 #cls=    0      1      2      3      4      5      6      7      8      9  # previous state =
-    MachineState.error,MachineState.start,     3,MachineState.error,MachineState.start,MachineState.start,     4,     5,MachineState.error,     6, # MachineState.start
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error, # MachineState.error
-    MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me, # MachineState.its_me
-    MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start, # 3
-    MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, # 4
-    MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start, # 5
-    MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start, # 6
+    MachineState.ERROR,MachineState.START,     3,MachineState.ERROR,MachineState.START,MachineState.START,     4,     5,MachineState.ERROR,     6, # MachineState.START
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, # MachineState.ERROR
+    MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # MachineState.ITS_ME
+    MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 3
+    MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 4
+    MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5
+    MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6
 )

 CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
@ -156,11 +156,11 @@ EUCJP_CLS = (
 )

 EUCJP_ST = (
-          3,     4,     3,     5,MachineState.start,MachineState.error,MachineState.error,MachineState.error,#00-07
-     MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f
-     MachineState.its_me,MachineState.its_me,MachineState.start,MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error,#10-17
-     MachineState.error,MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error,     3,MachineState.error,#18-1f
-          3,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start#20-27
+          3,     4,     3,     5,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
+     MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
+     MachineState.ITS_ME,MachineState.ITS_ME,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17
+     MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,     3,MachineState.ERROR,#18-1f
+          3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27
 )

 EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
@ -209,8 +209,8 @@ EUCKR_CLS  = (
 )

 EUCKR_ST = (
-    MachineState.error,MachineState.start,     3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07
-    MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start #08-0f
+    MachineState.ERROR,MachineState.START,     3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
+    MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f
 )

 EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
@ -259,12 +259,12 @@ EUCTW_CLS = (
 )

 EUCTW_ST = (
-    MachineState.error,MachineState.error,MachineState.start,     3,     3,     3,     4,MachineState.error,#00-07
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,#08-0f
-    MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.start,MachineState.error,#10-17
-    MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f
-         5,MachineState.error,MachineState.error,MachineState.error,MachineState.start,MachineState.error,MachineState.start,MachineState.start,#20-27
-    MachineState.start,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start #28-2f
+    MachineState.ERROR,MachineState.ERROR,MachineState.START,     3,     3,     3,     4,MachineState.ERROR,#00-07
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
+    MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.ERROR,#10-17
+    MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
+         5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27
+    MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
 )

 EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
@ -313,12 +313,12 @@ GB2312_CLS = (
 )

 GB2312_ST = (
-    MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,     3,MachineState.error,#00-07
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,#08-0f
-    MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,#10-17
-         4,MachineState.error,MachineState.start,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f
-    MachineState.error,MachineState.error,     5,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.error,#20-27
-    MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.start #28-2f
+    MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,     3,MachineState.ERROR,#00-07
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
+    MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,#10-17
+         4,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
+    MachineState.ERROR,MachineState.ERROR,     5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27
+    MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
 )

 # To be accurate, the length of class 6 can be either 2 or 4.
@ -374,9 +374,9 @@ SJIS_CLS = (


 SJIS_ST = (
-    MachineState.error,MachineState.start,MachineState.start,     3,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#00-07
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f
-    MachineState.its_me,MachineState.its_me,MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start #10-17
+    MachineState.ERROR,MachineState.START,MachineState.START,     3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
+    MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17
 )

 SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
@ -425,13 +425,13 @@ UCS2BE_CLS = (
 )

 UCS2BE_ST  = (
-          5,     7,     7,MachineState.error,     4,     3,MachineState.error,MachineState.error,#00-07
-     MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f
-     MachineState.its_me,MachineState.its_me,     6,     6,     6,     6,MachineState.error,MachineState.error,#10-17
-          6,     6,     6,     6,     6,MachineState.its_me,     6,     6,#18-1f
-          6,     6,     6,     6,     5,     7,     7,MachineState.error,#20-27
-          5,     8,     6,     6,MachineState.error,     6,     6,     6,#28-2f
-          6,     6,     6,     6,MachineState.error,MachineState.error,MachineState.start,MachineState.start #30-37
+          5,     7,     7,MachineState.ERROR,     4,     3,MachineState.ERROR,MachineState.ERROR,#00-07
+     MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
+     MachineState.ITS_ME,MachineState.ITS_ME,     6,     6,     6,     6,MachineState.ERROR,MachineState.ERROR,#10-17
+          6,     6,     6,     6,     6,MachineState.ITS_ME,     6,     6,#18-1f
+          6,     6,     6,     6,     5,     7,     7,MachineState.ERROR,#20-27
+          5,     8,     6,     6,MachineState.ERROR,     6,     6,     6,#28-2f
+          6,     6,     6,     6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37
 )

 UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
@ -480,13 +480,13 @@ UCS2LE_CLS = (
 )

 UCS2LE_ST = (
-          6,     6,     7,     6,     4,     3,MachineState.error,MachineState.error,#00-07
-     MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#08-0f
-     MachineState.its_me,MachineState.its_me,     5,     5,     5,MachineState.error,MachineState.its_me,MachineState.error,#10-17
-          5,     5,     5,MachineState.error,     5,MachineState.error,     6,     6,#18-1f
-          7,     6,     8,     8,     5,     5,     5,MachineState.error,#20-27
-          5,     5,     5,MachineState.error,MachineState.error,MachineState.error,     5,     5,#28-2f
-          5,     5,     5,MachineState.error,     5,MachineState.error,MachineState.start,MachineState.start #30-37
+          6,     6,     7,     6,     4,     3,MachineState.ERROR,MachineState.ERROR,#00-07
+     MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
+     MachineState.ITS_ME,MachineState.ITS_ME,     5,     5,     5,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#10-17
+          5,     5,     5,MachineState.ERROR,     5,MachineState.ERROR,     6,     6,#18-1f
+          7,     6,     8,     8,     5,     5,     5,MachineState.ERROR,#20-27
+          5,     5,     5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,     5,     5,#28-2f
+          5,     5,     5,MachineState.ERROR,     5,MachineState.ERROR,MachineState.START,MachineState.START #30-37
 )

 UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
@ -535,32 +535,32 @@ UTF8_CLS = (
 )

 UTF8_ST = (
-    MachineState.error,MachineState.start,MachineState.error,MachineState.error,MachineState.error,MachineState.error,     12,   10,#00-07
+    MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,     12,   10,#00-07
         9,     11,     8,     7,     6,     5,     4,    3,#08-0f
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#10-17
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#18-1f
-    MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#20-27
-    MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,MachineState.its_me,#28-2f
-    MachineState.error,MachineState.error,     5,     5,     5,     5,MachineState.error,MachineState.error,#30-37
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#38-3f
-    MachineState.error,MachineState.error,MachineState.error,     5,     5,     5,MachineState.error,MachineState.error,#40-47
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#48-4f
-    MachineState.error,MachineState.error,     7,     7,     7,     7,MachineState.error,MachineState.error,#50-57
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#58-5f
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,     7,     7,MachineState.error,MachineState.error,#60-67
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#68-6f
-    MachineState.error,MachineState.error,     9,     9,     9,     9,MachineState.error,MachineState.error,#70-77
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#78-7f
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,     9,MachineState.error,MachineState.error,#80-87
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#88-8f
-    MachineState.error,MachineState.error,    12,    12,    12,    12,MachineState.error,MachineState.error,#90-97
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#98-9f
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,    12,MachineState.error,MachineState.error,#a0-a7
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#a8-af
-    MachineState.error,MachineState.error,    12,    12,    12,MachineState.error,MachineState.error,MachineState.error,#b0-b7
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,#b8-bf
-    MachineState.error,MachineState.error,MachineState.start,MachineState.start,MachineState.start,MachineState.start,MachineState.error,MachineState.error,#c0-c7
-    MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error,MachineState.error #c8-cf
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
+    MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#20-27
+    MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#28-2f
+    MachineState.ERROR,MachineState.ERROR,     5,     5,     5,     5,MachineState.ERROR,MachineState.ERROR,#30-37
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#38-3f
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,     5,     5,     5,MachineState.ERROR,MachineState.ERROR,#40-47
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#48-4f
+    MachineState.ERROR,MachineState.ERROR,     7,     7,     7,     7,MachineState.ERROR,MachineState.ERROR,#50-57
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#58-5f
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,     7,     7,MachineState.ERROR,MachineState.ERROR,#60-67
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#68-6f
+    MachineState.ERROR,MachineState.ERROR,     9,     9,     9,     9,MachineState.ERROR,MachineState.ERROR,#70-77
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#78-7f
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,     9,MachineState.ERROR,MachineState.ERROR,#80-87
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#88-8f
+    MachineState.ERROR,MachineState.ERROR,    12,    12,    12,    12,MachineState.ERROR,MachineState.ERROR,#90-97
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#98-9f
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,    12,MachineState.ERROR,MachineState.ERROR,#a0-a7
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#a8-af
+    MachineState.ERROR,MachineState.ERROR,    12,    12,    12,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b0-b7
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b8-bf
+    MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7
+    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf
 )

 UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
--- a/lib/chardet/sbcharsetprober.py
+++ b/lib/chardet/sbcharsetprober.py
@ -27,18 +27,14 @@
 ######################### END LICENSE BLOCK #########################

 from .charsetprober import CharSetProber
-from .compat import wrap_ord
-from .enums import ProbingState
+from .enums import CharacterCategory, ProbingState, SequenceLikelihood


 class SingleByteCharSetProber(CharSetProber):
    SAMPLE_SIZE = 64
-    SB_ENOUGH_REL_THRESHOLD = 1024
+    SB_ENOUGH_REL_THRESHOLD = 1024  #  0.25 * SAMPLE_SIZE^2
    POSITIVE_SHORTCUT_THRESHOLD = 0.95
    NEGATIVE_SHORTCUT_THRESHOLD = 0.05
-    SYMBOL_CAT_ORDER = 250
-    NUMBER_OF_SEQ_CAT = 4
-    POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1

    def __init__(self, model, reversed=False, name_prober=None):
        super(SingleByteCharSetProber, self).__init__()
@ -58,7 +54,7 @@ class SingleByteCharSetProber(CharSetProber):
        super(SingleByteCharSetProber, self).reset()
        # char order of last character
        self._last_order = 255
-        self._seq_counters = [0] * self.NUMBER_OF_SEQ_CAT
+        self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
        self._total_seqs = 0
        self._total_char = 0
        # characters that fall in our sampling range
@ -71,15 +67,29 @@ class SingleByteCharSetProber(CharSetProber):
        else:
            return self._model['charset_name']

+    @property
+    def language(self):
+        if self._name_prober:
+            return self._name_prober.language
+        else:
+            return self._model.get('language')
+
    def feed(self, byte_str):
        if not self._model['keep_english_letter']:
            byte_str = self.filter_international_words(byte_str)
-        num_bytes = len(byte_str)
-        if not num_bytes:
+        if not byte_str:
            return self.state
-        for c in byte_str:
-            order = self._model['char_to_order_map'][wrap_ord(c)]
-            if order < self.SYMBOL_CAT_ORDER:
+        char_to_order_map = self._model['char_to_order_map']
+        for i, c in enumerate(byte_str):
+            # XXX: Order is in range 1-64, so one would think we want 0-63 here,
+            #      but that leads to 27 more test failures than before.
+            order = char_to_order_map[c]
+            # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
+            #      CharacterCategory.SYMBOL is actually 253, so we use CONTROL
+            #      to make it closer to the original intent. The only difference
+            #      is whether or not we count digits and control characters for
+            #      _total_char purposes.
+            if order < CharacterCategory.CONTROL:
                self._total_char += 1
            if order < self.SAMPLE_SIZE:
                self._freq_char += 1
@ -94,27 +104,28 @@ class SingleByteCharSetProber(CharSetProber):
                    self._seq_counters[model] += 1
            self._last_order = order

-        if self.state == ProbingState.detecting:
+        charset_name = self._model['charset_name']
+        if self.state == ProbingState.DETECTING:
            if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
-                cf = self.get_confidence()
-                if cf > self.POSITIVE_SHORTCUT_THRESHOLD:
+                confidence = self.get_confidence()
+                if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
                    self.logger.debug('%s confidence = %s, we have a winner',
-                                      self._model['charset_name'], cf)
-                    self._state = ProbingState.found_it
-                elif cf < self.NEGATIVE_SHORTCUT_THRESHOLD:
+                                      charset_name, confidence)
+                    self._state = ProbingState.FOUND_IT
+                elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
                    self.logger.debug('%s confidence = %s, below negative '
-                                      'shortcut threshold %s',
-                                      self._model['charset_name'], cf,
+                                      'shortcut threshold %s', charset_name,
+                                      confidence,
                                      self.NEGATIVE_SHORTCUT_THRESHOLD)
-                    self._state = ProbingState.not_me
+                    self._state = ProbingState.NOT_ME

        return self.state

    def get_confidence(self):
        r = 0.01
        if self._total_seqs > 0:
-            r = ((1.0 * self._seq_counters[self.POSITIVE_CAT]) / self._total_seqs
-                 / self._model['typical_positive_ratio'])
+            r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
+                 self._total_seqs / self._model['typical_positive_ratio'])
            r = r * self._freq_char / self._total_char
            if r >= 1.0:
                r = 0.99
--- a/lib/chardet/sjisprober.py
+++ b/lib/chardet/sjisprober.py
@ -49,36 +49,40 @@ class SJISProber(MultiByteCharSetProber):
    def charset_name(self):
        return self.context_analyzer.charset_name

+    @property
+    def language(self):
+        return "Japanese"
+
    def feed(self, byte_str):
        for i in range(len(byte_str)):
            coding_state = self.coding_sm.next_state(byte_str[i])
-            if coding_state == MachineState.error:
-                self.logger.debug('%s prober hit error at byte %s',
-                                  self.charset_name, i)
-                self._state = ProbingState.not_me
+            if coding_state == MachineState.ERROR:
+                self.logger.debug('%s %s prober hit error at byte %s',
+                                  self.charset_name, self.language, i)
+                self._state = ProbingState.NOT_ME
                break
-            elif coding_state == MachineState.its_me:
-                self._state = ProbingState.found_it
+            elif coding_state == MachineState.ITS_ME:
+                self._state = ProbingState.FOUND_IT
                break
-            elif coding_state == MachineState.start:
+            elif coding_state == MachineState.START:
                char_len = self.coding_sm.get_current_charlen()
                if i == 0:
                    self._last_char[1] = byte_str[0]
                    self.context_analyzer.feed(self._last_char[2 - char_len:],
-                                                char_len)
+                                               char_len)
                    self.distribution_analyzer.feed(self._last_char, char_len)
                else:
                    self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3
-                                                     - char_len], char_len)
+                                                        - char_len], char_len)
                    self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
-                                                     char_len)
+                                                    char_len)

        self._last_char[0] = byte_str[-1]

-        if self.state == ProbingState.detecting:
+        if self.state == ProbingState.DETECTING:
            if (self.context_analyzer.got_enough_data() and
               (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
-                self._state = ProbingState.found_it
+                self._state = ProbingState.FOUND_IT

        return self.state

--- a/lib/chardet/universaldetector.py
+++ b/lib/chardet/universaldetector.py
@ -40,6 +40,7 @@ import codecs
 import logging
 import re

+from .charsetgroupprober import CharSetGroupProber
 from .enums import InputState, LanguageFilter, ProbingState
 from .escprober import EscCharSetProber
 from .latin1prober import Latin1Prober
@ -67,8 +68,17 @@ class UniversalDetector(object):
    MINIMUM_THRESHOLD = 0.20
    HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
    ESC_DETECTOR = re.compile(b'(\033|~{)')
+    WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
+    ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
+                   'iso-8859-2': 'Windows-1250',
+                   'iso-8859-5': 'Windows-1251',
+                   'iso-8859-6': 'Windows-1256',
+                   'iso-8859-7': 'Windows-1253',
+                   'iso-8859-8': 'Windows-1255',
+                   'iso-8859-9': 'Windows-1254',
+                   'iso-8859-13': 'Windows-1257'}

-    def __init__(self, lang_filter=LanguageFilter.all):
+    def __init__(self, lang_filter=LanguageFilter.ALL):
        self._esc_charset_prober = None
        self._charset_probers = []
        self.result = None
@ -78,6 +88,7 @@ class UniversalDetector(object):
        self._last_char = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
+        self._has_win_bytes = None
        self.reset()

    def reset(self):
@ -86,10 +97,11 @@ class UniversalDetector(object):
        initial states.  This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        """
-        self.result = {'encoding': None, 'confidence': 0.0}
+        self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
        self.done = False
        self._got_data = False
-        self._input_state = InputState.pure_ascii
+        self._has_win_bytes = False
+        self._input_state = InputState.PURE_ASCII
        self._last_char = b''
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
@ -116,28 +128,40 @@ class UniversalDetector(object):
        if not len(byte_str):
            return

+        if not isinstance(byte_str, bytearray):
+            byte_str = bytearray(byte_str)
+
        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF
            if byte_str.startswith(codecs.BOM_UTF8):
                # EF BB BF  UTF-8 with BOM
-                self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
-            elif byte_str.startswith(codecs.BOM_UTF32_LE) or byte_str.startswith(codecs.BOM_UTF32_BE):
+                self.result = {'encoding': "UTF-8-SIG",
+                               'confidence': 1.0,
+                               'language': ''}
+            elif byte_str.startswith((codecs.BOM_UTF32_LE,
+                                      codecs.BOM_UTF32_BE)):
                # FF FE 00 00  UTF-32, little-endian BOM
                # 00 00 FE FF  UTF-32, big-endian BOM
-                self.result = {'encoding': "UTF-32", 'confidence': 1.0}
+                self.result = {'encoding': "UTF-32",
+                               'confidence': 1.0,
+                               'language': ''}
            elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
-                               'confidence': 1.0}
+                               'confidence': 1.0,
+                               'language': ''}
            elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
-                               'confidence': 1.0}
-            elif byte_str.startswith(codecs.BOM_LE) or byte_str.startswith(codecs.BOM_BE):
+                               'confidence': 1.0,
+                               'language': ''}
+            elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                # FF FE  UTF-16, little endian BOM
                # FE FF  UTF-16, big endian BOM
-                self.result = {'encoding': "UTF-16", 'confidence': 1.0}
+                self.result = {'encoding': "UTF-16",
+                               'confidence': 1.0,
+                               'language': ''}

            self._got_data = True
            if self.result['encoding'] is not None:
@ -146,12 +170,12 @@ class UniversalDetector(object):

        # If none of those matched and we've only see ASCII so far, check
        # for high bytes and escape sequences
-        if self._input_state == InputState.pure_ascii:
+        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
-                self._input_state = InputState.high_byte
-            elif self._input_state == InputState.pure_ascii and \
+                self._input_state = InputState.HIGH_BYTE
+            elif self._input_state == InputState.PURE_ASCII and \
                    self.ESC_DETECTOR.search(self._last_char + byte_str):
-                self._input_state = InputState.esc_ascii
+                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

@ -159,14 +183,16 @@ class UniversalDetector(object):
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
-        if self._input_state == InputState.esc_ascii:
+        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
-            if self._esc_charset_prober.feed(byte_str) == ProbingState.found_it:
+            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {'encoding':
                               self._esc_charset_prober.charset_name,
                               'confidence':
-                               self._esc_charset_prober.get_confidence()}
+                               self._esc_charset_prober.get_confidence(),
+                               'language':
+                               self._esc_charset_prober.language}
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
@ -174,59 +200,87 @@ class UniversalDetector(object):
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
-        elif self._input_state == InputState.high_byte:
+        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
-                if self.lang_filter & LanguageFilter.non_cjk:
+                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
            for prober in self._charset_probers:
-                if prober.feed(byte_str) == ProbingState.found_it:
+                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {'encoding': prober.charset_name,
-                                   'confidence': prober.get_confidence()}
+                                   'confidence': prober.get_confidence(),
+                                   'language': prober.language}
                    self.done = True
                    break
+            if self.WIN_BYTE_DETECTOR.search(byte_str):
+                self._has_win_bytes = True

    def close(self):
        """
        Stop analyzing the current document and come up with a final
        prediction.

-        :returns:  The ``result`` attribute if a prediction was made, otherwise
-                   ``None``.
+        :returns:  The ``result`` attribute, a ``dict`` with the keys
+                   `encoding`, `confidence`, and `language`.
        """
+        # Don't bother with checks if we're already done
        if self.done:
            return self.result
-        if not self._got_data:
-            self.logger.debug('no data received!')
-            return
        self.done = True

-        if self._input_state in (InputState.pure_ascii, InputState.esc_ascii):
-            self.result = {'encoding': 'ascii', 'confidence': 1.0}
-            return self.result
+        if not self._got_data:
+            self.logger.debug('no data received!')

-        if self._input_state == InputState.high_byte:
-            proberConfidence = None
+        # Default to ASCII if it is all we've seen so far
+        elif self._input_state == InputState.PURE_ASCII:
+            self.result = {'encoding': 'ascii',
+                           'confidence': 1.0,
+                           'language': ''}
+
+        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
+        elif self._input_state == InputState.HIGH_BYTE:
+            prober_confidence = None
            max_prober_confidence = 0.0
            max_prober = None
            for prober in self._charset_probers:
                if not prober:
                    continue
-                proberConfidence = prober.get_confidence()
-                if proberConfidence > max_prober_confidence:
-                    max_prober_confidence = proberConfidence
+                prober_confidence = prober.get_confidence()
+                if prober_confidence > max_prober_confidence:
+                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
-                self.result = {'encoding': max_prober.charset_name,
-                               'confidence': max_prober.get_confidence()}
-                return self.result
+                charset_name = max_prober.charset_name
+                lower_charset_name = max_prober.charset_name.lower()
+                confidence = max_prober.get_confidence()
+                # Use Windows encoding name instead of ISO-8859 if we saw any
+                # extra Windows-specific bytes
+                if lower_charset_name.startswith('iso-8859'):
+                    if self._has_win_bytes:
+                        charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
+                                                            charset_name)
+                self.result = {'encoding': charset_name,
+                               'confidence': confidence,
+                               'language': max_prober.language}

+        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if self.logger.getEffectiveLevel() == logging.DEBUG:
-            self.logger.debug('no probers hit minimum threshold')
-            for prober in self._charset_probers[0].probers:
-                if not prober:
-                    continue
-                self.logger.debug('%s confidence = %s', prober.charset_name,
-                                  prober.get_confidence())
+            if self.result['encoding'] is None:
+                self.logger.debug('no probers hit minimum threshold')
+                for group_prober in self._charset_probers:
+                    if not group_prober:
+                        continue
+                    if isinstance(group_prober, CharSetGroupProber):
+                        for prober in group_prober.probers:
+                            self.logger.debug('%s %s confidence = %s',
+                                              prober.charset_name,
+                                              prober.language,
+                                              prober.get_confidence())
+                    else:
+                        self.logger.debug('%s %s confidence = %s',
+                                          group_prober.charset_name,
+                                          group_prober.language,
+                                          group_prober.get_confidence())
+        return self.result
--- a/lib/chardet/utf8prober.py
+++ b/lib/chardet/utf8prober.py
@ -50,22 +50,26 @@ class UTF8Prober(CharSetProber):
    def charset_name(self):
        return "utf-8"

+    @property
+    def language(self):
+        return ""
+
    def feed(self, byte_str):
        for c in byte_str:
            coding_state = self.coding_sm.next_state(c)
-            if coding_state == MachineState.error:
-                self._state = ProbingState.not_me
+            if coding_state == MachineState.ERROR:
+                self._state = ProbingState.NOT_ME
                break
-            elif coding_state == MachineState.its_me:
-                self._state = ProbingState.found_it
+            elif coding_state == MachineState.ITS_ME:
+                self._state = ProbingState.FOUND_IT
                break
-            elif coding_state == MachineState.start:
+            elif coding_state == MachineState.START:
                if self.coding_sm.get_current_charlen() >= 2:
                    self._num_mb_chars += 1

-        if self.state == ProbingState.detecting:
+        if self.state == ProbingState.DETECTING:
            if self.get_confidence() > self.SHORTCUT_THRESHOLD:
-                self._state = ProbingState.found_it
+                self._state = ProbingState.FOUND_IT

        return self.state

--- a/lib/chardet/version.py
+++ b/lib/chardet/version.py
@ -2,8 +2,8 @@
 This module exists only to simplify retrieving the version number of chardet
 from within setup.py and from chardet subpackages.

-:author: Dan Blanchard (dblanchard@ets.org)
+:author: Dan Blanchard (dan.blanchard@gmail.com)
 """

-__version__ = "2.3.0"
+__version__ = "3.0.4"
 VERSION = __version__.split('.')