Merge branch 'feature/UpdateChardet' into dev

This commit is contained in:
JackDandy 2023-02-09 14:38:35 +00:00
commit eacfd57a85
49 changed files with 9067 additions and 5845 deletions

View file

@ -6,6 +6,7 @@
* Remove lockfile no longer used by cachecontrol
* Update Msgpack 1.0.0 (fa7d744) to 1.0.4 (b5acfd5)
* Update certifi 2022.09.24 to 2022.12.07
* Update chardet packages 4.0.0 (b3d867a) to 5.1.0 (8087f00)
* Update diskcache 5.1.0 (40ce0de) to 5.4.0 (1cb1425)
* Update feedparser 6.0.1 (98d189fa) to 6.0.10 (5fcb3ae)
* Update humanize 3.5.0 (b6b0ea5) to 4.0.0 (a1514eb)

View file

@ -15,68 +15,101 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import List, Union
from .universaldetector import UniversalDetector
from .charsetgroupprober import CharSetGroupProber
from .charsetprober import CharSetProber
from .enums import InputState
from .version import __version__, VERSION
from .resultdict import ResultDict
from .universaldetector import UniversalDetector
from .version import VERSION, __version__
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']
def detect(byte_str):
def detect(
byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
) -> ResultDict:
"""
Detect the encoding of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param should_rename_legacy: Should we rename legacy encodings
to their more modern equivalents?
:type should_rename_legacy: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{0}'.format(type(byte_str)))
else:
byte_str = bytearray(byte_str)
detector = UniversalDetector()
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
detector.feed(byte_str)
return detector.close()
def detect_all(byte_str):
def detect_all(
byte_str: Union[bytes, bytearray],
ignore_threshold: bool = False,
should_rename_legacy: bool = False,
) -> List[ResultDict]:
"""
Detect all the possible encodings of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param ignore_threshold: Include encodings that are below
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
:param should_rename_legacy: Should we rename legacy encodings
to their more modern equivalents?
:type should_rename_legacy: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{0}'.format(type(byte_str)))
else:
byte_str = bytearray(byte_str)
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
detector.feed(byte_str)
detector.close()
if detector._input_state == InputState.HIGH_BYTE:
results = []
for prober in detector._charset_probers:
if prober.get_confidence() > detector.MINIMUM_THRESHOLD:
charset_name = prober.charset_name
lower_charset_name = prober.charset_name.lower()
if detector.input_state == InputState.HIGH_BYTE:
results: List[ResultDict] = []
probers: List[CharSetProber] = []
for prober in detector.charset_probers:
if isinstance(prober, CharSetGroupProber):
probers.extend(p for p in prober.probers)
else:
probers.append(prober)
for prober in probers:
if ignore_threshold or prober.get_confidence() > detector.MINIMUM_THRESHOLD:
charset_name = prober.charset_name or ""
lower_charset_name = charset_name.lower()
# Use Windows encoding name instead of ISO-8859 if we saw any
# extra Windows-specific bytes
if lower_charset_name.startswith('iso-8859'):
if detector._has_win_bytes:
charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
charset_name)
results.append({
'encoding': charset_name,
'confidence': prober.get_confidence()
})
if lower_charset_name.startswith("iso-8859") and detector.has_win_bytes:
charset_name = detector.ISO_WIN_MAP.get(
lower_charset_name, charset_name
)
# Rename legacy encodings with superset encodings if asked
if should_rename_legacy:
charset_name = detector.LEGACY_MAP.get(
charset_name.lower(), charset_name
)
results.append(
{
"encoding": charset_name,
"confidence": prober.get_confidence(),
"language": prober.language,
}
)
if len(results) > 0:
return sorted(results, key=lambda result: -result['confidence'])
return sorted(results, key=lambda result: -result["confidence"])
return [detector.result]

View file

@ -42,9 +42,9 @@
BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
#Char to FreqOrder table
# Char to FreqOrder table
BIG5_TABLE_SIZE = 5376
# fmt: off
BIG5_CHAR_TO_FREQ_ORDER = (
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
@ -383,4 +383,4 @@ BIG5_CHAR_TO_FREQ_ORDER = (
890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, # 5360
2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, # 5376
)
# fmt: on

View file

@ -25,23 +25,23 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import Big5DistributionAnalysis
from .codingstatemachine import CodingStateMachine
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import BIG5_SM_MODEL
class Big5Prober(MultiByteCharSetProber):
def __init__(self):
super(Big5Prober, self).__init__()
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
self.distribution_analyzer = Big5DistributionAnalysis()
self.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "Big5"
@property
def language(self):
def language(self) -> str:
return "Chinese"

View file

@ -25,40 +25,58 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .euctwfreq import (EUCTW_CHAR_TO_FREQ_ORDER, EUCTW_TABLE_SIZE,
EUCTW_TYPICAL_DISTRIBUTION_RATIO)
from .euckrfreq import (EUCKR_CHAR_TO_FREQ_ORDER, EUCKR_TABLE_SIZE,
EUCKR_TYPICAL_DISTRIBUTION_RATIO)
from .gb2312freq import (GB2312_CHAR_TO_FREQ_ORDER, GB2312_TABLE_SIZE,
GB2312_TYPICAL_DISTRIBUTION_RATIO)
from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE,
BIG5_TYPICAL_DISTRIBUTION_RATIO)
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO)
from typing import Tuple, Union
from .big5freq import (
BIG5_CHAR_TO_FREQ_ORDER,
BIG5_TABLE_SIZE,
BIG5_TYPICAL_DISTRIBUTION_RATIO,
)
from .euckrfreq import (
EUCKR_CHAR_TO_FREQ_ORDER,
EUCKR_TABLE_SIZE,
EUCKR_TYPICAL_DISTRIBUTION_RATIO,
)
from .euctwfreq import (
EUCTW_CHAR_TO_FREQ_ORDER,
EUCTW_TABLE_SIZE,
EUCTW_TYPICAL_DISTRIBUTION_RATIO,
)
from .gb2312freq import (
GB2312_CHAR_TO_FREQ_ORDER,
GB2312_TABLE_SIZE,
GB2312_TYPICAL_DISTRIBUTION_RATIO,
)
from .jisfreq import (
JIS_CHAR_TO_FREQ_ORDER,
JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO,
)
from .johabfreq import JOHAB_TO_EUCKR_ORDER_TABLE
class CharDistributionAnalysis(object):
class CharDistributionAnalysis:
ENOUGH_DATA_THRESHOLD = 1024
SURE_YES = 0.99
SURE_NO = 0.01
MINIMUM_DATA_THRESHOLD = 3
def __init__(self):
def __init__(self) -> None:
# Mapping table to get frequency order from char order (get from
# GetOrder())
self._char_to_freq_order = None
self._table_size = None # Size of above table
self._char_to_freq_order: Tuple[int, ...] = tuple()
self._table_size = 0 # Size of above table
# This is a constant value which varies from language to language,
# used in calculating confidence. See
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
# for further detail.
self.typical_distribution_ratio = None
self._done = None
self._total_chars = None
self._freq_chars = None
self.typical_distribution_ratio = 0.0
self._done = False
self._total_chars = 0
self._freq_chars = 0
self.reset()
def reset(self):
def reset(self) -> None:
"""reset analyser, clear any state"""
# If this flag is set to True, detection is done and conclusion has
# been made
@ -67,7 +85,7 @@ class CharDistributionAnalysis(object):
# The number of characters whose frequency order is less than 512
self._freq_chars = 0
def feed(self, char, char_len):
def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
"""feed a character with known length"""
if char_len == 2:
# we only care about 2-bytes character in our distribution analysis
@ -81,7 +99,7 @@ class CharDistributionAnalysis(object):
if 512 > self._char_to_freq_order[order]:
self._freq_chars += 1
def get_confidence(self):
def get_confidence(self) -> float:
"""return confidence based on existing data"""
# if we didn't receive any character in our consideration range,
# return negative answer
@ -89,20 +107,21 @@ class CharDistributionAnalysis(object):
return self.SURE_NO
if self._total_chars != self._freq_chars:
r = (self._freq_chars / ((self._total_chars - self._freq_chars)
* self.typical_distribution_ratio))
r = self._freq_chars / (
(self._total_chars - self._freq_chars) * self.typical_distribution_ratio
)
if r < self.SURE_YES:
return r
# normalize confidence (we don't want to be 100% sure)
return self.SURE_YES
def got_enough_data(self):
def got_enough_data(self) -> bool:
# It is not necessary to receive all data to draw conclusion.
# For charset detection, certain amount of data is enough
return self._total_chars > self.ENOUGH_DATA_THRESHOLD
def get_order(self, byte_str):
def get_order(self, _: Union[bytes, bytearray]) -> int:
# We do not handle characters based on the original encoding string,
# but convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency
@ -111,13 +130,13 @@ class CharDistributionAnalysis(object):
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(EUCTWDistributionAnalysis, self).__init__()
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
self._table_size = EUCTW_TABLE_SIZE
self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for euc-TW encoding, we are interested
# first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe
@ -125,18 +144,17 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
first_char = byte_str[0]
if first_char >= 0xC4:
return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
else:
return -1
return -1
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(EUCKRDistributionAnalysis, self).__init__()
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
self._table_size = EUCKR_TABLE_SIZE
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for euc-KR encoding, we are interested
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
@ -144,18 +162,32 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
first_char = byte_str[0]
if first_char >= 0xB0:
return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
else:
return -1
return -1
class JOHABDistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analysis for Johab-encoded text.

    The analyser is configured with the EUC-KR frequency table, table size
    and typical distribution ratio. Two-byte Johab codes are translated to an
    order through ``JOHAB_TO_EUCKR_ORDER_TABLE``.
    """

    def __init__(self) -> None:
        super().__init__()
        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
        self._table_size = EUCKR_TABLE_SIZE
        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
        """Return the order of the two-byte character in ``byte_str``.

        :param byte_str: Buffer whose first two bytes are the character.
        :returns: The value mapped in ``JOHAB_TO_EUCKR_ORDER_TABLE``, or
                  ``-1`` when the lead byte is outside ``0x88 <= b < 0xD4``
                  or the code has no entry in the table.
        """
        first_char = byte_str[0]
        if 0x88 <= first_char < 0xD4:
            # Combine lead and trail byte into one 16-bit lookup key.
            code = first_char * 256 + byte_str[1]
            return JOHAB_TO_EUCKR_ORDER_TABLE.get(code, -1)
        return -1
class GB2312DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(GB2312DistributionAnalysis, self).__init__()
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
self._table_size = GB2312_TABLE_SIZE
self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for GB2312 encoding, we are interested
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
@ -163,18 +195,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0xB0) and (second_char >= 0xA1):
return 94 * (first_char - 0xB0) + second_char - 0xA1
else:
return -1
return -1
class Big5DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(Big5DistributionAnalysis, self).__init__()
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
self._table_size = BIG5_TABLE_SIZE
self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for big5 encoding, we are interested
# first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
@ -183,28 +214,26 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
if first_char >= 0xA4:
if second_char >= 0xA1:
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
else:
return 157 * (first_char - 0xA4) + second_char - 0x40
else:
return -1
return 157 * (first_char - 0xA4) + second_char - 0x40
return -1
class SJISDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(SJISDistributionAnalysis, self).__init__()
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
self._table_size = JIS_TABLE_SIZE
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for sjis encoding, we are interested
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0x81) and (first_char <= 0x9F):
if 0x81 <= first_char <= 0x9F:
order = 188 * (first_char - 0x81)
elif (first_char >= 0xE0) and (first_char <= 0xEF):
elif 0xE0 <= first_char <= 0xEF:
order = 188 * (first_char - 0xE0 + 31)
else:
return -1
@ -215,19 +244,18 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(EUCJPDistributionAnalysis, self).__init__()
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
self._table_size = JIS_TABLE_SIZE
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for euc-JP encoding, we are interested
# first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
char = byte_str[0]
if char >= 0xA0:
return 94 * (char - 0xA1) + byte_str[1] - 0xa1
else:
return -1
return 94 * (char - 0xA1) + byte_str[1] - 0xA1
return -1

View file

@ -25,29 +25,30 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .enums import ProbingState
from typing import List, Optional, Union
from .charsetprober import CharSetProber
from .enums import LanguageFilter, ProbingState
class CharSetGroupProber(CharSetProber):
def __init__(self, lang_filter=None):
super(CharSetGroupProber, self).__init__(lang_filter=lang_filter)
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
super().__init__(lang_filter=lang_filter)
self._active_num = 0
self.probers = []
self._best_guess_prober = None
self.probers: List[CharSetProber] = []
self._best_guess_prober: Optional[CharSetProber] = None
def reset(self):
super(CharSetGroupProber, self).reset()
def reset(self) -> None:
super().reset()
self._active_num = 0
for prober in self.probers:
if prober:
prober.reset()
prober.active = True
self._active_num += 1
prober.reset()
prober.active = True
self._active_num += 1
self._best_guess_prober = None
@property
def charset_name(self):
def charset_name(self) -> Optional[str]:
if not self._best_guess_prober:
self.get_confidence()
if not self._best_guess_prober:
@ -55,17 +56,15 @@ class CharSetGroupProber(CharSetProber):
return self._best_guess_prober.charset_name
@property
def language(self):
def language(self) -> Optional[str]:
if not self._best_guess_prober:
self.get_confidence()
if not self._best_guess_prober:
return None
return self._best_guess_prober.language
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
for prober in self.probers:
if not prober:
continue
if not prober.active:
continue
state = prober.feed(byte_str)
@ -73,8 +72,9 @@ class CharSetGroupProber(CharSetProber):
continue
if state == ProbingState.FOUND_IT:
self._best_guess_prober = prober
self._state = ProbingState.FOUND_IT
return self.state
elif state == ProbingState.NOT_ME:
if state == ProbingState.NOT_ME:
prober.active = False
self._active_num -= 1
if self._active_num <= 0:
@ -82,22 +82,22 @@ class CharSetGroupProber(CharSetProber):
return self.state
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
state = self.state
if state == ProbingState.FOUND_IT:
return 0.99
elif state == ProbingState.NOT_ME:
if state == ProbingState.NOT_ME:
return 0.01
best_conf = 0.0
self._best_guess_prober = None
for prober in self.probers:
if not prober:
continue
if not prober.active:
self.logger.debug('%s not active', prober.charset_name)
self.logger.debug("%s not active", prober.charset_name)
continue
conf = prober.get_confidence()
self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
self.logger.debug(
"%s %s confidence = %s", prober.charset_name, prober.language, conf
)
if best_conf < conf:
best_conf = conf
self._best_guess_prober = prober

View file

@ -28,54 +28,62 @@
import logging
import re
from typing import Optional, Union
from .enums import ProbingState
from .enums import LanguageFilter, ProbingState
# Pre-compiled bytes pattern matching one "international word": an optional
# run of ASCII letters, one or more high bytes (0x80-0xFF), an optional run of
# ASCII letters, and at most one trailing marker byte (a byte that is neither
# an ASCII letter nor a high byte).
INTERNATIONAL_WORDS_PATTERN = re.compile(
    b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
)
class CharSetProber(object):
class CharSetProber:
SHORTCUT_THRESHOLD = 0.95
def __init__(self, lang_filter=None):
self._state = None
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
self._state = ProbingState.DETECTING
self.active = True
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
def reset(self):
def reset(self) -> None:
self._state = ProbingState.DETECTING
@property
def charset_name(self):
def charset_name(self) -> Optional[str]:
return None
def feed(self, buf):
pass
@property
def language(self) -> Optional[str]:
raise NotImplementedError
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
raise NotImplementedError
@property
def state(self):
def state(self) -> ProbingState:
return self._state
def get_confidence(self):
def get_confidence(self) -> float:
return 0.0
@staticmethod
def filter_high_byte_only(buf):
buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
return buf
@staticmethod
def filter_international_words(buf):
def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
"""
We define three types of bytes:
alphabet: english alphabets [a-zA-Z]
international: international characters [\x80-\xFF]
marker: everything else [^a-zA-Z\x80-\xFF]
The input buffer can be thought to contain a series of words delimited
by markers. This function works to filter all words that contain at
least one international character. All contiguous sequences of markers
are replaced by a single space ascii character.
This filter applies to all scripts which do not use English characters.
"""
filtered = bytearray()
@ -83,8 +91,7 @@ class CharSetProber(object):
# This regex expression filters out only words that have at-least one
# international character. The word may include one marker character at
# the end.
words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
buf)
words = INTERNATIONAL_WORDS_PATTERN.findall(buf)
for word in words:
filtered.extend(word[:-1])
@ -94,20 +101,17 @@ class CharSetProber(object):
# similarly across all languages and may thus have similar
# frequencies).
last_char = word[-1:]
if not last_char.isalpha() and last_char < b'\x80':
last_char = b' '
if not last_char.isalpha() and last_char < b"\x80":
last_char = b" "
filtered.extend(last_char)
return filtered
@staticmethod
def filter_with_english_letters(buf):
def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
"""
Returns a copy of ``buf`` that retains only the sequences of English
alphabet and high byte characters that are not between <> characters.
Also retains English alphabet and high byte characters immediately
before occurrences of >.
This filter can be applied to all scripts which contain both English
characters and extended ASCII characters, but is currently only used by
``Latin1Prober``.
@ -115,26 +119,24 @@ class CharSetProber(object):
filtered = bytearray()
in_tag = False
prev = 0
buf = memoryview(buf).cast("c")
for curr in range(len(buf)):
# Slice here to get bytes instead of an int with Python 3
buf_char = buf[curr:curr + 1]
# Check if we're coming out of or entering an HTML tag
if buf_char == b'>':
for curr, buf_char in enumerate(buf):
# Check if we're coming out of or entering an XML tag
# https://github.com/python/typeshed/issues/8182
if buf_char == b">": # type: ignore[comparison-overlap]
prev = curr + 1
in_tag = False
elif buf_char == b'<':
in_tag = True
# If current character is not extended-ASCII and not alphabetic...
if buf_char < b'\x80' and not buf_char.isalpha():
# ...and we're not in a tag
# https://github.com/python/typeshed/issues/8182
elif buf_char == b"<": # type: ignore[comparison-overlap]
if curr > prev and not in_tag:
# Keep everything after last non-extended-ASCII,
# non-alphabetic character
filtered.extend(buf[prev:curr])
# Output a space to delimit stretch we kept
filtered.extend(b' ')
prev = curr + 1
filtered.extend(b" ")
in_tag = True
# If we're not in a tag...
if not in_tag:

View file

@ -1 +0,0 @@

View file

@ -1,4 +1,3 @@
#!/usr/bin/env python
"""
Script which takes one or more file paths and reports on their detected
encodings
@ -13,17 +12,21 @@ If no paths are provided, it takes its input from stdin.
"""
from __future__ import absolute_import, print_function, unicode_literals
import argparse
import sys
from typing import Iterable, List, Optional
from chardet import __version__
from chardet.compat import PY2
from chardet.universaldetector import UniversalDetector
from .. import __version__
from ..universaldetector import UniversalDetector
def description_of(lines, name='stdin'):
def description_of(
lines: Iterable[bytes],
name: str = "stdin",
minimal: bool = False,
should_rename_legacy: bool = False,
) -> Optional[str]:
"""
Return a string describing the probable encoding of a file or
list of strings.
@ -32,8 +35,11 @@ def description_of(lines, name='stdin'):
:type lines: Iterable of bytes
:param name: Name of file or collection of lines
:type name: str
:param should_rename_legacy: Should we rename legacy encodings to
their more modern equivalents?
:type should_rename_legacy: ``bool``
"""
u = UniversalDetector()
u = UniversalDetector(should_rename_legacy=should_rename_legacy)
for line in lines:
line = bytearray(line)
u.feed(line)
@ -42,16 +48,14 @@ def description_of(lines, name='stdin'):
break
u.close()
result = u.result
if PY2:
name = name.decode(sys.getfilesystemencoding(), 'ignore')
if result['encoding']:
return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
result['confidence'])
else:
return '{0}: no result'.format(name)
if minimal:
return result["encoding"]
if result["encoding"]:
return f'{name}: {result["encoding"]} with confidence {result["confidence"]}'
return f"{name}: no result"
def main(argv=None):
def main(argv: Optional[List[str]] = None) -> None:
"""
Handles command line arguments and gets things started.
@ -61,25 +65,48 @@ def main(argv=None):
"""
# Get command line arguments
parser = argparse.ArgumentParser(
description="Takes one or more file paths and reports their detected \
encodings")
parser.add_argument('input',
help='File whose encoding we would like to determine. \
(default: stdin)',
type=argparse.FileType('rb'), nargs='*',
default=[sys.stdin if PY2 else sys.stdin.buffer])
parser.add_argument('--version', action='version',
version='%(prog)s {0}'.format(__version__))
description=(
"Takes one or more file paths and reports their detected encodings"
)
)
parser.add_argument(
"input",
help="File whose encoding we would like to determine. (default: stdin)",
type=argparse.FileType("rb"),
nargs="*",
default=[sys.stdin.buffer],
)
parser.add_argument(
"--minimal",
help="Print only the encoding to standard output",
action="store_true",
)
parser.add_argument(
"-l",
"--legacy",
help="Rename legacy encodings to more modern ones.",
action="store_true",
)
parser.add_argument(
"--version", action="version", version=f"%(prog)s {__version__}"
)
args = parser.parse_args(argv)
for f in args.input:
if f.isatty():
print("You are running chardetect interactively. Press " +
"CTRL-D twice at the start of a blank line to signal the " +
"end of your input. If you want help, run chardetect " +
"--help\n", file=sys.stderr)
print(description_of(f, f.name))
print(
"You are running chardetect interactively. Press "
"CTRL-D twice at the start of a blank line to signal the "
"end of your input. If you want help, run chardetect "
"--help\n",
file=sys.stderr,
)
print(
description_of(
f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
)
)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View file

@ -27,10 +27,11 @@
import logging
from .codingstatemachinedict import CodingStateMachineDict
from .enums import MachineState
class CodingStateMachine(object):
class CodingStateMachine:
"""
A state machine to verify a byte sequence for a particular encoding. For
each byte the detector receives, it will feed that byte to every active
@ -52,37 +53,38 @@ class CodingStateMachine(object):
negative answer for this encoding. Detector will exclude this
encoding from consideration from here on.
"""
def __init__(self, sm):
def __init__(self, sm: CodingStateMachineDict) -> None:
self._model = sm
self._curr_byte_pos = 0
self._curr_char_len = 0
self._curr_state = None
self._curr_state = MachineState.START
self.active = True
self.logger = logging.getLogger(__name__)
self.reset()
def reset(self):
def reset(self) -> None:
self._curr_state = MachineState.START
def next_state(self, c):
def next_state(self, c: int) -> int:
# for each byte we get its class
# if it is first byte, we also get byte length
byte_class = self._model['class_table'][c]
byte_class = self._model["class_table"][c]
if self._curr_state == MachineState.START:
self._curr_byte_pos = 0
self._curr_char_len = self._model['char_len_table'][byte_class]
self._curr_char_len = self._model["char_len_table"][byte_class]
# from byte's class and state_table, we get its next state
curr_state = (self._curr_state * self._model['class_factor']
+ byte_class)
self._curr_state = self._model['state_table'][curr_state]
curr_state = self._curr_state * self._model["class_factor"] + byte_class
self._curr_state = self._model["state_table"][curr_state]
self._curr_byte_pos += 1
return self._curr_state
def get_current_charlen(self):
def get_current_charlen(self) -> int:
return self._curr_char_len
def get_coding_state_machine(self):
return self._model['name']
def get_coding_state_machine(self) -> str:
return self._model["name"]
@property
def language(self):
return self._model['language']
def language(self) -> str:
return self._model["language"]

View file

@ -0,0 +1,19 @@
from typing import TYPE_CHECKING, Tuple
if TYPE_CHECKING:
    # TypedDict was introduced in Python 3.8.
    #
    # TODO: Remove the else block and TYPE_CHECKING check when dropping support
    # for Python 3.7.
    from typing import TypedDict

    class CodingStateMachineDict(TypedDict, total=False):
        """Static-typing view of a coding state machine model dictionary."""

        # Looked up by byte value to obtain the byte's class.
        class_table: Tuple[int, ...]
        # Multiplied with the current state before adding the byte class.
        class_factor: int
        # Looked up with ``state * class_factor + byte_class``.
        state_table: Tuple[int, ...]
        # Looked up by byte class to obtain the character length.
        char_len_table: Tuple[int, ...]
        name: str
        language: str  # Optional key

else:
    # At runtime the models are ordinary dictionaries; the alias keeps the
    # name importable on interpreters that lack ``typing.TypedDict``.
    CodingStateMachineDict = dict

View file

@ -1,36 +0,0 @@
######################## BEGIN LICENSE BLOCK ########################
# Contributor(s):
# Dan Blanchard
# Ian Cordasco
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import sys
# Python 2/3 compatibility aliases, selected once at import time.
if sys.version_info < (3, 0):
    # Python 2 branch: ``unicode`` exists only on Python 2 and is evaluated
    # only when this branch runs.
    PY2 = True
    PY3 = False
    string_types = (str, unicode)
    text_type = unicode
    iteritems = dict.iteritems
else:
    # Python 3 branch.
    PY2 = False
    PY3 = True
    string_types = (bytes, str)
    text_type = str
    iteritems = dict.items

View file

@ -32,8 +32,8 @@ from .mbcssm import CP949_SM_MODEL
class CP949Prober(MultiByteCharSetProber):
def __init__(self):
super(CP949Prober, self).__init__()
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
# not different.
@ -41,9 +41,9 @@ class CP949Prober(MultiByteCharSetProber):
self.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "CP949"
@property
def language(self):
def language(self) -> str:
return "Korean"

View file

@ -4,21 +4,26 @@ All of the Enums that are used throughout the chardet package.
:author: Dan Blanchard (dan.blanchard@gmail.com)
"""
from enum import Enum, Flag
class InputState(object):
class InputState:
"""
This enum represents the different states a universal detector can be in.
"""
PURE_ASCII = 0
ESC_ASCII = 1
HIGH_BYTE = 2
class LanguageFilter(object):
class LanguageFilter(Flag):
"""
This enum represents the different language filters we can apply to a
``UniversalDetector``.
"""
NONE = 0x00
CHINESE_SIMPLIFIED = 0x01
CHINESE_TRADITIONAL = 0x02
JAPANESE = 0x04
@ -29,46 +34,50 @@ class LanguageFilter(object):
CJK = CHINESE | JAPANESE | KOREAN
class ProbingState(object):
class ProbingState(Enum):
"""
This enum represents the different states a prober can be in.
"""
DETECTING = 0
FOUND_IT = 1
NOT_ME = 2
class MachineState(object):
class MachineState:
"""
This enum represents the different states a state machine can be in.
"""
START = 0
ERROR = 1
ITS_ME = 2
class SequenceLikelihood(object):
class SequenceLikelihood:
"""
This enum represents the likelihood of a character following the previous one.
"""
NEGATIVE = 0
UNLIKELY = 1
LIKELY = 2
POSITIVE = 3
@classmethod
def get_num_categories(cls):
def get_num_categories(cls) -> int:
""":returns: The number of likelihood categories in the enum."""
return 4
class CharacterCategory(object):
class CharacterCategory:
"""
This enum represents the different categories language models for
``SingleByteCharsetProber`` put characters into.
Anything less than CONTROL is considered a letter.
"""
UNDEFINED = 255
LINE_BREAK = 254
SYMBOL = 253

View file

@ -25,11 +25,17 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import Optional, Union
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .enums import LanguageFilter, ProbingState, MachineState
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
ISO2022KR_SM_MODEL)
from .enums import LanguageFilter, MachineState, ProbingState
from .escsm import (
HZ_SM_MODEL,
ISO2022CN_SM_MODEL,
ISO2022JP_SM_MODEL,
ISO2022KR_SM_MODEL,
)
class EscCharSetProber(CharSetProber):
@ -39,8 +45,8 @@ class EscCharSetProber(CharSetProber):
identify these encodings.
"""
def __init__(self, lang_filter=None):
super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
super().__init__(lang_filter=lang_filter)
self.coding_sm = []
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
@ -49,17 +55,15 @@ class EscCharSetProber(CharSetProber):
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
if self.lang_filter & LanguageFilter.KOREAN:
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
self.active_sm_count = None
self._detected_charset = None
self._detected_language = None
self._state = None
self.active_sm_count = 0
self._detected_charset: Optional[str] = None
self._detected_language: Optional[str] = None
self._state = ProbingState.DETECTING
self.reset()
def reset(self):
super(EscCharSetProber, self).reset()
def reset(self) -> None:
super().reset()
for coding_sm in self.coding_sm:
if not coding_sm:
continue
coding_sm.active = True
coding_sm.reset()
self.active_sm_count = len(self.coding_sm)
@ -67,23 +71,20 @@ class EscCharSetProber(CharSetProber):
self._detected_language = None
@property
def charset_name(self):
def charset_name(self) -> Optional[str]:
return self._detected_charset
@property
def language(self):
def language(self) -> Optional[str]:
return self._detected_language
def get_confidence(self):
if self._detected_charset:
return 0.99
else:
return 0.00
def get_confidence(self) -> float:
return 0.99 if self._detected_charset else 0.00
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
for c in byte_str:
for coding_sm in self.coding_sm:
if not coding_sm or not coding_sm.active:
if not coding_sm.active:
continue
coding_state = coding_sm.next_state(c)
if coding_state == MachineState.ERROR:

View file

@ -12,7 +12,7 @@
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -20,227 +20,242 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .codingstatemachinedict import CodingStateMachineDict
from .enums import MachineState
# fmt: off
HZ_CLS = (
1,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,0,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,4,0,5,2,0, # 78 - 7f
1,1,1,1,1,1,1,1, # 80 - 87
1,1,1,1,1,1,1,1, # 88 - 8f
1,1,1,1,1,1,1,1, # 90 - 97
1,1,1,1,1,1,1,1, # 98 - 9f
1,1,1,1,1,1,1,1, # a0 - a7
1,1,1,1,1,1,1,1, # a8 - af
1,1,1,1,1,1,1,1, # b0 - b7
1,1,1,1,1,1,1,1, # b8 - bf
1,1,1,1,1,1,1,1, # c0 - c7
1,1,1,1,1,1,1,1, # c8 - cf
1,1,1,1,1,1,1,1, # d0 - d7
1,1,1,1,1,1,1,1, # d8 - df
1,1,1,1,1,1,1,1, # e0 - e7
1,1,1,1,1,1,1,1, # e8 - ef
1,1,1,1,1,1,1,1, # f0 - f7
1,1,1,1,1,1,1,1, # f8 - ff
1, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
0, 0, 0, 0, 0, 0, 0, 0, # 08 - 0f
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f
0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27
0, 0, 0, 0, 0, 0, 0, 0, # 28 - 2f
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
0, 0, 0, 4, 0, 5, 2, 0, # 78 - 7f
1, 1, 1, 1, 1, 1, 1, 1, # 80 - 87
1, 1, 1, 1, 1, 1, 1, 1, # 88 - 8f
1, 1, 1, 1, 1, 1, 1, 1, # 90 - 97
1, 1, 1, 1, 1, 1, 1, 1, # 98 - 9f
1, 1, 1, 1, 1, 1, 1, 1, # a0 - a7
1, 1, 1, 1, 1, 1, 1, 1, # a8 - af
1, 1, 1, 1, 1, 1, 1, 1, # b0 - b7
1, 1, 1, 1, 1, 1, 1, 1, # b8 - bf
1, 1, 1, 1, 1, 1, 1, 1, # c0 - c7
1, 1, 1, 1, 1, 1, 1, 1, # c8 - cf
1, 1, 1, 1, 1, 1, 1, 1, # d0 - d7
1, 1, 1, 1, 1, 1, 1, 1, # d8 - df
1, 1, 1, 1, 1, 1, 1, 1, # e0 - e7
1, 1, 1, 1, 1, 1, 1, 1, # e8 - ef
1, 1, 1, 1, 1, 1, 1, 1, # f0 - f7
1, 1, 1, 1, 1, 1, 1, 1, # f8 - ff
)
HZ_ST = (
MachineState.START,MachineState.ERROR, 3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START, 4,MachineState.ERROR,# 10-17
5,MachineState.ERROR, 6,MachineState.ERROR, 5, 5, 4,MachineState.ERROR,# 18-1f
4,MachineState.ERROR, 4, 4, 4,MachineState.ERROR, 4,MachineState.ERROR,# 20-27
4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f
MachineState.START, MachineState.ERROR, 3, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, # 00-07
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 08-0f
MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, 4, MachineState.ERROR, # 10-17
5, MachineState.ERROR, 6, MachineState.ERROR, 5, 5, 4, MachineState.ERROR, # 18-1f
4, MachineState.ERROR, 4, 4, 4, MachineState.ERROR, 4, MachineState.ERROR, # 20-27
4, MachineState.ITS_ME, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 28-2f
)
# fmt: on
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
HZ_SM_MODEL = {'class_table': HZ_CLS,
'class_factor': 6,
'state_table': HZ_ST,
'char_len_table': HZ_CHAR_LEN_TABLE,
'name': "HZ-GB-2312",
'language': 'Chinese'}
HZ_SM_MODEL: CodingStateMachineDict = {
"class_table": HZ_CLS,
"class_factor": 6,
"state_table": HZ_ST,
"char_len_table": HZ_CHAR_LEN_TABLE,
"name": "HZ-GB-2312",
"language": "Chinese",
}
# fmt: off
ISO2022CN_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,3,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,4,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff
2, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
0, 0, 0, 0, 0, 0, 0, 0, # 08 - 0f
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f
0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27
0, 3, 0, 0, 0, 0, 0, 0, # 28 - 2f
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
0, 0, 0, 4, 0, 0, 0, 0, # 40 - 47
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
2, 2, 2, 2, 2, 2, 2, 2, # 80 - 87
2, 2, 2, 2, 2, 2, 2, 2, # 88 - 8f
2, 2, 2, 2, 2, 2, 2, 2, # 90 - 97
2, 2, 2, 2, 2, 2, 2, 2, # 98 - 9f
2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7
2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef
2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7
2, 2, 2, 2, 2, 2, 2, 2, # f8 - ff
)
ISO2022CN_ST = (
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,# 18-1f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27
5, 6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f
MachineState.START, 3, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 00-07
MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 08-0f
MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 10-17
MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, # 18-1f
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 20-27
5, 6, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 28-2f
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 30-37
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.START, # 38-3f
)
# fmt: on
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS,
'class_factor': 9,
'state_table': ISO2022CN_ST,
'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
'name': "ISO-2022-CN",
'language': 'Chinese'}
ISO2022CN_SM_MODEL: CodingStateMachineDict = {
"class_table": ISO2022CN_CLS,
"class_factor": 9,
"state_table": ISO2022CN_ST,
"char_len_table": ISO2022CN_CHAR_LEN_TABLE,
"name": "ISO-2022-CN",
"language": "Chinese",
}
# fmt: off
ISO2022JP_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,2,2, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,7,0,0,0, # 20 - 27
3,0,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
6,0,4,0,8,0,0,0, # 40 - 47
0,9,5,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff
2, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
0, 0, 0, 0, 0, 0, 2, 2, # 08 - 0f
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f
0, 0, 0, 0, 7, 0, 0, 0, # 20 - 27
3, 0, 0, 0, 0, 0, 0, 0, # 28 - 2f
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
6, 0, 4, 0, 8, 0, 0, 0, # 40 - 47
0, 9, 5, 0, 0, 0, 0, 0, # 48 - 4f
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
2, 2, 2, 2, 2, 2, 2, 2, # 80 - 87
2, 2, 2, 2, 2, 2, 2, 2, # 88 - 8f
2, 2, 2, 2, 2, 2, 2, 2, # 90 - 97
2, 2, 2, 2, 2, 2, 2, 2, # 98 - 9f
2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7
2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef
2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7
2, 2, 2, 2, 2, 2, 2, 2, # f8 - ff
)
ISO2022JP_ST = (
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f
MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 20-27
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47
MachineState.START, 3, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 00-07
MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 08-0f
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 10-17
MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, # 18-1f
MachineState.ERROR, 5, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, MachineState.ERROR, # 20-27
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 6, MachineState.ITS_ME, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, # 28-2f
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, # 30-37
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 38-3f
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.START, MachineState.START, # 40-47
)
# fmt: on
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS,
'class_factor': 10,
'state_table': ISO2022JP_ST,
'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
'name': "ISO-2022-JP",
'language': 'Japanese'}
ISO2022JP_SM_MODEL: CodingStateMachineDict = {
"class_table": ISO2022JP_CLS,
"class_factor": 10,
"state_table": ISO2022JP_ST,
"char_len_table": ISO2022JP_CHAR_LEN_TABLE,
"name": "ISO-2022-JP",
"language": "Japanese",
}
# fmt: off
ISO2022KR_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,3,0,0,0, # 20 - 27
0,4,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,5,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff
2, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
0, 0, 0, 0, 0, 0, 0, 0, # 08 - 0f
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f
0, 0, 0, 0, 3, 0, 0, 0, # 20 - 27
0, 4, 0, 0, 0, 0, 0, 0, # 28 - 2f
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
0, 0, 0, 5, 0, 0, 0, 0, # 40 - 47
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
2, 2, 2, 2, 2, 2, 2, 2, # 80 - 87
2, 2, 2, 2, 2, 2, 2, 2, # 88 - 8f
2, 2, 2, 2, 2, 2, 2, 2, # 90 - 97
2, 2, 2, 2, 2, 2, 2, 2, # 98 - 9f
2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7
2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef
2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7
2, 2, 2, 2, 2, 2, 2, 2, # f8 - ff
)
ISO2022KR_ST = (
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 10-17
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27
MachineState.START, 3, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, # 00-07
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 08-0f
MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, MachineState.ERROR, # 10-17
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 5, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 18-1f
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 20-27
)
# fmt: on
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS,
'class_factor': 6,
'state_table': ISO2022KR_ST,
'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
'name': "ISO-2022-KR",
'language': 'Korean'}
ISO2022KR_SM_MODEL: CodingStateMachineDict = {
"class_table": ISO2022KR_CLS,
"class_factor": 6,
"state_table": ISO2022KR_ST,
"char_len_table": ISO2022KR_CHAR_LEN_TABLE,
"name": "ISO-2022-KR",
"language": "Korean",
}

View file

@ -25,68 +25,78 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .enums import ProbingState, MachineState
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from typing import Union
from .chardistribution import EUCJPDistributionAnalysis
from .codingstatemachine import CodingStateMachine
from .enums import MachineState, ProbingState
from .jpcntx import EUCJPContextAnalysis
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import EUCJP_SM_MODEL
class EUCJPProber(MultiByteCharSetProber):
def __init__(self):
super(EUCJPProber, self).__init__()
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
self.distribution_analyzer = EUCJPDistributionAnalysis()
self.context_analyzer = EUCJPContextAnalysis()
self.reset()
def reset(self):
super(EUCJPProber, self).reset()
def reset(self) -> None:
super().reset()
self.context_analyzer.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "EUC-JP"
@property
def language(self):
def language(self) -> str:
return "Japanese"
def feed(self, byte_str):
for i in range(len(byte_str)):
# PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
coding_state = self.coding_sm.next_state(byte_str[i])
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
assert self.coding_sm is not None
assert self.distribution_analyzer is not None
for i, byte in enumerate(byte_str):
# PY3K: byte_str is a byte array, so byte is an int, not a byte
coding_state = self.coding_sm.next_state(byte)
if coding_state == MachineState.ERROR:
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self.logger.debug(
"%s %s prober hit error at byte %s",
self.charset_name,
self.language,
i,
)
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.ITS_ME:
if coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif coding_state == MachineState.START:
if coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen()
if i == 0:
self._last_char[1] = byte_str[0]
self._last_char[1] = byte
self.context_analyzer.feed(self._last_char, char_len)
self.distribution_analyzer.feed(self._last_char, char_len)
else:
self.context_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
self.context_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
self._last_char[0] = byte_str[-1]
if self.state == ProbingState.DETECTING:
if (self.context_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
if self.context_analyzer.got_enough_data() and (
self.get_confidence() > self.SHORTCUT_THRESHOLD
):
self._state = ProbingState.FOUND_IT
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
assert self.distribution_analyzer is not None
context_conf = self.context_analyzer.get_confidence()
distrib_conf = self.distribution_analyzer.get_confidence()
return max(context_conf, distrib_conf)

View file

@ -43,6 +43,7 @@ EUCKR_TYPICAL_DISTRIBUTION_RATIO = 6.0
EUCKR_TABLE_SIZE = 2352
# Char to FreqOrder table
# fmt: off
EUCKR_CHAR_TO_FREQ_ORDER = (
13, 130, 120,1396, 481,1719,1720, 328, 609, 212,1721, 707, 400, 299,1722, 87,
1397,1723, 104, 536,1117,1203,1724,1267, 685,1268, 508,1725,1726,1727,1728,1398,
@ -192,4 +193,4 @@ EUCKR_CHAR_TO_FREQ_ORDER = (
2629,2630,2631, 924, 648, 863, 603,2632,2633, 934,1540, 864, 865,2634, 642,1042,
670,1190,2635,2636,2637,2638, 168,2639, 652, 873, 542,1054,1541,2640,2641,2642, # 512, 256
)
# fmt: on

View file

@ -25,23 +25,23 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCKRDistributionAnalysis
from .codingstatemachine import CodingStateMachine
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import EUCKR_SM_MODEL
class EUCKRProber(MultiByteCharSetProber):
def __init__(self):
super(EUCKRProber, self).__init__()
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
self.distribution_analyzer = EUCKRDistributionAnalysis()
self.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "EUC-KR"
@property
def language(self):
def language(self) -> str:
return "Korean"

View file

@ -43,345 +43,346 @@
EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
# Char to FreqOrder table ,
# Char to FreqOrder table
EUCTW_TABLE_SIZE = 5376
# fmt: off
EUCTW_CHAR_TO_FREQ_ORDER = (
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758
1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774
63,7312,7313, 317,1614, 75, 222, 159,4061,2412,1480,7314,3500,3068, 224,2809, # 2790
3616, 3, 10,3870,1471, 29,2774,1135,2852,1939, 873, 130,3242,1123, 312,7315, # 2806
4297,2051, 507, 252, 682,7316, 142,1914, 124, 206,2932, 34,3501,3173, 64, 604, # 2822
7317,2494,1976,1977, 155,1990, 645, 641,1606,7318,3405, 337, 72, 406,7319, 80, # 2838
630, 238,3174,1509, 263, 939,1092,2644, 756,1440,1094,3406, 449, 69,2969, 591, # 2854
179,2095, 471, 115,2034,1843, 60, 50,2970, 134, 806,1868, 734,2035,3407, 180, # 2870
995,1607, 156, 537,2893, 688,7320, 319,1305, 779,2144, 514,2374, 298,4298, 359, # 2886
2495, 90,2707,1338, 663, 11, 906,1099,2545, 20,2436, 182, 532,1716,7321, 732, # 2902
1376,4062,1311,1420,3175, 25,2312,1056, 113, 399, 382,1949, 242,3408,2467, 529, # 2918
3243, 475,1447,3617,7322, 117, 21, 656, 810,1297,2295,2329,3502,7323, 126,4063, # 2934
706, 456, 150, 613,4299, 71,1118,2036,4064, 145,3069, 85, 835, 486,2114,1246, # 2950
1426, 428, 727,1285,1015, 800, 106, 623, 303,1281,7324,2127,2354, 347,3736, 221, # 2966
3503,3110,7325,1955,1153,4065, 83, 296,1199,3070, 192, 624, 93,7326, 822,1897, # 2982
2810,3111, 795,2064, 991,1554,1542,1592, 27, 43,2853, 859, 139,1456, 860,4300, # 2998
437, 712,3871, 164,2392,3112, 695, 211,3017,2096, 195,3872,1608,3504,3505,3618, # 3014
3873, 234, 811,2971,2097,3874,2229,1441,3506,1615,2375, 668,2076,1638, 305, 228, # 3030
1664,4301, 467, 415,7327, 262,2098,1593, 239, 108, 300, 200,1033, 512,1247,2077, # 3046
7328,7329,2173,3176,3619,2673, 593, 845,1062,3244, 88,1723,2037,3875,1950, 212, # 3062
266, 152, 149, 468,1898,4066,4302, 77, 187,7330,3018, 37, 5,2972,7331,3876, # 3078
7332,7333, 39,2517,4303,2894,3177,2078, 55, 148, 74,4304, 545, 483,1474,1029, # 3094
1665, 217,1869,1531,3113,1104,2645,4067, 24, 172,3507, 900,3877,3508,3509,4305, # 3110
32,1408,2811,1312, 329, 487,2355,2247,2708, 784,2674, 4,3019,3314,1427,1788, # 3126
188, 109, 499,7334,3620,1717,1789, 888,1217,3020,4306,7335,3510,7336,3315,1520, # 3142
3621,3878, 196,1034, 775,7337,7338, 929,1815, 249, 439, 38,7339,1063,7340, 794, # 3158
3879,1435,2296, 46, 178,3245,2065,7341,2376,7342, 214,1709,4307, 804, 35, 707, # 3174
324,3622,1601,2546, 140, 459,4068,7343,7344,1365, 839, 272, 978,2257,2572,3409, # 3190
2128,1363,3623,1423, 697, 100,3071, 48, 70,1231, 495,3114,2193,7345,1294,7346, # 3206
2079, 462, 586,1042,3246, 853, 256, 988, 185,2377,3410,1698, 434,1084,7347,3411, # 3222
314,2615,2775,4308,2330,2331, 569,2280, 637,1816,2518, 757,1162,1878,1616,3412, # 3238
287,1577,2115, 768,4309,1671,2854,3511,2519,1321,3737, 909,2413,7348,4069, 933, # 3254
3738,7349,2052,2356,1222,4310, 765,2414,1322, 786,4311,7350,1919,1462,1677,2895, # 3270
1699,7351,4312,1424,2437,3115,3624,2590,3316,1774,1940,3413,3880,4070, 309,1369, # 3286
1130,2812, 364,2230,1653,1299,3881,3512,3882,3883,2646, 525,1085,3021, 902,2000, # 3302
1475, 964,4313, 421,1844,1415,1057,2281, 940,1364,3116, 376,4314,4315,1381, 7, # 3318
2520, 983,2378, 336,1710,2675,1845, 321,3414, 559,1131,3022,2742,1808,1132,1313, # 3334
265,1481,1857,7352, 352,1203,2813,3247, 167,1089, 420,2814, 776, 792,1724,3513, # 3350
4071,2438,3248,7353,4072,7354, 446, 229, 333,2743, 901,3739,1200,1557,4316,2647, # 3366
1920, 395,2744,2676,3740,4073,1835, 125, 916,3178,2616,4317,7355,7356,3741,7357, # 3382
7358,7359,4318,3117,3625,1133,2547,1757,3415,1510,2313,1409,3514,7360,2145, 438, # 3398
2591,2896,2379,3317,1068, 958,3023, 461, 311,2855,2677,4074,1915,3179,4075,1978, # 3414
383, 750,2745,2617,4076, 274, 539, 385,1278,1442,7361,1154,1964, 384, 561, 210, # 3430
98,1295,2548,3515,7362,1711,2415,1482,3416,3884,2897,1257, 129,7363,3742, 642, # 3446
523,2776,2777,2648,7364, 141,2231,1333, 68, 176, 441, 876, 907,4077, 603,2592, # 3462
710, 171,3417, 404, 549, 18,3118,2393,1410,3626,1666,7365,3516,4319,2898,4320, # 3478
7366,2973, 368,7367, 146, 366, 99, 871,3627,1543, 748, 807,1586,1185, 22,2258, # 3494
379,3743,3180,7368,3181, 505,1941,2618,1991,1382,2314,7369, 380,2357, 218, 702, # 3510
1817,1248,3418,3024,3517,3318,3249,7370,2974,3628, 930,3250,3744,7371, 59,7372, # 3526
585, 601,4078, 497,3419,1112,1314,4321,1801,7373,1223,1472,2174,7374, 749,1836, # 3542
690,1899,3745,1772,3885,1476, 429,1043,1790,2232,2116, 917,4079, 447,1086,1629, # 3558
7375, 556,7376,7377,2020,1654, 844,1090, 105, 550, 966,1758,2815,1008,1782, 686, # 3574
1095,7378,2282, 793,1602,7379,3518,2593,4322,4080,2933,2297,4323,3746, 980,2496, # 3590
544, 353, 527,4324, 908,2678,2899,7380, 381,2619,1942,1348,7381,1341,1252, 560, # 3606
3072,7382,3420,2856,7383,2053, 973, 886,2080, 143,4325,7384,7385, 157,3886, 496, # 3622
4081, 57, 840, 540,2038,4326,4327,3421,2117,1445, 970,2259,1748,1965,2081,4082, # 3638
3119,1234,1775,3251,2816,3629, 773,1206,2129,1066,2039,1326,3887,1738,1725,4083, # 3654
279,3120, 51,1544,2594, 423,1578,2130,2066, 173,4328,1879,7386,7387,1583, 264, # 3670
610,3630,4329,2439, 280, 154,7388,7389,7390,1739, 338,1282,3073, 693,2857,1411, # 3686
1074,3747,2440,7391,4330,7392,7393,1240, 952,2394,7394,2900,1538,2679, 685,1483, # 3702
4084,2468,1436, 953,4085,2054,4331, 671,2395, 79,4086,2441,3252, 608, 567,2680, # 3718
3422,4087,4088,1691, 393,1261,1791,2396,7395,4332,7396,7397,7398,7399,1383,1672, # 3734
3748,3182,1464, 522,1119, 661,1150, 216, 675,4333,3888,1432,3519, 609,4334,2681, # 3750
2397,7400,7401,7402,4089,3025, 0,7403,2469, 315, 231,2442, 301,3319,4335,2380, # 3766
7404, 233,4090,3631,1818,4336,4337,7405, 96,1776,1315,2082,7406, 257,7407,1809, # 3782
3632,2709,1139,1819,4091,2021,1124,2163,2778,1777,2649,7408,3074, 363,1655,3183, # 3798
7409,2975,7410,7411,7412,3889,1567,3890, 718, 103,3184, 849,1443, 341,3320,2934, # 3814
1484,7413,1712, 127, 67, 339,4092,2398, 679,1412, 821,7414,7415, 834, 738, 351, # 3830
2976,2146, 846, 235,1497,1880, 418,1992,3749,2710, 186,1100,2147,2746,3520,1545, # 3846
1355,2935,2858,1377, 583,3891,4093,2573,2977,7416,1298,3633,1078,2549,3634,2358, # 3862
78,3750,3751, 267,1289,2099,2001,1594,4094, 348, 369,1274,2194,2175,1837,4338, # 3878
1820,2817,3635,2747,2283,2002,4339,2936,2748, 144,3321, 882,4340,3892,2749,3423, # 3894
4341,2901,7417,4095,1726, 320,7418,3893,3026, 788,2978,7419,2818,1773,1327,2859, # 3910
3894,2819,7420,1306,4342,2003,1700,3752,3521,2359,2650, 787,2022, 506, 824,3636, # 3926
534, 323,4343,1044,3322,2023,1900, 946,3424,7421,1778,1500,1678,7422,1881,4344, # 3942
165, 243,4345,3637,2521, 123, 683,4096, 764,4346, 36,3895,1792, 589,2902, 816, # 3958
626,1667,3027,2233,1639,1555,1622,3753,3896,7423,3897,2860,1370,1228,1932, 891, # 3974
2083,2903, 304,4097,7424, 292,2979,2711,3522, 691,2100,4098,1115,4347, 118, 662, # 3990
7425, 611,1156, 854,2381,1316,2861, 2, 386, 515,2904,7426,7427,3253, 868,2234, # 4006
1486, 855,2651, 785,2212,3028,7428,1040,3185,3523,7429,3121, 448,7430,1525,7431, # 4022
2164,4348,7432,3754,7433,4099,2820,3524,3122, 503, 818,3898,3123,1568, 814, 676, # 4038
1444, 306,1749,7434,3755,1416,1030, 197,1428, 805,2821,1501,4349,7435,7436,7437, # 4054
1993,7438,4350,7439,7440,2195, 13,2779,3638,2980,3124,1229,1916,7441,3756,2131, # 4070
7442,4100,4351,2399,3525,7443,2213,1511,1727,1120,7444,7445, 646,3757,2443, 307, # 4086
7446,7447,1595,3186,7448,7449,7450,3639,1113,1356,3899,1465,2522,2523,7451, 519, # 4102
7452, 128,2132, 92,2284,1979,7453,3900,1512, 342,3125,2196,7454,2780,2214,1980, # 4118
3323,7455, 290,1656,1317, 789, 827,2360,7456,3758,4352, 562, 581,3901,7457, 401, # 4134
4353,2248, 94,4354,1399,2781,7458,1463,2024,4355,3187,1943,7459, 828,1105,4101, # 4150
1262,1394,7460,4102, 605,4356,7461,1783,2862,7462,2822, 819,2101, 578,2197,2937, # 4166
7463,1502, 436,3254,4103,3255,2823,3902,2905,3425,3426,7464,2712,2315,7465,7466, # 4182
2332,2067, 23,4357, 193, 826,3759,2102, 699,1630,4104,3075, 390,1793,1064,3526, # 4198
7467,1579,3076,3077,1400,7468,4105,1838,1640,2863,7469,4358,4359, 137,4106, 598, # 4214
3078,1966, 780, 104, 974,2938,7470, 278, 899, 253, 402, 572, 504, 493,1339,7471, # 4230
3903,1275,4360,2574,2550,7472,3640,3029,3079,2249, 565,1334,2713, 863, 41,7473, # 4246
7474,4361,7475,1657,2333, 19, 463,2750,4107, 606,7476,2981,3256,1087,2084,1323, # 4262
2652,2982,7477,1631,1623,1750,4108,2682,7478,2864, 791,2714,2653,2334, 232,2416, # 4278
7479,2983,1498,7480,2654,2620, 755,1366,3641,3257,3126,2025,1609, 119,1917,3427, # 4294
862,1026,4109,7481,3904,3760,4362,3905,4363,2260,1951,2470,7482,1125, 817,4110, # 4310
4111,3906,1513,1766,2040,1487,4112,3030,3258,2824,3761,3127,7483,7484,1507,7485, # 4326
2683, 733, 40,1632,1106,2865, 345,4113, 841,2524, 230,4364,2984,1846,3259,3428, # 4342
7486,1263, 986,3429,7487, 735, 879, 254,1137, 857, 622,1300,1180,1388,1562,3907, # 4358
3908,2939, 967,2751,2655,1349, 592,2133,1692,3324,2985,1994,4114,1679,3909,1901, # 4374
2185,7488, 739,3642,2715,1296,1290,7489,4115,2198,2199,1921,1563,2595,2551,1870, # 4390
2752,2986,7490, 435,7491, 343,1108, 596, 17,1751,4365,2235,3430,3643,7492,4366, # 4406
294,3527,2940,1693, 477, 979, 281,2041,3528, 643,2042,3644,2621,2782,2261,1031, # 4422
2335,2134,2298,3529,4367, 367,1249,2552,7493,3530,7494,4368,1283,3325,2004, 240, # 4438
1762,3326,4369,4370, 836,1069,3128, 474,7495,2148,2525, 268,3531,7496,3188,1521, # 4454
1284,7497,1658,1546,4116,7498,3532,3533,7499,4117,3327,2684,1685,4118, 961,1673, # 4470
2622, 190,2005,2200,3762,4371,4372,7500, 570,2497,3645,1490,7501,4373,2623,3260, # 4486
1956,4374, 584,1514, 396,1045,1944,7502,4375,1967,2444,7503,7504,4376,3910, 619, # 4502
7505,3129,3261, 215,2006,2783,2553,3189,4377,3190,4378, 763,4119,3763,4379,7506, # 4518
7507,1957,1767,2941,3328,3646,1174, 452,1477,4380,3329,3130,7508,2825,1253,2382, # 4534
2186,1091,2285,4120, 492,7509, 638,1169,1824,2135,1752,3911, 648, 926,1021,1324, # 4550
4381, 520,4382, 997, 847,1007, 892,4383,3764,2262,1871,3647,7510,2400,1784,4384, # 4566
1952,2942,3080,3191,1728,4121,2043,3648,4385,2007,1701,3131,1551, 30,2263,4122, # 4582
7511,2026,4386,3534,7512, 501,7513,4123, 594,3431,2165,1821,3535,3432,3536,3192, # 4598
829,2826,4124,7514,1680,3132,1225,4125,7515,3262,4387,4126,3133,2336,7516,4388, # 4614
4127,7517,3912,3913,7518,1847,2383,2596,3330,7519,4389, 374,3914, 652,4128,4129, # 4630
375,1140, 798,7520,7521,7522,2361,4390,2264, 546,1659, 138,3031,2445,4391,7523, # 4646
2250, 612,1848, 910, 796,3765,1740,1371, 825,3766,3767,7524,2906,2554,7525, 692, # 4662
444,3032,2624, 801,4392,4130,7526,1491, 244,1053,3033,4131,4132, 340,7527,3915, # 4678
1041,2987, 293,1168, 87,1357,7528,1539, 959,7529,2236, 721, 694,4133,3768, 219, # 4694
1478, 644,1417,3331,2656,1413,1401,1335,1389,3916,7530,7531,2988,2362,3134,1825, # 4710
730,1515, 184,2827, 66,4393,7532,1660,2943, 246,3332, 378,1457, 226,3433, 975, # 4726
3917,2944,1264,3537, 674, 696,7533, 163,7534,1141,2417,2166, 713,3538,3333,4394, # 4742
3918,7535,7536,1186, 15,7537,1079,1070,7538,1522,3193,3539, 276,1050,2716, 758, # 4758
1126, 653,2945,3263,7539,2337, 889,3540,3919,3081,2989, 903,1250,4395,3920,3434, # 4774
3541,1342,1681,1718, 766,3264, 286, 89,2946,3649,7540,1713,7541,2597,3334,2990, # 4790
7542,2947,2215,3194,2866,7543,4396,2498,2526, 181, 387,1075,3921, 731,2187,3335, # 4806
7544,3265, 310, 313,3435,2299, 770,4134, 54,3034, 189,4397,3082,3769,3922,7545, # 4822
1230,1617,1849, 355,3542,4135,4398,3336, 111,4136,3650,1350,3135,3436,3035,4137, # 4838
2149,3266,3543,7546,2784,3923,3924,2991, 722,2008,7547,1071, 247,1207,2338,2471, # 4854
1378,4399,2009, 864,1437,1214,4400, 373,3770,1142,2216, 667,4401, 442,2753,2555, # 4870
3771,3925,1968,4138,3267,1839, 837, 170,1107, 934,1336,1882,7548,7549,2118,4139, # 4886
2828, 743,1569,7550,4402,4140, 582,2384,1418,3437,7551,1802,7552, 357,1395,1729, # 4902
3651,3268,2418,1564,2237,7553,3083,3772,1633,4403,1114,2085,4141,1532,7554, 482, # 4918
2446,4404,7555,7556,1492, 833,1466,7557,2717,3544,1641,2829,7558,1526,1272,3652, # 4934
4142,1686,1794, 416,2556,1902,1953,1803,7559,3773,2785,3774,1159,2316,7560,2867, # 4950
4405,1610,1584,3036,2419,2754, 443,3269,1163,3136,7561,7562,3926,7563,4143,2499, # 4966
3037,4406,3927,3137,2103,1647,3545,2010,1872,4144,7564,4145, 431,3438,7565, 250, # 4982
97, 81,4146,7566,1648,1850,1558, 160, 848,7567, 866, 740,1694,7568,2201,2830, # 4998
3195,4147,4407,3653,1687, 950,2472, 426, 469,3196,3654,3655,3928,7569,7570,1188, # 5014
424,1995, 861,3546,4148,3775,2202,2685, 168,1235,3547,4149,7571,2086,1674,4408, # 5030
3337,3270, 220,2557,1009,7572,3776, 670,2992, 332,1208, 717,7573,7574,3548,2447, # 5046
3929,3338,7575, 513,7576,1209,2868,3339,3138,4409,1080,7577,7578,7579,7580,2527, # 5062
3656,3549, 815,1587,3930,3931,7581,3550,3439,3777,1254,4410,1328,3038,1390,3932, # 5078
1741,3933,3778,3934,7582, 236,3779,2448,3271,7583,7584,3657,3780,1273,3781,4411, # 5094
7585, 308,7586,4412, 245,4413,1851,2473,1307,2575, 430, 715,2136,2449,7587, 270, # 5110
199,2869,3935,7588,3551,2718,1753, 761,1754, 725,1661,1840,4414,3440,3658,7589, # 5126
7590, 587, 14,3272, 227,2598, 326, 480,2265, 943,2755,3552, 291, 650,1883,7591, # 5142
1702,1226, 102,1547, 62,3441, 904,4415,3442,1164,4150,7592,7593,1224,1548,2756, # 5158
391, 498,1493,7594,1386,1419,7595,2055,1177,4416, 813, 880,1081,2363, 566,1145, # 5174
4417,2286,1001,1035,2558,2599,2238, 394,1286,7596,7597,2068,7598, 86,1494,1730, # 5190
3936, 491,1588, 745, 897,2948, 843,3340,3937,2757,2870,3273,1768, 998,2217,2069, # 5206
397,1826,1195,1969,3659,2993,3341, 284,7599,3782,2500,2137,2119,1903,7600,3938, # 5222
2150,3939,4151,1036,3443,1904, 114,2559,4152, 209,1527,7601,7602,2949,2831,2625, # 5238
2385,2719,3139, 812,2560,7603,3274,7604,1559, 737,1884,3660,1210, 885, 28,2686, # 5254
3553,3783,7605,4153,1004,1779,4418,7606, 346,1981,2218,2687,4419,3784,1742, 797, # 5270
1642,3940,1933,1072,1384,2151, 896,3941,3275,3661,3197,2871,3554,7607,2561,1958, # 5286
4420,2450,1785,7608,7609,7610,3942,4154,1005,1308,3662,4155,2720,4421,4422,1528, # 5302
2600, 161,1178,4156,1982, 987,4423,1101,4157, 631,3943,1157,3198,2420,1343,1241, # 5318
1016,2239,2562, 372, 877,2339,2501,1160, 555,1934, 911,3944,7611, 466,1170, 169, # 5334
1051,2907,2688,3663,2474,2994,1182,2011,2563,1251,2626,7612, 992,2340,3444,1540, # 5350
2721,1201,2070,2401,1996,2475,7613,4424, 528,1922,2188,1503,1873,1570,2364,3342, # 5366
3276,7614, 557,1073,7615,1827,3445,2087,2266,3140,3039,3084, 767,3085,2786,4425, # 5382
1006,4158,4426,2341,1267,2176,3664,3199, 778,3945,3200,2722,1597,2657,7616,4427, # 5398
7617,3446,7618,7619,7620,3277,2689,1433,3278, 131, 95,1504,3946, 723,4159,3141, # 5414
1841,3555,2758,2189,3947,2027,2104,3665,7621,2995,3948,1218,7622,3343,3201,3949, # 5430
4160,2576, 248,1634,3785, 912,7623,2832,3666,3040,3786, 654, 53,7624,2996,7625, # 5446
1688,4428, 777,3447,1032,3950,1425,7626, 191, 820,2120,2833, 971,4429, 931,3202, # 5462
135, 664, 783,3787,1997, 772,2908,1935,3951,3788,4430,2909,3203, 282,2723, 640, # 5478
1372,3448,1127, 922, 325,3344,7627,7628, 711,2044,7629,7630,3952,2219,2787,1936, # 5494
3953,3345,2220,2251,3789,2300,7631,4431,3790,1258,3279,3954,3204,2138,2950,3955, # 5510
3956,7632,2221, 258,3205,4432, 101,1227,7633,3280,1755,7634,1391,3281,7635,2910, # 5526
2056, 893,7636,7637,7638,1402,4161,2342,7639,7640,3206,3556,7641,7642, 878,1325, # 5542
1780,2788,4433, 259,1385,2577, 744,1183,2267,4434,7643,3957,2502,7644, 684,1024, # 5558
4162,7645, 472,3557,3449,1165,3282,3958,3959, 322,2152, 881, 455,1695,1152,1340, # 5574
660, 554,2153,4435,1058,4436,4163, 830,1065,3346,3960,4437,1923,7646,1703,1918, # 5590
7647, 932,2268, 122,7648,4438, 947, 677,7649,3791,2627, 297,1905,1924,2269,4439, # 5606
2317,3283,7650,7651,4164,7652,4165, 84,4166, 112, 989,7653, 547,1059,3961, 701, # 5622
3558,1019,7654,4167,7655,3450, 942, 639, 457,2301,2451, 993,2951, 407, 851, 494, # 5638
4440,3347, 927,7656,1237,7657,2421,3348, 573,4168, 680, 921,2911,1279,1874, 285, # 5654
790,1448,1983, 719,2167,7658,7659,4441,3962,3963,1649,7660,1541, 563,7661,1077, # 5670
7662,3349,3041,3451, 511,2997,3964,3965,3667,3966,1268,2564,3350,3207,4442,4443, # 5686
7663, 535,1048,1276,1189,2912,2028,3142,1438,1373,2834,2952,1134,2012,7664,4169, # 5702
1238,2578,3086,1259,7665, 700,7666,2953,3143,3668,4170,7667,4171,1146,1875,1906, # 5718
4444,2601,3967, 781,2422, 132,1589, 203, 147, 273,2789,2402, 898,1786,2154,3968, # 5734
3969,7668,3792,2790,7669,7670,4445,4446,7671,3208,7672,1635,3793, 965,7673,1804, # 5750
2690,1516,3559,1121,1082,1329,3284,3970,1449,3794, 65,1128,2835,2913,2759,1590, # 5766
3795,7674,7675, 12,2658, 45, 976,2579,3144,4447, 517,2528,1013,1037,3209,7676, # 5782
3796,2836,7677,3797,7678,3452,7679,2602, 614,1998,2318,3798,3087,2724,2628,7680, # 5798
2580,4172, 599,1269,7681,1810,3669,7682,2691,3088, 759,1060, 489,1805,3351,3285, # 5814
1358,7683,7684,2386,1387,1215,2629,2252, 490,7685,7686,4173,1759,2387,2343,7687, # 5830
4448,3799,1907,3971,2630,1806,3210,4449,3453,3286,2760,2344, 874,7688,7689,3454, # 5846
3670,1858, 91,2914,3671,3042,3800,4450,7690,3145,3972,2659,7691,3455,1202,1403, # 5862
3801,2954,2529,1517,2503,4451,3456,2504,7692,4452,7693,2692,1885,1495,1731,3973, # 5878
2365,4453,7694,2029,7695,7696,3974,2693,1216, 237,2581,4174,2319,3975,3802,4454, # 5894
4455,2694,3560,3457, 445,4456,7697,7698,7699,7700,2761, 61,3976,3672,1822,3977, # 5910
7701, 687,2045, 935, 925, 405,2660, 703,1096,1859,2725,4457,3978,1876,1367,2695, # 5926
3352, 918,2105,1781,2476, 334,3287,1611,1093,4458, 564,3146,3458,3673,3353, 945, # 5942
2631,2057,4459,7702,1925, 872,4175,7703,3459,2696,3089, 349,4176,3674,3979,4460, # 5958
3803,4177,3675,2155,3980,4461,4462,4178,4463,2403,2046, 782,3981, 400, 251,4179, # 5974
1624,7704,7705, 277,3676, 299,1265, 476,1191,3804,2121,4180,4181,1109, 205,7706, # 5990
2582,1000,2156,3561,1860,7707,7708,7709,4464,7710,4465,2565, 107,2477,2157,3982, # 6006
3460,3147,7711,1533, 541,1301, 158, 753,4182,2872,3562,7712,1696, 370,1088,4183, # 6022
4466,3563, 579, 327, 440, 162,2240, 269,1937,1374,3461, 968,3043, 56,1396,3090, # 6038
2106,3288,3354,7713,1926,2158,4467,2998,7714,3564,7715,7716,3677,4468,2478,7717, # 6054
2791,7718,1650,4469,7719,2603,7720,7721,3983,2661,3355,1149,3356,3984,3805,3985, # 6070
7722,1076, 49,7723, 951,3211,3289,3290, 450,2837, 920,7724,1811,2792,2366,4184, # 6086
1908,1138,2367,3806,3462,7725,3212,4470,1909,1147,1518,2423,4471,3807,7726,4472, # 6102
2388,2604, 260,1795,3213,7727,7728,3808,3291, 708,7729,3565,1704,7730,3566,1351, # 6118
1618,3357,2999,1886, 944,4185,3358,4186,3044,3359,4187,7731,3678, 422, 413,1714, # 6134
3292, 500,2058,2345,4188,2479,7732,1344,1910, 954,7733,1668,7734,7735,3986,2404, # 6150
4189,3567,3809,4190,7736,2302,1318,2505,3091, 133,3092,2873,4473, 629, 31,2838, # 6166
2697,3810,4474, 850, 949,4475,3987,2955,1732,2088,4191,1496,1852,7737,3988, 620, # 6182
3214, 981,1242,3679,3360,1619,3680,1643,3293,2139,2452,1970,1719,3463,2168,7738, # 6198
3215,7739,7740,3361,1828,7741,1277,4476,1565,2047,7742,1636,3568,3093,7743, 869, # 6214
2839, 655,3811,3812,3094,3989,3000,3813,1310,3569,4477,7744,7745,7746,1733, 558, # 6230
4478,3681, 335,1549,3045,1756,4192,3682,1945,3464,1829,1291,1192, 470,2726,2107, # 6246
2793, 913,1054,3990,7747,1027,7748,3046,3991,4479, 982,2662,3362,3148,3465,3216, # 6262
3217,1946,2794,7749, 571,4480,7750,1830,7751,3570,2583,1523,2424,7752,2089, 984, # 6278
4481,3683,1959,7753,3684, 852, 923,2795,3466,3685, 969,1519, 999,2048,2320,1705, # 6294
7754,3095, 615,1662, 151, 597,3992,2405,2321,1049, 275,4482,3686,4193, 568,3687, # 6310
3571,2480,4194,3688,7755,2425,2270, 409,3218,7756,1566,2874,3467,1002, 769,2840, # 6326
194,2090,3149,3689,2222,3294,4195, 628,1505,7757,7758,1763,2177,3001,3993, 521, # 6342
1161,2584,1787,2203,2406,4483,3994,1625,4196,4197, 412, 42,3096, 464,7759,2632, # 6358
4484,3363,1760,1571,2875,3468,2530,1219,2204,3814,2633,2140,2368,4485,4486,3295, # 6374
1651,3364,3572,7760,7761,3573,2481,3469,7762,3690,7763,7764,2271,2091, 460,7765, # 6390
4487,7766,3002, 962, 588,3574, 289,3219,2634,1116, 52,7767,3047,1796,7768,7769, # 6406
7770,1467,7771,1598,1143,3691,4198,1984,1734,1067,4488,1280,3365, 465,4489,1572, # 6422
510,7772,1927,2241,1812,1644,3575,7773,4490,3692,7774,7775,2663,1573,1534,7776, # 6438
7777,4199, 536,1807,1761,3470,3815,3150,2635,7778,7779,7780,4491,3471,2915,1911, # 6454
2796,7781,3296,1122, 377,3220,7782, 360,7783,7784,4200,1529, 551,7785,2059,3693, # 6470
1769,2426,7786,2916,4201,3297,3097,2322,2108,2030,4492,1404, 136,1468,1479, 672, # 6486
1171,3221,2303, 271,3151,7787,2762,7788,2049, 678,2727, 865,1947,4493,7789,2013, # 6502
3995,2956,7790,2728,2223,1397,3048,3694,4494,4495,1735,2917,3366,3576,7791,3816, # 6518
509,2841,2453,2876,3817,7792,7793,3152,3153,4496,4202,2531,4497,2304,1166,1010, # 6534
552, 681,1887,7794,7795,2957,2958,3996,1287,1596,1861,3154, 358, 453, 736, 175, # 6550
478,1117, 905,1167,1097,7796,1853,1530,7797,1706,7798,2178,3472,2287,3695,3473, # 6566
3577,4203,2092,4204,7799,3367,1193,2482,4205,1458,2190,2205,1862,1888,1421,3298, # 6582
2918,3049,2179,3474, 595,2122,7800,3997,7801,7802,4206,1707,2636, 223,3696,1359, # 6598
751,3098, 183,3475,7803,2797,3003, 419,2369, 633, 704,3818,2389, 241,7804,7805, # 6614
7806, 838,3004,3697,2272,2763,2454,3819,1938,2050,3998,1309,3099,2242,1181,7807, # 6630
1136,2206,3820,2370,1446,4207,2305,4498,7808,7809,4208,1055,2605, 484,3698,7810, # 6646
3999, 625,4209,2273,3368,1499,4210,4000,7811,4001,4211,3222,2274,2275,3476,7812, # 6662
7813,2764, 808,2606,3699,3369,4002,4212,3100,2532, 526,3370,3821,4213, 955,7814, # 6678
1620,4214,2637,2427,7815,1429,3700,1669,1831, 994, 928,7816,3578,1260,7817,7818, # 6694
7819,1948,2288, 741,2919,1626,4215,2729,2455, 867,1184, 362,3371,1392,7820,7821, # 6710
4003,4216,1770,1736,3223,2920,4499,4500,1928,2698,1459,1158,7822,3050,3372,2877, # 6726
1292,1929,2506,2842,3701,1985,1187,2071,2014,2607,4217,7823,2566,2507,2169,3702, # 6742
2483,3299,7824,3703,4501,7825,7826, 666,1003,3005,1022,3579,4218,7827,4502,1813, # 6758
2253, 574,3822,1603, 295,1535, 705,3823,4219, 283, 858, 417,7828,7829,3224,4503, # 6774
4504,3051,1220,1889,1046,2276,2456,4004,1393,1599, 689,2567, 388,4220,7830,2484, # 6790
802,7831,2798,3824,2060,1405,2254,7832,4505,3825,2109,1052,1345,3225,1585,7833, # 6806
809,7834,7835,7836, 575,2730,3477, 956,1552,1469,1144,2323,7837,2324,1560,2457, # 6822
3580,3226,4005, 616,2207,3155,2180,2289,7838,1832,7839,3478,4506,7840,1319,3704, # 6838
3705,1211,3581,1023,3227,1293,2799,7841,7842,7843,3826, 607,2306,3827, 762,2878, # 6854
1439,4221,1360,7844,1485,3052,7845,4507,1038,4222,1450,2061,2638,4223,1379,4508, # 6870
2585,7846,7847,4224,1352,1414,2325,2921,1172,7848,7849,3828,3829,7850,1797,1451, # 6886
7851,7852,7853,7854,2922,4006,4007,2485,2346, 411,4008,4009,3582,3300,3101,4509, # 6902
1561,2664,1452,4010,1375,7855,7856, 47,2959, 316,7857,1406,1591,2923,3156,7858, # 6918
1025,2141,3102,3157, 354,2731, 884,2224,4225,2407, 508,3706, 726,3583, 996,2428, # 6934
3584, 729,7859, 392,2191,1453,4011,4510,3707,7860,7861,2458,3585,2608,1675,2800, # 6950
919,2347,2960,2348,1270,4511,4012, 73,7862,7863, 647,7864,3228,2843,2255,1550, # 6966
1346,3006,7865,1332, 883,3479,7866,7867,7868,7869,3301,2765,7870,1212, 831,1347, # 6982
4226,4512,2326,3830,1863,3053, 720,3831,4513,4514,3832,7871,4227,7872,7873,4515, # 6998
7874,7875,1798,4516,3708,2609,4517,3586,1645,2371,7876,7877,2924, 669,2208,2665, # 7014
2429,7878,2879,7879,7880,1028,3229,7881,4228,2408,7882,2256,1353,7883,7884,4518, # 7030
3158, 518,7885,4013,7886,4229,1960,7887,2142,4230,7888,7889,3007,2349,2350,3833, # 7046
516,1833,1454,4014,2699,4231,4519,2225,2610,1971,1129,3587,7890,2766,7891,2961, # 7062
1422, 577,1470,3008,1524,3373,7892,7893, 432,4232,3054,3480,7894,2586,1455,2508, # 7078
2226,1972,1175,7895,1020,2732,4015,3481,4520,7896,2733,7897,1743,1361,3055,3482, # 7094
2639,4016,4233,4521,2290, 895, 924,4234,2170, 331,2243,3056, 166,1627,3057,1098, # 7110
7898,1232,2880,2227,3374,4522, 657, 403,1196,2372, 542,3709,3375,1600,4235,3483, # 7126
7899,4523,2767,3230, 576, 530,1362,7900,4524,2533,2666,3710,4017,7901, 842,3834, # 7142
7902,2801,2031,1014,4018, 213,2700,3376, 665, 621,4236,7903,3711,2925,2430,7904, # 7158
2431,3302,3588,3377,7905,4237,2534,4238,4525,3589,1682,4239,3484,1380,7906, 724, # 7174
2277, 600,1670,7907,1337,1233,4526,3103,2244,7908,1621,4527,7909, 651,4240,7910, # 7190
1612,4241,2611,7911,2844,7912,2734,2307,3058,7913, 716,2459,3059, 174,1255,2701, # 7206
4019,3590, 548,1320,1398, 728,4020,1574,7914,1890,1197,3060,4021,7915,3061,3062, # 7222
3712,3591,3713, 747,7916, 635,4242,4528,7917,7918,7919,4243,7920,7921,4529,7922, # 7238
3378,4530,2432, 451,7923,3714,2535,2072,4244,2735,4245,4022,7924,1764,4531,7925, # 7254
4246, 350,7926,2278,2390,2486,7927,4247,4023,2245,1434,4024, 488,4532, 458,4248, # 7270
4025,3715, 771,1330,2391,3835,2568,3159,2159,2409,1553,2667,3160,4249,7928,2487, # 7286
2881,2612,1720,2702,4250,3379,4533,7929,2536,4251,7930,3231,4252,2768,7931,2015, # 7302
2736,7932,1155,1017,3716,3836,7933,3303,2308, 201,1864,4253,1430,7934,4026,7935, # 7318
7936,7937,7938,7939,4254,1604,7940, 414,1865, 371,2587,4534,4535,3485,2016,3104, # 7334
4536,1708, 960,4255, 887, 389,2171,1536,1663,1721,7941,2228,4027,2351,2926,1580, # 7350
7942,7943,7944,1744,7945,2537,4537,4538,7946,4539,7947,2073,7948,7949,3592,3380, # 7366
2882,4256,7950,4257,2640,3381,2802, 673,2703,2460, 709,3486,4028,3593,4258,7951, # 7382
1148, 502, 634,7952,7953,1204,4540,3594,1575,4541,2613,3717,7954,3718,3105, 948, # 7398
3232, 121,1745,3837,1110,7955,4259,3063,2509,3009,4029,3719,1151,1771,3838,1488, # 7414
4030,1986,7956,2433,3487,7957,7958,2093,7959,4260,3839,1213,1407,2803, 531,2737, # 7430
2538,3233,1011,1537,7960,2769,4261,3106,1061,7961,3720,3721,1866,2883,7962,2017, # 7446
120,4262,4263,2062,3595,3234,2309,3840,2668,3382,1954,4542,7963,7964,3488,1047, # 7462
2704,1266,7965,1368,4543,2845, 649,3383,3841,2539,2738,1102,2846,2669,7966,7967, # 7478
1999,7968,1111,3596,2962,7969,2488,3842,3597,2804,1854,3384,3722,7970,7971,3385, # 7494
2410,2884,3304,3235,3598,7972,2569,7973,3599,2805,4031,1460, 856,7974,3600,7975, # 7510
2885,2963,7976,2886,3843,7977,4264, 632,2510, 875,3844,1697,3845,2291,7978,7979, # 7526
4544,3010,1239, 580,4545,4265,7980, 914, 936,2074,1190,4032,1039,2123,7981,7982, # 7542
7983,3386,1473,7984,1354,4266,3846,7985,2172,3064,4033, 915,3305,4267,4268,3306, # 7558
1605,1834,7986,2739, 398,3601,4269,3847,4034, 328,1912,2847,4035,3848,1331,4270, # 7574
3011, 937,4271,7987,3602,4036,4037,3387,2160,4546,3388, 524, 742, 538,3065,1012, # 7590
7988,7989,3849,2461,7990, 658,1103, 225,3850,7991,7992,4547,7993,4548,7994,3236, # 7606
1243,7995,4038, 963,2246,4549,7996,2705,3603,3161,7997,7998,2588,2327,7999,4550, # 7622
8000,8001,8002,3489,3307, 957,3389,2540,2032,1930,2927,2462, 870,2018,3604,1746, # 7638
2770,2771,2434,2463,8003,3851,8004,3723,3107,3724,3490,3390,3725,8005,1179,3066, # 7654
8006,3162,2373,4272,3726,2541,3163,3108,2740,4039,8007,3391,1556,2542,2292, 977, # 7670
2887,2033,4040,1205,3392,8008,1765,3393,3164,2124,1271,1689, 714,4551,3491,8009, # 7686
2328,3852, 533,4273,3605,2181, 617,8010,2464,3308,3492,2310,8011,8012,3165,8013, # 7702
8014,3853,1987, 618, 427,2641,3493,3394,8015,8016,1244,1690,8017,2806,4274,4552, # 7718
8018,3494,8019,8020,2279,1576, 473,3606,4275,3395, 972,8021,3607,8022,3067,8023, # 7734
8024,4553,4554,8025,3727,4041,4042,8026, 153,4555, 356,8027,1891,2888,4276,2143, # 7750
408, 803,2352,8028,3854,8029,4277,1646,2570,2511,4556,4557,3855,8030,3856,4278, # 7766
8031,2411,3396, 752,8032,8033,1961,2964,8034, 746,3012,2465,8035,4279,3728, 698, # 7782
4558,1892,4280,3608,2543,4559,3609,3857,8036,3166,3397,8037,1823,1302,4043,2706, # 7798
3858,1973,4281,8038,4282,3167, 823,1303,1288,1236,2848,3495,4044,3398, 774,3859, # 7814
8039,1581,4560,1304,2849,3860,4561,8040,2435,2161,1083,3237,4283,4045,4284, 344, # 7830
1173, 288,2311, 454,1683,8041,8042,1461,4562,4046,2589,8043,8044,4563, 985, 894, # 7846
8045,3399,3168,8046,1913,2928,3729,1988,8047,2110,1974,8048,4047,8049,2571,1194, # 7862
425,8050,4564,3169,1245,3730,4285,8051,8052,2850,8053, 636,4565,1855,3861, 760, # 7878
1799,8054,4286,2209,1508,4566,4048,1893,1684,2293,8055,8056,8057,4287,4288,2210, # 7894
479,8058,8059, 832,8060,4049,2489,8061,2965,2490,3731, 990,3109, 627,1814,2642, # 7910
4289,1582,4290,2125,2111,3496,4567,8062, 799,4291,3170,8063,4568,2112,1737,3013, # 7926
1018, 543, 754,4292,3309,1676,4569,4570,4050,8064,1489,8065,3497,8066,2614,2889, # 7942
4051,8067,8068,2966,8069,8070,8071,8072,3171,4571,4572,2182,1722,8073,3238,3239, # 7958
1842,3610,1715, 481, 365,1975,1856,8074,8075,1962,2491,4573,8076,2126,3611,3240, # 7974
433,1894,2063,2075,8077, 602,2741,8078,8079,8080,8081,8082,3014,1628,3400,8083, # 7990
3172,4574,4052,2890,4575,2512,8084,2544,2772,8085,8086,8087,3310,4576,2891,8088, # 8006
4577,8089,2851,4578,4579,1221,2967,4053,2513,8090,8091,8092,1867,1989,8093,8094, # 8022
8095,1895,8096,8097,4580,1896,4054, 318,8098,2094,4055,4293,8099,8100, 485,8101, # 8038
938,3862, 553,2670, 116,8102,3863,3612,8103,3498,2671,2773,3401,3311,2807,8104, # 8054
3613,2929,4056,1747,2930,2968,8105,8106, 207,8107,8108,2672,4581,2514,8109,3015, # 8070
890,3614,3864,8110,1877,3732,3402,8111,2183,2353,3403,1652,8112,8113,8114, 941, # 8086
2294, 208,3499,4057,2019, 330,4294,3865,2892,2492,3733,4295,8115,8116,8117,8118, # 8102
1, 1800, 1506, 255, 1431, 198, 9, 82, 6, 7310, 177, 202, 3615, 1256, 2808, 110, # 2742
3735, 33, 3241, 261, 76, 44, 2113, 16, 2931, 2184, 1176, 659, 3868, 26, 3404, 2643, # 2758
1198, 3869, 3313, 4060, 410, 2211, 302, 590, 361, 1963, 8, 204, 58, 4296, 7311, 1931, # 2774
63, 7312, 7313, 317, 1614, 75, 222, 159, 4061, 2412, 1480, 7314, 3500, 3068, 224, 2809, # 2790
3616, 3, 10, 3870, 1471, 29, 2774, 1135, 2852, 1939, 873, 130, 3242, 1123, 312, 7315, # 2806
4297, 2051, 507, 252, 682, 7316, 142, 1914, 124, 206, 2932, 34, 3501, 3173, 64, 604, # 2822
7317, 2494, 1976, 1977, 155, 1990, 645, 641, 1606, 7318, 3405, 337, 72, 406, 7319, 80, # 2838
630, 238, 3174, 1509, 263, 939, 1092, 2644, 756, 1440, 1094, 3406, 449, 69, 2969, 591, # 2854
179, 2095, 471, 115, 2034, 1843, 60, 50, 2970, 134, 806, 1868, 734, 2035, 3407, 180, # 2870
995, 1607, 156, 537, 2893, 688, 7320, 319, 1305, 779, 2144, 514, 2374, 298, 4298, 359, # 2886
2495, 90, 2707, 1338, 663, 11, 906, 1099, 2545, 20, 2436, 182, 532, 1716, 7321, 732, # 2902
1376, 4062, 1311, 1420, 3175, 25, 2312, 1056, 113, 399, 382, 1949, 242, 3408, 2467, 529, # 2918
3243, 475, 1447, 3617, 7322, 117, 21, 656, 810, 1297, 2295, 2329, 3502, 7323, 126, 4063, # 2934
706, 456, 150, 613, 4299, 71, 1118, 2036, 4064, 145, 3069, 85, 835, 486, 2114, 1246, # 2950
1426, 428, 727, 1285, 1015, 800, 106, 623, 303, 1281, 7324, 2127, 2354, 347, 3736, 221, # 2966
3503, 3110, 7325, 1955, 1153, 4065, 83, 296, 1199, 3070, 192, 624, 93, 7326, 822, 1897, # 2982
2810, 3111, 795, 2064, 991, 1554, 1542, 1592, 27, 43, 2853, 859, 139, 1456, 860, 4300, # 2998
437, 712, 3871, 164, 2392, 3112, 695, 211, 3017, 2096, 195, 3872, 1608, 3504, 3505, 3618, # 3014
3873, 234, 811, 2971, 2097, 3874, 2229, 1441, 3506, 1615, 2375, 668, 2076, 1638, 305, 228, # 3030
1664, 4301, 467, 415, 7327, 262, 2098, 1593, 239, 108, 300, 200, 1033, 512, 1247, 2077, # 3046
7328, 7329, 2173, 3176, 3619, 2673, 593, 845, 1062, 3244, 88, 1723, 2037, 3875, 1950, 212, # 3062
266, 152, 149, 468, 1898, 4066, 4302, 77, 187, 7330, 3018, 37, 5, 2972, 7331, 3876, # 3078
7332, 7333, 39, 2517, 4303, 2894, 3177, 2078, 55, 148, 74, 4304, 545, 483, 1474, 1029, # 3094
1665, 217, 1869, 1531, 3113, 1104, 2645, 4067, 24, 172, 3507, 900, 3877, 3508, 3509, 4305, # 3110
32, 1408, 2811, 1312, 329, 487, 2355, 2247, 2708, 784, 2674, 4, 3019, 3314, 1427, 1788, # 3126
188, 109, 499, 7334, 3620, 1717, 1789, 888, 1217, 3020, 4306, 7335, 3510, 7336, 3315, 1520, # 3142
3621, 3878, 196, 1034, 775, 7337, 7338, 929, 1815, 249, 439, 38, 7339, 1063, 7340, 794, # 3158
3879, 1435, 2296, 46, 178, 3245, 2065, 7341, 2376, 7342, 214, 1709, 4307, 804, 35, 707, # 3174
324, 3622, 1601, 2546, 140, 459, 4068, 7343, 7344, 1365, 839, 272, 978, 2257, 2572, 3409, # 3190
2128, 1363, 3623, 1423, 697, 100, 3071, 48, 70, 1231, 495, 3114, 2193, 7345, 1294, 7346, # 3206
2079, 462, 586, 1042, 3246, 853, 256, 988, 185, 2377, 3410, 1698, 434, 1084, 7347, 3411, # 3222
314, 2615, 2775, 4308, 2330, 2331, 569, 2280, 637, 1816, 2518, 757, 1162, 1878, 1616, 3412, # 3238
287, 1577, 2115, 768, 4309, 1671, 2854, 3511, 2519, 1321, 3737, 909, 2413, 7348, 4069, 933, # 3254
3738, 7349, 2052, 2356, 1222, 4310, 765, 2414, 1322, 786, 4311, 7350, 1919, 1462, 1677, 2895, # 3270
1699, 7351, 4312, 1424, 2437, 3115, 3624, 2590, 3316, 1774, 1940, 3413, 3880, 4070, 309, 1369, # 3286
1130, 2812, 364, 2230, 1653, 1299, 3881, 3512, 3882, 3883, 2646, 525, 1085, 3021, 902, 2000, # 3302
1475, 964, 4313, 421, 1844, 1415, 1057, 2281, 940, 1364, 3116, 376, 4314, 4315, 1381, 7, # 3318
2520, 983, 2378, 336, 1710, 2675, 1845, 321, 3414, 559, 1131, 3022, 2742, 1808, 1132, 1313, # 3334
265, 1481, 1857, 7352, 352, 1203, 2813, 3247, 167, 1089, 420, 2814, 776, 792, 1724, 3513, # 3350
4071, 2438, 3248, 7353, 4072, 7354, 446, 229, 333, 2743, 901, 3739, 1200, 1557, 4316, 2647, # 3366
1920, 395, 2744, 2676, 3740, 4073, 1835, 125, 916, 3178, 2616, 4317, 7355, 7356, 3741, 7357, # 3382
7358, 7359, 4318, 3117, 3625, 1133, 2547, 1757, 3415, 1510, 2313, 1409, 3514, 7360, 2145, 438, # 3398
2591, 2896, 2379, 3317, 1068, 958, 3023, 461, 311, 2855, 2677, 4074, 1915, 3179, 4075, 1978, # 3414
383, 750, 2745, 2617, 4076, 274, 539, 385, 1278, 1442, 7361, 1154, 1964, 384, 561, 210, # 3430
98, 1295, 2548, 3515, 7362, 1711, 2415, 1482, 3416, 3884, 2897, 1257, 129, 7363, 3742, 642, # 3446
523, 2776, 2777, 2648, 7364, 141, 2231, 1333, 68, 176, 441, 876, 907, 4077, 603, 2592, # 3462
710, 171, 3417, 404, 549, 18, 3118, 2393, 1410, 3626, 1666, 7365, 3516, 4319, 2898, 4320, # 3478
7366, 2973, 368, 7367, 146, 366, 99, 871, 3627, 1543, 748, 807, 1586, 1185, 22, 2258, # 3494
379, 3743, 3180, 7368, 3181, 505, 1941, 2618, 1991, 1382, 2314, 7369, 380, 2357, 218, 702, # 3510
1817, 1248, 3418, 3024, 3517, 3318, 3249, 7370, 2974, 3628, 930, 3250, 3744, 7371, 59, 7372, # 3526
585, 601, 4078, 497, 3419, 1112, 1314, 4321, 1801, 7373, 1223, 1472, 2174, 7374, 749, 1836, # 3542
690, 1899, 3745, 1772, 3885, 1476, 429, 1043, 1790, 2232, 2116, 917, 4079, 447, 1086, 1629, # 3558
7375, 556, 7376, 7377, 2020, 1654, 844, 1090, 105, 550, 966, 1758, 2815, 1008, 1782, 686, # 3574
1095, 7378, 2282, 793, 1602, 7379, 3518, 2593, 4322, 4080, 2933, 2297, 4323, 3746, 980, 2496, # 3590
544, 353, 527, 4324, 908, 2678, 2899, 7380, 381, 2619, 1942, 1348, 7381, 1341, 1252, 560, # 3606
3072, 7382, 3420, 2856, 7383, 2053, 973, 886, 2080, 143, 4325, 7384, 7385, 157, 3886, 496, # 3622
4081, 57, 840, 540, 2038, 4326, 4327, 3421, 2117, 1445, 970, 2259, 1748, 1965, 2081, 4082, # 3638
3119, 1234, 1775, 3251, 2816, 3629, 773, 1206, 2129, 1066, 2039, 1326, 3887, 1738, 1725, 4083, # 3654
279, 3120, 51, 1544, 2594, 423, 1578, 2130, 2066, 173, 4328, 1879, 7386, 7387, 1583, 264, # 3670
610, 3630, 4329, 2439, 280, 154, 7388, 7389, 7390, 1739, 338, 1282, 3073, 693, 2857, 1411, # 3686
1074, 3747, 2440, 7391, 4330, 7392, 7393, 1240, 952, 2394, 7394, 2900, 1538, 2679, 685, 1483, # 3702
4084, 2468, 1436, 953, 4085, 2054, 4331, 671, 2395, 79, 4086, 2441, 3252, 608, 567, 2680, # 3718
3422, 4087, 4088, 1691, 393, 1261, 1791, 2396, 7395, 4332, 7396, 7397, 7398, 7399, 1383, 1672, # 3734
3748, 3182, 1464, 522, 1119, 661, 1150, 216, 675, 4333, 3888, 1432, 3519, 609, 4334, 2681, # 3750
2397, 7400, 7401, 7402, 4089, 3025, 0, 7403, 2469, 315, 231, 2442, 301, 3319, 4335, 2380, # 3766
7404, 233, 4090, 3631, 1818, 4336, 4337, 7405, 96, 1776, 1315, 2082, 7406, 257, 7407, 1809, # 3782
3632, 2709, 1139, 1819, 4091, 2021, 1124, 2163, 2778, 1777, 2649, 7408, 3074, 363, 1655, 3183, # 3798
7409, 2975, 7410, 7411, 7412, 3889, 1567, 3890, 718, 103, 3184, 849, 1443, 341, 3320, 2934, # 3814
1484, 7413, 1712, 127, 67, 339, 4092, 2398, 679, 1412, 821, 7414, 7415, 834, 738, 351, # 3830
2976, 2146, 846, 235, 1497, 1880, 418, 1992, 3749, 2710, 186, 1100, 2147, 2746, 3520, 1545, # 3846
1355, 2935, 2858, 1377, 583, 3891, 4093, 2573, 2977, 7416, 1298, 3633, 1078, 2549, 3634, 2358, # 3862
78, 3750, 3751, 267, 1289, 2099, 2001, 1594, 4094, 348, 369, 1274, 2194, 2175, 1837, 4338, # 3878
1820, 2817, 3635, 2747, 2283, 2002, 4339, 2936, 2748, 144, 3321, 882, 4340, 3892, 2749, 3423, # 3894
4341, 2901, 7417, 4095, 1726, 320, 7418, 3893, 3026, 788, 2978, 7419, 2818, 1773, 1327, 2859, # 3910
3894, 2819, 7420, 1306, 4342, 2003, 1700, 3752, 3521, 2359, 2650, 787, 2022, 506, 824, 3636, # 3926
534, 323, 4343, 1044, 3322, 2023, 1900, 946, 3424, 7421, 1778, 1500, 1678, 7422, 1881, 4344, # 3942
165, 243, 4345, 3637, 2521, 123, 683, 4096, 764, 4346, 36, 3895, 1792, 589, 2902, 816, # 3958
626, 1667, 3027, 2233, 1639, 1555, 1622, 3753, 3896, 7423, 3897, 2860, 1370, 1228, 1932, 891, # 3974
2083, 2903, 304, 4097, 7424, 292, 2979, 2711, 3522, 691, 2100, 4098, 1115, 4347, 118, 662, # 3990
7425, 611, 1156, 854, 2381, 1316, 2861, 2, 386, 515, 2904, 7426, 7427, 3253, 868, 2234, # 4006
1486, 855, 2651, 785, 2212, 3028, 7428, 1040, 3185, 3523, 7429, 3121, 448, 7430, 1525, 7431, # 4022
2164, 4348, 7432, 3754, 7433, 4099, 2820, 3524, 3122, 503, 818, 3898, 3123, 1568, 814, 676, # 4038
1444, 306, 1749, 7434, 3755, 1416, 1030, 197, 1428, 805, 2821, 1501, 4349, 7435, 7436, 7437, # 4054
1993, 7438, 4350, 7439, 7440, 2195, 13, 2779, 3638, 2980, 3124, 1229, 1916, 7441, 3756, 2131, # 4070
7442, 4100, 4351, 2399, 3525, 7443, 2213, 1511, 1727, 1120, 7444, 7445, 646, 3757, 2443, 307, # 4086
7446, 7447, 1595, 3186, 7448, 7449, 7450, 3639, 1113, 1356, 3899, 1465, 2522, 2523, 7451, 519, # 4102
7452, 128, 2132, 92, 2284, 1979, 7453, 3900, 1512, 342, 3125, 2196, 7454, 2780, 2214, 1980, # 4118
3323, 7455, 290, 1656, 1317, 789, 827, 2360, 7456, 3758, 4352, 562, 581, 3901, 7457, 401, # 4134
4353, 2248, 94, 4354, 1399, 2781, 7458, 1463, 2024, 4355, 3187, 1943, 7459, 828, 1105, 4101, # 4150
1262, 1394, 7460, 4102, 605, 4356, 7461, 1783, 2862, 7462, 2822, 819, 2101, 578, 2197, 2937, # 4166
7463, 1502, 436, 3254, 4103, 3255, 2823, 3902, 2905, 3425, 3426, 7464, 2712, 2315, 7465, 7466, # 4182
2332, 2067, 23, 4357, 193, 826, 3759, 2102, 699, 1630, 4104, 3075, 390, 1793, 1064, 3526, # 4198
7467, 1579, 3076, 3077, 1400, 7468, 4105, 1838, 1640, 2863, 7469, 4358, 4359, 137, 4106, 598, # 4214
3078, 1966, 780, 104, 974, 2938, 7470, 278, 899, 253, 402, 572, 504, 493, 1339, 7471, # 4230
3903, 1275, 4360, 2574, 2550, 7472, 3640, 3029, 3079, 2249, 565, 1334, 2713, 863, 41, 7473, # 4246
7474, 4361, 7475, 1657, 2333, 19, 463, 2750, 4107, 606, 7476, 2981, 3256, 1087, 2084, 1323, # 4262
2652, 2982, 7477, 1631, 1623, 1750, 4108, 2682, 7478, 2864, 791, 2714, 2653, 2334, 232, 2416, # 4278
7479, 2983, 1498, 7480, 2654, 2620, 755, 1366, 3641, 3257, 3126, 2025, 1609, 119, 1917, 3427, # 4294
862, 1026, 4109, 7481, 3904, 3760, 4362, 3905, 4363, 2260, 1951, 2470, 7482, 1125, 817, 4110, # 4310
4111, 3906, 1513, 1766, 2040, 1487, 4112, 3030, 3258, 2824, 3761, 3127, 7483, 7484, 1507, 7485, # 4326
2683, 733, 40, 1632, 1106, 2865, 345, 4113, 841, 2524, 230, 4364, 2984, 1846, 3259, 3428, # 4342
7486, 1263, 986, 3429, 7487, 735, 879, 254, 1137, 857, 622, 1300, 1180, 1388, 1562, 3907, # 4358
3908, 2939, 967, 2751, 2655, 1349, 592, 2133, 1692, 3324, 2985, 1994, 4114, 1679, 3909, 1901, # 4374
2185, 7488, 739, 3642, 2715, 1296, 1290, 7489, 4115, 2198, 2199, 1921, 1563, 2595, 2551, 1870, # 4390
2752, 2986, 7490, 435, 7491, 343, 1108, 596, 17, 1751, 4365, 2235, 3430, 3643, 7492, 4366, # 4406
294, 3527, 2940, 1693, 477, 979, 281, 2041, 3528, 643, 2042, 3644, 2621, 2782, 2261, 1031, # 4422
2335, 2134, 2298, 3529, 4367, 367, 1249, 2552, 7493, 3530, 7494, 4368, 1283, 3325, 2004, 240, # 4438
1762, 3326, 4369, 4370, 836, 1069, 3128, 474, 7495, 2148, 2525, 268, 3531, 7496, 3188, 1521, # 4454
1284, 7497, 1658, 1546, 4116, 7498, 3532, 3533, 7499, 4117, 3327, 2684, 1685, 4118, 961, 1673, # 4470
2622, 190, 2005, 2200, 3762, 4371, 4372, 7500, 570, 2497, 3645, 1490, 7501, 4373, 2623, 3260, # 4486
1956, 4374, 584, 1514, 396, 1045, 1944, 7502, 4375, 1967, 2444, 7503, 7504, 4376, 3910, 619, # 4502
7505, 3129, 3261, 215, 2006, 2783, 2553, 3189, 4377, 3190, 4378, 763, 4119, 3763, 4379, 7506, # 4518
7507, 1957, 1767, 2941, 3328, 3646, 1174, 452, 1477, 4380, 3329, 3130, 7508, 2825, 1253, 2382, # 4534
2186, 1091, 2285, 4120, 492, 7509, 638, 1169, 1824, 2135, 1752, 3911, 648, 926, 1021, 1324, # 4550
4381, 520, 4382, 997, 847, 1007, 892, 4383, 3764, 2262, 1871, 3647, 7510, 2400, 1784, 4384, # 4566
1952, 2942, 3080, 3191, 1728, 4121, 2043, 3648, 4385, 2007, 1701, 3131, 1551, 30, 2263, 4122, # 4582
7511, 2026, 4386, 3534, 7512, 501, 7513, 4123, 594, 3431, 2165, 1821, 3535, 3432, 3536, 3192, # 4598
829, 2826, 4124, 7514, 1680, 3132, 1225, 4125, 7515, 3262, 4387, 4126, 3133, 2336, 7516, 4388, # 4614
4127, 7517, 3912, 3913, 7518, 1847, 2383, 2596, 3330, 7519, 4389, 374, 3914, 652, 4128, 4129, # 4630
375, 1140, 798, 7520, 7521, 7522, 2361, 4390, 2264, 546, 1659, 138, 3031, 2445, 4391, 7523, # 4646
2250, 612, 1848, 910, 796, 3765, 1740, 1371, 825, 3766, 3767, 7524, 2906, 2554, 7525, 692, # 4662
444, 3032, 2624, 801, 4392, 4130, 7526, 1491, 244, 1053, 3033, 4131, 4132, 340, 7527, 3915, # 4678
1041, 2987, 293, 1168, 87, 1357, 7528, 1539, 959, 7529, 2236, 721, 694, 4133, 3768, 219, # 4694
1478, 644, 1417, 3331, 2656, 1413, 1401, 1335, 1389, 3916, 7530, 7531, 2988, 2362, 3134, 1825, # 4710
730, 1515, 184, 2827, 66, 4393, 7532, 1660, 2943, 246, 3332, 378, 1457, 226, 3433, 975, # 4726
3917, 2944, 1264, 3537, 674, 696, 7533, 163, 7534, 1141, 2417, 2166, 713, 3538, 3333, 4394, # 4742
3918, 7535, 7536, 1186, 15, 7537, 1079, 1070, 7538, 1522, 3193, 3539, 276, 1050, 2716, 758, # 4758
1126, 653, 2945, 3263, 7539, 2337, 889, 3540, 3919, 3081, 2989, 903, 1250, 4395, 3920, 3434, # 4774
3541, 1342, 1681, 1718, 766, 3264, 286, 89, 2946, 3649, 7540, 1713, 7541, 2597, 3334, 2990, # 4790
7542, 2947, 2215, 3194, 2866, 7543, 4396, 2498, 2526, 181, 387, 1075, 3921, 731, 2187, 3335, # 4806
7544, 3265, 310, 313, 3435, 2299, 770, 4134, 54, 3034, 189, 4397, 3082, 3769, 3922, 7545, # 4822
1230, 1617, 1849, 355, 3542, 4135, 4398, 3336, 111, 4136, 3650, 1350, 3135, 3436, 3035, 4137, # 4838
2149, 3266, 3543, 7546, 2784, 3923, 3924, 2991, 722, 2008, 7547, 1071, 247, 1207, 2338, 2471, # 4854
1378, 4399, 2009, 864, 1437, 1214, 4400, 373, 3770, 1142, 2216, 667, 4401, 442, 2753, 2555, # 4870
3771, 3925, 1968, 4138, 3267, 1839, 837, 170, 1107, 934, 1336, 1882, 7548, 7549, 2118, 4139, # 4886
2828, 743, 1569, 7550, 4402, 4140, 582, 2384, 1418, 3437, 7551, 1802, 7552, 357, 1395, 1729, # 4902
3651, 3268, 2418, 1564, 2237, 7553, 3083, 3772, 1633, 4403, 1114, 2085, 4141, 1532, 7554, 482, # 4918
2446, 4404, 7555, 7556, 1492, 833, 1466, 7557, 2717, 3544, 1641, 2829, 7558, 1526, 1272, 3652, # 4934
4142, 1686, 1794, 416, 2556, 1902, 1953, 1803, 7559, 3773, 2785, 3774, 1159, 2316, 7560, 2867, # 4950
4405, 1610, 1584, 3036, 2419, 2754, 443, 3269, 1163, 3136, 7561, 7562, 3926, 7563, 4143, 2499, # 4966
3037, 4406, 3927, 3137, 2103, 1647, 3545, 2010, 1872, 4144, 7564, 4145, 431, 3438, 7565, 250, # 4982
97, 81, 4146, 7566, 1648, 1850, 1558, 160, 848, 7567, 866, 740, 1694, 7568, 2201, 2830, # 4998
3195, 4147, 4407, 3653, 1687, 950, 2472, 426, 469, 3196, 3654, 3655, 3928, 7569, 7570, 1188, # 5014
424, 1995, 861, 3546, 4148, 3775, 2202, 2685, 168, 1235, 3547, 4149, 7571, 2086, 1674, 4408, # 5030
3337, 3270, 220, 2557, 1009, 7572, 3776, 670, 2992, 332, 1208, 717, 7573, 7574, 3548, 2447, # 5046
3929, 3338, 7575, 513, 7576, 1209, 2868, 3339, 3138, 4409, 1080, 7577, 7578, 7579, 7580, 2527, # 5062
3656, 3549, 815, 1587, 3930, 3931, 7581, 3550, 3439, 3777, 1254, 4410, 1328, 3038, 1390, 3932, # 5078
1741, 3933, 3778, 3934, 7582, 236, 3779, 2448, 3271, 7583, 7584, 3657, 3780, 1273, 3781, 4411, # 5094
7585, 308, 7586, 4412, 245, 4413, 1851, 2473, 1307, 2575, 430, 715, 2136, 2449, 7587, 270, # 5110
199, 2869, 3935, 7588, 3551, 2718, 1753, 761, 1754, 725, 1661, 1840, 4414, 3440, 3658, 7589, # 5126
7590, 587, 14, 3272, 227, 2598, 326, 480, 2265, 943, 2755, 3552, 291, 650, 1883, 7591, # 5142
1702, 1226, 102, 1547, 62, 3441, 904, 4415, 3442, 1164, 4150, 7592, 7593, 1224, 1548, 2756, # 5158
391, 498, 1493, 7594, 1386, 1419, 7595, 2055, 1177, 4416, 813, 880, 1081, 2363, 566, 1145, # 5174
4417, 2286, 1001, 1035, 2558, 2599, 2238, 394, 1286, 7596, 7597, 2068, 7598, 86, 1494, 1730, # 5190
3936, 491, 1588, 745, 897, 2948, 843, 3340, 3937, 2757, 2870, 3273, 1768, 998, 2217, 2069, # 5206
397, 1826, 1195, 1969, 3659, 2993, 3341, 284, 7599, 3782, 2500, 2137, 2119, 1903, 7600, 3938, # 5222
2150, 3939, 4151, 1036, 3443, 1904, 114, 2559, 4152, 209, 1527, 7601, 7602, 2949, 2831, 2625, # 5238
2385, 2719, 3139, 812, 2560, 7603, 3274, 7604, 1559, 737, 1884, 3660, 1210, 885, 28, 2686, # 5254
3553, 3783, 7605, 4153, 1004, 1779, 4418, 7606, 346, 1981, 2218, 2687, 4419, 3784, 1742, 797, # 5270
1642, 3940, 1933, 1072, 1384, 2151, 896, 3941, 3275, 3661, 3197, 2871, 3554, 7607, 2561, 1958, # 5286
4420, 2450, 1785, 7608, 7609, 7610, 3942, 4154, 1005, 1308, 3662, 4155, 2720, 4421, 4422, 1528, # 5302
2600, 161, 1178, 4156, 1982, 987, 4423, 1101, 4157, 631, 3943, 1157, 3198, 2420, 1343, 1241, # 5318
1016, 2239, 2562, 372, 877, 2339, 2501, 1160, 555, 1934, 911, 3944, 7611, 466, 1170, 169, # 5334
1051, 2907, 2688, 3663, 2474, 2994, 1182, 2011, 2563, 1251, 2626, 7612, 992, 2340, 3444, 1540, # 5350
2721, 1201, 2070, 2401, 1996, 2475, 7613, 4424, 528, 1922, 2188, 1503, 1873, 1570, 2364, 3342, # 5366
3276, 7614, 557, 1073, 7615, 1827, 3445, 2087, 2266, 3140, 3039, 3084, 767, 3085, 2786, 4425, # 5382
1006, 4158, 4426, 2341, 1267, 2176, 3664, 3199, 778, 3945, 3200, 2722, 1597, 2657, 7616, 4427, # 5398
7617, 3446, 7618, 7619, 7620, 3277, 2689, 1433, 3278, 131, 95, 1504, 3946, 723, 4159, 3141, # 5414
1841, 3555, 2758, 2189, 3947, 2027, 2104, 3665, 7621, 2995, 3948, 1218, 7622, 3343, 3201, 3949, # 5430
4160, 2576, 248, 1634, 3785, 912, 7623, 2832, 3666, 3040, 3786, 654, 53, 7624, 2996, 7625, # 5446
1688, 4428, 777, 3447, 1032, 3950, 1425, 7626, 191, 820, 2120, 2833, 971, 4429, 931, 3202, # 5462
135, 664, 783, 3787, 1997, 772, 2908, 1935, 3951, 3788, 4430, 2909, 3203, 282, 2723, 640, # 5478
1372, 3448, 1127, 922, 325, 3344, 7627, 7628, 711, 2044, 7629, 7630, 3952, 2219, 2787, 1936, # 5494
3953, 3345, 2220, 2251, 3789, 2300, 7631, 4431, 3790, 1258, 3279, 3954, 3204, 2138, 2950, 3955, # 5510
3956, 7632, 2221, 258, 3205, 4432, 101, 1227, 7633, 3280, 1755, 7634, 1391, 3281, 7635, 2910, # 5526
2056, 893, 7636, 7637, 7638, 1402, 4161, 2342, 7639, 7640, 3206, 3556, 7641, 7642, 878, 1325, # 5542
1780, 2788, 4433, 259, 1385, 2577, 744, 1183, 2267, 4434, 7643, 3957, 2502, 7644, 684, 1024, # 5558
4162, 7645, 472, 3557, 3449, 1165, 3282, 3958, 3959, 322, 2152, 881, 455, 1695, 1152, 1340, # 5574
660, 554, 2153, 4435, 1058, 4436, 4163, 830, 1065, 3346, 3960, 4437, 1923, 7646, 1703, 1918, # 5590
7647, 932, 2268, 122, 7648, 4438, 947, 677, 7649, 3791, 2627, 297, 1905, 1924, 2269, 4439, # 5606
2317, 3283, 7650, 7651, 4164, 7652, 4165, 84, 4166, 112, 989, 7653, 547, 1059, 3961, 701, # 5622
3558, 1019, 7654, 4167, 7655, 3450, 942, 639, 457, 2301, 2451, 993, 2951, 407, 851, 494, # 5638
4440, 3347, 927, 7656, 1237, 7657, 2421, 3348, 573, 4168, 680, 921, 2911, 1279, 1874, 285, # 5654
790, 1448, 1983, 719, 2167, 7658, 7659, 4441, 3962, 3963, 1649, 7660, 1541, 563, 7661, 1077, # 5670
7662, 3349, 3041, 3451, 511, 2997, 3964, 3965, 3667, 3966, 1268, 2564, 3350, 3207, 4442, 4443, # 5686
7663, 535, 1048, 1276, 1189, 2912, 2028, 3142, 1438, 1373, 2834, 2952, 1134, 2012, 7664, 4169, # 5702
1238, 2578, 3086, 1259, 7665, 700, 7666, 2953, 3143, 3668, 4170, 7667, 4171, 1146, 1875, 1906, # 5718
4444, 2601, 3967, 781, 2422, 132, 1589, 203, 147, 273, 2789, 2402, 898, 1786, 2154, 3968, # 5734
3969, 7668, 3792, 2790, 7669, 7670, 4445, 4446, 7671, 3208, 7672, 1635, 3793, 965, 7673, 1804, # 5750
2690, 1516, 3559, 1121, 1082, 1329, 3284, 3970, 1449, 3794, 65, 1128, 2835, 2913, 2759, 1590, # 5766
3795, 7674, 7675, 12, 2658, 45, 976, 2579, 3144, 4447, 517, 2528, 1013, 1037, 3209, 7676, # 5782
3796, 2836, 7677, 3797, 7678, 3452, 7679, 2602, 614, 1998, 2318, 3798, 3087, 2724, 2628, 7680, # 5798
2580, 4172, 599, 1269, 7681, 1810, 3669, 7682, 2691, 3088, 759, 1060, 489, 1805, 3351, 3285, # 5814
1358, 7683, 7684, 2386, 1387, 1215, 2629, 2252, 490, 7685, 7686, 4173, 1759, 2387, 2343, 7687, # 5830
4448, 3799, 1907, 3971, 2630, 1806, 3210, 4449, 3453, 3286, 2760, 2344, 874, 7688, 7689, 3454, # 5846
3670, 1858, 91, 2914, 3671, 3042, 3800, 4450, 7690, 3145, 3972, 2659, 7691, 3455, 1202, 1403, # 5862
3801, 2954, 2529, 1517, 2503, 4451, 3456, 2504, 7692, 4452, 7693, 2692, 1885, 1495, 1731, 3973, # 5878
2365, 4453, 7694, 2029, 7695, 7696, 3974, 2693, 1216, 237, 2581, 4174, 2319, 3975, 3802, 4454, # 5894
4455, 2694, 3560, 3457, 445, 4456, 7697, 7698, 7699, 7700, 2761, 61, 3976, 3672, 1822, 3977, # 5910
7701, 687, 2045, 935, 925, 405, 2660, 703, 1096, 1859, 2725, 4457, 3978, 1876, 1367, 2695, # 5926
3352, 918, 2105, 1781, 2476, 334, 3287, 1611, 1093, 4458, 564, 3146, 3458, 3673, 3353, 945, # 5942
2631, 2057, 4459, 7702, 1925, 872, 4175, 7703, 3459, 2696, 3089, 349, 4176, 3674, 3979, 4460, # 5958
3803, 4177, 3675, 2155, 3980, 4461, 4462, 4178, 4463, 2403, 2046, 782, 3981, 400, 251, 4179, # 5974
1624, 7704, 7705, 277, 3676, 299, 1265, 476, 1191, 3804, 2121, 4180, 4181, 1109, 205, 7706, # 5990
2582, 1000, 2156, 3561, 1860, 7707, 7708, 7709, 4464, 7710, 4465, 2565, 107, 2477, 2157, 3982, # 6006
3460, 3147, 7711, 1533, 541, 1301, 158, 753, 4182, 2872, 3562, 7712, 1696, 370, 1088, 4183, # 6022
4466, 3563, 579, 327, 440, 162, 2240, 269, 1937, 1374, 3461, 968, 3043, 56, 1396, 3090, # 6038
2106, 3288, 3354, 7713, 1926, 2158, 4467, 2998, 7714, 3564, 7715, 7716, 3677, 4468, 2478, 7717, # 6054
2791, 7718, 1650, 4469, 7719, 2603, 7720, 7721, 3983, 2661, 3355, 1149, 3356, 3984, 3805, 3985, # 6070
7722, 1076, 49, 7723, 951, 3211, 3289, 3290, 450, 2837, 920, 7724, 1811, 2792, 2366, 4184, # 6086
1908, 1138, 2367, 3806, 3462, 7725, 3212, 4470, 1909, 1147, 1518, 2423, 4471, 3807, 7726, 4472, # 6102
2388, 2604, 260, 1795, 3213, 7727, 7728, 3808, 3291, 708, 7729, 3565, 1704, 7730, 3566, 1351, # 6118
1618, 3357, 2999, 1886, 944, 4185, 3358, 4186, 3044, 3359, 4187, 7731, 3678, 422, 413, 1714, # 6134
3292, 500, 2058, 2345, 4188, 2479, 7732, 1344, 1910, 954, 7733, 1668, 7734, 7735, 3986, 2404, # 6150
4189, 3567, 3809, 4190, 7736, 2302, 1318, 2505, 3091, 133, 3092, 2873, 4473, 629, 31, 2838, # 6166
2697, 3810, 4474, 850, 949, 4475, 3987, 2955, 1732, 2088, 4191, 1496, 1852, 7737, 3988, 620, # 6182
3214, 981, 1242, 3679, 3360, 1619, 3680, 1643, 3293, 2139, 2452, 1970, 1719, 3463, 2168, 7738, # 6198
3215, 7739, 7740, 3361, 1828, 7741, 1277, 4476, 1565, 2047, 7742, 1636, 3568, 3093, 7743, 869, # 6214
2839, 655, 3811, 3812, 3094, 3989, 3000, 3813, 1310, 3569, 4477, 7744, 7745, 7746, 1733, 558, # 6230
4478, 3681, 335, 1549, 3045, 1756, 4192, 3682, 1945, 3464, 1829, 1291, 1192, 470, 2726, 2107, # 6246
2793, 913, 1054, 3990, 7747, 1027, 7748, 3046, 3991, 4479, 982, 2662, 3362, 3148, 3465, 3216, # 6262
3217, 1946, 2794, 7749, 571, 4480, 7750, 1830, 7751, 3570, 2583, 1523, 2424, 7752, 2089, 984, # 6278
4481, 3683, 1959, 7753, 3684, 852, 923, 2795, 3466, 3685, 969, 1519, 999, 2048, 2320, 1705, # 6294
7754, 3095, 615, 1662, 151, 597, 3992, 2405, 2321, 1049, 275, 4482, 3686, 4193, 568, 3687, # 6310
3571, 2480, 4194, 3688, 7755, 2425, 2270, 409, 3218, 7756, 1566, 2874, 3467, 1002, 769, 2840, # 6326
194, 2090, 3149, 3689, 2222, 3294, 4195, 628, 1505, 7757, 7758, 1763, 2177, 3001, 3993, 521, # 6342
1161, 2584, 1787, 2203, 2406, 4483, 3994, 1625, 4196, 4197, 412, 42, 3096, 464, 7759, 2632, # 6358
4484, 3363, 1760, 1571, 2875, 3468, 2530, 1219, 2204, 3814, 2633, 2140, 2368, 4485, 4486, 3295, # 6374
1651, 3364, 3572, 7760, 7761, 3573, 2481, 3469, 7762, 3690, 7763, 7764, 2271, 2091, 460, 7765, # 6390
4487, 7766, 3002, 962, 588, 3574, 289, 3219, 2634, 1116, 52, 7767, 3047, 1796, 7768, 7769, # 6406
7770, 1467, 7771, 1598, 1143, 3691, 4198, 1984, 1734, 1067, 4488, 1280, 3365, 465, 4489, 1572, # 6422
510, 7772, 1927, 2241, 1812, 1644, 3575, 7773, 4490, 3692, 7774, 7775, 2663, 1573, 1534, 7776, # 6438
7777, 4199, 536, 1807, 1761, 3470, 3815, 3150, 2635, 7778, 7779, 7780, 4491, 3471, 2915, 1911, # 6454
2796, 7781, 3296, 1122, 377, 3220, 7782, 360, 7783, 7784, 4200, 1529, 551, 7785, 2059, 3693, # 6470
1769, 2426, 7786, 2916, 4201, 3297, 3097, 2322, 2108, 2030, 4492, 1404, 136, 1468, 1479, 672, # 6486
1171, 3221, 2303, 271, 3151, 7787, 2762, 7788, 2049, 678, 2727, 865, 1947, 4493, 7789, 2013, # 6502
3995, 2956, 7790, 2728, 2223, 1397, 3048, 3694, 4494, 4495, 1735, 2917, 3366, 3576, 7791, 3816, # 6518
509, 2841, 2453, 2876, 3817, 7792, 7793, 3152, 3153, 4496, 4202, 2531, 4497, 2304, 1166, 1010, # 6534
552, 681, 1887, 7794, 7795, 2957, 2958, 3996, 1287, 1596, 1861, 3154, 358, 453, 736, 175, # 6550
478, 1117, 905, 1167, 1097, 7796, 1853, 1530, 7797, 1706, 7798, 2178, 3472, 2287, 3695, 3473, # 6566
3577, 4203, 2092, 4204, 7799, 3367, 1193, 2482, 4205, 1458, 2190, 2205, 1862, 1888, 1421, 3298, # 6582
2918, 3049, 2179, 3474, 595, 2122, 7800, 3997, 7801, 7802, 4206, 1707, 2636, 223, 3696, 1359, # 6598
751, 3098, 183, 3475, 7803, 2797, 3003, 419, 2369, 633, 704, 3818, 2389, 241, 7804, 7805, # 6614
7806, 838, 3004, 3697, 2272, 2763, 2454, 3819, 1938, 2050, 3998, 1309, 3099, 2242, 1181, 7807, # 6630
1136, 2206, 3820, 2370, 1446, 4207, 2305, 4498, 7808, 7809, 4208, 1055, 2605, 484, 3698, 7810, # 6646
3999, 625, 4209, 2273, 3368, 1499, 4210, 4000, 7811, 4001, 4211, 3222, 2274, 2275, 3476, 7812, # 6662
7813, 2764, 808, 2606, 3699, 3369, 4002, 4212, 3100, 2532, 526, 3370, 3821, 4213, 955, 7814, # 6678
1620, 4214, 2637, 2427, 7815, 1429, 3700, 1669, 1831, 994, 928, 7816, 3578, 1260, 7817, 7818, # 6694
7819, 1948, 2288, 741, 2919, 1626, 4215, 2729, 2455, 867, 1184, 362, 3371, 1392, 7820, 7821, # 6710
4003, 4216, 1770, 1736, 3223, 2920, 4499, 4500, 1928, 2698, 1459, 1158, 7822, 3050, 3372, 2877, # 6726
1292, 1929, 2506, 2842, 3701, 1985, 1187, 2071, 2014, 2607, 4217, 7823, 2566, 2507, 2169, 3702, # 6742
2483, 3299, 7824, 3703, 4501, 7825, 7826, 666, 1003, 3005, 1022, 3579, 4218, 7827, 4502, 1813, # 6758
2253, 574, 3822, 1603, 295, 1535, 705, 3823, 4219, 283, 858, 417, 7828, 7829, 3224, 4503, # 6774
4504, 3051, 1220, 1889, 1046, 2276, 2456, 4004, 1393, 1599, 689, 2567, 388, 4220, 7830, 2484, # 6790
802, 7831, 2798, 3824, 2060, 1405, 2254, 7832, 4505, 3825, 2109, 1052, 1345, 3225, 1585, 7833, # 6806
809, 7834, 7835, 7836, 575, 2730, 3477, 956, 1552, 1469, 1144, 2323, 7837, 2324, 1560, 2457, # 6822
3580, 3226, 4005, 616, 2207, 3155, 2180, 2289, 7838, 1832, 7839, 3478, 4506, 7840, 1319, 3704, # 6838
3705, 1211, 3581, 1023, 3227, 1293, 2799, 7841, 7842, 7843, 3826, 607, 2306, 3827, 762, 2878, # 6854
1439, 4221, 1360, 7844, 1485, 3052, 7845, 4507, 1038, 4222, 1450, 2061, 2638, 4223, 1379, 4508, # 6870
2585, 7846, 7847, 4224, 1352, 1414, 2325, 2921, 1172, 7848, 7849, 3828, 3829, 7850, 1797, 1451, # 6886
7851, 7852, 7853, 7854, 2922, 4006, 4007, 2485, 2346, 411, 4008, 4009, 3582, 3300, 3101, 4509, # 6902
1561, 2664, 1452, 4010, 1375, 7855, 7856, 47, 2959, 316, 7857, 1406, 1591, 2923, 3156, 7858, # 6918
1025, 2141, 3102, 3157, 354, 2731, 884, 2224, 4225, 2407, 508, 3706, 726, 3583, 996, 2428, # 6934
3584, 729, 7859, 392, 2191, 1453, 4011, 4510, 3707, 7860, 7861, 2458, 3585, 2608, 1675, 2800, # 6950
919, 2347, 2960, 2348, 1270, 4511, 4012, 73, 7862, 7863, 647, 7864, 3228, 2843, 2255, 1550, # 6966
1346, 3006, 7865, 1332, 883, 3479, 7866, 7867, 7868, 7869, 3301, 2765, 7870, 1212, 831, 1347, # 6982
4226, 4512, 2326, 3830, 1863, 3053, 720, 3831, 4513, 4514, 3832, 7871, 4227, 7872, 7873, 4515, # 6998
7874, 7875, 1798, 4516, 3708, 2609, 4517, 3586, 1645, 2371, 7876, 7877, 2924, 669, 2208, 2665, # 7014
2429, 7878, 2879, 7879, 7880, 1028, 3229, 7881, 4228, 2408, 7882, 2256, 1353, 7883, 7884, 4518, # 7030
3158, 518, 7885, 4013, 7886, 4229, 1960, 7887, 2142, 4230, 7888, 7889, 3007, 2349, 2350, 3833, # 7046
516, 1833, 1454, 4014, 2699, 4231, 4519, 2225, 2610, 1971, 1129, 3587, 7890, 2766, 7891, 2961, # 7062
1422, 577, 1470, 3008, 1524, 3373, 7892, 7893, 432, 4232, 3054, 3480, 7894, 2586, 1455, 2508, # 7078
2226, 1972, 1175, 7895, 1020, 2732, 4015, 3481, 4520, 7896, 2733, 7897, 1743, 1361, 3055, 3482, # 7094
2639, 4016, 4233, 4521, 2290, 895, 924, 4234, 2170, 331, 2243, 3056, 166, 1627, 3057, 1098, # 7110
7898, 1232, 2880, 2227, 3374, 4522, 657, 403, 1196, 2372, 542, 3709, 3375, 1600, 4235, 3483, # 7126
7899, 4523, 2767, 3230, 576, 530, 1362, 7900, 4524, 2533, 2666, 3710, 4017, 7901, 842, 3834, # 7142
7902, 2801, 2031, 1014, 4018, 213, 2700, 3376, 665, 621, 4236, 7903, 3711, 2925, 2430, 7904, # 7158
2431, 3302, 3588, 3377, 7905, 4237, 2534, 4238, 4525, 3589, 1682, 4239, 3484, 1380, 7906, 724, # 7174
2277, 600, 1670, 7907, 1337, 1233, 4526, 3103, 2244, 7908, 1621, 4527, 7909, 651, 4240, 7910, # 7190
1612, 4241, 2611, 7911, 2844, 7912, 2734, 2307, 3058, 7913, 716, 2459, 3059, 174, 1255, 2701, # 7206
4019, 3590, 548, 1320, 1398, 728, 4020, 1574, 7914, 1890, 1197, 3060, 4021, 7915, 3061, 3062, # 7222
3712, 3591, 3713, 747, 7916, 635, 4242, 4528, 7917, 7918, 7919, 4243, 7920, 7921, 4529, 7922, # 7238
3378, 4530, 2432, 451, 7923, 3714, 2535, 2072, 4244, 2735, 4245, 4022, 7924, 1764, 4531, 7925, # 7254
4246, 350, 7926, 2278, 2390, 2486, 7927, 4247, 4023, 2245, 1434, 4024, 488, 4532, 458, 4248, # 7270
4025, 3715, 771, 1330, 2391, 3835, 2568, 3159, 2159, 2409, 1553, 2667, 3160, 4249, 7928, 2487, # 7286
2881, 2612, 1720, 2702, 4250, 3379, 4533, 7929, 2536, 4251, 7930, 3231, 4252, 2768, 7931, 2015, # 7302
2736, 7932, 1155, 1017, 3716, 3836, 7933, 3303, 2308, 201, 1864, 4253, 1430, 7934, 4026, 7935, # 7318
7936, 7937, 7938, 7939, 4254, 1604, 7940, 414, 1865, 371, 2587, 4534, 4535, 3485, 2016, 3104, # 7334
4536, 1708, 960, 4255, 887, 389, 2171, 1536, 1663, 1721, 7941, 2228, 4027, 2351, 2926, 1580, # 7350
7942, 7943, 7944, 1744, 7945, 2537, 4537, 4538, 7946, 4539, 7947, 2073, 7948, 7949, 3592, 3380, # 7366
2882, 4256, 7950, 4257, 2640, 3381, 2802, 673, 2703, 2460, 709, 3486, 4028, 3593, 4258, 7951, # 7382
1148, 502, 634, 7952, 7953, 1204, 4540, 3594, 1575, 4541, 2613, 3717, 7954, 3718, 3105, 948, # 7398
3232, 121, 1745, 3837, 1110, 7955, 4259, 3063, 2509, 3009, 4029, 3719, 1151, 1771, 3838, 1488, # 7414
4030, 1986, 7956, 2433, 3487, 7957, 7958, 2093, 7959, 4260, 3839, 1213, 1407, 2803, 531, 2737, # 7430
2538, 3233, 1011, 1537, 7960, 2769, 4261, 3106, 1061, 7961, 3720, 3721, 1866, 2883, 7962, 2017, # 7446
120, 4262, 4263, 2062, 3595, 3234, 2309, 3840, 2668, 3382, 1954, 4542, 7963, 7964, 3488, 1047, # 7462
2704, 1266, 7965, 1368, 4543, 2845, 649, 3383, 3841, 2539, 2738, 1102, 2846, 2669, 7966, 7967, # 7478
1999, 7968, 1111, 3596, 2962, 7969, 2488, 3842, 3597, 2804, 1854, 3384, 3722, 7970, 7971, 3385, # 7494
2410, 2884, 3304, 3235, 3598, 7972, 2569, 7973, 3599, 2805, 4031, 1460, 856, 7974, 3600, 7975, # 7510
2885, 2963, 7976, 2886, 3843, 7977, 4264, 632, 2510, 875, 3844, 1697, 3845, 2291, 7978, 7979, # 7526
4544, 3010, 1239, 580, 4545, 4265, 7980, 914, 936, 2074, 1190, 4032, 1039, 2123, 7981, 7982, # 7542
7983, 3386, 1473, 7984, 1354, 4266, 3846, 7985, 2172, 3064, 4033, 915, 3305, 4267, 4268, 3306, # 7558
1605, 1834, 7986, 2739, 398, 3601, 4269, 3847, 4034, 328, 1912, 2847, 4035, 3848, 1331, 4270, # 7574
3011, 937, 4271, 7987, 3602, 4036, 4037, 3387, 2160, 4546, 3388, 524, 742, 538, 3065, 1012, # 7590
7988, 7989, 3849, 2461, 7990, 658, 1103, 225, 3850, 7991, 7992, 4547, 7993, 4548, 7994, 3236, # 7606
1243, 7995, 4038, 963, 2246, 4549, 7996, 2705, 3603, 3161, 7997, 7998, 2588, 2327, 7999, 4550, # 7622
8000, 8001, 8002, 3489, 3307, 957, 3389, 2540, 2032, 1930, 2927, 2462, 870, 2018, 3604, 1746, # 7638
2770, 2771, 2434, 2463, 8003, 3851, 8004, 3723, 3107, 3724, 3490, 3390, 3725, 8005, 1179, 3066, # 7654
8006, 3162, 2373, 4272, 3726, 2541, 3163, 3108, 2740, 4039, 8007, 3391, 1556, 2542, 2292, 977, # 7670
2887, 2033, 4040, 1205, 3392, 8008, 1765, 3393, 3164, 2124, 1271, 1689, 714, 4551, 3491, 8009, # 7686
2328, 3852, 533, 4273, 3605, 2181, 617, 8010, 2464, 3308, 3492, 2310, 8011, 8012, 3165, 8013, # 7702
8014, 3853, 1987, 618, 427, 2641, 3493, 3394, 8015, 8016, 1244, 1690, 8017, 2806, 4274, 4552, # 7718
8018, 3494, 8019, 8020, 2279, 1576, 473, 3606, 4275, 3395, 972, 8021, 3607, 8022, 3067, 8023, # 7734
8024, 4553, 4554, 8025, 3727, 4041, 4042, 8026, 153, 4555, 356, 8027, 1891, 2888, 4276, 2143, # 7750
408, 803, 2352, 8028, 3854, 8029, 4277, 1646, 2570, 2511, 4556, 4557, 3855, 8030, 3856, 4278, # 7766
8031, 2411, 3396, 752, 8032, 8033, 1961, 2964, 8034, 746, 3012, 2465, 8035, 4279, 3728, 698, # 7782
4558, 1892, 4280, 3608, 2543, 4559, 3609, 3857, 8036, 3166, 3397, 8037, 1823, 1302, 4043, 2706, # 7798
3858, 1973, 4281, 8038, 4282, 3167, 823, 1303, 1288, 1236, 2848, 3495, 4044, 3398, 774, 3859, # 7814
8039, 1581, 4560, 1304, 2849, 3860, 4561, 8040, 2435, 2161, 1083, 3237, 4283, 4045, 4284, 344, # 7830
1173, 288, 2311, 454, 1683, 8041, 8042, 1461, 4562, 4046, 2589, 8043, 8044, 4563, 985, 894, # 7846
8045, 3399, 3168, 8046, 1913, 2928, 3729, 1988, 8047, 2110, 1974, 8048, 4047, 8049, 2571, 1194, # 7862
425, 8050, 4564, 3169, 1245, 3730, 4285, 8051, 8052, 2850, 8053, 636, 4565, 1855, 3861, 760, # 7878
1799, 8054, 4286, 2209, 1508, 4566, 4048, 1893, 1684, 2293, 8055, 8056, 8057, 4287, 4288, 2210, # 7894
479, 8058, 8059, 832, 8060, 4049, 2489, 8061, 2965, 2490, 3731, 990, 3109, 627, 1814, 2642, # 7910
4289, 1582, 4290, 2125, 2111, 3496, 4567, 8062, 799, 4291, 3170, 8063, 4568, 2112, 1737, 3013, # 7926
1018, 543, 754, 4292, 3309, 1676, 4569, 4570, 4050, 8064, 1489, 8065, 3497, 8066, 2614, 2889, # 7942
4051, 8067, 8068, 2966, 8069, 8070, 8071, 8072, 3171, 4571, 4572, 2182, 1722, 8073, 3238, 3239, # 7958
1842, 3610, 1715, 481, 365, 1975, 1856, 8074, 8075, 1962, 2491, 4573, 8076, 2126, 3611, 3240, # 7974
433, 1894, 2063, 2075, 8077, 602, 2741, 8078, 8079, 8080, 8081, 8082, 3014, 1628, 3400, 8083, # 7990
3172, 4574, 4052, 2890, 4575, 2512, 8084, 2544, 2772, 8085, 8086, 8087, 3310, 4576, 2891, 8088, # 8006
4577, 8089, 2851, 4578, 4579, 1221, 2967, 4053, 2513, 8090, 8091, 8092, 1867, 1989, 8093, 8094, # 8022
8095, 1895, 8096, 8097, 4580, 1896, 4054, 318, 8098, 2094, 4055, 4293, 8099, 8100, 485, 8101, # 8038
938, 3862, 553, 2670, 116, 8102, 3863, 3612, 8103, 3498, 2671, 2773, 3401, 3311, 2807, 8104, # 8054
3613, 2929, 4056, 1747, 2930, 2968, 8105, 8106, 207, 8107, 8108, 2672, 4581, 2514, 8109, 3015, # 8070
890, 3614, 3864, 8110, 1877, 3732, 3402, 8111, 2183, 2353, 3403, 1652, 8112, 8113, 8114, 941, # 8086
2294, 208, 3499, 4057, 2019, 330, 4294, 3865, 2892, 2492, 3733, 4295, 8115, 8116, 8117, 8118, # 8102
)
# fmt: on

View file

@ -25,22 +25,23 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCTWDistributionAnalysis
from .codingstatemachine import CodingStateMachine
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import EUCTW_SM_MODEL
class EUCTWProber(MultiByteCharSetProber):
def __init__(self):
super(EUCTWProber, self).__init__()
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
self.distribution_analyzer = EUCTWDistributionAnalysis()
self.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "EUC-TW"
@property
def language(self):
def language(self) -> str:
return "Taiwan"

View file

@ -43,6 +43,7 @@ GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9
GB2312_TABLE_SIZE = 3760
# fmt: off
GB2312_CHAR_TO_FREQ_ORDER = (
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
@ -280,4 +281,4 @@ GB2312_CHAR_TO_FREQ_ORDER = (
381,1638,4592,1020, 516,3214, 458, 947,4575,1432, 211,1514,2926,1865,2142, 189,
852,1221,1400,1486, 882,2299,4036, 351, 28,1122, 700,6479,6480,6481,6482,6483, #last 512
)
# fmt: on

View file

@ -25,22 +25,23 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import GB2312DistributionAnalysis
from .codingstatemachine import CodingStateMachine
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import GB2312_SM_MODEL
class GB2312Prober(MultiByteCharSetProber):
    """Multi-byte prober for the GB2312 charset.

    Pairs a coding state machine built from ``GB2312_SM_MODEL`` with a
    ``GB2312DistributionAnalysis`` character-distribution analyzer.
    """

    def __init__(self) -> None:
        """Create the state machine and distribution analyzer, then reset."""
        super().__init__()
        self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
        self.distribution_analyzer = GB2312DistributionAnalysis()
        self.reset()

    @property
    def charset_name(self) -> str:
        """Name of the charset this prober reports."""
        return "GB2312"

    @property
    def language(self) -> str:
        """Language label this prober reports."""
        return "Chinese"

View file

@ -25,8 +25,11 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import Optional, Union
from .charsetprober import CharSetProber
from .enums import ProbingState
from .sbcharsetprober import SingleByteCharSetProber
# This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers
@ -125,18 +128,20 @@ from .enums import ProbingState
# model probers scores. The answer is returned in the form of the name of the
# charset identified, either "windows-1255" or "ISO-8859-8".
class HebrewProber(CharSetProber):
SPACE = 0x20
# windows-1255 / ISO-8859-8 code points of interest
FINAL_KAF = 0xea
NORMAL_KAF = 0xeb
FINAL_MEM = 0xed
NORMAL_MEM = 0xee
FINAL_NUN = 0xef
NORMAL_NUN = 0xf0
FINAL_PE = 0xf3
NORMAL_PE = 0xf4
FINAL_TSADI = 0xf5
NORMAL_TSADI = 0xf6
FINAL_KAF = 0xEA
NORMAL_KAF = 0xEB
FINAL_MEM = 0xED
NORMAL_MEM = 0xEE
FINAL_NUN = 0xEF
NORMAL_NUN = 0xF0
FINAL_PE = 0xF3
NORMAL_PE = 0xF4
FINAL_TSADI = 0xF5
NORMAL_TSADI = 0xF6
# Minimum Visual vs Logical final letter score difference.
# If the difference is below this, don't rely solely on the final letter score
@ -151,35 +156,44 @@ class HebrewProber(CharSetProber):
VISUAL_HEBREW_NAME = "ISO-8859-8"
LOGICAL_HEBREW_NAME = "windows-1255"
def __init__(self) -> None:
    """Initialise the final-letter scores, look-behind bytes and prober slots.

    The two model probers start as ``None``; they are supplied later through
    ``set_model_probers``.
    """
    super().__init__()
    self._final_char_logical_score = 0
    self._final_char_visual_score = 0
    # Look-behind bytes are stored as ints (byte values), starting at SPACE.
    self._prev = self.SPACE
    self._before_prev = self.SPACE
    self._logical_prober: Optional[SingleByteCharSetProber] = None
    self._visual_prober: Optional[SingleByteCharSetProber] = None
    self.reset()
def reset(self) -> None:
    """Clear both final-letter scores and the two look-behind bytes."""
    self._final_char_logical_score = 0
    self._final_char_visual_score = 0
    # The last two bytes seen in the previous buffer (_prev and _before_prev)
    # are initialised to SPACE in order to simulate a word delimiter at the
    # beginning of the data. They hold int byte values, not str, so that
    # comparisons against bytes taken from a bytearray can succeed.
    self._prev = self.SPACE
    self._before_prev = self.SPACE
# These probers are owned by the group prober.
def set_model_probers(
    self,
    logical_prober: SingleByteCharSetProber,
    visual_prober: SingleByteCharSetProber,
) -> None:
    """Attach the logical and visual model probers.

    :param logical_prober: stored as ``self._logical_prober``.
    :param visual_prober: stored as ``self._visual_prober``.
    """
    self._logical_prober = logical_prober
    self._visual_prober = visual_prober
def is_final(self, c: int) -> bool:
    """Return True when byte value ``c`` is one of the five final-form letters.

    :param c: a byte value, compared against the ``FINAL_*`` class constants.
    """
    return c in (
        self.FINAL_KAF,
        self.FINAL_MEM,
        self.FINAL_NUN,
        self.FINAL_PE,
        self.FINAL_TSADI,
    )
def is_non_final(self, c):
def is_non_final(self, c: int) -> bool:
# The normal Tsadi is not a good Non-Final letter due to words like
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
# apostrophe is converted to a space in FilterWithoutEnglishLetters
@ -190,10 +204,9 @@ class HebrewProber(CharSetProber):
# for example legally end with a Non-Final Pe or Kaf. However, the
# benefit of these letters as Non-Final letters outweighs the damage
# since these words are quite rare.
return c in [self.NORMAL_KAF, self.NORMAL_MEM,
self.NORMAL_NUN, self.NORMAL_PE]
return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE]
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
# Final letter analysis for logical-visual decision.
# Look for evidence that the received buffer is either logical Hebrew
# or visual Hebrew.
@ -227,9 +240,9 @@ class HebrewProber(CharSetProber):
byte_str = self.filter_high_byte_only(byte_str)
for cur in byte_str:
if cur == ' ':
if cur == self.SPACE:
# We stand on a space - a word just ended
if self._before_prev != ' ':
if self._before_prev != self.SPACE:
# next-to-last char was not a space so self._prev is not a
# 1 letter word
if self.is_final(self._prev):
@ -241,8 +254,11 @@ class HebrewProber(CharSetProber):
self._final_char_visual_score += 1
else:
# Not standing on a space
if ((self._before_prev == ' ') and
(self.is_final(self._prev)) and (cur != ' ')):
if (
(self._before_prev == self.SPACE)
and (self.is_final(self._prev))
and (cur != self.SPACE)
):
# case (3) [-2:space][-1:final letter][cur:not space]
self._final_char_visual_score += 1
self._before_prev = self._prev
@ -253,7 +269,10 @@ class HebrewProber(CharSetProber):
return ProbingState.DETECTING
@property
def charset_name(self):
def charset_name(self) -> str:
assert self._logical_prober is not None
assert self._visual_prober is not None
# Make the decision: is it Logical or Visual?
# If the final letter score distance is dominant enough, rely on it.
finalsub = self._final_char_logical_score - self._final_char_visual_score
@ -263,8 +282,9 @@ class HebrewProber(CharSetProber):
return self.VISUAL_HEBREW_NAME
# It's not dominant enough, try to rely on the model scores instead.
modelsub = (self._logical_prober.get_confidence()
- self._visual_prober.get_confidence())
modelsub = (
self._logical_prober.get_confidence() - self._visual_prober.get_confidence()
)
if modelsub > self.MIN_MODEL_DISTANCE:
return self.LOGICAL_HEBREW_NAME
if modelsub < -self.MIN_MODEL_DISTANCE:
@ -280,13 +300,17 @@ class HebrewProber(CharSetProber):
return self.LOGICAL_HEBREW_NAME
@property
def language(self) -> str:
    """Language label this prober reports."""
    return "Hebrew"
@property
def state(self) -> ProbingState:
    """Report NOT_ME only when both model probers have reported NOT_ME.

    Any other combination yields DETECTING. Both probers must already be
    set (non-``None``) when this property is read.
    """
    assert self._logical_prober is not None
    assert self._visual_prober is not None
    # Remain active as long as any of the model probers are active.
    if (self._logical_prober.state == ProbingState.NOT_ME) and (
        self._visual_prober.state == ProbingState.NOT_ME
    ):
        return ProbingState.NOT_ME
    return ProbingState.DETECTING

View file

@ -46,6 +46,7 @@ JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
# Char to FreqOrder table
JIS_TABLE_SIZE = 4368
# fmt: off
JIS_CHAR_TO_FREQ_ORDER = (
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
@ -321,5 +322,4 @@ JIS_CHAR_TO_FREQ_ORDER = (
1444,1698,2385,2251,3729,1365,2281,2235,1717,6188, 864,3841,2515, 444, 527,2767, # 4352
2922,3625, 544, 461,6189, 566, 209,2437,3398,2098,1065,2068,3331,3626,3257,2137, # 4368 #last 512
)
# fmt: on

2382
lib/chardet/johabfreq.py Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,47 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .chardistribution import JOHABDistributionAnalysis
from .codingstatemachine import CodingStateMachine
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import JOHAB_SM_MODEL
class JOHABProber(MultiByteCharSetProber):
    """Multi-byte prober for the Johab charset.

    Pairs a coding state machine built from ``JOHAB_SM_MODEL`` with a
    ``JOHABDistributionAnalysis`` character-distribution analyzer.
    """

    def __init__(self) -> None:
        """Create the state machine and distribution analyzer, then reset."""
        super().__init__()
        self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL)
        self.distribution_analyzer = JOHABDistributionAnalysis()
        self.reset()

    @property
    def charset_name(self) -> str:
        """Name of the charset this prober reports."""
        return "Johab"

    @property
    def language(self) -> str:
        """Language label this prober reports."""
        return "Korean"

View file

@ -25,110 +25,114 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import List, Tuple, Union
# This is the hiragana 2-char sequence table; the number in each cell represents its frequency category.
jp2CharContext = (
(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1),
(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4),
(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2),
(0,4,0,5,0,5,0,4,0,4,5,4,4,3,5,3,5,1,5,3,4,3,4,4,3,4,3,3,4,3,5,4,4,3,5,5,3,5,5,5,3,5,5,3,4,5,5,3,1,3,2,0,3,4,0,4,2,0,4,2,1,5,3,2,3,5,0,4,0,2,0,5,4,4,5,4,5,0,4,0,0,4,4),
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
(0,3,0,4,0,3,0,3,0,4,5,4,3,3,3,3,4,3,5,4,4,3,5,4,4,3,4,3,4,4,4,4,5,3,4,4,3,4,5,5,4,5,5,1,4,5,4,3,0,3,3,1,3,3,0,4,4,0,3,3,1,5,3,3,3,5,0,4,0,3,0,4,4,3,4,3,3,0,4,1,1,3,4),
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
(0,4,0,3,0,3,0,4,0,3,4,4,3,2,2,1,2,1,3,1,3,3,3,3,3,4,3,1,3,3,5,3,3,0,4,3,0,5,4,3,3,5,4,4,3,4,4,5,0,1,2,0,1,2,0,2,2,0,1,0,0,5,2,2,1,4,0,3,0,1,0,4,4,3,5,4,3,0,2,1,0,4,3),
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
(0,3,0,5,0,4,0,2,1,4,4,2,4,1,4,2,4,2,4,3,3,3,4,3,3,3,3,1,4,2,3,3,3,1,4,4,1,1,1,4,3,3,2,0,2,4,3,2,0,3,3,0,3,1,1,0,0,0,3,3,0,4,2,2,3,4,0,4,0,3,0,4,4,5,3,4,4,0,3,0,0,1,4),
(1,4,0,4,0,4,0,4,0,3,5,4,4,3,4,3,5,4,3,3,4,3,5,4,4,4,4,3,4,2,4,3,3,1,5,4,3,2,4,5,4,5,5,4,4,5,4,4,0,3,2,2,3,3,0,4,3,1,3,2,1,4,3,3,4,5,0,3,0,2,0,4,5,5,4,5,4,0,4,0,0,5,4),
(0,5,0,5,0,4,0,3,0,4,4,3,4,3,3,3,4,0,4,4,4,3,4,3,4,3,3,1,4,2,4,3,4,0,5,4,1,4,5,4,4,5,3,2,4,3,4,3,2,4,1,3,3,3,2,3,2,0,4,3,3,4,3,3,3,4,0,4,0,3,0,4,5,4,4,4,3,0,4,1,0,1,3),
(0,3,1,4,0,3,0,2,0,3,4,4,3,1,4,2,3,3,4,3,4,3,4,3,4,4,3,2,3,1,5,4,4,1,4,4,3,5,4,4,3,5,5,4,3,4,4,3,1,2,3,1,2,2,0,3,2,0,3,1,0,5,3,3,3,4,3,3,3,3,4,4,4,4,5,4,2,0,3,3,2,4,3),
(0,2,0,3,0,1,0,1,0,0,3,2,0,0,2,0,1,0,2,1,3,3,3,1,2,3,1,0,1,0,4,2,1,1,3,3,0,4,3,3,1,4,3,3,0,3,3,2,0,0,0,0,1,0,0,2,0,0,0,0,0,4,1,0,2,3,2,2,2,1,3,3,3,4,4,3,2,0,3,1,0,3,3),
(0,4,0,4,0,3,0,3,0,4,4,4,3,3,3,3,3,3,4,3,4,2,4,3,4,3,3,2,4,3,4,5,4,1,4,5,3,5,4,5,3,5,4,0,3,5,5,3,1,3,3,2,2,3,0,3,4,1,3,3,2,4,3,3,3,4,0,4,0,3,0,4,5,4,4,5,3,0,4,1,0,3,4),
(0,2,0,3,0,3,0,0,0,2,2,2,1,0,1,0,0,0,3,0,3,0,3,0,1,3,1,0,3,1,3,3,3,1,3,3,3,0,1,3,1,3,4,0,0,3,1,1,0,3,2,0,0,0,0,1,3,0,1,0,0,3,3,2,0,3,0,0,0,0,0,3,4,3,4,3,3,0,3,0,0,2,3),
(2,3,0,3,0,2,0,1,0,3,3,4,3,1,3,1,1,1,3,1,4,3,4,3,3,3,0,0,3,1,5,4,3,1,4,3,2,5,5,4,4,4,4,3,3,4,4,4,0,2,1,1,3,2,0,1,2,0,0,1,0,4,1,3,3,3,0,3,0,1,0,4,4,4,5,5,3,0,2,0,0,4,4),
(0,2,0,1,0,3,1,3,0,2,3,3,3,0,3,1,0,0,3,0,3,2,3,1,3,2,1,1,0,0,4,2,1,0,2,3,1,4,3,2,0,4,4,3,1,3,1,3,0,1,0,0,1,0,0,0,1,0,0,0,0,4,1,1,1,2,0,3,0,0,0,3,4,2,4,3,2,0,1,0,0,3,3),
(0,1,0,4,0,5,0,4,0,2,4,4,2,3,3,2,3,3,5,3,3,3,4,3,4,2,3,0,4,3,3,3,4,1,4,3,2,1,5,5,3,4,5,1,3,5,4,2,0,3,3,0,1,3,0,4,2,0,1,3,1,4,3,3,3,3,0,3,0,1,0,3,4,4,4,5,5,0,3,0,1,4,5),
(0,2,0,3,0,3,0,0,0,2,3,1,3,0,4,0,1,1,3,0,3,4,3,2,3,1,0,3,3,2,3,1,3,0,2,3,0,2,1,4,1,2,2,0,0,3,3,0,0,2,0,0,0,1,0,0,0,0,2,2,0,3,2,1,3,3,0,2,0,2,0,0,3,3,1,2,4,0,3,0,2,2,3),
(2,4,0,5,0,4,0,4,0,2,4,4,4,3,4,3,3,3,1,2,4,3,4,3,4,4,5,0,3,3,3,3,2,0,4,3,1,4,3,4,1,4,4,3,3,4,4,3,1,2,3,0,4,2,0,4,1,0,3,3,0,4,3,3,3,4,0,4,0,2,0,3,5,3,4,5,2,0,3,0,0,4,5),
(0,3,0,4,0,1,0,1,0,1,3,2,2,1,3,0,3,0,2,0,2,0,3,0,2,0,0,0,1,0,1,1,0,0,3,1,0,0,0,4,0,3,1,0,2,1,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,2,2,3,1,0,3,0,0,0,1,4,4,4,3,0,0,4,0,0,1,4),
(1,4,1,5,0,3,0,3,0,4,5,4,4,3,5,3,3,4,4,3,4,1,3,3,3,3,2,1,4,1,5,4,3,1,4,4,3,5,4,4,3,5,4,3,3,4,4,4,0,3,3,1,2,3,0,3,1,0,3,3,0,5,4,4,4,4,4,4,3,3,5,4,4,3,3,5,4,0,3,2,0,4,4),
(0,2,0,3,0,1,0,0,0,1,3,3,3,2,4,1,3,0,3,1,3,0,2,2,1,1,0,0,2,0,4,3,1,0,4,3,0,4,4,4,1,4,3,1,1,3,3,1,0,2,0,0,1,3,0,0,0,0,2,0,0,4,3,2,4,3,5,4,3,3,3,4,3,3,4,3,3,0,2,1,0,3,3),
(0,2,0,4,0,3,0,2,0,2,5,5,3,4,4,4,4,1,4,3,3,0,4,3,4,3,1,3,3,2,4,3,0,3,4,3,0,3,4,4,2,4,4,0,4,5,3,3,2,2,1,1,1,2,0,1,5,0,3,3,2,4,3,3,3,4,0,3,0,2,0,4,4,3,5,5,0,0,3,0,2,3,3),
(0,3,0,4,0,3,0,1,0,3,4,3,3,1,3,3,3,0,3,1,3,0,4,3,3,1,1,0,3,0,3,3,0,0,4,4,0,1,5,4,3,3,5,0,3,3,4,3,0,2,0,1,1,1,0,1,3,0,1,2,1,3,3,2,3,3,0,3,0,1,0,1,3,3,4,4,1,0,1,2,2,1,3),
(0,1,0,4,0,4,0,3,0,1,3,3,3,2,3,1,1,0,3,0,3,3,4,3,2,4,2,0,1,0,4,3,2,0,4,3,0,5,3,3,2,4,4,4,3,3,3,4,0,1,3,0,0,1,0,0,1,0,0,0,0,4,2,3,3,3,0,3,0,0,0,4,4,4,5,3,2,0,3,3,0,3,5),
(0,2,0,3,0,0,0,3,0,1,3,0,2,0,0,0,1,0,3,1,1,3,3,0,0,3,0,0,3,0,2,3,1,0,3,1,0,3,3,2,0,4,2,2,0,2,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,2,1,2,0,1,0,1,0,0,0,1,3,1,2,0,0,0,1,0,0,1,4),
(0,3,0,3,0,5,0,1,0,2,4,3,1,3,3,2,1,1,5,2,1,0,5,1,2,0,0,0,3,3,2,2,3,2,4,3,0,0,3,3,1,3,3,0,2,5,3,4,0,3,3,0,1,2,0,2,2,0,3,2,0,2,2,3,3,3,0,2,0,1,0,3,4,4,2,5,4,0,3,0,0,3,5),
(0,3,0,3,0,3,0,1,0,3,3,3,3,0,3,0,2,0,2,1,1,0,2,0,1,0,0,0,2,1,0,0,1,0,3,2,0,0,3,3,1,2,3,1,0,3,3,0,0,1,0,0,0,0,0,2,0,0,0,0,0,2,3,1,2,3,0,3,0,1,0,3,2,1,0,4,3,0,1,1,0,3,3),
(0,4,0,5,0,3,0,3,0,4,5,5,4,3,5,3,4,3,5,3,3,2,5,3,4,4,4,3,4,3,4,5,5,3,4,4,3,4,4,5,4,4,4,3,4,5,5,4,2,3,4,2,3,4,0,3,3,1,4,3,2,4,3,3,5,5,0,3,0,3,0,5,5,5,5,4,4,0,4,0,1,4,4),
(0,4,0,4,0,3,0,3,0,3,5,4,4,2,3,2,5,1,3,2,5,1,4,2,3,2,3,3,4,3,3,3,3,2,5,4,1,3,3,5,3,4,4,0,4,4,3,1,1,3,1,0,2,3,0,2,3,0,3,0,0,4,3,1,3,4,0,3,0,2,0,4,4,4,3,4,5,0,4,0,0,3,4),
(0,3,0,3,0,3,1,2,0,3,4,4,3,3,3,0,2,2,4,3,3,1,3,3,3,1,1,0,3,1,4,3,2,3,4,4,2,4,4,4,3,4,4,3,2,4,4,3,1,3,3,1,3,3,0,4,1,0,2,2,1,4,3,2,3,3,5,4,3,3,5,4,4,3,3,0,4,0,3,2,2,4,4),
(0,2,0,1,0,0,0,0,0,1,2,1,3,0,0,0,0,0,2,0,1,2,1,0,0,1,0,0,0,0,3,0,0,1,0,1,1,3,1,0,0,0,1,1,0,1,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,1,2,2,0,3,4,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1),
(0,1,0,0,0,1,0,0,0,0,4,0,4,1,4,0,3,0,4,0,3,0,4,0,3,0,3,0,4,1,5,1,4,0,0,3,0,5,0,5,2,0,1,0,0,0,2,1,4,0,1,3,0,0,3,0,0,3,1,1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0),
(1,4,0,5,0,3,0,2,0,3,5,4,4,3,4,3,5,3,4,3,3,0,4,3,3,3,3,3,3,2,4,4,3,1,3,4,4,5,4,4,3,4,4,1,3,5,4,3,3,3,1,2,2,3,3,1,3,1,3,3,3,5,3,3,4,5,0,3,0,3,0,3,4,3,4,4,3,0,3,0,2,4,3),
(0,1,0,4,0,0,0,0,0,1,4,0,4,1,4,2,4,0,3,0,1,0,1,0,0,0,0,0,2,0,3,1,1,1,0,3,0,0,0,1,2,1,0,0,1,1,1,1,0,1,0,0,0,1,0,0,3,0,0,0,0,3,2,0,2,2,0,1,0,0,0,2,3,2,3,3,0,0,0,0,2,1,0),
(0,5,1,5,0,3,0,3,0,5,4,4,5,1,5,3,3,0,4,3,4,3,5,3,4,3,3,2,4,3,4,3,3,0,3,3,1,4,4,3,4,4,4,3,4,5,5,3,2,3,1,1,3,3,1,3,1,1,3,3,2,4,5,3,3,5,0,4,0,3,0,4,4,3,5,3,3,0,3,4,0,4,3),
(0,5,0,5,0,3,0,2,0,4,4,3,5,2,4,3,3,3,4,4,4,3,5,3,5,3,3,1,4,0,4,3,3,0,3,3,0,4,4,4,4,5,4,3,3,5,5,3,2,3,1,2,3,2,0,1,0,0,3,2,2,4,4,3,1,5,0,4,0,3,0,4,3,1,3,2,1,0,3,3,0,3,3),
(0,4,0,5,0,5,0,4,0,4,5,5,5,3,4,3,3,2,5,4,4,3,5,3,5,3,4,0,4,3,4,4,3,2,4,4,3,4,5,4,4,5,5,0,3,5,5,4,1,3,3,2,3,3,1,3,1,0,4,3,1,4,4,3,4,5,0,4,0,2,0,4,3,4,4,3,3,0,4,0,0,5,5),
(0,4,0,4,0,5,0,1,1,3,3,4,4,3,4,1,3,0,5,1,3,0,3,1,3,1,1,0,3,0,3,3,4,0,4,3,0,4,4,4,3,4,4,0,3,5,4,1,0,3,0,0,2,3,0,3,1,0,3,1,0,3,2,1,3,5,0,3,0,1,0,3,2,3,3,4,4,0,2,2,0,4,4),
(2,4,0,5,0,4,0,3,0,4,5,5,4,3,5,3,5,3,5,3,5,2,5,3,4,3,3,4,3,4,5,3,2,1,5,4,3,2,3,4,5,3,4,1,2,5,4,3,0,3,3,0,3,2,0,2,3,0,4,1,0,3,4,3,3,5,0,3,0,1,0,4,5,5,5,4,3,0,4,2,0,3,5),
(0,5,0,4,0,4,0,2,0,5,4,3,4,3,4,3,3,3,4,3,4,2,5,3,5,3,4,1,4,3,4,4,4,0,3,5,0,4,4,4,4,5,3,1,3,4,5,3,3,3,3,3,3,3,0,2,2,0,3,3,2,4,3,3,3,5,3,4,1,3,3,5,3,2,0,0,0,0,4,3,1,3,3),
(0,1,0,3,0,3,0,1,0,1,3,3,3,2,3,3,3,0,3,0,0,0,3,1,3,0,0,0,2,2,2,3,0,0,3,2,0,1,2,4,1,3,3,0,0,3,3,3,0,1,0,0,2,1,0,0,3,0,3,1,0,3,0,0,1,3,0,2,0,1,0,3,3,1,3,3,0,0,1,1,0,3,3),
(0,2,0,3,0,2,1,4,0,2,2,3,1,1,3,1,1,0,2,0,3,1,2,3,1,3,0,0,1,0,4,3,2,3,3,3,1,4,2,3,3,3,3,1,0,3,1,4,0,1,1,0,1,2,0,1,1,0,1,1,0,3,1,3,2,2,0,1,0,0,0,2,3,3,3,1,0,0,0,0,0,2,3),
(0,5,0,4,0,5,0,2,0,4,5,5,3,3,4,3,3,1,5,4,4,2,4,4,4,3,4,2,4,3,5,5,4,3,3,4,3,3,5,5,4,5,5,1,3,4,5,3,1,4,3,1,3,3,0,3,3,1,4,3,1,4,5,3,3,5,0,4,0,3,0,5,3,3,1,4,3,0,4,0,1,5,3),
(0,5,0,5,0,4,0,2,0,4,4,3,4,3,3,3,3,3,5,4,4,4,4,4,4,5,3,3,5,2,4,4,4,3,4,4,3,3,4,4,5,5,3,3,4,3,4,3,3,4,3,3,3,3,1,2,2,1,4,3,3,5,4,4,3,4,0,4,0,3,0,4,4,4,4,4,1,0,4,2,0,2,4),
(0,4,0,4,0,3,0,1,0,3,5,2,3,0,3,0,2,1,4,2,3,3,4,1,4,3,3,2,4,1,3,3,3,0,3,3,0,0,3,3,3,5,3,3,3,3,3,2,0,2,0,0,2,0,0,2,0,0,1,0,0,3,1,2,2,3,0,3,0,2,0,4,4,3,3,4,1,0,3,0,0,2,4),
(0,0,0,4,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,1,0,2,0,1,0,0,0,0,0,3,1,3,0,3,2,0,0,0,1,0,3,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,4,0,2,0,0,0,0,0,0,2),
(0,2,1,3,0,2,0,2,0,3,3,3,3,1,3,1,3,3,3,3,3,3,4,2,2,1,2,1,4,0,4,3,1,3,3,3,2,4,3,5,4,3,3,3,3,3,3,3,0,1,3,0,2,0,0,1,0,0,1,0,0,4,2,0,2,3,0,3,3,0,3,3,4,2,3,1,4,0,1,2,0,2,3),
(0,3,0,3,0,1,0,3,0,2,3,3,3,0,3,1,2,0,3,3,2,3,3,2,3,2,3,1,3,0,4,3,2,0,3,3,1,4,3,3,2,3,4,3,1,3,3,1,1,0,1,1,0,1,0,1,0,1,0,0,0,4,1,1,0,3,0,3,1,0,2,3,3,3,3,3,1,0,0,2,0,3,3),
(0,0,0,0,0,0,0,0,0,0,3,0,2,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,3,0,3,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,2,3,0,0,0,0,0,0,0,0,3),
(0,2,0,3,1,3,0,3,0,2,3,3,3,1,3,1,3,1,3,1,3,3,3,1,3,0,2,3,1,1,4,3,3,2,3,3,1,2,2,4,1,3,3,0,1,4,2,3,0,1,3,0,3,0,0,1,3,0,2,0,0,3,3,2,1,3,0,3,0,2,0,3,4,4,4,3,1,0,3,0,0,3,3),
(0,2,0,1,0,2,0,0,0,1,3,2,2,1,3,0,1,1,3,0,3,2,3,1,2,0,2,0,1,1,3,3,3,0,3,3,1,1,2,3,2,3,3,1,2,3,2,0,0,1,0,0,0,0,0,0,3,0,1,0,0,2,1,2,1,3,0,3,0,0,0,3,4,4,4,3,2,0,2,0,0,2,4),
(0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,0,0,3),
(0,3,0,3,0,2,0,3,0,3,3,3,2,3,2,2,2,0,3,1,3,3,3,2,3,3,0,0,3,0,3,2,2,0,2,3,1,4,3,4,3,3,2,3,1,5,4,4,0,3,1,2,1,3,0,3,1,1,2,0,2,3,1,3,1,3,0,3,0,1,0,3,3,4,4,2,1,0,2,1,0,2,4),
(0,1,0,3,0,1,0,2,0,1,4,2,5,1,4,0,2,0,2,1,3,1,4,0,2,1,0,0,2,1,4,1,1,0,3,3,0,5,1,3,2,3,3,1,0,3,2,3,0,1,0,0,0,0,0,0,1,0,0,0,0,4,0,1,0,3,0,2,0,1,0,3,3,3,4,3,3,0,0,0,0,2,3),
(0,0,0,1,0,0,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,1,0,0,0,0,0,3),
(0,1,0,3,0,4,0,3,0,2,4,3,1,0,3,2,2,1,3,1,2,2,3,1,1,1,2,1,3,0,1,2,0,1,3,2,1,3,0,5,5,1,0,0,1,3,2,1,0,3,0,0,1,0,0,0,0,0,3,4,0,1,1,1,3,2,0,2,0,1,0,2,3,3,1,2,3,0,1,0,1,0,4),
(0,0,0,1,0,3,0,3,0,2,2,1,0,0,4,0,3,0,3,1,3,0,3,0,3,0,1,0,3,0,3,1,3,0,3,3,0,0,1,2,1,1,1,0,1,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,2,2,1,2,0,0,2,0,0,0,0,2,3,3,3,3,0,0,0,0,1,4),
(0,0,0,3,0,3,0,0,0,0,3,1,1,0,3,0,1,0,2,0,1,0,0,0,0,0,0,0,1,0,3,0,2,0,2,3,0,0,2,2,3,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,2,3),
(2,4,0,5,0,5,0,4,0,3,4,3,3,3,4,3,3,3,4,3,4,4,5,4,5,5,5,2,3,0,5,5,4,1,5,4,3,1,5,4,3,4,4,3,3,4,3,3,0,3,2,0,2,3,0,3,0,0,3,3,0,5,3,2,3,3,0,3,0,3,0,3,4,5,4,5,3,0,4,3,0,3,4),
(0,3,0,3,0,3,0,3,0,3,3,4,3,2,3,2,3,0,4,3,3,3,3,3,3,3,3,0,3,2,4,3,3,1,3,4,3,4,4,4,3,4,4,3,2,4,4,1,0,2,0,0,1,1,0,2,0,0,3,1,0,5,3,2,1,3,0,3,0,1,2,4,3,2,4,3,3,0,3,2,0,4,4),
(0,3,0,3,0,1,0,0,0,1,4,3,3,2,3,1,3,1,4,2,3,2,4,2,3,4,3,0,2,2,3,3,3,0,3,3,3,0,3,4,1,3,3,0,3,4,3,3,0,1,1,0,1,0,0,0,4,0,3,0,0,3,1,2,1,3,0,4,0,1,0,4,3,3,4,3,3,0,2,0,0,3,3),
(0,3,0,4,0,1,0,3,0,3,4,3,3,0,3,3,3,1,3,1,3,3,4,3,3,3,0,0,3,1,5,3,3,1,3,3,2,5,4,3,3,4,5,3,2,5,3,4,0,1,0,0,0,0,0,2,0,0,1,1,0,4,2,2,1,3,0,3,0,2,0,4,4,3,5,3,2,0,1,1,0,3,4),
(0,5,0,4,0,5,0,2,0,4,4,3,3,2,3,3,3,1,4,3,4,1,5,3,4,3,4,0,4,2,4,3,4,1,5,4,0,4,4,4,4,5,4,1,3,5,4,2,1,4,1,1,3,2,0,3,1,0,3,2,1,4,3,3,3,4,0,4,0,3,0,4,4,4,3,3,3,0,4,2,0,3,4),
(1,4,0,4,0,3,0,1,0,3,3,3,1,1,3,3,2,2,3,3,1,0,3,2,2,1,2,0,3,1,2,1,2,0,3,2,0,2,2,3,3,4,3,0,3,3,1,2,0,1,1,3,1,2,0,0,3,0,1,1,0,3,2,2,3,3,0,3,0,0,0,2,3,3,4,3,3,0,1,0,0,1,4),
(0,4,0,4,0,4,0,0,0,3,4,4,3,1,4,2,3,2,3,3,3,1,4,3,4,0,3,0,4,2,3,3,2,2,5,4,2,1,3,4,3,4,3,1,3,3,4,2,0,2,1,0,3,3,0,0,2,0,3,1,0,4,4,3,4,3,0,4,0,1,0,2,4,4,4,4,4,0,3,2,0,3,3),
(0,0,0,1,0,4,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,3,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2),
(0,2,0,3,0,4,0,4,0,1,3,3,3,0,4,0,2,1,2,1,1,1,2,0,3,1,1,0,1,0,3,1,0,0,3,3,2,0,1,1,0,0,0,0,0,1,0,2,0,2,2,0,3,1,0,0,1,0,1,1,0,1,2,0,3,0,0,0,0,1,0,0,3,3,4,3,1,0,1,0,3,0,2),
(0,0,0,3,0,5,0,0,0,0,1,0,2,0,3,1,0,1,3,0,0,0,2,0,0,0,1,0,0,0,1,1,0,0,4,0,0,0,2,3,0,1,4,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,3),
(0,2,0,5,0,5,0,1,0,2,4,3,3,2,5,1,3,2,3,3,3,0,4,1,2,0,3,0,4,0,2,2,1,1,5,3,0,0,1,4,2,3,2,0,3,3,3,2,0,2,4,1,1,2,0,1,1,0,3,1,0,1,3,1,2,3,0,2,0,0,0,1,3,5,4,4,4,0,3,0,0,1,3),
(0,4,0,5,0,4,0,4,0,4,5,4,3,3,4,3,3,3,4,3,4,4,5,3,4,5,4,2,4,2,3,4,3,1,4,4,1,3,5,4,4,5,5,4,4,5,5,5,2,3,3,1,4,3,1,3,3,0,3,3,1,4,3,4,4,4,0,3,0,4,0,3,3,4,4,5,0,0,4,3,0,4,5),
(0,4,0,4,0,3,0,3,0,3,4,4,4,3,3,2,4,3,4,3,4,3,5,3,4,3,2,1,4,2,4,4,3,1,3,4,2,4,5,5,3,4,5,4,1,5,4,3,0,3,2,2,3,2,1,3,1,0,3,3,3,5,3,3,3,5,4,4,2,3,3,4,3,3,3,2,1,0,3,2,1,4,3),
(0,4,0,5,0,4,0,3,0,3,5,5,3,2,4,3,4,0,5,4,4,1,4,4,4,3,3,3,4,3,5,5,2,3,3,4,1,2,5,5,3,5,5,2,3,5,5,4,0,3,2,0,3,3,1,1,5,1,4,1,0,4,3,2,3,5,0,4,0,3,0,5,4,3,4,3,0,0,4,1,0,4,4),
(1,3,0,4,0,2,0,2,0,2,5,5,3,3,3,3,3,0,4,2,3,4,4,4,3,4,0,0,3,4,5,4,3,3,3,3,2,5,5,4,5,5,5,4,3,5,5,5,1,3,1,0,1,0,0,3,2,0,4,2,0,5,2,3,2,4,1,3,0,3,0,4,5,4,5,4,3,0,4,2,0,5,4),
(0,3,0,4,0,5,0,3,0,3,4,4,3,2,3,2,3,3,3,3,3,2,4,3,3,2,2,0,3,3,3,3,3,1,3,3,3,0,4,4,3,4,4,1,1,4,4,2,0,3,1,0,1,1,0,4,1,0,2,3,1,3,3,1,3,4,0,3,0,1,0,3,1,3,0,0,1,0,2,0,0,4,4),
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
(0,3,0,3,0,2,0,3,0,1,5,4,3,3,3,1,4,2,1,2,3,4,4,2,4,4,5,0,3,1,4,3,4,0,4,3,3,3,2,3,2,5,3,4,3,2,2,3,0,0,3,0,2,1,0,1,2,0,0,0,0,2,1,1,3,1,0,2,0,4,0,3,4,4,4,5,2,0,2,0,0,1,3),
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,1,0,0,0,4,2,1,1,0,1,0,3,2,0,0,3,1,1,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,2,0,0,0,1,4,0,4,2,1,0,0,0,0,0,1),
(0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,3,1,0,0,0,2,0,2,1,0,0,1,2,1,0,1,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,3,1,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,2),
(0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3),
(0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1),
# fmt: off
jp2_char_context = (
(0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),
(2, 4, 0, 4, 0, 3, 0, 4, 0, 3, 4, 4, 4, 2, 4, 3, 3, 4, 3, 2, 3, 3, 4, 2, 3, 3, 3, 2, 4, 1, 4, 3, 3, 1, 5, 4, 3, 4, 3, 4, 3, 5, 3, 0, 3, 5, 4, 2, 0, 3, 1, 0, 3, 3, 0, 3, 3, 0, 1, 1, 0, 4, 3, 0, 3, 3, 0, 4, 0, 2, 0, 3, 5, 5, 5, 5, 4, 0, 4, 1, 0, 3, 4),
(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2),
(0, 4, 0, 5, 0, 5, 0, 4, 0, 4, 5, 4, 4, 3, 5, 3, 5, 1, 5, 3, 4, 3, 4, 4, 3, 4, 3, 3, 4, 3, 5, 4, 4, 3, 5, 5, 3, 5, 5, 5, 3, 5, 5, 3, 4, 5, 5, 3, 1, 3, 2, 0, 3, 4, 0, 4, 2, 0, 4, 2, 1, 5, 3, 2, 3, 5, 0, 4, 0, 2, 0, 5, 4, 4, 5, 4, 5, 0, 4, 0, 0, 4, 4),
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
(0, 3, 0, 4, 0, 3, 0, 3, 0, 4, 5, 4, 3, 3, 3, 3, 4, 3, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 4, 4, 4, 4, 5, 3, 4, 4, 3, 4, 5, 5, 4, 5, 5, 1, 4, 5, 4, 3, 0, 3, 3, 1, 3, 3, 0, 4, 4, 0, 3, 3, 1, 5, 3, 3, 3, 5, 0, 4, 0, 3, 0, 4, 4, 3, 4, 3, 3, 0, 4, 1, 1, 3, 4),
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
(0, 4, 0, 3, 0, 3, 0, 4, 0, 3, 4, 4, 3, 2, 2, 1, 2, 1, 3, 1, 3, 3, 3, 3, 3, 4, 3, 1, 3, 3, 5, 3, 3, 0, 4, 3, 0, 5, 4, 3, 3, 5, 4, 4, 3, 4, 4, 5, 0, 1, 2, 0, 1, 2, 0, 2, 2, 0, 1, 0, 0, 5, 2, 2, 1, 4, 0, 3, 0, 1, 0, 4, 4, 3, 5, 4, 3, 0, 2, 1, 0, 4, 3),
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
(0, 3, 0, 5, 0, 4, 0, 2, 1, 4, 4, 2, 4, 1, 4, 2, 4, 2, 4, 3, 3, 3, 4, 3, 3, 3, 3, 1, 4, 2, 3, 3, 3, 1, 4, 4, 1, 1, 1, 4, 3, 3, 2, 0, 2, 4, 3, 2, 0, 3, 3, 0, 3, 1, 1, 0, 0, 0, 3, 3, 0, 4, 2, 2, 3, 4, 0, 4, 0, 3, 0, 4, 4, 5, 3, 4, 4, 0, 3, 0, 0, 1, 4),
(1, 4, 0, 4, 0, 4, 0, 4, 0, 3, 5, 4, 4, 3, 4, 3, 5, 4, 3, 3, 4, 3, 5, 4, 4, 4, 4, 3, 4, 2, 4, 3, 3, 1, 5, 4, 3, 2, 4, 5, 4, 5, 5, 4, 4, 5, 4, 4, 0, 3, 2, 2, 3, 3, 0, 4, 3, 1, 3, 2, 1, 4, 3, 3, 4, 5, 0, 3, 0, 2, 0, 4, 5, 5, 4, 5, 4, 0, 4, 0, 0, 5, 4),
(0, 5, 0, 5, 0, 4, 0, 3, 0, 4, 4, 3, 4, 3, 3, 3, 4, 0, 4, 4, 4, 3, 4, 3, 4, 3, 3, 1, 4, 2, 4, 3, 4, 0, 5, 4, 1, 4, 5, 4, 4, 5, 3, 2, 4, 3, 4, 3, 2, 4, 1, 3, 3, 3, 2, 3, 2, 0, 4, 3, 3, 4, 3, 3, 3, 4, 0, 4, 0, 3, 0, 4, 5, 4, 4, 4, 3, 0, 4, 1, 0, 1, 3),
(0, 3, 1, 4, 0, 3, 0, 2, 0, 3, 4, 4, 3, 1, 4, 2, 3, 3, 4, 3, 4, 3, 4, 3, 4, 4, 3, 2, 3, 1, 5, 4, 4, 1, 4, 4, 3, 5, 4, 4, 3, 5, 5, 4, 3, 4, 4, 3, 1, 2, 3, 1, 2, 2, 0, 3, 2, 0, 3, 1, 0, 5, 3, 3, 3, 4, 3, 3, 3, 3, 4, 4, 4, 4, 5, 4, 2, 0, 3, 3, 2, 4, 3),
(0, 2, 0, 3, 0, 1, 0, 1, 0, 0, 3, 2, 0, 0, 2, 0, 1, 0, 2, 1, 3, 3, 3, 1, 2, 3, 1, 0, 1, 0, 4, 2, 1, 1, 3, 3, 0, 4, 3, 3, 1, 4, 3, 3, 0, 3, 3, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 4, 1, 0, 2, 3, 2, 2, 2, 1, 3, 3, 3, 4, 4, 3, 2, 0, 3, 1, 0, 3, 3),
(0, 4, 0, 4, 0, 3, 0, 3, 0, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 3, 4, 2, 4, 3, 4, 3, 3, 2, 4, 3, 4, 5, 4, 1, 4, 5, 3, 5, 4, 5, 3, 5, 4, 0, 3, 5, 5, 3, 1, 3, 3, 2, 2, 3, 0, 3, 4, 1, 3, 3, 2, 4, 3, 3, 3, 4, 0, 4, 0, 3, 0, 4, 5, 4, 4, 5, 3, 0, 4, 1, 0, 3, 4),
(0, 2, 0, 3, 0, 3, 0, 0, 0, 2, 2, 2, 1, 0, 1, 0, 0, 0, 3, 0, 3, 0, 3, 0, 1, 3, 1, 0, 3, 1, 3, 3, 3, 1, 3, 3, 3, 0, 1, 3, 1, 3, 4, 0, 0, 3, 1, 1, 0, 3, 2, 0, 0, 0, 0, 1, 3, 0, 1, 0, 0, 3, 3, 2, 0, 3, 0, 0, 0, 0, 0, 3, 4, 3, 4, 3, 3, 0, 3, 0, 0, 2, 3),
(2, 3, 0, 3, 0, 2, 0, 1, 0, 3, 3, 4, 3, 1, 3, 1, 1, 1, 3, 1, 4, 3, 4, 3, 3, 3, 0, 0, 3, 1, 5, 4, 3, 1, 4, 3, 2, 5, 5, 4, 4, 4, 4, 3, 3, 4, 4, 4, 0, 2, 1, 1, 3, 2, 0, 1, 2, 0, 0, 1, 0, 4, 1, 3, 3, 3, 0, 3, 0, 1, 0, 4, 4, 4, 5, 5, 3, 0, 2, 0, 0, 4, 4),
(0, 2, 0, 1, 0, 3, 1, 3, 0, 2, 3, 3, 3, 0, 3, 1, 0, 0, 3, 0, 3, 2, 3, 1, 3, 2, 1, 1, 0, 0, 4, 2, 1, 0, 2, 3, 1, 4, 3, 2, 0, 4, 4, 3, 1, 3, 1, 3, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 1, 1, 1, 2, 0, 3, 0, 0, 0, 3, 4, 2, 4, 3, 2, 0, 1, 0, 0, 3, 3),
(0, 1, 0, 4, 0, 5, 0, 4, 0, 2, 4, 4, 2, 3, 3, 2, 3, 3, 5, 3, 3, 3, 4, 3, 4, 2, 3, 0, 4, 3, 3, 3, 4, 1, 4, 3, 2, 1, 5, 5, 3, 4, 5, 1, 3, 5, 4, 2, 0, 3, 3, 0, 1, 3, 0, 4, 2, 0, 1, 3, 1, 4, 3, 3, 3, 3, 0, 3, 0, 1, 0, 3, 4, 4, 4, 5, 5, 0, 3, 0, 1, 4, 5),
(0, 2, 0, 3, 0, 3, 0, 0, 0, 2, 3, 1, 3, 0, 4, 0, 1, 1, 3, 0, 3, 4, 3, 2, 3, 1, 0, 3, 3, 2, 3, 1, 3, 0, 2, 3, 0, 2, 1, 4, 1, 2, 2, 0, 0, 3, 3, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 3, 2, 1, 3, 3, 0, 2, 0, 2, 0, 0, 3, 3, 1, 2, 4, 0, 3, 0, 2, 2, 3),
(2, 4, 0, 5, 0, 4, 0, 4, 0, 2, 4, 4, 4, 3, 4, 3, 3, 3, 1, 2, 4, 3, 4, 3, 4, 4, 5, 0, 3, 3, 3, 3, 2, 0, 4, 3, 1, 4, 3, 4, 1, 4, 4, 3, 3, 4, 4, 3, 1, 2, 3, 0, 4, 2, 0, 4, 1, 0, 3, 3, 0, 4, 3, 3, 3, 4, 0, 4, 0, 2, 0, 3, 5, 3, 4, 5, 2, 0, 3, 0, 0, 4, 5),
(0, 3, 0, 4, 0, 1, 0, 1, 0, 1, 3, 2, 2, 1, 3, 0, 3, 0, 2, 0, 2, 0, 3, 0, 2, 0, 0, 0, 1, 0, 1, 1, 0, 0, 3, 1, 0, 0, 0, 4, 0, 3, 1, 0, 2, 1, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 3, 1, 0, 3, 0, 0, 0, 1, 4, 4, 4, 3, 0, 0, 4, 0, 0, 1, 4),
(1, 4, 1, 5, 0, 3, 0, 3, 0, 4, 5, 4, 4, 3, 5, 3, 3, 4, 4, 3, 4, 1, 3, 3, 3, 3, 2, 1, 4, 1, 5, 4, 3, 1, 4, 4, 3, 5, 4, 4, 3, 5, 4, 3, 3, 4, 4, 4, 0, 3, 3, 1, 2, 3, 0, 3, 1, 0, 3, 3, 0, 5, 4, 4, 4, 4, 4, 4, 3, 3, 5, 4, 4, 3, 3, 5, 4, 0, 3, 2, 0, 4, 4),
(0, 2, 0, 3, 0, 1, 0, 0, 0, 1, 3, 3, 3, 2, 4, 1, 3, 0, 3, 1, 3, 0, 2, 2, 1, 1, 0, 0, 2, 0, 4, 3, 1, 0, 4, 3, 0, 4, 4, 4, 1, 4, 3, 1, 1, 3, 3, 1, 0, 2, 0, 0, 1, 3, 0, 0, 0, 0, 2, 0, 0, 4, 3, 2, 4, 3, 5, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 0, 2, 1, 0, 3, 3),
(0, 2, 0, 4, 0, 3, 0, 2, 0, 2, 5, 5, 3, 4, 4, 4, 4, 1, 4, 3, 3, 0, 4, 3, 4, 3, 1, 3, 3, 2, 4, 3, 0, 3, 4, 3, 0, 3, 4, 4, 2, 4, 4, 0, 4, 5, 3, 3, 2, 2, 1, 1, 1, 2, 0, 1, 5, 0, 3, 3, 2, 4, 3, 3, 3, 4, 0, 3, 0, 2, 0, 4, 4, 3, 5, 5, 0, 0, 3, 0, 2, 3, 3),
(0, 3, 0, 4, 0, 3, 0, 1, 0, 3, 4, 3, 3, 1, 3, 3, 3, 0, 3, 1, 3, 0, 4, 3, 3, 1, 1, 0, 3, 0, 3, 3, 0, 0, 4, 4, 0, 1, 5, 4, 3, 3, 5, 0, 3, 3, 4, 3, 0, 2, 0, 1, 1, 1, 0, 1, 3, 0, 1, 2, 1, 3, 3, 2, 3, 3, 0, 3, 0, 1, 0, 1, 3, 3, 4, 4, 1, 0, 1, 2, 2, 1, 3),
(0, 1, 0, 4, 0, 4, 0, 3, 0, 1, 3, 3, 3, 2, 3, 1, 1, 0, 3, 0, 3, 3, 4, 3, 2, 4, 2, 0, 1, 0, 4, 3, 2, 0, 4, 3, 0, 5, 3, 3, 2, 4, 4, 4, 3, 3, 3, 4, 0, 1, 3, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 4, 2, 3, 3, 3, 0, 3, 0, 0, 0, 4, 4, 4, 5, 3, 2, 0, 3, 3, 0, 3, 5),
(0, 2, 0, 3, 0, 0, 0, 3, 0, 1, 3, 0, 2, 0, 0, 0, 1, 0, 3, 1, 1, 3, 3, 0, 0, 3, 0, 0, 3, 0, 2, 3, 1, 0, 3, 1, 0, 3, 3, 2, 0, 4, 2, 2, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 0, 1, 0, 1, 0, 0, 0, 1, 3, 1, 2, 0, 0, 0, 1, 0, 0, 1, 4),
(0, 3, 0, 3, 0, 5, 0, 1, 0, 2, 4, 3, 1, 3, 3, 2, 1, 1, 5, 2, 1, 0, 5, 1, 2, 0, 0, 0, 3, 3, 2, 2, 3, 2, 4, 3, 0, 0, 3, 3, 1, 3, 3, 0, 2, 5, 3, 4, 0, 3, 3, 0, 1, 2, 0, 2, 2, 0, 3, 2, 0, 2, 2, 3, 3, 3, 0, 2, 0, 1, 0, 3, 4, 4, 2, 5, 4, 0, 3, 0, 0, 3, 5),
(0, 3, 0, 3, 0, 3, 0, 1, 0, 3, 3, 3, 3, 0, 3, 0, 2, 0, 2, 1, 1, 0, 2, 0, 1, 0, 0, 0, 2, 1, 0, 0, 1, 0, 3, 2, 0, 0, 3, 3, 1, 2, 3, 1, 0, 3, 3, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 3, 1, 2, 3, 0, 3, 0, 1, 0, 3, 2, 1, 0, 4, 3, 0, 1, 1, 0, 3, 3),
(0, 4, 0, 5, 0, 3, 0, 3, 0, 4, 5, 5, 4, 3, 5, 3, 4, 3, 5, 3, 3, 2, 5, 3, 4, 4, 4, 3, 4, 3, 4, 5, 5, 3, 4, 4, 3, 4, 4, 5, 4, 4, 4, 3, 4, 5, 5, 4, 2, 3, 4, 2, 3, 4, 0, 3, 3, 1, 4, 3, 2, 4, 3, 3, 5, 5, 0, 3, 0, 3, 0, 5, 5, 5, 5, 4, 4, 0, 4, 0, 1, 4, 4),
(0, 4, 0, 4, 0, 3, 0, 3, 0, 3, 5, 4, 4, 2, 3, 2, 5, 1, 3, 2, 5, 1, 4, 2, 3, 2, 3, 3, 4, 3, 3, 3, 3, 2, 5, 4, 1, 3, 3, 5, 3, 4, 4, 0, 4, 4, 3, 1, 1, 3, 1, 0, 2, 3, 0, 2, 3, 0, 3, 0, 0, 4, 3, 1, 3, 4, 0, 3, 0, 2, 0, 4, 4, 4, 3, 4, 5, 0, 4, 0, 0, 3, 4),
(0, 3, 0, 3, 0, 3, 1, 2, 0, 3, 4, 4, 3, 3, 3, 0, 2, 2, 4, 3, 3, 1, 3, 3, 3, 1, 1, 0, 3, 1, 4, 3, 2, 3, 4, 4, 2, 4, 4, 4, 3, 4, 4, 3, 2, 4, 4, 3, 1, 3, 3, 1, 3, 3, 0, 4, 1, 0, 2, 2, 1, 4, 3, 2, 3, 3, 5, 4, 3, 3, 5, 4, 4, 3, 3, 0, 4, 0, 3, 2, 2, 4, 4),
(0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 2, 1, 3, 0, 0, 0, 0, 0, 2, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 0, 3, 0, 0, 1, 0, 1, 1, 3, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0, 3, 4, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1),
(0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 4, 1, 4, 0, 3, 0, 4, 0, 3, 0, 4, 0, 3, 0, 3, 0, 4, 1, 5, 1, 4, 0, 0, 3, 0, 5, 0, 5, 2, 0, 1, 0, 0, 0, 2, 1, 4, 0, 1, 3, 0, 0, 3, 0, 0, 3, 1, 1, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0),
(1, 4, 0, 5, 0, 3, 0, 2, 0, 3, 5, 4, 4, 3, 4, 3, 5, 3, 4, 3, 3, 0, 4, 3, 3, 3, 3, 3, 3, 2, 4, 4, 3, 1, 3, 4, 4, 5, 4, 4, 3, 4, 4, 1, 3, 5, 4, 3, 3, 3, 1, 2, 2, 3, 3, 1, 3, 1, 3, 3, 3, 5, 3, 3, 4, 5, 0, 3, 0, 3, 0, 3, 4, 3, 4, 4, 3, 0, 3, 0, 2, 4, 3),
(0, 1, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 4, 1, 4, 2, 4, 0, 3, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0, 3, 1, 1, 1, 0, 3, 0, 0, 0, 1, 2, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 3, 2, 0, 2, 2, 0, 1, 0, 0, 0, 2, 3, 2, 3, 3, 0, 0, 0, 0, 2, 1, 0),
(0, 5, 1, 5, 0, 3, 0, 3, 0, 5, 4, 4, 5, 1, 5, 3, 3, 0, 4, 3, 4, 3, 5, 3, 4, 3, 3, 2, 4, 3, 4, 3, 3, 0, 3, 3, 1, 4, 4, 3, 4, 4, 4, 3, 4, 5, 5, 3, 2, 3, 1, 1, 3, 3, 1, 3, 1, 1, 3, 3, 2, 4, 5, 3, 3, 5, 0, 4, 0, 3, 0, 4, 4, 3, 5, 3, 3, 0, 3, 4, 0, 4, 3),
(0, 5, 0, 5, 0, 3, 0, 2, 0, 4, 4, 3, 5, 2, 4, 3, 3, 3, 4, 4, 4, 3, 5, 3, 5, 3, 3, 1, 4, 0, 4, 3, 3, 0, 3, 3, 0, 4, 4, 4, 4, 5, 4, 3, 3, 5, 5, 3, 2, 3, 1, 2, 3, 2, 0, 1, 0, 0, 3, 2, 2, 4, 4, 3, 1, 5, 0, 4, 0, 3, 0, 4, 3, 1, 3, 2, 1, 0, 3, 3, 0, 3, 3),
(0, 4, 0, 5, 0, 5, 0, 4, 0, 4, 5, 5, 5, 3, 4, 3, 3, 2, 5, 4, 4, 3, 5, 3, 5, 3, 4, 0, 4, 3, 4, 4, 3, 2, 4, 4, 3, 4, 5, 4, 4, 5, 5, 0, 3, 5, 5, 4, 1, 3, 3, 2, 3, 3, 1, 3, 1, 0, 4, 3, 1, 4, 4, 3, 4, 5, 0, 4, 0, 2, 0, 4, 3, 4, 4, 3, 3, 0, 4, 0, 0, 5, 5),
(0, 4, 0, 4, 0, 5, 0, 1, 1, 3, 3, 4, 4, 3, 4, 1, 3, 0, 5, 1, 3, 0, 3, 1, 3, 1, 1, 0, 3, 0, 3, 3, 4, 0, 4, 3, 0, 4, 4, 4, 3, 4, 4, 0, 3, 5, 4, 1, 0, 3, 0, 0, 2, 3, 0, 3, 1, 0, 3, 1, 0, 3, 2, 1, 3, 5, 0, 3, 0, 1, 0, 3, 2, 3, 3, 4, 4, 0, 2, 2, 0, 4, 4),
(2, 4, 0, 5, 0, 4, 0, 3, 0, 4, 5, 5, 4, 3, 5, 3, 5, 3, 5, 3, 5, 2, 5, 3, 4, 3, 3, 4, 3, 4, 5, 3, 2, 1, 5, 4, 3, 2, 3, 4, 5, 3, 4, 1, 2, 5, 4, 3, 0, 3, 3, 0, 3, 2, 0, 2, 3, 0, 4, 1, 0, 3, 4, 3, 3, 5, 0, 3, 0, 1, 0, 4, 5, 5, 5, 4, 3, 0, 4, 2, 0, 3, 5),
(0, 5, 0, 4, 0, 4, 0, 2, 0, 5, 4, 3, 4, 3, 4, 3, 3, 3, 4, 3, 4, 2, 5, 3, 5, 3, 4, 1, 4, 3, 4, 4, 4, 0, 3, 5, 0, 4, 4, 4, 4, 5, 3, 1, 3, 4, 5, 3, 3, 3, 3, 3, 3, 3, 0, 2, 2, 0, 3, 3, 2, 4, 3, 3, 3, 5, 3, 4, 1, 3, 3, 5, 3, 2, 0, 0, 0, 0, 4, 3, 1, 3, 3),
(0, 1, 0, 3, 0, 3, 0, 1, 0, 1, 3, 3, 3, 2, 3, 3, 3, 0, 3, 0, 0, 0, 3, 1, 3, 0, 0, 0, 2, 2, 2, 3, 0, 0, 3, 2, 0, 1, 2, 4, 1, 3, 3, 0, 0, 3, 3, 3, 0, 1, 0, 0, 2, 1, 0, 0, 3, 0, 3, 1, 0, 3, 0, 0, 1, 3, 0, 2, 0, 1, 0, 3, 3, 1, 3, 3, 0, 0, 1, 1, 0, 3, 3),
(0, 2, 0, 3, 0, 2, 1, 4, 0, 2, 2, 3, 1, 1, 3, 1, 1, 0, 2, 0, 3, 1, 2, 3, 1, 3, 0, 0, 1, 0, 4, 3, 2, 3, 3, 3, 1, 4, 2, 3, 3, 3, 3, 1, 0, 3, 1, 4, 0, 1, 1, 0, 1, 2, 0, 1, 1, 0, 1, 1, 0, 3, 1, 3, 2, 2, 0, 1, 0, 0, 0, 2, 3, 3, 3, 1, 0, 0, 0, 0, 0, 2, 3),
(0, 5, 0, 4, 0, 5, 0, 2, 0, 4, 5, 5, 3, 3, 4, 3, 3, 1, 5, 4, 4, 2, 4, 4, 4, 3, 4, 2, 4, 3, 5, 5, 4, 3, 3, 4, 3, 3, 5, 5, 4, 5, 5, 1, 3, 4, 5, 3, 1, 4, 3, 1, 3, 3, 0, 3, 3, 1, 4, 3, 1, 4, 5, 3, 3, 5, 0, 4, 0, 3, 0, 5, 3, 3, 1, 4, 3, 0, 4, 0, 1, 5, 3),
(0, 5, 0, 5, 0, 4, 0, 2, 0, 4, 4, 3, 4, 3, 3, 3, 3, 3, 5, 4, 4, 4, 4, 4, 4, 5, 3, 3, 5, 2, 4, 4, 4, 3, 4, 4, 3, 3, 4, 4, 5, 5, 3, 3, 4, 3, 4, 3, 3, 4, 3, 3, 3, 3, 1, 2, 2, 1, 4, 3, 3, 5, 4, 4, 3, 4, 0, 4, 0, 3, 0, 4, 4, 4, 4, 4, 1, 0, 4, 2, 0, 2, 4),
(0, 4, 0, 4, 0, 3, 0, 1, 0, 3, 5, 2, 3, 0, 3, 0, 2, 1, 4, 2, 3, 3, 4, 1, 4, 3, 3, 2, 4, 1, 3, 3, 3, 0, 3, 3, 0, 0, 3, 3, 3, 5, 3, 3, 3, 3, 3, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 1, 0, 0, 3, 1, 2, 2, 3, 0, 3, 0, 2, 0, 4, 4, 3, 3, 4, 1, 0, 3, 0, 0, 2, 4),
(0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 3, 1, 3, 0, 3, 2, 0, 0, 0, 1, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 2, 0, 0, 0, 0, 0, 0, 2),
(0, 2, 1, 3, 0, 2, 0, 2, 0, 3, 3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3, 4, 2, 2, 1, 2, 1, 4, 0, 4, 3, 1, 3, 3, 3, 2, 4, 3, 5, 4, 3, 3, 3, 3, 3, 3, 3, 0, 1, 3, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 2, 3, 0, 3, 3, 0, 3, 3, 4, 2, 3, 1, 4, 0, 1, 2, 0, 2, 3),
(0, 3, 0, 3, 0, 1, 0, 3, 0, 2, 3, 3, 3, 0, 3, 1, 2, 0, 3, 3, 2, 3, 3, 2, 3, 2, 3, 1, 3, 0, 4, 3, 2, 0, 3, 3, 1, 4, 3, 3, 2, 3, 4, 3, 1, 3, 3, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 4, 1, 1, 0, 3, 0, 3, 1, 0, 2, 3, 3, 3, 3, 3, 1, 0, 0, 2, 0, 3, 3),
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 3, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3),
(0, 2, 0, 3, 1, 3, 0, 3, 0, 2, 3, 3, 3, 1, 3, 1, 3, 1, 3, 1, 3, 3, 3, 1, 3, 0, 2, 3, 1, 1, 4, 3, 3, 2, 3, 3, 1, 2, 2, 4, 1, 3, 3, 0, 1, 4, 2, 3, 0, 1, 3, 0, 3, 0, 0, 1, 3, 0, 2, 0, 0, 3, 3, 2, 1, 3, 0, 3, 0, 2, 0, 3, 4, 4, 4, 3, 1, 0, 3, 0, 0, 3, 3),
(0, 2, 0, 1, 0, 2, 0, 0, 0, 1, 3, 2, 2, 1, 3, 0, 1, 1, 3, 0, 3, 2, 3, 1, 2, 0, 2, 0, 1, 1, 3, 3, 3, 0, 3, 3, 1, 1, 2, 3, 2, 3, 3, 1, 2, 3, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 2, 1, 2, 1, 3, 0, 3, 0, 0, 0, 3, 4, 4, 4, 3, 2, 0, 2, 0, 0, 2, 4),
(0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 3, 1, 0, 0, 0, 0, 0, 0, 0, 3),
(0, 3, 0, 3, 0, 2, 0, 3, 0, 3, 3, 3, 2, 3, 2, 2, 2, 0, 3, 1, 3, 3, 3, 2, 3, 3, 0, 0, 3, 0, 3, 2, 2, 0, 2, 3, 1, 4, 3, 4, 3, 3, 2, 3, 1, 5, 4, 4, 0, 3, 1, 2, 1, 3, 0, 3, 1, 1, 2, 0, 2, 3, 1, 3, 1, 3, 0, 3, 0, 1, 0, 3, 3, 4, 4, 2, 1, 0, 2, 1, 0, 2, 4),
(0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 4, 2, 5, 1, 4, 0, 2, 0, 2, 1, 3, 1, 4, 0, 2, 1, 0, 0, 2, 1, 4, 1, 1, 0, 3, 3, 0, 5, 1, 3, 2, 3, 3, 1, 0, 3, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 1, 0, 3, 0, 2, 0, 1, 0, 3, 3, 3, 4, 3, 3, 0, 0, 0, 0, 2, 3),
(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 3),
(0, 1, 0, 3, 0, 4, 0, 3, 0, 2, 4, 3, 1, 0, 3, 2, 2, 1, 3, 1, 2, 2, 3, 1, 1, 1, 2, 1, 3, 0, 1, 2, 0, 1, 3, 2, 1, 3, 0, 5, 5, 1, 0, 0, 1, 3, 2, 1, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 3, 4, 0, 1, 1, 1, 3, 2, 0, 2, 0, 1, 0, 2, 3, 3, 1, 2, 3, 0, 1, 0, 1, 0, 4),
(0, 0, 0, 1, 0, 3, 0, 3, 0, 2, 2, 1, 0, 0, 4, 0, 3, 0, 3, 1, 3, 0, 3, 0, 3, 0, 1, 0, 3, 0, 3, 1, 3, 0, 3, 3, 0, 0, 1, 2, 1, 1, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 2, 0, 0, 2, 0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 0, 0, 0, 1, 4),
(0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 3, 1, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 2, 0, 2, 3, 0, 0, 2, 2, 3, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, 2, 3),
(2, 4, 0, 5, 0, 5, 0, 4, 0, 3, 4, 3, 3, 3, 4, 3, 3, 3, 4, 3, 4, 4, 5, 4, 5, 5, 5, 2, 3, 0, 5, 5, 4, 1, 5, 4, 3, 1, 5, 4, 3, 4, 4, 3, 3, 4, 3, 3, 0, 3, 2, 0, 2, 3, 0, 3, 0, 0, 3, 3, 0, 5, 3, 2, 3, 3, 0, 3, 0, 3, 0, 3, 4, 5, 4, 5, 3, 0, 4, 3, 0, 3, 4),
(0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 3, 4, 3, 2, 3, 2, 3, 0, 4, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 2, 4, 3, 3, 1, 3, 4, 3, 4, 4, 4, 3, 4, 4, 3, 2, 4, 4, 1, 0, 2, 0, 0, 1, 1, 0, 2, 0, 0, 3, 1, 0, 5, 3, 2, 1, 3, 0, 3, 0, 1, 2, 4, 3, 2, 4, 3, 3, 0, 3, 2, 0, 4, 4),
(0, 3, 0, 3, 0, 1, 0, 0, 0, 1, 4, 3, 3, 2, 3, 1, 3, 1, 4, 2, 3, 2, 4, 2, 3, 4, 3, 0, 2, 2, 3, 3, 3, 0, 3, 3, 3, 0, 3, 4, 1, 3, 3, 0, 3, 4, 3, 3, 0, 1, 1, 0, 1, 0, 0, 0, 4, 0, 3, 0, 0, 3, 1, 2, 1, 3, 0, 4, 0, 1, 0, 4, 3, 3, 4, 3, 3, 0, 2, 0, 0, 3, 3),
(0, 3, 0, 4, 0, 1, 0, 3, 0, 3, 4, 3, 3, 0, 3, 3, 3, 1, 3, 1, 3, 3, 4, 3, 3, 3, 0, 0, 3, 1, 5, 3, 3, 1, 3, 3, 2, 5, 4, 3, 3, 4, 5, 3, 2, 5, 3, 4, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 0, 4, 2, 2, 1, 3, 0, 3, 0, 2, 0, 4, 4, 3, 5, 3, 2, 0, 1, 1, 0, 3, 4),
(0, 5, 0, 4, 0, 5, 0, 2, 0, 4, 4, 3, 3, 2, 3, 3, 3, 1, 4, 3, 4, 1, 5, 3, 4, 3, 4, 0, 4, 2, 4, 3, 4, 1, 5, 4, 0, 4, 4, 4, 4, 5, 4, 1, 3, 5, 4, 2, 1, 4, 1, 1, 3, 2, 0, 3, 1, 0, 3, 2, 1, 4, 3, 3, 3, 4, 0, 4, 0, 3, 0, 4, 4, 4, 3, 3, 3, 0, 4, 2, 0, 3, 4),
(1, 4, 0, 4, 0, 3, 0, 1, 0, 3, 3, 3, 1, 1, 3, 3, 2, 2, 3, 3, 1, 0, 3, 2, 2, 1, 2, 0, 3, 1, 2, 1, 2, 0, 3, 2, 0, 2, 2, 3, 3, 4, 3, 0, 3, 3, 1, 2, 0, 1, 1, 3, 1, 2, 0, 0, 3, 0, 1, 1, 0, 3, 2, 2, 3, 3, 0, 3, 0, 0, 0, 2, 3, 3, 4, 3, 3, 0, 1, 0, 0, 1, 4),
(0, 4, 0, 4, 0, 4, 0, 0, 0, 3, 4, 4, 3, 1, 4, 2, 3, 2, 3, 3, 3, 1, 4, 3, 4, 0, 3, 0, 4, 2, 3, 3, 2, 2, 5, 4, 2, 1, 3, 4, 3, 4, 3, 1, 3, 3, 4, 2, 0, 2, 1, 0, 3, 3, 0, 0, 2, 0, 3, 1, 0, 4, 4, 3, 4, 3, 0, 4, 0, 1, 0, 2, 4, 4, 4, 4, 4, 0, 3, 2, 0, 3, 3),
(0, 0, 0, 1, 0, 4, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2),
(0, 2, 0, 3, 0, 4, 0, 4, 0, 1, 3, 3, 3, 0, 4, 0, 2, 1, 2, 1, 1, 1, 2, 0, 3, 1, 1, 0, 1, 0, 3, 1, 0, 0, 3, 3, 2, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 2, 2, 0, 3, 1, 0, 0, 1, 0, 1, 1, 0, 1, 2, 0, 3, 0, 0, 0, 0, 1, 0, 0, 3, 3, 4, 3, 1, 0, 1, 0, 3, 0, 2),
(0, 0, 0, 3, 0, 5, 0, 0, 0, 0, 1, 0, 2, 0, 3, 1, 0, 1, 3, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 4, 0, 0, 0, 2, 3, 0, 1, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 3),
(0, 2, 0, 5, 0, 5, 0, 1, 0, 2, 4, 3, 3, 2, 5, 1, 3, 2, 3, 3, 3, 0, 4, 1, 2, 0, 3, 0, 4, 0, 2, 2, 1, 1, 5, 3, 0, 0, 1, 4, 2, 3, 2, 0, 3, 3, 3, 2, 0, 2, 4, 1, 1, 2, 0, 1, 1, 0, 3, 1, 0, 1, 3, 1, 2, 3, 0, 2, 0, 0, 0, 1, 3, 5, 4, 4, 4, 0, 3, 0, 0, 1, 3),
(0, 4, 0, 5, 0, 4, 0, 4, 0, 4, 5, 4, 3, 3, 4, 3, 3, 3, 4, 3, 4, 4, 5, 3, 4, 5, 4, 2, 4, 2, 3, 4, 3, 1, 4, 4, 1, 3, 5, 4, 4, 5, 5, 4, 4, 5, 5, 5, 2, 3, 3, 1, 4, 3, 1, 3, 3, 0, 3, 3, 1, 4, 3, 4, 4, 4, 0, 3, 0, 4, 0, 3, 3, 4, 4, 5, 0, 0, 4, 3, 0, 4, 5),
(0, 4, 0, 4, 0, 3, 0, 3, 0, 3, 4, 4, 4, 3, 3, 2, 4, 3, 4, 3, 4, 3, 5, 3, 4, 3, 2, 1, 4, 2, 4, 4, 3, 1, 3, 4, 2, 4, 5, 5, 3, 4, 5, 4, 1, 5, 4, 3, 0, 3, 2, 2, 3, 2, 1, 3, 1, 0, 3, 3, 3, 5, 3, 3, 3, 5, 4, 4, 2, 3, 3, 4, 3, 3, 3, 2, 1, 0, 3, 2, 1, 4, 3),
(0, 4, 0, 5, 0, 4, 0, 3, 0, 3, 5, 5, 3, 2, 4, 3, 4, 0, 5, 4, 4, 1, 4, 4, 4, 3, 3, 3, 4, 3, 5, 5, 2, 3, 3, 4, 1, 2, 5, 5, 3, 5, 5, 2, 3, 5, 5, 4, 0, 3, 2, 0, 3, 3, 1, 1, 5, 1, 4, 1, 0, 4, 3, 2, 3, 5, 0, 4, 0, 3, 0, 5, 4, 3, 4, 3, 0, 0, 4, 1, 0, 4, 4),
(1, 3, 0, 4, 0, 2, 0, 2, 0, 2, 5, 5, 3, 3, 3, 3, 3, 0, 4, 2, 3, 4, 4, 4, 3, 4, 0, 0, 3, 4, 5, 4, 3, 3, 3, 3, 2, 5, 5, 4, 5, 5, 5, 4, 3, 5, 5, 5, 1, 3, 1, 0, 1, 0, 0, 3, 2, 0, 4, 2, 0, 5, 2, 3, 2, 4, 1, 3, 0, 3, 0, 4, 5, 4, 5, 4, 3, 0, 4, 2, 0, 5, 4),
(0, 3, 0, 4, 0, 5, 0, 3, 0, 3, 4, 4, 3, 2, 3, 2, 3, 3, 3, 3, 3, 2, 4, 3, 3, 2, 2, 0, 3, 3, 3, 3, 3, 1, 3, 3, 3, 0, 4, 4, 3, 4, 4, 1, 1, 4, 4, 2, 0, 3, 1, 0, 1, 1, 0, 4, 1, 0, 2, 3, 1, 3, 3, 1, 3, 4, 0, 3, 0, 1, 0, 3, 1, 3, 0, 0, 1, 0, 2, 0, 0, 4, 4),
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
(0, 3, 0, 3, 0, 2, 0, 3, 0, 1, 5, 4, 3, 3, 3, 1, 4, 2, 1, 2, 3, 4, 4, 2, 4, 4, 5, 0, 3, 1, 4, 3, 4, 0, 4, 3, 3, 3, 2, 3, 2, 5, 3, 4, 3, 2, 2, 3, 0, 0, 3, 0, 2, 1, 0, 1, 2, 0, 0, 0, 0, 2, 1, 1, 3, 1, 0, 2, 0, 4, 0, 3, 4, 4, 4, 5, 2, 0, 2, 0, 0, 1, 3),
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 4, 2, 1, 1, 0, 1, 0, 3, 2, 0, 0, 3, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1, 4, 0, 4, 2, 1, 0, 0, 0, 0, 0, 1),
(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 0, 2, 1, 0, 0, 1, 2, 1, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2),
(0, 4, 0, 4, 0, 4, 0, 3, 0, 4, 4, 3, 4, 2, 4, 3, 2, 0, 4, 4, 4, 3, 5, 3, 5, 3, 3, 2, 4, 2, 4, 3, 4, 3, 1, 4, 0, 2, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4, 3, 4, 1, 3, 4, 3, 2, 1, 2, 1, 3, 3, 3, 4, 4, 3, 3, 5, 0, 4, 0, 3, 0, 4, 3, 3, 3, 2, 1, 0, 3, 0, 0, 3, 3),
(0, 4, 0, 3, 0, 3, 0, 3, 0, 3, 5, 5, 3, 3, 3, 3, 4, 3, 4, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 4, 3, 5, 3, 3, 1, 3, 2, 4, 5, 5, 5, 5, 4, 3, 4, 5, 5, 3, 2, 2, 3, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 4, 3, 3, 3, 4, 0, 4, 0, 2, 0, 4, 3, 2, 2, 1, 2, 0, 3, 0, 0, 4, 1),
)
# fmt: on
class JapaneseContextAnalysis(object):
class JapaneseContextAnalysis:
NUM_OF_CATEGORY = 6
DONT_KNOW = -1
ENOUGH_REL_THRESHOLD = 100
MAX_REL_THRESHOLD = 1000
MINIMUM_DATA_THRESHOLD = 4
def __init__(self):
self._total_rel = None
self._rel_sample = None
self._need_to_skip_char_num = None
self._last_char_order = None
self._done = None
def __init__(self) -> None:
self._total_rel = 0
self._rel_sample: List[int] = []
self._need_to_skip_char_num = 0
self._last_char_order = -1
self._done = False
self.reset()
def reset(self):
def reset(self) -> None:
self._total_rel = 0 # total sequence received
# category counters, each integer counts sequence in its category
self._rel_sample = [0] * self.NUM_OF_CATEGORY
@ -140,7 +144,7 @@ class JapaneseContextAnalysis(object):
# been made
self._done = False
def feed(self, byte_str, num_bytes):
def feed(self, byte_str: Union[bytes, bytearray], num_bytes: int) -> None:
if self._done:
return
@ -153,7 +157,7 @@ class JapaneseContextAnalysis(object):
# this character will simplify our logic and improve performance.
i = self._need_to_skip_char_num
while i < num_bytes:
order, char_len = self.get_order(byte_str[i:i + 2])
order, char_len = self.get_order(byte_str[i : i + 2])
i += char_len
if i > num_bytes:
self._need_to_skip_char_num = i - num_bytes
@ -164,32 +168,34 @@ class JapaneseContextAnalysis(object):
if self._total_rel > self.MAX_REL_THRESHOLD:
self._done = True
break
self._rel_sample[jp2CharContext[self._last_char_order][order]] += 1
self._rel_sample[
jp2_char_context[self._last_char_order][order]
] += 1
self._last_char_order = order
def got_enough_data(self):
def got_enough_data(self) -> bool:
return self._total_rel > self.ENOUGH_REL_THRESHOLD
def get_confidence(self):
def get_confidence(self) -> float:
# This is just one way to calculate confidence. It works well for me.
if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
return (self._total_rel - self._rel_sample[0]) / self._total_rel
else:
return self.DONT_KNOW
return self.DONT_KNOW
def get_order(self, byte_str):
def get_order(self, _: Union[bytes, bytearray]) -> Tuple[int, int]:
return -1, 1
class SJISContextAnalysis(JapaneseContextAnalysis):
def __init__(self):
super(SJISContextAnalysis, self).__init__()
def __init__(self) -> None:
super().__init__()
self._charset_name = "SHIFT_JIS"
@property
def charset_name(self):
def charset_name(self) -> str:
return self._charset_name
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
if not byte_str:
return -1, 1
# find out current char's byte length
@ -209,8 +215,9 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
return -1, char_len
class EUCJPContextAnalysis(JapaneseContextAnalysis):
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
if not byte_str:
return -1, 1
# find out current char's byte length
@ -229,5 +236,3 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis):
return second_char - 0xA1, char_len
return -1, char_len

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,9 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from chardet.sbcharsetprober import SingleByteCharSetModel
# 3: Positive
# 2: Likely
# 1: Unlikely
@ -4115,269 +4111,270 @@ HEBREW_LANG_MODEL = {
# Character Mapping Table(s):
WINDOWS_1255_HEBREW_CHAR_TO_ORDER = {
0: 255, # '\x00'
1: 255, # '\x01'
2: 255, # '\x02'
3: 255, # '\x03'
4: 255, # '\x04'
5: 255, # '\x05'
6: 255, # '\x06'
7: 255, # '\x07'
8: 255, # '\x08'
9: 255, # '\t'
10: 254, # '\n'
11: 255, # '\x0b'
12: 255, # '\x0c'
13: 254, # '\r'
14: 255, # '\x0e'
15: 255, # '\x0f'
16: 255, # '\x10'
17: 255, # '\x11'
18: 255, # '\x12'
19: 255, # '\x13'
20: 255, # '\x14'
21: 255, # '\x15'
22: 255, # '\x16'
23: 255, # '\x17'
24: 255, # '\x18'
25: 255, # '\x19'
26: 255, # '\x1a'
27: 255, # '\x1b'
28: 255, # '\x1c'
29: 255, # '\x1d'
30: 255, # '\x1e'
31: 255, # '\x1f'
32: 253, # ' '
33: 253, # '!'
34: 253, # '"'
35: 253, # '#'
36: 253, # '$'
37: 253, # '%'
38: 253, # '&'
39: 253, # "'"
40: 253, # '('
41: 253, # ')'
42: 253, # '*'
43: 253, # '+'
44: 253, # ','
45: 253, # '-'
46: 253, # '.'
47: 253, # '/'
48: 252, # '0'
49: 252, # '1'
50: 252, # '2'
51: 252, # '3'
52: 252, # '4'
53: 252, # '5'
54: 252, # '6'
55: 252, # '7'
56: 252, # '8'
57: 252, # '9'
58: 253, # ':'
59: 253, # ';'
60: 253, # '<'
61: 253, # '='
62: 253, # '>'
63: 253, # '?'
64: 253, # '@'
65: 69, # 'A'
66: 91, # 'B'
67: 79, # 'C'
68: 80, # 'D'
69: 92, # 'E'
70: 89, # 'F'
71: 97, # 'G'
72: 90, # 'H'
73: 68, # 'I'
74: 111, # 'J'
75: 112, # 'K'
76: 82, # 'L'
77: 73, # 'M'
78: 95, # 'N'
79: 85, # 'O'
80: 78, # 'P'
81: 121, # 'Q'
82: 86, # 'R'
83: 71, # 'S'
84: 67, # 'T'
85: 102, # 'U'
86: 107, # 'V'
87: 84, # 'W'
88: 114, # 'X'
89: 103, # 'Y'
90: 115, # 'Z'
91: 253, # '['
92: 253, # '\\'
93: 253, # ']'
94: 253, # '^'
95: 253, # '_'
96: 253, # '`'
97: 50, # 'a'
98: 74, # 'b'
99: 60, # 'c'
100: 61, # 'd'
101: 42, # 'e'
102: 76, # 'f'
103: 70, # 'g'
104: 64, # 'h'
105: 53, # 'i'
106: 105, # 'j'
107: 93, # 'k'
108: 56, # 'l'
109: 65, # 'm'
110: 54, # 'n'
111: 49, # 'o'
112: 66, # 'p'
113: 110, # 'q'
114: 51, # 'r'
115: 43, # 's'
116: 44, # 't'
117: 63, # 'u'
118: 81, # 'v'
119: 77, # 'w'
120: 98, # 'x'
121: 75, # 'y'
122: 108, # 'z'
123: 253, # '{'
124: 253, # '|'
125: 253, # '}'
126: 253, # '~'
127: 253, # '\x7f'
128: 124, # '€'
129: 202, # None
130: 203, # '‚'
131: 204, # 'ƒ'
132: 205, # '„'
133: 40, # '…'
134: 58, # '†'
135: 206, # '‡'
136: 207, # 'ˆ'
137: 208, # '‰'
138: 209, # None
139: 210, # '‹'
140: 211, # None
141: 212, # None
142: 213, # None
143: 214, # None
144: 215, # None
145: 83, # '‘'
146: 52, # '’'
147: 47, # '“'
148: 46, # '”'
149: 72, # '•'
150: 32, # '–'
151: 94, # '—'
152: 216, # '˜'
153: 113, # '™'
154: 217, # None
155: 109, # '›'
156: 218, # None
157: 219, # None
158: 220, # None
159: 221, # None
160: 34, # '\xa0'
161: 116, # '¡'
162: 222, # '¢'
163: 118, # '£'
164: 100, # '₪'
165: 223, # '¥'
166: 224, # '¦'
167: 117, # '§'
168: 119, # '¨'
169: 104, # '©'
170: 125, # '×'
171: 225, # '«'
172: 226, # '¬'
173: 87, # '\xad'
174: 99, # '®'
175: 227, # '¯'
176: 106, # '°'
177: 122, # '±'
178: 123, # '²'
179: 228, # '³'
180: 55, # '´'
181: 229, # 'µ'
182: 230, # '¶'
183: 101, # '·'
184: 231, # '¸'
185: 232, # '¹'
186: 120, # '÷'
187: 233, # '»'
188: 48, # '¼'
189: 39, # '½'
190: 57, # '¾'
191: 234, # '¿'
192: 30, # 'ְ'
193: 59, # 'ֱ'
194: 41, # 'ֲ'
195: 88, # 'ֳ'
196: 33, # 'ִ'
197: 37, # 'ֵ'
198: 36, # 'ֶ'
199: 31, # 'ַ'
200: 29, # 'ָ'
201: 35, # 'ֹ'
202: 235, # None
203: 62, # 'ֻ'
204: 28, # 'ּ'
205: 236, # 'ֽ'
206: 126, # '־'
207: 237, # 'ֿ'
208: 238, # '׀'
209: 38, # 'ׁ'
210: 45, # 'ׂ'
211: 239, # '׃'
212: 240, # 'װ'
213: 241, # 'ױ'
214: 242, # 'ײ'
215: 243, # '׳'
216: 127, # '״'
217: 244, # None
218: 245, # None
219: 246, # None
220: 247, # None
221: 248, # None
222: 249, # None
223: 250, # None
224: 9, # 'א'
225: 8, # 'ב'
226: 20, # 'ג'
227: 16, # 'ד'
228: 3, # 'ה'
229: 2, # 'ו'
230: 24, # 'ז'
231: 14, # 'ח'
232: 22, # 'ט'
233: 1, # 'י'
234: 25, # 'ך'
235: 15, # 'כ'
236: 4, # 'ל'
237: 11, # 'ם'
238: 6, # 'מ'
239: 23, # 'ן'
240: 12, # 'נ'
241: 19, # 'ס'
242: 13, # 'ע'
243: 26, # 'ף'
244: 18, # 'פ'
245: 27, # 'ץ'
246: 21, # 'צ'
247: 17, # 'ק'
248: 7, # 'ר'
249: 10, # 'ש'
250: 5, # 'ת'
251: 251, # None
252: 252, # None
253: 128, # '\u200e'
254: 96, # '\u200f'
255: 253, # None
0: 255, # '\x00'
1: 255, # '\x01'
2: 255, # '\x02'
3: 255, # '\x03'
4: 255, # '\x04'
5: 255, # '\x05'
6: 255, # '\x06'
7: 255, # '\x07'
8: 255, # '\x08'
9: 255, # '\t'
10: 254, # '\n'
11: 255, # '\x0b'
12: 255, # '\x0c'
13: 254, # '\r'
14: 255, # '\x0e'
15: 255, # '\x0f'
16: 255, # '\x10'
17: 255, # '\x11'
18: 255, # '\x12'
19: 255, # '\x13'
20: 255, # '\x14'
21: 255, # '\x15'
22: 255, # '\x16'
23: 255, # '\x17'
24: 255, # '\x18'
25: 255, # '\x19'
26: 255, # '\x1a'
27: 255, # '\x1b'
28: 255, # '\x1c'
29: 255, # '\x1d'
30: 255, # '\x1e'
31: 255, # '\x1f'
32: 253, # ' '
33: 253, # '!'
34: 253, # '"'
35: 253, # '#'
36: 253, # '$'
37: 253, # '%'
38: 253, # '&'
39: 253, # "'"
40: 253, # '('
41: 253, # ')'
42: 253, # '*'
43: 253, # '+'
44: 253, # ','
45: 253, # '-'
46: 253, # '.'
47: 253, # '/'
48: 252, # '0'
49: 252, # '1'
50: 252, # '2'
51: 252, # '3'
52: 252, # '4'
53: 252, # '5'
54: 252, # '6'
55: 252, # '7'
56: 252, # '8'
57: 252, # '9'
58: 253, # ':'
59: 253, # ';'
60: 253, # '<'
61: 253, # '='
62: 253, # '>'
63: 253, # '?'
64: 253, # '@'
65: 69, # 'A'
66: 91, # 'B'
67: 79, # 'C'
68: 80, # 'D'
69: 92, # 'E'
70: 89, # 'F'
71: 97, # 'G'
72: 90, # 'H'
73: 68, # 'I'
74: 111, # 'J'
75: 112, # 'K'
76: 82, # 'L'
77: 73, # 'M'
78: 95, # 'N'
79: 85, # 'O'
80: 78, # 'P'
81: 121, # 'Q'
82: 86, # 'R'
83: 71, # 'S'
84: 67, # 'T'
85: 102, # 'U'
86: 107, # 'V'
87: 84, # 'W'
88: 114, # 'X'
89: 103, # 'Y'
90: 115, # 'Z'
91: 253, # '['
92: 253, # '\\'
93: 253, # ']'
94: 253, # '^'
95: 253, # '_'
96: 253, # '`'
97: 50, # 'a'
98: 74, # 'b'
99: 60, # 'c'
100: 61, # 'd'
101: 42, # 'e'
102: 76, # 'f'
103: 70, # 'g'
104: 64, # 'h'
105: 53, # 'i'
106: 105, # 'j'
107: 93, # 'k'
108: 56, # 'l'
109: 65, # 'm'
110: 54, # 'n'
111: 49, # 'o'
112: 66, # 'p'
113: 110, # 'q'
114: 51, # 'r'
115: 43, # 's'
116: 44, # 't'
117: 63, # 'u'
118: 81, # 'v'
119: 77, # 'w'
120: 98, # 'x'
121: 75, # 'y'
122: 108, # 'z'
123: 253, # '{'
124: 253, # '|'
125: 253, # '}'
126: 253, # '~'
127: 253, # '\x7f'
128: 124, # '€'
129: 202, # None
130: 203, # '‚'
131: 204, # 'ƒ'
132: 205, # '„'
133: 40, # '…'
134: 58, # '†'
135: 206, # '‡'
136: 207, # 'ˆ'
137: 208, # '‰'
138: 209, # None
139: 210, # '‹'
140: 211, # None
141: 212, # None
142: 213, # None
143: 214, # None
144: 215, # None
145: 83, # '‘'
146: 52, # '’'
147: 47, # '“'
148: 46, # '”'
149: 72, # '•'
150: 32, # '–'
151: 94, # '—'
152: 216, # '˜'
153: 113, # '™'
154: 217, # None
155: 109, # '›'
156: 218, # None
157: 219, # None
158: 220, # None
159: 221, # None
160: 34, # '\xa0'
161: 116, # '¡'
162: 222, # '¢'
163: 118, # '£'
164: 100, # '₪'
165: 223, # '¥'
166: 224, # '¦'
167: 117, # '§'
168: 119, # '¨'
169: 104, # '©'
170: 125, # '×'
171: 225, # '«'
172: 226, # '¬'
173: 87, # '\xad'
174: 99, # '®'
175: 227, # '¯'
176: 106, # '°'
177: 122, # '±'
178: 123, # '²'
179: 228, # '³'
180: 55, # '´'
181: 229, # 'µ'
182: 230, # '¶'
183: 101, # '·'
184: 231, # '¸'
185: 232, # '¹'
186: 120, # '÷'
187: 233, # '»'
188: 48, # '¼'
189: 39, # '½'
190: 57, # '¾'
191: 234, # '¿'
192: 30, # 'ְ'
193: 59, # 'ֱ'
194: 41, # 'ֲ'
195: 88, # 'ֳ'
196: 33, # 'ִ'
197: 37, # 'ֵ'
198: 36, # 'ֶ'
199: 31, # 'ַ'
200: 29, # 'ָ'
201: 35, # 'ֹ'
202: 235, # None
203: 62, # 'ֻ'
204: 28, # 'ּ'
205: 236, # 'ֽ'
206: 126, # '־'
207: 237, # 'ֿ'
208: 238, # '׀'
209: 38, # 'ׁ'
210: 45, # 'ׂ'
211: 239, # '׃'
212: 240, # 'װ'
213: 241, # 'ױ'
214: 242, # 'ײ'
215: 243, # '׳'
216: 127, # '״'
217: 244, # None
218: 245, # None
219: 246, # None
220: 247, # None
221: 248, # None
222: 249, # None
223: 250, # None
224: 9, # 'א'
225: 8, # 'ב'
226: 20, # 'ג'
227: 16, # 'ד'
228: 3, # 'ה'
229: 2, # 'ו'
230: 24, # 'ז'
231: 14, # 'ח'
232: 22, # 'ט'
233: 1, # 'י'
234: 25, # 'ך'
235: 15, # 'כ'
236: 4, # 'ל'
237: 11, # 'ם'
238: 6, # 'מ'
239: 23, # 'ן'
240: 12, # 'נ'
241: 19, # 'ס'
242: 13, # 'ע'
243: 26, # 'ף'
244: 18, # 'פ'
245: 27, # 'ץ'
246: 21, # 'צ'
247: 17, # 'ק'
248: 7, # 'ר'
249: 10, # 'ש'
250: 5, # 'ת'
251: 251, # None
252: 252, # None
253: 128, # '\u200e'
254: 96, # '\u200f'
255: 253, # None
}
WINDOWS_1255_HEBREW_MODEL = SingleByteCharSetModel(charset_name='windows-1255',
language='Hebrew',
char_to_order_map=WINDOWS_1255_HEBREW_CHAR_TO_ORDER,
language_model=HEBREW_LANG_MODEL,
typical_positive_ratio=0.984004,
keep_ascii_letters=False,
alphabet='אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ')
WINDOWS_1255_HEBREW_MODEL = SingleByteCharSetModel(
charset_name="windows-1255",
language="Hebrew",
char_to_order_map=WINDOWS_1255_HEBREW_CHAR_TO_ORDER,
language_model=HEBREW_LANG_MODEL,
typical_positive_ratio=0.984004,
keep_ascii_letters=False,
alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
)

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,9 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from chardet.sbcharsetprober import SingleByteCharSetModel
# 3: Positive
# 2: Likely
# 1: Unlikely
@ -4115,269 +4111,270 @@ THAI_LANG_MODEL = {
# Character Mapping Table(s):
TIS_620_THAI_CHAR_TO_ORDER = {
0: 255, # '\x00'
1: 255, # '\x01'
2: 255, # '\x02'
3: 255, # '\x03'
4: 255, # '\x04'
5: 255, # '\x05'
6: 255, # '\x06'
7: 255, # '\x07'
8: 255, # '\x08'
9: 255, # '\t'
10: 254, # '\n'
11: 255, # '\x0b'
12: 255, # '\x0c'
13: 254, # '\r'
14: 255, # '\x0e'
15: 255, # '\x0f'
16: 255, # '\x10'
17: 255, # '\x11'
18: 255, # '\x12'
19: 255, # '\x13'
20: 255, # '\x14'
21: 255, # '\x15'
22: 255, # '\x16'
23: 255, # '\x17'
24: 255, # '\x18'
25: 255, # '\x19'
26: 255, # '\x1a'
27: 255, # '\x1b'
28: 255, # '\x1c'
29: 255, # '\x1d'
30: 255, # '\x1e'
31: 255, # '\x1f'
32: 253, # ' '
33: 253, # '!'
34: 253, # '"'
35: 253, # '#'
36: 253, # '$'
37: 253, # '%'
38: 253, # '&'
39: 253, # "'"
40: 253, # '('
41: 253, # ')'
42: 253, # '*'
43: 253, # '+'
44: 253, # ','
45: 253, # '-'
46: 253, # '.'
47: 253, # '/'
48: 252, # '0'
49: 252, # '1'
50: 252, # '2'
51: 252, # '3'
52: 252, # '4'
53: 252, # '5'
54: 252, # '6'
55: 252, # '7'
56: 252, # '8'
57: 252, # '9'
58: 253, # ':'
59: 253, # ';'
60: 253, # '<'
61: 253, # '='
62: 253, # '>'
63: 253, # '?'
64: 253, # '@'
65: 182, # 'A'
66: 106, # 'B'
67: 107, # 'C'
68: 100, # 'D'
69: 183, # 'E'
70: 184, # 'F'
71: 185, # 'G'
72: 101, # 'H'
73: 94, # 'I'
74: 186, # 'J'
75: 187, # 'K'
76: 108, # 'L'
77: 109, # 'M'
78: 110, # 'N'
79: 111, # 'O'
80: 188, # 'P'
81: 189, # 'Q'
82: 190, # 'R'
83: 89, # 'S'
84: 95, # 'T'
85: 112, # 'U'
86: 113, # 'V'
87: 191, # 'W'
88: 192, # 'X'
89: 193, # 'Y'
90: 194, # 'Z'
91: 253, # '['
92: 253, # '\\'
93: 253, # ']'
94: 253, # '^'
95: 253, # '_'
96: 253, # '`'
97: 64, # 'a'
98: 72, # 'b'
99: 73, # 'c'
100: 114, # 'd'
101: 74, # 'e'
102: 115, # 'f'
103: 116, # 'g'
104: 102, # 'h'
105: 81, # 'i'
106: 201, # 'j'
107: 117, # 'k'
108: 90, # 'l'
109: 103, # 'm'
110: 78, # 'n'
111: 82, # 'o'
112: 96, # 'p'
113: 202, # 'q'
114: 91, # 'r'
115: 79, # 's'
116: 84, # 't'
117: 104, # 'u'
118: 105, # 'v'
119: 97, # 'w'
120: 98, # 'x'
121: 92, # 'y'
122: 203, # 'z'
123: 253, # '{'
124: 253, # '|'
125: 253, # '}'
126: 253, # '~'
127: 253, # '\x7f'
128: 209, # '\x80'
129: 210, # '\x81'
130: 211, # '\x82'
131: 212, # '\x83'
132: 213, # '\x84'
133: 88, # '\x85'
134: 214, # '\x86'
135: 215, # '\x87'
136: 216, # '\x88'
137: 217, # '\x89'
138: 218, # '\x8a'
139: 219, # '\x8b'
140: 220, # '\x8c'
141: 118, # '\x8d'
142: 221, # '\x8e'
143: 222, # '\x8f'
144: 223, # '\x90'
145: 224, # '\x91'
146: 99, # '\x92'
147: 85, # '\x93'
148: 83, # '\x94'
149: 225, # '\x95'
150: 226, # '\x96'
151: 227, # '\x97'
152: 228, # '\x98'
153: 229, # '\x99'
154: 230, # '\x9a'
155: 231, # '\x9b'
156: 232, # '\x9c'
157: 233, # '\x9d'
158: 234, # '\x9e'
159: 235, # '\x9f'
160: 236, # None
161: 5, # 'ก'
162: 30, # 'ข'
163: 237, # 'ฃ'
164: 24, # 'ค'
165: 238, # 'ฅ'
166: 75, # 'ฆ'
167: 8, # 'ง'
168: 26, # 'จ'
169: 52, # 'ฉ'
170: 34, # 'ช'
171: 51, # 'ซ'
172: 119, # 'ฌ'
173: 47, # 'ญ'
174: 58, # 'ฎ'
175: 57, # 'ฏ'
176: 49, # 'ฐ'
177: 53, # 'ฑ'
178: 55, # 'ฒ'
179: 43, # 'ณ'
180: 20, # 'ด'
181: 19, # 'ต'
182: 44, # 'ถ'
183: 14, # 'ท'
184: 48, # 'ธ'
185: 3, # 'น'
186: 17, # 'บ'
187: 25, # 'ป'
188: 39, # 'ผ'
189: 62, # 'ฝ'
190: 31, # 'พ'
191: 54, # 'ฟ'
192: 45, # 'ภ'
193: 9, # 'ม'
194: 16, # 'ย'
195: 2, # 'ร'
196: 61, # 'ฤ'
197: 15, # 'ล'
198: 239, # 'ฦ'
199: 12, # 'ว'
200: 42, # 'ศ'
201: 46, # 'ษ'
202: 18, # 'ส'
203: 21, # 'ห'
204: 76, # 'ฬ'
205: 4, # 'อ'
206: 66, # 'ฮ'
207: 63, # 'ฯ'
208: 22, # 'ะ'
209: 10, # 'ั'
210: 1, # 'า'
211: 36, # 'ำ'
212: 23, # 'ิ'
213: 13, # 'ี'
214: 40, # 'ึ'
215: 27, # 'ื'
216: 32, # 'ุ'
217: 35, # 'ู'
218: 86, # 'ฺ'
219: 240, # None
220: 241, # None
221: 242, # None
222: 243, # None
223: 244, # '฿'
224: 11, # 'เ'
225: 28, # 'แ'
226: 41, # 'โ'
227: 29, # 'ใ'
228: 33, # 'ไ'
229: 245, # 'ๅ'
230: 50, # 'ๆ'
231: 37, # '็'
232: 6, # '่'
233: 7, # '้'
234: 67, # '๊'
235: 77, # '๋'
236: 38, # '์'
237: 93, # 'ํ'
238: 246, # '๎'
239: 247, # '๏'
240: 68, # '๐'
241: 56, # '๑'
242: 59, # '๒'
243: 65, # '๓'
244: 69, # '๔'
245: 60, # '๕'
246: 70, # '๖'
247: 80, # '๗'
248: 71, # '๘'
249: 87, # '๙'
250: 248, # '๚'
251: 249, # '๛'
252: 250, # None
253: 251, # None
254: 252, # None
255: 253, # None
0: 255, # '\x00'
1: 255, # '\x01'
2: 255, # '\x02'
3: 255, # '\x03'
4: 255, # '\x04'
5: 255, # '\x05'
6: 255, # '\x06'
7: 255, # '\x07'
8: 255, # '\x08'
9: 255, # '\t'
10: 254, # '\n'
11: 255, # '\x0b'
12: 255, # '\x0c'
13: 254, # '\r'
14: 255, # '\x0e'
15: 255, # '\x0f'
16: 255, # '\x10'
17: 255, # '\x11'
18: 255, # '\x12'
19: 255, # '\x13'
20: 255, # '\x14'
21: 255, # '\x15'
22: 255, # '\x16'
23: 255, # '\x17'
24: 255, # '\x18'
25: 255, # '\x19'
26: 255, # '\x1a'
27: 255, # '\x1b'
28: 255, # '\x1c'
29: 255, # '\x1d'
30: 255, # '\x1e'
31: 255, # '\x1f'
32: 253, # ' '
33: 253, # '!'
34: 253, # '"'
35: 253, # '#'
36: 253, # '$'
37: 253, # '%'
38: 253, # '&'
39: 253, # "'"
40: 253, # '('
41: 253, # ')'
42: 253, # '*'
43: 253, # '+'
44: 253, # ','
45: 253, # '-'
46: 253, # '.'
47: 253, # '/'
48: 252, # '0'
49: 252, # '1'
50: 252, # '2'
51: 252, # '3'
52: 252, # '4'
53: 252, # '5'
54: 252, # '6'
55: 252, # '7'
56: 252, # '8'
57: 252, # '9'
58: 253, # ':'
59: 253, # ';'
60: 253, # '<'
61: 253, # '='
62: 253, # '>'
63: 253, # '?'
64: 253, # '@'
65: 182, # 'A'
66: 106, # 'B'
67: 107, # 'C'
68: 100, # 'D'
69: 183, # 'E'
70: 184, # 'F'
71: 185, # 'G'
72: 101, # 'H'
73: 94, # 'I'
74: 186, # 'J'
75: 187, # 'K'
76: 108, # 'L'
77: 109, # 'M'
78: 110, # 'N'
79: 111, # 'O'
80: 188, # 'P'
81: 189, # 'Q'
82: 190, # 'R'
83: 89, # 'S'
84: 95, # 'T'
85: 112, # 'U'
86: 113, # 'V'
87: 191, # 'W'
88: 192, # 'X'
89: 193, # 'Y'
90: 194, # 'Z'
91: 253, # '['
92: 253, # '\\'
93: 253, # ']'
94: 253, # '^'
95: 253, # '_'
96: 253, # '`'
97: 64, # 'a'
98: 72, # 'b'
99: 73, # 'c'
100: 114, # 'd'
101: 74, # 'e'
102: 115, # 'f'
103: 116, # 'g'
104: 102, # 'h'
105: 81, # 'i'
106: 201, # 'j'
107: 117, # 'k'
108: 90, # 'l'
109: 103, # 'm'
110: 78, # 'n'
111: 82, # 'o'
112: 96, # 'p'
113: 202, # 'q'
114: 91, # 'r'
115: 79, # 's'
116: 84, # 't'
117: 104, # 'u'
118: 105, # 'v'
119: 97, # 'w'
120: 98, # 'x'
121: 92, # 'y'
122: 203, # 'z'
123: 253, # '{'
124: 253, # '|'
125: 253, # '}'
126: 253, # '~'
127: 253, # '\x7f'
128: 209, # '\x80'
129: 210, # '\x81'
130: 211, # '\x82'
131: 212, # '\x83'
132: 213, # '\x84'
133: 88, # '\x85'
134: 214, # '\x86'
135: 215, # '\x87'
136: 216, # '\x88'
137: 217, # '\x89'
138: 218, # '\x8a'
139: 219, # '\x8b'
140: 220, # '\x8c'
141: 118, # '\x8d'
142: 221, # '\x8e'
143: 222, # '\x8f'
144: 223, # '\x90'
145: 224, # '\x91'
146: 99, # '\x92'
147: 85, # '\x93'
148: 83, # '\x94'
149: 225, # '\x95'
150: 226, # '\x96'
151: 227, # '\x97'
152: 228, # '\x98'
153: 229, # '\x99'
154: 230, # '\x9a'
155: 231, # '\x9b'
156: 232, # '\x9c'
157: 233, # '\x9d'
158: 234, # '\x9e'
159: 235, # '\x9f'
160: 236, # None
161: 5, # 'ก'
162: 30, # 'ข'
163: 237, # 'ฃ'
164: 24, # 'ค'
165: 238, # 'ฅ'
166: 75, # 'ฆ'
167: 8, # 'ง'
168: 26, # 'จ'
169: 52, # 'ฉ'
170: 34, # 'ช'
171: 51, # 'ซ'
172: 119, # 'ฌ'
173: 47, # 'ญ'
174: 58, # 'ฎ'
175: 57, # 'ฏ'
176: 49, # 'ฐ'
177: 53, # 'ฑ'
178: 55, # 'ฒ'
179: 43, # 'ณ'
180: 20, # 'ด'
181: 19, # 'ต'
182: 44, # 'ถ'
183: 14, # 'ท'
184: 48, # 'ธ'
185: 3, # 'น'
186: 17, # 'บ'
187: 25, # 'ป'
188: 39, # 'ผ'
189: 62, # 'ฝ'
190: 31, # 'พ'
191: 54, # 'ฟ'
192: 45, # 'ภ'
193: 9, # 'ม'
194: 16, # 'ย'
195: 2, # 'ร'
196: 61, # 'ฤ'
197: 15, # 'ล'
198: 239, # 'ฦ'
199: 12, # 'ว'
200: 42, # 'ศ'
201: 46, # 'ษ'
202: 18, # 'ส'
203: 21, # 'ห'
204: 76, # 'ฬ'
205: 4, # 'อ'
206: 66, # 'ฮ'
207: 63, # 'ฯ'
208: 22, # 'ะ'
209: 10, # 'ั'
210: 1, # 'า'
211: 36, # 'ำ'
212: 23, # 'ิ'
213: 13, # 'ี'
214: 40, # 'ึ'
215: 27, # 'ื'
216: 32, # 'ุ'
217: 35, # 'ู'
218: 86, # 'ฺ'
219: 240, # None
220: 241, # None
221: 242, # None
222: 243, # None
223: 244, # '฿'
224: 11, # 'เ'
225: 28, # 'แ'
226: 41, # 'โ'
227: 29, # 'ใ'
228: 33, # 'ไ'
229: 245, # 'ๅ'
230: 50, # 'ๆ'
231: 37, # '็'
232: 6, # '่'
233: 7, # '้'
234: 67, # '๊'
235: 77, # '๋'
236: 38, # '์'
237: 93, # 'ํ'
238: 246, # '๎'
239: 247, # '๏'
240: 68, # '๐'
241: 56, # '๑'
242: 59, # '๒'
243: 65, # '๓'
244: 69, # '๔'
245: 60, # '๕'
246: 70, # '๖'
247: 80, # '๗'
248: 71, # '๘'
249: 87, # '๙'
250: 248, # '๚'
251: 249, # '๛'
252: 250, # None
253: 251, # None
254: 252, # None
255: 253, # None
}
TIS_620_THAI_MODEL = SingleByteCharSetModel(charset_name='TIS-620',
language='Thai',
char_to_order_map=TIS_620_THAI_CHAR_TO_ORDER,
language_model=THAI_LANG_MODEL,
typical_positive_ratio=0.926386,
keep_ascii_letters=False,
alphabet='กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛')
TIS_620_THAI_MODEL = SingleByteCharSetModel(
charset_name="TIS-620",
language="Thai",
char_to_order_map=TIS_620_THAI_CHAR_TO_ORDER,
language_model=THAI_LANG_MODEL,
typical_positive_ratio=0.926386,
keep_ascii_letters=False,
alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
)

View file

@ -1,9 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from chardet.sbcharsetprober import SingleByteCharSetModel
# 3: Positive
# 2: Likely
# 1: Unlikely
@ -4115,269 +4111,270 @@ TURKISH_LANG_MODEL = {
# Character Mapping Table(s):
ISO_8859_9_TURKISH_CHAR_TO_ORDER = {
0: 255, # '\x00'
1: 255, # '\x01'
2: 255, # '\x02'
3: 255, # '\x03'
4: 255, # '\x04'
5: 255, # '\x05'
6: 255, # '\x06'
7: 255, # '\x07'
8: 255, # '\x08'
9: 255, # '\t'
10: 255, # '\n'
11: 255, # '\x0b'
12: 255, # '\x0c'
13: 255, # '\r'
14: 255, # '\x0e'
15: 255, # '\x0f'
16: 255, # '\x10'
17: 255, # '\x11'
18: 255, # '\x12'
19: 255, # '\x13'
20: 255, # '\x14'
21: 255, # '\x15'
22: 255, # '\x16'
23: 255, # '\x17'
24: 255, # '\x18'
25: 255, # '\x19'
26: 255, # '\x1a'
27: 255, # '\x1b'
28: 255, # '\x1c'
29: 255, # '\x1d'
30: 255, # '\x1e'
31: 255, # '\x1f'
32: 255, # ' '
33: 255, # '!'
34: 255, # '"'
35: 255, # '#'
36: 255, # '$'
37: 255, # '%'
38: 255, # '&'
39: 255, # "'"
40: 255, # '('
41: 255, # ')'
42: 255, # '*'
43: 255, # '+'
44: 255, # ','
45: 255, # '-'
46: 255, # '.'
47: 255, # '/'
48: 255, # '0'
49: 255, # '1'
50: 255, # '2'
51: 255, # '3'
52: 255, # '4'
53: 255, # '5'
54: 255, # '6'
55: 255, # '7'
56: 255, # '8'
57: 255, # '9'
58: 255, # ':'
59: 255, # ';'
60: 255, # '<'
61: 255, # '='
62: 255, # '>'
63: 255, # '?'
64: 255, # '@'
65: 23, # 'A'
66: 37, # 'B'
67: 47, # 'C'
68: 39, # 'D'
69: 29, # 'E'
70: 52, # 'F'
71: 36, # 'G'
72: 45, # 'H'
73: 53, # 'I'
74: 60, # 'J'
75: 16, # 'K'
76: 49, # 'L'
77: 20, # 'M'
78: 46, # 'N'
79: 42, # 'O'
80: 48, # 'P'
81: 69, # 'Q'
82: 44, # 'R'
83: 35, # 'S'
84: 31, # 'T'
85: 51, # 'U'
86: 38, # 'V'
87: 62, # 'W'
88: 65, # 'X'
89: 43, # 'Y'
90: 56, # 'Z'
91: 255, # '['
92: 255, # '\\'
93: 255, # ']'
94: 255, # '^'
95: 255, # '_'
96: 255, # '`'
97: 1, # 'a'
98: 21, # 'b'
99: 28, # 'c'
100: 12, # 'd'
101: 2, # 'e'
102: 18, # 'f'
103: 27, # 'g'
104: 25, # 'h'
105: 3, # 'i'
106: 24, # 'j'
107: 10, # 'k'
108: 5, # 'l'
109: 13, # 'm'
110: 4, # 'n'
111: 15, # 'o'
112: 26, # 'p'
113: 64, # 'q'
114: 7, # 'r'
115: 8, # 's'
116: 9, # 't'
117: 14, # 'u'
118: 32, # 'v'
119: 57, # 'w'
120: 58, # 'x'
121: 11, # 'y'
122: 22, # 'z'
123: 255, # '{'
124: 255, # '|'
125: 255, # '}'
126: 255, # '~'
127: 255, # '\x7f'
128: 180, # '\x80'
129: 179, # '\x81'
130: 178, # '\x82'
131: 177, # '\x83'
132: 176, # '\x84'
133: 175, # '\x85'
134: 174, # '\x86'
135: 173, # '\x87'
136: 172, # '\x88'
137: 171, # '\x89'
138: 170, # '\x8a'
139: 169, # '\x8b'
140: 168, # '\x8c'
141: 167, # '\x8d'
142: 166, # '\x8e'
143: 165, # '\x8f'
144: 164, # '\x90'
145: 163, # '\x91'
146: 162, # '\x92'
147: 161, # '\x93'
148: 160, # '\x94'
149: 159, # '\x95'
150: 101, # '\x96'
151: 158, # '\x97'
152: 157, # '\x98'
153: 156, # '\x99'
154: 155, # '\x9a'
155: 154, # '\x9b'
156: 153, # '\x9c'
157: 152, # '\x9d'
158: 151, # '\x9e'
159: 106, # '\x9f'
160: 150, # '\xa0'
161: 149, # '¡'
162: 148, # '¢'
163: 147, # '£'
164: 146, # '¤'
165: 145, # '¥'
166: 144, # '¦'
167: 100, # '§'
168: 143, # '¨'
169: 142, # '©'
170: 141, # 'ª'
171: 140, # '«'
172: 139, # '¬'
173: 138, # '\xad'
174: 137, # '®'
175: 136, # '¯'
176: 94, # '°'
177: 80, # '±'
178: 93, # '²'
179: 135, # '³'
180: 105, # '´'
181: 134, # 'µ'
182: 133, # '¶'
183: 63, # '·'
184: 132, # '¸'
185: 131, # '¹'
186: 130, # 'º'
187: 129, # '»'
188: 128, # '¼'
189: 127, # '½'
190: 126, # '¾'
191: 125, # '¿'
192: 124, # 'À'
193: 104, # 'Á'
194: 73, # 'Â'
195: 99, # 'Ã'
196: 79, # 'Ä'
197: 85, # 'Å'
198: 123, # 'Æ'
199: 54, # 'Ç'
200: 122, # 'È'
201: 98, # 'É'
202: 92, # 'Ê'
203: 121, # 'Ë'
204: 120, # 'Ì'
205: 91, # 'Í'
206: 103, # 'Î'
207: 119, # 'Ï'
208: 68, # 'Ğ'
209: 118, # 'Ñ'
210: 117, # 'Ò'
211: 97, # 'Ó'
212: 116, # 'Ô'
213: 115, # 'Õ'
214: 50, # 'Ö'
215: 90, # '×'
216: 114, # 'Ø'
217: 113, # 'Ù'
218: 112, # 'Ú'
219: 111, # 'Û'
220: 55, # 'Ü'
221: 41, # 'İ'
222: 40, # 'Ş'
223: 86, # 'ß'
224: 89, # 'à'
225: 70, # 'á'
226: 59, # 'â'
227: 78, # 'ã'
228: 71, # 'ä'
229: 82, # 'å'
230: 88, # 'æ'
231: 33, # 'ç'
232: 77, # 'è'
233: 66, # 'é'
234: 84, # 'ê'
235: 83, # 'ë'
236: 110, # 'ì'
237: 75, # 'í'
238: 61, # 'î'
239: 96, # 'ï'
240: 30, # 'ğ'
241: 67, # 'ñ'
242: 109, # 'ò'
243: 74, # 'ó'
244: 87, # 'ô'
245: 102, # 'õ'
246: 34, # 'ö'
247: 95, # '÷'
248: 81, # 'ø'
249: 108, # 'ù'
250: 76, # 'ú'
251: 72, # 'û'
252: 17, # 'ü'
253: 6, # 'ı'
254: 19, # 'ş'
255: 107, # 'ÿ'
0: 255, # '\x00'
1: 255, # '\x01'
2: 255, # '\x02'
3: 255, # '\x03'
4: 255, # '\x04'
5: 255, # '\x05'
6: 255, # '\x06'
7: 255, # '\x07'
8: 255, # '\x08'
9: 255, # '\t'
10: 255, # '\n'
11: 255, # '\x0b'
12: 255, # '\x0c'
13: 255, # '\r'
14: 255, # '\x0e'
15: 255, # '\x0f'
16: 255, # '\x10'
17: 255, # '\x11'
18: 255, # '\x12'
19: 255, # '\x13'
20: 255, # '\x14'
21: 255, # '\x15'
22: 255, # '\x16'
23: 255, # '\x17'
24: 255, # '\x18'
25: 255, # '\x19'
26: 255, # '\x1a'
27: 255, # '\x1b'
28: 255, # '\x1c'
29: 255, # '\x1d'
30: 255, # '\x1e'
31: 255, # '\x1f'
32: 255, # ' '
33: 255, # '!'
34: 255, # '"'
35: 255, # '#'
36: 255, # '$'
37: 255, # '%'
38: 255, # '&'
39: 255, # "'"
40: 255, # '('
41: 255, # ')'
42: 255, # '*'
43: 255, # '+'
44: 255, # ','
45: 255, # '-'
46: 255, # '.'
47: 255, # '/'
48: 255, # '0'
49: 255, # '1'
50: 255, # '2'
51: 255, # '3'
52: 255, # '4'
53: 255, # '5'
54: 255, # '6'
55: 255, # '7'
56: 255, # '8'
57: 255, # '9'
58: 255, # ':'
59: 255, # ';'
60: 255, # '<'
61: 255, # '='
62: 255, # '>'
63: 255, # '?'
64: 255, # '@'
65: 23, # 'A'
66: 37, # 'B'
67: 47, # 'C'
68: 39, # 'D'
69: 29, # 'E'
70: 52, # 'F'
71: 36, # 'G'
72: 45, # 'H'
73: 53, # 'I'
74: 60, # 'J'
75: 16, # 'K'
76: 49, # 'L'
77: 20, # 'M'
78: 46, # 'N'
79: 42, # 'O'
80: 48, # 'P'
81: 69, # 'Q'
82: 44, # 'R'
83: 35, # 'S'
84: 31, # 'T'
85: 51, # 'U'
86: 38, # 'V'
87: 62, # 'W'
88: 65, # 'X'
89: 43, # 'Y'
90: 56, # 'Z'
91: 255, # '['
92: 255, # '\\'
93: 255, # ']'
94: 255, # '^'
95: 255, # '_'
96: 255, # '`'
97: 1, # 'a'
98: 21, # 'b'
99: 28, # 'c'
100: 12, # 'd'
101: 2, # 'e'
102: 18, # 'f'
103: 27, # 'g'
104: 25, # 'h'
105: 3, # 'i'
106: 24, # 'j'
107: 10, # 'k'
108: 5, # 'l'
109: 13, # 'm'
110: 4, # 'n'
111: 15, # 'o'
112: 26, # 'p'
113: 64, # 'q'
114: 7, # 'r'
115: 8, # 's'
116: 9, # 't'
117: 14, # 'u'
118: 32, # 'v'
119: 57, # 'w'
120: 58, # 'x'
121: 11, # 'y'
122: 22, # 'z'
123: 255, # '{'
124: 255, # '|'
125: 255, # '}'
126: 255, # '~'
127: 255, # '\x7f'
128: 180, # '\x80'
129: 179, # '\x81'
130: 178, # '\x82'
131: 177, # '\x83'
132: 176, # '\x84'
133: 175, # '\x85'
134: 174, # '\x86'
135: 173, # '\x87'
136: 172, # '\x88'
137: 171, # '\x89'
138: 170, # '\x8a'
139: 169, # '\x8b'
140: 168, # '\x8c'
141: 167, # '\x8d'
142: 166, # '\x8e'
143: 165, # '\x8f'
144: 164, # '\x90'
145: 163, # '\x91'
146: 162, # '\x92'
147: 161, # '\x93'
148: 160, # '\x94'
149: 159, # '\x95'
150: 101, # '\x96'
151: 158, # '\x97'
152: 157, # '\x98'
153: 156, # '\x99'
154: 155, # '\x9a'
155: 154, # '\x9b'
156: 153, # '\x9c'
157: 152, # '\x9d'
158: 151, # '\x9e'
159: 106, # '\x9f'
160: 150, # '\xa0'
161: 149, # '¡'
162: 148, # '¢'
163: 147, # '£'
164: 146, # '¤'
165: 145, # '¥'
166: 144, # '¦'
167: 100, # '§'
168: 143, # '¨'
169: 142, # '©'
170: 141, # 'ª'
171: 140, # '«'
172: 139, # '¬'
173: 138, # '\xad'
174: 137, # '®'
175: 136, # '¯'
176: 94, # '°'
177: 80, # '±'
178: 93, # '²'
179: 135, # '³'
180: 105, # '´'
181: 134, # 'µ'
182: 133, # '¶'
183: 63, # '·'
184: 132, # '¸'
185: 131, # '¹'
186: 130, # 'º'
187: 129, # '»'
188: 128, # '¼'
189: 127, # '½'
190: 126, # '¾'
191: 125, # '¿'
192: 124, # 'À'
193: 104, # 'Á'
194: 73, # 'Â'
195: 99, # 'Ã'
196: 79, # 'Ä'
197: 85, # 'Å'
198: 123, # 'Æ'
199: 54, # 'Ç'
200: 122, # 'È'
201: 98, # 'É'
202: 92, # 'Ê'
203: 121, # 'Ë'
204: 120, # 'Ì'
205: 91, # 'Í'
206: 103, # 'Î'
207: 119, # 'Ï'
208: 68, # 'Ğ'
209: 118, # 'Ñ'
210: 117, # 'Ò'
211: 97, # 'Ó'
212: 116, # 'Ô'
213: 115, # 'Õ'
214: 50, # 'Ö'
215: 90, # '×'
216: 114, # 'Ø'
217: 113, # 'Ù'
218: 112, # 'Ú'
219: 111, # 'Û'
220: 55, # 'Ü'
221: 41, # 'İ'
222: 40, # 'Ş'
223: 86, # 'ß'
224: 89, # 'à'
225: 70, # 'á'
226: 59, # 'â'
227: 78, # 'ã'
228: 71, # 'ä'
229: 82, # 'å'
230: 88, # 'æ'
231: 33, # 'ç'
232: 77, # 'è'
233: 66, # 'é'
234: 84, # 'ê'
235: 83, # 'ë'
236: 110, # 'ì'
237: 75, # 'í'
238: 61, # 'î'
239: 96, # 'ï'
240: 30, # 'ğ'
241: 67, # 'ñ'
242: 109, # 'ò'
243: 74, # 'ó'
244: 87, # 'ô'
245: 102, # 'õ'
246: 34, # 'ö'
247: 95, # '÷'
248: 81, # 'ø'
249: 108, # 'ù'
250: 76, # 'ú'
251: 72, # 'û'
252: 17, # 'ü'
253: 6, # 'ı'
254: 19, # 'ş'
255: 107, # 'ÿ'
}
ISO_8859_9_TURKISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-9',
language='Turkish',
char_to_order_map=ISO_8859_9_TURKISH_CHAR_TO_ORDER,
language_model=TURKISH_LANG_MODEL,
typical_positive_ratio=0.97029,
keep_ascii_letters=True,
alphabet='ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş')
ISO_8859_9_TURKISH_MODEL = SingleByteCharSetModel(
charset_name="ISO-8859-9",
language="Turkish",
char_to_order_map=ISO_8859_9_TURKISH_CHAR_TO_ORDER,
language_model=TURKISH_LANG_MODEL,
typical_positive_ratio=0.97029,
keep_ascii_letters=True,
alphabet="ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş",
)

View file

@ -26,6 +26,8 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import List, Union
from .charsetprober import CharSetProber
from .enums import ProbingState
@ -41,6 +43,7 @@ ASV = 6 # accent small vowel
ASO = 7 # accent small other
CLASS_NUM = 8 # total classes
# fmt: off
Latin1_CharToClass = (
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
@ -91,34 +94,34 @@ Latin1ClassModel = (
0, 3, 1, 3, 1, 1, 1, 3, # ASV
0, 3, 1, 3, 1, 1, 3, 3, # ASO
)
# fmt: on
class Latin1Prober(CharSetProber):
def __init__(self):
super(Latin1Prober, self).__init__()
self._last_char_class = None
self._freq_counter = None
def __init__(self) -> None:
super().__init__()
self._last_char_class = OTH
self._freq_counter: List[int] = []
self.reset()
def reset(self):
def reset(self) -> None:
self._last_char_class = OTH
self._freq_counter = [0] * FREQ_CAT_NUM
CharSetProber.reset(self)
super().reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "ISO-8859-1"
@property
def language(self):
def language(self) -> str:
return ""
def feed(self, byte_str):
byte_str = self.filter_with_english_letters(byte_str)
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
byte_str = self.remove_xml_tags(byte_str)
for c in byte_str:
char_class = Latin1_CharToClass[c]
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
+ char_class]
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM) + char_class]
if freq == 0:
self._state = ProbingState.NOT_ME
break
@ -127,19 +130,18 @@ class Latin1Prober(CharSetProber):
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
if self.state == ProbingState.NOT_ME:
return 0.01
total = sum(self._freq_counter)
if total < 0.01:
confidence = 0.0
else:
confidence = ((self._freq_counter[3] - self._freq_counter[1] * 20.0)
/ total)
if confidence < 0.0:
confidence = 0.0
confidence = (
0.0
if total < 0.01
else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total
)
confidence = max(confidence, 0.0)
# lower the confidence of latin1 so that other more accurate
# detector can take priority.
confidence = confidence * 0.73
confidence *= 0.73
return confidence

View file

@ -0,0 +1,162 @@
######################## BEGIN LICENSE BLOCK ########################
# This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>.
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Rob Speer - adapt to MacRoman encoding
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import List, Union
from .charsetprober import CharSetProber
from .enums import ProbingState
# Number of likelihood categories (0-3, see the legend above MacRomanClassModel)
# that a class-pair transition can fall into; the prober keeps one counter per
# category, so this is the length of its frequency counter.
FREQ_CAT_NUM = 4
# Character-class ids. MacRoman_CharToClass assigns one of these to every byte
# value, and they index the rows and columns of MacRomanClassModel.
UDF = 0 # undefined
OTH = 1 # other
ASC = 2 # ascii capital letter
ASS = 3 # ascii small letter
ACV = 4 # accent capital vowel
ACO = 5 # accent capital other
ASV = 6 # accent small vowel
ASO = 7 # accent small other
ODD = 8 # character that is unlikely to appear
# Row width of the flattened MacRomanClassModel table.
CLASS_NUM = 9 # total classes
# The change from Latin1 is that we explicitly look for extended characters
# that are infrequently-occurring symbols, and consider them to always be
# improbable. This should let MacRoman get out of the way of more likely
# encodings in most situations.
# fmt: off
# Lookup table from byte value to character class: entry i is the class id
# (one of the constants above) for byte i. Each row covers eight consecutive
# byte values; the trailing comment gives the hexadecimal range of the row.
MacRoman_CharToClass = (
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV, # 80 - 87
ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV, # 88 - 8F
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV, # 90 - 97
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # 98 - 9F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO, # A0 - A7
OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV, # A8 - AF
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV, # B8 - BF
OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH, # C0 - C7
OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV, # C8 - CF
OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD, # D0 - D7
ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH, # D8 - DF
OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV, # E0 - E7
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # E8 - EF
ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD, # F0 - F7
ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD, # F8 - FF
)
# Likelihood of one character class following another, stored as a flattened
# CLASS_NUM x CLASS_NUM matrix. The row (label at the end of each line) is the
# class of the previous byte and the column (header line) is the class of the
# current byte, so an entry is read as
# MacRomanClassModel[previous_class * CLASS_NUM + current_class].
# Meaning of the stored values:
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
MacRomanClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO ODD
0, 0, 0, 0, 0, 0, 0, 0, 0, # UDF
0, 3, 3, 3, 3, 3, 3, 3, 1, # OTH
0, 3, 3, 3, 3, 3, 3, 3, 1, # ASC
0, 3, 3, 3, 1, 1, 3, 3, 1, # ASS
0, 3, 3, 3, 1, 2, 1, 2, 1, # ACV
0, 3, 3, 3, 3, 3, 3, 3, 1, # ACO
0, 3, 1, 3, 1, 1, 1, 3, 1, # ASV
0, 3, 1, 3, 1, 1, 3, 3, 1, # ASO
0, 1, 1, 1, 1, 1, 1, 1, 1, # ODD
)
# fmt: on
class MacRomanProber(CharSetProber):
    """Prober that rates how plausible it is that the input is MacRoman.

    Every byte is mapped to a character class through
    ``MacRoman_CharToClass``; each (previous class, current class) pair is
    then rated by ``MacRomanClassModel`` and the ratings are tallied. The
    tallies are turned into a confidence value by ``get_confidence``.
    """

    def __init__(self) -> None:
        super().__init__()
        self._last_char_class = OTH
        self._freq_counter: List[int] = []
        self.reset()

    def reset(self) -> None:
        """Put the prober back into its starting state."""
        self._last_char_class = OTH

        # Express the prior that MacRoman is a somewhat rare encoding: start
        # with ten observations already booked in category 2, so the prober
        # begins in a slightly improbable state that real evidence has to
        # overcome (they enlarge the total that get_confidence divides by).
        tallies = [0] * FREQ_CAT_NUM
        tallies[2] = 10
        self._freq_counter = tallies

        super().reset()

    @property
    def charset_name(self) -> str:
        """Name of the encoding this prober reports."""
        return "MacRoman"

    @property
    def language(self) -> str:
        """Language reported by this prober (always the empty string)."""
        return ""

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume *byte_str* and return the prober's state afterwards.

        The input is first passed through ``self.remove_xml_tags``. Scanning
        stops at the first class transition the model rates as illegal (0),
        at which point the state becomes ``ProbingState.NOT_ME``.
        """
        filtered = self.remove_xml_tags(byte_str)
        for byte in filtered:
            current_class = MacRoman_CharToClass[byte]
            model_index = (self._last_char_class * CLASS_NUM) + current_class
            likelihood = MacRomanClassModel[model_index]
            if likelihood == 0:
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[likelihood] += 1
            self._last_char_class = current_class

        return self.state

    def get_confidence(self) -> float:
        """Return the confidence that the data seen so far is MacRoman.

        A prober that has been ruled out reports a fixed 0.01. Otherwise the
        score is the share of "very likely" transitions, with every "very
        unlikely" transition counted twenty-fold against it, floored at zero
        and finally scaled by 0.73.
        """
        if self.state == ProbingState.NOT_ME:
            return 0.01

        tallies = self._freq_counter
        total = sum(tallies)
        if total < 0.01:
            score = 0.0
        else:
            score = (tallies[3] - tallies[1] * 20.0) / total
        if score < 0.0:
            score = 0.0

        # Lower the confidence of MacRoman so that other, more accurate
        # detectors can take priority.
        return score * 0.73

View file

@ -27,8 +27,12 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import Optional, Union
from .chardistribution import CharDistributionAnalysis
from .charsetprober import CharSetProber
from .enums import ProbingState, MachineState
from .codingstatemachine import CodingStateMachine
from .enums import LanguageFilter, MachineState, ProbingState
class MultiByteCharSetProber(CharSetProber):
@ -36,56 +40,56 @@ class MultiByteCharSetProber(CharSetProber):
MultiByteCharSetProber
"""
def __init__(self, lang_filter=None):
super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter)
self.distribution_analyzer = None
self.coding_sm = None
self._last_char = [0, 0]
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
super().__init__(lang_filter=lang_filter)
self.distribution_analyzer: Optional[CharDistributionAnalysis] = None
self.coding_sm: Optional[CodingStateMachine] = None
self._last_char = bytearray(b"\0\0")
def reset(self):
super(MultiByteCharSetProber, self).reset()
def reset(self) -> None:
super().reset()
if self.coding_sm:
self.coding_sm.reset()
if self.distribution_analyzer:
self.distribution_analyzer.reset()
self._last_char = [0, 0]
self._last_char = bytearray(b"\0\0")
@property
def charset_name(self):
raise NotImplementedError
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
assert self.coding_sm is not None
assert self.distribution_analyzer is not None
@property
def language(self):
raise NotImplementedError
def feed(self, byte_str):
for i in range(len(byte_str)):
coding_state = self.coding_sm.next_state(byte_str[i])
for i, byte in enumerate(byte_str):
coding_state = self.coding_sm.next_state(byte)
if coding_state == MachineState.ERROR:
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self.logger.debug(
"%s %s prober hit error at byte %s",
self.charset_name,
self.language,
i,
)
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.ITS_ME:
if coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif coding_state == MachineState.START:
if coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen()
if i == 0:
self._last_char[1] = byte_str[0]
self._last_char[1] = byte
self.distribution_analyzer.feed(self._last_char, char_len)
else:
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
self._last_char[0] = byte_str[-1]
if self.state == ProbingState.DETECTING:
if (self.distribution_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
if self.distribution_analyzer.got_enough_data() and (
self.get_confidence() > self.SHORTCUT_THRESHOLD
):
self._state = ProbingState.FOUND_IT
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
assert self.distribution_analyzer is not None
return self.distribution_analyzer.get_confidence()

View file

@ -27,20 +27,22 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .charsetgroupprober import CharSetGroupProber
from .utf8prober import UTF8Prober
from .sjisprober import SJISProber
from .eucjpprober import EUCJPProber
from .gb2312prober import GB2312Prober
from .euckrprober import EUCKRProber
from .cp949prober import CP949Prober
from .big5prober import Big5Prober
from .charsetgroupprober import CharSetGroupProber
from .cp949prober import CP949Prober
from .enums import LanguageFilter
from .eucjpprober import EUCJPProber
from .euckrprober import EUCKRProber
from .euctwprober import EUCTWProber
from .gb2312prober import GB2312Prober
from .johabprober import JOHABProber
from .sjisprober import SJISProber
from .utf8prober import UTF8Prober
class MBCSGroupProber(CharSetGroupProber):
def __init__(self, lang_filter=None):
super(MBCSGroupProber, self).__init__(lang_filter=lang_filter)
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
super().__init__(lang_filter=lang_filter)
self.probers = [
UTF8Prober(),
SJISProber(),
@ -49,6 +51,7 @@ class MBCSGroupProber(CharSetGroupProber):
EUCKRProber(),
CP949Prober(),
Big5Prober(),
EUCTWProber()
EUCTWProber(),
JOHABProber(),
]
self.reset()

View file

@ -25,43 +25,45 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .codingstatemachinedict import CodingStateMachineDict
from .enums import MachineState
# BIG5
# fmt: off
BIG5_CLS = (
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f
4,4,4,4,4,4,4,4, # 80 - 87
4,4,4,4,4,4,4,4, # 88 - 8f
4,4,4,4,4,4,4,4, # 90 - 97
4,4,4,4,4,4,4,4, # 98 - 9f
4,3,3,3,3,3,3,3, # a0 - a7
3,3,3,3,3,3,3,3, # a8 - af
3,3,3,3,3,3,3,3, # b0 - b7
3,3,3,3,3,3,3,3, # b8 - bf
3,3,3,3,3,3,3,3, # c0 - c7
3,3,3,3,3,3,3,3, # c8 - cf
3,3,3,3,3,3,3,3, # d0 - d7
3,3,3,3,3,3,3,3, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0 # f8 - ff
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 #allow 0x00 as legal value
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37
1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f
2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47
2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f
2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57
2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f
2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67
2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f
2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77
2, 2, 2, 2, 2, 2, 2, 1, # 78 - 7f
4, 4, 4, 4, 4, 4, 4, 4, # 80 - 87
4, 4, 4, 4, 4, 4, 4, 4, # 88 - 8f
4, 4, 4, 4, 4, 4, 4, 4, # 90 - 97
4, 4, 4, 4, 4, 4, 4, 4, # 98 - 9f
4, 3, 3, 3, 3, 3, 3, 3, # a0 - a7
3, 3, 3, 3, 3, 3, 3, 3, # a8 - af
3, 3, 3, 3, 3, 3, 3, 3, # b0 - b7
3, 3, 3, 3, 3, 3, 3, 3, # b8 - bf
3, 3, 3, 3, 3, 3, 3, 3, # c0 - c7
3, 3, 3, 3, 3, 3, 3, 3, # c8 - cf
3, 3, 3, 3, 3, 3, 3, 3, # d0 - d7
3, 3, 3, 3, 3, 3, 3, 3, # d8 - df
3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7
3, 3, 3, 3, 3, 3, 3, 3, # e8 - ef
3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7
3, 3, 3, 3, 3, 3, 3, 0 # f8 - ff
)
BIG5_ST = (
@ -69,34 +71,37 @@ BIG5_ST = (
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17
)
# fmt: on
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
BIG5_SM_MODEL = {'class_table': BIG5_CLS,
'class_factor': 5,
'state_table': BIG5_ST,
'char_len_table': BIG5_CHAR_LEN_TABLE,
'name': 'Big5'}
BIG5_SM_MODEL: CodingStateMachineDict = {
"class_table": BIG5_CLS,
"class_factor": 5,
"state_table": BIG5_ST,
"char_len_table": BIG5_CHAR_LEN_TABLE,
"name": "Big5",
}
# CP949
# fmt: off
CP949_CLS = (
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, # 00 - 0f
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, # 10 - 1f
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 2f
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 30 - 3f
1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, # 40 - 4f
4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, # 50 - 5f
1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, # 60 - 6f
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, # 70 - 7f
0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, # 80 - 8f
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, # 90 - 9f
6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, # a0 - af
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, # b0 - bf
7, 7, 7, 7, 7, 7, 9, 2, 2, 3, 2, 2, 2, 2, 2, 2, # c0 - cf
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # d0 - df
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # e0 - ef
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, # f0 - ff
)
CP949_ST = (
@ -109,50 +114,53 @@ CP949_ST = (
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6
)
# fmt: on
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
CP949_SM_MODEL = {'class_table': CP949_CLS,
'class_factor': 10,
'state_table': CP949_ST,
'char_len_table': CP949_CHAR_LEN_TABLE,
'name': 'CP949'}
CP949_SM_MODEL: CodingStateMachineDict = {
"class_table": CP949_CLS,
"class_factor": 10,
"state_table": CP949_ST,
"char_len_table": CP949_CHAR_LEN_TABLE,
"name": "CP949",
}
# EUC-JP
# fmt: off
EUCJP_CLS = (
4,4,4,4,4,4,4,4, # 00 - 07
4,4,4,4,4,4,5,5, # 08 - 0f
4,4,4,4,4,4,4,4, # 10 - 17
4,4,4,5,4,4,4,4, # 18 - 1f
4,4,4,4,4,4,4,4, # 20 - 27
4,4,4,4,4,4,4,4, # 28 - 2f
4,4,4,4,4,4,4,4, # 30 - 37
4,4,4,4,4,4,4,4, # 38 - 3f
4,4,4,4,4,4,4,4, # 40 - 47
4,4,4,4,4,4,4,4, # 48 - 4f
4,4,4,4,4,4,4,4, # 50 - 57
4,4,4,4,4,4,4,4, # 58 - 5f
4,4,4,4,4,4,4,4, # 60 - 67
4,4,4,4,4,4,4,4, # 68 - 6f
4,4,4,4,4,4,4,4, # 70 - 77
4,4,4,4,4,4,4,4, # 78 - 7f
5,5,5,5,5,5,5,5, # 80 - 87
5,5,5,5,5,5,1,3, # 88 - 8f
5,5,5,5,5,5,5,5, # 90 - 97
5,5,5,5,5,5,5,5, # 98 - 9f
5,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,0,5 # f8 - ff
4, 4, 4, 4, 4, 4, 4, 4, # 00 - 07
4, 4, 4, 4, 4, 4, 5, 5, # 08 - 0f
4, 4, 4, 4, 4, 4, 4, 4, # 10 - 17
4, 4, 4, 5, 4, 4, 4, 4, # 18 - 1f
4, 4, 4, 4, 4, 4, 4, 4, # 20 - 27
4, 4, 4, 4, 4, 4, 4, 4, # 28 - 2f
4, 4, 4, 4, 4, 4, 4, 4, # 30 - 37
4, 4, 4, 4, 4, 4, 4, 4, # 38 - 3f
4, 4, 4, 4, 4, 4, 4, 4, # 40 - 47
4, 4, 4, 4, 4, 4, 4, 4, # 48 - 4f
4, 4, 4, 4, 4, 4, 4, 4, # 50 - 57
4, 4, 4, 4, 4, 4, 4, 4, # 58 - 5f
4, 4, 4, 4, 4, 4, 4, 4, # 60 - 67
4, 4, 4, 4, 4, 4, 4, 4, # 68 - 6f
4, 4, 4, 4, 4, 4, 4, 4, # 70 - 77
4, 4, 4, 4, 4, 4, 4, 4, # 78 - 7f
5, 5, 5, 5, 5, 5, 5, 5, # 80 - 87
5, 5, 5, 5, 5, 5, 1, 3, # 88 - 8f
5, 5, 5, 5, 5, 5, 5, 5, # 90 - 97
5, 5, 5, 5, 5, 5, 5, 5, # 98 - 9f
5, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7
0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef
0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7
0, 0, 0, 0, 0, 0, 0, 5 # f8 - ff
)
EUCJP_ST = (
@ -162,100 +170,163 @@ EUCJP_ST = (
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f
3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27
)
# fmt: on
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
EUCJP_SM_MODEL = {'class_table': EUCJP_CLS,
'class_factor': 6,
'state_table': EUCJP_ST,
'char_len_table': EUCJP_CHAR_LEN_TABLE,
'name': 'EUC-JP'}
EUCJP_SM_MODEL: CodingStateMachineDict = {
"class_table": EUCJP_CLS,
"class_factor": 6,
"state_table": EUCJP_ST,
"char_len_table": EUCJP_CHAR_LEN_TABLE,
"name": "EUC-JP",
}
# EUC-KR
# fmt: off
EUCKR_CLS = (
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
1,1,1,1,1,1,1,1, # 40 - 47
1,1,1,1,1,1,1,1, # 48 - 4f
1,1,1,1,1,1,1,1, # 50 - 57
1,1,1,1,1,1,1,1, # 58 - 5f
1,1,1,1,1,1,1,1, # 60 - 67
1,1,1,1,1,1,1,1, # 68 - 6f
1,1,1,1,1,1,1,1, # 70 - 77
1,1,1,1,1,1,1,1, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,3,3,3, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,3,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,0 # f8 - ff
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37
1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f
1, 1, 1, 1, 1, 1, 1, 1, # 40 - 47
1, 1, 1, 1, 1, 1, 1, 1, # 48 - 4f
1, 1, 1, 1, 1, 1, 1, 1, # 50 - 57
1, 1, 1, 1, 1, 1, 1, 1, # 58 - 5f
1, 1, 1, 1, 1, 1, 1, 1, # 60 - 67
1, 1, 1, 1, 1, 1, 1, 1, # 68 - 6f
1, 1, 1, 1, 1, 1, 1, 1, # 70 - 77
1, 1, 1, 1, 1, 1, 1, 1, # 78 - 7f
0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87
0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f
0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97
0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f
0, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
2, 2, 2, 2, 2, 3, 3, 3, # a8 - af
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
2, 3, 2, 2, 2, 2, 2, 2, # c8 - cf
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7
2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef
2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7
2, 2, 2, 2, 2, 2, 2, 0 # f8 - ff
)
EUCKR_ST = (
MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f
)
# fmt: on
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
EUCKR_SM_MODEL = {'class_table': EUCKR_CLS,
'class_factor': 4,
'state_table': EUCKR_ST,
'char_len_table': EUCKR_CHAR_LEN_TABLE,
'name': 'EUC-KR'}
EUCKR_SM_MODEL: CodingStateMachineDict = {
"class_table": EUCKR_CLS,
"class_factor": 4,
"state_table": EUCKR_ST,
"char_len_table": EUCKR_CHAR_LEN_TABLE,
"name": "EUC-KR",
}
# JOHAB
# fmt: off
JOHAB_CLS = (
4,4,4,4,4,4,4,4, # 00 - 07
4,4,4,4,4,4,0,0, # 08 - 0f
4,4,4,4,4,4,4,4, # 10 - 17
4,4,4,0,4,4,4,4, # 18 - 1f
4,4,4,4,4,4,4,4, # 20 - 27
4,4,4,4,4,4,4,4, # 28 - 2f
4,3,3,3,3,3,3,3, # 30 - 37
3,3,3,3,3,3,3,3, # 38 - 3f
3,1,1,1,1,1,1,1, # 40 - 47
1,1,1,1,1,1,1,1, # 48 - 4f
1,1,1,1,1,1,1,1, # 50 - 57
1,1,1,1,1,1,1,1, # 58 - 5f
1,1,1,1,1,1,1,1, # 60 - 67
1,1,1,1,1,1,1,1, # 68 - 6f
1,1,1,1,1,1,1,1, # 70 - 77
1,1,1,1,1,1,1,2, # 78 - 7f
6,6,6,6,8,8,8,8, # 80 - 87
8,8,8,8,8,8,8,8, # 88 - 8f
8,7,7,7,7,7,7,7, # 90 - 97
7,7,7,7,7,7,7,7, # 98 - 9f
7,7,7,7,7,7,7,7, # a0 - a7
7,7,7,7,7,7,7,7, # a8 - af
7,7,7,7,7,7,7,7, # b0 - b7
7,7,7,7,7,7,7,7, # b8 - bf
7,7,7,7,7,7,7,7, # c0 - c7
7,7,7,7,7,7,7,7, # c8 - cf
7,7,7,7,5,5,5,5, # d0 - d7
5,9,9,9,9,9,9,5, # d8 - df
9,9,9,9,9,9,9,9, # e0 - e7
9,9,9,9,9,9,9,9, # e8 - ef
9,9,9,9,9,9,9,9, # f0 - f7
9,9,5,5,5,5,5,0 # f8 - ff
)
JOHAB_ST = (
# cls = 0 1 2 3 4 5 6 7 8 9
MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.ERROR ,MachineState.ERROR ,3 ,3 ,4 , # MachineState.START
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # MachineState.ITS_ME
MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR , # MachineState.ERROR
MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.ERROR ,MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START , # 3
MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START , # 4
)
# fmt: on
JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2)
JOHAB_SM_MODEL: CodingStateMachineDict = {
"class_table": JOHAB_CLS,
"class_factor": 10,
"state_table": JOHAB_ST,
"char_len_table": JOHAB_CHAR_LEN_TABLE,
"name": "Johab",
}
# EUC-TW
# fmt: off
EUCTW_CLS = (
2,2,2,2,2,2,2,2, # 00 - 07
2,2,2,2,2,2,0,0, # 08 - 0f
2,2,2,2,2,2,2,2, # 10 - 17
2,2,2,0,2,2,2,2, # 18 - 1f
2,2,2,2,2,2,2,2, # 20 - 27
2,2,2,2,2,2,2,2, # 28 - 2f
2,2,2,2,2,2,2,2, # 30 - 37
2,2,2,2,2,2,2,2, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,2, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,6,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,3,4,4,4,4,4,4, # a0 - a7
5,5,1,1,1,1,1,1, # a8 - af
1,1,1,1,1,1,1,1, # b0 - b7
1,1,1,1,1,1,1,1, # b8 - bf
1,1,3,1,3,3,3,3, # c0 - c7
3,3,3,3,3,3,3,3, # c8 - cf
3,3,3,3,3,3,3,3, # d0 - d7
3,3,3,3,3,3,3,3, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0 # f8 - ff
2, 2, 2, 2, 2, 2, 2, 2, # 00 - 07
2, 2, 2, 2, 2, 2, 0, 0, # 08 - 0f
2, 2, 2, 2, 2, 2, 2, 2, # 10 - 17
2, 2, 2, 0, 2, 2, 2, 2, # 18 - 1f
2, 2, 2, 2, 2, 2, 2, 2, # 20 - 27
2, 2, 2, 2, 2, 2, 2, 2, # 28 - 2f
2, 2, 2, 2, 2, 2, 2, 2, # 30 - 37
2, 2, 2, 2, 2, 2, 2, 2, # 38 - 3f
2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47
2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f
2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57
2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f
2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67
2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f
2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77
2, 2, 2, 2, 2, 2, 2, 2, # 78 - 7f
0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87
0, 0, 0, 0, 0, 0, 6, 0, # 88 - 8f
0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97
0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f
0, 3, 4, 4, 4, 4, 4, 4, # a0 - a7
5, 5, 1, 1, 1, 1, 1, 1, # a8 - af
1, 1, 1, 1, 1, 1, 1, 1, # b0 - b7
1, 1, 1, 1, 1, 1, 1, 1, # b8 - bf
1, 1, 3, 1, 3, 3, 3, 3, # c0 - c7
3, 3, 3, 3, 3, 3, 3, 3, # c8 - cf
3, 3, 3, 3, 3, 3, 3, 3, # d0 - d7
3, 3, 3, 3, 3, 3, 3, 3, # d8 - df
3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7
3, 3, 3, 3, 3, 3, 3, 3, # e8 - ef
3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7
3, 3, 3, 3, 3, 3, 3, 0 # f8 - ff
)
EUCTW_ST = (
@ -266,50 +337,53 @@ EUCTW_ST = (
5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27
MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
)
# fmt: on
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
EUCTW_SM_MODEL = {'class_table': EUCTW_CLS,
'class_factor': 7,
'state_table': EUCTW_ST,
'char_len_table': EUCTW_CHAR_LEN_TABLE,
'name': 'x-euc-tw'}
EUCTW_SM_MODEL: CodingStateMachineDict = {
"class_table": EUCTW_CLS,
"class_factor": 7,
"state_table": EUCTW_ST,
"char_len_table": EUCTW_CHAR_LEN_TABLE,
"name": "x-euc-tw",
}
# GB2312
# fmt: off
GB2312_CLS = (
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
3,3,3,3,3,3,3,3, # 30 - 37
3,3,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,4, # 78 - 7f
5,6,6,6,6,6,6,6, # 80 - 87
6,6,6,6,6,6,6,6, # 88 - 8f
6,6,6,6,6,6,6,6, # 90 - 97
6,6,6,6,6,6,6,6, # 98 - 9f
6,6,6,6,6,6,6,6, # a0 - a7
6,6,6,6,6,6,6,6, # a8 - af
6,6,6,6,6,6,6,6, # b0 - b7
6,6,6,6,6,6,6,6, # b8 - bf
6,6,6,6,6,6,6,6, # c0 - c7
6,6,6,6,6,6,6,6, # c8 - cf
6,6,6,6,6,6,6,6, # d0 - d7
6,6,6,6,6,6,6,6, # d8 - df
6,6,6,6,6,6,6,6, # e0 - e7
6,6,6,6,6,6,6,6, # e8 - ef
6,6,6,6,6,6,6,6, # f0 - f7
6,6,6,6,6,6,6,0 # f8 - ff
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
3, 3, 3, 3, 3, 3, 3, 3, # 30 - 37
3, 3, 1, 1, 1, 1, 1, 1, # 38 - 3f
2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47
2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f
2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57
2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f
2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67
2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f
2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77
2, 2, 2, 2, 2, 2, 2, 4, # 78 - 7f
5, 6, 6, 6, 6, 6, 6, 6, # 80 - 87
6, 6, 6, 6, 6, 6, 6, 6, # 88 - 8f
6, 6, 6, 6, 6, 6, 6, 6, # 90 - 97
6, 6, 6, 6, 6, 6, 6, 6, # 98 - 9f
6, 6, 6, 6, 6, 6, 6, 6, # a0 - a7
6, 6, 6, 6, 6, 6, 6, 6, # a8 - af
6, 6, 6, 6, 6, 6, 6, 6, # b0 - b7
6, 6, 6, 6, 6, 6, 6, 6, # b8 - bf
6, 6, 6, 6, 6, 6, 6, 6, # c0 - c7
6, 6, 6, 6, 6, 6, 6, 6, # c8 - cf
6, 6, 6, 6, 6, 6, 6, 6, # d0 - d7
6, 6, 6, 6, 6, 6, 6, 6, # d8 - df
6, 6, 6, 6, 6, 6, 6, 6, # e0 - e7
6, 6, 6, 6, 6, 6, 6, 6, # e8 - ef
6, 6, 6, 6, 6, 6, 6, 6, # f0 - f7
6, 6, 6, 6, 6, 6, 6, 0 # f8 - ff
)
GB2312_ST = (
@ -320,6 +394,7 @@ GB2312_ST = (
MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
)
# fmt: on
# To be accurate, the length of class 6 can be either 2 or 4.
# But it is not necessary to discriminate between the two since
@ -328,100 +403,105 @@ GB2312_ST = (
# 2 here.
GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)
GB2312_SM_MODEL = {'class_table': GB2312_CLS,
'class_factor': 7,
'state_table': GB2312_ST,
'char_len_table': GB2312_CHAR_LEN_TABLE,
'name': 'GB2312'}
GB2312_SM_MODEL: CodingStateMachineDict = {
"class_table": GB2312_CLS,
"class_factor": 7,
"state_table": GB2312_ST,
"char_len_table": GB2312_CHAR_LEN_TABLE,
"name": "GB2312",
}
# Shift_JIS
# fmt: off
SJIS_CLS = (
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f
3,3,3,3,3,2,2,3, # 80 - 87
3,3,3,3,3,3,3,3, # 88 - 8f
3,3,3,3,3,3,3,3, # 90 - 97
3,3,3,3,3,3,3,3, # 98 - 9f
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37
1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f
2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47
2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f
2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57
2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f
2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67
2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f
2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77
2, 2, 2, 2, 2, 2, 2, 1, # 78 - 7f
3, 3, 3, 3, 3, 2, 2, 3, # 80 - 87
3, 3, 3, 3, 3, 3, 3, 3, # 88 - 8f
3, 3, 3, 3, 3, 3, 3, 3, # 90 - 97
3, 3, 3, 3, 3, 3, 3, 3, # 98 - 9f
# 0xa0 is illegal in the sjis encoding, but some pages do
# contain such bytes. We need to be more forgiving of errors.
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,4,4,4, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,0,0,0) # f8 - ff
2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7
3, 3, 3, 3, 3, 4, 4, 4, # e8 - ef
3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7
3, 3, 3, 3, 3, 0, 0, 0, # f8 - ff
)
SJIS_ST = (
MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17
)
# fmt: on
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
SJIS_SM_MODEL = {'class_table': SJIS_CLS,
'class_factor': 6,
'state_table': SJIS_ST,
'char_len_table': SJIS_CHAR_LEN_TABLE,
'name': 'Shift_JIS'}
SJIS_SM_MODEL: CodingStateMachineDict = {
"class_table": SJIS_CLS,
"class_factor": 6,
"state_table": SJIS_ST,
"char_len_table": SJIS_CHAR_LEN_TABLE,
"name": "Shift_JIS",
}
# UCS2-BE
# fmt: off
UCS2BE_CLS = (
0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,3,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,3,3,3,3,3,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,0,0,0,0,0,0,0, # a0 - a7
0,0,0,0,0,0,0,0, # a8 - af
0,0,0,0,0,0,0,0, # b0 - b7
0,0,0,0,0,0,0,0, # b8 - bf
0,0,0,0,0,0,0,0, # c0 - c7
0,0,0,0,0,0,0,0, # c8 - cf
0,0,0,0,0,0,0,0, # d0 - d7
0,0,0,0,0,0,0,0, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5 # f8 - ff
0, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
0, 0, 1, 0, 0, 2, 0, 0, # 08 - 0f
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
0, 0, 0, 3, 0, 0, 0, 0, # 18 - 1f
0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27
0, 3, 3, 3, 3, 3, 0, 0, # 28 - 2f
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87
0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f
0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97
0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f
0, 0, 0, 0, 0, 0, 0, 0, # a0 - a7
0, 0, 0, 0, 0, 0, 0, 0, # a8 - af
0, 0, 0, 0, 0, 0, 0, 0, # b0 - b7
0, 0, 0, 0, 0, 0, 0, 0, # b8 - bf
0, 0, 0, 0, 0, 0, 0, 0, # c0 - c7
0, 0, 0, 0, 0, 0, 0, 0, # c8 - cf
0, 0, 0, 0, 0, 0, 0, 0, # d0 - d7
0, 0, 0, 0, 0, 0, 0, 0, # d8 - df
0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7
0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef
0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7
0, 0, 0, 0, 0, 0, 4, 5 # f8 - ff
)
UCS2BE_ST = (
@ -433,50 +513,53 @@ UCS2BE_ST = (
5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f
6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37
)
# fmt: on
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
UCS2BE_SM_MODEL = {'class_table': UCS2BE_CLS,
'class_factor': 6,
'state_table': UCS2BE_ST,
'char_len_table': UCS2BE_CHAR_LEN_TABLE,
'name': 'UTF-16BE'}
UCS2BE_SM_MODEL: CodingStateMachineDict = {
"class_table": UCS2BE_CLS,
"class_factor": 6,
"state_table": UCS2BE_ST,
"char_len_table": UCS2BE_CHAR_LEN_TABLE,
"name": "UTF-16BE",
}
# UCS2-LE
# fmt: off
UCS2LE_CLS = (
0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,3,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,3,3,3,3,3,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,0,0,0,0,0,0,0, # a0 - a7
0,0,0,0,0,0,0,0, # a8 - af
0,0,0,0,0,0,0,0, # b0 - b7
0,0,0,0,0,0,0,0, # b8 - bf
0,0,0,0,0,0,0,0, # c0 - c7
0,0,0,0,0,0,0,0, # c8 - cf
0,0,0,0,0,0,0,0, # d0 - d7
0,0,0,0,0,0,0,0, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5 # f8 - ff
0, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
0, 0, 1, 0, 0, 2, 0, 0, # 08 - 0f
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
0, 0, 0, 3, 0, 0, 0, 0, # 18 - 1f
0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27
0, 3, 3, 3, 3, 3, 0, 0, # 28 - 2f
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87
0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f
0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97
0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f
0, 0, 0, 0, 0, 0, 0, 0, # a0 - a7
0, 0, 0, 0, 0, 0, 0, 0, # a8 - af
0, 0, 0, 0, 0, 0, 0, 0, # b0 - b7
0, 0, 0, 0, 0, 0, 0, 0, # b8 - bf
0, 0, 0, 0, 0, 0, 0, 0, # c0 - c7
0, 0, 0, 0, 0, 0, 0, 0, # c8 - cf
0, 0, 0, 0, 0, 0, 0, 0, # d0 - d7
0, 0, 0, 0, 0, 0, 0, 0, # d8 - df
0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7
0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef
0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7
0, 0, 0, 0, 0, 0, 4, 5 # f8 - ff
)
UCS2LE_ST = (
@ -488,50 +571,53 @@ UCS2LE_ST = (
5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f
5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37
)
# fmt: on
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
UCS2LE_SM_MODEL = {'class_table': UCS2LE_CLS,
'class_factor': 6,
'state_table': UCS2LE_ST,
'char_len_table': UCS2LE_CHAR_LEN_TABLE,
'name': 'UTF-16LE'}
UCS2LE_SM_MODEL: CodingStateMachineDict = {
"class_table": UCS2LE_CLS,
"class_factor": 6,
"state_table": UCS2LE_ST,
"char_len_table": UCS2LE_CHAR_LEN_TABLE,
"name": "UTF-16LE",
}
# UTF-8
# fmt: off
UTF8_CLS = (
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
1,1,1,1,1,1,1,1, # 40 - 47
1,1,1,1,1,1,1,1, # 48 - 4f
1,1,1,1,1,1,1,1, # 50 - 57
1,1,1,1,1,1,1,1, # 58 - 5f
1,1,1,1,1,1,1,1, # 60 - 67
1,1,1,1,1,1,1,1, # 68 - 6f
1,1,1,1,1,1,1,1, # 70 - 77
1,1,1,1,1,1,1,1, # 78 - 7f
2,2,2,2,3,3,3,3, # 80 - 87
4,4,4,4,4,4,4,4, # 88 - 8f
4,4,4,4,4,4,4,4, # 90 - 97
4,4,4,4,4,4,4,4, # 98 - 9f
5,5,5,5,5,5,5,5, # a0 - a7
5,5,5,5,5,5,5,5, # a8 - af
5,5,5,5,5,5,5,5, # b0 - b7
5,5,5,5,5,5,5,5, # b8 - bf
0,0,6,6,6,6,6,6, # c0 - c7
6,6,6,6,6,6,6,6, # c8 - cf
6,6,6,6,6,6,6,6, # d0 - d7
6,6,6,6,6,6,6,6, # d8 - df
7,8,8,8,8,8,8,8, # e0 - e7
8,8,8,8,8,9,8,8, # e8 - ef
10,11,11,11,11,11,11,11, # f0 - f7
12,13,13,13,14,15,0,0 # f8 - ff
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 #allow 0x00 as a legal value
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37
1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f
1, 1, 1, 1, 1, 1, 1, 1, # 40 - 47
1, 1, 1, 1, 1, 1, 1, 1, # 48 - 4f
1, 1, 1, 1, 1, 1, 1, 1, # 50 - 57
1, 1, 1, 1, 1, 1, 1, 1, # 58 - 5f
1, 1, 1, 1, 1, 1, 1, 1, # 60 - 67
1, 1, 1, 1, 1, 1, 1, 1, # 68 - 6f
1, 1, 1, 1, 1, 1, 1, 1, # 70 - 77
1, 1, 1, 1, 1, 1, 1, 1, # 78 - 7f
2, 2, 2, 2, 3, 3, 3, 3, # 80 - 87
4, 4, 4, 4, 4, 4, 4, 4, # 88 - 8f
4, 4, 4, 4, 4, 4, 4, 4, # 90 - 97
4, 4, 4, 4, 4, 4, 4, 4, # 98 - 9f
5, 5, 5, 5, 5, 5, 5, 5, # a0 - a7
5, 5, 5, 5, 5, 5, 5, 5, # a8 - af
5, 5, 5, 5, 5, 5, 5, 5, # b0 - b7
5, 5, 5, 5, 5, 5, 5, 5, # b8 - bf
0, 0, 6, 6, 6, 6, 6, 6, # c0 - c7
6, 6, 6, 6, 6, 6, 6, 6, # c8 - cf
6, 6, 6, 6, 6, 6, 6, 6, # d0 - d7
6, 6, 6, 6, 6, 6, 6, 6, # d8 - df
7, 8, 8, 8, 8, 8, 8, 8, # e0 - e7
8, 8, 8, 8, 8, 9, 8, 8, # e8 - ef
10, 11, 11, 11, 11, 11, 11, 11, # f0 - f7
12, 13, 13, 13, 14, 15, 0, 0 # f8 - ff
)
UTF8_ST = (
@ -562,11 +648,14 @@ UTF8_ST = (
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf
)
# fmt: on
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
UTF8_SM_MODEL = {'class_table': UTF8_CLS,
'class_factor': 16,
'state_table': UTF8_ST,
'char_len_table': UTF8_CHAR_LEN_TABLE,
'name': 'UTF-8'}
UTF8_SM_MODEL: CodingStateMachineDict = {
"class_table": UTF8_CLS,
"class_factor": 16,
"state_table": UTF8_ST,
"char_len_table": UTF8_CHAR_LEN_TABLE,
"name": "UTF-8",
}

View file

@ -1,19 +1,17 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Metadata about languages used by our model training code for our
SingleByteCharSetProbers. Could be used for other things in the future.
This code is based on the language metadata from the uchardet project.
"""
from __future__ import absolute_import, print_function
from string import ascii_letters
from typing import List, Optional
# TODO: Add Ukrainian (KOI8-U)
# TODO: Add Ukrainian (KOI8-U)
class Language(object):
class Language:
"""Metadata about a language useful for training models
:ivar name: The human name for the language, in English.
@ -33,9 +31,17 @@ class Language(object):
Wikipedia for training data.
:type wiki_start_pages: list of str
"""
def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
alphabet=None, wiki_start_pages=None):
super(Language, self).__init__()
def __init__(
self,
name: Optional[str] = None,
iso_code: Optional[str] = None,
use_ascii: bool = True,
charsets: Optional[List[str]] = None,
alphabet: Optional[str] = None,
wiki_start_pages: Optional[List[str]] = None,
) -> None:
super().__init__()
self.name = name
self.iso_code = iso_code
self.use_ascii = use_ascii
@ -46,265 +52,301 @@ class Language(object):
else:
alphabet = ascii_letters
elif not alphabet:
raise ValueError('Must supply alphabet if use_ascii is False')
self.alphabet = ''.join(sorted(set(alphabet))) if alphabet else None
raise ValueError("Must supply alphabet if use_ascii is False")
self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
self.wiki_start_pages = wiki_start_pages
def __repr__(self):
return '{}({})'.format(self.__class__.__name__,
', '.join('{}={!r}'.format(k, v)
for k, v in self.__dict__.items()
if not k.startswith('_')))
def __repr__(self) -> str:
param_str = ", ".join(
f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
)
return f"{self.__class__.__name__}({param_str})"
LANGUAGES = {'Arabic': Language(name='Arabic',
iso_code='ar',
use_ascii=False,
# We only support encodings that use isolated
# forms, because the current recommendation is
# that the rendering system handles presentation
# forms. This means we purposefully skip IBM864.
charsets=['ISO-8859-6', 'WINDOWS-1256',
'CP720', 'CP864'],
alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
wiki_start_pages=[u'الصفحة_الرئيسية']),
'Belarusian': Language(name='Belarusian',
iso_code='be',
use_ascii=False,
charsets=['ISO-8859-5', 'WINDOWS-1251',
'IBM866', 'MacCyrillic'],
alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
wiki_start_pages=[u'Галоўная_старонка']),
'Bulgarian': Language(name='Bulgarian',
iso_code='bg',
use_ascii=False,
charsets=['ISO-8859-5', 'WINDOWS-1251',
'IBM855'],
alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
u'абвгдежзийклмнопрстуфхцчшщъьюя'),
wiki_start_pages=[u'Начална_страница']),
'Czech': Language(name='Czech',
iso_code='cz',
use_ascii=True,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
wiki_start_pages=[u'Hlavní_strana']),
'Danish': Language(name='Danish',
iso_code='da',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'æøåÆØÅ',
wiki_start_pages=[u'Forside']),
'German': Language(name='German',
iso_code='de',
use_ascii=True,
charsets=['ISO-8859-1', 'WINDOWS-1252'],
alphabet=u'äöüßÄÖÜ',
wiki_start_pages=[u'Wikipedia:Hauptseite']),
'Greek': Language(name='Greek',
iso_code='el',
use_ascii=False,
charsets=['ISO-8859-7', 'WINDOWS-1253'],
alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
wiki_start_pages=[u'Πύλη:Κύρια']),
'English': Language(name='English',
iso_code='en',
use_ascii=True,
charsets=['ISO-8859-1', 'WINDOWS-1252'],
wiki_start_pages=[u'Main_Page']),
'Esperanto': Language(name='Esperanto',
iso_code='eo',
# Q, W, X, and Y not used at all
use_ascii=False,
charsets=['ISO-8859-3'],
alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
'Spanish': Language(name='Spanish',
iso_code='es',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
wiki_start_pages=[u'Wikipedia:Portada']),
'Estonian': Language(name='Estonian',
iso_code='et',
use_ascii=False,
charsets=['ISO-8859-4', 'ISO-8859-13',
'WINDOWS-1257'],
# C, F, Š, Q, W, X, Y, Z, Ž are only for
# loanwords
alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
u'abdeghijklmnoprstuvõäöü'),
wiki_start_pages=[u'Esileht']),
'Finnish': Language(name='Finnish',
iso_code='fi',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'ÅÄÖŠŽåäöšž',
wiki_start_pages=[u'Wikipedia:Etusivu']),
'French': Language(name='French',
iso_code='fr',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
wiki_start_pages=[u'Wikipédia:Accueil_principal',
u'Bœuf (animal)']),
'Hebrew': Language(name='Hebrew',
iso_code='he',
use_ascii=False,
charsets=['ISO-8859-8', 'WINDOWS-1255'],
alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
wiki_start_pages=[u'עמוד_ראשי']),
'Croatian': Language(name='Croatian',
iso_code='hr',
# Q, W, X, Y are only used for foreign words.
use_ascii=False,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
wiki_start_pages=[u'Glavna_stranica']),
'Hungarian': Language(name='Hungarian',
iso_code='hu',
# Q, W, X, Y are only used for foreign words.
use_ascii=False,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
wiki_start_pages=[u'Kezdőlap']),
'Italian': Language(name='Italian',
iso_code='it',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
wiki_start_pages=[u'Pagina_principale']),
'Lithuanian': Language(name='Lithuanian',
iso_code='lt',
use_ascii=False,
charsets=['ISO-8859-13', 'WINDOWS-1257',
'ISO-8859-4'],
# Q, W, and X not used at all
alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
wiki_start_pages=[u'Pagrindinis_puslapis']),
'Latvian': Language(name='Latvian',
iso_code='lv',
use_ascii=False,
charsets=['ISO-8859-13', 'WINDOWS-1257',
'ISO-8859-4'],
# Q, W, X, Y are only for loanwords
alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
wiki_start_pages=[u'Sākumlapa']),
'Macedonian': Language(name='Macedonian',
iso_code='mk',
use_ascii=False,
charsets=['ISO-8859-5', 'WINDOWS-1251',
'MacCyrillic', 'IBM855'],
alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
wiki_start_pages=[u'Главна_страница']),
'Dutch': Language(name='Dutch',
iso_code='nl',
use_ascii=True,
charsets=['ISO-8859-1', 'WINDOWS-1252'],
wiki_start_pages=[u'Hoofdpagina']),
'Polish': Language(name='Polish',
iso_code='pl',
# Q and X are only used for foreign words.
use_ascii=False,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
wiki_start_pages=[u'Wikipedia:Strona_główna']),
'Portuguese': Language(name='Portuguese',
iso_code='pt',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
wiki_start_pages=[u'Wikipédia:Página_principal']),
'Romanian': Language(name='Romanian',
iso_code='ro',
use_ascii=True,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=u'ăâîșțĂÂÎȘȚ',
wiki_start_pages=[u'Pagina_principală']),
'Russian': Language(name='Russian',
iso_code='ru',
use_ascii=False,
charsets=['ISO-8859-5', 'WINDOWS-1251',
'KOI8-R', 'MacCyrillic', 'IBM866',
'IBM855'],
alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
wiki_start_pages=[u'Заглавная_страница']),
'Slovak': Language(name='Slovak',
iso_code='sk',
use_ascii=True,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
wiki_start_pages=[u'Hlavná_stránka']),
'Slovene': Language(name='Slovene',
iso_code='sl',
# Q, W, X, Y are only used for foreign words.
use_ascii=False,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=(u'abcčdefghijklmnoprsštuvzž'
u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
wiki_start_pages=[u'Glavna_stran']),
# Serbian can be written in both Latin and Cyrillic, but there's no
# simple way to get the Latin alphabet pages from Wikipedia through
# the API, so for now we just support Cyrillic.
'Serbian': Language(name='Serbian',
iso_code='sr',
alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
u'абвгдђежзијклљмнњопрстћуфхцчџш'),
charsets=['ISO-8859-5', 'WINDOWS-1251',
'MacCyrillic', 'IBM855'],
wiki_start_pages=[u'Главна_страна']),
'Thai': Language(name='Thai',
iso_code='th',
use_ascii=False,
charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
wiki_start_pages=[u'หน้าหลัก']),
'Turkish': Language(name='Turkish',
iso_code='tr',
# Q, W, and X are not used by Turkish
use_ascii=False,
charsets=['ISO-8859-3', 'ISO-8859-9',
'WINDOWS-1254'],
alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
wiki_start_pages=[u'Ana_Sayfa']),
'Vietnamese': Language(name='Vietnamese',
iso_code='vi',
use_ascii=False,
# Windows-1258 is the only common 8-bit
# Vietnamese encoding supported by Python.
# From Wikipedia:
# For systems that lack support for Unicode,
# dozens of 8-bit Vietnamese code pages are
# available.[1] The most common are VISCII
# (TCVN 5712:1993), VPS, and Windows-1258.[3]
# Where ASCII is required, such as when
# ensuring readability in plain text e-mail,
# Vietnamese letters are often encoded
# according to Vietnamese Quoted-Readable
# (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
# though usage of either variable-width
# scheme has declined dramatically following
# the adoption of Unicode on the World Wide
# Web.
charsets=['WINDOWS-1258'],
alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
wiki_start_pages=[u'Chữ_Quốc_ngữ']),
}
LANGUAGES = {
"Arabic": Language(
name="Arabic",
iso_code="ar",
use_ascii=False,
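# We only support encodings that use isolated
# forms, because the current recommendation is
# that the rendering system handles presentation
# forms. This means we purposefully skip IBM864.
# NOTE(review): "CP864" in the list below is Python's
# codec name for IBM864, which contradicts the sentence
# above -- confirm whether it is meant to be listed.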
charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
wiki_start_pages=["الصفحة_الرئيسية"],
),
"Belarusian": Language(
name="Belarusian",
iso_code="be",
use_ascii=False,
charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
wiki_start_pages=["Галоўная_старонка"],
),
"Bulgarian": Language(
name="Bulgarian",
iso_code="bg",
use_ascii=False,
charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
wiki_start_pages=["Начална_страница"],
),
"Czech": Language(
name="Czech",
iso_code="cz",
use_ascii=True,
charsets=["ISO-8859-2", "WINDOWS-1250"],
alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
wiki_start_pages=["Hlavní_strana"],
),
"Danish": Language(
name="Danish",
iso_code="da",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="æøåÆØÅ",
wiki_start_pages=["Forside"],
),
"German": Language(
name="German",
iso_code="de",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="äöüßẞÄÖÜ",
wiki_start_pages=["Wikipedia:Hauptseite"],
),
"Greek": Language(
name="Greek",
iso_code="el",
use_ascii=False,
charsets=["ISO-8859-7", "WINDOWS-1253"],
alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
wiki_start_pages=["Πύλη:Κύρια"],
),
"English": Language(
name="English",
iso_code="en",
use_ascii=True,
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
wiki_start_pages=["Main_Page"],
),
"Esperanto": Language(
name="Esperanto",
iso_code="eo",
# Q, W, X, and Y not used at all
use_ascii=False,
charsets=["ISO-8859-3"],
alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
),
"Spanish": Language(
name="Spanish",
iso_code="es",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
wiki_start_pages=["Wikipedia:Portada"],
),
"Estonian": Language(
name="Estonian",
iso_code="et",
use_ascii=False,
charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
# C, F, Š, Q, W, X, Y, Z, Ž are only for
# loanwords
alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
wiki_start_pages=["Esileht"],
),
"Finnish": Language(
name="Finnish",
iso_code="fi",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="ÅÄÖŠŽåäöšž",
wiki_start_pages=["Wikipedia:Etusivu"],
),
"French": Language(
name="French",
iso_code="fr",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
),
"Hebrew": Language(
name="Hebrew",
iso_code="he",
use_ascii=False,
charsets=["ISO-8859-8", "WINDOWS-1255"],
alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
wiki_start_pages=["עמוד_ראשי"],
),
"Croatian": Language(
name="Croatian",
iso_code="hr",
# Q, W, X, Y are only used for foreign words.
use_ascii=False,
charsets=["ISO-8859-2", "WINDOWS-1250"],
alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
wiki_start_pages=["Glavna_stranica"],
),
"Hungarian": Language(
name="Hungarian",
iso_code="hu",
# Q, W, X, Y are only used for foreign words.
use_ascii=False,
charsets=["ISO-8859-2", "WINDOWS-1250"],
alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
wiki_start_pages=["Kezdőlap"],
),
"Italian": Language(
name="Italian",
iso_code="it",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="ÀÈÉÌÒÓÙàèéìòóù",
wiki_start_pages=["Pagina_principale"],
),
"Lithuanian": Language(
name="Lithuanian",
iso_code="lt",
use_ascii=False,
charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
# Q, W, and X not used at all
alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
wiki_start_pages=["Pagrindinis_puslapis"],
),
"Latvian": Language(
name="Latvian",
iso_code="lv",
use_ascii=False,
charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
# Q, W, X, Y are only for loanwords
alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
wiki_start_pages=["Sākumlapa"],
),
"Macedonian": Language(
name="Macedonian",
iso_code="mk",
use_ascii=False,
charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
wiki_start_pages=["Главна_страница"],
),
"Dutch": Language(
name="Dutch",
iso_code="nl",
use_ascii=True,
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
wiki_start_pages=["Hoofdpagina"],
),
"Polish": Language(
name="Polish",
iso_code="pl",
# Q and X are only used for foreign words.
use_ascii=False,
charsets=["ISO-8859-2", "WINDOWS-1250"],
alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
wiki_start_pages=["Wikipedia:Strona_główna"],
),
"Portuguese": Language(
name="Portuguese",
iso_code="pt",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
wiki_start_pages=["Wikipédia:Página_principal"],
),
"Romanian": Language(
name="Romanian",
iso_code="ro",
use_ascii=True,
charsets=["ISO-8859-2", "WINDOWS-1250"],
alphabet="ăâîșțĂÂÎȘȚ",
wiki_start_pages=["Pagina_principală"],
),
"Russian": Language(
name="Russian",
iso_code="ru",
use_ascii=False,
charsets=[
"ISO-8859-5",
"WINDOWS-1251",
"KOI8-R",
"MacCyrillic",
"IBM866",
"IBM855",
],
alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
wiki_start_pages=["Заглавная_страница"],
),
"Slovak": Language(
name="Slovak",
iso_code="sk",
use_ascii=True,
charsets=["ISO-8859-2", "WINDOWS-1250"],
alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
wiki_start_pages=["Hlavná_stránka"],
),
"Slovene": Language(
name="Slovene",
iso_code="sl",
# Q, W, X, Y are only used for foreign words.
use_ascii=False,
charsets=["ISO-8859-2", "WINDOWS-1250"],
alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
wiki_start_pages=["Glavna_stran"],
),
# Serbian can be written in both Latin and Cyrillic, but there's no
# simple way to get the Latin alphabet pages from Wikipedia through
# the API, so for now we just support Cyrillic.
"Serbian": Language(
name="Serbian",
iso_code="sr",
alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
wiki_start_pages=["Главна_страна"],
),
"Thai": Language(
name="Thai",
iso_code="th",
use_ascii=False,
charsets=["ISO-8859-11", "TIS-620", "CP874"],
alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
wiki_start_pages=["หน้าหลัก"],
),
"Turkish": Language(
name="Turkish",
iso_code="tr",
# Q, W, and X are not used by Turkish
use_ascii=False,
charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
wiki_start_pages=["Ana_Sayfa"],
),
"Vietnamese": Language(
name="Vietnamese",
iso_code="vi",
use_ascii=False,
# Windows-1258 is the only common 8-bit
# Vietnamese encoding supported by Python.
# From Wikipedia:
# For systems that lack support for Unicode,
# dozens of 8-bit Vietnamese code pages are
# available.[1] The most common are VISCII
# (TCVN 5712:1993), VPS, and Windows-1258.[3]
# Where ASCII is required, such as when
# ensuring readability in plain text e-mail,
# Vietnamese letters are often encoded
# according to Vietnamese Quoted-Readable
# (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
# though usage of either variable-width
# scheme has declined dramatically following
# the adoption of Unicode on the World Wide
# Web.
charsets=["WINDOWS-1258"],
alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
wiki_start_pages=["Chữ_Quốc_ngữ"],
),
}

16
lib/chardet/resultdict.py Normal file
View file

@ -0,0 +1,16 @@
from typing import TYPE_CHECKING, Optional
if not TYPE_CHECKING:
    # At runtime a detection result is an ordinary ``dict``; the typed
    # variant below is only ever seen by static type checkers.
    ResultDict = dict
else:
    # TypedDict was introduced in Python 3.8.
    #
    # TODO: Remove the runtime ``dict`` fallback and the TYPE_CHECKING check
    # when dropping support for Python 3.7.
    from typing import TypedDict

    class ResultDict(TypedDict):
        """Static description of the keys carried by a detection result."""

        encoding: Optional[str]
        confidence: float
        language: Optional[str]

View file

@ -26,70 +26,77 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from collections import namedtuple
from typing import Dict, List, NamedTuple, Optional, Union
from .charsetprober import CharSetProber
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
SingleByteCharSetModel = namedtuple('SingleByteCharSetModel',
['charset_name',
'language',
'char_to_order_map',
'language_model',
'typical_positive_ratio',
'keep_ascii_letters',
'alphabet'])
class SingleByteCharSetModel(NamedTuple):
charset_name: str
language: str
char_to_order_map: Dict[int, int]
language_model: Dict[int, Dict[int, int]]
typical_positive_ratio: float
keep_ascii_letters: bool
alphabet: str
class SingleByteCharSetProber(CharSetProber):
SAMPLE_SIZE = 64
SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
POSITIVE_SHORTCUT_THRESHOLD = 0.95
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
def __init__(self, model, reversed=False, name_prober=None):
super(SingleByteCharSetProber, self).__init__()
def __init__(
self,
model: SingleByteCharSetModel,
is_reversed: bool = False,
name_prober: Optional[CharSetProber] = None,
) -> None:
super().__init__()
self._model = model
# TRUE if we need to reverse every pair in the model lookup
self._reversed = reversed
self._reversed = is_reversed
# Optional auxiliary prober for name decision
self._name_prober = name_prober
self._last_order = None
self._seq_counters = None
self._total_seqs = None
self._total_char = None
self._freq_char = None
self._last_order = 255
self._seq_counters: List[int] = []
self._total_seqs = 0
self._total_char = 0
self._control_char = 0
self._freq_char = 0
self.reset()
def reset(self):
super(SingleByteCharSetProber, self).reset()
def reset(self) -> None:
super().reset()
# char order of last character
self._last_order = 255
self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
self._total_seqs = 0
self._total_char = 0
self._control_char = 0
# characters that fall in our sampling range
self._freq_char = 0
@property
def charset_name(self):
def charset_name(self) -> Optional[str]:
if self._name_prober:
return self._name_prober.charset_name
else:
return self._model.charset_name
return self._model.charset_name
@property
def language(self):
def language(self) -> Optional[str]:
if self._name_prober:
return self._name_prober.language
else:
return self._model.language
return self._model.language
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
# TODO: Make filter_international_words keep things in self.alphabet
if not self._model.keep_ascii_letters:
byte_str = self.filter_international_words(byte_str)
else:
byte_str = self.remove_xml_tags(byte_str)
if not byte_str:
return self.state
char_to_order_map = self._model.char_to_order_map
@ -103,9 +110,6 @@ class SingleByteCharSetProber(CharSetProber):
# _total_char purposes.
if order < CharacterCategory.CONTROL:
self._total_char += 1
# TODO: Follow uchardet's lead and discount confidence for frequent
# control characters.
# See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
if order < self.SAMPLE_SIZE:
self._freq_char += 1
if self._last_order < self.SAMPLE_SIZE:
@ -122,23 +126,36 @@ class SingleByteCharSetProber(CharSetProber):
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
confidence = self.get_confidence()
if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
self.logger.debug('%s confidence = %s, we have a winner',
charset_name, confidence)
self.logger.debug(
"%s confidence = %s, we have a winner", charset_name, confidence
)
self._state = ProbingState.FOUND_IT
elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
self.logger.debug('%s confidence = %s, below negative '
'shortcut threshhold %s', charset_name,
confidence,
self.NEGATIVE_SHORTCUT_THRESHOLD)
self.logger.debug(
"%s confidence = %s, below negative shortcut threshold %s",
charset_name,
confidence,
self.NEGATIVE_SHORTCUT_THRESHOLD,
)
self._state = ProbingState.NOT_ME
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
r = 0.01
if self._total_seqs > 0:
r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
self._total_seqs / self._model.typical_positive_ratio)
r = (
(
self._seq_counters[SequenceLikelihood.POSITIVE]
+ 0.25 * self._seq_counters[SequenceLikelihood.LIKELY]
)
/ self._total_seqs
/ self._model.typical_positive_ratio
)
# The more control characters (proportionally to the size
# of the text), the less confident we become in the current
# charset.
r = r * (self._total_char - self._control_char) / self._total_char
r = r * self._freq_char / self._total_char
if r >= 1.0:
r = 0.99

View file

@ -28,33 +28,38 @@
from .charsetgroupprober import CharSetGroupProber
from .hebrewprober import HebrewProber
from .langbulgarianmodel import (ISO_8859_5_BULGARIAN_MODEL,
WINDOWS_1251_BULGARIAN_MODEL)
from .langbulgarianmodel import ISO_8859_5_BULGARIAN_MODEL, WINDOWS_1251_BULGARIAN_MODEL
from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL
from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
# from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL,
# WINDOWS_1250_HUNGARIAN_MODEL)
from .langrussianmodel import (IBM855_RUSSIAN_MODEL, IBM866_RUSSIAN_MODEL,
ISO_8859_5_RUSSIAN_MODEL, KOI8_R_RUSSIAN_MODEL,
MACCYRILLIC_RUSSIAN_MODEL,
WINDOWS_1251_RUSSIAN_MODEL)
from .langrussianmodel import (
IBM855_RUSSIAN_MODEL,
IBM866_RUSSIAN_MODEL,
ISO_8859_5_RUSSIAN_MODEL,
KOI8_R_RUSSIAN_MODEL,
MACCYRILLIC_RUSSIAN_MODEL,
WINDOWS_1251_RUSSIAN_MODEL,
)
from .langthaimodel import TIS_620_THAI_MODEL
from .langturkishmodel import ISO_8859_9_TURKISH_MODEL
from .sbcharsetprober import SingleByteCharSetProber
class SBCSGroupProber(CharSetGroupProber):
def __init__(self):
super(SBCSGroupProber, self).__init__()
def __init__(self) -> None:
super().__init__()
hebrew_prober = HebrewProber()
logical_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
False, hebrew_prober)
logical_hebrew_prober = SingleByteCharSetProber(
WINDOWS_1255_HEBREW_MODEL, is_reversed=False, name_prober=hebrew_prober
)
# TODO: See if using ISO-8859-8 Hebrew model works better here, since
# it's actually the visual one
visual_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
True, hebrew_prober)
hebrew_prober.set_model_probers(logical_hebrew_prober,
visual_hebrew_prober)
visual_hebrew_prober = SingleByteCharSetProber(
WINDOWS_1255_HEBREW_MODEL, is_reversed=True, name_prober=hebrew_prober
)
hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)
# TODO: ORDER MATTERS HERE. I changed the order vs what was in master
# and several tests failed that did not before. Some thought
# should be put into the ordering, and we should consider making

View file

@ -25,68 +25,81 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from typing import Union
from .chardistribution import SJISDistributionAnalysis
from .codingstatemachine import CodingStateMachine
from .enums import MachineState, ProbingState
from .jpcntx import SJISContextAnalysis
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import SJIS_SM_MODEL
from .enums import ProbingState, MachineState
class SJISProber(MultiByteCharSetProber):
def __init__(self):
super(SJISProber, self).__init__()
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
self.distribution_analyzer = SJISDistributionAnalysis()
self.context_analyzer = SJISContextAnalysis()
self.reset()
def reset(self):
super(SJISProber, self).reset()
def reset(self) -> None:
super().reset()
self.context_analyzer.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return self.context_analyzer.charset_name
@property
def language(self):
def language(self) -> str:
return "Japanese"
def feed(self, byte_str):
for i in range(len(byte_str)):
coding_state = self.coding_sm.next_state(byte_str[i])
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
assert self.coding_sm is not None
assert self.distribution_analyzer is not None
for i, byte in enumerate(byte_str):
coding_state = self.coding_sm.next_state(byte)
if coding_state == MachineState.ERROR:
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self.logger.debug(
"%s %s prober hit error at byte %s",
self.charset_name,
self.language,
i,
)
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.ITS_ME:
if coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif coding_state == MachineState.START:
if coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen()
if i == 0:
self._last_char[1] = byte_str[0]
self.context_analyzer.feed(self._last_char[2 - char_len:],
char_len)
self._last_char[1] = byte
self.context_analyzer.feed(
self._last_char[2 - char_len :], char_len
)
self.distribution_analyzer.feed(self._last_char, char_len)
else:
self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3
- char_len], char_len)
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
self.context_analyzer.feed(
byte_str[i + 1 - char_len : i + 3 - char_len], char_len
)
self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
self._last_char[0] = byte_str[-1]
if self.state == ProbingState.DETECTING:
if (self.context_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
if self.context_analyzer.got_enough_data() and (
self.get_confidence() > self.SHORTCUT_THRESHOLD
):
self._state = ProbingState.FOUND_IT
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
assert self.distribution_analyzer is not None
context_conf = self.context_analyzer.get_confidence()
distrib_conf = self.distribution_analyzer.get_confidence()
return max(context_conf, distrib_conf)

View file

@ -39,16 +39,21 @@ class a user of ``chardet`` should use.
import codecs
import logging
import re
from typing import List, Optional, Union
from .charsetgroupprober import CharSetGroupProber
from .charsetprober import CharSetProber
from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober
from .macromanprober import MacRomanProber
from .mbcsgroupprober import MBCSGroupProber
from .resultdict import ResultDict
from .sbcsgroupprober import SBCSGroupProber
from .utf1632prober import UTF1632Prober
class UniversalDetector(object):
class UniversalDetector:
"""
The ``UniversalDetector`` class underlies the ``chardet.detect`` function
and coordinates all of the different charset probers.
@ -66,49 +71,87 @@ class UniversalDetector(object):
"""
MINIMUM_THRESHOLD = 0.20
HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
ESC_DETECTOR = re.compile(b'(\033|~{)')
WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
'iso-8859-2': 'Windows-1250',
'iso-8859-5': 'Windows-1251',
'iso-8859-6': 'Windows-1256',
'iso-8859-7': 'Windows-1253',
'iso-8859-8': 'Windows-1255',
'iso-8859-9': 'Windows-1254',
'iso-8859-13': 'Windows-1257'}
HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xFF]")
ESC_DETECTOR = re.compile(b"(\033|~{)")
WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9F]")
ISO_WIN_MAP = {
"iso-8859-1": "Windows-1252",
"iso-8859-2": "Windows-1250",
"iso-8859-5": "Windows-1251",
"iso-8859-6": "Windows-1256",
"iso-8859-7": "Windows-1253",
"iso-8859-8": "Windows-1255",
"iso-8859-9": "Windows-1254",
"iso-8859-13": "Windows-1257",
}
# Based on https://encoding.spec.whatwg.org/#names-and-labels
# but altered to match Python names for encodings and remove mappings
# that break tests.
LEGACY_MAP = {
"ascii": "Windows-1252",
"iso-8859-1": "Windows-1252",
"tis-620": "ISO-8859-11",
"iso-8859-9": "Windows-1254",
"gb2312": "GB18030",
"euc-kr": "CP949",
"utf-16le": "UTF-16",
}
def __init__(self, lang_filter=LanguageFilter.ALL):
self._esc_charset_prober = None
self._charset_probers = []
self.result = None
self.done = None
self._got_data = None
self._input_state = None
self._last_char = None
def __init__(
self,
lang_filter: LanguageFilter = LanguageFilter.ALL,
should_rename_legacy: bool = False,
) -> None:
self._esc_charset_prober: Optional[EscCharSetProber] = None
self._utf1632_prober: Optional[UTF1632Prober] = None
self._charset_probers: List[CharSetProber] = []
self.result: ResultDict = {
"encoding": None,
"confidence": 0.0,
"language": None,
}
self.done = False
self._got_data = False
self._input_state = InputState.PURE_ASCII
self._last_char = b""
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
self._has_win_bytes = None
self._has_win_bytes = False
self.should_rename_legacy = should_rename_legacy
self.reset()
def reset(self):
@property
def input_state(self) -> int:
return self._input_state
@property
def has_win_bytes(self) -> bool:
return self._has_win_bytes
@property
def charset_probers(self) -> List[CharSetProber]:
return self._charset_probers
def reset(self) -> None:
"""
Reset the UniversalDetector and all of its probers back to their
initial states. This is called by ``__init__``, so you only need to
call this directly in between analyses of different documents.
"""
self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
self.result = {"encoding": None, "confidence": 0.0, "language": None}
self.done = False
self._got_data = False
self._has_win_bytes = False
self._input_state = InputState.PURE_ASCII
self._last_char = b''
self._last_char = b""
if self._esc_charset_prober:
self._esc_charset_prober.reset()
if self._utf1632_prober:
self._utf1632_prober.reset()
for prober in self._charset_probers:
prober.reset()
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> None:
"""
Takes a chunk of a document and feeds it through all of the relevant
charset probers.
@ -125,7 +168,7 @@ class UniversalDetector(object):
if self.done:
return
if not len(byte_str):
if not byte_str:
return
if not isinstance(byte_str, bytearray):
@ -136,35 +179,38 @@ class UniversalDetector(object):
# If the data starts with BOM, we know it is UTF
if byte_str.startswith(codecs.BOM_UTF8):
# EF BB BF UTF-8 with BOM
self.result = {'encoding': "UTF-8-SIG",
'confidence': 1.0,
'language': ''}
elif byte_str.startswith((codecs.BOM_UTF32_LE,
codecs.BOM_UTF32_BE)):
self.result = {
"encoding": "UTF-8-SIG",
"confidence": 1.0,
"language": "",
}
elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
# FF FE 00 00 UTF-32, little-endian BOM
# 00 00 FE FF UTF-32, big-endian BOM
self.result = {'encoding': "UTF-32",
'confidence': 1.0,
'language': ''}
elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
'confidence': 1.0,
'language': ''}
elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
self.result = {
# TODO: This encoding is not supported by Python. Should remove?
"encoding": "X-ISO-10646-UCS-4-3412",
"confidence": 1.0,
"language": "",
}
elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
'confidence': 1.0,
'language': ''}
self.result = {
# TODO: This encoding is not supported by Python. Should remove?
"encoding": "X-ISO-10646-UCS-4-2143",
"confidence": 1.0,
"language": "",
}
elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
# FF FE UTF-16, little endian BOM
# FE FF UTF-16, big endian BOM
self.result = {'encoding': "UTF-16",
'confidence': 1.0,
'language': ''}
self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""}
self._got_data = True
if self.result['encoding'] is not None:
if self.result["encoding"] is not None:
self.done = True
return
@ -173,12 +219,29 @@ class UniversalDetector(object):
if self._input_state == InputState.PURE_ASCII:
if self.HIGH_BYTE_DETECTOR.search(byte_str):
self._input_state = InputState.HIGH_BYTE
elif self._input_state == InputState.PURE_ASCII and \
self.ESC_DETECTOR.search(self._last_char + byte_str):
elif (
self._input_state == InputState.PURE_ASCII
and self.ESC_DETECTOR.search(self._last_char + byte_str)
):
self._input_state = InputState.ESC_ASCII
self._last_char = byte_str[-1:]
# Next we will look to see if it appears to be either a UTF-16 or
# UTF-32 encoding
if not self._utf1632_prober:
self._utf1632_prober = UTF1632Prober()
if self._utf1632_prober.state == ProbingState.DETECTING:
if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
self.result = {
"encoding": self._utf1632_prober.charset_name,
"confidence": self._utf1632_prober.get_confidence(),
"language": "",
}
self.done = True
return
# If we've seen escape sequences, use the EscCharSetProber, which
# uses a simple state machine to check for known escape sequences in
# HZ and ISO-2022 encodings, since those are the only encodings that
@ -187,12 +250,11 @@ class UniversalDetector(object):
if not self._esc_charset_prober:
self._esc_charset_prober = EscCharSetProber(self.lang_filter)
if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
self.result = {'encoding':
self._esc_charset_prober.charset_name,
'confidence':
self._esc_charset_prober.get_confidence(),
'language':
self._esc_charset_prober.language}
self.result = {
"encoding": self._esc_charset_prober.charset_name,
"confidence": self._esc_charset_prober.get_confidence(),
"language": self._esc_charset_prober.language,
}
self.done = True
# If we've seen high bytes (i.e., those with values greater than 127),
# we need to do more complicated checks using all our multi-byte and
@ -207,17 +269,20 @@ class UniversalDetector(object):
if self.lang_filter & LanguageFilter.NON_CJK:
self._charset_probers.append(SBCSGroupProber())
self._charset_probers.append(Latin1Prober())
self._charset_probers.append(MacRomanProber())
for prober in self._charset_probers:
if prober.feed(byte_str) == ProbingState.FOUND_IT:
self.result = {'encoding': prober.charset_name,
'confidence': prober.get_confidence(),
'language': prober.language}
self.result = {
"encoding": prober.charset_name,
"confidence": prober.get_confidence(),
"language": prober.language,
}
self.done = True
break
if self.WIN_BYTE_DETECTOR.search(byte_str):
self._has_win_bytes = True
def close(self):
def close(self) -> ResultDict:
"""
Stop analyzing the current document and come up with a final
prediction.
@ -231,13 +296,11 @@ class UniversalDetector(object):
self.done = True
if not self._got_data:
self.logger.debug('no data received!')
self.logger.debug("no data received!")
# Default to ASCII if it is all we've seen so far
elif self._input_state == InputState.PURE_ASCII:
self.result = {'encoding': 'ascii',
'confidence': 1.0,
'language': ''}
self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}
# If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
elif self._input_state == InputState.HIGH_BYTE:
@ -253,34 +316,47 @@ class UniversalDetector(object):
max_prober = prober
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
charset_name = max_prober.charset_name
lower_charset_name = max_prober.charset_name.lower()
assert charset_name is not None
lower_charset_name = charset_name.lower()
confidence = max_prober.get_confidence()
# Use Windows encoding name instead of ISO-8859 if we saw any
# extra Windows-specific bytes
if lower_charset_name.startswith('iso-8859'):
if lower_charset_name.startswith("iso-8859"):
if self._has_win_bytes:
charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
charset_name)
self.result = {'encoding': charset_name,
'confidence': confidence,
'language': max_prober.language}
charset_name = self.ISO_WIN_MAP.get(
lower_charset_name, charset_name
)
# Rename legacy encodings with superset encodings if asked
if self.should_rename_legacy:
charset_name = self.LEGACY_MAP.get(
(charset_name or "").lower(), charset_name
)
self.result = {
"encoding": charset_name,
"confidence": confidence,
"language": max_prober.language,
}
# Log all prober confidences if none met MINIMUM_THRESHOLD
if self.logger.getEffectiveLevel() <= logging.DEBUG:
if self.result['encoding'] is None:
self.logger.debug('no probers hit minimum threshold')
if self.result["encoding"] is None:
self.logger.debug("no probers hit minimum threshold")
for group_prober in self._charset_probers:
if not group_prober:
continue
if isinstance(group_prober, CharSetGroupProber):
for prober in group_prober.probers:
self.logger.debug('%s %s confidence = %s',
prober.charset_name,
prober.language,
prober.get_confidence())
self.logger.debug(
"%s %s confidence = %s",
prober.charset_name,
prober.language,
prober.get_confidence(),
)
else:
self.logger.debug('%s %s confidence = %s',
prober.charset_name,
prober.language,
prober.get_confidence())
self.logger.debug(
"%s %s confidence = %s",
group_prober.charset_name,
group_prober.language,
group_prober.get_confidence(),
)
return self.result

View file

@ -0,0 +1,225 @@
######################## BEGIN LICENSE BLOCK ########################
#
# Contributor(s):
# Jason Zavaglia
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import List, Union
from .charsetprober import CharSetProber
from .enums import ProbingState
class UTF1632Prober(CharSetProber):
r"""
This class simply looks for occurrences of zero bytes, and infers
whether the file is UTF16 or UTF32 (little-endian or big-endian).
For instance, files looking like ( \0 \0 \0 [nonzero] )+
have a good probability to be UTF32BE. Files looking like ( \0 [nonzero] )+
may be guessed to be UTF16BE, and inversely for little-endian varieties.
"""
# how many logical characters to scan before feeling confident of prediction
MIN_CHARS_FOR_DETECTION = 20
# a fixed constant ratio of expected zeros or non-zeros in modulo-position.
EXPECTED_RATIO = 0.94
def __init__(self) -> None:
    """Create the prober with every counter and flag at its starting value.

    All attributes are assigned here first and ``reset()`` is called last,
    which assigns the same starting values again.
    """
    super().__init__()
    # Running position counter; approx_32bit_chars()/approx_16bit_chars()
    # divide it by 4 and 2 respectively to estimate character counts.
    self.position = 0
    # Four-slot tallies of zero / non-zero bytes, indexed 0..3.
    self.zeros_at_mod = [0, 0, 0, 0]
    self.nonzeros_at_mod = [0, 0, 0, 0]
    self._state = ProbingState.DETECTING
    # Four-element work buffer (its use is outside this method).
    self.quad = [0] * 4
    # Flags consulted by the is_likely_* checks to rule an encoding out.
    self.invalid_utf16be = self.invalid_utf16le = False
    self.invalid_utf32be = self.invalid_utf32le = False
    self.first_half_surrogate_pair_detected_16be = False
    self.first_half_surrogate_pair_detected_16le = False
    self.reset()
def reset(self) -> None:
    """Return the prober to its starting state so it can be reused.

    Calls the parent ``reset()`` first, then zeroes the position counter and
    both byte tallies, puts the state back to ``DETECTING``, clears every
    invalid/surrogate flag and empties the four-element ``quad`` buffer.
    """
    super().reset()
    self.position = 0
    self.zeros_at_mod = [0, 0, 0, 0]
    self.nonzeros_at_mod = [0, 0, 0, 0]
    self._state = ProbingState.DETECTING
    self.invalid_utf16be = self.invalid_utf16le = False
    self.invalid_utf32be = self.invalid_utf32le = False
    self.first_half_surrogate_pair_detected_16be = False
    self.first_half_surrogate_pair_detected_16le = False
    self.quad = [0] * 4
@property
def charset_name(self) -> str:
if self.is_likely_utf32be():
return "utf-32be"
if self.is_likely_utf32le():
return "utf-32le"
if self.is_likely_utf16be():
return "utf-16be"
if self.is_likely_utf16le():
return "utf-16le"
# default to something valid
return "utf-16"
@property
def language(self) -> str:
return ""
def approx_32bit_chars(self) -> float:
return max(1.0, self.position / 4.0)
def approx_16bit_chars(self) -> float:
return max(1.0, self.position / 2.0)
def is_likely_utf32be(self) -> bool:
approx_chars = self.approx_32bit_chars()
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
and self.nonzeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
and not self.invalid_utf32be
)
def is_likely_utf32le(self) -> bool:
approx_chars = self.approx_32bit_chars()
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
and self.zeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
and not self.invalid_utf32le
)
def is_likely_utf16be(self) -> bool:
approx_chars = self.approx_16bit_chars()
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
(self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
> self.EXPECTED_RATIO
and (self.zeros_at_mod[0] + self.zeros_at_mod[2]) / approx_chars
> self.EXPECTED_RATIO
and not self.invalid_utf16be
)
def is_likely_utf16le(self) -> bool:
approx_chars = self.approx_16bit_chars()
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
(self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
> self.EXPECTED_RATIO
and (self.zeros_at_mod[1] + self.zeros_at_mod[3]) / approx_chars
> self.EXPECTED_RATIO
and not self.invalid_utf16le
)
def validate_utf32_characters(self, quad: List[int]) -> None:
"""
Validate if the quad of bytes is valid UTF-32.
UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
excluding 0x0000D800 - 0x0000DFFF
https://en.wikipedia.org/wiki/UTF-32
"""
if (
quad[0] != 0
or quad[1] > 0x10
or (quad[0] == 0 and quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF)
):
self.invalid_utf32be = True
if (
quad[3] != 0
or quad[2] > 0x10
or (quad[3] == 0 and quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF)
):
self.invalid_utf32le = True
def validate_utf16_characters(self, pair: List[int]) -> None:
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
if not self.first_half_surrogate_pair_detected_16be:
if 0xD8 <= pair[0] <= 0xDB:
self.first_half_surrogate_pair_detected_16be = True
elif 0xDC <= pair[0] <= 0xDF:
self.invalid_utf16be = True
else:
if 0xDC <= pair[0] <= 0xDF:
self.first_half_surrogate_pair_detected_16be = False
else:
self.invalid_utf16be = True
if not self.first_half_surrogate_pair_detected_16le:
if 0xD8 <= pair[1] <= 0xDB:
self.first_half_surrogate_pair_detected_16le = True
elif 0xDC <= pair[1] <= 0xDF:
self.invalid_utf16le = True
else:
if 0xDC <= pair[1] <= 0xDF:
self.first_half_surrogate_pair_detected_16le = False
else:
self.invalid_utf16le = True
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
for c in byte_str:
mod4 = self.position % 4
self.quad[mod4] = c
if mod4 == 3:
self.validate_utf32_characters(self.quad)
self.validate_utf16_characters(self.quad[0:2])
self.validate_utf16_characters(self.quad[2:4])
if c == 0:
self.zeros_at_mod[mod4] += 1
else:
self.nonzeros_at_mod[mod4] += 1
self.position += 1
return self.state
@property
def state(self) -> ProbingState:
if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
# terminal, decided states
return self._state
if self.get_confidence() > 0.80:
self._state = ProbingState.FOUND_IT
elif self.position > 4 * 1024:
# if we get to 4kb into the file, and we can't conclude it's UTF,
# let's give up
self._state = ProbingState.NOT_ME
return self._state
def get_confidence(self) -> float:
return (
0.85
if (
self.is_likely_utf16le()
or self.is_likely_utf16be()
or self.is_likely_utf32le()
or self.is_likely_utf32be()
)
else 0.00
)

View file

@ -25,45 +25,46 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber
from .enums import ProbingState, MachineState
from .codingstatemachine import CodingStateMachine
from .mbcssm import UTF8_SM_MODEL
from typing import Union
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .enums import MachineState, ProbingState
from .mbcssm import UTF8_SM_MODEL
class UTF8Prober(CharSetProber):
ONE_CHAR_PROB = 0.5
def __init__(self):
super(UTF8Prober, self).__init__()
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
self._num_mb_chars = None
self._num_mb_chars = 0
self.reset()
def reset(self):
super(UTF8Prober, self).reset()
def reset(self) -> None:
super().reset()
self.coding_sm.reset()
self._num_mb_chars = 0
@property
def charset_name(self):
def charset_name(self) -> str:
return "utf-8"
@property
def language(self):
def language(self) -> str:
return ""
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
for c in byte_str:
coding_state = self.coding_sm.next_state(c)
if coding_state == MachineState.ERROR:
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.ITS_ME:
if coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif coding_state == MachineState.START:
if coding_state == MachineState.START:
if self.coding_sm.get_current_charlen() >= 2:
self._num_mb_chars += 1
@ -73,10 +74,9 @@ class UTF8Prober(CharSetProber):
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
unlike = 0.99
if self._num_mb_chars < 6:
unlike *= self.ONE_CHAR_PROB ** self._num_mb_chars
unlike *= self.ONE_CHAR_PROB**self._num_mb_chars
return 1.0 - unlike
else:
return unlike
return unlike

View file

@ -1,9 +1,9 @@
"""
This module exists only to simplify retrieving the version number of chardet
from within setup.py and from chardet subpackages.
from within setuptools and from chardet subpackages.
:author: Dan Blanchard (dan.blanchard@gmail.com)
"""
__version__ = "4.0.0"
VERSION = __version__.split('.')
__version__ = "5.1.0"
VERSION = __version__.split(".")