From ec0205fc470e2f8935d0ed4ab9c1208416ab453a Mon Sep 17 00:00:00 2001 From: JackDandy Date: Fri, 13 Jan 2023 23:33:43 +0000 Subject: [PATCH] =?UTF-8?q?Update=20chardet=20packages=204.0.0=20(b3d867a)?= =?UTF-8?q?=20=E2=86=92=205.1.0=20(8087f00).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGES.md | 1 + lib/chardet/__init__.py | 101 +- lib/chardet/big5freq.py | 6 +- lib/chardet/big5prober.py | 12 +- lib/chardet/chardistribution.py | 144 +- lib/chardet/charsetgroupprober.py | 46 +- lib/chardet/charsetprober.py | 74 +- lib/chardet/cli/__init__.py | 1 - lib/chardet/cli/chardetect.py | 87 +- lib/chardet/codingstatemachine.py | 32 +- lib/chardet/codingstatemachinedict.py | 19 + lib/chardet/compat.py | 36 - lib/chardet/cp949prober.py | 8 +- lib/chardet/enums.py | 23 +- lib/chardet/escprober.py | 45 +- lib/chardet/escsm.py | 385 +-- lib/chardet/eucjpprober.py | 60 +- lib/chardet/euckrfreq.py | 3 +- lib/chardet/euckrprober.py | 12 +- lib/chardet/euctwfreq.py | 677 +++--- lib/chardet/euctwprober.py | 13 +- lib/chardet/gb2312freq.py | 3 +- lib/chardet/gb2312prober.py | 13 +- lib/chardet/hebrewprober.py | 110 +- lib/chardet/jisfreq.py | 4 +- lib/chardet/johabfreq.py | 2382 +++++++++++++++++++ lib/chardet/johabprober.py | 47 + lib/chardet/jpcntx.py | 219 +- lib/chardet/langbulgarianmodel.py | 1061 +++++---- lib/chardet/langgreekmodel.py | 1061 +++++---- lib/chardet/langhebrewmodel.py | 533 +++-- lib/chardet/langhungarianmodel.py | 1061 +++++---- lib/chardet/langrussianmodel.py | 3173 +++++++++++++------------ lib/chardet/langthaimodel.py | 533 +++-- lib/chardet/langturkishmodel.py | 533 +++-- lib/chardet/latin1prober.py | 44 +- lib/chardet/macromanprober.py | 162 ++ lib/chardet/mbcharsetprober.py | 62 +- lib/chardet/mbcsgroupprober.py | 23 +- lib/chardet/mbcssm.py | 817 ++++--- lib/chardet/metadata/languages.py | 576 ++--- lib/chardet/resultdict.py | 16 + lib/chardet/sbcharsetprober.py | 95 +- lib/chardet/sbcsgroupprober.py | 33 +- lib/chardet/sjisprober.py | 65 +- lib/chardet/universaldetector.py | 234 +- lib/chardet/utf1632prober.py | 225 ++ lib/chardet/utf8prober.py | 36 +- lib/chardet/version.py | 6 +- 49 files changed, 9067 insertions(+), 5845 deletions(-) create mode 100644 lib/chardet/codingstatemachinedict.py delete mode 100644 lib/chardet/compat.py create mode 100644 lib/chardet/johabfreq.py create mode 100644 lib/chardet/johabprober.py create mode 100644 lib/chardet/macromanprober.py create mode 100644 lib/chardet/resultdict.py create mode 100644 lib/chardet/utf1632prober.py diff --git a/CHANGES.md b/CHANGES.md index 5341e936..7913370e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,7 @@ * Remove lockfile no longer used by cachecontrol * Update Msgpack 1.0.0 (fa7d744) to 1.0.4 (b5acfd5) * Update certifi 2022.09.24 to 2022.12.07 +* Update chardet packages 4.0.0 (b3d867a) to 5.1.0 (8087f00) * Update diskcache 5.1.0 (40ce0de) to 5.4.0 (1cb1425) * Update feedparser 6.0.1 (98d189fa) to 6.0.10 (5fcb3ae) * Update humanize 3.5.0 (b6b0ea5) to 4.0.0 (a1514eb) diff --git a/lib/chardet/__init__.py b/lib/chardet/__init__.py index 9a469957..2d0d8b03 100644 --- a/lib/chardet/__init__.py +++ b/lib/chardet/__init__.py @@ -15,68 +15,101 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Union -from .universaldetector import UniversalDetector +from .charsetgroupprober import CharSetGroupProber +from .charsetprober import CharSetProber from .enums import InputState -from .version import __version__, VERSION +from .resultdict import ResultDict +from .universaldetector import UniversalDetector +from .version import VERSION, __version__ + +__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"] -__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION'] - - -def detect(byte_str): +def detect( + byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False +) -> ResultDict: """ Detect the encoding of the given byte string. :param byte_str: The byte sequence to examine. :type byte_str: ``bytes`` or ``bytearray`` + :param should_rename_legacy: Should we rename legacy encodings + to their more modern equivalents? + :type should_rename_legacy: ``bool`` """ if not isinstance(byte_str, bytearray): if not isinstance(byte_str, bytes): - raise TypeError('Expected object of type bytes or bytearray, got: ' - '{0}'.format(type(byte_str))) - else: - byte_str = bytearray(byte_str) - detector = UniversalDetector() + raise TypeError( + f"Expected object of type bytes or bytearray, got: {type(byte_str)}" + ) + byte_str = bytearray(byte_str) + detector = UniversalDetector(should_rename_legacy=should_rename_legacy) detector.feed(byte_str) return detector.close() -def detect_all(byte_str): +def detect_all( + byte_str: Union[bytes, bytearray], + ignore_threshold: bool = False, + should_rename_legacy: bool = False, +) -> List[ResultDict]: """ Detect all the possible encodings of the given byte string. - :param byte_str: The byte sequence to examine. - :type byte_str: ``bytes`` or ``bytearray`` + :param byte_str: The byte sequence to examine. + :type byte_str: ``bytes`` or ``bytearray`` + :param ignore_threshold: Include encodings that are below + ``UniversalDetector.MINIMUM_THRESHOLD`` + in results. + :type ignore_threshold: ``bool`` + :param should_rename_legacy: Should we rename legacy encodings + to their more modern equivalents? + :type should_rename_legacy: ``bool`` """ if not isinstance(byte_str, bytearray): if not isinstance(byte_str, bytes): - raise TypeError('Expected object of type bytes or bytearray, got: ' - '{0}'.format(type(byte_str))) - else: - byte_str = bytearray(byte_str) + raise TypeError( + f"Expected object of type bytes or bytearray, got: {type(byte_str)}" + ) + byte_str = bytearray(byte_str) - detector = UniversalDetector() + detector = UniversalDetector(should_rename_legacy=should_rename_legacy) detector.feed(byte_str) detector.close() - if detector._input_state == InputState.HIGH_BYTE: - results = [] - for prober in detector._charset_probers: - if prober.get_confidence() > detector.MINIMUM_THRESHOLD: - charset_name = prober.charset_name - lower_charset_name = prober.charset_name.lower() + if detector.input_state == InputState.HIGH_BYTE: + results: List[ResultDict] = [] + probers: List[CharSetProber] = [] + for prober in detector.charset_probers: + if isinstance(prober, CharSetGroupProber): + probers.extend(p for p in prober.probers) + else: + probers.append(prober) + for prober in probers: + if ignore_threshold or prober.get_confidence() > detector.MINIMUM_THRESHOLD: + charset_name = prober.charset_name or "" + lower_charset_name = charset_name.lower() # Use Windows encoding name instead of ISO-8859 if we saw any # extra Windows-specific bytes - if lower_charset_name.startswith('iso-8859'): - if detector._has_win_bytes: - charset_name = detector.ISO_WIN_MAP.get(lower_charset_name, - charset_name) - results.append({ - 'encoding': charset_name, - 'confidence': prober.get_confidence() - }) + if lower_charset_name.startswith("iso-8859") and detector.has_win_bytes: + charset_name = detector.ISO_WIN_MAP.get( + lower_charset_name, charset_name + ) + # Rename legacy encodings with superset encodings if asked + if should_rename_legacy: + charset_name = detector.LEGACY_MAP.get( + charset_name.lower(), charset_name + ) + results.append( + { + "encoding": charset_name, + "confidence": prober.get_confidence(), + "language": prober.language, + } + ) if len(results) > 0: - return sorted(results, key=lambda result: -result['confidence']) + return sorted(results, key=lambda result: -result["confidence"]) return [detector.result] diff --git a/lib/chardet/big5freq.py b/lib/chardet/big5freq.py index 88023ae4..272ce851 100644 --- a/lib/chardet/big5freq.py +++ b/lib/chardet/big5freq.py @@ -42,9 +42,9 @@ BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75 -#Char to FreqOrder table +# Char to FreqOrder table BIG5_TABLE_SIZE = 5376 - +# fmt: off BIG5_CHAR_TO_FREQ_ORDER = ( 1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16 3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32 @@ -383,4 +383,4 @@ BIG5_CHAR_TO_FREQ_ORDER = ( 890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, # 5360 2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, # 5376 ) - +# fmt: on diff --git a/lib/chardet/big5prober.py b/lib/chardet/big5prober.py index 5b1227a5..8608ec46 100644 --- a/lib/chardet/big5prober.py +++ b/lib/chardet/big5prober.py @@ -25,23 +25,23 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from .mbcharsetprober import MultiByteCharSetProber -from .codingstatemachine import CodingStateMachine from .chardistribution import Big5DistributionAnalysis +from .codingstatemachine import CodingStateMachine +from .mbcharsetprober import MultiByteCharSetProber from .mbcssm import BIG5_SM_MODEL class Big5Prober(MultiByteCharSetProber): - def __init__(self): - super(Big5Prober, self).__init__() + def __init__(self) -> None: + super().__init__() self.coding_sm = CodingStateMachine(BIG5_SM_MODEL) self.distribution_analyzer = Big5DistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "Big5" @property - def language(self): + def language(self) -> str: return "Chinese" diff --git a/lib/chardet/chardistribution.py b/lib/chardet/chardistribution.py index e5509a01..7ec21e4d 100644 --- a/lib/chardet/chardistribution.py +++ b/lib/chardet/chardistribution.py @@ -25,40 +25,58 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from .euctwfreq import (EUCTW_CHAR_TO_FREQ_ORDER, EUCTW_TABLE_SIZE, - EUCTW_TYPICAL_DISTRIBUTION_RATIO) -from .euckrfreq import (EUCKR_CHAR_TO_FREQ_ORDER, EUCKR_TABLE_SIZE, - EUCKR_TYPICAL_DISTRIBUTION_RATIO) -from .gb2312freq import (GB2312_CHAR_TO_FREQ_ORDER, GB2312_TABLE_SIZE, - GB2312_TYPICAL_DISTRIBUTION_RATIO) -from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE, - BIG5_TYPICAL_DISTRIBUTION_RATIO) -from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE, - JIS_TYPICAL_DISTRIBUTION_RATIO) +from typing import Tuple, Union + +from .big5freq import ( + BIG5_CHAR_TO_FREQ_ORDER, + BIG5_TABLE_SIZE, + BIG5_TYPICAL_DISTRIBUTION_RATIO, +) +from .euckrfreq import ( + EUCKR_CHAR_TO_FREQ_ORDER, + EUCKR_TABLE_SIZE, + EUCKR_TYPICAL_DISTRIBUTION_RATIO, +) +from .euctwfreq import ( + EUCTW_CHAR_TO_FREQ_ORDER, + EUCTW_TABLE_SIZE, + EUCTW_TYPICAL_DISTRIBUTION_RATIO, +) +from .gb2312freq import ( + GB2312_CHAR_TO_FREQ_ORDER, + GB2312_TABLE_SIZE, + GB2312_TYPICAL_DISTRIBUTION_RATIO, +) +from .jisfreq import ( + JIS_CHAR_TO_FREQ_ORDER, + JIS_TABLE_SIZE, + JIS_TYPICAL_DISTRIBUTION_RATIO, +) +from .johabfreq import JOHAB_TO_EUCKR_ORDER_TABLE -class CharDistributionAnalysis(object): +class CharDistributionAnalysis: ENOUGH_DATA_THRESHOLD = 1024 SURE_YES = 0.99 SURE_NO = 0.01 MINIMUM_DATA_THRESHOLD = 3 - def __init__(self): + def __init__(self) -> None: # Mapping table to get frequency order from char order (get from # GetOrder()) - self._char_to_freq_order = None - self._table_size = None # Size of above table + self._char_to_freq_order: Tuple[int, ...] = tuple() + self._table_size = 0 # Size of above table # This is a constant value which varies from language to language, # used in calculating confidence. See # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html # for further detail. - self.typical_distribution_ratio = None - self._done = None - self._total_chars = None - self._freq_chars = None + self.typical_distribution_ratio = 0.0 + self._done = False + self._total_chars = 0 + self._freq_chars = 0 self.reset() - def reset(self): + def reset(self) -> None: """reset analyser, clear any state""" # If this flag is set to True, detection is done and conclusion has # been made @@ -67,7 +85,7 @@ class CharDistributionAnalysis(object): # The number of characters whose frequency order is less than 512 self._freq_chars = 0 - def feed(self, char, char_len): + def feed(self, char: Union[bytes, bytearray], char_len: int) -> None: """feed a character with known length""" if char_len == 2: # we only care about 2-bytes character in our distribution analysis @@ -81,7 +99,7 @@ class CharDistributionAnalysis(object): if 512 > self._char_to_freq_order[order]: self._freq_chars += 1 - def get_confidence(self): + def get_confidence(self) -> float: """return confidence based on existing data""" # if we didn't receive any character in our consideration range, # return negative answer @@ -89,20 +107,21 @@ class CharDistributionAnalysis(object): return self.SURE_NO if self._total_chars != self._freq_chars: - r = (self._freq_chars / ((self._total_chars - self._freq_chars) - * self.typical_distribution_ratio)) + r = self._freq_chars / ( + (self._total_chars - self._freq_chars) * self.typical_distribution_ratio + ) if r < self.SURE_YES: return r # normalize confidence (we don't want to be 100% sure) return self.SURE_YES - def got_enough_data(self): + def got_enough_data(self) -> bool: # It is not necessary to receive all data to draw conclusion. # For charset detection, certain amount of data is enough return self._total_chars > self.ENOUGH_DATA_THRESHOLD - def get_order(self, byte_str): + def get_order(self, _: Union[bytes, bytearray]) -> int: # We do not handle characters based on the original encoding string, # but convert this encoding string to a number, here called order. # This allows multiple encodings of a language to share one frequency @@ -111,13 +130,13 @@ class CharDistributionAnalysis(object): class EUCTWDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): - super(EUCTWDistributionAnalysis, self).__init__() + def __init__(self) -> None: + super().__init__() self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER self._table_size = EUCTW_TABLE_SIZE self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for euc-TW encoding, we are interested # first byte range: 0xc4 -- 0xfe # second byte range: 0xa1 -- 0xfe @@ -125,18 +144,17 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis): first_char = byte_str[0] if first_char >= 0xC4: return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1 - else: - return -1 + return -1 class EUCKRDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): - super(EUCKRDistributionAnalysis, self).__init__() + def __init__(self) -> None: + super().__init__() self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER self._table_size = EUCKR_TABLE_SIZE self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for euc-KR encoding, we are interested # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe @@ -144,18 +162,32 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis): first_char = byte_str[0] if first_char >= 0xB0: return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1 - else: - return -1 + return -1 + + +class JOHABDistributionAnalysis(CharDistributionAnalysis): + def __init__(self) -> None: + super().__init__() + self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER + self._table_size = EUCKR_TABLE_SIZE + self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO + + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: + first_char = byte_str[0] + if 0x88 <= first_char < 0xD4: + code = first_char * 256 + byte_str[1] + return JOHAB_TO_EUCKR_ORDER_TABLE.get(code, -1) + return -1 class GB2312DistributionAnalysis(CharDistributionAnalysis): - def __init__(self): - super(GB2312DistributionAnalysis, self).__init__() + def __init__(self) -> None: + super().__init__() self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER self._table_size = GB2312_TABLE_SIZE self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for GB2312 encoding, we are interested # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe @@ -163,18 +195,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis): first_char, second_char = byte_str[0], byte_str[1] if (first_char >= 0xB0) and (second_char >= 0xA1): return 94 * (first_char - 0xB0) + second_char - 0xA1 - else: - return -1 + return -1 class Big5DistributionAnalysis(CharDistributionAnalysis): - def __init__(self): - super(Big5DistributionAnalysis, self).__init__() + def __init__(self) -> None: + super().__init__() self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER self._table_size = BIG5_TABLE_SIZE self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for big5 encoding, we are interested # first byte range: 0xa4 -- 0xfe # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe @@ -183,28 +214,26 @@ class Big5DistributionAnalysis(CharDistributionAnalysis): if first_char >= 0xA4: if second_char >= 0xA1: return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63 - else: - return 157 * (first_char - 0xA4) + second_char - 0x40 - else: - return -1 + return 157 * (first_char - 0xA4) + second_char - 0x40 + return -1 class SJISDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): - super(SJISDistributionAnalysis, self).__init__() + def __init__(self) -> None: + super().__init__() self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER self._table_size = JIS_TABLE_SIZE self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for sjis encoding, we are interested # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe # no validation needed here. State machine has done that first_char, second_char = byte_str[0], byte_str[1] - if (first_char >= 0x81) and (first_char <= 0x9F): + if 0x81 <= first_char <= 0x9F: order = 188 * (first_char - 0x81) - elif (first_char >= 0xE0) and (first_char <= 0xEF): + elif 0xE0 <= first_char <= 0xEF: order = 188 * (first_char - 0xE0 + 31) else: return -1 @@ -215,19 +244,18 @@ class SJISDistributionAnalysis(CharDistributionAnalysis): class EUCJPDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): - super(EUCJPDistributionAnalysis, self).__init__() + def __init__(self) -> None: + super().__init__() self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER self._table_size = JIS_TABLE_SIZE self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for euc-JP encoding, we are interested # first byte range: 0xa0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. State machine has done that char = byte_str[0] if char >= 0xA0: - return 94 * (char - 0xA1) + byte_str[1] - 0xa1 - else: - return -1 + return 94 * (char - 0xA1) + byte_str[1] - 0xA1 + return -1 diff --git a/lib/chardet/charsetgroupprober.py b/lib/chardet/charsetgroupprober.py index 1720ddc9..6b05c5f9 100644 --- a/lib/chardet/charsetgroupprober.py +++ b/lib/chardet/charsetgroupprober.py @@ -25,29 +25,30 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from .enums import ProbingState +from typing import List, Optional, Union + from .charsetprober import CharSetProber +from .enums import LanguageFilter, ProbingState class CharSetGroupProber(CharSetProber): - def __init__(self, lang_filter=None): - super(CharSetGroupProber, self).__init__(lang_filter=lang_filter) + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: + super().__init__(lang_filter=lang_filter) self._active_num = 0 - self.probers = [] - self._best_guess_prober = None + self.probers: List[CharSetProber] = [] + self._best_guess_prober: Optional[CharSetProber] = None - def reset(self): - super(CharSetGroupProber, self).reset() + def reset(self) -> None: + super().reset() self._active_num = 0 for prober in self.probers: - if prober: - prober.reset() - prober.active = True - self._active_num += 1 + prober.reset() + prober.active = True + self._active_num += 1 self._best_guess_prober = None @property - def charset_name(self): + def charset_name(self) -> Optional[str]: if not self._best_guess_prober: self.get_confidence() if not self._best_guess_prober: @@ -55,17 +56,15 @@ class CharSetGroupProber(CharSetProber): return self._best_guess_prober.charset_name @property - def language(self): + def language(self) -> Optional[str]: if not self._best_guess_prober: self.get_confidence() if not self._best_guess_prober: return None return self._best_guess_prober.language - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: for prober in self.probers: - if not prober: - continue if not prober.active: continue state = prober.feed(byte_str) @@ -73,8 +72,9 @@ class CharSetGroupProber(CharSetProber): continue if state == ProbingState.FOUND_IT: self._best_guess_prober = prober + self._state = ProbingState.FOUND_IT return self.state - elif state == ProbingState.NOT_ME: + if state == ProbingState.NOT_ME: prober.active = False self._active_num -= 1 if self._active_num <= 0: @@ -82,22 +82,22 @@ class CharSetGroupProber(CharSetProber): return self.state return self.state - def get_confidence(self): + def get_confidence(self) -> float: state = self.state if state == ProbingState.FOUND_IT: return 0.99 - elif state == ProbingState.NOT_ME: + if state == ProbingState.NOT_ME: return 0.01 best_conf = 0.0 self._best_guess_prober = None for prober in self.probers: - if not prober: - continue if not prober.active: - self.logger.debug('%s not active', prober.charset_name) + self.logger.debug("%s not active", prober.charset_name) continue conf = prober.get_confidence() - self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf) + self.logger.debug( + "%s %s confidence = %s", prober.charset_name, prober.language, conf + ) if best_conf < conf: best_conf = conf self._best_guess_prober = prober diff --git a/lib/chardet/charsetprober.py b/lib/chardet/charsetprober.py index 1fc27464..dd053239 100644 --- a/lib/chardet/charsetprober.py +++ b/lib/chardet/charsetprober.py @@ -28,54 +28,62 @@ import logging import re +from typing import Optional, Union -from .enums import ProbingState +from .enums import LanguageFilter, ProbingState + +INTERNATIONAL_WORDS_PATTERN = re.compile( + b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?" +) -class CharSetProber(object): +class CharSetProber: SHORTCUT_THRESHOLD = 0.95 - def __init__(self, lang_filter=None): - self._state = None + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: + self._state = ProbingState.DETECTING + self.active = True self.lang_filter = lang_filter self.logger = logging.getLogger(__name__) - def reset(self): + def reset(self) -> None: self._state = ProbingState.DETECTING @property - def charset_name(self): + def charset_name(self) -> Optional[str]: return None - def feed(self, buf): - pass + @property + def language(self) -> Optional[str]: + raise NotImplementedError + + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + raise NotImplementedError @property - def state(self): + def state(self) -> ProbingState: return self._state - def get_confidence(self): + def get_confidence(self) -> float: return 0.0 @staticmethod - def filter_high_byte_only(buf): - buf = re.sub(b'([\x00-\x7F])+', b' ', buf) + def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes: + buf = re.sub(b"([\x00-\x7F])+", b" ", buf) return buf @staticmethod - def filter_international_words(buf): + def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray: """ We define three types of bytes: alphabet: english alphabets [a-zA-Z] international: international characters [\x80-\xFF] marker: everything else [^a-zA-Z\x80-\xFF] - The input buffer can be thought to contain a series of words delimited by markers. This function works to filter all words that contain at least one international character. All contiguous sequences of markers are replaced by a single space ascii character. - This filter applies to all scripts which do not use English characters. """ filtered = bytearray() @@ -83,8 +91,7 @@ class CharSetProber(object): # This regex expression filters out only words that have at-least one # international character. The word may include one marker character at # the end. - words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?', - buf) + words = INTERNATIONAL_WORDS_PATTERN.findall(buf) for word in words: filtered.extend(word[:-1]) @@ -94,20 +101,17 @@ class CharSetProber(object): # similarly across all languages and may thus have similar # frequencies). last_char = word[-1:] - if not last_char.isalpha() and last_char < b'\x80': - last_char = b' ' + if not last_char.isalpha() and last_char < b"\x80": + last_char = b" " filtered.extend(last_char) return filtered @staticmethod - def filter_with_english_letters(buf): + def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes: """ Returns a copy of ``buf`` that retains only the sequences of English alphabet and high byte characters that are not between <> characters. - Also retains English alphabet and high byte characters immediately - before occurrences of >. - This filter can be applied to all scripts which contain both English characters and extended ASCII characters, but is currently only used by ``Latin1Prober``. @@ -115,26 +119,24 @@ class CharSetProber(object): filtered = bytearray() in_tag = False prev = 0 + buf = memoryview(buf).cast("c") - for curr in range(len(buf)): - # Slice here to get bytes instead of an int with Python 3 - buf_char = buf[curr:curr + 1] - # Check if we're coming out of or entering an HTML tag - if buf_char == b'>': + for curr, buf_char in enumerate(buf): + # Check if we're coming out of or entering an XML tag + + # https://github.com/python/typeshed/issues/8182 + if buf_char == b">": # type: ignore[comparison-overlap] + prev = curr + 1 in_tag = False - elif buf_char == b'<': - in_tag = True - - # If current character is not extended-ASCII and not alphabetic... - if buf_char < b'\x80' and not buf_char.isalpha(): - # ...and we're not in a tag + # https://github.com/python/typeshed/issues/8182 + elif buf_char == b"<": # type: ignore[comparison-overlap] if curr > prev and not in_tag: # Keep everything after last non-extended-ASCII, # non-alphabetic character filtered.extend(buf[prev:curr]) # Output a space to delimit stretch we kept - filtered.extend(b' ') - prev = curr + 1 + filtered.extend(b" ") + in_tag = True # If we're not in a tag... if not in_tag: diff --git a/lib/chardet/cli/__init__.py b/lib/chardet/cli/__init__.py index d3f5a12f..e69de29b 100644 --- a/lib/chardet/cli/__init__.py +++ b/lib/chardet/cli/__init__.py @@ -1 +0,0 @@ - diff --git a/lib/chardet/cli/chardetect.py b/lib/chardet/cli/chardetect.py index e5f86fe9..ad709821 100644 --- a/lib/chardet/cli/chardetect.py +++ b/lib/chardet/cli/chardetect.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """ Script which takes one or more file paths and reports on their detected encodings @@ -13,17 +12,21 @@ If no paths are provided, it takes its input from stdin. """ -from __future__ import absolute_import, print_function, unicode_literals import argparse import sys +from typing import Iterable, List, Optional -from chardet import __version__ -from chardet.compat import PY2 -from chardet.universaldetector import UniversalDetector +from .. import __version__ +from ..universaldetector import UniversalDetector -def description_of(lines, name='stdin'): +def description_of( + lines: Iterable[bytes], + name: str = "stdin", + minimal: bool = False, + should_rename_legacy: bool = False, +) -> Optional[str]: """ Return a string describing the probable encoding of a file or list of strings. @@ -32,8 +35,11 @@ def description_of(lines, name='stdin'): :type lines: Iterable of bytes :param name: Name of file or collection of lines :type name: str + :param should_rename_legacy: Should we rename legacy encodings to + their more modern equivalents? + :type should_rename_legacy: ``bool`` """ - u = UniversalDetector() + u = UniversalDetector(should_rename_legacy=should_rename_legacy) for line in lines: line = bytearray(line) u.feed(line) @@ -42,16 +48,14 @@ def description_of(lines, name='stdin'): break u.close() result = u.result - if PY2: - name = name.decode(sys.getfilesystemencoding(), 'ignore') - if result['encoding']: - return '{0}: {1} with confidence {2}'.format(name, result['encoding'], - result['confidence']) - else: - return '{0}: no result'.format(name) + if minimal: + return result["encoding"] + if result["encoding"]: + return f'{name}: {result["encoding"]} with confidence {result["confidence"]}' + return f"{name}: no result" -def main(argv=None): +def main(argv: Optional[List[str]] = None) -> None: """ Handles command line arguments and gets things started. @@ -61,25 +65,48 @@ def main(argv=None): """ # Get command line arguments parser = argparse.ArgumentParser( - description="Takes one or more file paths and reports their detected \ - encodings") - parser.add_argument('input', - help='File whose encoding we would like to determine. \ - (default: stdin)', - type=argparse.FileType('rb'), nargs='*', - default=[sys.stdin if PY2 else sys.stdin.buffer]) - parser.add_argument('--version', action='version', - version='%(prog)s {0}'.format(__version__)) + description=( + "Takes one or more file paths and reports their detected encodings" + ) + ) + parser.add_argument( + "input", + help="File whose encoding we would like to determine. (default: stdin)", + type=argparse.FileType("rb"), + nargs="*", + default=[sys.stdin.buffer], + ) + parser.add_argument( + "--minimal", + help="Print only the encoding to standard output", + action="store_true", + ) + parser.add_argument( + "-l", + "--legacy", + help="Rename legacy encodings to more modern ones.", + action="store_true", + ) + parser.add_argument( + "--version", action="version", version=f"%(prog)s {__version__}" + ) args = parser.parse_args(argv) for f in args.input: if f.isatty(): - print("You are running chardetect interactively. Press " + - "CTRL-D twice at the start of a blank line to signal the " + - "end of your input. If you want help, run chardetect " + - "--help\n", file=sys.stderr) - print(description_of(f, f.name)) + print( + "You are running chardetect interactively. Press " + "CTRL-D twice at the start of a blank line to signal the " + "end of your input. If you want help, run chardetect " + "--help\n", + file=sys.stderr, + ) + print( + description_of( + f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy + ) + ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/lib/chardet/codingstatemachine.py b/lib/chardet/codingstatemachine.py index c562e1dc..a90a5871 100644 --- a/lib/chardet/codingstatemachine.py +++ b/lib/chardet/codingstatemachine.py @@ -27,10 +27,11 @@ import logging +from .codingstatemachinedict import CodingStateMachineDict from .enums import MachineState -class CodingStateMachine(object): +class CodingStateMachine: """ A state machine to verify a byte sequence for a particular encoding. For each byte the detector receives, it will feed that byte to every active @@ -52,37 +53,38 @@ class CodingStateMachine(object): negative answer for this encoding. Detector will exclude this encoding from consideration from here on. """ - def __init__(self, sm): + + def __init__(self, sm: CodingStateMachineDict) -> None: self._model = sm self._curr_byte_pos = 0 self._curr_char_len = 0 - self._curr_state = None + self._curr_state = MachineState.START + self.active = True self.logger = logging.getLogger(__name__) self.reset() - def reset(self): + def reset(self) -> None: self._curr_state = MachineState.START - def next_state(self, c): + def next_state(self, c: int) -> int: # for each byte we get its class # if it is first byte, we also get byte length - byte_class = self._model['class_table'][c] + byte_class = self._model["class_table"][c] if self._curr_state == MachineState.START: self._curr_byte_pos = 0 - self._curr_char_len = self._model['char_len_table'][byte_class] + self._curr_char_len = self._model["char_len_table"][byte_class] # from byte's class and state_table, we get its next state - curr_state = (self._curr_state * self._model['class_factor'] - + byte_class) - self._curr_state = self._model['state_table'][curr_state] + curr_state = self._curr_state * self._model["class_factor"] + byte_class + self._curr_state = self._model["state_table"][curr_state] self._curr_byte_pos += 1 return self._curr_state - def get_current_charlen(self): + def get_current_charlen(self) -> int: return self._curr_char_len - def get_coding_state_machine(self): - return self._model['name'] + def get_coding_state_machine(self) -> str: + return self._model["name"] @property - def language(self): - return self._model['language'] + def language(self) -> str: + return self._model["language"] diff --git a/lib/chardet/codingstatemachinedict.py b/lib/chardet/codingstatemachinedict.py new file mode 100644 index 00000000..a744d7aa --- /dev/null +++ b/lib/chardet/codingstatemachinedict.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING, Tuple + +if TYPE_CHECKING: + # TypedDict was introduced in Python 3.8. + # + # TODO: Remove the else block and TYPE_CHECKING check when dropping support + # for Python 3.7. + from typing import TypedDict + + class CodingStateMachineDict(TypedDict, total=False): + class_table: Tuple[int, ...] + class_factor: int + state_table: Tuple[int, ...] + char_len_table: Tuple[int, ...] + name: str + language: str # Optional key + +else: + CodingStateMachineDict = dict diff --git a/lib/chardet/compat.py b/lib/chardet/compat.py deleted file mode 100644 index ee3f619a..00000000 --- a/lib/chardet/compat.py +++ /dev/null @@ -1,36 +0,0 @@ -######################## BEGIN LICENSE BLOCK ######################## -# Contributor(s): -# Dan Blanchard -# Ian Cordasco -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA -# 02110-1301 USA -######################### END LICENSE BLOCK ######################### - -import sys - - -if sys.version_info < (3, 0): - PY2 = True - PY3 = False - string_types = (str, unicode) - text_type = unicode - iteritems = dict.iteritems -else: - PY2 = False - PY3 = True - string_types = (bytes, str) - text_type = str - iteritems = dict.items diff --git a/lib/chardet/cp949prober.py b/lib/chardet/cp949prober.py index de0ceab0..2cc71502 100644 --- a/lib/chardet/cp949prober.py +++ b/lib/chardet/cp949prober.py @@ -32,8 +32,8 @@ from .mbcssm import CP949_SM_MODEL class CP949Prober(MultiByteCharSetProber): - def __init__(self): - super(CP949Prober, self).__init__() + def __init__(self) -> None: + super().__init__() self.coding_sm = CodingStateMachine(CP949_SM_MODEL) # NOTE: CP949 is a superset of EUC-KR, so the distribution should be # not different. @@ -41,9 +41,9 @@ class CP949Prober(MultiByteCharSetProber): self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "CP949" @property - def language(self): + def language(self) -> str: return "Korean" diff --git a/lib/chardet/enums.py b/lib/chardet/enums.py index c8e60013..089793b7 100644 --- a/lib/chardet/enums.py +++ b/lib/chardet/enums.py @@ -4,21 +4,26 @@ All of the Enums that are used throughout the chardet package. :author: Dan Blanchard (dan.blanchard@gmail.com) """ +from enum import Enum, Flag -class InputState(object): + +class InputState: """ This enum represents the different states a universal detector can be in. """ + PURE_ASCII = 0 ESC_ASCII = 1 HIGH_BYTE = 2 -class LanguageFilter(object): +class LanguageFilter(Flag): """ This enum represents the different language filters we can apply to a ``UniversalDetector``. """ + + NONE = 0x00 CHINESE_SIMPLIFIED = 0x01 CHINESE_TRADITIONAL = 0x02 JAPANESE = 0x04 @@ -29,46 +34,50 @@ class LanguageFilter(object): CJK = CHINESE | JAPANESE | KOREAN -class ProbingState(object): +class ProbingState(Enum): """ This enum represents the different states a prober can be in. """ + DETECTING = 0 FOUND_IT = 1 NOT_ME = 2 -class MachineState(object): +class MachineState: """ This enum represents the different states a state machine can be in. """ + START = 0 ERROR = 1 ITS_ME = 2 -class SequenceLikelihood(object): +class SequenceLikelihood: """ This enum represents the likelihood of a character following the previous one. """ + NEGATIVE = 0 UNLIKELY = 1 LIKELY = 2 POSITIVE = 3 @classmethod - def get_num_categories(cls): + def get_num_categories(cls) -> int: """:returns: The number of likelihood categories in the enum.""" return 4 -class CharacterCategory(object): +class CharacterCategory: """ This enum represents the different categories language models for ``SingleByteCharsetProber`` put characters into. Anything less than CONTROL is considered a letter. """ + UNDEFINED = 255 LINE_BREAK = 254 SYMBOL = 253 diff --git a/lib/chardet/escprober.py b/lib/chardet/escprober.py index c52060d0..cea4e8a5 100644 --- a/lib/chardet/escprober.py +++ b/lib/chardet/escprober.py @@ -25,11 +25,17 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Optional, Union + from .charsetprober import CharSetProber from .codingstatemachine import CodingStateMachine -from .enums import LanguageFilter, ProbingState, MachineState -from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL, - ISO2022KR_SM_MODEL) +from .enums import LanguageFilter, MachineState, ProbingState +from .escsm import ( + HZ_SM_MODEL, + ISO2022CN_SM_MODEL, + ISO2022JP_SM_MODEL, + ISO2022KR_SM_MODEL, +) class EscCharSetProber(CharSetProber): @@ -39,8 +45,8 @@ class EscCharSetProber(CharSetProber): identify these encodings. """ - def __init__(self, lang_filter=None): - super(EscCharSetProber, self).__init__(lang_filter=lang_filter) + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: + super().__init__(lang_filter=lang_filter) self.coding_sm = [] if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED: self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL)) @@ -49,17 +55,15 @@ class EscCharSetProber(CharSetProber): self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL)) if self.lang_filter & LanguageFilter.KOREAN: self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL)) - self.active_sm_count = None - self._detected_charset = None - self._detected_language = None - self._state = None + self.active_sm_count = 0 + self._detected_charset: Optional[str] = None + self._detected_language: Optional[str] = None + self._state = ProbingState.DETECTING self.reset() - def reset(self): - super(EscCharSetProber, self).reset() + def reset(self) -> None: + super().reset() for coding_sm in self.coding_sm: - if not coding_sm: - continue coding_sm.active = True coding_sm.reset() self.active_sm_count = len(self.coding_sm) @@ -67,23 +71,20 @@ class EscCharSetProber(CharSetProber): self._detected_language = None @property - def charset_name(self): + def charset_name(self) -> Optional[str]: return self._detected_charset @property - def language(self): + def language(self) -> Optional[str]: return self._detected_language - def get_confidence(self): - if self._detected_charset: - return 0.99 - else: - return 0.00 + def get_confidence(self) -> float: + return 0.99 if self._detected_charset else 0.00 - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: for c in byte_str: for coding_sm in self.coding_sm: - if not coding_sm or not coding_sm.active: + if not coding_sm.active: continue coding_state = coding_sm.next_state(c) if coding_state == MachineState.ERROR: diff --git a/lib/chardet/escsm.py b/lib/chardet/escsm.py index b8377045..4e9f6fff 100644 --- a/lib/chardet/escsm.py +++ b/lib/chardet/escsm.py @@ -12,7 +12,7 @@ # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. +# version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -20,227 +20,242 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from .codingstatemachinedict import CodingStateMachineDict from .enums import MachineState +# fmt: off HZ_CLS = ( -1,0,0,0,0,0,0,0, # 00 - 07 -0,0,0,0,0,0,0,0, # 08 - 0f -0,0,0,0,0,0,0,0, # 10 - 17 -0,0,0,1,0,0,0,0, # 18 - 1f -0,0,0,0,0,0,0,0, # 20 - 27 -0,0,0,0,0,0,0,0, # 28 - 2f -0,0,0,0,0,0,0,0, # 30 - 37 -0,0,0,0,0,0,0,0, # 38 - 3f -0,0,0,0,0,0,0,0, # 40 - 47 -0,0,0,0,0,0,0,0, # 48 - 4f -0,0,0,0,0,0,0,0, # 50 - 57 -0,0,0,0,0,0,0,0, # 58 - 5f -0,0,0,0,0,0,0,0, # 60 - 67 -0,0,0,0,0,0,0,0, # 68 - 6f -0,0,0,0,0,0,0,0, # 70 - 77 -0,0,0,4,0,5,2,0, # 78 - 7f -1,1,1,1,1,1,1,1, # 80 - 87 -1,1,1,1,1,1,1,1, # 88 - 8f -1,1,1,1,1,1,1,1, # 90 - 97 -1,1,1,1,1,1,1,1, # 98 - 9f -1,1,1,1,1,1,1,1, # a0 - a7 -1,1,1,1,1,1,1,1, # a8 - af -1,1,1,1,1,1,1,1, # b0 - b7 -1,1,1,1,1,1,1,1, # b8 - bf -1,1,1,1,1,1,1,1, # c0 - c7 -1,1,1,1,1,1,1,1, # c8 - cf -1,1,1,1,1,1,1,1, # d0 - d7 -1,1,1,1,1,1,1,1, # d8 - df -1,1,1,1,1,1,1,1, # e0 - e7 -1,1,1,1,1,1,1,1, # e8 - ef -1,1,1,1,1,1,1,1, # f0 - f7 -1,1,1,1,1,1,1,1, # f8 - ff + 1, 0, 0, 0, 0, 0, 0, 0, # 00 - 07 + 0, 0, 0, 0, 0, 0, 0, 0, # 08 - 0f + 0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17 + 0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f + 0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27 + 0, 0, 0, 0, 0, 0, 0, 0, # 28 - 2f + 0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37 + 0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f + 0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47 + 0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f + 0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57 + 0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f + 0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67 + 0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f + 0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77 + 0, 0, 0, 4, 0, 5, 2, 0, # 78 - 7f + 1, 1, 1, 1, 1, 1, 1, 1, # 80 - 87 + 1, 1, 1, 1, 1, 1, 1, 1, # 88 - 8f + 1, 1, 1, 1, 1, 1, 1, 1, # 90 - 97 + 1, 1, 1, 1, 1, 1, 1, 1, # 98 - 9f + 1, 1, 1, 1, 1, 1, 1, 1, # a0 - a7 + 1, 1, 1, 1, 1, 1, 1, 1, # a8 - af + 1, 1, 1, 1, 1, 1, 1, 1, # b0 - b7 + 1, 1, 1, 1, 1, 1, 1, 1, # b8 - bf + 1, 1, 1, 1, 1, 1, 1, 1, # c0 - c7 + 1, 1, 1, 1, 1, 1, 1, 1, # c8 - cf + 1, 1, 1, 1, 1, 1, 1, 1, # d0 - d7 + 1, 1, 1, 1, 1, 1, 1, 1, # d8 - df + 1, 1, 1, 1, 1, 1, 1, 1, # e0 - e7 + 1, 1, 1, 1, 1, 1, 1, 1, # e8 - ef + 1, 1, 1, 1, 1, 1, 1, 1, # f0 - f7 + 1, 1, 1, 1, 1, 1, 1, 1, # f8 - ff ) HZ_ST = ( -MachineState.START,MachineState.ERROR, 3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07 -MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f -MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START, 4,MachineState.ERROR,# 10-17 - 5,MachineState.ERROR, 6,MachineState.ERROR, 5, 5, 4,MachineState.ERROR,# 18-1f - 4,MachineState.ERROR, 4, 4, 4,MachineState.ERROR, 4,MachineState.ERROR,# 20-27 - 4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f +MachineState.START, MachineState.ERROR, 3, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, # 00-07 +MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 08-0f +MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, 4, MachineState.ERROR, # 10-17 + 5, MachineState.ERROR, 6, MachineState.ERROR, 5, 5, 4, MachineState.ERROR, # 18-1f + 4, MachineState.ERROR, 4, 4, 4, MachineState.ERROR, 4, MachineState.ERROR, # 20-27 + 4, MachineState.ITS_ME, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 28-2f ) +# fmt: on HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) -HZ_SM_MODEL = {'class_table': HZ_CLS, - 'class_factor': 6, - 'state_table': HZ_ST, - 'char_len_table': HZ_CHAR_LEN_TABLE, - 'name': "HZ-GB-2312", - 'language': 'Chinese'} +HZ_SM_MODEL: CodingStateMachineDict = { + "class_table": HZ_CLS, + "class_factor": 6, + "state_table": HZ_ST, + "char_len_table": HZ_CHAR_LEN_TABLE, + "name": "HZ-GB-2312", + "language": "Chinese", +} +# fmt: off ISO2022CN_CLS = ( -2,0,0,0,0,0,0,0, # 00 - 07 -0,0,0,0,0,0,0,0, # 08 - 0f -0,0,0,0,0,0,0,0, # 10 - 17 -0,0,0,1,0,0,0,0, # 18 - 1f -0,0,0,0,0,0,0,0, # 20 - 27 -0,3,0,0,0,0,0,0, # 28 - 2f -0,0,0,0,0,0,0,0, # 30 - 37 -0,0,0,0,0,0,0,0, # 38 - 3f -0,0,0,4,0,0,0,0, # 40 - 47 -0,0,0,0,0,0,0,0, # 48 - 4f -0,0,0,0,0,0,0,0, # 50 - 57 -0,0,0,0,0,0,0,0, # 58 - 5f -0,0,0,0,0,0,0,0, # 60 - 67 -0,0,0,0,0,0,0,0, # 68 - 6f -0,0,0,0,0,0,0,0, # 70 - 77 -0,0,0,0,0,0,0,0, # 78 - 7f -2,2,2,2,2,2,2,2, # 80 - 87 -2,2,2,2,2,2,2,2, # 88 - 8f -2,2,2,2,2,2,2,2, # 90 - 97 -2,2,2,2,2,2,2,2, # 98 - 9f -2,2,2,2,2,2,2,2, # a0 - a7 -2,2,2,2,2,2,2,2, # a8 - af -2,2,2,2,2,2,2,2, # b0 - b7 -2,2,2,2,2,2,2,2, # b8 - bf -2,2,2,2,2,2,2,2, # c0 - c7 -2,2,2,2,2,2,2,2, # c8 - cf -2,2,2,2,2,2,2,2, # d0 - d7 -2,2,2,2,2,2,2,2, # d8 - df -2,2,2,2,2,2,2,2, # e0 - e7 -2,2,2,2,2,2,2,2, # e8 - ef -2,2,2,2,2,2,2,2, # f0 - f7 -2,2,2,2,2,2,2,2, # f8 - ff + 2, 0, 0, 0, 0, 0, 0, 0, # 00 - 07 + 0, 0, 0, 0, 0, 0, 0, 0, # 08 - 0f + 0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17 + 0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f + 0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27 + 0, 3, 0, 0, 0, 0, 0, 0, # 28 - 2f + 0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37 + 0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f + 0, 0, 0, 4, 0, 0, 0, 0, # 40 - 47 + 0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f + 0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57 + 0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f + 0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67 + 0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f + 0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77 + 0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f + 2, 2, 2, 2, 2, 2, 2, 2, # 80 - 87 + 2, 2, 2, 2, 2, 2, 2, 2, # 88 - 8f + 2, 2, 2, 2, 2, 2, 2, 2, # 90 - 97 + 2, 2, 2, 2, 2, 2, 2, 2, # 98 - 9f + 2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7 + 2, 2, 2, 2, 2, 2, 2, 2, # a8 - af + 2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7 + 2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf + 2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7 + 2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf + 2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7 + 2, 2, 2, 2, 2, 2, 2, 2, # d8 - df + 2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7 + 2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef + 2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7 + 2, 2, 2, 2, 2, 2, 2, 2, # f8 - ff ) ISO2022CN_ST = ( -MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07 -MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f -MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17 -MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,# 18-1f -MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27 - 5, 6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f -MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37 -MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f + MachineState.START, 3, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 00-07 + MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 08-0f + MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 10-17 + MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, # 18-1f + MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 20-27 + 5, 6, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 28-2f + MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 30-37 + MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.START, # 38-3f ) +# fmt: on ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0) -ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS, - 'class_factor': 9, - 'state_table': ISO2022CN_ST, - 'char_len_table': ISO2022CN_CHAR_LEN_TABLE, - 'name': "ISO-2022-CN", - 'language': 'Chinese'} +ISO2022CN_SM_MODEL: CodingStateMachineDict = { + "class_table": ISO2022CN_CLS, + "class_factor": 9, + "state_table": ISO2022CN_ST, + "char_len_table": ISO2022CN_CHAR_LEN_TABLE, + "name": "ISO-2022-CN", + "language": "Chinese", +} +# fmt: off ISO2022JP_CLS = ( -2,0,0,0,0,0,0,0, # 00 - 07 -0,0,0,0,0,0,2,2, # 08 - 0f -0,0,0,0,0,0,0,0, # 10 - 17 -0,0,0,1,0,0,0,0, # 18 - 1f -0,0,0,0,7,0,0,0, # 20 - 27 -3,0,0,0,0,0,0,0, # 28 - 2f -0,0,0,0,0,0,0,0, # 30 - 37 -0,0,0,0,0,0,0,0, # 38 - 3f -6,0,4,0,8,0,0,0, # 40 - 47 -0,9,5,0,0,0,0,0, # 48 - 4f -0,0,0,0,0,0,0,0, # 50 - 57 -0,0,0,0,0,0,0,0, # 58 - 5f -0,0,0,0,0,0,0,0, # 60 - 67 -0,0,0,0,0,0,0,0, # 68 - 6f -0,0,0,0,0,0,0,0, # 70 - 77 -0,0,0,0,0,0,0,0, # 78 - 7f -2,2,2,2,2,2,2,2, # 80 - 87 -2,2,2,2,2,2,2,2, # 88 - 8f -2,2,2,2,2,2,2,2, # 90 - 97 -2,2,2,2,2,2,2,2, # 98 - 9f -2,2,2,2,2,2,2,2, # a0 - a7 -2,2,2,2,2,2,2,2, # a8 - af -2,2,2,2,2,2,2,2, # b0 - b7 -2,2,2,2,2,2,2,2, # b8 - bf -2,2,2,2,2,2,2,2, # c0 - c7 -2,2,2,2,2,2,2,2, # c8 - cf -2,2,2,2,2,2,2,2, # d0 - d7 -2,2,2,2,2,2,2,2, # d8 - df -2,2,2,2,2,2,2,2, # e0 - e7 -2,2,2,2,2,2,2,2, # e8 - ef -2,2,2,2,2,2,2,2, # f0 - f7 -2,2,2,2,2,2,2,2, # f8 - ff + 2, 0, 0, 0, 0, 0, 0, 0, # 00 - 07 + 0, 0, 0, 0, 0, 0, 2, 2, # 08 - 0f + 0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17 + 0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f + 0, 0, 0, 0, 7, 0, 0, 0, # 20 - 27 + 3, 0, 0, 0, 0, 0, 0, 0, # 28 - 2f + 0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37 + 0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f + 6, 0, 4, 0, 8, 0, 0, 0, # 40 - 47 + 0, 9, 5, 0, 0, 0, 0, 0, # 48 - 4f + 0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57 + 0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f + 0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67 + 0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f + 0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77 + 0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f + 2, 2, 2, 2, 2, 2, 2, 2, # 80 - 87 + 2, 2, 2, 2, 2, 2, 2, 2, # 88 - 8f + 2, 2, 2, 2, 2, 2, 2, 2, # 90 - 97 + 2, 2, 2, 2, 2, 2, 2, 2, # 98 - 9f + 2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7 + 2, 2, 2, 2, 2, 2, 2, 2, # a8 - af + 2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7 + 2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf + 2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7 + 2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf + 2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7 + 2, 2, 2, 2, 2, 2, 2, 2, # d8 - df + 2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7 + 2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef + 2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7 + 2, 2, 2, 2, 2, 2, 2, 2, # f8 - ff ) ISO2022JP_ST = ( -MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07 -MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f -MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17 -MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f -MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 20-27 -MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f -MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37 -MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f -MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47 + MachineState.START, 3, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 00-07 + MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 08-0f + MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 10-17 + MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, # 18-1f + MachineState.ERROR, 5, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, MachineState.ERROR, # 20-27 + MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 6, MachineState.ITS_ME, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, # 28-2f + MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, # 30-37 + MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 38-3f + MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.START, MachineState.START, # 40-47 ) +# fmt: on ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) -ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS, - 'class_factor': 10, - 'state_table': ISO2022JP_ST, - 'char_len_table': ISO2022JP_CHAR_LEN_TABLE, - 'name': "ISO-2022-JP", - 'language': 'Japanese'} +ISO2022JP_SM_MODEL: CodingStateMachineDict = { + "class_table": ISO2022JP_CLS, + "class_factor": 10, + "state_table": ISO2022JP_ST, + "char_len_table": ISO2022JP_CHAR_LEN_TABLE, + "name": "ISO-2022-JP", + "language": "Japanese", +} +# fmt: off ISO2022KR_CLS = ( -2,0,0,0,0,0,0,0, # 00 - 07 -0,0,0,0,0,0,0,0, # 08 - 0f -0,0,0,0,0,0,0,0, # 10 - 17 -0,0,0,1,0,0,0,0, # 18 - 1f -0,0,0,0,3,0,0,0, # 20 - 27 -0,4,0,0,0,0,0,0, # 28 - 2f -0,0,0,0,0,0,0,0, # 30 - 37 -0,0,0,0,0,0,0,0, # 38 - 3f -0,0,0,5,0,0,0,0, # 40 - 47 -0,0,0,0,0,0,0,0, # 48 - 4f -0,0,0,0,0,0,0,0, # 50 - 57 -0,0,0,0,0,0,0,0, # 58 - 5f -0,0,0,0,0,0,0,0, # 60 - 67 -0,0,0,0,0,0,0,0, # 68 - 6f -0,0,0,0,0,0,0,0, # 70 - 77 -0,0,0,0,0,0,0,0, # 78 - 7f -2,2,2,2,2,2,2,2, # 80 - 87 -2,2,2,2,2,2,2,2, # 88 - 8f -2,2,2,2,2,2,2,2, # 90 - 97 -2,2,2,2,2,2,2,2, # 98 - 9f -2,2,2,2,2,2,2,2, # a0 - a7 -2,2,2,2,2,2,2,2, # a8 - af -2,2,2,2,2,2,2,2, # b0 - b7 -2,2,2,2,2,2,2,2, # b8 - bf -2,2,2,2,2,2,2,2, # c0 - c7 -2,2,2,2,2,2,2,2, # c8 - cf -2,2,2,2,2,2,2,2, # d0 - d7 -2,2,2,2,2,2,2,2, # d8 - df -2,2,2,2,2,2,2,2, # e0 - e7 -2,2,2,2,2,2,2,2, # e8 - ef -2,2,2,2,2,2,2,2, # f0 - f7 -2,2,2,2,2,2,2,2, # f8 - ff + 2, 0, 0, 0, 0, 0, 0, 0, # 00 - 07 + 0, 0, 0, 0, 0, 0, 0, 0, # 08 - 0f + 0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17 + 0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f + 0, 0, 0, 0, 3, 0, 0, 0, # 20 - 27 + 0, 4, 0, 0, 0, 0, 0, 0, # 28 - 2f + 0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37 + 0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f + 0, 0, 0, 5, 0, 0, 0, 0, # 40 - 47 + 0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f + 0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57 + 0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f + 0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67 + 0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f + 0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77 + 0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f + 2, 2, 2, 2, 2, 2, 2, 2, # 80 - 87 + 2, 2, 2, 2, 2, 2, 2, 2, # 88 - 8f + 2, 2, 2, 2, 2, 2, 2, 2, # 90 - 97 + 2, 2, 2, 2, 2, 2, 2, 2, # 98 - 9f + 2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7 + 2, 2, 2, 2, 2, 2, 2, 2, # a8 - af + 2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7 + 2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf + 2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7 + 2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf + 2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7 + 2, 2, 2, 2, 2, 2, 2, 2, # d8 - df + 2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7 + 2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef + 2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7 + 2, 2, 2, 2, 2, 2, 2, 2, # f8 - ff ) ISO2022KR_ST = ( -MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07 -MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f -MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 10-17 -MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f -MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27 + MachineState.START, 3, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, # 00-07 + MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 08-0f + MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, MachineState.ERROR, # 10-17 + MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 5, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 18-1f + MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 20-27 ) +# fmt: on ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) -ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS, - 'class_factor': 6, - 'state_table': ISO2022KR_ST, - 'char_len_table': ISO2022KR_CHAR_LEN_TABLE, - 'name': "ISO-2022-KR", - 'language': 'Korean'} - - +ISO2022KR_SM_MODEL: CodingStateMachineDict = { + "class_table": ISO2022KR_CLS, + "class_factor": 6, + "state_table": ISO2022KR_ST, + "char_len_table": ISO2022KR_CHAR_LEN_TABLE, + "name": "ISO-2022-KR", + "language": "Korean", +} diff --git a/lib/chardet/eucjpprober.py b/lib/chardet/eucjpprober.py index a81ee1e2..3fafb219 100644 --- a/lib/chardet/eucjpprober.py +++ b/lib/chardet/eucjpprober.py @@ -25,68 +25,78 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from .enums import ProbingState, MachineState -from .mbcharsetprober import MultiByteCharSetProber -from .codingstatemachine import CodingStateMachine +from typing import Union + from .chardistribution import EUCJPDistributionAnalysis +from .codingstatemachine import CodingStateMachine +from .enums import MachineState, ProbingState from .jpcntx import EUCJPContextAnalysis +from .mbcharsetprober import MultiByteCharSetProber from .mbcssm import EUCJP_SM_MODEL class EUCJPProber(MultiByteCharSetProber): - def __init__(self): - super(EUCJPProber, self).__init__() + def __init__(self) -> None: + super().__init__() self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL) self.distribution_analyzer = EUCJPDistributionAnalysis() self.context_analyzer = EUCJPContextAnalysis() self.reset() - def reset(self): - super(EUCJPProber, self).reset() + def reset(self) -> None: + super().reset() self.context_analyzer.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "EUC-JP" @property - def language(self): + def language(self) -> str: return "Japanese" - def feed(self, byte_str): - for i in range(len(byte_str)): - # PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte - coding_state = self.coding_sm.next_state(byte_str[i]) + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + assert self.coding_sm is not None + assert self.distribution_analyzer is not None + + for i, byte in enumerate(byte_str): + # PY3K: byte_str is a byte array, so byte is an int, not a byte + coding_state = self.coding_sm.next_state(byte) if coding_state == MachineState.ERROR: - self.logger.debug('%s %s prober hit error at byte %s', - self.charset_name, self.language, i) + self.logger.debug( + "%s %s prober hit error at byte %s", + self.charset_name, + self.language, + i, + ) self._state = ProbingState.NOT_ME break - elif coding_state == MachineState.ITS_ME: + if coding_state == MachineState.ITS_ME: self._state = ProbingState.FOUND_IT break - elif coding_state == MachineState.START: + if coding_state == MachineState.START: char_len = self.coding_sm.get_current_charlen() if i == 0: - self._last_char[1] = byte_str[0] + self._last_char[1] = byte self.context_analyzer.feed(self._last_char, char_len) self.distribution_analyzer.feed(self._last_char, char_len) else: - self.context_analyzer.feed(byte_str[i - 1:i + 1], - char_len) - self.distribution_analyzer.feed(byte_str[i - 1:i + 1], - char_len) + self.context_analyzer.feed(byte_str[i - 1 : i + 1], char_len) + self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len) self._last_char[0] = byte_str[-1] if self.state == ProbingState.DETECTING: - if (self.context_analyzer.got_enough_data() and - (self.get_confidence() > self.SHORTCUT_THRESHOLD)): + if self.context_analyzer.got_enough_data() and ( + self.get_confidence() > self.SHORTCUT_THRESHOLD + ): self._state = ProbingState.FOUND_IT return self.state - def get_confidence(self): + def get_confidence(self) -> float: + assert self.distribution_analyzer is not None + context_conf = self.context_analyzer.get_confidence() distrib_conf = self.distribution_analyzer.get_confidence() return max(context_conf, distrib_conf) diff --git a/lib/chardet/euckrfreq.py b/lib/chardet/euckrfreq.py index ae25c1bf..a0899f0b 100644 --- a/lib/chardet/euckrfreq.py +++ b/lib/chardet/euckrfreq.py @@ -43,6 +43,7 @@ EUCKR_TYPICAL_DISTRIBUTION_RATIO = 6.0 EUCKR_TABLE_SIZE = 2352 # Char to FreqOrder table , +# fmt: off EUCKR_CHAR_TO_FREQ_ORDER = ( 13, 130, 120,1396, 481,1719,1720, 328, 609, 212,1721, 707, 400, 299,1722, 87, 1397,1723, 104, 536,1117,1203,1724,1267, 685,1268, 508,1725,1726,1727,1728,1398, @@ -192,4 +193,4 @@ EUCKR_CHAR_TO_FREQ_ORDER = ( 2629,2630,2631, 924, 648, 863, 603,2632,2633, 934,1540, 864, 865,2634, 642,1042, 670,1190,2635,2636,2637,2638, 168,2639, 652, 873, 542,1054,1541,2640,2641,2642, # 512, 256 ) - +# fmt: on diff --git a/lib/chardet/euckrprober.py b/lib/chardet/euckrprober.py index 99d5b154..6827b0ef 100644 --- a/lib/chardet/euckrprober.py +++ b/lib/chardet/euckrprober.py @@ -25,23 +25,23 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from .mbcharsetprober import MultiByteCharSetProber -from .codingstatemachine import CodingStateMachine from .chardistribution import EUCKRDistributionAnalysis +from .codingstatemachine import CodingStateMachine +from .mbcharsetprober import MultiByteCharSetProber from .mbcssm import EUCKR_SM_MODEL class EUCKRProber(MultiByteCharSetProber): - def __init__(self): - super(EUCKRProber, self).__init__() + def __init__(self) -> None: + super().__init__() self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL) self.distribution_analyzer = EUCKRDistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "EUC-KR" @property - def language(self): + def language(self) -> str: return "Korean" diff --git a/lib/chardet/euctwfreq.py b/lib/chardet/euctwfreq.py index 5195275e..196392ca 100644 --- a/lib/chardet/euctwfreq.py +++ b/lib/chardet/euctwfreq.py @@ -43,345 +43,346 @@ EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75 -# Char to FreqOrder table , +# Char to FreqOrder table EUCTW_TABLE_SIZE = 5376 +# fmt: off EUCTW_CHAR_TO_FREQ_ORDER = ( - 1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742 -3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758 -1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774 - 63,7312,7313, 317,1614, 75, 222, 159,4061,2412,1480,7314,3500,3068, 224,2809, # 2790 -3616, 3, 10,3870,1471, 29,2774,1135,2852,1939, 873, 130,3242,1123, 312,7315, # 2806 -4297,2051, 507, 252, 682,7316, 142,1914, 124, 206,2932, 34,3501,3173, 64, 604, # 2822 -7317,2494,1976,1977, 155,1990, 645, 641,1606,7318,3405, 337, 72, 406,7319, 80, # 2838 - 630, 238,3174,1509, 263, 939,1092,2644, 756,1440,1094,3406, 449, 69,2969, 591, # 2854 - 179,2095, 471, 115,2034,1843, 60, 50,2970, 134, 806,1868, 734,2035,3407, 180, # 2870 - 995,1607, 156, 537,2893, 688,7320, 319,1305, 779,2144, 514,2374, 298,4298, 359, # 2886 -2495, 90,2707,1338, 663, 11, 906,1099,2545, 20,2436, 182, 532,1716,7321, 732, # 2902 -1376,4062,1311,1420,3175, 25,2312,1056, 113, 399, 382,1949, 242,3408,2467, 529, # 2918 -3243, 475,1447,3617,7322, 117, 21, 656, 810,1297,2295,2329,3502,7323, 126,4063, # 2934 - 706, 456, 150, 613,4299, 71,1118,2036,4064, 145,3069, 85, 835, 486,2114,1246, # 2950 -1426, 428, 727,1285,1015, 800, 106, 623, 303,1281,7324,2127,2354, 347,3736, 221, # 2966 -3503,3110,7325,1955,1153,4065, 83, 296,1199,3070, 192, 624, 93,7326, 822,1897, # 2982 -2810,3111, 795,2064, 991,1554,1542,1592, 27, 43,2853, 859, 139,1456, 860,4300, # 2998 - 437, 712,3871, 164,2392,3112, 695, 211,3017,2096, 195,3872,1608,3504,3505,3618, # 3014 -3873, 234, 811,2971,2097,3874,2229,1441,3506,1615,2375, 668,2076,1638, 305, 228, # 3030 -1664,4301, 467, 415,7327, 262,2098,1593, 239, 108, 300, 200,1033, 512,1247,2077, # 3046 -7328,7329,2173,3176,3619,2673, 593, 845,1062,3244, 88,1723,2037,3875,1950, 212, # 3062 - 266, 152, 149, 468,1898,4066,4302, 77, 187,7330,3018, 37, 5,2972,7331,3876, # 3078 -7332,7333, 39,2517,4303,2894,3177,2078, 55, 148, 74,4304, 545, 483,1474,1029, # 3094 -1665, 217,1869,1531,3113,1104,2645,4067, 24, 172,3507, 900,3877,3508,3509,4305, # 3110 - 32,1408,2811,1312, 329, 487,2355,2247,2708, 784,2674, 4,3019,3314,1427,1788, # 3126 - 188, 109, 499,7334,3620,1717,1789, 888,1217,3020,4306,7335,3510,7336,3315,1520, # 3142 -3621,3878, 196,1034, 775,7337,7338, 929,1815, 249, 439, 38,7339,1063,7340, 794, # 3158 -3879,1435,2296, 46, 178,3245,2065,7341,2376,7342, 214,1709,4307, 804, 35, 707, # 3174 - 324,3622,1601,2546, 140, 459,4068,7343,7344,1365, 839, 272, 978,2257,2572,3409, # 3190 -2128,1363,3623,1423, 697, 100,3071, 48, 70,1231, 495,3114,2193,7345,1294,7346, # 3206 -2079, 462, 586,1042,3246, 853, 256, 988, 185,2377,3410,1698, 434,1084,7347,3411, # 3222 - 314,2615,2775,4308,2330,2331, 569,2280, 637,1816,2518, 757,1162,1878,1616,3412, # 3238 - 287,1577,2115, 768,4309,1671,2854,3511,2519,1321,3737, 909,2413,7348,4069, 933, # 3254 -3738,7349,2052,2356,1222,4310, 765,2414,1322, 786,4311,7350,1919,1462,1677,2895, # 3270 -1699,7351,4312,1424,2437,3115,3624,2590,3316,1774,1940,3413,3880,4070, 309,1369, # 3286 -1130,2812, 364,2230,1653,1299,3881,3512,3882,3883,2646, 525,1085,3021, 902,2000, # 3302 -1475, 964,4313, 421,1844,1415,1057,2281, 940,1364,3116, 376,4314,4315,1381, 7, # 3318 -2520, 983,2378, 336,1710,2675,1845, 321,3414, 559,1131,3022,2742,1808,1132,1313, # 3334 - 265,1481,1857,7352, 352,1203,2813,3247, 167,1089, 420,2814, 776, 792,1724,3513, # 3350 -4071,2438,3248,7353,4072,7354, 446, 229, 333,2743, 901,3739,1200,1557,4316,2647, # 3366 -1920, 395,2744,2676,3740,4073,1835, 125, 916,3178,2616,4317,7355,7356,3741,7357, # 3382 -7358,7359,4318,3117,3625,1133,2547,1757,3415,1510,2313,1409,3514,7360,2145, 438, # 3398 -2591,2896,2379,3317,1068, 958,3023, 461, 311,2855,2677,4074,1915,3179,4075,1978, # 3414 - 383, 750,2745,2617,4076, 274, 539, 385,1278,1442,7361,1154,1964, 384, 561, 210, # 3430 - 98,1295,2548,3515,7362,1711,2415,1482,3416,3884,2897,1257, 129,7363,3742, 642, # 3446 - 523,2776,2777,2648,7364, 141,2231,1333, 68, 176, 441, 876, 907,4077, 603,2592, # 3462 - 710, 171,3417, 404, 549, 18,3118,2393,1410,3626,1666,7365,3516,4319,2898,4320, # 3478 -7366,2973, 368,7367, 146, 366, 99, 871,3627,1543, 748, 807,1586,1185, 22,2258, # 3494 - 379,3743,3180,7368,3181, 505,1941,2618,1991,1382,2314,7369, 380,2357, 218, 702, # 3510 -1817,1248,3418,3024,3517,3318,3249,7370,2974,3628, 930,3250,3744,7371, 59,7372, # 3526 - 585, 601,4078, 497,3419,1112,1314,4321,1801,7373,1223,1472,2174,7374, 749,1836, # 3542 - 690,1899,3745,1772,3885,1476, 429,1043,1790,2232,2116, 917,4079, 447,1086,1629, # 3558 -7375, 556,7376,7377,2020,1654, 844,1090, 105, 550, 966,1758,2815,1008,1782, 686, # 3574 -1095,7378,2282, 793,1602,7379,3518,2593,4322,4080,2933,2297,4323,3746, 980,2496, # 3590 - 544, 353, 527,4324, 908,2678,2899,7380, 381,2619,1942,1348,7381,1341,1252, 560, # 3606 -3072,7382,3420,2856,7383,2053, 973, 886,2080, 143,4325,7384,7385, 157,3886, 496, # 3622 -4081, 57, 840, 540,2038,4326,4327,3421,2117,1445, 970,2259,1748,1965,2081,4082, # 3638 -3119,1234,1775,3251,2816,3629, 773,1206,2129,1066,2039,1326,3887,1738,1725,4083, # 3654 - 279,3120, 51,1544,2594, 423,1578,2130,2066, 173,4328,1879,7386,7387,1583, 264, # 3670 - 610,3630,4329,2439, 280, 154,7388,7389,7390,1739, 338,1282,3073, 693,2857,1411, # 3686 -1074,3747,2440,7391,4330,7392,7393,1240, 952,2394,7394,2900,1538,2679, 685,1483, # 3702 -4084,2468,1436, 953,4085,2054,4331, 671,2395, 79,4086,2441,3252, 608, 567,2680, # 3718 -3422,4087,4088,1691, 393,1261,1791,2396,7395,4332,7396,7397,7398,7399,1383,1672, # 3734 -3748,3182,1464, 522,1119, 661,1150, 216, 675,4333,3888,1432,3519, 609,4334,2681, # 3750 -2397,7400,7401,7402,4089,3025, 0,7403,2469, 315, 231,2442, 301,3319,4335,2380, # 3766 -7404, 233,4090,3631,1818,4336,4337,7405, 96,1776,1315,2082,7406, 257,7407,1809, # 3782 -3632,2709,1139,1819,4091,2021,1124,2163,2778,1777,2649,7408,3074, 363,1655,3183, # 3798 -7409,2975,7410,7411,7412,3889,1567,3890, 718, 103,3184, 849,1443, 341,3320,2934, # 3814 -1484,7413,1712, 127, 67, 339,4092,2398, 679,1412, 821,7414,7415, 834, 738, 351, # 3830 -2976,2146, 846, 235,1497,1880, 418,1992,3749,2710, 186,1100,2147,2746,3520,1545, # 3846 -1355,2935,2858,1377, 583,3891,4093,2573,2977,7416,1298,3633,1078,2549,3634,2358, # 3862 - 78,3750,3751, 267,1289,2099,2001,1594,4094, 348, 369,1274,2194,2175,1837,4338, # 3878 -1820,2817,3635,2747,2283,2002,4339,2936,2748, 144,3321, 882,4340,3892,2749,3423, # 3894 -4341,2901,7417,4095,1726, 320,7418,3893,3026, 788,2978,7419,2818,1773,1327,2859, # 3910 -3894,2819,7420,1306,4342,2003,1700,3752,3521,2359,2650, 787,2022, 506, 824,3636, # 3926 - 534, 323,4343,1044,3322,2023,1900, 946,3424,7421,1778,1500,1678,7422,1881,4344, # 3942 - 165, 243,4345,3637,2521, 123, 683,4096, 764,4346, 36,3895,1792, 589,2902, 816, # 3958 - 626,1667,3027,2233,1639,1555,1622,3753,3896,7423,3897,2860,1370,1228,1932, 891, # 3974 -2083,2903, 304,4097,7424, 292,2979,2711,3522, 691,2100,4098,1115,4347, 118, 662, # 3990 -7425, 611,1156, 854,2381,1316,2861, 2, 386, 515,2904,7426,7427,3253, 868,2234, # 4006 -1486, 855,2651, 785,2212,3028,7428,1040,3185,3523,7429,3121, 448,7430,1525,7431, # 4022 -2164,4348,7432,3754,7433,4099,2820,3524,3122, 503, 818,3898,3123,1568, 814, 676, # 4038 -1444, 306,1749,7434,3755,1416,1030, 197,1428, 805,2821,1501,4349,7435,7436,7437, # 4054 -1993,7438,4350,7439,7440,2195, 13,2779,3638,2980,3124,1229,1916,7441,3756,2131, # 4070 -7442,4100,4351,2399,3525,7443,2213,1511,1727,1120,7444,7445, 646,3757,2443, 307, # 4086 -7446,7447,1595,3186,7448,7449,7450,3639,1113,1356,3899,1465,2522,2523,7451, 519, # 4102 -7452, 128,2132, 92,2284,1979,7453,3900,1512, 342,3125,2196,7454,2780,2214,1980, # 4118 -3323,7455, 290,1656,1317, 789, 827,2360,7456,3758,4352, 562, 581,3901,7457, 401, # 4134 -4353,2248, 94,4354,1399,2781,7458,1463,2024,4355,3187,1943,7459, 828,1105,4101, # 4150 -1262,1394,7460,4102, 605,4356,7461,1783,2862,7462,2822, 819,2101, 578,2197,2937, # 4166 -7463,1502, 436,3254,4103,3255,2823,3902,2905,3425,3426,7464,2712,2315,7465,7466, # 4182 -2332,2067, 23,4357, 193, 826,3759,2102, 699,1630,4104,3075, 390,1793,1064,3526, # 4198 -7467,1579,3076,3077,1400,7468,4105,1838,1640,2863,7469,4358,4359, 137,4106, 598, # 4214 -3078,1966, 780, 104, 974,2938,7470, 278, 899, 253, 402, 572, 504, 493,1339,7471, # 4230 -3903,1275,4360,2574,2550,7472,3640,3029,3079,2249, 565,1334,2713, 863, 41,7473, # 4246 -7474,4361,7475,1657,2333, 19, 463,2750,4107, 606,7476,2981,3256,1087,2084,1323, # 4262 -2652,2982,7477,1631,1623,1750,4108,2682,7478,2864, 791,2714,2653,2334, 232,2416, # 4278 -7479,2983,1498,7480,2654,2620, 755,1366,3641,3257,3126,2025,1609, 119,1917,3427, # 4294 - 862,1026,4109,7481,3904,3760,4362,3905,4363,2260,1951,2470,7482,1125, 817,4110, # 4310 -4111,3906,1513,1766,2040,1487,4112,3030,3258,2824,3761,3127,7483,7484,1507,7485, # 4326 -2683, 733, 40,1632,1106,2865, 345,4113, 841,2524, 230,4364,2984,1846,3259,3428, # 4342 -7486,1263, 986,3429,7487, 735, 879, 254,1137, 857, 622,1300,1180,1388,1562,3907, # 4358 -3908,2939, 967,2751,2655,1349, 592,2133,1692,3324,2985,1994,4114,1679,3909,1901, # 4374 -2185,7488, 739,3642,2715,1296,1290,7489,4115,2198,2199,1921,1563,2595,2551,1870, # 4390 -2752,2986,7490, 435,7491, 343,1108, 596, 17,1751,4365,2235,3430,3643,7492,4366, # 4406 - 294,3527,2940,1693, 477, 979, 281,2041,3528, 643,2042,3644,2621,2782,2261,1031, # 4422 -2335,2134,2298,3529,4367, 367,1249,2552,7493,3530,7494,4368,1283,3325,2004, 240, # 4438 -1762,3326,4369,4370, 836,1069,3128, 474,7495,2148,2525, 268,3531,7496,3188,1521, # 4454 -1284,7497,1658,1546,4116,7498,3532,3533,7499,4117,3327,2684,1685,4118, 961,1673, # 4470 -2622, 190,2005,2200,3762,4371,4372,7500, 570,2497,3645,1490,7501,4373,2623,3260, # 4486 -1956,4374, 584,1514, 396,1045,1944,7502,4375,1967,2444,7503,7504,4376,3910, 619, # 4502 -7505,3129,3261, 215,2006,2783,2553,3189,4377,3190,4378, 763,4119,3763,4379,7506, # 4518 -7507,1957,1767,2941,3328,3646,1174, 452,1477,4380,3329,3130,7508,2825,1253,2382, # 4534 -2186,1091,2285,4120, 492,7509, 638,1169,1824,2135,1752,3911, 648, 926,1021,1324, # 4550 -4381, 520,4382, 997, 847,1007, 892,4383,3764,2262,1871,3647,7510,2400,1784,4384, # 4566 -1952,2942,3080,3191,1728,4121,2043,3648,4385,2007,1701,3131,1551, 30,2263,4122, # 4582 -7511,2026,4386,3534,7512, 501,7513,4123, 594,3431,2165,1821,3535,3432,3536,3192, # 4598 - 829,2826,4124,7514,1680,3132,1225,4125,7515,3262,4387,4126,3133,2336,7516,4388, # 4614 -4127,7517,3912,3913,7518,1847,2383,2596,3330,7519,4389, 374,3914, 652,4128,4129, # 4630 - 375,1140, 798,7520,7521,7522,2361,4390,2264, 546,1659, 138,3031,2445,4391,7523, # 4646 -2250, 612,1848, 910, 796,3765,1740,1371, 825,3766,3767,7524,2906,2554,7525, 692, # 4662 - 444,3032,2624, 801,4392,4130,7526,1491, 244,1053,3033,4131,4132, 340,7527,3915, # 4678 -1041,2987, 293,1168, 87,1357,7528,1539, 959,7529,2236, 721, 694,4133,3768, 219, # 4694 -1478, 644,1417,3331,2656,1413,1401,1335,1389,3916,7530,7531,2988,2362,3134,1825, # 4710 - 730,1515, 184,2827, 66,4393,7532,1660,2943, 246,3332, 378,1457, 226,3433, 975, # 4726 -3917,2944,1264,3537, 674, 696,7533, 163,7534,1141,2417,2166, 713,3538,3333,4394, # 4742 -3918,7535,7536,1186, 15,7537,1079,1070,7538,1522,3193,3539, 276,1050,2716, 758, # 4758 -1126, 653,2945,3263,7539,2337, 889,3540,3919,3081,2989, 903,1250,4395,3920,3434, # 4774 -3541,1342,1681,1718, 766,3264, 286, 89,2946,3649,7540,1713,7541,2597,3334,2990, # 4790 -7542,2947,2215,3194,2866,7543,4396,2498,2526, 181, 387,1075,3921, 731,2187,3335, # 4806 -7544,3265, 310, 313,3435,2299, 770,4134, 54,3034, 189,4397,3082,3769,3922,7545, # 4822 -1230,1617,1849, 355,3542,4135,4398,3336, 111,4136,3650,1350,3135,3436,3035,4137, # 4838 -2149,3266,3543,7546,2784,3923,3924,2991, 722,2008,7547,1071, 247,1207,2338,2471, # 4854 -1378,4399,2009, 864,1437,1214,4400, 373,3770,1142,2216, 667,4401, 442,2753,2555, # 4870 -3771,3925,1968,4138,3267,1839, 837, 170,1107, 934,1336,1882,7548,7549,2118,4139, # 4886 -2828, 743,1569,7550,4402,4140, 582,2384,1418,3437,7551,1802,7552, 357,1395,1729, # 4902 -3651,3268,2418,1564,2237,7553,3083,3772,1633,4403,1114,2085,4141,1532,7554, 482, # 4918 -2446,4404,7555,7556,1492, 833,1466,7557,2717,3544,1641,2829,7558,1526,1272,3652, # 4934 -4142,1686,1794, 416,2556,1902,1953,1803,7559,3773,2785,3774,1159,2316,7560,2867, # 4950 -4405,1610,1584,3036,2419,2754, 443,3269,1163,3136,7561,7562,3926,7563,4143,2499, # 4966 -3037,4406,3927,3137,2103,1647,3545,2010,1872,4144,7564,4145, 431,3438,7565, 250, # 4982 - 97, 81,4146,7566,1648,1850,1558, 160, 848,7567, 866, 740,1694,7568,2201,2830, # 4998 -3195,4147,4407,3653,1687, 950,2472, 426, 469,3196,3654,3655,3928,7569,7570,1188, # 5014 - 424,1995, 861,3546,4148,3775,2202,2685, 168,1235,3547,4149,7571,2086,1674,4408, # 5030 -3337,3270, 220,2557,1009,7572,3776, 670,2992, 332,1208, 717,7573,7574,3548,2447, # 5046 -3929,3338,7575, 513,7576,1209,2868,3339,3138,4409,1080,7577,7578,7579,7580,2527, # 5062 -3656,3549, 815,1587,3930,3931,7581,3550,3439,3777,1254,4410,1328,3038,1390,3932, # 5078 -1741,3933,3778,3934,7582, 236,3779,2448,3271,7583,7584,3657,3780,1273,3781,4411, # 5094 -7585, 308,7586,4412, 245,4413,1851,2473,1307,2575, 430, 715,2136,2449,7587, 270, # 5110 - 199,2869,3935,7588,3551,2718,1753, 761,1754, 725,1661,1840,4414,3440,3658,7589, # 5126 -7590, 587, 14,3272, 227,2598, 326, 480,2265, 943,2755,3552, 291, 650,1883,7591, # 5142 -1702,1226, 102,1547, 62,3441, 904,4415,3442,1164,4150,7592,7593,1224,1548,2756, # 5158 - 391, 498,1493,7594,1386,1419,7595,2055,1177,4416, 813, 880,1081,2363, 566,1145, # 5174 -4417,2286,1001,1035,2558,2599,2238, 394,1286,7596,7597,2068,7598, 86,1494,1730, # 5190 -3936, 491,1588, 745, 897,2948, 843,3340,3937,2757,2870,3273,1768, 998,2217,2069, # 5206 - 397,1826,1195,1969,3659,2993,3341, 284,7599,3782,2500,2137,2119,1903,7600,3938, # 5222 -2150,3939,4151,1036,3443,1904, 114,2559,4152, 209,1527,7601,7602,2949,2831,2625, # 5238 -2385,2719,3139, 812,2560,7603,3274,7604,1559, 737,1884,3660,1210, 885, 28,2686, # 5254 -3553,3783,7605,4153,1004,1779,4418,7606, 346,1981,2218,2687,4419,3784,1742, 797, # 5270 -1642,3940,1933,1072,1384,2151, 896,3941,3275,3661,3197,2871,3554,7607,2561,1958, # 5286 -4420,2450,1785,7608,7609,7610,3942,4154,1005,1308,3662,4155,2720,4421,4422,1528, # 5302 -2600, 161,1178,4156,1982, 987,4423,1101,4157, 631,3943,1157,3198,2420,1343,1241, # 5318 -1016,2239,2562, 372, 877,2339,2501,1160, 555,1934, 911,3944,7611, 466,1170, 169, # 5334 -1051,2907,2688,3663,2474,2994,1182,2011,2563,1251,2626,7612, 992,2340,3444,1540, # 5350 -2721,1201,2070,2401,1996,2475,7613,4424, 528,1922,2188,1503,1873,1570,2364,3342, # 5366 -3276,7614, 557,1073,7615,1827,3445,2087,2266,3140,3039,3084, 767,3085,2786,4425, # 5382 -1006,4158,4426,2341,1267,2176,3664,3199, 778,3945,3200,2722,1597,2657,7616,4427, # 5398 -7617,3446,7618,7619,7620,3277,2689,1433,3278, 131, 95,1504,3946, 723,4159,3141, # 5414 -1841,3555,2758,2189,3947,2027,2104,3665,7621,2995,3948,1218,7622,3343,3201,3949, # 5430 -4160,2576, 248,1634,3785, 912,7623,2832,3666,3040,3786, 654, 53,7624,2996,7625, # 5446 -1688,4428, 777,3447,1032,3950,1425,7626, 191, 820,2120,2833, 971,4429, 931,3202, # 5462 - 135, 664, 783,3787,1997, 772,2908,1935,3951,3788,4430,2909,3203, 282,2723, 640, # 5478 -1372,3448,1127, 922, 325,3344,7627,7628, 711,2044,7629,7630,3952,2219,2787,1936, # 5494 -3953,3345,2220,2251,3789,2300,7631,4431,3790,1258,3279,3954,3204,2138,2950,3955, # 5510 -3956,7632,2221, 258,3205,4432, 101,1227,7633,3280,1755,7634,1391,3281,7635,2910, # 5526 -2056, 893,7636,7637,7638,1402,4161,2342,7639,7640,3206,3556,7641,7642, 878,1325, # 5542 -1780,2788,4433, 259,1385,2577, 744,1183,2267,4434,7643,3957,2502,7644, 684,1024, # 5558 -4162,7645, 472,3557,3449,1165,3282,3958,3959, 322,2152, 881, 455,1695,1152,1340, # 5574 - 660, 554,2153,4435,1058,4436,4163, 830,1065,3346,3960,4437,1923,7646,1703,1918, # 5590 -7647, 932,2268, 122,7648,4438, 947, 677,7649,3791,2627, 297,1905,1924,2269,4439, # 5606 -2317,3283,7650,7651,4164,7652,4165, 84,4166, 112, 989,7653, 547,1059,3961, 701, # 5622 -3558,1019,7654,4167,7655,3450, 942, 639, 457,2301,2451, 993,2951, 407, 851, 494, # 5638 -4440,3347, 927,7656,1237,7657,2421,3348, 573,4168, 680, 921,2911,1279,1874, 285, # 5654 - 790,1448,1983, 719,2167,7658,7659,4441,3962,3963,1649,7660,1541, 563,7661,1077, # 5670 -7662,3349,3041,3451, 511,2997,3964,3965,3667,3966,1268,2564,3350,3207,4442,4443, # 5686 -7663, 535,1048,1276,1189,2912,2028,3142,1438,1373,2834,2952,1134,2012,7664,4169, # 5702 -1238,2578,3086,1259,7665, 700,7666,2953,3143,3668,4170,7667,4171,1146,1875,1906, # 5718 -4444,2601,3967, 781,2422, 132,1589, 203, 147, 273,2789,2402, 898,1786,2154,3968, # 5734 -3969,7668,3792,2790,7669,7670,4445,4446,7671,3208,7672,1635,3793, 965,7673,1804, # 5750 -2690,1516,3559,1121,1082,1329,3284,3970,1449,3794, 65,1128,2835,2913,2759,1590, # 5766 -3795,7674,7675, 12,2658, 45, 976,2579,3144,4447, 517,2528,1013,1037,3209,7676, # 5782 -3796,2836,7677,3797,7678,3452,7679,2602, 614,1998,2318,3798,3087,2724,2628,7680, # 5798 -2580,4172, 599,1269,7681,1810,3669,7682,2691,3088, 759,1060, 489,1805,3351,3285, # 5814 -1358,7683,7684,2386,1387,1215,2629,2252, 490,7685,7686,4173,1759,2387,2343,7687, # 5830 -4448,3799,1907,3971,2630,1806,3210,4449,3453,3286,2760,2344, 874,7688,7689,3454, # 5846 -3670,1858, 91,2914,3671,3042,3800,4450,7690,3145,3972,2659,7691,3455,1202,1403, # 5862 -3801,2954,2529,1517,2503,4451,3456,2504,7692,4452,7693,2692,1885,1495,1731,3973, # 5878 -2365,4453,7694,2029,7695,7696,3974,2693,1216, 237,2581,4174,2319,3975,3802,4454, # 5894 -4455,2694,3560,3457, 445,4456,7697,7698,7699,7700,2761, 61,3976,3672,1822,3977, # 5910 -7701, 687,2045, 935, 925, 405,2660, 703,1096,1859,2725,4457,3978,1876,1367,2695, # 5926 -3352, 918,2105,1781,2476, 334,3287,1611,1093,4458, 564,3146,3458,3673,3353, 945, # 5942 -2631,2057,4459,7702,1925, 872,4175,7703,3459,2696,3089, 349,4176,3674,3979,4460, # 5958 -3803,4177,3675,2155,3980,4461,4462,4178,4463,2403,2046, 782,3981, 400, 251,4179, # 5974 -1624,7704,7705, 277,3676, 299,1265, 476,1191,3804,2121,4180,4181,1109, 205,7706, # 5990 -2582,1000,2156,3561,1860,7707,7708,7709,4464,7710,4465,2565, 107,2477,2157,3982, # 6006 -3460,3147,7711,1533, 541,1301, 158, 753,4182,2872,3562,7712,1696, 370,1088,4183, # 6022 -4466,3563, 579, 327, 440, 162,2240, 269,1937,1374,3461, 968,3043, 56,1396,3090, # 6038 -2106,3288,3354,7713,1926,2158,4467,2998,7714,3564,7715,7716,3677,4468,2478,7717, # 6054 -2791,7718,1650,4469,7719,2603,7720,7721,3983,2661,3355,1149,3356,3984,3805,3985, # 6070 -7722,1076, 49,7723, 951,3211,3289,3290, 450,2837, 920,7724,1811,2792,2366,4184, # 6086 -1908,1138,2367,3806,3462,7725,3212,4470,1909,1147,1518,2423,4471,3807,7726,4472, # 6102 -2388,2604, 260,1795,3213,7727,7728,3808,3291, 708,7729,3565,1704,7730,3566,1351, # 6118 -1618,3357,2999,1886, 944,4185,3358,4186,3044,3359,4187,7731,3678, 422, 413,1714, # 6134 -3292, 500,2058,2345,4188,2479,7732,1344,1910, 954,7733,1668,7734,7735,3986,2404, # 6150 -4189,3567,3809,4190,7736,2302,1318,2505,3091, 133,3092,2873,4473, 629, 31,2838, # 6166 -2697,3810,4474, 850, 949,4475,3987,2955,1732,2088,4191,1496,1852,7737,3988, 620, # 6182 -3214, 981,1242,3679,3360,1619,3680,1643,3293,2139,2452,1970,1719,3463,2168,7738, # 6198 -3215,7739,7740,3361,1828,7741,1277,4476,1565,2047,7742,1636,3568,3093,7743, 869, # 6214 -2839, 655,3811,3812,3094,3989,3000,3813,1310,3569,4477,7744,7745,7746,1733, 558, # 6230 -4478,3681, 335,1549,3045,1756,4192,3682,1945,3464,1829,1291,1192, 470,2726,2107, # 6246 -2793, 913,1054,3990,7747,1027,7748,3046,3991,4479, 982,2662,3362,3148,3465,3216, # 6262 -3217,1946,2794,7749, 571,4480,7750,1830,7751,3570,2583,1523,2424,7752,2089, 984, # 6278 -4481,3683,1959,7753,3684, 852, 923,2795,3466,3685, 969,1519, 999,2048,2320,1705, # 6294 -7754,3095, 615,1662, 151, 597,3992,2405,2321,1049, 275,4482,3686,4193, 568,3687, # 6310 -3571,2480,4194,3688,7755,2425,2270, 409,3218,7756,1566,2874,3467,1002, 769,2840, # 6326 - 194,2090,3149,3689,2222,3294,4195, 628,1505,7757,7758,1763,2177,3001,3993, 521, # 6342 -1161,2584,1787,2203,2406,4483,3994,1625,4196,4197, 412, 42,3096, 464,7759,2632, # 6358 -4484,3363,1760,1571,2875,3468,2530,1219,2204,3814,2633,2140,2368,4485,4486,3295, # 6374 -1651,3364,3572,7760,7761,3573,2481,3469,7762,3690,7763,7764,2271,2091, 460,7765, # 6390 -4487,7766,3002, 962, 588,3574, 289,3219,2634,1116, 52,7767,3047,1796,7768,7769, # 6406 -7770,1467,7771,1598,1143,3691,4198,1984,1734,1067,4488,1280,3365, 465,4489,1572, # 6422 - 510,7772,1927,2241,1812,1644,3575,7773,4490,3692,7774,7775,2663,1573,1534,7776, # 6438 -7777,4199, 536,1807,1761,3470,3815,3150,2635,7778,7779,7780,4491,3471,2915,1911, # 6454 -2796,7781,3296,1122, 377,3220,7782, 360,7783,7784,4200,1529, 551,7785,2059,3693, # 6470 -1769,2426,7786,2916,4201,3297,3097,2322,2108,2030,4492,1404, 136,1468,1479, 672, # 6486 -1171,3221,2303, 271,3151,7787,2762,7788,2049, 678,2727, 865,1947,4493,7789,2013, # 6502 -3995,2956,7790,2728,2223,1397,3048,3694,4494,4495,1735,2917,3366,3576,7791,3816, # 6518 - 509,2841,2453,2876,3817,7792,7793,3152,3153,4496,4202,2531,4497,2304,1166,1010, # 6534 - 552, 681,1887,7794,7795,2957,2958,3996,1287,1596,1861,3154, 358, 453, 736, 175, # 6550 - 478,1117, 905,1167,1097,7796,1853,1530,7797,1706,7798,2178,3472,2287,3695,3473, # 6566 -3577,4203,2092,4204,7799,3367,1193,2482,4205,1458,2190,2205,1862,1888,1421,3298, # 6582 -2918,3049,2179,3474, 595,2122,7800,3997,7801,7802,4206,1707,2636, 223,3696,1359, # 6598 - 751,3098, 183,3475,7803,2797,3003, 419,2369, 633, 704,3818,2389, 241,7804,7805, # 6614 -7806, 838,3004,3697,2272,2763,2454,3819,1938,2050,3998,1309,3099,2242,1181,7807, # 6630 -1136,2206,3820,2370,1446,4207,2305,4498,7808,7809,4208,1055,2605, 484,3698,7810, # 6646 -3999, 625,4209,2273,3368,1499,4210,4000,7811,4001,4211,3222,2274,2275,3476,7812, # 6662 -7813,2764, 808,2606,3699,3369,4002,4212,3100,2532, 526,3370,3821,4213, 955,7814, # 6678 -1620,4214,2637,2427,7815,1429,3700,1669,1831, 994, 928,7816,3578,1260,7817,7818, # 6694 -7819,1948,2288, 741,2919,1626,4215,2729,2455, 867,1184, 362,3371,1392,7820,7821, # 6710 -4003,4216,1770,1736,3223,2920,4499,4500,1928,2698,1459,1158,7822,3050,3372,2877, # 6726 -1292,1929,2506,2842,3701,1985,1187,2071,2014,2607,4217,7823,2566,2507,2169,3702, # 6742 -2483,3299,7824,3703,4501,7825,7826, 666,1003,3005,1022,3579,4218,7827,4502,1813, # 6758 -2253, 574,3822,1603, 295,1535, 705,3823,4219, 283, 858, 417,7828,7829,3224,4503, # 6774 -4504,3051,1220,1889,1046,2276,2456,4004,1393,1599, 689,2567, 388,4220,7830,2484, # 6790 - 802,7831,2798,3824,2060,1405,2254,7832,4505,3825,2109,1052,1345,3225,1585,7833, # 6806 - 809,7834,7835,7836, 575,2730,3477, 956,1552,1469,1144,2323,7837,2324,1560,2457, # 6822 -3580,3226,4005, 616,2207,3155,2180,2289,7838,1832,7839,3478,4506,7840,1319,3704, # 6838 -3705,1211,3581,1023,3227,1293,2799,7841,7842,7843,3826, 607,2306,3827, 762,2878, # 6854 -1439,4221,1360,7844,1485,3052,7845,4507,1038,4222,1450,2061,2638,4223,1379,4508, # 6870 -2585,7846,7847,4224,1352,1414,2325,2921,1172,7848,7849,3828,3829,7850,1797,1451, # 6886 -7851,7852,7853,7854,2922,4006,4007,2485,2346, 411,4008,4009,3582,3300,3101,4509, # 6902 -1561,2664,1452,4010,1375,7855,7856, 47,2959, 316,7857,1406,1591,2923,3156,7858, # 6918 -1025,2141,3102,3157, 354,2731, 884,2224,4225,2407, 508,3706, 726,3583, 996,2428, # 6934 -3584, 729,7859, 392,2191,1453,4011,4510,3707,7860,7861,2458,3585,2608,1675,2800, # 6950 - 919,2347,2960,2348,1270,4511,4012, 73,7862,7863, 647,7864,3228,2843,2255,1550, # 6966 -1346,3006,7865,1332, 883,3479,7866,7867,7868,7869,3301,2765,7870,1212, 831,1347, # 6982 -4226,4512,2326,3830,1863,3053, 720,3831,4513,4514,3832,7871,4227,7872,7873,4515, # 6998 -7874,7875,1798,4516,3708,2609,4517,3586,1645,2371,7876,7877,2924, 669,2208,2665, # 7014 -2429,7878,2879,7879,7880,1028,3229,7881,4228,2408,7882,2256,1353,7883,7884,4518, # 7030 -3158, 518,7885,4013,7886,4229,1960,7887,2142,4230,7888,7889,3007,2349,2350,3833, # 7046 - 516,1833,1454,4014,2699,4231,4519,2225,2610,1971,1129,3587,7890,2766,7891,2961, # 7062 -1422, 577,1470,3008,1524,3373,7892,7893, 432,4232,3054,3480,7894,2586,1455,2508, # 7078 -2226,1972,1175,7895,1020,2732,4015,3481,4520,7896,2733,7897,1743,1361,3055,3482, # 7094 -2639,4016,4233,4521,2290, 895, 924,4234,2170, 331,2243,3056, 166,1627,3057,1098, # 7110 -7898,1232,2880,2227,3374,4522, 657, 403,1196,2372, 542,3709,3375,1600,4235,3483, # 7126 -7899,4523,2767,3230, 576, 530,1362,7900,4524,2533,2666,3710,4017,7901, 842,3834, # 7142 -7902,2801,2031,1014,4018, 213,2700,3376, 665, 621,4236,7903,3711,2925,2430,7904, # 7158 -2431,3302,3588,3377,7905,4237,2534,4238,4525,3589,1682,4239,3484,1380,7906, 724, # 7174 -2277, 600,1670,7907,1337,1233,4526,3103,2244,7908,1621,4527,7909, 651,4240,7910, # 7190 -1612,4241,2611,7911,2844,7912,2734,2307,3058,7913, 716,2459,3059, 174,1255,2701, # 7206 -4019,3590, 548,1320,1398, 728,4020,1574,7914,1890,1197,3060,4021,7915,3061,3062, # 7222 -3712,3591,3713, 747,7916, 635,4242,4528,7917,7918,7919,4243,7920,7921,4529,7922, # 7238 -3378,4530,2432, 451,7923,3714,2535,2072,4244,2735,4245,4022,7924,1764,4531,7925, # 7254 -4246, 350,7926,2278,2390,2486,7927,4247,4023,2245,1434,4024, 488,4532, 458,4248, # 7270 -4025,3715, 771,1330,2391,3835,2568,3159,2159,2409,1553,2667,3160,4249,7928,2487, # 7286 -2881,2612,1720,2702,4250,3379,4533,7929,2536,4251,7930,3231,4252,2768,7931,2015, # 7302 -2736,7932,1155,1017,3716,3836,7933,3303,2308, 201,1864,4253,1430,7934,4026,7935, # 7318 -7936,7937,7938,7939,4254,1604,7940, 414,1865, 371,2587,4534,4535,3485,2016,3104, # 7334 -4536,1708, 960,4255, 887, 389,2171,1536,1663,1721,7941,2228,4027,2351,2926,1580, # 7350 -7942,7943,7944,1744,7945,2537,4537,4538,7946,4539,7947,2073,7948,7949,3592,3380, # 7366 -2882,4256,7950,4257,2640,3381,2802, 673,2703,2460, 709,3486,4028,3593,4258,7951, # 7382 -1148, 502, 634,7952,7953,1204,4540,3594,1575,4541,2613,3717,7954,3718,3105, 948, # 7398 -3232, 121,1745,3837,1110,7955,4259,3063,2509,3009,4029,3719,1151,1771,3838,1488, # 7414 -4030,1986,7956,2433,3487,7957,7958,2093,7959,4260,3839,1213,1407,2803, 531,2737, # 7430 -2538,3233,1011,1537,7960,2769,4261,3106,1061,7961,3720,3721,1866,2883,7962,2017, # 7446 - 120,4262,4263,2062,3595,3234,2309,3840,2668,3382,1954,4542,7963,7964,3488,1047, # 7462 -2704,1266,7965,1368,4543,2845, 649,3383,3841,2539,2738,1102,2846,2669,7966,7967, # 7478 -1999,7968,1111,3596,2962,7969,2488,3842,3597,2804,1854,3384,3722,7970,7971,3385, # 7494 -2410,2884,3304,3235,3598,7972,2569,7973,3599,2805,4031,1460, 856,7974,3600,7975, # 7510 -2885,2963,7976,2886,3843,7977,4264, 632,2510, 875,3844,1697,3845,2291,7978,7979, # 7526 -4544,3010,1239, 580,4545,4265,7980, 914, 936,2074,1190,4032,1039,2123,7981,7982, # 7542 -7983,3386,1473,7984,1354,4266,3846,7985,2172,3064,4033, 915,3305,4267,4268,3306, # 7558 -1605,1834,7986,2739, 398,3601,4269,3847,4034, 328,1912,2847,4035,3848,1331,4270, # 7574 -3011, 937,4271,7987,3602,4036,4037,3387,2160,4546,3388, 524, 742, 538,3065,1012, # 7590 -7988,7989,3849,2461,7990, 658,1103, 225,3850,7991,7992,4547,7993,4548,7994,3236, # 7606 -1243,7995,4038, 963,2246,4549,7996,2705,3603,3161,7997,7998,2588,2327,7999,4550, # 7622 -8000,8001,8002,3489,3307, 957,3389,2540,2032,1930,2927,2462, 870,2018,3604,1746, # 7638 -2770,2771,2434,2463,8003,3851,8004,3723,3107,3724,3490,3390,3725,8005,1179,3066, # 7654 -8006,3162,2373,4272,3726,2541,3163,3108,2740,4039,8007,3391,1556,2542,2292, 977, # 7670 -2887,2033,4040,1205,3392,8008,1765,3393,3164,2124,1271,1689, 714,4551,3491,8009, # 7686 -2328,3852, 533,4273,3605,2181, 617,8010,2464,3308,3492,2310,8011,8012,3165,8013, # 7702 -8014,3853,1987, 618, 427,2641,3493,3394,8015,8016,1244,1690,8017,2806,4274,4552, # 7718 -8018,3494,8019,8020,2279,1576, 473,3606,4275,3395, 972,8021,3607,8022,3067,8023, # 7734 -8024,4553,4554,8025,3727,4041,4042,8026, 153,4555, 356,8027,1891,2888,4276,2143, # 7750 - 408, 803,2352,8028,3854,8029,4277,1646,2570,2511,4556,4557,3855,8030,3856,4278, # 7766 -8031,2411,3396, 752,8032,8033,1961,2964,8034, 746,3012,2465,8035,4279,3728, 698, # 7782 -4558,1892,4280,3608,2543,4559,3609,3857,8036,3166,3397,8037,1823,1302,4043,2706, # 7798 -3858,1973,4281,8038,4282,3167, 823,1303,1288,1236,2848,3495,4044,3398, 774,3859, # 7814 -8039,1581,4560,1304,2849,3860,4561,8040,2435,2161,1083,3237,4283,4045,4284, 344, # 7830 -1173, 288,2311, 454,1683,8041,8042,1461,4562,4046,2589,8043,8044,4563, 985, 894, # 7846 -8045,3399,3168,8046,1913,2928,3729,1988,8047,2110,1974,8048,4047,8049,2571,1194, # 7862 - 425,8050,4564,3169,1245,3730,4285,8051,8052,2850,8053, 636,4565,1855,3861, 760, # 7878 -1799,8054,4286,2209,1508,4566,4048,1893,1684,2293,8055,8056,8057,4287,4288,2210, # 7894 - 479,8058,8059, 832,8060,4049,2489,8061,2965,2490,3731, 990,3109, 627,1814,2642, # 7910 -4289,1582,4290,2125,2111,3496,4567,8062, 799,4291,3170,8063,4568,2112,1737,3013, # 7926 -1018, 543, 754,4292,3309,1676,4569,4570,4050,8064,1489,8065,3497,8066,2614,2889, # 7942 -4051,8067,8068,2966,8069,8070,8071,8072,3171,4571,4572,2182,1722,8073,3238,3239, # 7958 -1842,3610,1715, 481, 365,1975,1856,8074,8075,1962,2491,4573,8076,2126,3611,3240, # 7974 - 433,1894,2063,2075,8077, 602,2741,8078,8079,8080,8081,8082,3014,1628,3400,8083, # 7990 -3172,4574,4052,2890,4575,2512,8084,2544,2772,8085,8086,8087,3310,4576,2891,8088, # 8006 -4577,8089,2851,4578,4579,1221,2967,4053,2513,8090,8091,8092,1867,1989,8093,8094, # 8022 -8095,1895,8096,8097,4580,1896,4054, 318,8098,2094,4055,4293,8099,8100, 485,8101, # 8038 - 938,3862, 553,2670, 116,8102,3863,3612,8103,3498,2671,2773,3401,3311,2807,8104, # 8054 -3613,2929,4056,1747,2930,2968,8105,8106, 207,8107,8108,2672,4581,2514,8109,3015, # 8070 - 890,3614,3864,8110,1877,3732,3402,8111,2183,2353,3403,1652,8112,8113,8114, 941, # 8086 -2294, 208,3499,4057,2019, 330,4294,3865,2892,2492,3733,4295,8115,8116,8117,8118, # 8102 + 1, 1800, 1506, 255, 1431, 198, 9, 82, 6, 7310, 177, 202, 3615, 1256, 2808, 110, # 2742 + 3735, 33, 3241, 261, 76, 44, 2113, 16, 2931, 2184, 1176, 659, 3868, 26, 3404, 2643, # 2758 + 1198, 3869, 3313, 4060, 410, 2211, 302, 590, 361, 1963, 8, 204, 58, 4296, 7311, 1931, # 2774 + 63, 7312, 7313, 317, 1614, 75, 222, 159, 4061, 2412, 1480, 7314, 3500, 3068, 224, 2809, # 2790 + 3616, 3, 10, 3870, 1471, 29, 2774, 1135, 2852, 1939, 873, 130, 3242, 1123, 312, 7315, # 2806 + 4297, 2051, 507, 252, 682, 7316, 142, 1914, 124, 206, 2932, 34, 3501, 3173, 64, 604, # 2822 + 7317, 2494, 1976, 1977, 155, 1990, 645, 641, 1606, 7318, 3405, 337, 72, 406, 7319, 80, # 2838 + 630, 238, 3174, 1509, 263, 939, 1092, 2644, 756, 1440, 1094, 3406, 449, 69, 2969, 591, # 2854 + 179, 2095, 471, 115, 2034, 1843, 60, 50, 2970, 134, 806, 1868, 734, 2035, 3407, 180, # 2870 + 995, 1607, 156, 537, 2893, 688, 7320, 319, 1305, 779, 2144, 514, 2374, 298, 4298, 359, # 2886 + 2495, 90, 2707, 1338, 663, 11, 906, 1099, 2545, 20, 2436, 182, 532, 1716, 7321, 732, # 2902 + 1376, 4062, 1311, 1420, 3175, 25, 2312, 1056, 113, 399, 382, 1949, 242, 3408, 2467, 529, # 2918 + 3243, 475, 1447, 3617, 7322, 117, 21, 656, 810, 1297, 2295, 2329, 3502, 7323, 126, 4063, # 2934 + 706, 456, 150, 613, 4299, 71, 1118, 2036, 4064, 145, 3069, 85, 835, 486, 2114, 1246, # 2950 + 1426, 428, 727, 1285, 1015, 800, 106, 623, 303, 1281, 7324, 2127, 2354, 347, 3736, 221, # 2966 + 3503, 3110, 7325, 1955, 1153, 4065, 83, 296, 1199, 3070, 192, 624, 93, 7326, 822, 1897, # 2982 + 2810, 3111, 795, 2064, 991, 1554, 1542, 1592, 27, 43, 2853, 859, 139, 1456, 860, 4300, # 2998 + 437, 712, 3871, 164, 2392, 3112, 695, 211, 3017, 2096, 195, 3872, 1608, 3504, 3505, 3618, # 3014 + 3873, 234, 811, 2971, 2097, 3874, 2229, 1441, 3506, 1615, 2375, 668, 2076, 1638, 305, 228, # 3030 + 1664, 4301, 467, 415, 7327, 262, 2098, 1593, 239, 108, 300, 200, 1033, 512, 1247, 2077, # 3046 + 7328, 7329, 2173, 3176, 3619, 2673, 593, 845, 1062, 3244, 88, 1723, 2037, 3875, 1950, 212, # 3062 + 266, 152, 149, 468, 1898, 4066, 4302, 77, 187, 7330, 3018, 37, 5, 2972, 7331, 3876, # 3078 + 7332, 7333, 39, 2517, 4303, 2894, 3177, 2078, 55, 148, 74, 4304, 545, 483, 1474, 1029, # 3094 + 1665, 217, 1869, 1531, 3113, 1104, 2645, 4067, 24, 172, 3507, 900, 3877, 3508, 3509, 4305, # 3110 + 32, 1408, 2811, 1312, 329, 487, 2355, 2247, 2708, 784, 2674, 4, 3019, 3314, 1427, 1788, # 3126 + 188, 109, 499, 7334, 3620, 1717, 1789, 888, 1217, 3020, 4306, 7335, 3510, 7336, 3315, 1520, # 3142 + 3621, 3878, 196, 1034, 775, 7337, 7338, 929, 1815, 249, 439, 38, 7339, 1063, 7340, 794, # 3158 + 3879, 1435, 2296, 46, 178, 3245, 2065, 7341, 2376, 7342, 214, 1709, 4307, 804, 35, 707, # 3174 + 324, 3622, 1601, 2546, 140, 459, 4068, 7343, 7344, 1365, 839, 272, 978, 2257, 2572, 3409, # 3190 + 2128, 1363, 3623, 1423, 697, 100, 3071, 48, 70, 1231, 495, 3114, 2193, 7345, 1294, 7346, # 3206 + 2079, 462, 586, 1042, 3246, 853, 256, 988, 185, 2377, 3410, 1698, 434, 1084, 7347, 3411, # 3222 + 314, 2615, 2775, 4308, 2330, 2331, 569, 2280, 637, 1816, 2518, 757, 1162, 1878, 1616, 3412, # 3238 + 287, 1577, 2115, 768, 4309, 1671, 2854, 3511, 2519, 1321, 3737, 909, 2413, 7348, 4069, 933, # 3254 + 3738, 7349, 2052, 2356, 1222, 4310, 765, 2414, 1322, 786, 4311, 7350, 1919, 1462, 1677, 2895, # 3270 + 1699, 7351, 4312, 1424, 2437, 3115, 3624, 2590, 3316, 1774, 1940, 3413, 3880, 4070, 309, 1369, # 3286 + 1130, 2812, 364, 2230, 1653, 1299, 3881, 3512, 3882, 3883, 2646, 525, 1085, 3021, 902, 2000, # 3302 + 1475, 964, 4313, 421, 1844, 1415, 1057, 2281, 940, 1364, 3116, 376, 4314, 4315, 1381, 7, # 3318 + 2520, 983, 2378, 336, 1710, 2675, 1845, 321, 3414, 559, 1131, 3022, 2742, 1808, 1132, 1313, # 3334 + 265, 1481, 1857, 7352, 352, 1203, 2813, 3247, 167, 1089, 420, 2814, 776, 792, 1724, 3513, # 3350 + 4071, 2438, 3248, 7353, 4072, 7354, 446, 229, 333, 2743, 901, 3739, 1200, 1557, 4316, 2647, # 3366 + 1920, 395, 2744, 2676, 3740, 4073, 1835, 125, 916, 3178, 2616, 4317, 7355, 7356, 3741, 7357, # 3382 + 7358, 7359, 4318, 3117, 3625, 1133, 2547, 1757, 3415, 1510, 2313, 1409, 3514, 7360, 2145, 438, # 3398 + 2591, 2896, 2379, 3317, 1068, 958, 3023, 461, 311, 2855, 2677, 4074, 1915, 3179, 4075, 1978, # 3414 + 383, 750, 2745, 2617, 4076, 274, 539, 385, 1278, 1442, 7361, 1154, 1964, 384, 561, 210, # 3430 + 98, 1295, 2548, 3515, 7362, 1711, 2415, 1482, 3416, 3884, 2897, 1257, 129, 7363, 3742, 642, # 3446 + 523, 2776, 2777, 2648, 7364, 141, 2231, 1333, 68, 176, 441, 876, 907, 4077, 603, 2592, # 3462 + 710, 171, 3417, 404, 549, 18, 3118, 2393, 1410, 3626, 1666, 7365, 3516, 4319, 2898, 4320, # 3478 + 7366, 2973, 368, 7367, 146, 366, 99, 871, 3627, 1543, 748, 807, 1586, 1185, 22, 2258, # 3494 + 379, 3743, 3180, 7368, 3181, 505, 1941, 2618, 1991, 1382, 2314, 7369, 380, 2357, 218, 702, # 3510 + 1817, 1248, 3418, 3024, 3517, 3318, 3249, 7370, 2974, 3628, 930, 3250, 3744, 7371, 59, 7372, # 3526 + 585, 601, 4078, 497, 3419, 1112, 1314, 4321, 1801, 7373, 1223, 1472, 2174, 7374, 749, 1836, # 3542 + 690, 1899, 3745, 1772, 3885, 1476, 429, 1043, 1790, 2232, 2116, 917, 4079, 447, 1086, 1629, # 3558 + 7375, 556, 7376, 7377, 2020, 1654, 844, 1090, 105, 550, 966, 1758, 2815, 1008, 1782, 686, # 3574 + 1095, 7378, 2282, 793, 1602, 7379, 3518, 2593, 4322, 4080, 2933, 2297, 4323, 3746, 980, 2496, # 3590 + 544, 353, 527, 4324, 908, 2678, 2899, 7380, 381, 2619, 1942, 1348, 7381, 1341, 1252, 560, # 3606 + 3072, 7382, 3420, 2856, 7383, 2053, 973, 886, 2080, 143, 4325, 7384, 7385, 157, 3886, 496, # 3622 + 4081, 57, 840, 540, 2038, 4326, 4327, 3421, 2117, 1445, 970, 2259, 1748, 1965, 2081, 4082, # 3638 + 3119, 1234, 1775, 3251, 2816, 3629, 773, 1206, 2129, 1066, 2039, 1326, 3887, 1738, 1725, 4083, # 3654 + 279, 3120, 51, 1544, 2594, 423, 1578, 2130, 2066, 173, 4328, 1879, 7386, 7387, 1583, 264, # 3670 + 610, 3630, 4329, 2439, 280, 154, 7388, 7389, 7390, 1739, 338, 1282, 3073, 693, 2857, 1411, # 3686 + 1074, 3747, 2440, 7391, 4330, 7392, 7393, 1240, 952, 2394, 7394, 2900, 1538, 2679, 685, 1483, # 3702 + 4084, 2468, 1436, 953, 4085, 2054, 4331, 671, 2395, 79, 4086, 2441, 3252, 608, 567, 2680, # 3718 + 3422, 4087, 4088, 1691, 393, 1261, 1791, 2396, 7395, 4332, 7396, 7397, 7398, 7399, 1383, 1672, # 3734 + 3748, 3182, 1464, 522, 1119, 661, 1150, 216, 675, 4333, 3888, 1432, 3519, 609, 4334, 2681, # 3750 + 2397, 7400, 7401, 7402, 4089, 3025, 0, 7403, 2469, 315, 231, 2442, 301, 3319, 4335, 2380, # 3766 + 7404, 233, 4090, 3631, 1818, 4336, 4337, 7405, 96, 1776, 1315, 2082, 7406, 257, 7407, 1809, # 3782 + 3632, 2709, 1139, 1819, 4091, 2021, 1124, 2163, 2778, 1777, 2649, 7408, 3074, 363, 1655, 3183, # 3798 + 7409, 2975, 7410, 7411, 7412, 3889, 1567, 3890, 718, 103, 3184, 849, 1443, 341, 3320, 2934, # 3814 + 1484, 7413, 1712, 127, 67, 339, 4092, 2398, 679, 1412, 821, 7414, 7415, 834, 738, 351, # 3830 + 2976, 2146, 846, 235, 1497, 1880, 418, 1992, 3749, 2710, 186, 1100, 2147, 2746, 3520, 1545, # 3846 + 1355, 2935, 2858, 1377, 583, 3891, 4093, 2573, 2977, 7416, 1298, 3633, 1078, 2549, 3634, 2358, # 3862 + 78, 3750, 3751, 267, 1289, 2099, 2001, 1594, 4094, 348, 369, 1274, 2194, 2175, 1837, 4338, # 3878 + 1820, 2817, 3635, 2747, 2283, 2002, 4339, 2936, 2748, 144, 3321, 882, 4340, 3892, 2749, 3423, # 3894 + 4341, 2901, 7417, 4095, 1726, 320, 7418, 3893, 3026, 788, 2978, 7419, 2818, 1773, 1327, 2859, # 3910 + 3894, 2819, 7420, 1306, 4342, 2003, 1700, 3752, 3521, 2359, 2650, 787, 2022, 506, 824, 3636, # 3926 + 534, 323, 4343, 1044, 3322, 2023, 1900, 946, 3424, 7421, 1778, 1500, 1678, 7422, 1881, 4344, # 3942 + 165, 243, 4345, 3637, 2521, 123, 683, 4096, 764, 4346, 36, 3895, 1792, 589, 2902, 816, # 3958 + 626, 1667, 3027, 2233, 1639, 1555, 1622, 3753, 3896, 7423, 3897, 2860, 1370, 1228, 1932, 891, # 3974 + 2083, 2903, 304, 4097, 7424, 292, 2979, 2711, 3522, 691, 2100, 4098, 1115, 4347, 118, 662, # 3990 + 7425, 611, 1156, 854, 2381, 1316, 2861, 2, 386, 515, 2904, 7426, 7427, 3253, 868, 2234, # 4006 + 1486, 855, 2651, 785, 2212, 3028, 7428, 1040, 3185, 3523, 7429, 3121, 448, 7430, 1525, 7431, # 4022 + 2164, 4348, 7432, 3754, 7433, 4099, 2820, 3524, 3122, 503, 818, 3898, 3123, 1568, 814, 676, # 4038 + 1444, 306, 1749, 7434, 3755, 1416, 1030, 197, 1428, 805, 2821, 1501, 4349, 7435, 7436, 7437, # 4054 + 1993, 7438, 4350, 7439, 7440, 2195, 13, 2779, 3638, 2980, 3124, 1229, 1916, 7441, 3756, 2131, # 4070 + 7442, 4100, 4351, 2399, 3525, 7443, 2213, 1511, 1727, 1120, 7444, 7445, 646, 3757, 2443, 307, # 4086 + 7446, 7447, 1595, 3186, 7448, 7449, 7450, 3639, 1113, 1356, 3899, 1465, 2522, 2523, 7451, 519, # 4102 + 7452, 128, 2132, 92, 2284, 1979, 7453, 3900, 1512, 342, 3125, 2196, 7454, 2780, 2214, 1980, # 4118 + 3323, 7455, 290, 1656, 1317, 789, 827, 2360, 7456, 3758, 4352, 562, 581, 3901, 7457, 401, # 4134 + 4353, 2248, 94, 4354, 1399, 2781, 7458, 1463, 2024, 4355, 3187, 1943, 7459, 828, 1105, 4101, # 4150 + 1262, 1394, 7460, 4102, 605, 4356, 7461, 1783, 2862, 7462, 2822, 819, 2101, 578, 2197, 2937, # 4166 + 7463, 1502, 436, 3254, 4103, 3255, 2823, 3902, 2905, 3425, 3426, 7464, 2712, 2315, 7465, 7466, # 4182 + 2332, 2067, 23, 4357, 193, 826, 3759, 2102, 699, 1630, 4104, 3075, 390, 1793, 1064, 3526, # 4198 + 7467, 1579, 3076, 3077, 1400, 7468, 4105, 1838, 1640, 2863, 7469, 4358, 4359, 137, 4106, 598, # 4214 + 3078, 1966, 780, 104, 974, 2938, 7470, 278, 899, 253, 402, 572, 504, 493, 1339, 7471, # 4230 + 3903, 1275, 4360, 2574, 2550, 7472, 3640, 3029, 3079, 2249, 565, 1334, 2713, 863, 41, 7473, # 4246 + 7474, 4361, 7475, 1657, 2333, 19, 463, 2750, 4107, 606, 7476, 2981, 3256, 1087, 2084, 1323, # 4262 + 2652, 2982, 7477, 1631, 1623, 1750, 4108, 2682, 7478, 2864, 791, 2714, 2653, 2334, 232, 2416, # 4278 + 7479, 2983, 1498, 7480, 2654, 2620, 755, 1366, 3641, 3257, 3126, 2025, 1609, 119, 1917, 3427, # 4294 + 862, 1026, 4109, 7481, 3904, 3760, 4362, 3905, 4363, 2260, 1951, 2470, 7482, 1125, 817, 4110, # 4310 + 4111, 3906, 1513, 1766, 2040, 1487, 4112, 3030, 3258, 2824, 3761, 3127, 7483, 7484, 1507, 7485, # 4326 + 2683, 733, 40, 1632, 1106, 2865, 345, 4113, 841, 2524, 230, 4364, 2984, 1846, 3259, 3428, # 4342 + 7486, 1263, 986, 3429, 7487, 735, 879, 254, 1137, 857, 622, 1300, 1180, 1388, 1562, 3907, # 4358 + 3908, 2939, 967, 2751, 2655, 1349, 592, 2133, 1692, 3324, 2985, 1994, 4114, 1679, 3909, 1901, # 4374 + 2185, 7488, 739, 3642, 2715, 1296, 1290, 7489, 4115, 2198, 2199, 1921, 1563, 2595, 2551, 1870, # 4390 + 2752, 2986, 7490, 435, 7491, 343, 1108, 596, 17, 1751, 4365, 2235, 3430, 3643, 7492, 4366, # 4406 + 294, 3527, 2940, 1693, 477, 979, 281, 2041, 3528, 643, 2042, 3644, 2621, 2782, 2261, 1031, # 4422 + 2335, 2134, 2298, 3529, 4367, 367, 1249, 2552, 7493, 3530, 7494, 4368, 1283, 3325, 2004, 240, # 4438 + 1762, 3326, 4369, 4370, 836, 1069, 3128, 474, 7495, 2148, 2525, 268, 3531, 7496, 3188, 1521, # 4454 + 1284, 7497, 1658, 1546, 4116, 7498, 3532, 3533, 7499, 4117, 3327, 2684, 1685, 4118, 961, 1673, # 4470 + 2622, 190, 2005, 2200, 3762, 4371, 4372, 7500, 570, 2497, 3645, 1490, 7501, 4373, 2623, 3260, # 4486 + 1956, 4374, 584, 1514, 396, 1045, 1944, 7502, 4375, 1967, 2444, 7503, 7504, 4376, 3910, 619, # 4502 + 7505, 3129, 3261, 215, 2006, 2783, 2553, 3189, 4377, 3190, 4378, 763, 4119, 3763, 4379, 7506, # 4518 + 7507, 1957, 1767, 2941, 3328, 3646, 1174, 452, 1477, 4380, 3329, 3130, 7508, 2825, 1253, 2382, # 4534 + 2186, 1091, 2285, 4120, 492, 7509, 638, 1169, 1824, 2135, 1752, 3911, 648, 926, 1021, 1324, # 4550 + 4381, 520, 4382, 997, 847, 1007, 892, 4383, 3764, 2262, 1871, 3647, 7510, 2400, 1784, 4384, # 4566 + 1952, 2942, 3080, 3191, 1728, 4121, 2043, 3648, 4385, 2007, 1701, 3131, 1551, 30, 2263, 4122, # 4582 + 7511, 2026, 4386, 3534, 7512, 501, 7513, 4123, 594, 3431, 2165, 1821, 3535, 3432, 3536, 3192, # 4598 + 829, 2826, 4124, 7514, 1680, 3132, 1225, 4125, 7515, 3262, 4387, 4126, 3133, 2336, 7516, 4388, # 4614 + 4127, 7517, 3912, 3913, 7518, 1847, 2383, 2596, 3330, 7519, 4389, 374, 3914, 652, 4128, 4129, # 4630 + 375, 1140, 798, 7520, 7521, 7522, 2361, 4390, 2264, 546, 1659, 138, 3031, 2445, 4391, 7523, # 4646 + 2250, 612, 1848, 910, 796, 3765, 1740, 1371, 825, 3766, 3767, 7524, 2906, 2554, 7525, 692, # 4662 + 444, 3032, 2624, 801, 4392, 4130, 7526, 1491, 244, 1053, 3033, 4131, 4132, 340, 7527, 3915, # 4678 + 1041, 2987, 293, 1168, 87, 1357, 7528, 1539, 959, 7529, 2236, 721, 694, 4133, 3768, 219, # 4694 + 1478, 644, 1417, 3331, 2656, 1413, 1401, 1335, 1389, 3916, 7530, 7531, 2988, 2362, 3134, 1825, # 4710 + 730, 1515, 184, 2827, 66, 4393, 7532, 1660, 2943, 246, 3332, 378, 1457, 226, 3433, 975, # 4726 + 3917, 2944, 1264, 3537, 674, 696, 7533, 163, 7534, 1141, 2417, 2166, 713, 3538, 3333, 4394, # 4742 + 3918, 7535, 7536, 1186, 15, 7537, 1079, 1070, 7538, 1522, 3193, 3539, 276, 1050, 2716, 758, # 4758 + 1126, 653, 2945, 3263, 7539, 2337, 889, 3540, 3919, 3081, 2989, 903, 1250, 4395, 3920, 3434, # 4774 + 3541, 1342, 1681, 1718, 766, 3264, 286, 89, 2946, 3649, 7540, 1713, 7541, 2597, 3334, 2990, # 4790 + 7542, 2947, 2215, 3194, 2866, 7543, 4396, 2498, 2526, 181, 387, 1075, 3921, 731, 2187, 3335, # 4806 + 7544, 3265, 310, 313, 3435, 2299, 770, 4134, 54, 3034, 189, 4397, 3082, 3769, 3922, 7545, # 4822 + 1230, 1617, 1849, 355, 3542, 4135, 4398, 3336, 111, 4136, 3650, 1350, 3135, 3436, 3035, 4137, # 4838 + 2149, 3266, 3543, 7546, 2784, 3923, 3924, 2991, 722, 2008, 7547, 1071, 247, 1207, 2338, 2471, # 4854 + 1378, 4399, 2009, 864, 1437, 1214, 4400, 373, 3770, 1142, 2216, 667, 4401, 442, 2753, 2555, # 4870 + 3771, 3925, 1968, 4138, 3267, 1839, 837, 170, 1107, 934, 1336, 1882, 7548, 7549, 2118, 4139, # 4886 + 2828, 743, 1569, 7550, 4402, 4140, 582, 2384, 1418, 3437, 7551, 1802, 7552, 357, 1395, 1729, # 4902 + 3651, 3268, 2418, 1564, 2237, 7553, 3083, 3772, 1633, 4403, 1114, 2085, 4141, 1532, 7554, 482, # 4918 + 2446, 4404, 7555, 7556, 1492, 833, 1466, 7557, 2717, 3544, 1641, 2829, 7558, 1526, 1272, 3652, # 4934 + 4142, 1686, 1794, 416, 2556, 1902, 1953, 1803, 7559, 3773, 2785, 3774, 1159, 2316, 7560, 2867, # 4950 + 4405, 1610, 1584, 3036, 2419, 2754, 443, 3269, 1163, 3136, 7561, 7562, 3926, 7563, 4143, 2499, # 4966 + 3037, 4406, 3927, 3137, 2103, 1647, 3545, 2010, 1872, 4144, 7564, 4145, 431, 3438, 7565, 250, # 4982 + 97, 81, 4146, 7566, 1648, 1850, 1558, 160, 848, 7567, 866, 740, 1694, 7568, 2201, 2830, # 4998 + 3195, 4147, 4407, 3653, 1687, 950, 2472, 426, 469, 3196, 3654, 3655, 3928, 7569, 7570, 1188, # 5014 + 424, 1995, 861, 3546, 4148, 3775, 2202, 2685, 168, 1235, 3547, 4149, 7571, 2086, 1674, 4408, # 5030 + 3337, 3270, 220, 2557, 1009, 7572, 3776, 670, 2992, 332, 1208, 717, 7573, 7574, 3548, 2447, # 5046 + 3929, 3338, 7575, 513, 7576, 1209, 2868, 3339, 3138, 4409, 1080, 7577, 7578, 7579, 7580, 2527, # 5062 + 3656, 3549, 815, 1587, 3930, 3931, 7581, 3550, 3439, 3777, 1254, 4410, 1328, 3038, 1390, 3932, # 5078 + 1741, 3933, 3778, 3934, 7582, 236, 3779, 2448, 3271, 7583, 7584, 3657, 3780, 1273, 3781, 4411, # 5094 + 7585, 308, 7586, 4412, 245, 4413, 1851, 2473, 1307, 2575, 430, 715, 2136, 2449, 7587, 270, # 5110 + 199, 2869, 3935, 7588, 3551, 2718, 1753, 761, 1754, 725, 1661, 1840, 4414, 3440, 3658, 7589, # 5126 + 7590, 587, 14, 3272, 227, 2598, 326, 480, 2265, 943, 2755, 3552, 291, 650, 1883, 7591, # 5142 + 1702, 1226, 102, 1547, 62, 3441, 904, 4415, 3442, 1164, 4150, 7592, 7593, 1224, 1548, 2756, # 5158 + 391, 498, 1493, 7594, 1386, 1419, 7595, 2055, 1177, 4416, 813, 880, 1081, 2363, 566, 1145, # 5174 + 4417, 2286, 1001, 1035, 2558, 2599, 2238, 394, 1286, 7596, 7597, 2068, 7598, 86, 1494, 1730, # 5190 + 3936, 491, 1588, 745, 897, 2948, 843, 3340, 3937, 2757, 2870, 3273, 1768, 998, 2217, 2069, # 5206 + 397, 1826, 1195, 1969, 3659, 2993, 3341, 284, 7599, 3782, 2500, 2137, 2119, 1903, 7600, 3938, # 5222 + 2150, 3939, 4151, 1036, 3443, 1904, 114, 2559, 4152, 209, 1527, 7601, 7602, 2949, 2831, 2625, # 5238 + 2385, 2719, 3139, 812, 2560, 7603, 3274, 7604, 1559, 737, 1884, 3660, 1210, 885, 28, 2686, # 5254 + 3553, 3783, 7605, 4153, 1004, 1779, 4418, 7606, 346, 1981, 2218, 2687, 4419, 3784, 1742, 797, # 5270 + 1642, 3940, 1933, 1072, 1384, 2151, 896, 3941, 3275, 3661, 3197, 2871, 3554, 7607, 2561, 1958, # 5286 + 4420, 2450, 1785, 7608, 7609, 7610, 3942, 4154, 1005, 1308, 3662, 4155, 2720, 4421, 4422, 1528, # 5302 + 2600, 161, 1178, 4156, 1982, 987, 4423, 1101, 4157, 631, 3943, 1157, 3198, 2420, 1343, 1241, # 5318 + 1016, 2239, 2562, 372, 877, 2339, 2501, 1160, 555, 1934, 911, 3944, 7611, 466, 1170, 169, # 5334 + 1051, 2907, 2688, 3663, 2474, 2994, 1182, 2011, 2563, 1251, 2626, 7612, 992, 2340, 3444, 1540, # 5350 + 2721, 1201, 2070, 2401, 1996, 2475, 7613, 4424, 528, 1922, 2188, 1503, 1873, 1570, 2364, 3342, # 5366 + 3276, 7614, 557, 1073, 7615, 1827, 3445, 2087, 2266, 3140, 3039, 3084, 767, 3085, 2786, 4425, # 5382 + 1006, 4158, 4426, 2341, 1267, 2176, 3664, 3199, 778, 3945, 3200, 2722, 1597, 2657, 7616, 4427, # 5398 + 7617, 3446, 7618, 7619, 7620, 3277, 2689, 1433, 3278, 131, 95, 1504, 3946, 723, 4159, 3141, # 5414 + 1841, 3555, 2758, 2189, 3947, 2027, 2104, 3665, 7621, 2995, 3948, 1218, 7622, 3343, 3201, 3949, # 5430 + 4160, 2576, 248, 1634, 3785, 912, 7623, 2832, 3666, 3040, 3786, 654, 53, 7624, 2996, 7625, # 5446 + 1688, 4428, 777, 3447, 1032, 3950, 1425, 7626, 191, 820, 2120, 2833, 971, 4429, 931, 3202, # 5462 + 135, 664, 783, 3787, 1997, 772, 2908, 1935, 3951, 3788, 4430, 2909, 3203, 282, 2723, 640, # 5478 + 1372, 3448, 1127, 922, 325, 3344, 7627, 7628, 711, 2044, 7629, 7630, 3952, 2219, 2787, 1936, # 5494 + 3953, 3345, 2220, 2251, 3789, 2300, 7631, 4431, 3790, 1258, 3279, 3954, 3204, 2138, 2950, 3955, # 5510 + 3956, 7632, 2221, 258, 3205, 4432, 101, 1227, 7633, 3280, 1755, 7634, 1391, 3281, 7635, 2910, # 5526 + 2056, 893, 7636, 7637, 7638, 1402, 4161, 2342, 7639, 7640, 3206, 3556, 7641, 7642, 878, 1325, # 5542 + 1780, 2788, 4433, 259, 1385, 2577, 744, 1183, 2267, 4434, 7643, 3957, 2502, 7644, 684, 1024, # 5558 + 4162, 7645, 472, 3557, 3449, 1165, 3282, 3958, 3959, 322, 2152, 881, 455, 1695, 1152, 1340, # 5574 + 660, 554, 2153, 4435, 1058, 4436, 4163, 830, 1065, 3346, 3960, 4437, 1923, 7646, 1703, 1918, # 5590 + 7647, 932, 2268, 122, 7648, 4438, 947, 677, 7649, 3791, 2627, 297, 1905, 1924, 2269, 4439, # 5606 + 2317, 3283, 7650, 7651, 4164, 7652, 4165, 84, 4166, 112, 989, 7653, 547, 1059, 3961, 701, # 5622 + 3558, 1019, 7654, 4167, 7655, 3450, 942, 639, 457, 2301, 2451, 993, 2951, 407, 851, 494, # 5638 + 4440, 3347, 927, 7656, 1237, 7657, 2421, 3348, 573, 4168, 680, 921, 2911, 1279, 1874, 285, # 5654 + 790, 1448, 1983, 719, 2167, 7658, 7659, 4441, 3962, 3963, 1649, 7660, 1541, 563, 7661, 1077, # 5670 + 7662, 3349, 3041, 3451, 511, 2997, 3964, 3965, 3667, 3966, 1268, 2564, 3350, 3207, 4442, 4443, # 5686 + 7663, 535, 1048, 1276, 1189, 2912, 2028, 3142, 1438, 1373, 2834, 2952, 1134, 2012, 7664, 4169, # 5702 + 1238, 2578, 3086, 1259, 7665, 700, 7666, 2953, 3143, 3668, 4170, 7667, 4171, 1146, 1875, 1906, # 5718 + 4444, 2601, 3967, 781, 2422, 132, 1589, 203, 147, 273, 2789, 2402, 898, 1786, 2154, 3968, # 5734 + 3969, 7668, 3792, 2790, 7669, 7670, 4445, 4446, 7671, 3208, 7672, 1635, 3793, 965, 7673, 1804, # 5750 + 2690, 1516, 3559, 1121, 1082, 1329, 3284, 3970, 1449, 3794, 65, 1128, 2835, 2913, 2759, 1590, # 5766 + 3795, 7674, 7675, 12, 2658, 45, 976, 2579, 3144, 4447, 517, 2528, 1013, 1037, 3209, 7676, # 5782 + 3796, 2836, 7677, 3797, 7678, 3452, 7679, 2602, 614, 1998, 2318, 3798, 3087, 2724, 2628, 7680, # 5798 + 2580, 4172, 599, 1269, 7681, 1810, 3669, 7682, 2691, 3088, 759, 1060, 489, 1805, 3351, 3285, # 5814 + 1358, 7683, 7684, 2386, 1387, 1215, 2629, 2252, 490, 7685, 7686, 4173, 1759, 2387, 2343, 7687, # 5830 + 4448, 3799, 1907, 3971, 2630, 1806, 3210, 4449, 3453, 3286, 2760, 2344, 874, 7688, 7689, 3454, # 5846 + 3670, 1858, 91, 2914, 3671, 3042, 3800, 4450, 7690, 3145, 3972, 2659, 7691, 3455, 1202, 1403, # 5862 + 3801, 2954, 2529, 1517, 2503, 4451, 3456, 2504, 7692, 4452, 7693, 2692, 1885, 1495, 1731, 3973, # 5878 + 2365, 4453, 7694, 2029, 7695, 7696, 3974, 2693, 1216, 237, 2581, 4174, 2319, 3975, 3802, 4454, # 5894 + 4455, 2694, 3560, 3457, 445, 4456, 7697, 7698, 7699, 7700, 2761, 61, 3976, 3672, 1822, 3977, # 5910 + 7701, 687, 2045, 935, 925, 405, 2660, 703, 1096, 1859, 2725, 4457, 3978, 1876, 1367, 2695, # 5926 + 3352, 918, 2105, 1781, 2476, 334, 3287, 1611, 1093, 4458, 564, 3146, 3458, 3673, 3353, 945, # 5942 + 2631, 2057, 4459, 7702, 1925, 872, 4175, 7703, 3459, 2696, 3089, 349, 4176, 3674, 3979, 4460, # 5958 + 3803, 4177, 3675, 2155, 3980, 4461, 4462, 4178, 4463, 2403, 2046, 782, 3981, 400, 251, 4179, # 5974 + 1624, 7704, 7705, 277, 3676, 299, 1265, 476, 1191, 3804, 2121, 4180, 4181, 1109, 205, 7706, # 5990 + 2582, 1000, 2156, 3561, 1860, 7707, 7708, 7709, 4464, 7710, 4465, 2565, 107, 2477, 2157, 3982, # 6006 + 3460, 3147, 7711, 1533, 541, 1301, 158, 753, 4182, 2872, 3562, 7712, 1696, 370, 1088, 4183, # 6022 + 4466, 3563, 579, 327, 440, 162, 2240, 269, 1937, 1374, 3461, 968, 3043, 56, 1396, 3090, # 6038 + 2106, 3288, 3354, 7713, 1926, 2158, 4467, 2998, 7714, 3564, 7715, 7716, 3677, 4468, 2478, 7717, # 6054 + 2791, 7718, 1650, 4469, 7719, 2603, 7720, 7721, 3983, 2661, 3355, 1149, 3356, 3984, 3805, 3985, # 6070 + 7722, 1076, 49, 7723, 951, 3211, 3289, 3290, 450, 2837, 920, 7724, 1811, 2792, 2366, 4184, # 6086 + 1908, 1138, 2367, 3806, 3462, 7725, 3212, 4470, 1909, 1147, 1518, 2423, 4471, 3807, 7726, 4472, # 6102 + 2388, 2604, 260, 1795, 3213, 7727, 7728, 3808, 3291, 708, 7729, 3565, 1704, 7730, 3566, 1351, # 6118 + 1618, 3357, 2999, 1886, 944, 4185, 3358, 4186, 3044, 3359, 4187, 7731, 3678, 422, 413, 1714, # 6134 + 3292, 500, 2058, 2345, 4188, 2479, 7732, 1344, 1910, 954, 7733, 1668, 7734, 7735, 3986, 2404, # 6150 + 4189, 3567, 3809, 4190, 7736, 2302, 1318, 2505, 3091, 133, 3092, 2873, 4473, 629, 31, 2838, # 6166 + 2697, 3810, 4474, 850, 949, 4475, 3987, 2955, 1732, 2088, 4191, 1496, 1852, 7737, 3988, 620, # 6182 + 3214, 981, 1242, 3679, 3360, 1619, 3680, 1643, 3293, 2139, 2452, 1970, 1719, 3463, 2168, 7738, # 6198 + 3215, 7739, 7740, 3361, 1828, 7741, 1277, 4476, 1565, 2047, 7742, 1636, 3568, 3093, 7743, 869, # 6214 + 2839, 655, 3811, 3812, 3094, 3989, 3000, 3813, 1310, 3569, 4477, 7744, 7745, 7746, 1733, 558, # 6230 + 4478, 3681, 335, 1549, 3045, 1756, 4192, 3682, 1945, 3464, 1829, 1291, 1192, 470, 2726, 2107, # 6246 + 2793, 913, 1054, 3990, 7747, 1027, 7748, 3046, 3991, 4479, 982, 2662, 3362, 3148, 3465, 3216, # 6262 + 3217, 1946, 2794, 7749, 571, 4480, 7750, 1830, 7751, 3570, 2583, 1523, 2424, 7752, 2089, 984, # 6278 + 4481, 3683, 1959, 7753, 3684, 852, 923, 2795, 3466, 3685, 969, 1519, 999, 2048, 2320, 1705, # 6294 + 7754, 3095, 615, 1662, 151, 597, 3992, 2405, 2321, 1049, 275, 4482, 3686, 4193, 568, 3687, # 6310 + 3571, 2480, 4194, 3688, 7755, 2425, 2270, 409, 3218, 7756, 1566, 2874, 3467, 1002, 769, 2840, # 6326 + 194, 2090, 3149, 3689, 2222, 3294, 4195, 628, 1505, 7757, 7758, 1763, 2177, 3001, 3993, 521, # 6342 + 1161, 2584, 1787, 2203, 2406, 4483, 3994, 1625, 4196, 4197, 412, 42, 3096, 464, 7759, 2632, # 6358 + 4484, 3363, 1760, 1571, 2875, 3468, 2530, 1219, 2204, 3814, 2633, 2140, 2368, 4485, 4486, 3295, # 6374 + 1651, 3364, 3572, 7760, 7761, 3573, 2481, 3469, 7762, 3690, 7763, 7764, 2271, 2091, 460, 7765, # 6390 + 4487, 7766, 3002, 962, 588, 3574, 289, 3219, 2634, 1116, 52, 7767, 3047, 1796, 7768, 7769, # 6406 + 7770, 1467, 7771, 1598, 1143, 3691, 4198, 1984, 1734, 1067, 4488, 1280, 3365, 465, 4489, 1572, # 6422 + 510, 7772, 1927, 2241, 1812, 1644, 3575, 7773, 4490, 3692, 7774, 7775, 2663, 1573, 1534, 7776, # 6438 + 7777, 4199, 536, 1807, 1761, 3470, 3815, 3150, 2635, 7778, 7779, 7780, 4491, 3471, 2915, 1911, # 6454 + 2796, 7781, 3296, 1122, 377, 3220, 7782, 360, 7783, 7784, 4200, 1529, 551, 7785, 2059, 3693, # 6470 + 1769, 2426, 7786, 2916, 4201, 3297, 3097, 2322, 2108, 2030, 4492, 1404, 136, 1468, 1479, 672, # 6486 + 1171, 3221, 2303, 271, 3151, 7787, 2762, 7788, 2049, 678, 2727, 865, 1947, 4493, 7789, 2013, # 6502 + 3995, 2956, 7790, 2728, 2223, 1397, 3048, 3694, 4494, 4495, 1735, 2917, 3366, 3576, 7791, 3816, # 6518 + 509, 2841, 2453, 2876, 3817, 7792, 7793, 3152, 3153, 4496, 4202, 2531, 4497, 2304, 1166, 1010, # 6534 + 552, 681, 1887, 7794, 7795, 2957, 2958, 3996, 1287, 1596, 1861, 3154, 358, 453, 736, 175, # 6550 + 478, 1117, 905, 1167, 1097, 7796, 1853, 1530, 7797, 1706, 7798, 2178, 3472, 2287, 3695, 3473, # 6566 + 3577, 4203, 2092, 4204, 7799, 3367, 1193, 2482, 4205, 1458, 2190, 2205, 1862, 1888, 1421, 3298, # 6582 + 2918, 3049, 2179, 3474, 595, 2122, 7800, 3997, 7801, 7802, 4206, 1707, 2636, 223, 3696, 1359, # 6598 + 751, 3098, 183, 3475, 7803, 2797, 3003, 419, 2369, 633, 704, 3818, 2389, 241, 7804, 7805, # 6614 + 7806, 838, 3004, 3697, 2272, 2763, 2454, 3819, 1938, 2050, 3998, 1309, 3099, 2242, 1181, 7807, # 6630 + 1136, 2206, 3820, 2370, 1446, 4207, 2305, 4498, 7808, 7809, 4208, 1055, 2605, 484, 3698, 7810, # 6646 + 3999, 625, 4209, 2273, 3368, 1499, 4210, 4000, 7811, 4001, 4211, 3222, 2274, 2275, 3476, 7812, # 6662 + 7813, 2764, 808, 2606, 3699, 3369, 4002, 4212, 3100, 2532, 526, 3370, 3821, 4213, 955, 7814, # 6678 + 1620, 4214, 2637, 2427, 7815, 1429, 3700, 1669, 1831, 994, 928, 7816, 3578, 1260, 7817, 7818, # 6694 + 7819, 1948, 2288, 741, 2919, 1626, 4215, 2729, 2455, 867, 1184, 362, 3371, 1392, 7820, 7821, # 6710 + 4003, 4216, 1770, 1736, 3223, 2920, 4499, 4500, 1928, 2698, 1459, 1158, 7822, 3050, 3372, 2877, # 6726 + 1292, 1929, 2506, 2842, 3701, 1985, 1187, 2071, 2014, 2607, 4217, 7823, 2566, 2507, 2169, 3702, # 6742 + 2483, 3299, 7824, 3703, 4501, 7825, 7826, 666, 1003, 3005, 1022, 3579, 4218, 7827, 4502, 1813, # 6758 + 2253, 574, 3822, 1603, 295, 1535, 705, 3823, 4219, 283, 858, 417, 7828, 7829, 3224, 4503, # 6774 + 4504, 3051, 1220, 1889, 1046, 2276, 2456, 4004, 1393, 1599, 689, 2567, 388, 4220, 7830, 2484, # 6790 + 802, 7831, 2798, 3824, 2060, 1405, 2254, 7832, 4505, 3825, 2109, 1052, 1345, 3225, 1585, 7833, # 6806 + 809, 7834, 7835, 7836, 575, 2730, 3477, 956, 1552, 1469, 1144, 2323, 7837, 2324, 1560, 2457, # 6822 + 3580, 3226, 4005, 616, 2207, 3155, 2180, 2289, 7838, 1832, 7839, 3478, 4506, 7840, 1319, 3704, # 6838 + 3705, 1211, 3581, 1023, 3227, 1293, 2799, 7841, 7842, 7843, 3826, 607, 2306, 3827, 762, 2878, # 6854 + 1439, 4221, 1360, 7844, 1485, 3052, 7845, 4507, 1038, 4222, 1450, 2061, 2638, 4223, 1379, 4508, # 6870 + 2585, 7846, 7847, 4224, 1352, 1414, 2325, 2921, 1172, 7848, 7849, 3828, 3829, 7850, 1797, 1451, # 6886 + 7851, 7852, 7853, 7854, 2922, 4006, 4007, 2485, 2346, 411, 4008, 4009, 3582, 3300, 3101, 4509, # 6902 + 1561, 2664, 1452, 4010, 1375, 7855, 7856, 47, 2959, 316, 7857, 1406, 1591, 2923, 3156, 7858, # 6918 + 1025, 2141, 3102, 3157, 354, 2731, 884, 2224, 4225, 2407, 508, 3706, 726, 3583, 996, 2428, # 6934 + 3584, 729, 7859, 392, 2191, 1453, 4011, 4510, 3707, 7860, 7861, 2458, 3585, 2608, 1675, 2800, # 6950 + 919, 2347, 2960, 2348, 1270, 4511, 4012, 73, 7862, 7863, 647, 7864, 3228, 2843, 2255, 1550, # 6966 + 1346, 3006, 7865, 1332, 883, 3479, 7866, 7867, 7868, 7869, 3301, 2765, 7870, 1212, 831, 1347, # 6982 + 4226, 4512, 2326, 3830, 1863, 3053, 720, 3831, 4513, 4514, 3832, 7871, 4227, 7872, 7873, 4515, # 6998 + 7874, 7875, 1798, 4516, 3708, 2609, 4517, 3586, 1645, 2371, 7876, 7877, 2924, 669, 2208, 2665, # 7014 + 2429, 7878, 2879, 7879, 7880, 1028, 3229, 7881, 4228, 2408, 7882, 2256, 1353, 7883, 7884, 4518, # 7030 + 3158, 518, 7885, 4013, 7886, 4229, 1960, 7887, 2142, 4230, 7888, 7889, 3007, 2349, 2350, 3833, # 7046 + 516, 1833, 1454, 4014, 2699, 4231, 4519, 2225, 2610, 1971, 1129, 3587, 7890, 2766, 7891, 2961, # 7062 + 1422, 577, 1470, 3008, 1524, 3373, 7892, 7893, 432, 4232, 3054, 3480, 7894, 2586, 1455, 2508, # 7078 + 2226, 1972, 1175, 7895, 1020, 2732, 4015, 3481, 4520, 7896, 2733, 7897, 1743, 1361, 3055, 3482, # 7094 + 2639, 4016, 4233, 4521, 2290, 895, 924, 4234, 2170, 331, 2243, 3056, 166, 1627, 3057, 1098, # 7110 + 7898, 1232, 2880, 2227, 3374, 4522, 657, 403, 1196, 2372, 542, 3709, 3375, 1600, 4235, 3483, # 7126 + 7899, 4523, 2767, 3230, 576, 530, 1362, 7900, 4524, 2533, 2666, 3710, 4017, 7901, 842, 3834, # 7142 + 7902, 2801, 2031, 1014, 4018, 213, 2700, 3376, 665, 621, 4236, 7903, 3711, 2925, 2430, 7904, # 7158 + 2431, 3302, 3588, 3377, 7905, 4237, 2534, 4238, 4525, 3589, 1682, 4239, 3484, 1380, 7906, 724, # 7174 + 2277, 600, 1670, 7907, 1337, 1233, 4526, 3103, 2244, 7908, 1621, 4527, 7909, 651, 4240, 7910, # 7190 + 1612, 4241, 2611, 7911, 2844, 7912, 2734, 2307, 3058, 7913, 716, 2459, 3059, 174, 1255, 2701, # 7206 + 4019, 3590, 548, 1320, 1398, 728, 4020, 1574, 7914, 1890, 1197, 3060, 4021, 7915, 3061, 3062, # 7222 + 3712, 3591, 3713, 747, 7916, 635, 4242, 4528, 7917, 7918, 7919, 4243, 7920, 7921, 4529, 7922, # 7238 + 3378, 4530, 2432, 451, 7923, 3714, 2535, 2072, 4244, 2735, 4245, 4022, 7924, 1764, 4531, 7925, # 7254 + 4246, 350, 7926, 2278, 2390, 2486, 7927, 4247, 4023, 2245, 1434, 4024, 488, 4532, 458, 4248, # 7270 + 4025, 3715, 771, 1330, 2391, 3835, 2568, 3159, 2159, 2409, 1553, 2667, 3160, 4249, 7928, 2487, # 7286 + 2881, 2612, 1720, 2702, 4250, 3379, 4533, 7929, 2536, 4251, 7930, 3231, 4252, 2768, 7931, 2015, # 7302 + 2736, 7932, 1155, 1017, 3716, 3836, 7933, 3303, 2308, 201, 1864, 4253, 1430, 7934, 4026, 7935, # 7318 + 7936, 7937, 7938, 7939, 4254, 1604, 7940, 414, 1865, 371, 2587, 4534, 4535, 3485, 2016, 3104, # 7334 + 4536, 1708, 960, 4255, 887, 389, 2171, 1536, 1663, 1721, 7941, 2228, 4027, 2351, 2926, 1580, # 7350 + 7942, 7943, 7944, 1744, 7945, 2537, 4537, 4538, 7946, 4539, 7947, 2073, 7948, 7949, 3592, 3380, # 7366 + 2882, 4256, 7950, 4257, 2640, 3381, 2802, 673, 2703, 2460, 709, 3486, 4028, 3593, 4258, 7951, # 7382 + 1148, 502, 634, 7952, 7953, 1204, 4540, 3594, 1575, 4541, 2613, 3717, 7954, 3718, 3105, 948, # 7398 + 3232, 121, 1745, 3837, 1110, 7955, 4259, 3063, 2509, 3009, 4029, 3719, 1151, 1771, 3838, 1488, # 7414 + 4030, 1986, 7956, 2433, 3487, 7957, 7958, 2093, 7959, 4260, 3839, 1213, 1407, 2803, 531, 2737, # 7430 + 2538, 3233, 1011, 1537, 7960, 2769, 4261, 3106, 1061, 7961, 3720, 3721, 1866, 2883, 7962, 2017, # 7446 + 120, 4262, 4263, 2062, 3595, 3234, 2309, 3840, 2668, 3382, 1954, 4542, 7963, 7964, 3488, 1047, # 7462 + 2704, 1266, 7965, 1368, 4543, 2845, 649, 3383, 3841, 2539, 2738, 1102, 2846, 2669, 7966, 7967, # 7478 + 1999, 7968, 1111, 3596, 2962, 7969, 2488, 3842, 3597, 2804, 1854, 3384, 3722, 7970, 7971, 3385, # 7494 + 2410, 2884, 3304, 3235, 3598, 7972, 2569, 7973, 3599, 2805, 4031, 1460, 856, 7974, 3600, 7975, # 7510 + 2885, 2963, 7976, 2886, 3843, 7977, 4264, 632, 2510, 875, 3844, 1697, 3845, 2291, 7978, 7979, # 7526 + 4544, 3010, 1239, 580, 4545, 4265, 7980, 914, 936, 2074, 1190, 4032, 1039, 2123, 7981, 7982, # 7542 + 7983, 3386, 1473, 7984, 1354, 4266, 3846, 7985, 2172, 3064, 4033, 915, 3305, 4267, 4268, 3306, # 7558 + 1605, 1834, 7986, 2739, 398, 3601, 4269, 3847, 4034, 328, 1912, 2847, 4035, 3848, 1331, 4270, # 7574 + 3011, 937, 4271, 7987, 3602, 4036, 4037, 3387, 2160, 4546, 3388, 524, 742, 538, 3065, 1012, # 7590 + 7988, 7989, 3849, 2461, 7990, 658, 1103, 225, 3850, 7991, 7992, 4547, 7993, 4548, 7994, 3236, # 7606 + 1243, 7995, 4038, 963, 2246, 4549, 7996, 2705, 3603, 3161, 7997, 7998, 2588, 2327, 7999, 4550, # 7622 + 8000, 8001, 8002, 3489, 3307, 957, 3389, 2540, 2032, 1930, 2927, 2462, 870, 2018, 3604, 1746, # 7638 + 2770, 2771, 2434, 2463, 8003, 3851, 8004, 3723, 3107, 3724, 3490, 3390, 3725, 8005, 1179, 3066, # 7654 + 8006, 3162, 2373, 4272, 3726, 2541, 3163, 3108, 2740, 4039, 8007, 3391, 1556, 2542, 2292, 977, # 7670 + 2887, 2033, 4040, 1205, 3392, 8008, 1765, 3393, 3164, 2124, 1271, 1689, 714, 4551, 3491, 8009, # 7686 + 2328, 3852, 533, 4273, 3605, 2181, 617, 8010, 2464, 3308, 3492, 2310, 8011, 8012, 3165, 8013, # 7702 + 8014, 3853, 1987, 618, 427, 2641, 3493, 3394, 8015, 8016, 1244, 1690, 8017, 2806, 4274, 4552, # 7718 + 8018, 3494, 8019, 8020, 2279, 1576, 473, 3606, 4275, 3395, 972, 8021, 3607, 8022, 3067, 8023, # 7734 + 8024, 4553, 4554, 8025, 3727, 4041, 4042, 8026, 153, 4555, 356, 8027, 1891, 2888, 4276, 2143, # 7750 + 408, 803, 2352, 8028, 3854, 8029, 4277, 1646, 2570, 2511, 4556, 4557, 3855, 8030, 3856, 4278, # 7766 + 8031, 2411, 3396, 752, 8032, 8033, 1961, 2964, 8034, 746, 3012, 2465, 8035, 4279, 3728, 698, # 7782 + 4558, 1892, 4280, 3608, 2543, 4559, 3609, 3857, 8036, 3166, 3397, 8037, 1823, 1302, 4043, 2706, # 7798 + 3858, 1973, 4281, 8038, 4282, 3167, 823, 1303, 1288, 1236, 2848, 3495, 4044, 3398, 774, 3859, # 7814 + 8039, 1581, 4560, 1304, 2849, 3860, 4561, 8040, 2435, 2161, 1083, 3237, 4283, 4045, 4284, 344, # 7830 + 1173, 288, 2311, 454, 1683, 8041, 8042, 1461, 4562, 4046, 2589, 8043, 8044, 4563, 985, 894, # 7846 + 8045, 3399, 3168, 8046, 1913, 2928, 3729, 1988, 8047, 2110, 1974, 8048, 4047, 8049, 2571, 1194, # 7862 + 425, 8050, 4564, 3169, 1245, 3730, 4285, 8051, 8052, 2850, 8053, 636, 4565, 1855, 3861, 760, # 7878 + 1799, 8054, 4286, 2209, 1508, 4566, 4048, 1893, 1684, 2293, 8055, 8056, 8057, 4287, 4288, 2210, # 7894 + 479, 8058, 8059, 832, 8060, 4049, 2489, 8061, 2965, 2490, 3731, 990, 3109, 627, 1814, 2642, # 7910 + 4289, 1582, 4290, 2125, 2111, 3496, 4567, 8062, 799, 4291, 3170, 8063, 4568, 2112, 1737, 3013, # 7926 + 1018, 543, 754, 4292, 3309, 1676, 4569, 4570, 4050, 8064, 1489, 8065, 3497, 8066, 2614, 2889, # 7942 + 4051, 8067, 8068, 2966, 8069, 8070, 8071, 8072, 3171, 4571, 4572, 2182, 1722, 8073, 3238, 3239, # 7958 + 1842, 3610, 1715, 481, 365, 1975, 1856, 8074, 8075, 1962, 2491, 4573, 8076, 2126, 3611, 3240, # 7974 + 433, 1894, 2063, 2075, 8077, 602, 2741, 8078, 8079, 8080, 8081, 8082, 3014, 1628, 3400, 8083, # 7990 + 3172, 4574, 4052, 2890, 4575, 2512, 8084, 2544, 2772, 8085, 8086, 8087, 3310, 4576, 2891, 8088, # 8006 + 4577, 8089, 2851, 4578, 4579, 1221, 2967, 4053, 2513, 8090, 8091, 8092, 1867, 1989, 8093, 8094, # 8022 + 8095, 1895, 8096, 8097, 4580, 1896, 4054, 318, 8098, 2094, 4055, 4293, 8099, 8100, 485, 8101, # 8038 + 938, 3862, 553, 2670, 116, 8102, 3863, 3612, 8103, 3498, 2671, 2773, 3401, 3311, 2807, 8104, # 8054 + 3613, 2929, 4056, 1747, 2930, 2968, 8105, 8106, 207, 8107, 8108, 2672, 4581, 2514, 8109, 3015, # 8070 + 890, 3614, 3864, 8110, 1877, 3732, 3402, 8111, 2183, 2353, 3403, 1652, 8112, 8113, 8114, 941, # 8086 + 2294, 208, 3499, 4057, 2019, 330, 4294, 3865, 2892, 2492, 3733, 4295, 8115, 8116, 8117, 8118, # 8102 ) - +# fmt: on diff --git a/lib/chardet/euctwprober.py b/lib/chardet/euctwprober.py index 7dbc136e..204857b7 100644 --- a/lib/chardet/euctwprober.py +++ b/lib/chardet/euctwprober.py @@ -25,22 +25,23 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from .mbcharsetprober import MultiByteCharSetProber -from .codingstatemachine import CodingStateMachine from .chardistribution import EUCTWDistributionAnalysis +from .codingstatemachine import CodingStateMachine +from .mbcharsetprober import MultiByteCharSetProber from .mbcssm import EUCTW_SM_MODEL + class EUCTWProber(MultiByteCharSetProber): - def __init__(self): - super(EUCTWProber, self).__init__() + def __init__(self) -> None: + super().__init__() self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL) self.distribution_analyzer = EUCTWDistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "EUC-TW" @property - def language(self): + def language(self) -> str: return "Taiwan" diff --git a/lib/chardet/gb2312freq.py b/lib/chardet/gb2312freq.py index a0167b38..3e0b5080 100644 --- a/lib/chardet/gb2312freq.py +++ b/lib/chardet/gb2312freq.py @@ -43,6 +43,7 @@ GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9 GB2312_TABLE_SIZE = 3760 +# fmt: off GB2312_CHAR_TO_FREQ_ORDER = ( 1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205, 2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842, @@ -280,4 +281,4 @@ GB2312_CHAR_TO_FREQ_ORDER = ( 381,1638,4592,1020, 516,3214, 458, 947,4575,1432, 211,1514,2926,1865,2142, 189, 852,1221,1400,1486, 882,2299,4036, 351, 28,1122, 700,6479,6480,6481,6482,6483, #last 512 ) - +# fmt: on diff --git a/lib/chardet/gb2312prober.py b/lib/chardet/gb2312prober.py index 7cae6b51..3731b5d7 100644 --- a/lib/chardet/gb2312prober.py +++ b/lib/chardet/gb2312prober.py @@ -25,22 +25,23 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from .mbcharsetprober import MultiByteCharSetProber -from .codingstatemachine import CodingStateMachine from .chardistribution import GB2312DistributionAnalysis +from .codingstatemachine import CodingStateMachine +from .mbcharsetprober import MultiByteCharSetProber from .mbcssm import GB2312_SM_MODEL + class GB2312Prober(MultiByteCharSetProber): - def __init__(self): - super(GB2312Prober, self).__init__() + def __init__(self) -> None: + super().__init__() self.coding_sm = CodingStateMachine(GB2312_SM_MODEL) self.distribution_analyzer = GB2312DistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "GB2312" @property - def language(self): + def language(self) -> str: return "Chinese" diff --git a/lib/chardet/hebrewprober.py b/lib/chardet/hebrewprober.py index 10b81224..763c23ed 100644 --- a/lib/chardet/hebrewprober.py +++ b/lib/chardet/hebrewprober.py @@ -25,8 +25,11 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Optional, Union + from .charsetprober import CharSetProber from .enums import ProbingState +from .sbcharsetprober import SingleByteCharSetProber # This prober doesn't actually recognize a language or a charset. # It is a helper prober for the use of the Hebrew model probers @@ -125,18 +128,20 @@ from .enums import ProbingState # model probers scores. The answer is returned in the form of the name of the # charset identified, either "windows-1255" or "ISO-8859-8". + class HebrewProber(CharSetProber): + SPACE = 0x20 # windows-1255 / ISO-8859-8 code points of interest - FINAL_KAF = 0xea - NORMAL_KAF = 0xeb - FINAL_MEM = 0xed - NORMAL_MEM = 0xee - FINAL_NUN = 0xef - NORMAL_NUN = 0xf0 - FINAL_PE = 0xf3 - NORMAL_PE = 0xf4 - FINAL_TSADI = 0xf5 - NORMAL_TSADI = 0xf6 + FINAL_KAF = 0xEA + NORMAL_KAF = 0xEB + FINAL_MEM = 0xED + NORMAL_MEM = 0xEE + FINAL_NUN = 0xEF + NORMAL_NUN = 0xF0 + FINAL_PE = 0xF3 + NORMAL_PE = 0xF4 + FINAL_TSADI = 0xF5 + NORMAL_TSADI = 0xF6 # Minimum Visual vs Logical final letter score difference. # If the difference is below this, don't rely solely on the final letter score @@ -151,35 +156,44 @@ class HebrewProber(CharSetProber): VISUAL_HEBREW_NAME = "ISO-8859-8" LOGICAL_HEBREW_NAME = "windows-1255" - def __init__(self): - super(HebrewProber, self).__init__() - self._final_char_logical_score = None - self._final_char_visual_score = None - self._prev = None - self._before_prev = None - self._logical_prober = None - self._visual_prober = None + def __init__(self) -> None: + super().__init__() + self._final_char_logical_score = 0 + self._final_char_visual_score = 0 + self._prev = self.SPACE + self._before_prev = self.SPACE + self._logical_prober: Optional[SingleByteCharSetProber] = None + self._visual_prober: Optional[SingleByteCharSetProber] = None self.reset() - def reset(self): + def reset(self) -> None: self._final_char_logical_score = 0 self._final_char_visual_score = 0 # The two last characters seen in the previous buffer, # mPrev and mBeforePrev are initialized to space in order to simulate # a word delimiter at the beginning of the data - self._prev = ' ' - self._before_prev = ' ' + self._prev = self.SPACE + self._before_prev = self.SPACE # These probers are owned by the group prober. - def set_model_probers(self, logicalProber, visualProber): - self._logical_prober = logicalProber - self._visual_prober = visualProber + def set_model_probers( + self, + logical_prober: SingleByteCharSetProber, + visual_prober: SingleByteCharSetProber, + ) -> None: + self._logical_prober = logical_prober + self._visual_prober = visual_prober - def is_final(self, c): - return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN, - self.FINAL_PE, self.FINAL_TSADI] + def is_final(self, c: int) -> bool: + return c in [ + self.FINAL_KAF, + self.FINAL_MEM, + self.FINAL_NUN, + self.FINAL_PE, + self.FINAL_TSADI, + ] - def is_non_final(self, c): + def is_non_final(self, c: int) -> bool: # The normal Tsadi is not a good Non-Final letter due to words like # 'lechotet' (to chat) containing an apostrophe after the tsadi. This # apostrophe is converted to a space in FilterWithoutEnglishLetters @@ -190,10 +204,9 @@ class HebrewProber(CharSetProber): # for example legally end with a Non-Final Pe or Kaf. However, the # benefit of these letters as Non-Final letters outweighs the damage # since these words are quite rare. - return c in [self.NORMAL_KAF, self.NORMAL_MEM, - self.NORMAL_NUN, self.NORMAL_PE] + return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE] - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: # Final letter analysis for logical-visual decision. # Look for evidence that the received buffer is either logical Hebrew # or visual Hebrew. @@ -227,9 +240,9 @@ class HebrewProber(CharSetProber): byte_str = self.filter_high_byte_only(byte_str) for cur in byte_str: - if cur == ' ': + if cur == self.SPACE: # We stand on a space - a word just ended - if self._before_prev != ' ': + if self._before_prev != self.SPACE: # next-to-last char was not a space so self._prev is not a # 1 letter word if self.is_final(self._prev): @@ -241,8 +254,11 @@ class HebrewProber(CharSetProber): self._final_char_visual_score += 1 else: # Not standing on a space - if ((self._before_prev == ' ') and - (self.is_final(self._prev)) and (cur != ' ')): + if ( + (self._before_prev == self.SPACE) + and (self.is_final(self._prev)) + and (cur != self.SPACE) + ): # case (3) [-2:space][-1:final letter][cur:not space] self._final_char_visual_score += 1 self._before_prev = self._prev @@ -253,7 +269,10 @@ class HebrewProber(CharSetProber): return ProbingState.DETECTING @property - def charset_name(self): + def charset_name(self) -> str: + assert self._logical_prober is not None + assert self._visual_prober is not None + # Make the decision: is it Logical or Visual? # If the final letter score distance is dominant enough, rely on it. finalsub = self._final_char_logical_score - self._final_char_visual_score @@ -263,8 +282,9 @@ class HebrewProber(CharSetProber): return self.VISUAL_HEBREW_NAME # It's not dominant enough, try to rely on the model scores instead. - modelsub = (self._logical_prober.get_confidence() - - self._visual_prober.get_confidence()) + modelsub = ( + self._logical_prober.get_confidence() - self._visual_prober.get_confidence() + ) if modelsub > self.MIN_MODEL_DISTANCE: return self.LOGICAL_HEBREW_NAME if modelsub < -self.MIN_MODEL_DISTANCE: @@ -280,13 +300,17 @@ class HebrewProber(CharSetProber): return self.LOGICAL_HEBREW_NAME @property - def language(self): - return 'Hebrew' + def language(self) -> str: + return "Hebrew" @property - def state(self): + def state(self) -> ProbingState: + assert self._logical_prober is not None + assert self._visual_prober is not None + # Remain active as long as any of the model probers are active. - if (self._logical_prober.state == ProbingState.NOT_ME) and \ - (self._visual_prober.state == ProbingState.NOT_ME): + if (self._logical_prober.state == ProbingState.NOT_ME) and ( + self._visual_prober.state == ProbingState.NOT_ME + ): return ProbingState.NOT_ME return ProbingState.DETECTING diff --git a/lib/chardet/jisfreq.py b/lib/chardet/jisfreq.py index 510b9400..e0470f96 100644 --- a/lib/chardet/jisfreq.py +++ b/lib/chardet/jisfreq.py @@ -46,6 +46,7 @@ JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0 # Char to FreqOrder table , JIS_TABLE_SIZE = 4368 +# fmt: off JIS_CHAR_TO_FREQ_ORDER = ( 40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16 3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32 @@ -321,5 +322,4 @@ JIS_CHAR_TO_FREQ_ORDER = ( 1444,1698,2385,2251,3729,1365,2281,2235,1717,6188, 864,3841,2515, 444, 527,2767, # 4352 2922,3625, 544, 461,6189, 566, 209,2437,3398,2098,1065,2068,3331,3626,3257,2137, # 4368 #last 512 ) - - +# fmt: on diff --git a/lib/chardet/johabfreq.py b/lib/chardet/johabfreq.py new file mode 100644 index 00000000..3fee2c32 --- /dev/null +++ b/lib/chardet/johabfreq.py @@ -0,0 +1,2382 @@ +######################## BEGIN LICENSE BLOCK ######################## +# The Original Code is Mozilla Communicator client code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 1998 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Mark Pilgrim - port to Python +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA +######################### END LICENSE BLOCK ######################### + +# The frequency data itself is the same as euc-kr. +# This is just a mapping table to euc-kr. + +JOHAB_TO_EUCKR_ORDER_TABLE = { + 0x8861: 0, + 0x8862: 1, + 0x8865: 2, + 0x8868: 3, + 0x8869: 4, + 0x886A: 5, + 0x886B: 6, + 0x8871: 7, + 0x8873: 8, + 0x8874: 9, + 0x8875: 10, + 0x8876: 11, + 0x8877: 12, + 0x8878: 13, + 0x8879: 14, + 0x887B: 15, + 0x887C: 16, + 0x887D: 17, + 0x8881: 18, + 0x8882: 19, + 0x8885: 20, + 0x8889: 21, + 0x8891: 22, + 0x8893: 23, + 0x8895: 24, + 0x8896: 25, + 0x8897: 26, + 0x88A1: 27, + 0x88A2: 28, + 0x88A5: 29, + 0x88A9: 30, + 0x88B5: 31, + 0x88B7: 32, + 0x88C1: 33, + 0x88C5: 34, + 0x88C9: 35, + 0x88E1: 36, + 0x88E2: 37, + 0x88E5: 38, + 0x88E8: 39, + 0x88E9: 40, + 0x88EB: 41, + 0x88F1: 42, + 0x88F3: 43, + 0x88F5: 44, + 0x88F6: 45, + 0x88F7: 46, + 0x88F8: 47, + 0x88FB: 48, + 0x88FC: 49, + 0x88FD: 50, + 0x8941: 51, + 0x8945: 52, + 0x8949: 53, + 0x8951: 54, + 0x8953: 55, + 0x8955: 56, + 0x8956: 57, + 0x8957: 58, + 0x8961: 59, + 0x8962: 60, + 0x8963: 61, + 0x8965: 62, + 0x8968: 63, + 0x8969: 64, + 0x8971: 65, + 0x8973: 66, + 0x8975: 67, + 0x8976: 68, + 0x8977: 69, + 0x897B: 70, + 0x8981: 71, + 0x8985: 72, + 0x8989: 73, + 0x8993: 74, + 0x8995: 75, + 0x89A1: 76, + 0x89A2: 77, + 0x89A5: 78, + 0x89A8: 79, + 0x89A9: 80, + 0x89AB: 81, + 0x89AD: 82, + 0x89B0: 83, + 0x89B1: 84, + 0x89B3: 85, + 0x89B5: 86, + 0x89B7: 87, + 0x89B8: 88, + 0x89C1: 89, + 0x89C2: 90, + 0x89C5: 91, + 0x89C9: 92, + 0x89CB: 93, + 0x89D1: 94, + 0x89D3: 95, + 0x89D5: 96, + 0x89D7: 97, + 0x89E1: 98, + 0x89E5: 99, + 0x89E9: 100, + 0x89F3: 101, + 0x89F6: 102, + 0x89F7: 103, + 0x8A41: 104, + 0x8A42: 105, + 0x8A45: 106, + 0x8A49: 107, + 0x8A51: 108, + 0x8A53: 109, + 0x8A55: 110, + 0x8A57: 111, + 0x8A61: 112, + 0x8A65: 113, + 0x8A69: 114, + 0x8A73: 115, + 0x8A75: 116, + 0x8A81: 117, + 0x8A82: 118, + 0x8A85: 119, + 0x8A88: 120, + 0x8A89: 121, + 0x8A8A: 122, + 0x8A8B: 123, + 0x8A90: 124, + 0x8A91: 125, + 0x8A93: 126, + 0x8A95: 127, + 0x8A97: 128, + 0x8A98: 129, + 0x8AA1: 130, + 0x8AA2: 131, + 0x8AA5: 132, + 0x8AA9: 133, + 0x8AB6: 134, + 0x8AB7: 135, + 0x8AC1: 136, + 0x8AD5: 137, + 0x8AE1: 138, + 0x8AE2: 139, + 0x8AE5: 140, + 0x8AE9: 141, + 0x8AF1: 142, + 0x8AF3: 143, + 0x8AF5: 144, + 0x8B41: 145, + 0x8B45: 146, + 0x8B49: 147, + 0x8B61: 148, + 0x8B62: 149, + 0x8B65: 150, + 0x8B68: 151, + 0x8B69: 152, + 0x8B6A: 153, + 0x8B71: 154, + 0x8B73: 155, + 0x8B75: 156, + 0x8B77: 157, + 0x8B81: 158, + 0x8BA1: 159, + 0x8BA2: 160, + 0x8BA5: 161, + 0x8BA8: 162, + 0x8BA9: 163, + 0x8BAB: 164, + 0x8BB1: 165, + 0x8BB3: 166, + 0x8BB5: 167, + 0x8BB7: 168, + 0x8BB8: 169, + 0x8BBC: 170, + 0x8C61: 171, + 0x8C62: 172, + 0x8C63: 173, + 0x8C65: 174, + 0x8C69: 175, + 0x8C6B: 176, + 0x8C71: 177, + 0x8C73: 178, + 0x8C75: 179, + 0x8C76: 180, + 0x8C77: 181, + 0x8C7B: 182, + 0x8C81: 183, + 0x8C82: 184, + 0x8C85: 185, + 0x8C89: 186, + 0x8C91: 187, + 0x8C93: 188, + 0x8C95: 189, + 0x8C96: 190, + 0x8C97: 191, + 0x8CA1: 192, + 0x8CA2: 193, + 0x8CA9: 194, + 0x8CE1: 195, + 0x8CE2: 196, + 0x8CE3: 197, + 0x8CE5: 198, + 0x8CE9: 199, + 0x8CF1: 200, + 0x8CF3: 201, + 0x8CF5: 202, + 0x8CF6: 203, + 0x8CF7: 204, + 0x8D41: 205, + 0x8D42: 206, + 0x8D45: 207, + 0x8D51: 208, + 0x8D55: 209, + 0x8D57: 210, + 0x8D61: 211, + 0x8D65: 212, + 0x8D69: 213, + 0x8D75: 214, + 0x8D76: 215, + 0x8D7B: 216, + 0x8D81: 217, + 0x8DA1: 218, + 0x8DA2: 219, + 0x8DA5: 220, + 0x8DA7: 221, + 0x8DA9: 222, + 0x8DB1: 223, + 0x8DB3: 224, + 0x8DB5: 225, + 0x8DB7: 226, + 0x8DB8: 227, + 0x8DB9: 228, + 0x8DC1: 229, + 0x8DC2: 230, + 0x8DC9: 231, + 0x8DD6: 232, + 0x8DD7: 233, + 0x8DE1: 234, + 0x8DE2: 235, + 0x8DF7: 236, + 0x8E41: 237, + 0x8E45: 238, + 0x8E49: 239, + 0x8E51: 240, + 0x8E53: 241, + 0x8E57: 242, + 0x8E61: 243, + 0x8E81: 244, + 0x8E82: 245, + 0x8E85: 246, + 0x8E89: 247, + 0x8E90: 248, + 0x8E91: 249, + 0x8E93: 250, + 0x8E95: 251, + 0x8E97: 252, + 0x8E98: 253, + 0x8EA1: 254, + 0x8EA9: 255, + 0x8EB6: 256, + 0x8EB7: 257, + 0x8EC1: 258, + 0x8EC2: 259, + 0x8EC5: 260, + 0x8EC9: 261, + 0x8ED1: 262, + 0x8ED3: 263, + 0x8ED6: 264, + 0x8EE1: 265, + 0x8EE5: 266, + 0x8EE9: 267, + 0x8EF1: 268, + 0x8EF3: 269, + 0x8F41: 270, + 0x8F61: 271, + 0x8F62: 272, + 0x8F65: 273, + 0x8F67: 274, + 0x8F69: 275, + 0x8F6B: 276, + 0x8F70: 277, + 0x8F71: 278, + 0x8F73: 279, + 0x8F75: 280, + 0x8F77: 281, + 0x8F7B: 282, + 0x8FA1: 283, + 0x8FA2: 284, + 0x8FA5: 285, + 0x8FA9: 286, + 0x8FB1: 287, + 0x8FB3: 288, + 0x8FB5: 289, + 0x8FB7: 290, + 0x9061: 291, + 0x9062: 292, + 0x9063: 293, + 0x9065: 294, + 0x9068: 295, + 0x9069: 296, + 0x906A: 297, + 0x906B: 298, + 0x9071: 299, + 0x9073: 300, + 0x9075: 301, + 0x9076: 302, + 0x9077: 303, + 0x9078: 304, + 0x9079: 305, + 0x907B: 306, + 0x907D: 307, + 0x9081: 308, + 0x9082: 309, + 0x9085: 310, + 0x9089: 311, + 0x9091: 312, + 0x9093: 313, + 0x9095: 314, + 0x9096: 315, + 0x9097: 316, + 0x90A1: 317, + 0x90A2: 318, + 0x90A5: 319, + 0x90A9: 320, + 0x90B1: 321, + 0x90B7: 322, + 0x90E1: 323, + 0x90E2: 324, + 0x90E4: 325, + 0x90E5: 326, + 0x90E9: 327, + 0x90EB: 328, + 0x90EC: 329, + 0x90F1: 330, + 0x90F3: 331, + 0x90F5: 332, + 0x90F6: 333, + 0x90F7: 334, + 0x90FD: 335, + 0x9141: 336, + 0x9142: 337, + 0x9145: 338, + 0x9149: 339, + 0x9151: 340, + 0x9153: 341, + 0x9155: 342, + 0x9156: 343, + 0x9157: 344, + 0x9161: 345, + 0x9162: 346, + 0x9165: 347, + 0x9169: 348, + 0x9171: 349, + 0x9173: 350, + 0x9176: 351, + 0x9177: 352, + 0x917A: 353, + 0x9181: 354, + 0x9185: 355, + 0x91A1: 356, + 0x91A2: 357, + 0x91A5: 358, + 0x91A9: 359, + 0x91AB: 360, + 0x91B1: 361, + 0x91B3: 362, + 0x91B5: 363, + 0x91B7: 364, + 0x91BC: 365, + 0x91BD: 366, + 0x91C1: 367, + 0x91C5: 368, + 0x91C9: 369, + 0x91D6: 370, + 0x9241: 371, + 0x9245: 372, + 0x9249: 373, + 0x9251: 374, + 0x9253: 375, + 0x9255: 376, + 0x9261: 377, + 0x9262: 378, + 0x9265: 379, + 0x9269: 380, + 0x9273: 381, + 0x9275: 382, + 0x9277: 383, + 0x9281: 384, + 0x9282: 385, + 0x9285: 386, + 0x9288: 387, + 0x9289: 388, + 0x9291: 389, + 0x9293: 390, + 0x9295: 391, + 0x9297: 392, + 0x92A1: 393, + 0x92B6: 394, + 0x92C1: 395, + 0x92E1: 396, + 0x92E5: 397, + 0x92E9: 398, + 0x92F1: 399, + 0x92F3: 400, + 0x9341: 401, + 0x9342: 402, + 0x9349: 403, + 0x9351: 404, + 0x9353: 405, + 0x9357: 406, + 0x9361: 407, + 0x9362: 408, + 0x9365: 409, + 0x9369: 410, + 0x936A: 411, + 0x936B: 412, + 0x9371: 413, + 0x9373: 414, + 0x9375: 415, + 0x9377: 416, + 0x9378: 417, + 0x937C: 418, + 0x9381: 419, + 0x9385: 420, + 0x9389: 421, + 0x93A1: 422, + 0x93A2: 423, + 0x93A5: 424, + 0x93A9: 425, + 0x93AB: 426, + 0x93B1: 427, + 0x93B3: 428, + 0x93B5: 429, + 0x93B7: 430, + 0x93BC: 431, + 0x9461: 432, + 0x9462: 433, + 0x9463: 434, + 0x9465: 435, + 0x9468: 436, + 0x9469: 437, + 0x946A: 438, + 0x946B: 439, + 0x946C: 440, + 0x9470: 441, + 0x9471: 442, + 0x9473: 443, + 0x9475: 444, + 0x9476: 445, + 0x9477: 446, + 0x9478: 447, + 0x9479: 448, + 0x947D: 449, + 0x9481: 450, + 0x9482: 451, + 0x9485: 452, + 0x9489: 453, + 0x9491: 454, + 0x9493: 455, + 0x9495: 456, + 0x9496: 457, + 0x9497: 458, + 0x94A1: 459, + 0x94E1: 460, + 0x94E2: 461, + 0x94E3: 462, + 0x94E5: 463, + 0x94E8: 464, + 0x94E9: 465, + 0x94EB: 466, + 0x94EC: 467, + 0x94F1: 468, + 0x94F3: 469, + 0x94F5: 470, + 0x94F7: 471, + 0x94F9: 472, + 0x94FC: 473, + 0x9541: 474, + 0x9542: 475, + 0x9545: 476, + 0x9549: 477, + 0x9551: 478, + 0x9553: 479, + 0x9555: 480, + 0x9556: 481, + 0x9557: 482, + 0x9561: 483, + 0x9565: 484, + 0x9569: 485, + 0x9576: 486, + 0x9577: 487, + 0x9581: 488, + 0x9585: 489, + 0x95A1: 490, + 0x95A2: 491, + 0x95A5: 492, + 0x95A8: 493, + 0x95A9: 494, + 0x95AB: 495, + 0x95AD: 496, + 0x95B1: 497, + 0x95B3: 498, + 0x95B5: 499, + 0x95B7: 500, + 0x95B9: 501, + 0x95BB: 502, + 0x95C1: 503, + 0x95C5: 504, + 0x95C9: 505, + 0x95E1: 506, + 0x95F6: 507, + 0x9641: 508, + 0x9645: 509, + 0x9649: 510, + 0x9651: 511, + 0x9653: 512, + 0x9655: 513, + 0x9661: 514, + 0x9681: 515, + 0x9682: 516, + 0x9685: 517, + 0x9689: 518, + 0x9691: 519, + 0x9693: 520, + 0x9695: 521, + 0x9697: 522, + 0x96A1: 523, + 0x96B6: 524, + 0x96C1: 525, + 0x96D7: 526, + 0x96E1: 527, + 0x96E5: 528, + 0x96E9: 529, + 0x96F3: 530, + 0x96F5: 531, + 0x96F7: 532, + 0x9741: 533, + 0x9745: 534, + 0x9749: 535, + 0x9751: 536, + 0x9757: 537, + 0x9761: 538, + 0x9762: 539, + 0x9765: 540, + 0x9768: 541, + 0x9769: 542, + 0x976B: 543, + 0x9771: 544, + 0x9773: 545, + 0x9775: 546, + 0x9777: 547, + 0x9781: 548, + 0x97A1: 549, + 0x97A2: 550, + 0x97A5: 551, + 0x97A8: 552, + 0x97A9: 553, + 0x97B1: 554, + 0x97B3: 555, + 0x97B5: 556, + 0x97B6: 557, + 0x97B7: 558, + 0x97B8: 559, + 0x9861: 560, + 0x9862: 561, + 0x9865: 562, + 0x9869: 563, + 0x9871: 564, + 0x9873: 565, + 0x9875: 566, + 0x9876: 567, + 0x9877: 568, + 0x987D: 569, + 0x9881: 570, + 0x9882: 571, + 0x9885: 572, + 0x9889: 573, + 0x9891: 574, + 0x9893: 575, + 0x9895: 576, + 0x9896: 577, + 0x9897: 578, + 0x98E1: 579, + 0x98E2: 580, + 0x98E5: 581, + 0x98E9: 582, + 0x98EB: 583, + 0x98EC: 584, + 0x98F1: 585, + 0x98F3: 586, + 0x98F5: 587, + 0x98F6: 588, + 0x98F7: 589, + 0x98FD: 590, + 0x9941: 591, + 0x9942: 592, + 0x9945: 593, + 0x9949: 594, + 0x9951: 595, + 0x9953: 596, + 0x9955: 597, + 0x9956: 598, + 0x9957: 599, + 0x9961: 600, + 0x9976: 601, + 0x99A1: 602, + 0x99A2: 603, + 0x99A5: 604, + 0x99A9: 605, + 0x99B7: 606, + 0x99C1: 607, + 0x99C9: 608, + 0x99E1: 609, + 0x9A41: 610, + 0x9A45: 611, + 0x9A81: 612, + 0x9A82: 613, + 0x9A85: 614, + 0x9A89: 615, + 0x9A90: 616, + 0x9A91: 617, + 0x9A97: 618, + 0x9AC1: 619, + 0x9AE1: 620, + 0x9AE5: 621, + 0x9AE9: 622, + 0x9AF1: 623, + 0x9AF3: 624, + 0x9AF7: 625, + 0x9B61: 626, + 0x9B62: 627, + 0x9B65: 628, + 0x9B68: 629, + 0x9B69: 630, + 0x9B71: 631, + 0x9B73: 632, + 0x9B75: 633, + 0x9B81: 634, + 0x9B85: 635, + 0x9B89: 636, + 0x9B91: 637, + 0x9B93: 638, + 0x9BA1: 639, + 0x9BA5: 640, + 0x9BA9: 641, + 0x9BB1: 642, + 0x9BB3: 643, + 0x9BB5: 644, + 0x9BB7: 645, + 0x9C61: 646, + 0x9C62: 647, + 0x9C65: 648, + 0x9C69: 649, + 0x9C71: 650, + 0x9C73: 651, + 0x9C75: 652, + 0x9C76: 653, + 0x9C77: 654, + 0x9C78: 655, + 0x9C7C: 656, + 0x9C7D: 657, + 0x9C81: 658, + 0x9C82: 659, + 0x9C85: 660, + 0x9C89: 661, + 0x9C91: 662, + 0x9C93: 663, + 0x9C95: 664, + 0x9C96: 665, + 0x9C97: 666, + 0x9CA1: 667, + 0x9CA2: 668, + 0x9CA5: 669, + 0x9CB5: 670, + 0x9CB7: 671, + 0x9CE1: 672, + 0x9CE2: 673, + 0x9CE5: 674, + 0x9CE9: 675, + 0x9CF1: 676, + 0x9CF3: 677, + 0x9CF5: 678, + 0x9CF6: 679, + 0x9CF7: 680, + 0x9CFD: 681, + 0x9D41: 682, + 0x9D42: 683, + 0x9D45: 684, + 0x9D49: 685, + 0x9D51: 686, + 0x9D53: 687, + 0x9D55: 688, + 0x9D57: 689, + 0x9D61: 690, + 0x9D62: 691, + 0x9D65: 692, + 0x9D69: 693, + 0x9D71: 694, + 0x9D73: 695, + 0x9D75: 696, + 0x9D76: 697, + 0x9D77: 698, + 0x9D81: 699, + 0x9D85: 700, + 0x9D93: 701, + 0x9D95: 702, + 0x9DA1: 703, + 0x9DA2: 704, + 0x9DA5: 705, + 0x9DA9: 706, + 0x9DB1: 707, + 0x9DB3: 708, + 0x9DB5: 709, + 0x9DB7: 710, + 0x9DC1: 711, + 0x9DC5: 712, + 0x9DD7: 713, + 0x9DF6: 714, + 0x9E41: 715, + 0x9E45: 716, + 0x9E49: 717, + 0x9E51: 718, + 0x9E53: 719, + 0x9E55: 720, + 0x9E57: 721, + 0x9E61: 722, + 0x9E65: 723, + 0x9E69: 724, + 0x9E73: 725, + 0x9E75: 726, + 0x9E77: 727, + 0x9E81: 728, + 0x9E82: 729, + 0x9E85: 730, + 0x9E89: 731, + 0x9E91: 732, + 0x9E93: 733, + 0x9E95: 734, + 0x9E97: 735, + 0x9EA1: 736, + 0x9EB6: 737, + 0x9EC1: 738, + 0x9EE1: 739, + 0x9EE2: 740, + 0x9EE5: 741, + 0x9EE9: 742, + 0x9EF1: 743, + 0x9EF5: 744, + 0x9EF7: 745, + 0x9F41: 746, + 0x9F42: 747, + 0x9F45: 748, + 0x9F49: 749, + 0x9F51: 750, + 0x9F53: 751, + 0x9F55: 752, + 0x9F57: 753, + 0x9F61: 754, + 0x9F62: 755, + 0x9F65: 756, + 0x9F69: 757, + 0x9F71: 758, + 0x9F73: 759, + 0x9F75: 760, + 0x9F77: 761, + 0x9F78: 762, + 0x9F7B: 763, + 0x9F7C: 764, + 0x9FA1: 765, + 0x9FA2: 766, + 0x9FA5: 767, + 0x9FA9: 768, + 0x9FB1: 769, + 0x9FB3: 770, + 0x9FB5: 771, + 0x9FB7: 772, + 0xA061: 773, + 0xA062: 774, + 0xA065: 775, + 0xA067: 776, + 0xA068: 777, + 0xA069: 778, + 0xA06A: 779, + 0xA06B: 780, + 0xA071: 781, + 0xA073: 782, + 0xA075: 783, + 0xA077: 784, + 0xA078: 785, + 0xA07B: 786, + 0xA07D: 787, + 0xA081: 788, + 0xA082: 789, + 0xA085: 790, + 0xA089: 791, + 0xA091: 792, + 0xA093: 793, + 0xA095: 794, + 0xA096: 795, + 0xA097: 796, + 0xA098: 797, + 0xA0A1: 798, + 0xA0A2: 799, + 0xA0A9: 800, + 0xA0B7: 801, + 0xA0E1: 802, + 0xA0E2: 803, + 0xA0E5: 804, + 0xA0E9: 805, + 0xA0EB: 806, + 0xA0F1: 807, + 0xA0F3: 808, + 0xA0F5: 809, + 0xA0F7: 810, + 0xA0F8: 811, + 0xA0FD: 812, + 0xA141: 813, + 0xA142: 814, + 0xA145: 815, + 0xA149: 816, + 0xA151: 817, + 0xA153: 818, + 0xA155: 819, + 0xA156: 820, + 0xA157: 821, + 0xA161: 822, + 0xA162: 823, + 0xA165: 824, + 0xA169: 825, + 0xA175: 826, + 0xA176: 827, + 0xA177: 828, + 0xA179: 829, + 0xA181: 830, + 0xA1A1: 831, + 0xA1A2: 832, + 0xA1A4: 833, + 0xA1A5: 834, + 0xA1A9: 835, + 0xA1AB: 836, + 0xA1B1: 837, + 0xA1B3: 838, + 0xA1B5: 839, + 0xA1B7: 840, + 0xA1C1: 841, + 0xA1C5: 842, + 0xA1D6: 843, + 0xA1D7: 844, + 0xA241: 845, + 0xA245: 846, + 0xA249: 847, + 0xA253: 848, + 0xA255: 849, + 0xA257: 850, + 0xA261: 851, + 0xA265: 852, + 0xA269: 853, + 0xA273: 854, + 0xA275: 855, + 0xA281: 856, + 0xA282: 857, + 0xA283: 858, + 0xA285: 859, + 0xA288: 860, + 0xA289: 861, + 0xA28A: 862, + 0xA28B: 863, + 0xA291: 864, + 0xA293: 865, + 0xA295: 866, + 0xA297: 867, + 0xA29B: 868, + 0xA29D: 869, + 0xA2A1: 870, + 0xA2A5: 871, + 0xA2A9: 872, + 0xA2B3: 873, + 0xA2B5: 874, + 0xA2C1: 875, + 0xA2E1: 876, + 0xA2E5: 877, + 0xA2E9: 878, + 0xA341: 879, + 0xA345: 880, + 0xA349: 881, + 0xA351: 882, + 0xA355: 883, + 0xA361: 884, + 0xA365: 885, + 0xA369: 886, + 0xA371: 887, + 0xA375: 888, + 0xA3A1: 889, + 0xA3A2: 890, + 0xA3A5: 891, + 0xA3A8: 892, + 0xA3A9: 893, + 0xA3AB: 894, + 0xA3B1: 895, + 0xA3B3: 896, + 0xA3B5: 897, + 0xA3B6: 898, + 0xA3B7: 899, + 0xA3B9: 900, + 0xA3BB: 901, + 0xA461: 902, + 0xA462: 903, + 0xA463: 904, + 0xA464: 905, + 0xA465: 906, + 0xA468: 907, + 0xA469: 908, + 0xA46A: 909, + 0xA46B: 910, + 0xA46C: 911, + 0xA471: 912, + 0xA473: 913, + 0xA475: 914, + 0xA477: 915, + 0xA47B: 916, + 0xA481: 917, + 0xA482: 918, + 0xA485: 919, + 0xA489: 920, + 0xA491: 921, + 0xA493: 922, + 0xA495: 923, + 0xA496: 924, + 0xA497: 925, + 0xA49B: 926, + 0xA4A1: 927, + 0xA4A2: 928, + 0xA4A5: 929, + 0xA4B3: 930, + 0xA4E1: 931, + 0xA4E2: 932, + 0xA4E5: 933, + 0xA4E8: 934, + 0xA4E9: 935, + 0xA4EB: 936, + 0xA4F1: 937, + 0xA4F3: 938, + 0xA4F5: 939, + 0xA4F7: 940, + 0xA4F8: 941, + 0xA541: 942, + 0xA542: 943, + 0xA545: 944, + 0xA548: 945, + 0xA549: 946, + 0xA551: 947, + 0xA553: 948, + 0xA555: 949, + 0xA556: 950, + 0xA557: 951, + 0xA561: 952, + 0xA562: 953, + 0xA565: 954, + 0xA569: 955, + 0xA573: 956, + 0xA575: 957, + 0xA576: 958, + 0xA577: 959, + 0xA57B: 960, + 0xA581: 961, + 0xA585: 962, + 0xA5A1: 963, + 0xA5A2: 964, + 0xA5A3: 965, + 0xA5A5: 966, + 0xA5A9: 967, + 0xA5B1: 968, + 0xA5B3: 969, + 0xA5B5: 970, + 0xA5B7: 971, + 0xA5C1: 972, + 0xA5C5: 973, + 0xA5D6: 974, + 0xA5E1: 975, + 0xA5F6: 976, + 0xA641: 977, + 0xA642: 978, + 0xA645: 979, + 0xA649: 980, + 0xA651: 981, + 0xA653: 982, + 0xA661: 983, + 0xA665: 984, + 0xA681: 985, + 0xA682: 986, + 0xA685: 987, + 0xA688: 988, + 0xA689: 989, + 0xA68A: 990, + 0xA68B: 991, + 0xA691: 992, + 0xA693: 993, + 0xA695: 994, + 0xA697: 995, + 0xA69B: 996, + 0xA69C: 997, + 0xA6A1: 998, + 0xA6A9: 999, + 0xA6B6: 1000, + 0xA6C1: 1001, + 0xA6E1: 1002, + 0xA6E2: 1003, + 0xA6E5: 1004, + 0xA6E9: 1005, + 0xA6F7: 1006, + 0xA741: 1007, + 0xA745: 1008, + 0xA749: 1009, + 0xA751: 1010, + 0xA755: 1011, + 0xA757: 1012, + 0xA761: 1013, + 0xA762: 1014, + 0xA765: 1015, + 0xA769: 1016, + 0xA771: 1017, + 0xA773: 1018, + 0xA775: 1019, + 0xA7A1: 1020, + 0xA7A2: 1021, + 0xA7A5: 1022, + 0xA7A9: 1023, + 0xA7AB: 1024, + 0xA7B1: 1025, + 0xA7B3: 1026, + 0xA7B5: 1027, + 0xA7B7: 1028, + 0xA7B8: 1029, + 0xA7B9: 1030, + 0xA861: 1031, + 0xA862: 1032, + 0xA865: 1033, + 0xA869: 1034, + 0xA86B: 1035, + 0xA871: 1036, + 0xA873: 1037, + 0xA875: 1038, + 0xA876: 1039, + 0xA877: 1040, + 0xA87D: 1041, + 0xA881: 1042, + 0xA882: 1043, + 0xA885: 1044, + 0xA889: 1045, + 0xA891: 1046, + 0xA893: 1047, + 0xA895: 1048, + 0xA896: 1049, + 0xA897: 1050, + 0xA8A1: 1051, + 0xA8A2: 1052, + 0xA8B1: 1053, + 0xA8E1: 1054, + 0xA8E2: 1055, + 0xA8E5: 1056, + 0xA8E8: 1057, + 0xA8E9: 1058, + 0xA8F1: 1059, + 0xA8F5: 1060, + 0xA8F6: 1061, + 0xA8F7: 1062, + 0xA941: 1063, + 0xA957: 1064, + 0xA961: 1065, + 0xA962: 1066, + 0xA971: 1067, + 0xA973: 1068, + 0xA975: 1069, + 0xA976: 1070, + 0xA977: 1071, + 0xA9A1: 1072, + 0xA9A2: 1073, + 0xA9A5: 1074, + 0xA9A9: 1075, + 0xA9B1: 1076, + 0xA9B3: 1077, + 0xA9B7: 1078, + 0xAA41: 1079, + 0xAA61: 1080, + 0xAA77: 1081, + 0xAA81: 1082, + 0xAA82: 1083, + 0xAA85: 1084, + 0xAA89: 1085, + 0xAA91: 1086, + 0xAA95: 1087, + 0xAA97: 1088, + 0xAB41: 1089, + 0xAB57: 1090, + 0xAB61: 1091, + 0xAB65: 1092, + 0xAB69: 1093, + 0xAB71: 1094, + 0xAB73: 1095, + 0xABA1: 1096, + 0xABA2: 1097, + 0xABA5: 1098, + 0xABA9: 1099, + 0xABB1: 1100, + 0xABB3: 1101, + 0xABB5: 1102, + 0xABB7: 1103, + 0xAC61: 1104, + 0xAC62: 1105, + 0xAC64: 1106, + 0xAC65: 1107, + 0xAC68: 1108, + 0xAC69: 1109, + 0xAC6A: 1110, + 0xAC6B: 1111, + 0xAC71: 1112, + 0xAC73: 1113, + 0xAC75: 1114, + 0xAC76: 1115, + 0xAC77: 1116, + 0xAC7B: 1117, + 0xAC81: 1118, + 0xAC82: 1119, + 0xAC85: 1120, + 0xAC89: 1121, + 0xAC91: 1122, + 0xAC93: 1123, + 0xAC95: 1124, + 0xAC96: 1125, + 0xAC97: 1126, + 0xACA1: 1127, + 0xACA2: 1128, + 0xACA5: 1129, + 0xACA9: 1130, + 0xACB1: 1131, + 0xACB3: 1132, + 0xACB5: 1133, + 0xACB7: 1134, + 0xACC1: 1135, + 0xACC5: 1136, + 0xACC9: 1137, + 0xACD1: 1138, + 0xACD7: 1139, + 0xACE1: 1140, + 0xACE2: 1141, + 0xACE3: 1142, + 0xACE4: 1143, + 0xACE5: 1144, + 0xACE8: 1145, + 0xACE9: 1146, + 0xACEB: 1147, + 0xACEC: 1148, + 0xACF1: 1149, + 0xACF3: 1150, + 0xACF5: 1151, + 0xACF6: 1152, + 0xACF7: 1153, + 0xACFC: 1154, + 0xAD41: 1155, + 0xAD42: 1156, + 0xAD45: 1157, + 0xAD49: 1158, + 0xAD51: 1159, + 0xAD53: 1160, + 0xAD55: 1161, + 0xAD56: 1162, + 0xAD57: 1163, + 0xAD61: 1164, + 0xAD62: 1165, + 0xAD65: 1166, + 0xAD69: 1167, + 0xAD71: 1168, + 0xAD73: 1169, + 0xAD75: 1170, + 0xAD76: 1171, + 0xAD77: 1172, + 0xAD81: 1173, + 0xAD85: 1174, + 0xAD89: 1175, + 0xAD97: 1176, + 0xADA1: 1177, + 0xADA2: 1178, + 0xADA3: 1179, + 0xADA5: 1180, + 0xADA9: 1181, + 0xADAB: 1182, + 0xADB1: 1183, + 0xADB3: 1184, + 0xADB5: 1185, + 0xADB7: 1186, + 0xADBB: 1187, + 0xADC1: 1188, + 0xADC2: 1189, + 0xADC5: 1190, + 0xADC9: 1191, + 0xADD7: 1192, + 0xADE1: 1193, + 0xADE5: 1194, + 0xADE9: 1195, + 0xADF1: 1196, + 0xADF5: 1197, + 0xADF6: 1198, + 0xAE41: 1199, + 0xAE45: 1200, + 0xAE49: 1201, + 0xAE51: 1202, + 0xAE53: 1203, + 0xAE55: 1204, + 0xAE61: 1205, + 0xAE62: 1206, + 0xAE65: 1207, + 0xAE69: 1208, + 0xAE71: 1209, + 0xAE73: 1210, + 0xAE75: 1211, + 0xAE77: 1212, + 0xAE81: 1213, + 0xAE82: 1214, + 0xAE85: 1215, + 0xAE88: 1216, + 0xAE89: 1217, + 0xAE91: 1218, + 0xAE93: 1219, + 0xAE95: 1220, + 0xAE97: 1221, + 0xAE99: 1222, + 0xAE9B: 1223, + 0xAE9C: 1224, + 0xAEA1: 1225, + 0xAEB6: 1226, + 0xAEC1: 1227, + 0xAEC2: 1228, + 0xAEC5: 1229, + 0xAEC9: 1230, + 0xAED1: 1231, + 0xAED7: 1232, + 0xAEE1: 1233, + 0xAEE2: 1234, + 0xAEE5: 1235, + 0xAEE9: 1236, + 0xAEF1: 1237, + 0xAEF3: 1238, + 0xAEF5: 1239, + 0xAEF7: 1240, + 0xAF41: 1241, + 0xAF42: 1242, + 0xAF49: 1243, + 0xAF51: 1244, + 0xAF55: 1245, + 0xAF57: 1246, + 0xAF61: 1247, + 0xAF62: 1248, + 0xAF65: 1249, + 0xAF69: 1250, + 0xAF6A: 1251, + 0xAF71: 1252, + 0xAF73: 1253, + 0xAF75: 1254, + 0xAF77: 1255, + 0xAFA1: 1256, + 0xAFA2: 1257, + 0xAFA5: 1258, + 0xAFA8: 1259, + 0xAFA9: 1260, + 0xAFB0: 1261, + 0xAFB1: 1262, + 0xAFB3: 1263, + 0xAFB5: 1264, + 0xAFB7: 1265, + 0xAFBC: 1266, + 0xB061: 1267, + 0xB062: 1268, + 0xB064: 1269, + 0xB065: 1270, + 0xB069: 1271, + 0xB071: 1272, + 0xB073: 1273, + 0xB076: 1274, + 0xB077: 1275, + 0xB07D: 1276, + 0xB081: 1277, + 0xB082: 1278, + 0xB085: 1279, + 0xB089: 1280, + 0xB091: 1281, + 0xB093: 1282, + 0xB096: 1283, + 0xB097: 1284, + 0xB0B7: 1285, + 0xB0E1: 1286, + 0xB0E2: 1287, + 0xB0E5: 1288, + 0xB0E9: 1289, + 0xB0EB: 1290, + 0xB0F1: 1291, + 0xB0F3: 1292, + 0xB0F6: 1293, + 0xB0F7: 1294, + 0xB141: 1295, + 0xB145: 1296, + 0xB149: 1297, + 0xB185: 1298, + 0xB1A1: 1299, + 0xB1A2: 1300, + 0xB1A5: 1301, + 0xB1A8: 1302, + 0xB1A9: 1303, + 0xB1AB: 1304, + 0xB1B1: 1305, + 0xB1B3: 1306, + 0xB1B7: 1307, + 0xB1C1: 1308, + 0xB1C2: 1309, + 0xB1C5: 1310, + 0xB1D6: 1311, + 0xB1E1: 1312, + 0xB1F6: 1313, + 0xB241: 1314, + 0xB245: 1315, + 0xB249: 1316, + 0xB251: 1317, + 0xB253: 1318, + 0xB261: 1319, + 0xB281: 1320, + 0xB282: 1321, + 0xB285: 1322, + 0xB289: 1323, + 0xB291: 1324, + 0xB293: 1325, + 0xB297: 1326, + 0xB2A1: 1327, + 0xB2B6: 1328, + 0xB2C1: 1329, + 0xB2E1: 1330, + 0xB2E5: 1331, + 0xB357: 1332, + 0xB361: 1333, + 0xB362: 1334, + 0xB365: 1335, + 0xB369: 1336, + 0xB36B: 1337, + 0xB370: 1338, + 0xB371: 1339, + 0xB373: 1340, + 0xB381: 1341, + 0xB385: 1342, + 0xB389: 1343, + 0xB391: 1344, + 0xB3A1: 1345, + 0xB3A2: 1346, + 0xB3A5: 1347, + 0xB3A9: 1348, + 0xB3B1: 1349, + 0xB3B3: 1350, + 0xB3B5: 1351, + 0xB3B7: 1352, + 0xB461: 1353, + 0xB462: 1354, + 0xB465: 1355, + 0xB466: 1356, + 0xB467: 1357, + 0xB469: 1358, + 0xB46A: 1359, + 0xB46B: 1360, + 0xB470: 1361, + 0xB471: 1362, + 0xB473: 1363, + 0xB475: 1364, + 0xB476: 1365, + 0xB477: 1366, + 0xB47B: 1367, + 0xB47C: 1368, + 0xB481: 1369, + 0xB482: 1370, + 0xB485: 1371, + 0xB489: 1372, + 0xB491: 1373, + 0xB493: 1374, + 0xB495: 1375, + 0xB496: 1376, + 0xB497: 1377, + 0xB4A1: 1378, + 0xB4A2: 1379, + 0xB4A5: 1380, + 0xB4A9: 1381, + 0xB4AC: 1382, + 0xB4B1: 1383, + 0xB4B3: 1384, + 0xB4B5: 1385, + 0xB4B7: 1386, + 0xB4BB: 1387, + 0xB4BD: 1388, + 0xB4C1: 1389, + 0xB4C5: 1390, + 0xB4C9: 1391, + 0xB4D3: 1392, + 0xB4E1: 1393, + 0xB4E2: 1394, + 0xB4E5: 1395, + 0xB4E6: 1396, + 0xB4E8: 1397, + 0xB4E9: 1398, + 0xB4EA: 1399, + 0xB4EB: 1400, + 0xB4F1: 1401, + 0xB4F3: 1402, + 0xB4F4: 1403, + 0xB4F5: 1404, + 0xB4F6: 1405, + 0xB4F7: 1406, + 0xB4F8: 1407, + 0xB4FA: 1408, + 0xB4FC: 1409, + 0xB541: 1410, + 0xB542: 1411, + 0xB545: 1412, + 0xB549: 1413, + 0xB551: 1414, + 0xB553: 1415, + 0xB555: 1416, + 0xB557: 1417, + 0xB561: 1418, + 0xB562: 1419, + 0xB563: 1420, + 0xB565: 1421, + 0xB569: 1422, + 0xB56B: 1423, + 0xB56C: 1424, + 0xB571: 1425, + 0xB573: 1426, + 0xB574: 1427, + 0xB575: 1428, + 0xB576: 1429, + 0xB577: 1430, + 0xB57B: 1431, + 0xB57C: 1432, + 0xB57D: 1433, + 0xB581: 1434, + 0xB585: 1435, + 0xB589: 1436, + 0xB591: 1437, + 0xB593: 1438, + 0xB595: 1439, + 0xB596: 1440, + 0xB5A1: 1441, + 0xB5A2: 1442, + 0xB5A5: 1443, + 0xB5A9: 1444, + 0xB5AA: 1445, + 0xB5AB: 1446, + 0xB5AD: 1447, + 0xB5B0: 1448, + 0xB5B1: 1449, + 0xB5B3: 1450, + 0xB5B5: 1451, + 0xB5B7: 1452, + 0xB5B9: 1453, + 0xB5C1: 1454, + 0xB5C2: 1455, + 0xB5C5: 1456, + 0xB5C9: 1457, + 0xB5D1: 1458, + 0xB5D3: 1459, + 0xB5D5: 1460, + 0xB5D6: 1461, + 0xB5D7: 1462, + 0xB5E1: 1463, + 0xB5E2: 1464, + 0xB5E5: 1465, + 0xB5F1: 1466, + 0xB5F5: 1467, + 0xB5F7: 1468, + 0xB641: 1469, + 0xB642: 1470, + 0xB645: 1471, + 0xB649: 1472, + 0xB651: 1473, + 0xB653: 1474, + 0xB655: 1475, + 0xB657: 1476, + 0xB661: 1477, + 0xB662: 1478, + 0xB665: 1479, + 0xB669: 1480, + 0xB671: 1481, + 0xB673: 1482, + 0xB675: 1483, + 0xB677: 1484, + 0xB681: 1485, + 0xB682: 1486, + 0xB685: 1487, + 0xB689: 1488, + 0xB68A: 1489, + 0xB68B: 1490, + 0xB691: 1491, + 0xB693: 1492, + 0xB695: 1493, + 0xB697: 1494, + 0xB6A1: 1495, + 0xB6A2: 1496, + 0xB6A5: 1497, + 0xB6A9: 1498, + 0xB6B1: 1499, + 0xB6B3: 1500, + 0xB6B6: 1501, + 0xB6B7: 1502, + 0xB6C1: 1503, + 0xB6C2: 1504, + 0xB6C5: 1505, + 0xB6C9: 1506, + 0xB6D1: 1507, + 0xB6D3: 1508, + 0xB6D7: 1509, + 0xB6E1: 1510, + 0xB6E2: 1511, + 0xB6E5: 1512, + 0xB6E9: 1513, + 0xB6F1: 1514, + 0xB6F3: 1515, + 0xB6F5: 1516, + 0xB6F7: 1517, + 0xB741: 1518, + 0xB742: 1519, + 0xB745: 1520, + 0xB749: 1521, + 0xB751: 1522, + 0xB753: 1523, + 0xB755: 1524, + 0xB757: 1525, + 0xB759: 1526, + 0xB761: 1527, + 0xB762: 1528, + 0xB765: 1529, + 0xB769: 1530, + 0xB76F: 1531, + 0xB771: 1532, + 0xB773: 1533, + 0xB775: 1534, + 0xB777: 1535, + 0xB778: 1536, + 0xB779: 1537, + 0xB77A: 1538, + 0xB77B: 1539, + 0xB77C: 1540, + 0xB77D: 1541, + 0xB781: 1542, + 0xB785: 1543, + 0xB789: 1544, + 0xB791: 1545, + 0xB795: 1546, + 0xB7A1: 1547, + 0xB7A2: 1548, + 0xB7A5: 1549, + 0xB7A9: 1550, + 0xB7AA: 1551, + 0xB7AB: 1552, + 0xB7B0: 1553, + 0xB7B1: 1554, + 0xB7B3: 1555, + 0xB7B5: 1556, + 0xB7B6: 1557, + 0xB7B7: 1558, + 0xB7B8: 1559, + 0xB7BC: 1560, + 0xB861: 1561, + 0xB862: 1562, + 0xB865: 1563, + 0xB867: 1564, + 0xB868: 1565, + 0xB869: 1566, + 0xB86B: 1567, + 0xB871: 1568, + 0xB873: 1569, + 0xB875: 1570, + 0xB876: 1571, + 0xB877: 1572, + 0xB878: 1573, + 0xB881: 1574, + 0xB882: 1575, + 0xB885: 1576, + 0xB889: 1577, + 0xB891: 1578, + 0xB893: 1579, + 0xB895: 1580, + 0xB896: 1581, + 0xB897: 1582, + 0xB8A1: 1583, + 0xB8A2: 1584, + 0xB8A5: 1585, + 0xB8A7: 1586, + 0xB8A9: 1587, + 0xB8B1: 1588, + 0xB8B7: 1589, + 0xB8C1: 1590, + 0xB8C5: 1591, + 0xB8C9: 1592, + 0xB8E1: 1593, + 0xB8E2: 1594, + 0xB8E5: 1595, + 0xB8E9: 1596, + 0xB8EB: 1597, + 0xB8F1: 1598, + 0xB8F3: 1599, + 0xB8F5: 1600, + 0xB8F7: 1601, + 0xB8F8: 1602, + 0xB941: 1603, + 0xB942: 1604, + 0xB945: 1605, + 0xB949: 1606, + 0xB951: 1607, + 0xB953: 1608, + 0xB955: 1609, + 0xB957: 1610, + 0xB961: 1611, + 0xB965: 1612, + 0xB969: 1613, + 0xB971: 1614, + 0xB973: 1615, + 0xB976: 1616, + 0xB977: 1617, + 0xB981: 1618, + 0xB9A1: 1619, + 0xB9A2: 1620, + 0xB9A5: 1621, + 0xB9A9: 1622, + 0xB9AB: 1623, + 0xB9B1: 1624, + 0xB9B3: 1625, + 0xB9B5: 1626, + 0xB9B7: 1627, + 0xB9B8: 1628, + 0xB9B9: 1629, + 0xB9BD: 1630, + 0xB9C1: 1631, + 0xB9C2: 1632, + 0xB9C9: 1633, + 0xB9D3: 1634, + 0xB9D5: 1635, + 0xB9D7: 1636, + 0xB9E1: 1637, + 0xB9F6: 1638, + 0xB9F7: 1639, + 0xBA41: 1640, + 0xBA45: 1641, + 0xBA49: 1642, + 0xBA51: 1643, + 0xBA53: 1644, + 0xBA55: 1645, + 0xBA57: 1646, + 0xBA61: 1647, + 0xBA62: 1648, + 0xBA65: 1649, + 0xBA77: 1650, + 0xBA81: 1651, + 0xBA82: 1652, + 0xBA85: 1653, + 0xBA89: 1654, + 0xBA8A: 1655, + 0xBA8B: 1656, + 0xBA91: 1657, + 0xBA93: 1658, + 0xBA95: 1659, + 0xBA97: 1660, + 0xBAA1: 1661, + 0xBAB6: 1662, + 0xBAC1: 1663, + 0xBAE1: 1664, + 0xBAE2: 1665, + 0xBAE5: 1666, + 0xBAE9: 1667, + 0xBAF1: 1668, + 0xBAF3: 1669, + 0xBAF5: 1670, + 0xBB41: 1671, + 0xBB45: 1672, + 0xBB49: 1673, + 0xBB51: 1674, + 0xBB61: 1675, + 0xBB62: 1676, + 0xBB65: 1677, + 0xBB69: 1678, + 0xBB71: 1679, + 0xBB73: 1680, + 0xBB75: 1681, + 0xBB77: 1682, + 0xBBA1: 1683, + 0xBBA2: 1684, + 0xBBA5: 1685, + 0xBBA8: 1686, + 0xBBA9: 1687, + 0xBBAB: 1688, + 0xBBB1: 1689, + 0xBBB3: 1690, + 0xBBB5: 1691, + 0xBBB7: 1692, + 0xBBB8: 1693, + 0xBBBB: 1694, + 0xBBBC: 1695, + 0xBC61: 1696, + 0xBC62: 1697, + 0xBC65: 1698, + 0xBC67: 1699, + 0xBC69: 1700, + 0xBC6C: 1701, + 0xBC71: 1702, + 0xBC73: 1703, + 0xBC75: 1704, + 0xBC76: 1705, + 0xBC77: 1706, + 0xBC81: 1707, + 0xBC82: 1708, + 0xBC85: 1709, + 0xBC89: 1710, + 0xBC91: 1711, + 0xBC93: 1712, + 0xBC95: 1713, + 0xBC96: 1714, + 0xBC97: 1715, + 0xBCA1: 1716, + 0xBCA5: 1717, + 0xBCB7: 1718, + 0xBCE1: 1719, + 0xBCE2: 1720, + 0xBCE5: 1721, + 0xBCE9: 1722, + 0xBCF1: 1723, + 0xBCF3: 1724, + 0xBCF5: 1725, + 0xBCF6: 1726, + 0xBCF7: 1727, + 0xBD41: 1728, + 0xBD57: 1729, + 0xBD61: 1730, + 0xBD76: 1731, + 0xBDA1: 1732, + 0xBDA2: 1733, + 0xBDA5: 1734, + 0xBDA9: 1735, + 0xBDB1: 1736, + 0xBDB3: 1737, + 0xBDB5: 1738, + 0xBDB7: 1739, + 0xBDB9: 1740, + 0xBDC1: 1741, + 0xBDC2: 1742, + 0xBDC9: 1743, + 0xBDD6: 1744, + 0xBDE1: 1745, + 0xBDF6: 1746, + 0xBE41: 1747, + 0xBE45: 1748, + 0xBE49: 1749, + 0xBE51: 1750, + 0xBE53: 1751, + 0xBE77: 1752, + 0xBE81: 1753, + 0xBE82: 1754, + 0xBE85: 1755, + 0xBE89: 1756, + 0xBE91: 1757, + 0xBE93: 1758, + 0xBE97: 1759, + 0xBEA1: 1760, + 0xBEB6: 1761, + 0xBEB7: 1762, + 0xBEE1: 1763, + 0xBF41: 1764, + 0xBF61: 1765, + 0xBF71: 1766, + 0xBF75: 1767, + 0xBF77: 1768, + 0xBFA1: 1769, + 0xBFA2: 1770, + 0xBFA5: 1771, + 0xBFA9: 1772, + 0xBFB1: 1773, + 0xBFB3: 1774, + 0xBFB7: 1775, + 0xBFB8: 1776, + 0xBFBD: 1777, + 0xC061: 1778, + 0xC062: 1779, + 0xC065: 1780, + 0xC067: 1781, + 0xC069: 1782, + 0xC071: 1783, + 0xC073: 1784, + 0xC075: 1785, + 0xC076: 1786, + 0xC077: 1787, + 0xC078: 1788, + 0xC081: 1789, + 0xC082: 1790, + 0xC085: 1791, + 0xC089: 1792, + 0xC091: 1793, + 0xC093: 1794, + 0xC095: 1795, + 0xC096: 1796, + 0xC097: 1797, + 0xC0A1: 1798, + 0xC0A5: 1799, + 0xC0A7: 1800, + 0xC0A9: 1801, + 0xC0B1: 1802, + 0xC0B7: 1803, + 0xC0E1: 1804, + 0xC0E2: 1805, + 0xC0E5: 1806, + 0xC0E9: 1807, + 0xC0F1: 1808, + 0xC0F3: 1809, + 0xC0F5: 1810, + 0xC0F6: 1811, + 0xC0F7: 1812, + 0xC141: 1813, + 0xC142: 1814, + 0xC145: 1815, + 0xC149: 1816, + 0xC151: 1817, + 0xC153: 1818, + 0xC155: 1819, + 0xC157: 1820, + 0xC161: 1821, + 0xC165: 1822, + 0xC176: 1823, + 0xC181: 1824, + 0xC185: 1825, + 0xC197: 1826, + 0xC1A1: 1827, + 0xC1A2: 1828, + 0xC1A5: 1829, + 0xC1A9: 1830, + 0xC1B1: 1831, + 0xC1B3: 1832, + 0xC1B5: 1833, + 0xC1B7: 1834, + 0xC1C1: 1835, + 0xC1C5: 1836, + 0xC1C9: 1837, + 0xC1D7: 1838, + 0xC241: 1839, + 0xC245: 1840, + 0xC249: 1841, + 0xC251: 1842, + 0xC253: 1843, + 0xC255: 1844, + 0xC257: 1845, + 0xC261: 1846, + 0xC271: 1847, + 0xC281: 1848, + 0xC282: 1849, + 0xC285: 1850, + 0xC289: 1851, + 0xC291: 1852, + 0xC293: 1853, + 0xC295: 1854, + 0xC297: 1855, + 0xC2A1: 1856, + 0xC2B6: 1857, + 0xC2C1: 1858, + 0xC2C5: 1859, + 0xC2E1: 1860, + 0xC2E5: 1861, + 0xC2E9: 1862, + 0xC2F1: 1863, + 0xC2F3: 1864, + 0xC2F5: 1865, + 0xC2F7: 1866, + 0xC341: 1867, + 0xC345: 1868, + 0xC349: 1869, + 0xC351: 1870, + 0xC357: 1871, + 0xC361: 1872, + 0xC362: 1873, + 0xC365: 1874, + 0xC369: 1875, + 0xC371: 1876, + 0xC373: 1877, + 0xC375: 1878, + 0xC377: 1879, + 0xC3A1: 1880, + 0xC3A2: 1881, + 0xC3A5: 1882, + 0xC3A8: 1883, + 0xC3A9: 1884, + 0xC3AA: 1885, + 0xC3B1: 1886, + 0xC3B3: 1887, + 0xC3B5: 1888, + 0xC3B7: 1889, + 0xC461: 1890, + 0xC462: 1891, + 0xC465: 1892, + 0xC469: 1893, + 0xC471: 1894, + 0xC473: 1895, + 0xC475: 1896, + 0xC477: 1897, + 0xC481: 1898, + 0xC482: 1899, + 0xC485: 1900, + 0xC489: 1901, + 0xC491: 1902, + 0xC493: 1903, + 0xC495: 1904, + 0xC496: 1905, + 0xC497: 1906, + 0xC4A1: 1907, + 0xC4A2: 1908, + 0xC4B7: 1909, + 0xC4E1: 1910, + 0xC4E2: 1911, + 0xC4E5: 1912, + 0xC4E8: 1913, + 0xC4E9: 1914, + 0xC4F1: 1915, + 0xC4F3: 1916, + 0xC4F5: 1917, + 0xC4F6: 1918, + 0xC4F7: 1919, + 0xC541: 1920, + 0xC542: 1921, + 0xC545: 1922, + 0xC549: 1923, + 0xC551: 1924, + 0xC553: 1925, + 0xC555: 1926, + 0xC557: 1927, + 0xC561: 1928, + 0xC565: 1929, + 0xC569: 1930, + 0xC571: 1931, + 0xC573: 1932, + 0xC575: 1933, + 0xC576: 1934, + 0xC577: 1935, + 0xC581: 1936, + 0xC5A1: 1937, + 0xC5A2: 1938, + 0xC5A5: 1939, + 0xC5A9: 1940, + 0xC5B1: 1941, + 0xC5B3: 1942, + 0xC5B5: 1943, + 0xC5B7: 1944, + 0xC5C1: 1945, + 0xC5C2: 1946, + 0xC5C5: 1947, + 0xC5C9: 1948, + 0xC5D1: 1949, + 0xC5D7: 1950, + 0xC5E1: 1951, + 0xC5F7: 1952, + 0xC641: 1953, + 0xC649: 1954, + 0xC661: 1955, + 0xC681: 1956, + 0xC682: 1957, + 0xC685: 1958, + 0xC689: 1959, + 0xC691: 1960, + 0xC693: 1961, + 0xC695: 1962, + 0xC697: 1963, + 0xC6A1: 1964, + 0xC6A5: 1965, + 0xC6A9: 1966, + 0xC6B7: 1967, + 0xC6C1: 1968, + 0xC6D7: 1969, + 0xC6E1: 1970, + 0xC6E2: 1971, + 0xC6E5: 1972, + 0xC6E9: 1973, + 0xC6F1: 1974, + 0xC6F3: 1975, + 0xC6F5: 1976, + 0xC6F7: 1977, + 0xC741: 1978, + 0xC745: 1979, + 0xC749: 1980, + 0xC751: 1981, + 0xC761: 1982, + 0xC762: 1983, + 0xC765: 1984, + 0xC769: 1985, + 0xC771: 1986, + 0xC773: 1987, + 0xC777: 1988, + 0xC7A1: 1989, + 0xC7A2: 1990, + 0xC7A5: 1991, + 0xC7A9: 1992, + 0xC7B1: 1993, + 0xC7B3: 1994, + 0xC7B5: 1995, + 0xC7B7: 1996, + 0xC861: 1997, + 0xC862: 1998, + 0xC865: 1999, + 0xC869: 2000, + 0xC86A: 2001, + 0xC871: 2002, + 0xC873: 2003, + 0xC875: 2004, + 0xC876: 2005, + 0xC877: 2006, + 0xC881: 2007, + 0xC882: 2008, + 0xC885: 2009, + 0xC889: 2010, + 0xC891: 2011, + 0xC893: 2012, + 0xC895: 2013, + 0xC896: 2014, + 0xC897: 2015, + 0xC8A1: 2016, + 0xC8B7: 2017, + 0xC8E1: 2018, + 0xC8E2: 2019, + 0xC8E5: 2020, + 0xC8E9: 2021, + 0xC8EB: 2022, + 0xC8F1: 2023, + 0xC8F3: 2024, + 0xC8F5: 2025, + 0xC8F6: 2026, + 0xC8F7: 2027, + 0xC941: 2028, + 0xC942: 2029, + 0xC945: 2030, + 0xC949: 2031, + 0xC951: 2032, + 0xC953: 2033, + 0xC955: 2034, + 0xC957: 2035, + 0xC961: 2036, + 0xC965: 2037, + 0xC976: 2038, + 0xC981: 2039, + 0xC985: 2040, + 0xC9A1: 2041, + 0xC9A2: 2042, + 0xC9A5: 2043, + 0xC9A9: 2044, + 0xC9B1: 2045, + 0xC9B3: 2046, + 0xC9B5: 2047, + 0xC9B7: 2048, + 0xC9BC: 2049, + 0xC9C1: 2050, + 0xC9C5: 2051, + 0xC9E1: 2052, + 0xCA41: 2053, + 0xCA45: 2054, + 0xCA55: 2055, + 0xCA57: 2056, + 0xCA61: 2057, + 0xCA81: 2058, + 0xCA82: 2059, + 0xCA85: 2060, + 0xCA89: 2061, + 0xCA91: 2062, + 0xCA93: 2063, + 0xCA95: 2064, + 0xCA97: 2065, + 0xCAA1: 2066, + 0xCAB6: 2067, + 0xCAC1: 2068, + 0xCAE1: 2069, + 0xCAE2: 2070, + 0xCAE5: 2071, + 0xCAE9: 2072, + 0xCAF1: 2073, + 0xCAF3: 2074, + 0xCAF7: 2075, + 0xCB41: 2076, + 0xCB45: 2077, + 0xCB49: 2078, + 0xCB51: 2079, + 0xCB57: 2080, + 0xCB61: 2081, + 0xCB62: 2082, + 0xCB65: 2083, + 0xCB68: 2084, + 0xCB69: 2085, + 0xCB6B: 2086, + 0xCB71: 2087, + 0xCB73: 2088, + 0xCB75: 2089, + 0xCB81: 2090, + 0xCB85: 2091, + 0xCB89: 2092, + 0xCB91: 2093, + 0xCB93: 2094, + 0xCBA1: 2095, + 0xCBA2: 2096, + 0xCBA5: 2097, + 0xCBA9: 2098, + 0xCBB1: 2099, + 0xCBB3: 2100, + 0xCBB5: 2101, + 0xCBB7: 2102, + 0xCC61: 2103, + 0xCC62: 2104, + 0xCC63: 2105, + 0xCC65: 2106, + 0xCC69: 2107, + 0xCC6B: 2108, + 0xCC71: 2109, + 0xCC73: 2110, + 0xCC75: 2111, + 0xCC76: 2112, + 0xCC77: 2113, + 0xCC7B: 2114, + 0xCC81: 2115, + 0xCC82: 2116, + 0xCC85: 2117, + 0xCC89: 2118, + 0xCC91: 2119, + 0xCC93: 2120, + 0xCC95: 2121, + 0xCC96: 2122, + 0xCC97: 2123, + 0xCCA1: 2124, + 0xCCA2: 2125, + 0xCCE1: 2126, + 0xCCE2: 2127, + 0xCCE5: 2128, + 0xCCE9: 2129, + 0xCCF1: 2130, + 0xCCF3: 2131, + 0xCCF5: 2132, + 0xCCF6: 2133, + 0xCCF7: 2134, + 0xCD41: 2135, + 0xCD42: 2136, + 0xCD45: 2137, + 0xCD49: 2138, + 0xCD51: 2139, + 0xCD53: 2140, + 0xCD55: 2141, + 0xCD57: 2142, + 0xCD61: 2143, + 0xCD65: 2144, + 0xCD69: 2145, + 0xCD71: 2146, + 0xCD73: 2147, + 0xCD76: 2148, + 0xCD77: 2149, + 0xCD81: 2150, + 0xCD89: 2151, + 0xCD93: 2152, + 0xCD95: 2153, + 0xCDA1: 2154, + 0xCDA2: 2155, + 0xCDA5: 2156, + 0xCDA9: 2157, + 0xCDB1: 2158, + 0xCDB3: 2159, + 0xCDB5: 2160, + 0xCDB7: 2161, + 0xCDC1: 2162, + 0xCDD7: 2163, + 0xCE41: 2164, + 0xCE45: 2165, + 0xCE61: 2166, + 0xCE65: 2167, + 0xCE69: 2168, + 0xCE73: 2169, + 0xCE75: 2170, + 0xCE81: 2171, + 0xCE82: 2172, + 0xCE85: 2173, + 0xCE88: 2174, + 0xCE89: 2175, + 0xCE8B: 2176, + 0xCE91: 2177, + 0xCE93: 2178, + 0xCE95: 2179, + 0xCE97: 2180, + 0xCEA1: 2181, + 0xCEB7: 2182, + 0xCEE1: 2183, + 0xCEE5: 2184, + 0xCEE9: 2185, + 0xCEF1: 2186, + 0xCEF5: 2187, + 0xCF41: 2188, + 0xCF45: 2189, + 0xCF49: 2190, + 0xCF51: 2191, + 0xCF55: 2192, + 0xCF57: 2193, + 0xCF61: 2194, + 0xCF65: 2195, + 0xCF69: 2196, + 0xCF71: 2197, + 0xCF73: 2198, + 0xCF75: 2199, + 0xCFA1: 2200, + 0xCFA2: 2201, + 0xCFA5: 2202, + 0xCFA9: 2203, + 0xCFB1: 2204, + 0xCFB3: 2205, + 0xCFB5: 2206, + 0xCFB7: 2207, + 0xD061: 2208, + 0xD062: 2209, + 0xD065: 2210, + 0xD069: 2211, + 0xD06E: 2212, + 0xD071: 2213, + 0xD073: 2214, + 0xD075: 2215, + 0xD077: 2216, + 0xD081: 2217, + 0xD082: 2218, + 0xD085: 2219, + 0xD089: 2220, + 0xD091: 2221, + 0xD093: 2222, + 0xD095: 2223, + 0xD096: 2224, + 0xD097: 2225, + 0xD0A1: 2226, + 0xD0B7: 2227, + 0xD0E1: 2228, + 0xD0E2: 2229, + 0xD0E5: 2230, + 0xD0E9: 2231, + 0xD0EB: 2232, + 0xD0F1: 2233, + 0xD0F3: 2234, + 0xD0F5: 2235, + 0xD0F7: 2236, + 0xD141: 2237, + 0xD142: 2238, + 0xD145: 2239, + 0xD149: 2240, + 0xD151: 2241, + 0xD153: 2242, + 0xD155: 2243, + 0xD157: 2244, + 0xD161: 2245, + 0xD162: 2246, + 0xD165: 2247, + 0xD169: 2248, + 0xD171: 2249, + 0xD173: 2250, + 0xD175: 2251, + 0xD176: 2252, + 0xD177: 2253, + 0xD181: 2254, + 0xD185: 2255, + 0xD189: 2256, + 0xD193: 2257, + 0xD1A1: 2258, + 0xD1A2: 2259, + 0xD1A5: 2260, + 0xD1A9: 2261, + 0xD1AE: 2262, + 0xD1B1: 2263, + 0xD1B3: 2264, + 0xD1B5: 2265, + 0xD1B7: 2266, + 0xD1BB: 2267, + 0xD1C1: 2268, + 0xD1C2: 2269, + 0xD1C5: 2270, + 0xD1C9: 2271, + 0xD1D5: 2272, + 0xD1D7: 2273, + 0xD1E1: 2274, + 0xD1E2: 2275, + 0xD1E5: 2276, + 0xD1F5: 2277, + 0xD1F7: 2278, + 0xD241: 2279, + 0xD242: 2280, + 0xD245: 2281, + 0xD249: 2282, + 0xD253: 2283, + 0xD255: 2284, + 0xD257: 2285, + 0xD261: 2286, + 0xD265: 2287, + 0xD269: 2288, + 0xD273: 2289, + 0xD275: 2290, + 0xD281: 2291, + 0xD282: 2292, + 0xD285: 2293, + 0xD289: 2294, + 0xD28E: 2295, + 0xD291: 2296, + 0xD295: 2297, + 0xD297: 2298, + 0xD2A1: 2299, + 0xD2A5: 2300, + 0xD2A9: 2301, + 0xD2B1: 2302, + 0xD2B7: 2303, + 0xD2C1: 2304, + 0xD2C2: 2305, + 0xD2C5: 2306, + 0xD2C9: 2307, + 0xD2D7: 2308, + 0xD2E1: 2309, + 0xD2E2: 2310, + 0xD2E5: 2311, + 0xD2E9: 2312, + 0xD2F1: 2313, + 0xD2F3: 2314, + 0xD2F5: 2315, + 0xD2F7: 2316, + 0xD341: 2317, + 0xD342: 2318, + 0xD345: 2319, + 0xD349: 2320, + 0xD351: 2321, + 0xD355: 2322, + 0xD357: 2323, + 0xD361: 2324, + 0xD362: 2325, + 0xD365: 2326, + 0xD367: 2327, + 0xD368: 2328, + 0xD369: 2329, + 0xD36A: 2330, + 0xD371: 2331, + 0xD373: 2332, + 0xD375: 2333, + 0xD377: 2334, + 0xD37B: 2335, + 0xD381: 2336, + 0xD385: 2337, + 0xD389: 2338, + 0xD391: 2339, + 0xD393: 2340, + 0xD397: 2341, + 0xD3A1: 2342, + 0xD3A2: 2343, + 0xD3A5: 2344, + 0xD3A9: 2345, + 0xD3B1: 2346, + 0xD3B3: 2347, + 0xD3B5: 2348, + 0xD3B7: 2349, +} diff --git a/lib/chardet/johabprober.py b/lib/chardet/johabprober.py new file mode 100644 index 00000000..08b2999b --- /dev/null +++ b/lib/chardet/johabprober.py @@ -0,0 +1,47 @@ +######################## BEGIN LICENSE BLOCK ######################## +# The Original Code is mozilla.org code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 1998 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Mark Pilgrim - port to Python +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA +######################### END LICENSE BLOCK ######################### + +from .chardistribution import JOHABDistributionAnalysis +from .codingstatemachine import CodingStateMachine +from .mbcharsetprober import MultiByteCharSetProber +from .mbcssm import JOHAB_SM_MODEL + + +class JOHABProber(MultiByteCharSetProber): + def __init__(self) -> None: + super().__init__() + self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL) + self.distribution_analyzer = JOHABDistributionAnalysis() + self.reset() + + @property + def charset_name(self) -> str: + return "Johab" + + @property + def language(self) -> str: + return "Korean" diff --git a/lib/chardet/jpcntx.py b/lib/chardet/jpcntx.py index 624d5349..8c18dda8 100644 --- a/lib/chardet/jpcntx.py +++ b/lib/chardet/jpcntx.py @@ -25,110 +25,114 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Tuple, Union # This is hiragana 2-char sequence table, the number in each cell represents its frequency category -jp2CharContext = ( -(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1), -(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4), -(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2), -(0,4,0,5,0,5,0,4,0,4,5,4,4,3,5,3,5,1,5,3,4,3,4,4,3,4,3,3,4,3,5,4,4,3,5,5,3,5,5,5,3,5,5,3,4,5,5,3,1,3,2,0,3,4,0,4,2,0,4,2,1,5,3,2,3,5,0,4,0,2,0,5,4,4,5,4,5,0,4,0,0,4,4), -(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), -(0,3,0,4,0,3,0,3,0,4,5,4,3,3,3,3,4,3,5,4,4,3,5,4,4,3,4,3,4,4,4,4,5,3,4,4,3,4,5,5,4,5,5,1,4,5,4,3,0,3,3,1,3,3,0,4,4,0,3,3,1,5,3,3,3,5,0,4,0,3,0,4,4,3,4,3,3,0,4,1,1,3,4), -(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), -(0,4,0,3,0,3,0,4,0,3,4,4,3,2,2,1,2,1,3,1,3,3,3,3,3,4,3,1,3,3,5,3,3,0,4,3,0,5,4,3,3,5,4,4,3,4,4,5,0,1,2,0,1,2,0,2,2,0,1,0,0,5,2,2,1,4,0,3,0,1,0,4,4,3,5,4,3,0,2,1,0,4,3), -(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), -(0,3,0,5,0,4,0,2,1,4,4,2,4,1,4,2,4,2,4,3,3,3,4,3,3,3,3,1,4,2,3,3,3,1,4,4,1,1,1,4,3,3,2,0,2,4,3,2,0,3,3,0,3,1,1,0,0,0,3,3,0,4,2,2,3,4,0,4,0,3,0,4,4,5,3,4,4,0,3,0,0,1,4), -(1,4,0,4,0,4,0,4,0,3,5,4,4,3,4,3,5,4,3,3,4,3,5,4,4,4,4,3,4,2,4,3,3,1,5,4,3,2,4,5,4,5,5,4,4,5,4,4,0,3,2,2,3,3,0,4,3,1,3,2,1,4,3,3,4,5,0,3,0,2,0,4,5,5,4,5,4,0,4,0,0,5,4), -(0,5,0,5,0,4,0,3,0,4,4,3,4,3,3,3,4,0,4,4,4,3,4,3,4,3,3,1,4,2,4,3,4,0,5,4,1,4,5,4,4,5,3,2,4,3,4,3,2,4,1,3,3,3,2,3,2,0,4,3,3,4,3,3,3,4,0,4,0,3,0,4,5,4,4,4,3,0,4,1,0,1,3), -(0,3,1,4,0,3,0,2,0,3,4,4,3,1,4,2,3,3,4,3,4,3,4,3,4,4,3,2,3,1,5,4,4,1,4,4,3,5,4,4,3,5,5,4,3,4,4,3,1,2,3,1,2,2,0,3,2,0,3,1,0,5,3,3,3,4,3,3,3,3,4,4,4,4,5,4,2,0,3,3,2,4,3), -(0,2,0,3,0,1,0,1,0,0,3,2,0,0,2,0,1,0,2,1,3,3,3,1,2,3,1,0,1,0,4,2,1,1,3,3,0,4,3,3,1,4,3,3,0,3,3,2,0,0,0,0,1,0,0,2,0,0,0,0,0,4,1,0,2,3,2,2,2,1,3,3,3,4,4,3,2,0,3,1,0,3,3), -(0,4,0,4,0,3,0,3,0,4,4,4,3,3,3,3,3,3,4,3,4,2,4,3,4,3,3,2,4,3,4,5,4,1,4,5,3,5,4,5,3,5,4,0,3,5,5,3,1,3,3,2,2,3,0,3,4,1,3,3,2,4,3,3,3,4,0,4,0,3,0,4,5,4,4,5,3,0,4,1,0,3,4), -(0,2,0,3,0,3,0,0,0,2,2,2,1,0,1,0,0,0,3,0,3,0,3,0,1,3,1,0,3,1,3,3,3,1,3,3,3,0,1,3,1,3,4,0,0,3,1,1,0,3,2,0,0,0,0,1,3,0,1,0,0,3,3,2,0,3,0,0,0,0,0,3,4,3,4,3,3,0,3,0,0,2,3), -(2,3,0,3,0,2,0,1,0,3,3,4,3,1,3,1,1,1,3,1,4,3,4,3,3,3,0,0,3,1,5,4,3,1,4,3,2,5,5,4,4,4,4,3,3,4,4,4,0,2,1,1,3,2,0,1,2,0,0,1,0,4,1,3,3,3,0,3,0,1,0,4,4,4,5,5,3,0,2,0,0,4,4), -(0,2,0,1,0,3,1,3,0,2,3,3,3,0,3,1,0,0,3,0,3,2,3,1,3,2,1,1,0,0,4,2,1,0,2,3,1,4,3,2,0,4,4,3,1,3,1,3,0,1,0,0,1,0,0,0,1,0,0,0,0,4,1,1,1,2,0,3,0,0,0,3,4,2,4,3,2,0,1,0,0,3,3), -(0,1,0,4,0,5,0,4,0,2,4,4,2,3,3,2,3,3,5,3,3,3,4,3,4,2,3,0,4,3,3,3,4,1,4,3,2,1,5,5,3,4,5,1,3,5,4,2,0,3,3,0,1,3,0,4,2,0,1,3,1,4,3,3,3,3,0,3,0,1,0,3,4,4,4,5,5,0,3,0,1,4,5), -(0,2,0,3,0,3,0,0,0,2,3,1,3,0,4,0,1,1,3,0,3,4,3,2,3,1,0,3,3,2,3,1,3,0,2,3,0,2,1,4,1,2,2,0,0,3,3,0,0,2,0,0,0,1,0,0,0,0,2,2,0,3,2,1,3,3,0,2,0,2,0,0,3,3,1,2,4,0,3,0,2,2,3), -(2,4,0,5,0,4,0,4,0,2,4,4,4,3,4,3,3,3,1,2,4,3,4,3,4,4,5,0,3,3,3,3,2,0,4,3,1,4,3,4,1,4,4,3,3,4,4,3,1,2,3,0,4,2,0,4,1,0,3,3,0,4,3,3,3,4,0,4,0,2,0,3,5,3,4,5,2,0,3,0,0,4,5), -(0,3,0,4,0,1,0,1,0,1,3,2,2,1,3,0,3,0,2,0,2,0,3,0,2,0,0,0,1,0,1,1,0,0,3,1,0,0,0,4,0,3,1,0,2,1,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,2,2,3,1,0,3,0,0,0,1,4,4,4,3,0,0,4,0,0,1,4), -(1,4,1,5,0,3,0,3,0,4,5,4,4,3,5,3,3,4,4,3,4,1,3,3,3,3,2,1,4,1,5,4,3,1,4,4,3,5,4,4,3,5,4,3,3,4,4,4,0,3,3,1,2,3,0,3,1,0,3,3,0,5,4,4,4,4,4,4,3,3,5,4,4,3,3,5,4,0,3,2,0,4,4), -(0,2,0,3,0,1,0,0,0,1,3,3,3,2,4,1,3,0,3,1,3,0,2,2,1,1,0,0,2,0,4,3,1,0,4,3,0,4,4,4,1,4,3,1,1,3,3,1,0,2,0,0,1,3,0,0,0,0,2,0,0,4,3,2,4,3,5,4,3,3,3,4,3,3,4,3,3,0,2,1,0,3,3), -(0,2,0,4,0,3,0,2,0,2,5,5,3,4,4,4,4,1,4,3,3,0,4,3,4,3,1,3,3,2,4,3,0,3,4,3,0,3,4,4,2,4,4,0,4,5,3,3,2,2,1,1,1,2,0,1,5,0,3,3,2,4,3,3,3,4,0,3,0,2,0,4,4,3,5,5,0,0,3,0,2,3,3), -(0,3,0,4,0,3,0,1,0,3,4,3,3,1,3,3,3,0,3,1,3,0,4,3,3,1,1,0,3,0,3,3,0,0,4,4,0,1,5,4,3,3,5,0,3,3,4,3,0,2,0,1,1,1,0,1,3,0,1,2,1,3,3,2,3,3,0,3,0,1,0,1,3,3,4,4,1,0,1,2,2,1,3), -(0,1,0,4,0,4,0,3,0,1,3,3,3,2,3,1,1,0,3,0,3,3,4,3,2,4,2,0,1,0,4,3,2,0,4,3,0,5,3,3,2,4,4,4,3,3,3,4,0,1,3,0,0,1,0,0,1,0,0,0,0,4,2,3,3,3,0,3,0,0,0,4,4,4,5,3,2,0,3,3,0,3,5), -(0,2,0,3,0,0,0,3,0,1,3,0,2,0,0,0,1,0,3,1,1,3,3,0,0,3,0,0,3,0,2,3,1,0,3,1,0,3,3,2,0,4,2,2,0,2,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,2,1,2,0,1,0,1,0,0,0,1,3,1,2,0,0,0,1,0,0,1,4), -(0,3,0,3,0,5,0,1,0,2,4,3,1,3,3,2,1,1,5,2,1,0,5,1,2,0,0,0,3,3,2,2,3,2,4,3,0,0,3,3,1,3,3,0,2,5,3,4,0,3,3,0,1,2,0,2,2,0,3,2,0,2,2,3,3,3,0,2,0,1,0,3,4,4,2,5,4,0,3,0,0,3,5), -(0,3,0,3,0,3,0,1,0,3,3,3,3,0,3,0,2,0,2,1,1,0,2,0,1,0,0,0,2,1,0,0,1,0,3,2,0,0,3,3,1,2,3,1,0,3,3,0,0,1,0,0,0,0,0,2,0,0,0,0,0,2,3,1,2,3,0,3,0,1,0,3,2,1,0,4,3,0,1,1,0,3,3), -(0,4,0,5,0,3,0,3,0,4,5,5,4,3,5,3,4,3,5,3,3,2,5,3,4,4,4,3,4,3,4,5,5,3,4,4,3,4,4,5,4,4,4,3,4,5,5,4,2,3,4,2,3,4,0,3,3,1,4,3,2,4,3,3,5,5,0,3,0,3,0,5,5,5,5,4,4,0,4,0,1,4,4), -(0,4,0,4,0,3,0,3,0,3,5,4,4,2,3,2,5,1,3,2,5,1,4,2,3,2,3,3,4,3,3,3,3,2,5,4,1,3,3,5,3,4,4,0,4,4,3,1,1,3,1,0,2,3,0,2,3,0,3,0,0,4,3,1,3,4,0,3,0,2,0,4,4,4,3,4,5,0,4,0,0,3,4), -(0,3,0,3,0,3,1,2,0,3,4,4,3,3,3,0,2,2,4,3,3,1,3,3,3,1,1,0,3,1,4,3,2,3,4,4,2,4,4,4,3,4,4,3,2,4,4,3,1,3,3,1,3,3,0,4,1,0,2,2,1,4,3,2,3,3,5,4,3,3,5,4,4,3,3,0,4,0,3,2,2,4,4), -(0,2,0,1,0,0,0,0,0,1,2,1,3,0,0,0,0,0,2,0,1,2,1,0,0,1,0,0,0,0,3,0,0,1,0,1,1,3,1,0,0,0,1,1,0,1,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,1,2,2,0,3,4,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1), -(0,1,0,0,0,1,0,0,0,0,4,0,4,1,4,0,3,0,4,0,3,0,4,0,3,0,3,0,4,1,5,1,4,0,0,3,0,5,0,5,2,0,1,0,0,0,2,1,4,0,1,3,0,0,3,0,0,3,1,1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0), -(1,4,0,5,0,3,0,2,0,3,5,4,4,3,4,3,5,3,4,3,3,0,4,3,3,3,3,3,3,2,4,4,3,1,3,4,4,5,4,4,3,4,4,1,3,5,4,3,3,3,1,2,2,3,3,1,3,1,3,3,3,5,3,3,4,5,0,3,0,3,0,3,4,3,4,4,3,0,3,0,2,4,3), -(0,1,0,4,0,0,0,0,0,1,4,0,4,1,4,2,4,0,3,0,1,0,1,0,0,0,0,0,2,0,3,1,1,1,0,3,0,0,0,1,2,1,0,0,1,1,1,1,0,1,0,0,0,1,0,0,3,0,0,0,0,3,2,0,2,2,0,1,0,0,0,2,3,2,3,3,0,0,0,0,2,1,0), -(0,5,1,5,0,3,0,3,0,5,4,4,5,1,5,3,3,0,4,3,4,3,5,3,4,3,3,2,4,3,4,3,3,0,3,3,1,4,4,3,4,4,4,3,4,5,5,3,2,3,1,1,3,3,1,3,1,1,3,3,2,4,5,3,3,5,0,4,0,3,0,4,4,3,5,3,3,0,3,4,0,4,3), -(0,5,0,5,0,3,0,2,0,4,4,3,5,2,4,3,3,3,4,4,4,3,5,3,5,3,3,1,4,0,4,3,3,0,3,3,0,4,4,4,4,5,4,3,3,5,5,3,2,3,1,2,3,2,0,1,0,0,3,2,2,4,4,3,1,5,0,4,0,3,0,4,3,1,3,2,1,0,3,3,0,3,3), -(0,4,0,5,0,5,0,4,0,4,5,5,5,3,4,3,3,2,5,4,4,3,5,3,5,3,4,0,4,3,4,4,3,2,4,4,3,4,5,4,4,5,5,0,3,5,5,4,1,3,3,2,3,3,1,3,1,0,4,3,1,4,4,3,4,5,0,4,0,2,0,4,3,4,4,3,3,0,4,0,0,5,5), -(0,4,0,4,0,5,0,1,1,3,3,4,4,3,4,1,3,0,5,1,3,0,3,1,3,1,1,0,3,0,3,3,4,0,4,3,0,4,4,4,3,4,4,0,3,5,4,1,0,3,0,0,2,3,0,3,1,0,3,1,0,3,2,1,3,5,0,3,0,1,0,3,2,3,3,4,4,0,2,2,0,4,4), -(2,4,0,5,0,4,0,3,0,4,5,5,4,3,5,3,5,3,5,3,5,2,5,3,4,3,3,4,3,4,5,3,2,1,5,4,3,2,3,4,5,3,4,1,2,5,4,3,0,3,3,0,3,2,0,2,3,0,4,1,0,3,4,3,3,5,0,3,0,1,0,4,5,5,5,4,3,0,4,2,0,3,5), -(0,5,0,4,0,4,0,2,0,5,4,3,4,3,4,3,3,3,4,3,4,2,5,3,5,3,4,1,4,3,4,4,4,0,3,5,0,4,4,4,4,5,3,1,3,4,5,3,3,3,3,3,3,3,0,2,2,0,3,3,2,4,3,3,3,5,3,4,1,3,3,5,3,2,0,0,0,0,4,3,1,3,3), -(0,1,0,3,0,3,0,1,0,1,3,3,3,2,3,3,3,0,3,0,0,0,3,1,3,0,0,0,2,2,2,3,0,0,3,2,0,1,2,4,1,3,3,0,0,3,3,3,0,1,0,0,2,1,0,0,3,0,3,1,0,3,0,0,1,3,0,2,0,1,0,3,3,1,3,3,0,0,1,1,0,3,3), -(0,2,0,3,0,2,1,4,0,2,2,3,1,1,3,1,1,0,2,0,3,1,2,3,1,3,0,0,1,0,4,3,2,3,3,3,1,4,2,3,3,3,3,1,0,3,1,4,0,1,1,0,1,2,0,1,1,0,1,1,0,3,1,3,2,2,0,1,0,0,0,2,3,3,3,1,0,0,0,0,0,2,3), -(0,5,0,4,0,5,0,2,0,4,5,5,3,3,4,3,3,1,5,4,4,2,4,4,4,3,4,2,4,3,5,5,4,3,3,4,3,3,5,5,4,5,5,1,3,4,5,3,1,4,3,1,3,3,0,3,3,1,4,3,1,4,5,3,3,5,0,4,0,3,0,5,3,3,1,4,3,0,4,0,1,5,3), -(0,5,0,5,0,4,0,2,0,4,4,3,4,3,3,3,3,3,5,4,4,4,4,4,4,5,3,3,5,2,4,4,4,3,4,4,3,3,4,4,5,5,3,3,4,3,4,3,3,4,3,3,3,3,1,2,2,1,4,3,3,5,4,4,3,4,0,4,0,3,0,4,4,4,4,4,1,0,4,2,0,2,4), -(0,4,0,4,0,3,0,1,0,3,5,2,3,0,3,0,2,1,4,2,3,3,4,1,4,3,3,2,4,1,3,3,3,0,3,3,0,0,3,3,3,5,3,3,3,3,3,2,0,2,0,0,2,0,0,2,0,0,1,0,0,3,1,2,2,3,0,3,0,2,0,4,4,3,3,4,1,0,3,0,0,2,4), -(0,0,0,4,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,1,0,2,0,1,0,0,0,0,0,3,1,3,0,3,2,0,0,0,1,0,3,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,4,0,2,0,0,0,0,0,0,2), -(0,2,1,3,0,2,0,2,0,3,3,3,3,1,3,1,3,3,3,3,3,3,4,2,2,1,2,1,4,0,4,3,1,3,3,3,2,4,3,5,4,3,3,3,3,3,3,3,0,1,3,0,2,0,0,1,0,0,1,0,0,4,2,0,2,3,0,3,3,0,3,3,4,2,3,1,4,0,1,2,0,2,3), -(0,3,0,3,0,1,0,3,0,2,3,3,3,0,3,1,2,0,3,3,2,3,3,2,3,2,3,1,3,0,4,3,2,0,3,3,1,4,3,3,2,3,4,3,1,3,3,1,1,0,1,1,0,1,0,1,0,1,0,0,0,4,1,1,0,3,0,3,1,0,2,3,3,3,3,3,1,0,0,2,0,3,3), -(0,0,0,0,0,0,0,0,0,0,3,0,2,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,3,0,3,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,2,3,0,0,0,0,0,0,0,0,3), -(0,2,0,3,1,3,0,3,0,2,3,3,3,1,3,1,3,1,3,1,3,3,3,1,3,0,2,3,1,1,4,3,3,2,3,3,1,2,2,4,1,3,3,0,1,4,2,3,0,1,3,0,3,0,0,1,3,0,2,0,0,3,3,2,1,3,0,3,0,2,0,3,4,4,4,3,1,0,3,0,0,3,3), -(0,2,0,1,0,2,0,0,0,1,3,2,2,1,3,0,1,1,3,0,3,2,3,1,2,0,2,0,1,1,3,3,3,0,3,3,1,1,2,3,2,3,3,1,2,3,2,0,0,1,0,0,0,0,0,0,3,0,1,0,0,2,1,2,1,3,0,3,0,0,0,3,4,4,4,3,2,0,2,0,0,2,4), -(0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,0,0,3), -(0,3,0,3,0,2,0,3,0,3,3,3,2,3,2,2,2,0,3,1,3,3,3,2,3,3,0,0,3,0,3,2,2,0,2,3,1,4,3,4,3,3,2,3,1,5,4,4,0,3,1,2,1,3,0,3,1,1,2,0,2,3,1,3,1,3,0,3,0,1,0,3,3,4,4,2,1,0,2,1,0,2,4), -(0,1,0,3,0,1,0,2,0,1,4,2,5,1,4,0,2,0,2,1,3,1,4,0,2,1,0,0,2,1,4,1,1,0,3,3,0,5,1,3,2,3,3,1,0,3,2,3,0,1,0,0,0,0,0,0,1,0,0,0,0,4,0,1,0,3,0,2,0,1,0,3,3,3,4,3,3,0,0,0,0,2,3), -(0,0,0,1,0,0,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,1,0,0,0,0,0,3), -(0,1,0,3,0,4,0,3,0,2,4,3,1,0,3,2,2,1,3,1,2,2,3,1,1,1,2,1,3,0,1,2,0,1,3,2,1,3,0,5,5,1,0,0,1,3,2,1,0,3,0,0,1,0,0,0,0,0,3,4,0,1,1,1,3,2,0,2,0,1,0,2,3,3,1,2,3,0,1,0,1,0,4), -(0,0,0,1,0,3,0,3,0,2,2,1,0,0,4,0,3,0,3,1,3,0,3,0,3,0,1,0,3,0,3,1,3,0,3,3,0,0,1,2,1,1,1,0,1,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,2,2,1,2,0,0,2,0,0,0,0,2,3,3,3,3,0,0,0,0,1,4), -(0,0,0,3,0,3,0,0,0,0,3,1,1,0,3,0,1,0,2,0,1,0,0,0,0,0,0,0,1,0,3,0,2,0,2,3,0,0,2,2,3,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,2,3), -(2,4,0,5,0,5,0,4,0,3,4,3,3,3,4,3,3,3,4,3,4,4,5,4,5,5,5,2,3,0,5,5,4,1,5,4,3,1,5,4,3,4,4,3,3,4,3,3,0,3,2,0,2,3,0,3,0,0,3,3,0,5,3,2,3,3,0,3,0,3,0,3,4,5,4,5,3,0,4,3,0,3,4), -(0,3,0,3,0,3,0,3,0,3,3,4,3,2,3,2,3,0,4,3,3,3,3,3,3,3,3,0,3,2,4,3,3,1,3,4,3,4,4,4,3,4,4,3,2,4,4,1,0,2,0,0,1,1,0,2,0,0,3,1,0,5,3,2,1,3,0,3,0,1,2,4,3,2,4,3,3,0,3,2,0,4,4), -(0,3,0,3,0,1,0,0,0,1,4,3,3,2,3,1,3,1,4,2,3,2,4,2,3,4,3,0,2,2,3,3,3,0,3,3,3,0,3,4,1,3,3,0,3,4,3,3,0,1,1,0,1,0,0,0,4,0,3,0,0,3,1,2,1,3,0,4,0,1,0,4,3,3,4,3,3,0,2,0,0,3,3), -(0,3,0,4,0,1,0,3,0,3,4,3,3,0,3,3,3,1,3,1,3,3,4,3,3,3,0,0,3,1,5,3,3,1,3,3,2,5,4,3,3,4,5,3,2,5,3,4,0,1,0,0,0,0,0,2,0,0,1,1,0,4,2,2,1,3,0,3,0,2,0,4,4,3,5,3,2,0,1,1,0,3,4), -(0,5,0,4,0,5,0,2,0,4,4,3,3,2,3,3,3,1,4,3,4,1,5,3,4,3,4,0,4,2,4,3,4,1,5,4,0,4,4,4,4,5,4,1,3,5,4,2,1,4,1,1,3,2,0,3,1,0,3,2,1,4,3,3,3,4,0,4,0,3,0,4,4,4,3,3,3,0,4,2,0,3,4), -(1,4,0,4,0,3,0,1,0,3,3,3,1,1,3,3,2,2,3,3,1,0,3,2,2,1,2,0,3,1,2,1,2,0,3,2,0,2,2,3,3,4,3,0,3,3,1,2,0,1,1,3,1,2,0,0,3,0,1,1,0,3,2,2,3,3,0,3,0,0,0,2,3,3,4,3,3,0,1,0,0,1,4), -(0,4,0,4,0,4,0,0,0,3,4,4,3,1,4,2,3,2,3,3,3,1,4,3,4,0,3,0,4,2,3,3,2,2,5,4,2,1,3,4,3,4,3,1,3,3,4,2,0,2,1,0,3,3,0,0,2,0,3,1,0,4,4,3,4,3,0,4,0,1,0,2,4,4,4,4,4,0,3,2,0,3,3), -(0,0,0,1,0,4,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,3,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2), -(0,2,0,3,0,4,0,4,0,1,3,3,3,0,4,0,2,1,2,1,1,1,2,0,3,1,1,0,1,0,3,1,0,0,3,3,2,0,1,1,0,0,0,0,0,1,0,2,0,2,2,0,3,1,0,0,1,0,1,1,0,1,2,0,3,0,0,0,0,1,0,0,3,3,4,3,1,0,1,0,3,0,2), -(0,0,0,3,0,5,0,0,0,0,1,0,2,0,3,1,0,1,3,0,0,0,2,0,0,0,1,0,0,0,1,1,0,0,4,0,0,0,2,3,0,1,4,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,3), -(0,2,0,5,0,5,0,1,0,2,4,3,3,2,5,1,3,2,3,3,3,0,4,1,2,0,3,0,4,0,2,2,1,1,5,3,0,0,1,4,2,3,2,0,3,3,3,2,0,2,4,1,1,2,0,1,1,0,3,1,0,1,3,1,2,3,0,2,0,0,0,1,3,5,4,4,4,0,3,0,0,1,3), -(0,4,0,5,0,4,0,4,0,4,5,4,3,3,4,3,3,3,4,3,4,4,5,3,4,5,4,2,4,2,3,4,3,1,4,4,1,3,5,4,4,5,5,4,4,5,5,5,2,3,3,1,4,3,1,3,3,0,3,3,1,4,3,4,4,4,0,3,0,4,0,3,3,4,4,5,0,0,4,3,0,4,5), -(0,4,0,4,0,3,0,3,0,3,4,4,4,3,3,2,4,3,4,3,4,3,5,3,4,3,2,1,4,2,4,4,3,1,3,4,2,4,5,5,3,4,5,4,1,5,4,3,0,3,2,2,3,2,1,3,1,0,3,3,3,5,3,3,3,5,4,4,2,3,3,4,3,3,3,2,1,0,3,2,1,4,3), -(0,4,0,5,0,4,0,3,0,3,5,5,3,2,4,3,4,0,5,4,4,1,4,4,4,3,3,3,4,3,5,5,2,3,3,4,1,2,5,5,3,5,5,2,3,5,5,4,0,3,2,0,3,3,1,1,5,1,4,1,0,4,3,2,3,5,0,4,0,3,0,5,4,3,4,3,0,0,4,1,0,4,4), -(1,3,0,4,0,2,0,2,0,2,5,5,3,3,3,3,3,0,4,2,3,4,4,4,3,4,0,0,3,4,5,4,3,3,3,3,2,5,5,4,5,5,5,4,3,5,5,5,1,3,1,0,1,0,0,3,2,0,4,2,0,5,2,3,2,4,1,3,0,3,0,4,5,4,5,4,3,0,4,2,0,5,4), -(0,3,0,4,0,5,0,3,0,3,4,4,3,2,3,2,3,3,3,3,3,2,4,3,3,2,2,0,3,3,3,3,3,1,3,3,3,0,4,4,3,4,4,1,1,4,4,2,0,3,1,0,1,1,0,4,1,0,2,3,1,3,3,1,3,4,0,3,0,1,0,3,1,3,0,0,1,0,2,0,0,4,4), -(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), -(0,3,0,3,0,2,0,3,0,1,5,4,3,3,3,1,4,2,1,2,3,4,4,2,4,4,5,0,3,1,4,3,4,0,4,3,3,3,2,3,2,5,3,4,3,2,2,3,0,0,3,0,2,1,0,1,2,0,0,0,0,2,1,1,3,1,0,2,0,4,0,3,4,4,4,5,2,0,2,0,0,1,3), -(0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,1,0,0,0,4,2,1,1,0,1,0,3,2,0,0,3,1,1,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,2,0,0,0,1,4,0,4,2,1,0,0,0,0,0,1), -(0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,3,1,0,0,0,2,0,2,1,0,0,1,2,1,0,1,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,3,1,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,2), -(0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3), -(0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1), +# fmt: off +jp2_char_context = ( + (0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), + (2, 4, 0, 4, 0, 3, 0, 4, 0, 3, 4, 4, 4, 2, 4, 3, 3, 4, 3, 2, 3, 3, 4, 2, 3, 3, 3, 2, 4, 1, 4, 3, 3, 1, 5, 4, 3, 4, 3, 4, 3, 5, 3, 0, 3, 5, 4, 2, 0, 3, 1, 0, 3, 3, 0, 3, 3, 0, 1, 1, 0, 4, 3, 0, 3, 3, 0, 4, 0, 2, 0, 3, 5, 5, 5, 5, 4, 0, 4, 1, 0, 3, 4), + (0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2), + (0, 4, 0, 5, 0, 5, 0, 4, 0, 4, 5, 4, 4, 3, 5, 3, 5, 1, 5, 3, 4, 3, 4, 4, 3, 4, 3, 3, 4, 3, 5, 4, 4, 3, 5, 5, 3, 5, 5, 5, 3, 5, 5, 3, 4, 5, 5, 3, 1, 3, 2, 0, 3, 4, 0, 4, 2, 0, 4, 2, 1, 5, 3, 2, 3, 5, 0, 4, 0, 2, 0, 5, 4, 4, 5, 4, 5, 0, 4, 0, 0, 4, 4), + (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + (0, 3, 0, 4, 0, 3, 0, 3, 0, 4, 5, 4, 3, 3, 3, 3, 4, 3, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 4, 4, 4, 4, 5, 3, 4, 4, 3, 4, 5, 5, 4, 5, 5, 1, 4, 5, 4, 3, 0, 3, 3, 1, 3, 3, 0, 4, 4, 0, 3, 3, 1, 5, 3, 3, 3, 5, 0, 4, 0, 3, 0, 4, 4, 3, 4, 3, 3, 0, 4, 1, 1, 3, 4), + (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + (0, 4, 0, 3, 0, 3, 0, 4, 0, 3, 4, 4, 3, 2, 2, 1, 2, 1, 3, 1, 3, 3, 3, 3, 3, 4, 3, 1, 3, 3, 5, 3, 3, 0, 4, 3, 0, 5, 4, 3, 3, 5, 4, 4, 3, 4, 4, 5, 0, 1, 2, 0, 1, 2, 0, 2, 2, 0, 1, 0, 0, 5, 2, 2, 1, 4, 0, 3, 0, 1, 0, 4, 4, 3, 5, 4, 3, 0, 2, 1, 0, 4, 3), + (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + (0, 3, 0, 5, 0, 4, 0, 2, 1, 4, 4, 2, 4, 1, 4, 2, 4, 2, 4, 3, 3, 3, 4, 3, 3, 3, 3, 1, 4, 2, 3, 3, 3, 1, 4, 4, 1, 1, 1, 4, 3, 3, 2, 0, 2, 4, 3, 2, 0, 3, 3, 0, 3, 1, 1, 0, 0, 0, 3, 3, 0, 4, 2, 2, 3, 4, 0, 4, 0, 3, 0, 4, 4, 5, 3, 4, 4, 0, 3, 0, 0, 1, 4), + (1, 4, 0, 4, 0, 4, 0, 4, 0, 3, 5, 4, 4, 3, 4, 3, 5, 4, 3, 3, 4, 3, 5, 4, 4, 4, 4, 3, 4, 2, 4, 3, 3, 1, 5, 4, 3, 2, 4, 5, 4, 5, 5, 4, 4, 5, 4, 4, 0, 3, 2, 2, 3, 3, 0, 4, 3, 1, 3, 2, 1, 4, 3, 3, 4, 5, 0, 3, 0, 2, 0, 4, 5, 5, 4, 5, 4, 0, 4, 0, 0, 5, 4), + (0, 5, 0, 5, 0, 4, 0, 3, 0, 4, 4, 3, 4, 3, 3, 3, 4, 0, 4, 4, 4, 3, 4, 3, 4, 3, 3, 1, 4, 2, 4, 3, 4, 0, 5, 4, 1, 4, 5, 4, 4, 5, 3, 2, 4, 3, 4, 3, 2, 4, 1, 3, 3, 3, 2, 3, 2, 0, 4, 3, 3, 4, 3, 3, 3, 4, 0, 4, 0, 3, 0, 4, 5, 4, 4, 4, 3, 0, 4, 1, 0, 1, 3), + (0, 3, 1, 4, 0, 3, 0, 2, 0, 3, 4, 4, 3, 1, 4, 2, 3, 3, 4, 3, 4, 3, 4, 3, 4, 4, 3, 2, 3, 1, 5, 4, 4, 1, 4, 4, 3, 5, 4, 4, 3, 5, 5, 4, 3, 4, 4, 3, 1, 2, 3, 1, 2, 2, 0, 3, 2, 0, 3, 1, 0, 5, 3, 3, 3, 4, 3, 3, 3, 3, 4, 4, 4, 4, 5, 4, 2, 0, 3, 3, 2, 4, 3), + (0, 2, 0, 3, 0, 1, 0, 1, 0, 0, 3, 2, 0, 0, 2, 0, 1, 0, 2, 1, 3, 3, 3, 1, 2, 3, 1, 0, 1, 0, 4, 2, 1, 1, 3, 3, 0, 4, 3, 3, 1, 4, 3, 3, 0, 3, 3, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 4, 1, 0, 2, 3, 2, 2, 2, 1, 3, 3, 3, 4, 4, 3, 2, 0, 3, 1, 0, 3, 3), + (0, 4, 0, 4, 0, 3, 0, 3, 0, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 3, 4, 2, 4, 3, 4, 3, 3, 2, 4, 3, 4, 5, 4, 1, 4, 5, 3, 5, 4, 5, 3, 5, 4, 0, 3, 5, 5, 3, 1, 3, 3, 2, 2, 3, 0, 3, 4, 1, 3, 3, 2, 4, 3, 3, 3, 4, 0, 4, 0, 3, 0, 4, 5, 4, 4, 5, 3, 0, 4, 1, 0, 3, 4), + (0, 2, 0, 3, 0, 3, 0, 0, 0, 2, 2, 2, 1, 0, 1, 0, 0, 0, 3, 0, 3, 0, 3, 0, 1, 3, 1, 0, 3, 1, 3, 3, 3, 1, 3, 3, 3, 0, 1, 3, 1, 3, 4, 0, 0, 3, 1, 1, 0, 3, 2, 0, 0, 0, 0, 1, 3, 0, 1, 0, 0, 3, 3, 2, 0, 3, 0, 0, 0, 0, 0, 3, 4, 3, 4, 3, 3, 0, 3, 0, 0, 2, 3), + (2, 3, 0, 3, 0, 2, 0, 1, 0, 3, 3, 4, 3, 1, 3, 1, 1, 1, 3, 1, 4, 3, 4, 3, 3, 3, 0, 0, 3, 1, 5, 4, 3, 1, 4, 3, 2, 5, 5, 4, 4, 4, 4, 3, 3, 4, 4, 4, 0, 2, 1, 1, 3, 2, 0, 1, 2, 0, 0, 1, 0, 4, 1, 3, 3, 3, 0, 3, 0, 1, 0, 4, 4, 4, 5, 5, 3, 0, 2, 0, 0, 4, 4), + (0, 2, 0, 1, 0, 3, 1, 3, 0, 2, 3, 3, 3, 0, 3, 1, 0, 0, 3, 0, 3, 2, 3, 1, 3, 2, 1, 1, 0, 0, 4, 2, 1, 0, 2, 3, 1, 4, 3, 2, 0, 4, 4, 3, 1, 3, 1, 3, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 1, 1, 1, 2, 0, 3, 0, 0, 0, 3, 4, 2, 4, 3, 2, 0, 1, 0, 0, 3, 3), + (0, 1, 0, 4, 0, 5, 0, 4, 0, 2, 4, 4, 2, 3, 3, 2, 3, 3, 5, 3, 3, 3, 4, 3, 4, 2, 3, 0, 4, 3, 3, 3, 4, 1, 4, 3, 2, 1, 5, 5, 3, 4, 5, 1, 3, 5, 4, 2, 0, 3, 3, 0, 1, 3, 0, 4, 2, 0, 1, 3, 1, 4, 3, 3, 3, 3, 0, 3, 0, 1, 0, 3, 4, 4, 4, 5, 5, 0, 3, 0, 1, 4, 5), + (0, 2, 0, 3, 0, 3, 0, 0, 0, 2, 3, 1, 3, 0, 4, 0, 1, 1, 3, 0, 3, 4, 3, 2, 3, 1, 0, 3, 3, 2, 3, 1, 3, 0, 2, 3, 0, 2, 1, 4, 1, 2, 2, 0, 0, 3, 3, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 3, 2, 1, 3, 3, 0, 2, 0, 2, 0, 0, 3, 3, 1, 2, 4, 0, 3, 0, 2, 2, 3), + (2, 4, 0, 5, 0, 4, 0, 4, 0, 2, 4, 4, 4, 3, 4, 3, 3, 3, 1, 2, 4, 3, 4, 3, 4, 4, 5, 0, 3, 3, 3, 3, 2, 0, 4, 3, 1, 4, 3, 4, 1, 4, 4, 3, 3, 4, 4, 3, 1, 2, 3, 0, 4, 2, 0, 4, 1, 0, 3, 3, 0, 4, 3, 3, 3, 4, 0, 4, 0, 2, 0, 3, 5, 3, 4, 5, 2, 0, 3, 0, 0, 4, 5), + (0, 3, 0, 4, 0, 1, 0, 1, 0, 1, 3, 2, 2, 1, 3, 0, 3, 0, 2, 0, 2, 0, 3, 0, 2, 0, 0, 0, 1, 0, 1, 1, 0, 0, 3, 1, 0, 0, 0, 4, 0, 3, 1, 0, 2, 1, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 3, 1, 0, 3, 0, 0, 0, 1, 4, 4, 4, 3, 0, 0, 4, 0, 0, 1, 4), + (1, 4, 1, 5, 0, 3, 0, 3, 0, 4, 5, 4, 4, 3, 5, 3, 3, 4, 4, 3, 4, 1, 3, 3, 3, 3, 2, 1, 4, 1, 5, 4, 3, 1, 4, 4, 3, 5, 4, 4, 3, 5, 4, 3, 3, 4, 4, 4, 0, 3, 3, 1, 2, 3, 0, 3, 1, 0, 3, 3, 0, 5, 4, 4, 4, 4, 4, 4, 3, 3, 5, 4, 4, 3, 3, 5, 4, 0, 3, 2, 0, 4, 4), + (0, 2, 0, 3, 0, 1, 0, 0, 0, 1, 3, 3, 3, 2, 4, 1, 3, 0, 3, 1, 3, 0, 2, 2, 1, 1, 0, 0, 2, 0, 4, 3, 1, 0, 4, 3, 0, 4, 4, 4, 1, 4, 3, 1, 1, 3, 3, 1, 0, 2, 0, 0, 1, 3, 0, 0, 0, 0, 2, 0, 0, 4, 3, 2, 4, 3, 5, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 0, 2, 1, 0, 3, 3), + (0, 2, 0, 4, 0, 3, 0, 2, 0, 2, 5, 5, 3, 4, 4, 4, 4, 1, 4, 3, 3, 0, 4, 3, 4, 3, 1, 3, 3, 2, 4, 3, 0, 3, 4, 3, 0, 3, 4, 4, 2, 4, 4, 0, 4, 5, 3, 3, 2, 2, 1, 1, 1, 2, 0, 1, 5, 0, 3, 3, 2, 4, 3, 3, 3, 4, 0, 3, 0, 2, 0, 4, 4, 3, 5, 5, 0, 0, 3, 0, 2, 3, 3), + (0, 3, 0, 4, 0, 3, 0, 1, 0, 3, 4, 3, 3, 1, 3, 3, 3, 0, 3, 1, 3, 0, 4, 3, 3, 1, 1, 0, 3, 0, 3, 3, 0, 0, 4, 4, 0, 1, 5, 4, 3, 3, 5, 0, 3, 3, 4, 3, 0, 2, 0, 1, 1, 1, 0, 1, 3, 0, 1, 2, 1, 3, 3, 2, 3, 3, 0, 3, 0, 1, 0, 1, 3, 3, 4, 4, 1, 0, 1, 2, 2, 1, 3), + (0, 1, 0, 4, 0, 4, 0, 3, 0, 1, 3, 3, 3, 2, 3, 1, 1, 0, 3, 0, 3, 3, 4, 3, 2, 4, 2, 0, 1, 0, 4, 3, 2, 0, 4, 3, 0, 5, 3, 3, 2, 4, 4, 4, 3, 3, 3, 4, 0, 1, 3, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 4, 2, 3, 3, 3, 0, 3, 0, 0, 0, 4, 4, 4, 5, 3, 2, 0, 3, 3, 0, 3, 5), + (0, 2, 0, 3, 0, 0, 0, 3, 0, 1, 3, 0, 2, 0, 0, 0, 1, 0, 3, 1, 1, 3, 3, 0, 0, 3, 0, 0, 3, 0, 2, 3, 1, 0, 3, 1, 0, 3, 3, 2, 0, 4, 2, 2, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 0, 1, 0, 1, 0, 0, 0, 1, 3, 1, 2, 0, 0, 0, 1, 0, 0, 1, 4), + (0, 3, 0, 3, 0, 5, 0, 1, 0, 2, 4, 3, 1, 3, 3, 2, 1, 1, 5, 2, 1, 0, 5, 1, 2, 0, 0, 0, 3, 3, 2, 2, 3, 2, 4, 3, 0, 0, 3, 3, 1, 3, 3, 0, 2, 5, 3, 4, 0, 3, 3, 0, 1, 2, 0, 2, 2, 0, 3, 2, 0, 2, 2, 3, 3, 3, 0, 2, 0, 1, 0, 3, 4, 4, 2, 5, 4, 0, 3, 0, 0, 3, 5), + (0, 3, 0, 3, 0, 3, 0, 1, 0, 3, 3, 3, 3, 0, 3, 0, 2, 0, 2, 1, 1, 0, 2, 0, 1, 0, 0, 0, 2, 1, 0, 0, 1, 0, 3, 2, 0, 0, 3, 3, 1, 2, 3, 1, 0, 3, 3, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 3, 1, 2, 3, 0, 3, 0, 1, 0, 3, 2, 1, 0, 4, 3, 0, 1, 1, 0, 3, 3), + (0, 4, 0, 5, 0, 3, 0, 3, 0, 4, 5, 5, 4, 3, 5, 3, 4, 3, 5, 3, 3, 2, 5, 3, 4, 4, 4, 3, 4, 3, 4, 5, 5, 3, 4, 4, 3, 4, 4, 5, 4, 4, 4, 3, 4, 5, 5, 4, 2, 3, 4, 2, 3, 4, 0, 3, 3, 1, 4, 3, 2, 4, 3, 3, 5, 5, 0, 3, 0, 3, 0, 5, 5, 5, 5, 4, 4, 0, 4, 0, 1, 4, 4), + (0, 4, 0, 4, 0, 3, 0, 3, 0, 3, 5, 4, 4, 2, 3, 2, 5, 1, 3, 2, 5, 1, 4, 2, 3, 2, 3, 3, 4, 3, 3, 3, 3, 2, 5, 4, 1, 3, 3, 5, 3, 4, 4, 0, 4, 4, 3, 1, 1, 3, 1, 0, 2, 3, 0, 2, 3, 0, 3, 0, 0, 4, 3, 1, 3, 4, 0, 3, 0, 2, 0, 4, 4, 4, 3, 4, 5, 0, 4, 0, 0, 3, 4), + (0, 3, 0, 3, 0, 3, 1, 2, 0, 3, 4, 4, 3, 3, 3, 0, 2, 2, 4, 3, 3, 1, 3, 3, 3, 1, 1, 0, 3, 1, 4, 3, 2, 3, 4, 4, 2, 4, 4, 4, 3, 4, 4, 3, 2, 4, 4, 3, 1, 3, 3, 1, 3, 3, 0, 4, 1, 0, 2, 2, 1, 4, 3, 2, 3, 3, 5, 4, 3, 3, 5, 4, 4, 3, 3, 0, 4, 0, 3, 2, 2, 4, 4), + (0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 2, 1, 3, 0, 0, 0, 0, 0, 2, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 0, 3, 0, 0, 1, 0, 1, 1, 3, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0, 3, 4, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1), + (0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 4, 1, 4, 0, 3, 0, 4, 0, 3, 0, 4, 0, 3, 0, 3, 0, 4, 1, 5, 1, 4, 0, 0, 3, 0, 5, 0, 5, 2, 0, 1, 0, 0, 0, 2, 1, 4, 0, 1, 3, 0, 0, 3, 0, 0, 3, 1, 1, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), + (1, 4, 0, 5, 0, 3, 0, 2, 0, 3, 5, 4, 4, 3, 4, 3, 5, 3, 4, 3, 3, 0, 4, 3, 3, 3, 3, 3, 3, 2, 4, 4, 3, 1, 3, 4, 4, 5, 4, 4, 3, 4, 4, 1, 3, 5, 4, 3, 3, 3, 1, 2, 2, 3, 3, 1, 3, 1, 3, 3, 3, 5, 3, 3, 4, 5, 0, 3, 0, 3, 0, 3, 4, 3, 4, 4, 3, 0, 3, 0, 2, 4, 3), + (0, 1, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 4, 1, 4, 2, 4, 0, 3, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0, 3, 1, 1, 1, 0, 3, 0, 0, 0, 1, 2, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 3, 2, 0, 2, 2, 0, 1, 0, 0, 0, 2, 3, 2, 3, 3, 0, 0, 0, 0, 2, 1, 0), + (0, 5, 1, 5, 0, 3, 0, 3, 0, 5, 4, 4, 5, 1, 5, 3, 3, 0, 4, 3, 4, 3, 5, 3, 4, 3, 3, 2, 4, 3, 4, 3, 3, 0, 3, 3, 1, 4, 4, 3, 4, 4, 4, 3, 4, 5, 5, 3, 2, 3, 1, 1, 3, 3, 1, 3, 1, 1, 3, 3, 2, 4, 5, 3, 3, 5, 0, 4, 0, 3, 0, 4, 4, 3, 5, 3, 3, 0, 3, 4, 0, 4, 3), + (0, 5, 0, 5, 0, 3, 0, 2, 0, 4, 4, 3, 5, 2, 4, 3, 3, 3, 4, 4, 4, 3, 5, 3, 5, 3, 3, 1, 4, 0, 4, 3, 3, 0, 3, 3, 0, 4, 4, 4, 4, 5, 4, 3, 3, 5, 5, 3, 2, 3, 1, 2, 3, 2, 0, 1, 0, 0, 3, 2, 2, 4, 4, 3, 1, 5, 0, 4, 0, 3, 0, 4, 3, 1, 3, 2, 1, 0, 3, 3, 0, 3, 3), + (0, 4, 0, 5, 0, 5, 0, 4, 0, 4, 5, 5, 5, 3, 4, 3, 3, 2, 5, 4, 4, 3, 5, 3, 5, 3, 4, 0, 4, 3, 4, 4, 3, 2, 4, 4, 3, 4, 5, 4, 4, 5, 5, 0, 3, 5, 5, 4, 1, 3, 3, 2, 3, 3, 1, 3, 1, 0, 4, 3, 1, 4, 4, 3, 4, 5, 0, 4, 0, 2, 0, 4, 3, 4, 4, 3, 3, 0, 4, 0, 0, 5, 5), + (0, 4, 0, 4, 0, 5, 0, 1, 1, 3, 3, 4, 4, 3, 4, 1, 3, 0, 5, 1, 3, 0, 3, 1, 3, 1, 1, 0, 3, 0, 3, 3, 4, 0, 4, 3, 0, 4, 4, 4, 3, 4, 4, 0, 3, 5, 4, 1, 0, 3, 0, 0, 2, 3, 0, 3, 1, 0, 3, 1, 0, 3, 2, 1, 3, 5, 0, 3, 0, 1, 0, 3, 2, 3, 3, 4, 4, 0, 2, 2, 0, 4, 4), + (2, 4, 0, 5, 0, 4, 0, 3, 0, 4, 5, 5, 4, 3, 5, 3, 5, 3, 5, 3, 5, 2, 5, 3, 4, 3, 3, 4, 3, 4, 5, 3, 2, 1, 5, 4, 3, 2, 3, 4, 5, 3, 4, 1, 2, 5, 4, 3, 0, 3, 3, 0, 3, 2, 0, 2, 3, 0, 4, 1, 0, 3, 4, 3, 3, 5, 0, 3, 0, 1, 0, 4, 5, 5, 5, 4, 3, 0, 4, 2, 0, 3, 5), + (0, 5, 0, 4, 0, 4, 0, 2, 0, 5, 4, 3, 4, 3, 4, 3, 3, 3, 4, 3, 4, 2, 5, 3, 5, 3, 4, 1, 4, 3, 4, 4, 4, 0, 3, 5, 0, 4, 4, 4, 4, 5, 3, 1, 3, 4, 5, 3, 3, 3, 3, 3, 3, 3, 0, 2, 2, 0, 3, 3, 2, 4, 3, 3, 3, 5, 3, 4, 1, 3, 3, 5, 3, 2, 0, 0, 0, 0, 4, 3, 1, 3, 3), + (0, 1, 0, 3, 0, 3, 0, 1, 0, 1, 3, 3, 3, 2, 3, 3, 3, 0, 3, 0, 0, 0, 3, 1, 3, 0, 0, 0, 2, 2, 2, 3, 0, 0, 3, 2, 0, 1, 2, 4, 1, 3, 3, 0, 0, 3, 3, 3, 0, 1, 0, 0, 2, 1, 0, 0, 3, 0, 3, 1, 0, 3, 0, 0, 1, 3, 0, 2, 0, 1, 0, 3, 3, 1, 3, 3, 0, 0, 1, 1, 0, 3, 3), + (0, 2, 0, 3, 0, 2, 1, 4, 0, 2, 2, 3, 1, 1, 3, 1, 1, 0, 2, 0, 3, 1, 2, 3, 1, 3, 0, 0, 1, 0, 4, 3, 2, 3, 3, 3, 1, 4, 2, 3, 3, 3, 3, 1, 0, 3, 1, 4, 0, 1, 1, 0, 1, 2, 0, 1, 1, 0, 1, 1, 0, 3, 1, 3, 2, 2, 0, 1, 0, 0, 0, 2, 3, 3, 3, 1, 0, 0, 0, 0, 0, 2, 3), + (0, 5, 0, 4, 0, 5, 0, 2, 0, 4, 5, 5, 3, 3, 4, 3, 3, 1, 5, 4, 4, 2, 4, 4, 4, 3, 4, 2, 4, 3, 5, 5, 4, 3, 3, 4, 3, 3, 5, 5, 4, 5, 5, 1, 3, 4, 5, 3, 1, 4, 3, 1, 3, 3, 0, 3, 3, 1, 4, 3, 1, 4, 5, 3, 3, 5, 0, 4, 0, 3, 0, 5, 3, 3, 1, 4, 3, 0, 4, 0, 1, 5, 3), + (0, 5, 0, 5, 0, 4, 0, 2, 0, 4, 4, 3, 4, 3, 3, 3, 3, 3, 5, 4, 4, 4, 4, 4, 4, 5, 3, 3, 5, 2, 4, 4, 4, 3, 4, 4, 3, 3, 4, 4, 5, 5, 3, 3, 4, 3, 4, 3, 3, 4, 3, 3, 3, 3, 1, 2, 2, 1, 4, 3, 3, 5, 4, 4, 3, 4, 0, 4, 0, 3, 0, 4, 4, 4, 4, 4, 1, 0, 4, 2, 0, 2, 4), + (0, 4, 0, 4, 0, 3, 0, 1, 0, 3, 5, 2, 3, 0, 3, 0, 2, 1, 4, 2, 3, 3, 4, 1, 4, 3, 3, 2, 4, 1, 3, 3, 3, 0, 3, 3, 0, 0, 3, 3, 3, 5, 3, 3, 3, 3, 3, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 1, 0, 0, 3, 1, 2, 2, 3, 0, 3, 0, 2, 0, 4, 4, 3, 3, 4, 1, 0, 3, 0, 0, 2, 4), + (0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 3, 1, 3, 0, 3, 2, 0, 0, 0, 1, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 2, 0, 0, 0, 0, 0, 0, 2), + (0, 2, 1, 3, 0, 2, 0, 2, 0, 3, 3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3, 4, 2, 2, 1, 2, 1, 4, 0, 4, 3, 1, 3, 3, 3, 2, 4, 3, 5, 4, 3, 3, 3, 3, 3, 3, 3, 0, 1, 3, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 2, 3, 0, 3, 3, 0, 3, 3, 4, 2, 3, 1, 4, 0, 1, 2, 0, 2, 3), + (0, 3, 0, 3, 0, 1, 0, 3, 0, 2, 3, 3, 3, 0, 3, 1, 2, 0, 3, 3, 2, 3, 3, 2, 3, 2, 3, 1, 3, 0, 4, 3, 2, 0, 3, 3, 1, 4, 3, 3, 2, 3, 4, 3, 1, 3, 3, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 4, 1, 1, 0, 3, 0, 3, 1, 0, 2, 3, 3, 3, 3, 3, 1, 0, 0, 2, 0, 3, 3), + (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 3, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3), + (0, 2, 0, 3, 1, 3, 0, 3, 0, 2, 3, 3, 3, 1, 3, 1, 3, 1, 3, 1, 3, 3, 3, 1, 3, 0, 2, 3, 1, 1, 4, 3, 3, 2, 3, 3, 1, 2, 2, 4, 1, 3, 3, 0, 1, 4, 2, 3, 0, 1, 3, 0, 3, 0, 0, 1, 3, 0, 2, 0, 0, 3, 3, 2, 1, 3, 0, 3, 0, 2, 0, 3, 4, 4, 4, 3, 1, 0, 3, 0, 0, 3, 3), + (0, 2, 0, 1, 0, 2, 0, 0, 0, 1, 3, 2, 2, 1, 3, 0, 1, 1, 3, 0, 3, 2, 3, 1, 2, 0, 2, 0, 1, 1, 3, 3, 3, 0, 3, 3, 1, 1, 2, 3, 2, 3, 3, 1, 2, 3, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 2, 1, 2, 1, 3, 0, 3, 0, 0, 0, 3, 4, 4, 4, 3, 2, 0, 2, 0, 0, 2, 4), + (0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 3, 1, 0, 0, 0, 0, 0, 0, 0, 3), + (0, 3, 0, 3, 0, 2, 0, 3, 0, 3, 3, 3, 2, 3, 2, 2, 2, 0, 3, 1, 3, 3, 3, 2, 3, 3, 0, 0, 3, 0, 3, 2, 2, 0, 2, 3, 1, 4, 3, 4, 3, 3, 2, 3, 1, 5, 4, 4, 0, 3, 1, 2, 1, 3, 0, 3, 1, 1, 2, 0, 2, 3, 1, 3, 1, 3, 0, 3, 0, 1, 0, 3, 3, 4, 4, 2, 1, 0, 2, 1, 0, 2, 4), + (0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 4, 2, 5, 1, 4, 0, 2, 0, 2, 1, 3, 1, 4, 0, 2, 1, 0, 0, 2, 1, 4, 1, 1, 0, 3, 3, 0, 5, 1, 3, 2, 3, 3, 1, 0, 3, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 1, 0, 3, 0, 2, 0, 1, 0, 3, 3, 3, 4, 3, 3, 0, 0, 0, 0, 2, 3), + (0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 3), + (0, 1, 0, 3, 0, 4, 0, 3, 0, 2, 4, 3, 1, 0, 3, 2, 2, 1, 3, 1, 2, 2, 3, 1, 1, 1, 2, 1, 3, 0, 1, 2, 0, 1, 3, 2, 1, 3, 0, 5, 5, 1, 0, 0, 1, 3, 2, 1, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 3, 4, 0, 1, 1, 1, 3, 2, 0, 2, 0, 1, 0, 2, 3, 3, 1, 2, 3, 0, 1, 0, 1, 0, 4), + (0, 0, 0, 1, 0, 3, 0, 3, 0, 2, 2, 1, 0, 0, 4, 0, 3, 0, 3, 1, 3, 0, 3, 0, 3, 0, 1, 0, 3, 0, 3, 1, 3, 0, 3, 3, 0, 0, 1, 2, 1, 1, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 2, 0, 0, 2, 0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 0, 0, 0, 1, 4), + (0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 3, 1, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 2, 0, 2, 3, 0, 0, 2, 2, 3, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, 2, 3), + (2, 4, 0, 5, 0, 5, 0, 4, 0, 3, 4, 3, 3, 3, 4, 3, 3, 3, 4, 3, 4, 4, 5, 4, 5, 5, 5, 2, 3, 0, 5, 5, 4, 1, 5, 4, 3, 1, 5, 4, 3, 4, 4, 3, 3, 4, 3, 3, 0, 3, 2, 0, 2, 3, 0, 3, 0, 0, 3, 3, 0, 5, 3, 2, 3, 3, 0, 3, 0, 3, 0, 3, 4, 5, 4, 5, 3, 0, 4, 3, 0, 3, 4), + (0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 3, 4, 3, 2, 3, 2, 3, 0, 4, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 2, 4, 3, 3, 1, 3, 4, 3, 4, 4, 4, 3, 4, 4, 3, 2, 4, 4, 1, 0, 2, 0, 0, 1, 1, 0, 2, 0, 0, 3, 1, 0, 5, 3, 2, 1, 3, 0, 3, 0, 1, 2, 4, 3, 2, 4, 3, 3, 0, 3, 2, 0, 4, 4), + (0, 3, 0, 3, 0, 1, 0, 0, 0, 1, 4, 3, 3, 2, 3, 1, 3, 1, 4, 2, 3, 2, 4, 2, 3, 4, 3, 0, 2, 2, 3, 3, 3, 0, 3, 3, 3, 0, 3, 4, 1, 3, 3, 0, 3, 4, 3, 3, 0, 1, 1, 0, 1, 0, 0, 0, 4, 0, 3, 0, 0, 3, 1, 2, 1, 3, 0, 4, 0, 1, 0, 4, 3, 3, 4, 3, 3, 0, 2, 0, 0, 3, 3), + (0, 3, 0, 4, 0, 1, 0, 3, 0, 3, 4, 3, 3, 0, 3, 3, 3, 1, 3, 1, 3, 3, 4, 3, 3, 3, 0, 0, 3, 1, 5, 3, 3, 1, 3, 3, 2, 5, 4, 3, 3, 4, 5, 3, 2, 5, 3, 4, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 0, 4, 2, 2, 1, 3, 0, 3, 0, 2, 0, 4, 4, 3, 5, 3, 2, 0, 1, 1, 0, 3, 4), + (0, 5, 0, 4, 0, 5, 0, 2, 0, 4, 4, 3, 3, 2, 3, 3, 3, 1, 4, 3, 4, 1, 5, 3, 4, 3, 4, 0, 4, 2, 4, 3, 4, 1, 5, 4, 0, 4, 4, 4, 4, 5, 4, 1, 3, 5, 4, 2, 1, 4, 1, 1, 3, 2, 0, 3, 1, 0, 3, 2, 1, 4, 3, 3, 3, 4, 0, 4, 0, 3, 0, 4, 4, 4, 3, 3, 3, 0, 4, 2, 0, 3, 4), + (1, 4, 0, 4, 0, 3, 0, 1, 0, 3, 3, 3, 1, 1, 3, 3, 2, 2, 3, 3, 1, 0, 3, 2, 2, 1, 2, 0, 3, 1, 2, 1, 2, 0, 3, 2, 0, 2, 2, 3, 3, 4, 3, 0, 3, 3, 1, 2, 0, 1, 1, 3, 1, 2, 0, 0, 3, 0, 1, 1, 0, 3, 2, 2, 3, 3, 0, 3, 0, 0, 0, 2, 3, 3, 4, 3, 3, 0, 1, 0, 0, 1, 4), + (0, 4, 0, 4, 0, 4, 0, 0, 0, 3, 4, 4, 3, 1, 4, 2, 3, 2, 3, 3, 3, 1, 4, 3, 4, 0, 3, 0, 4, 2, 3, 3, 2, 2, 5, 4, 2, 1, 3, 4, 3, 4, 3, 1, 3, 3, 4, 2, 0, 2, 1, 0, 3, 3, 0, 0, 2, 0, 3, 1, 0, 4, 4, 3, 4, 3, 0, 4, 0, 1, 0, 2, 4, 4, 4, 4, 4, 0, 3, 2, 0, 3, 3), + (0, 0, 0, 1, 0, 4, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2), + (0, 2, 0, 3, 0, 4, 0, 4, 0, 1, 3, 3, 3, 0, 4, 0, 2, 1, 2, 1, 1, 1, 2, 0, 3, 1, 1, 0, 1, 0, 3, 1, 0, 0, 3, 3, 2, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 2, 2, 0, 3, 1, 0, 0, 1, 0, 1, 1, 0, 1, 2, 0, 3, 0, 0, 0, 0, 1, 0, 0, 3, 3, 4, 3, 1, 0, 1, 0, 3, 0, 2), + (0, 0, 0, 3, 0, 5, 0, 0, 0, 0, 1, 0, 2, 0, 3, 1, 0, 1, 3, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 4, 0, 0, 0, 2, 3, 0, 1, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 3), + (0, 2, 0, 5, 0, 5, 0, 1, 0, 2, 4, 3, 3, 2, 5, 1, 3, 2, 3, 3, 3, 0, 4, 1, 2, 0, 3, 0, 4, 0, 2, 2, 1, 1, 5, 3, 0, 0, 1, 4, 2, 3, 2, 0, 3, 3, 3, 2, 0, 2, 4, 1, 1, 2, 0, 1, 1, 0, 3, 1, 0, 1, 3, 1, 2, 3, 0, 2, 0, 0, 0, 1, 3, 5, 4, 4, 4, 0, 3, 0, 0, 1, 3), + (0, 4, 0, 5, 0, 4, 0, 4, 0, 4, 5, 4, 3, 3, 4, 3, 3, 3, 4, 3, 4, 4, 5, 3, 4, 5, 4, 2, 4, 2, 3, 4, 3, 1, 4, 4, 1, 3, 5, 4, 4, 5, 5, 4, 4, 5, 5, 5, 2, 3, 3, 1, 4, 3, 1, 3, 3, 0, 3, 3, 1, 4, 3, 4, 4, 4, 0, 3, 0, 4, 0, 3, 3, 4, 4, 5, 0, 0, 4, 3, 0, 4, 5), + (0, 4, 0, 4, 0, 3, 0, 3, 0, 3, 4, 4, 4, 3, 3, 2, 4, 3, 4, 3, 4, 3, 5, 3, 4, 3, 2, 1, 4, 2, 4, 4, 3, 1, 3, 4, 2, 4, 5, 5, 3, 4, 5, 4, 1, 5, 4, 3, 0, 3, 2, 2, 3, 2, 1, 3, 1, 0, 3, 3, 3, 5, 3, 3, 3, 5, 4, 4, 2, 3, 3, 4, 3, 3, 3, 2, 1, 0, 3, 2, 1, 4, 3), + (0, 4, 0, 5, 0, 4, 0, 3, 0, 3, 5, 5, 3, 2, 4, 3, 4, 0, 5, 4, 4, 1, 4, 4, 4, 3, 3, 3, 4, 3, 5, 5, 2, 3, 3, 4, 1, 2, 5, 5, 3, 5, 5, 2, 3, 5, 5, 4, 0, 3, 2, 0, 3, 3, 1, 1, 5, 1, 4, 1, 0, 4, 3, 2, 3, 5, 0, 4, 0, 3, 0, 5, 4, 3, 4, 3, 0, 0, 4, 1, 0, 4, 4), + (1, 3, 0, 4, 0, 2, 0, 2, 0, 2, 5, 5, 3, 3, 3, 3, 3, 0, 4, 2, 3, 4, 4, 4, 3, 4, 0, 0, 3, 4, 5, 4, 3, 3, 3, 3, 2, 5, 5, 4, 5, 5, 5, 4, 3, 5, 5, 5, 1, 3, 1, 0, 1, 0, 0, 3, 2, 0, 4, 2, 0, 5, 2, 3, 2, 4, 1, 3, 0, 3, 0, 4, 5, 4, 5, 4, 3, 0, 4, 2, 0, 5, 4), + (0, 3, 0, 4, 0, 5, 0, 3, 0, 3, 4, 4, 3, 2, 3, 2, 3, 3, 3, 3, 3, 2, 4, 3, 3, 2, 2, 0, 3, 3, 3, 3, 3, 1, 3, 3, 3, 0, 4, 4, 3, 4, 4, 1, 1, 4, 4, 2, 0, 3, 1, 0, 1, 1, 0, 4, 1, 0, 2, 3, 1, 3, 3, 1, 3, 4, 0, 3, 0, 1, 0, 3, 1, 3, 0, 0, 1, 0, 2, 0, 0, 4, 4), + (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + (0, 3, 0, 3, 0, 2, 0, 3, 0, 1, 5, 4, 3, 3, 3, 1, 4, 2, 1, 2, 3, 4, 4, 2, 4, 4, 5, 0, 3, 1, 4, 3, 4, 0, 4, 3, 3, 3, 2, 3, 2, 5, 3, 4, 3, 2, 2, 3, 0, 0, 3, 0, 2, 1, 0, 1, 2, 0, 0, 0, 0, 2, 1, 1, 3, 1, 0, 2, 0, 4, 0, 3, 4, 4, 4, 5, 2, 0, 2, 0, 0, 1, 3), + (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 4, 2, 1, 1, 0, 1, 0, 3, 2, 0, 0, 3, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1, 4, 0, 4, 2, 1, 0, 0, 0, 0, 0, 1), + (0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 0, 2, 1, 0, 0, 1, 2, 1, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2), + (0, 4, 0, 4, 0, 4, 0, 3, 0, 4, 4, 3, 4, 2, 4, 3, 2, 0, 4, 4, 4, 3, 5, 3, 5, 3, 3, 2, 4, 2, 4, 3, 4, 3, 1, 4, 0, 2, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4, 3, 4, 1, 3, 4, 3, 2, 1, 2, 1, 3, 3, 3, 4, 4, 3, 3, 5, 0, 4, 0, 3, 0, 4, 3, 3, 3, 2, 1, 0, 3, 0, 0, 3, 3), + (0, 4, 0, 3, 0, 3, 0, 3, 0, 3, 5, 5, 3, 3, 3, 3, 4, 3, 4, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 4, 3, 5, 3, 3, 1, 3, 2, 4, 5, 5, 5, 5, 4, 3, 4, 5, 5, 3, 2, 2, 3, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 4, 3, 3, 3, 4, 0, 4, 0, 2, 0, 4, 3, 2, 2, 1, 2, 0, 3, 0, 0, 4, 1), ) +# fmt: on -class JapaneseContextAnalysis(object): + +class JapaneseContextAnalysis: NUM_OF_CATEGORY = 6 DONT_KNOW = -1 ENOUGH_REL_THRESHOLD = 100 MAX_REL_THRESHOLD = 1000 MINIMUM_DATA_THRESHOLD = 4 - def __init__(self): - self._total_rel = None - self._rel_sample = None - self._need_to_skip_char_num = None - self._last_char_order = None - self._done = None + def __init__(self) -> None: + self._total_rel = 0 + self._rel_sample: List[int] = [] + self._need_to_skip_char_num = 0 + self._last_char_order = -1 + self._done = False self.reset() - def reset(self): + def reset(self) -> None: self._total_rel = 0 # total sequence received # category counters, each integer counts sequence in its category self._rel_sample = [0] * self.NUM_OF_CATEGORY @@ -140,7 +144,7 @@ class JapaneseContextAnalysis(object): # been made self._done = False - def feed(self, byte_str, num_bytes): + def feed(self, byte_str: Union[bytes, bytearray], num_bytes: int) -> None: if self._done: return @@ -153,7 +157,7 @@ class JapaneseContextAnalysis(object): # this character will simply our logic and improve performance. i = self._need_to_skip_char_num while i < num_bytes: - order, char_len = self.get_order(byte_str[i:i + 2]) + order, char_len = self.get_order(byte_str[i : i + 2]) i += char_len if i > num_bytes: self._need_to_skip_char_num = i - num_bytes @@ -164,32 +168,34 @@ class JapaneseContextAnalysis(object): if self._total_rel > self.MAX_REL_THRESHOLD: self._done = True break - self._rel_sample[jp2CharContext[self._last_char_order][order]] += 1 + self._rel_sample[ + jp2_char_context[self._last_char_order][order] + ] += 1 self._last_char_order = order - def got_enough_data(self): + def got_enough_data(self) -> bool: return self._total_rel > self.ENOUGH_REL_THRESHOLD - def get_confidence(self): + def get_confidence(self) -> float: # This is just one way to calculate confidence. It works well for me. if self._total_rel > self.MINIMUM_DATA_THRESHOLD: return (self._total_rel - self._rel_sample[0]) / self._total_rel - else: - return self.DONT_KNOW + return self.DONT_KNOW - def get_order(self, byte_str): + def get_order(self, _: Union[bytes, bytearray]) -> Tuple[int, int]: return -1, 1 + class SJISContextAnalysis(JapaneseContextAnalysis): - def __init__(self): - super(SJISContextAnalysis, self).__init__() + def __init__(self) -> None: + super().__init__() self._charset_name = "SHIFT_JIS" @property - def charset_name(self): + def charset_name(self) -> str: return self._charset_name - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]: if not byte_str: return -1, 1 # find out current char's byte length @@ -209,8 +215,9 @@ class SJISContextAnalysis(JapaneseContextAnalysis): return -1, char_len + class EUCJPContextAnalysis(JapaneseContextAnalysis): - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]: if not byte_str: return -1, 1 # find out current char's byte length @@ -229,5 +236,3 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis): return second_char - 0xA1, char_len return -1, char_len - - diff --git a/lib/chardet/langbulgarianmodel.py b/lib/chardet/langbulgarianmodel.py index 6485306e..5617856e 100644 --- a/lib/chardet/langbulgarianmodel.py +++ b/lib/chardet/langbulgarianmodel.py @@ -1,9 +1,5 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -4115,536 +4111,539 @@ BULGARIAN_LANG_MODEL = { # Character Mapping Table(s): ISO_8859_5_BULGARIAN_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 77, # 'A' - 66: 90, # 'B' - 67: 99, # 'C' - 68: 100, # 'D' - 69: 72, # 'E' - 70: 109, # 'F' - 71: 107, # 'G' - 72: 101, # 'H' - 73: 79, # 'I' - 74: 185, # 'J' - 75: 81, # 'K' - 76: 102, # 'L' - 77: 76, # 'M' - 78: 94, # 'N' - 79: 82, # 'O' - 80: 110, # 'P' - 81: 186, # 'Q' - 82: 108, # 'R' - 83: 91, # 'S' - 84: 74, # 'T' - 85: 119, # 'U' - 86: 84, # 'V' - 87: 96, # 'W' - 88: 111, # 'X' - 89: 187, # 'Y' - 90: 115, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 65, # 'a' - 98: 69, # 'b' - 99: 70, # 'c' - 100: 66, # 'd' - 101: 63, # 'e' - 102: 68, # 'f' - 103: 112, # 'g' - 104: 103, # 'h' - 105: 92, # 'i' - 106: 194, # 'j' - 107: 104, # 'k' - 108: 95, # 'l' - 109: 86, # 'm' - 110: 87, # 'n' - 111: 71, # 'o' - 112: 116, # 'p' - 113: 195, # 'q' - 114: 85, # 'r' - 115: 93, # 's' - 116: 97, # 't' - 117: 113, # 'u' - 118: 196, # 'v' - 119: 197, # 'w' - 120: 198, # 'x' - 121: 199, # 'y' - 122: 200, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 194, # '\x80' - 129: 195, # '\x81' - 130: 196, # '\x82' - 131: 197, # '\x83' - 132: 198, # '\x84' - 133: 199, # '\x85' - 134: 200, # '\x86' - 135: 201, # '\x87' - 136: 202, # '\x88' - 137: 203, # '\x89' - 138: 204, # '\x8a' - 139: 205, # '\x8b' - 140: 206, # '\x8c' - 141: 207, # '\x8d' - 142: 208, # '\x8e' - 143: 209, # '\x8f' - 144: 210, # '\x90' - 145: 211, # '\x91' - 146: 212, # '\x92' - 147: 213, # '\x93' - 148: 214, # '\x94' - 149: 215, # '\x95' - 150: 216, # '\x96' - 151: 217, # '\x97' - 152: 218, # '\x98' - 153: 219, # '\x99' - 154: 220, # '\x9a' - 155: 221, # '\x9b' - 156: 222, # '\x9c' - 157: 223, # '\x9d' - 158: 224, # '\x9e' - 159: 225, # '\x9f' - 160: 81, # '\xa0' - 161: 226, # 'Ё' - 162: 227, # 'Ђ' - 163: 228, # 'Ѓ' - 164: 229, # 'Є' - 165: 230, # 'Ѕ' - 166: 105, # 'І' - 167: 231, # 'Ї' - 168: 232, # 'Ј' - 169: 233, # 'Љ' - 170: 234, # 'Њ' - 171: 235, # 'Ћ' - 172: 236, # 'Ќ' - 173: 45, # '\xad' - 174: 237, # 'Ў' - 175: 238, # 'Џ' - 176: 31, # 'А' - 177: 32, # 'Б' - 178: 35, # 'В' - 179: 43, # 'Г' - 180: 37, # 'Д' - 181: 44, # 'Е' - 182: 55, # 'Ж' - 183: 47, # 'З' - 184: 40, # 'И' - 185: 59, # 'Й' - 186: 33, # 'К' - 187: 46, # 'Л' - 188: 38, # 'М' - 189: 36, # 'Н' - 190: 41, # 'О' - 191: 30, # 'П' - 192: 39, # 'Р' - 193: 28, # 'С' - 194: 34, # 'Т' - 195: 51, # 'У' - 196: 48, # 'Ф' - 197: 49, # 'Х' - 198: 53, # 'Ц' - 199: 50, # 'Ч' - 200: 54, # 'Ш' - 201: 57, # 'Щ' - 202: 61, # 'Ъ' - 203: 239, # 'Ы' - 204: 67, # 'Ь' - 205: 240, # 'Э' - 206: 60, # 'Ю' - 207: 56, # 'Я' - 208: 1, # 'а' - 209: 18, # 'б' - 210: 9, # 'в' - 211: 20, # 'г' - 212: 11, # 'д' - 213: 3, # 'е' - 214: 23, # 'ж' - 215: 15, # 'з' - 216: 2, # 'и' - 217: 26, # 'й' - 218: 12, # 'к' - 219: 10, # 'л' - 220: 14, # 'м' - 221: 6, # 'н' - 222: 4, # 'о' - 223: 13, # 'п' - 224: 7, # 'р' - 225: 8, # 'с' - 226: 5, # 'т' - 227: 19, # 'у' - 228: 29, # 'ф' - 229: 25, # 'х' - 230: 22, # 'ц' - 231: 21, # 'ч' - 232: 27, # 'ш' - 233: 24, # 'щ' - 234: 17, # 'ъ' - 235: 75, # 'ы' - 236: 52, # 'ь' - 237: 241, # 'э' - 238: 42, # 'ю' - 239: 16, # 'я' - 240: 62, # '№' - 241: 242, # 'ё' - 242: 243, # 'ђ' - 243: 244, # 'ѓ' - 244: 58, # 'є' - 245: 245, # 'ѕ' - 246: 98, # 'і' - 247: 246, # 'ї' - 248: 247, # 'ј' - 249: 248, # 'љ' - 250: 249, # 'њ' - 251: 250, # 'ћ' - 252: 251, # 'ќ' - 253: 91, # '§' - 254: 252, # 'ў' - 255: 253, # 'џ' + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 77, # 'A' + 66: 90, # 'B' + 67: 99, # 'C' + 68: 100, # 'D' + 69: 72, # 'E' + 70: 109, # 'F' + 71: 107, # 'G' + 72: 101, # 'H' + 73: 79, # 'I' + 74: 185, # 'J' + 75: 81, # 'K' + 76: 102, # 'L' + 77: 76, # 'M' + 78: 94, # 'N' + 79: 82, # 'O' + 80: 110, # 'P' + 81: 186, # 'Q' + 82: 108, # 'R' + 83: 91, # 'S' + 84: 74, # 'T' + 85: 119, # 'U' + 86: 84, # 'V' + 87: 96, # 'W' + 88: 111, # 'X' + 89: 187, # 'Y' + 90: 115, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 65, # 'a' + 98: 69, # 'b' + 99: 70, # 'c' + 100: 66, # 'd' + 101: 63, # 'e' + 102: 68, # 'f' + 103: 112, # 'g' + 104: 103, # 'h' + 105: 92, # 'i' + 106: 194, # 'j' + 107: 104, # 'k' + 108: 95, # 'l' + 109: 86, # 'm' + 110: 87, # 'n' + 111: 71, # 'o' + 112: 116, # 'p' + 113: 195, # 'q' + 114: 85, # 'r' + 115: 93, # 's' + 116: 97, # 't' + 117: 113, # 'u' + 118: 196, # 'v' + 119: 197, # 'w' + 120: 198, # 'x' + 121: 199, # 'y' + 122: 200, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 194, # '\x80' + 129: 195, # '\x81' + 130: 196, # '\x82' + 131: 197, # '\x83' + 132: 198, # '\x84' + 133: 199, # '\x85' + 134: 200, # '\x86' + 135: 201, # '\x87' + 136: 202, # '\x88' + 137: 203, # '\x89' + 138: 204, # '\x8a' + 139: 205, # '\x8b' + 140: 206, # '\x8c' + 141: 207, # '\x8d' + 142: 208, # '\x8e' + 143: 209, # '\x8f' + 144: 210, # '\x90' + 145: 211, # '\x91' + 146: 212, # '\x92' + 147: 213, # '\x93' + 148: 214, # '\x94' + 149: 215, # '\x95' + 150: 216, # '\x96' + 151: 217, # '\x97' + 152: 218, # '\x98' + 153: 219, # '\x99' + 154: 220, # '\x9a' + 155: 221, # '\x9b' + 156: 222, # '\x9c' + 157: 223, # '\x9d' + 158: 224, # '\x9e' + 159: 225, # '\x9f' + 160: 81, # '\xa0' + 161: 226, # 'Ё' + 162: 227, # 'Ђ' + 163: 228, # 'Ѓ' + 164: 229, # 'Є' + 165: 230, # 'Ѕ' + 166: 105, # 'І' + 167: 231, # 'Ї' + 168: 232, # 'Ј' + 169: 233, # 'Љ' + 170: 234, # 'Њ' + 171: 235, # 'Ћ' + 172: 236, # 'Ќ' + 173: 45, # '\xad' + 174: 237, # 'Ў' + 175: 238, # 'Џ' + 176: 31, # 'А' + 177: 32, # 'Б' + 178: 35, # 'В' + 179: 43, # 'Г' + 180: 37, # 'Д' + 181: 44, # 'Е' + 182: 55, # 'Ж' + 183: 47, # 'З' + 184: 40, # 'И' + 185: 59, # 'Й' + 186: 33, # 'К' + 187: 46, # 'Л' + 188: 38, # 'М' + 189: 36, # 'Н' + 190: 41, # 'О' + 191: 30, # 'П' + 192: 39, # 'Р' + 193: 28, # 'С' + 194: 34, # 'Т' + 195: 51, # 'У' + 196: 48, # 'Ф' + 197: 49, # 'Х' + 198: 53, # 'Ц' + 199: 50, # 'Ч' + 200: 54, # 'Ш' + 201: 57, # 'Щ' + 202: 61, # 'Ъ' + 203: 239, # 'Ы' + 204: 67, # 'Ь' + 205: 240, # 'Э' + 206: 60, # 'Ю' + 207: 56, # 'Я' + 208: 1, # 'а' + 209: 18, # 'б' + 210: 9, # 'в' + 211: 20, # 'г' + 212: 11, # 'д' + 213: 3, # 'е' + 214: 23, # 'ж' + 215: 15, # 'з' + 216: 2, # 'и' + 217: 26, # 'й' + 218: 12, # 'к' + 219: 10, # 'л' + 220: 14, # 'м' + 221: 6, # 'н' + 222: 4, # 'о' + 223: 13, # 'п' + 224: 7, # 'р' + 225: 8, # 'с' + 226: 5, # 'т' + 227: 19, # 'у' + 228: 29, # 'ф' + 229: 25, # 'х' + 230: 22, # 'ц' + 231: 21, # 'ч' + 232: 27, # 'ш' + 233: 24, # 'щ' + 234: 17, # 'ъ' + 235: 75, # 'ы' + 236: 52, # 'ь' + 237: 241, # 'э' + 238: 42, # 'ю' + 239: 16, # 'я' + 240: 62, # '№' + 241: 242, # 'ё' + 242: 243, # 'ђ' + 243: 244, # 'ѓ' + 244: 58, # 'є' + 245: 245, # 'ѕ' + 246: 98, # 'і' + 247: 246, # 'ї' + 248: 247, # 'ј' + 249: 248, # 'љ' + 250: 249, # 'њ' + 251: 250, # 'ћ' + 252: 251, # 'ќ' + 253: 91, # '§' + 254: 252, # 'ў' + 255: 253, # 'џ' } -ISO_8859_5_BULGARIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-5', - language='Bulgairan', - char_to_order_map=ISO_8859_5_BULGARIAN_CHAR_TO_ORDER, - language_model=BULGARIAN_LANG_MODEL, - typical_positive_ratio=0.969392, - keep_ascii_letters=False, - alphabet='АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя') +ISO_8859_5_BULGARIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-5", + language="Bulgarian", + char_to_order_map=ISO_8859_5_BULGARIAN_CHAR_TO_ORDER, + language_model=BULGARIAN_LANG_MODEL, + typical_positive_ratio=0.969392, + keep_ascii_letters=False, + alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя", +) WINDOWS_1251_BULGARIAN_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 77, # 'A' - 66: 90, # 'B' - 67: 99, # 'C' - 68: 100, # 'D' - 69: 72, # 'E' - 70: 109, # 'F' - 71: 107, # 'G' - 72: 101, # 'H' - 73: 79, # 'I' - 74: 185, # 'J' - 75: 81, # 'K' - 76: 102, # 'L' - 77: 76, # 'M' - 78: 94, # 'N' - 79: 82, # 'O' - 80: 110, # 'P' - 81: 186, # 'Q' - 82: 108, # 'R' - 83: 91, # 'S' - 84: 74, # 'T' - 85: 119, # 'U' - 86: 84, # 'V' - 87: 96, # 'W' - 88: 111, # 'X' - 89: 187, # 'Y' - 90: 115, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 65, # 'a' - 98: 69, # 'b' - 99: 70, # 'c' - 100: 66, # 'd' - 101: 63, # 'e' - 102: 68, # 'f' - 103: 112, # 'g' - 104: 103, # 'h' - 105: 92, # 'i' - 106: 194, # 'j' - 107: 104, # 'k' - 108: 95, # 'l' - 109: 86, # 'm' - 110: 87, # 'n' - 111: 71, # 'o' - 112: 116, # 'p' - 113: 195, # 'q' - 114: 85, # 'r' - 115: 93, # 's' - 116: 97, # 't' - 117: 113, # 'u' - 118: 196, # 'v' - 119: 197, # 'w' - 120: 198, # 'x' - 121: 199, # 'y' - 122: 200, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 206, # 'Ђ' - 129: 207, # 'Ѓ' - 130: 208, # '‚' - 131: 209, # 'ѓ' - 132: 210, # '„' - 133: 211, # '…' - 134: 212, # '†' - 135: 213, # '‡' - 136: 120, # '€' - 137: 214, # '‰' - 138: 215, # 'Љ' - 139: 216, # '‹' - 140: 217, # 'Њ' - 141: 218, # 'Ќ' - 142: 219, # 'Ћ' - 143: 220, # 'Џ' - 144: 221, # 'ђ' - 145: 78, # '‘' - 146: 64, # '’' - 147: 83, # '“' - 148: 121, # '”' - 149: 98, # '•' - 150: 117, # '–' - 151: 105, # '—' - 152: 222, # None - 153: 223, # '™' - 154: 224, # 'љ' - 155: 225, # '›' - 156: 226, # 'њ' - 157: 227, # 'ќ' - 158: 228, # 'ћ' - 159: 229, # 'џ' - 160: 88, # '\xa0' - 161: 230, # 'Ў' - 162: 231, # 'ў' - 163: 232, # 'Ј' - 164: 233, # '¤' - 165: 122, # 'Ґ' - 166: 89, # '¦' - 167: 106, # '§' - 168: 234, # 'Ё' - 169: 235, # '©' - 170: 236, # 'Є' - 171: 237, # '«' - 172: 238, # '¬' - 173: 45, # '\xad' - 174: 239, # '®' - 175: 240, # 'Ї' - 176: 73, # '°' - 177: 80, # '±' - 178: 118, # 'І' - 179: 114, # 'і' - 180: 241, # 'ґ' - 181: 242, # 'µ' - 182: 243, # '¶' - 183: 244, # '·' - 184: 245, # 'ё' - 185: 62, # '№' - 186: 58, # 'є' - 187: 246, # '»' - 188: 247, # 'ј' - 189: 248, # 'Ѕ' - 190: 249, # 'ѕ' - 191: 250, # 'ї' - 192: 31, # 'А' - 193: 32, # 'Б' - 194: 35, # 'В' - 195: 43, # 'Г' - 196: 37, # 'Д' - 197: 44, # 'Е' - 198: 55, # 'Ж' - 199: 47, # 'З' - 200: 40, # 'И' - 201: 59, # 'Й' - 202: 33, # 'К' - 203: 46, # 'Л' - 204: 38, # 'М' - 205: 36, # 'Н' - 206: 41, # 'О' - 207: 30, # 'П' - 208: 39, # 'Р' - 209: 28, # 'С' - 210: 34, # 'Т' - 211: 51, # 'У' - 212: 48, # 'Ф' - 213: 49, # 'Х' - 214: 53, # 'Ц' - 215: 50, # 'Ч' - 216: 54, # 'Ш' - 217: 57, # 'Щ' - 218: 61, # 'Ъ' - 219: 251, # 'Ы' - 220: 67, # 'Ь' - 221: 252, # 'Э' - 222: 60, # 'Ю' - 223: 56, # 'Я' - 224: 1, # 'а' - 225: 18, # 'б' - 226: 9, # 'в' - 227: 20, # 'г' - 228: 11, # 'д' - 229: 3, # 'е' - 230: 23, # 'ж' - 231: 15, # 'з' - 232: 2, # 'и' - 233: 26, # 'й' - 234: 12, # 'к' - 235: 10, # 'л' - 236: 14, # 'м' - 237: 6, # 'н' - 238: 4, # 'о' - 239: 13, # 'п' - 240: 7, # 'р' - 241: 8, # 'с' - 242: 5, # 'т' - 243: 19, # 'у' - 244: 29, # 'ф' - 245: 25, # 'х' - 246: 22, # 'ц' - 247: 21, # 'ч' - 248: 27, # 'ш' - 249: 24, # 'щ' - 250: 17, # 'ъ' - 251: 75, # 'ы' - 252: 52, # 'ь' - 253: 253, # 'э' - 254: 42, # 'ю' - 255: 16, # 'я' + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 77, # 'A' + 66: 90, # 'B' + 67: 99, # 'C' + 68: 100, # 'D' + 69: 72, # 'E' + 70: 109, # 'F' + 71: 107, # 'G' + 72: 101, # 'H' + 73: 79, # 'I' + 74: 185, # 'J' + 75: 81, # 'K' + 76: 102, # 'L' + 77: 76, # 'M' + 78: 94, # 'N' + 79: 82, # 'O' + 80: 110, # 'P' + 81: 186, # 'Q' + 82: 108, # 'R' + 83: 91, # 'S' + 84: 74, # 'T' + 85: 119, # 'U' + 86: 84, # 'V' + 87: 96, # 'W' + 88: 111, # 'X' + 89: 187, # 'Y' + 90: 115, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 65, # 'a' + 98: 69, # 'b' + 99: 70, # 'c' + 100: 66, # 'd' + 101: 63, # 'e' + 102: 68, # 'f' + 103: 112, # 'g' + 104: 103, # 'h' + 105: 92, # 'i' + 106: 194, # 'j' + 107: 104, # 'k' + 108: 95, # 'l' + 109: 86, # 'm' + 110: 87, # 'n' + 111: 71, # 'o' + 112: 116, # 'p' + 113: 195, # 'q' + 114: 85, # 'r' + 115: 93, # 's' + 116: 97, # 't' + 117: 113, # 'u' + 118: 196, # 'v' + 119: 197, # 'w' + 120: 198, # 'x' + 121: 199, # 'y' + 122: 200, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 206, # 'Ђ' + 129: 207, # 'Ѓ' + 130: 208, # '‚' + 131: 209, # 'ѓ' + 132: 210, # '„' + 133: 211, # '…' + 134: 212, # '†' + 135: 213, # '‡' + 136: 120, # '€' + 137: 214, # '‰' + 138: 215, # 'Љ' + 139: 216, # '‹' + 140: 217, # 'Њ' + 141: 218, # 'Ќ' + 142: 219, # 'Ћ' + 143: 220, # 'Џ' + 144: 221, # 'ђ' + 145: 78, # '‘' + 146: 64, # '’' + 147: 83, # '“' + 148: 121, # '”' + 149: 98, # '•' + 150: 117, # '–' + 151: 105, # '—' + 152: 222, # None + 153: 223, # '™' + 154: 224, # 'љ' + 155: 225, # '›' + 156: 226, # 'њ' + 157: 227, # 'ќ' + 158: 228, # 'ћ' + 159: 229, # 'џ' + 160: 88, # '\xa0' + 161: 230, # 'Ў' + 162: 231, # 'ў' + 163: 232, # 'Ј' + 164: 233, # '¤' + 165: 122, # 'Ґ' + 166: 89, # '¦' + 167: 106, # '§' + 168: 234, # 'Ё' + 169: 235, # '©' + 170: 236, # 'Є' + 171: 237, # '«' + 172: 238, # '¬' + 173: 45, # '\xad' + 174: 239, # '®' + 175: 240, # 'Ї' + 176: 73, # '°' + 177: 80, # '±' + 178: 118, # 'І' + 179: 114, # 'і' + 180: 241, # 'ґ' + 181: 242, # 'µ' + 182: 243, # '¶' + 183: 244, # '·' + 184: 245, # 'ё' + 185: 62, # '№' + 186: 58, # 'є' + 187: 246, # '»' + 188: 247, # 'ј' + 189: 248, # 'Ѕ' + 190: 249, # 'ѕ' + 191: 250, # 'ї' + 192: 31, # 'А' + 193: 32, # 'Б' + 194: 35, # 'В' + 195: 43, # 'Г' + 196: 37, # 'Д' + 197: 44, # 'Е' + 198: 55, # 'Ж' + 199: 47, # 'З' + 200: 40, # 'И' + 201: 59, # 'Й' + 202: 33, # 'К' + 203: 46, # 'Л' + 204: 38, # 'М' + 205: 36, # 'Н' + 206: 41, # 'О' + 207: 30, # 'П' + 208: 39, # 'Р' + 209: 28, # 'С' + 210: 34, # 'Т' + 211: 51, # 'У' + 212: 48, # 'Ф' + 213: 49, # 'Х' + 214: 53, # 'Ц' + 215: 50, # 'Ч' + 216: 54, # 'Ш' + 217: 57, # 'Щ' + 218: 61, # 'Ъ' + 219: 251, # 'Ы' + 220: 67, # 'Ь' + 221: 252, # 'Э' + 222: 60, # 'Ю' + 223: 56, # 'Я' + 224: 1, # 'а' + 225: 18, # 'б' + 226: 9, # 'в' + 227: 20, # 'г' + 228: 11, # 'д' + 229: 3, # 'е' + 230: 23, # 'ж' + 231: 15, # 'з' + 232: 2, # 'и' + 233: 26, # 'й' + 234: 12, # 'к' + 235: 10, # 'л' + 236: 14, # 'м' + 237: 6, # 'н' + 238: 4, # 'о' + 239: 13, # 'п' + 240: 7, # 'р' + 241: 8, # 'с' + 242: 5, # 'т' + 243: 19, # 'у' + 244: 29, # 'ф' + 245: 25, # 'х' + 246: 22, # 'ц' + 247: 21, # 'ч' + 248: 27, # 'ш' + 249: 24, # 'щ' + 250: 17, # 'ъ' + 251: 75, # 'ы' + 252: 52, # 'ь' + 253: 253, # 'э' + 254: 42, # 'ю' + 255: 16, # 'я' } -WINDOWS_1251_BULGARIAN_MODEL = SingleByteCharSetModel(charset_name='windows-1251', - language='Bulgarian', - char_to_order_map=WINDOWS_1251_BULGARIAN_CHAR_TO_ORDER, - language_model=BULGARIAN_LANG_MODEL, - typical_positive_ratio=0.969392, - keep_ascii_letters=False, - alphabet='АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя') - +WINDOWS_1251_BULGARIAN_MODEL = SingleByteCharSetModel( + charset_name="windows-1251", + language="Bulgarian", + char_to_order_map=WINDOWS_1251_BULGARIAN_CHAR_TO_ORDER, + language_model=BULGARIAN_LANG_MODEL, + typical_positive_ratio=0.969392, + keep_ascii_letters=False, + alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя", +) diff --git a/lib/chardet/langgreekmodel.py b/lib/chardet/langgreekmodel.py index 2d4982ca..0de9b7f3 100644 --- a/lib/chardet/langgreekmodel.py +++ b/lib/chardet/langgreekmodel.py @@ -1,9 +1,5 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -3863,536 +3859,539 @@ GREEK_LANG_MODEL = { # Character Mapping Table(s): WINDOWS_1253_GREEK_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 82, # 'A' - 66: 100, # 'B' - 67: 104, # 'C' - 68: 94, # 'D' - 69: 98, # 'E' - 70: 101, # 'F' - 71: 116, # 'G' - 72: 102, # 'H' - 73: 111, # 'I' - 74: 187, # 'J' - 75: 117, # 'K' - 76: 92, # 'L' - 77: 88, # 'M' - 78: 113, # 'N' - 79: 85, # 'O' - 80: 79, # 'P' - 81: 118, # 'Q' - 82: 105, # 'R' - 83: 83, # 'S' - 84: 67, # 'T' - 85: 114, # 'U' - 86: 119, # 'V' - 87: 95, # 'W' - 88: 99, # 'X' - 89: 109, # 'Y' - 90: 188, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 72, # 'a' - 98: 70, # 'b' - 99: 80, # 'c' - 100: 81, # 'd' - 101: 60, # 'e' - 102: 96, # 'f' - 103: 93, # 'g' - 104: 89, # 'h' - 105: 68, # 'i' - 106: 120, # 'j' - 107: 97, # 'k' - 108: 77, # 'l' - 109: 86, # 'm' - 110: 69, # 'n' - 111: 55, # 'o' - 112: 78, # 'p' - 113: 115, # 'q' - 114: 65, # 'r' - 115: 66, # 's' - 116: 58, # 't' - 117: 76, # 'u' - 118: 106, # 'v' - 119: 103, # 'w' - 120: 87, # 'x' - 121: 107, # 'y' - 122: 112, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 255, # '€' - 129: 255, # None - 130: 255, # '‚' - 131: 255, # 'ƒ' - 132: 255, # '„' - 133: 255, # '…' - 134: 255, # '†' - 135: 255, # '‡' - 136: 255, # None - 137: 255, # '‰' - 138: 255, # None - 139: 255, # '‹' - 140: 255, # None - 141: 255, # None - 142: 255, # None - 143: 255, # None - 144: 255, # None - 145: 255, # '‘' - 146: 255, # '’' - 147: 255, # '“' - 148: 255, # '”' - 149: 255, # '•' - 150: 255, # '–' - 151: 255, # '—' - 152: 255, # None - 153: 255, # '™' - 154: 255, # None - 155: 255, # '›' - 156: 255, # None - 157: 255, # None - 158: 255, # None - 159: 255, # None - 160: 253, # '\xa0' - 161: 233, # '΅' - 162: 61, # 'Ά' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 253, # None - 171: 253, # '«' - 172: 253, # '¬' - 173: 74, # '\xad' - 174: 253, # '®' - 175: 253, # '―' - 176: 253, # '°' - 177: 253, # '±' - 178: 253, # '²' - 179: 253, # '³' - 180: 247, # '΄' - 181: 253, # 'µ' - 182: 253, # '¶' - 183: 36, # '·' - 184: 46, # 'Έ' - 185: 71, # 'Ή' - 186: 73, # 'Ί' - 187: 253, # '»' - 188: 54, # 'Ό' - 189: 253, # '½' - 190: 108, # 'Ύ' - 191: 123, # 'Ώ' - 192: 110, # 'ΐ' - 193: 31, # 'Α' - 194: 51, # 'Β' - 195: 43, # 'Γ' - 196: 41, # 'Δ' - 197: 34, # 'Ε' - 198: 91, # 'Ζ' - 199: 40, # 'Η' - 200: 52, # 'Θ' - 201: 47, # 'Ι' - 202: 44, # 'Κ' - 203: 53, # 'Λ' - 204: 38, # 'Μ' - 205: 49, # 'Ν' - 206: 59, # 'Ξ' - 207: 39, # 'Ο' - 208: 35, # 'Π' - 209: 48, # 'Ρ' - 210: 250, # None - 211: 37, # 'Σ' - 212: 33, # 'Τ' - 213: 45, # 'Υ' - 214: 56, # 'Φ' - 215: 50, # 'Χ' - 216: 84, # 'Ψ' - 217: 57, # 'Ω' - 218: 120, # 'Ϊ' - 219: 121, # 'Ϋ' - 220: 17, # 'ά' - 221: 18, # 'έ' - 222: 22, # 'ή' - 223: 15, # 'ί' - 224: 124, # 'ΰ' - 225: 1, # 'α' - 226: 29, # 'β' - 227: 20, # 'γ' - 228: 21, # 'δ' - 229: 3, # 'ε' - 230: 32, # 'ζ' - 231: 13, # 'η' - 232: 25, # 'θ' - 233: 5, # 'ι' - 234: 11, # 'κ' - 235: 16, # 'λ' - 236: 10, # 'μ' - 237: 6, # 'ν' - 238: 30, # 'ξ' - 239: 4, # 'ο' - 240: 9, # 'π' - 241: 8, # 'ρ' - 242: 14, # 'ς' - 243: 7, # 'σ' - 244: 2, # 'τ' - 245: 12, # 'υ' - 246: 28, # 'φ' - 247: 23, # 'χ' - 248: 42, # 'ψ' - 249: 24, # 'ω' - 250: 64, # 'ϊ' - 251: 75, # 'ϋ' - 252: 19, # 'ό' - 253: 26, # 'ύ' - 254: 27, # 'ώ' - 255: 253, # None + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 82, # 'A' + 66: 100, # 'B' + 67: 104, # 'C' + 68: 94, # 'D' + 69: 98, # 'E' + 70: 101, # 'F' + 71: 116, # 'G' + 72: 102, # 'H' + 73: 111, # 'I' + 74: 187, # 'J' + 75: 117, # 'K' + 76: 92, # 'L' + 77: 88, # 'M' + 78: 113, # 'N' + 79: 85, # 'O' + 80: 79, # 'P' + 81: 118, # 'Q' + 82: 105, # 'R' + 83: 83, # 'S' + 84: 67, # 'T' + 85: 114, # 'U' + 86: 119, # 'V' + 87: 95, # 'W' + 88: 99, # 'X' + 89: 109, # 'Y' + 90: 188, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 72, # 'a' + 98: 70, # 'b' + 99: 80, # 'c' + 100: 81, # 'd' + 101: 60, # 'e' + 102: 96, # 'f' + 103: 93, # 'g' + 104: 89, # 'h' + 105: 68, # 'i' + 106: 120, # 'j' + 107: 97, # 'k' + 108: 77, # 'l' + 109: 86, # 'm' + 110: 69, # 'n' + 111: 55, # 'o' + 112: 78, # 'p' + 113: 115, # 'q' + 114: 65, # 'r' + 115: 66, # 's' + 116: 58, # 't' + 117: 76, # 'u' + 118: 106, # 'v' + 119: 103, # 'w' + 120: 87, # 'x' + 121: 107, # 'y' + 122: 112, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 255, # '€' + 129: 255, # None + 130: 255, # '‚' + 131: 255, # 'ƒ' + 132: 255, # '„' + 133: 255, # '…' + 134: 255, # '†' + 135: 255, # '‡' + 136: 255, # None + 137: 255, # '‰' + 138: 255, # None + 139: 255, # '‹' + 140: 255, # None + 141: 255, # None + 142: 255, # None + 143: 255, # None + 144: 255, # None + 145: 255, # '‘' + 146: 255, # '’' + 147: 255, # '“' + 148: 255, # '”' + 149: 255, # '•' + 150: 255, # '–' + 151: 255, # '—' + 152: 255, # None + 153: 255, # '™' + 154: 255, # None + 155: 255, # '›' + 156: 255, # None + 157: 255, # None + 158: 255, # None + 159: 255, # None + 160: 253, # '\xa0' + 161: 233, # '΅' + 162: 61, # 'Ά' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 253, # None + 171: 253, # '«' + 172: 253, # '¬' + 173: 74, # '\xad' + 174: 253, # '®' + 175: 253, # '―' + 176: 253, # '°' + 177: 253, # '±' + 178: 253, # '²' + 179: 253, # '³' + 180: 247, # '΄' + 181: 253, # 'µ' + 182: 253, # '¶' + 183: 36, # '·' + 184: 46, # 'Έ' + 185: 71, # 'Ή' + 186: 73, # 'Ί' + 187: 253, # '»' + 188: 54, # 'Ό' + 189: 253, # '½' + 190: 108, # 'Ύ' + 191: 123, # 'Ώ' + 192: 110, # 'ΐ' + 193: 31, # 'Α' + 194: 51, # 'Β' + 195: 43, # 'Γ' + 196: 41, # 'Δ' + 197: 34, # 'Ε' + 198: 91, # 'Ζ' + 199: 40, # 'Η' + 200: 52, # 'Θ' + 201: 47, # 'Ι' + 202: 44, # 'Κ' + 203: 53, # 'Λ' + 204: 38, # 'Μ' + 205: 49, # 'Ν' + 206: 59, # 'Ξ' + 207: 39, # 'Ο' + 208: 35, # 'Π' + 209: 48, # 'Ρ' + 210: 250, # None + 211: 37, # 'Σ' + 212: 33, # 'Τ' + 213: 45, # 'Υ' + 214: 56, # 'Φ' + 215: 50, # 'Χ' + 216: 84, # 'Ψ' + 217: 57, # 'Ω' + 218: 120, # 'Ϊ' + 219: 121, # 'Ϋ' + 220: 17, # 'ά' + 221: 18, # 'έ' + 222: 22, # 'ή' + 223: 15, # 'ί' + 224: 124, # 'ΰ' + 225: 1, # 'α' + 226: 29, # 'β' + 227: 20, # 'γ' + 228: 21, # 'δ' + 229: 3, # 'ε' + 230: 32, # 'ζ' + 231: 13, # 'η' + 232: 25, # 'θ' + 233: 5, # 'ι' + 234: 11, # 'κ' + 235: 16, # 'λ' + 236: 10, # 'μ' + 237: 6, # 'ν' + 238: 30, # 'ξ' + 239: 4, # 'ο' + 240: 9, # 'π' + 241: 8, # 'ρ' + 242: 14, # 'ς' + 243: 7, # 'σ' + 244: 2, # 'τ' + 245: 12, # 'υ' + 246: 28, # 'φ' + 247: 23, # 'χ' + 248: 42, # 'ψ' + 249: 24, # 'ω' + 250: 64, # 'ϊ' + 251: 75, # 'ϋ' + 252: 19, # 'ό' + 253: 26, # 'ύ' + 254: 27, # 'ώ' + 255: 253, # None } -WINDOWS_1253_GREEK_MODEL = SingleByteCharSetModel(charset_name='windows-1253', - language='Greek', - char_to_order_map=WINDOWS_1253_GREEK_CHAR_TO_ORDER, - language_model=GREEK_LANG_MODEL, - typical_positive_ratio=0.982851, - keep_ascii_letters=False, - alphabet='ΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ') +WINDOWS_1253_GREEK_MODEL = SingleByteCharSetModel( + charset_name="windows-1253", + language="Greek", + char_to_order_map=WINDOWS_1253_GREEK_CHAR_TO_ORDER, + language_model=GREEK_LANG_MODEL, + typical_positive_ratio=0.982851, + keep_ascii_letters=False, + alphabet="ΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ", +) ISO_8859_7_GREEK_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 82, # 'A' - 66: 100, # 'B' - 67: 104, # 'C' - 68: 94, # 'D' - 69: 98, # 'E' - 70: 101, # 'F' - 71: 116, # 'G' - 72: 102, # 'H' - 73: 111, # 'I' - 74: 187, # 'J' - 75: 117, # 'K' - 76: 92, # 'L' - 77: 88, # 'M' - 78: 113, # 'N' - 79: 85, # 'O' - 80: 79, # 'P' - 81: 118, # 'Q' - 82: 105, # 'R' - 83: 83, # 'S' - 84: 67, # 'T' - 85: 114, # 'U' - 86: 119, # 'V' - 87: 95, # 'W' - 88: 99, # 'X' - 89: 109, # 'Y' - 90: 188, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 72, # 'a' - 98: 70, # 'b' - 99: 80, # 'c' - 100: 81, # 'd' - 101: 60, # 'e' - 102: 96, # 'f' - 103: 93, # 'g' - 104: 89, # 'h' - 105: 68, # 'i' - 106: 120, # 'j' - 107: 97, # 'k' - 108: 77, # 'l' - 109: 86, # 'm' - 110: 69, # 'n' - 111: 55, # 'o' - 112: 78, # 'p' - 113: 115, # 'q' - 114: 65, # 'r' - 115: 66, # 's' - 116: 58, # 't' - 117: 76, # 'u' - 118: 106, # 'v' - 119: 103, # 'w' - 120: 87, # 'x' - 121: 107, # 'y' - 122: 112, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 255, # '\x80' - 129: 255, # '\x81' - 130: 255, # '\x82' - 131: 255, # '\x83' - 132: 255, # '\x84' - 133: 255, # '\x85' - 134: 255, # '\x86' - 135: 255, # '\x87' - 136: 255, # '\x88' - 137: 255, # '\x89' - 138: 255, # '\x8a' - 139: 255, # '\x8b' - 140: 255, # '\x8c' - 141: 255, # '\x8d' - 142: 255, # '\x8e' - 143: 255, # '\x8f' - 144: 255, # '\x90' - 145: 255, # '\x91' - 146: 255, # '\x92' - 147: 255, # '\x93' - 148: 255, # '\x94' - 149: 255, # '\x95' - 150: 255, # '\x96' - 151: 255, # '\x97' - 152: 255, # '\x98' - 153: 255, # '\x99' - 154: 255, # '\x9a' - 155: 255, # '\x9b' - 156: 255, # '\x9c' - 157: 255, # '\x9d' - 158: 255, # '\x9e' - 159: 255, # '\x9f' - 160: 253, # '\xa0' - 161: 233, # '‘' - 162: 90, # '’' - 163: 253, # '£' - 164: 253, # '€' - 165: 253, # '₯' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 253, # 'ͺ' - 171: 253, # '«' - 172: 253, # '¬' - 173: 74, # '\xad' - 174: 253, # None - 175: 253, # '―' - 176: 253, # '°' - 177: 253, # '±' - 178: 253, # '²' - 179: 253, # '³' - 180: 247, # '΄' - 181: 248, # '΅' - 182: 61, # 'Ά' - 183: 36, # '·' - 184: 46, # 'Έ' - 185: 71, # 'Ή' - 186: 73, # 'Ί' - 187: 253, # '»' - 188: 54, # 'Ό' - 189: 253, # '½' - 190: 108, # 'Ύ' - 191: 123, # 'Ώ' - 192: 110, # 'ΐ' - 193: 31, # 'Α' - 194: 51, # 'Β' - 195: 43, # 'Γ' - 196: 41, # 'Δ' - 197: 34, # 'Ε' - 198: 91, # 'Ζ' - 199: 40, # 'Η' - 200: 52, # 'Θ' - 201: 47, # 'Ι' - 202: 44, # 'Κ' - 203: 53, # 'Λ' - 204: 38, # 'Μ' - 205: 49, # 'Ν' - 206: 59, # 'Ξ' - 207: 39, # 'Ο' - 208: 35, # 'Π' - 209: 48, # 'Ρ' - 210: 250, # None - 211: 37, # 'Σ' - 212: 33, # 'Τ' - 213: 45, # 'Υ' - 214: 56, # 'Φ' - 215: 50, # 'Χ' - 216: 84, # 'Ψ' - 217: 57, # 'Ω' - 218: 120, # 'Ϊ' - 219: 121, # 'Ϋ' - 220: 17, # 'ά' - 221: 18, # 'έ' - 222: 22, # 'ή' - 223: 15, # 'ί' - 224: 124, # 'ΰ' - 225: 1, # 'α' - 226: 29, # 'β' - 227: 20, # 'γ' - 228: 21, # 'δ' - 229: 3, # 'ε' - 230: 32, # 'ζ' - 231: 13, # 'η' - 232: 25, # 'θ' - 233: 5, # 'ι' - 234: 11, # 'κ' - 235: 16, # 'λ' - 236: 10, # 'μ' - 237: 6, # 'ν' - 238: 30, # 'ξ' - 239: 4, # 'ο' - 240: 9, # 'π' - 241: 8, # 'ρ' - 242: 14, # 'ς' - 243: 7, # 'σ' - 244: 2, # 'τ' - 245: 12, # 'υ' - 246: 28, # 'φ' - 247: 23, # 'χ' - 248: 42, # 'ψ' - 249: 24, # 'ω' - 250: 64, # 'ϊ' - 251: 75, # 'ϋ' - 252: 19, # 'ό' - 253: 26, # 'ύ' - 254: 27, # 'ώ' - 255: 253, # None + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 82, # 'A' + 66: 100, # 'B' + 67: 104, # 'C' + 68: 94, # 'D' + 69: 98, # 'E' + 70: 101, # 'F' + 71: 116, # 'G' + 72: 102, # 'H' + 73: 111, # 'I' + 74: 187, # 'J' + 75: 117, # 'K' + 76: 92, # 'L' + 77: 88, # 'M' + 78: 113, # 'N' + 79: 85, # 'O' + 80: 79, # 'P' + 81: 118, # 'Q' + 82: 105, # 'R' + 83: 83, # 'S' + 84: 67, # 'T' + 85: 114, # 'U' + 86: 119, # 'V' + 87: 95, # 'W' + 88: 99, # 'X' + 89: 109, # 'Y' + 90: 188, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 72, # 'a' + 98: 70, # 'b' + 99: 80, # 'c' + 100: 81, # 'd' + 101: 60, # 'e' + 102: 96, # 'f' + 103: 93, # 'g' + 104: 89, # 'h' + 105: 68, # 'i' + 106: 120, # 'j' + 107: 97, # 'k' + 108: 77, # 'l' + 109: 86, # 'm' + 110: 69, # 'n' + 111: 55, # 'o' + 112: 78, # 'p' + 113: 115, # 'q' + 114: 65, # 'r' + 115: 66, # 's' + 116: 58, # 't' + 117: 76, # 'u' + 118: 106, # 'v' + 119: 103, # 'w' + 120: 87, # 'x' + 121: 107, # 'y' + 122: 112, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 255, # '\x80' + 129: 255, # '\x81' + 130: 255, # '\x82' + 131: 255, # '\x83' + 132: 255, # '\x84' + 133: 255, # '\x85' + 134: 255, # '\x86' + 135: 255, # '\x87' + 136: 255, # '\x88' + 137: 255, # '\x89' + 138: 255, # '\x8a' + 139: 255, # '\x8b' + 140: 255, # '\x8c' + 141: 255, # '\x8d' + 142: 255, # '\x8e' + 143: 255, # '\x8f' + 144: 255, # '\x90' + 145: 255, # '\x91' + 146: 255, # '\x92' + 147: 255, # '\x93' + 148: 255, # '\x94' + 149: 255, # '\x95' + 150: 255, # '\x96' + 151: 255, # '\x97' + 152: 255, # '\x98' + 153: 255, # '\x99' + 154: 255, # '\x9a' + 155: 255, # '\x9b' + 156: 255, # '\x9c' + 157: 255, # '\x9d' + 158: 255, # '\x9e' + 159: 255, # '\x9f' + 160: 253, # '\xa0' + 161: 233, # '‘' + 162: 90, # '’' + 163: 253, # '£' + 164: 253, # '€' + 165: 253, # '₯' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 253, # 'ͺ' + 171: 253, # '«' + 172: 253, # '¬' + 173: 74, # '\xad' + 174: 253, # None + 175: 253, # '―' + 176: 253, # '°' + 177: 253, # '±' + 178: 253, # '²' + 179: 253, # '³' + 180: 247, # '΄' + 181: 248, # '΅' + 182: 61, # 'Ά' + 183: 36, # '·' + 184: 46, # 'Έ' + 185: 71, # 'Ή' + 186: 73, # 'Ί' + 187: 253, # '»' + 188: 54, # 'Ό' + 189: 253, # '½' + 190: 108, # 'Ύ' + 191: 123, # 'Ώ' + 192: 110, # 'ΐ' + 193: 31, # 'Α' + 194: 51, # 'Β' + 195: 43, # 'Γ' + 196: 41, # 'Δ' + 197: 34, # 'Ε' + 198: 91, # 'Ζ' + 199: 40, # 'Η' + 200: 52, # 'Θ' + 201: 47, # 'Ι' + 202: 44, # 'Κ' + 203: 53, # 'Λ' + 204: 38, # 'Μ' + 205: 49, # 'Ν' + 206: 59, # 'Ξ' + 207: 39, # 'Ο' + 208: 35, # 'Π' + 209: 48, # 'Ρ' + 210: 250, # None + 211: 37, # 'Σ' + 212: 33, # 'Τ' + 213: 45, # 'Υ' + 214: 56, # 'Φ' + 215: 50, # 'Χ' + 216: 84, # 'Ψ' + 217: 57, # 'Ω' + 218: 120, # 'Ϊ' + 219: 121, # 'Ϋ' + 220: 17, # 'ά' + 221: 18, # 'έ' + 222: 22, # 'ή' + 223: 15, # 'ί' + 224: 124, # 'ΰ' + 225: 1, # 'α' + 226: 29, # 'β' + 227: 20, # 'γ' + 228: 21, # 'δ' + 229: 3, # 'ε' + 230: 32, # 'ζ' + 231: 13, # 'η' + 232: 25, # 'θ' + 233: 5, # 'ι' + 234: 11, # 'κ' + 235: 16, # 'λ' + 236: 10, # 'μ' + 237: 6, # 'ν' + 238: 30, # 'ξ' + 239: 4, # 'ο' + 240: 9, # 'π' + 241: 8, # 'ρ' + 242: 14, # 'ς' + 243: 7, # 'σ' + 244: 2, # 'τ' + 245: 12, # 'υ' + 246: 28, # 'φ' + 247: 23, # 'χ' + 248: 42, # 'ψ' + 249: 24, # 'ω' + 250: 64, # 'ϊ' + 251: 75, # 'ϋ' + 252: 19, # 'ό' + 253: 26, # 'ύ' + 254: 27, # 'ώ' + 255: 253, # None } -ISO_8859_7_GREEK_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-7', - language='Greek', - char_to_order_map=ISO_8859_7_GREEK_CHAR_TO_ORDER, - language_model=GREEK_LANG_MODEL, - typical_positive_ratio=0.982851, - keep_ascii_letters=False, - alphabet='ΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ') - +ISO_8859_7_GREEK_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-7", + language="Greek", + char_to_order_map=ISO_8859_7_GREEK_CHAR_TO_ORDER, + language_model=GREEK_LANG_MODEL, + typical_positive_ratio=0.982851, + keep_ascii_letters=False, + alphabet="ΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ", +) diff --git a/lib/chardet/langhebrewmodel.py b/lib/chardet/langhebrewmodel.py index d509d6de..32b619cc 100644 --- a/lib/chardet/langhebrewmodel.py +++ b/lib/chardet/langhebrewmodel.py @@ -1,9 +1,5 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -4115,269 +4111,270 @@ HEBREW_LANG_MODEL = { # Character Mapping Table(s): WINDOWS_1255_HEBREW_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 69, # 'A' - 66: 91, # 'B' - 67: 79, # 'C' - 68: 80, # 'D' - 69: 92, # 'E' - 70: 89, # 'F' - 71: 97, # 'G' - 72: 90, # 'H' - 73: 68, # 'I' - 74: 111, # 'J' - 75: 112, # 'K' - 76: 82, # 'L' - 77: 73, # 'M' - 78: 95, # 'N' - 79: 85, # 'O' - 80: 78, # 'P' - 81: 121, # 'Q' - 82: 86, # 'R' - 83: 71, # 'S' - 84: 67, # 'T' - 85: 102, # 'U' - 86: 107, # 'V' - 87: 84, # 'W' - 88: 114, # 'X' - 89: 103, # 'Y' - 90: 115, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 50, # 'a' - 98: 74, # 'b' - 99: 60, # 'c' - 100: 61, # 'd' - 101: 42, # 'e' - 102: 76, # 'f' - 103: 70, # 'g' - 104: 64, # 'h' - 105: 53, # 'i' - 106: 105, # 'j' - 107: 93, # 'k' - 108: 56, # 'l' - 109: 65, # 'm' - 110: 54, # 'n' - 111: 49, # 'o' - 112: 66, # 'p' - 113: 110, # 'q' - 114: 51, # 'r' - 115: 43, # 's' - 116: 44, # 't' - 117: 63, # 'u' - 118: 81, # 'v' - 119: 77, # 'w' - 120: 98, # 'x' - 121: 75, # 'y' - 122: 108, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 124, # '€' - 129: 202, # None - 130: 203, # '‚' - 131: 204, # 'ƒ' - 132: 205, # '„' - 133: 40, # '…' - 134: 58, # '†' - 135: 206, # '‡' - 136: 207, # 'ˆ' - 137: 208, # '‰' - 138: 209, # None - 139: 210, # '‹' - 140: 211, # None - 141: 212, # None - 142: 213, # None - 143: 214, # None - 144: 215, # None - 145: 83, # '‘' - 146: 52, # '’' - 147: 47, # '“' - 148: 46, # '”' - 149: 72, # '•' - 150: 32, # '–' - 151: 94, # '—' - 152: 216, # '˜' - 153: 113, # '™' - 154: 217, # None - 155: 109, # '›' - 156: 218, # None - 157: 219, # None - 158: 220, # None - 159: 221, # None - 160: 34, # '\xa0' - 161: 116, # '¡' - 162: 222, # '¢' - 163: 118, # '£' - 164: 100, # '₪' - 165: 223, # '¥' - 166: 224, # '¦' - 167: 117, # '§' - 168: 119, # '¨' - 169: 104, # '©' - 170: 125, # '×' - 171: 225, # '«' - 172: 226, # '¬' - 173: 87, # '\xad' - 174: 99, # '®' - 175: 227, # '¯' - 176: 106, # '°' - 177: 122, # '±' - 178: 123, # '²' - 179: 228, # '³' - 180: 55, # '´' - 181: 229, # 'µ' - 182: 230, # '¶' - 183: 101, # '·' - 184: 231, # '¸' - 185: 232, # '¹' - 186: 120, # '÷' - 187: 233, # '»' - 188: 48, # '¼' - 189: 39, # '½' - 190: 57, # '¾' - 191: 234, # '¿' - 192: 30, # 'ְ' - 193: 59, # 'ֱ' - 194: 41, # 'ֲ' - 195: 88, # 'ֳ' - 196: 33, # 'ִ' - 197: 37, # 'ֵ' - 198: 36, # 'ֶ' - 199: 31, # 'ַ' - 200: 29, # 'ָ' - 201: 35, # 'ֹ' - 202: 235, # None - 203: 62, # 'ֻ' - 204: 28, # 'ּ' - 205: 236, # 'ֽ' - 206: 126, # '־' - 207: 237, # 'ֿ' - 208: 238, # '׀' - 209: 38, # 'ׁ' - 210: 45, # 'ׂ' - 211: 239, # '׃' - 212: 240, # 'װ' - 213: 241, # 'ױ' - 214: 242, # 'ײ' - 215: 243, # '׳' - 216: 127, # '״' - 217: 244, # None - 218: 245, # None - 219: 246, # None - 220: 247, # None - 221: 248, # None - 222: 249, # None - 223: 250, # None - 224: 9, # 'א' - 225: 8, # 'ב' - 226: 20, # 'ג' - 227: 16, # 'ד' - 228: 3, # 'ה' - 229: 2, # 'ו' - 230: 24, # 'ז' - 231: 14, # 'ח' - 232: 22, # 'ט' - 233: 1, # 'י' - 234: 25, # 'ך' - 235: 15, # 'כ' - 236: 4, # 'ל' - 237: 11, # 'ם' - 238: 6, # 'מ' - 239: 23, # 'ן' - 240: 12, # 'נ' - 241: 19, # 'ס' - 242: 13, # 'ע' - 243: 26, # 'ף' - 244: 18, # 'פ' - 245: 27, # 'ץ' - 246: 21, # 'צ' - 247: 17, # 'ק' - 248: 7, # 'ר' - 249: 10, # 'ש' - 250: 5, # 'ת' - 251: 251, # None - 252: 252, # None - 253: 128, # '\u200e' - 254: 96, # '\u200f' - 255: 253, # None + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 69, # 'A' + 66: 91, # 'B' + 67: 79, # 'C' + 68: 80, # 'D' + 69: 92, # 'E' + 70: 89, # 'F' + 71: 97, # 'G' + 72: 90, # 'H' + 73: 68, # 'I' + 74: 111, # 'J' + 75: 112, # 'K' + 76: 82, # 'L' + 77: 73, # 'M' + 78: 95, # 'N' + 79: 85, # 'O' + 80: 78, # 'P' + 81: 121, # 'Q' + 82: 86, # 'R' + 83: 71, # 'S' + 84: 67, # 'T' + 85: 102, # 'U' + 86: 107, # 'V' + 87: 84, # 'W' + 88: 114, # 'X' + 89: 103, # 'Y' + 90: 115, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 50, # 'a' + 98: 74, # 'b' + 99: 60, # 'c' + 100: 61, # 'd' + 101: 42, # 'e' + 102: 76, # 'f' + 103: 70, # 'g' + 104: 64, # 'h' + 105: 53, # 'i' + 106: 105, # 'j' + 107: 93, # 'k' + 108: 56, # 'l' + 109: 65, # 'm' + 110: 54, # 'n' + 111: 49, # 'o' + 112: 66, # 'p' + 113: 110, # 'q' + 114: 51, # 'r' + 115: 43, # 's' + 116: 44, # 't' + 117: 63, # 'u' + 118: 81, # 'v' + 119: 77, # 'w' + 120: 98, # 'x' + 121: 75, # 'y' + 122: 108, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 124, # '€' + 129: 202, # None + 130: 203, # '‚' + 131: 204, # 'ƒ' + 132: 205, # '„' + 133: 40, # '…' + 134: 58, # '†' + 135: 206, # '‡' + 136: 207, # 'ˆ' + 137: 208, # '‰' + 138: 209, # None + 139: 210, # '‹' + 140: 211, # None + 141: 212, # None + 142: 213, # None + 143: 214, # None + 144: 215, # None + 145: 83, # '‘' + 146: 52, # '’' + 147: 47, # '“' + 148: 46, # '”' + 149: 72, # '•' + 150: 32, # '–' + 151: 94, # '—' + 152: 216, # '˜' + 153: 113, # '™' + 154: 217, # None + 155: 109, # '›' + 156: 218, # None + 157: 219, # None + 158: 220, # None + 159: 221, # None + 160: 34, # '\xa0' + 161: 116, # '¡' + 162: 222, # '¢' + 163: 118, # '£' + 164: 100, # '₪' + 165: 223, # '¥' + 166: 224, # '¦' + 167: 117, # '§' + 168: 119, # '¨' + 169: 104, # '©' + 170: 125, # '×' + 171: 225, # '«' + 172: 226, # '¬' + 173: 87, # '\xad' + 174: 99, # '®' + 175: 227, # '¯' + 176: 106, # '°' + 177: 122, # '±' + 178: 123, # '²' + 179: 228, # '³' + 180: 55, # '´' + 181: 229, # 'µ' + 182: 230, # '¶' + 183: 101, # '·' + 184: 231, # '¸' + 185: 232, # '¹' + 186: 120, # '÷' + 187: 233, # '»' + 188: 48, # '¼' + 189: 39, # '½' + 190: 57, # '¾' + 191: 234, # '¿' + 192: 30, # 'ְ' + 193: 59, # 'ֱ' + 194: 41, # 'ֲ' + 195: 88, # 'ֳ' + 196: 33, # 'ִ' + 197: 37, # 'ֵ' + 198: 36, # 'ֶ' + 199: 31, # 'ַ' + 200: 29, # 'ָ' + 201: 35, # 'ֹ' + 202: 235, # None + 203: 62, # 'ֻ' + 204: 28, # 'ּ' + 205: 236, # 'ֽ' + 206: 126, # '־' + 207: 237, # 'ֿ' + 208: 238, # '׀' + 209: 38, # 'ׁ' + 210: 45, # 'ׂ' + 211: 239, # '׃' + 212: 240, # 'װ' + 213: 241, # 'ױ' + 214: 242, # 'ײ' + 215: 243, # '׳' + 216: 127, # '״' + 217: 244, # None + 218: 245, # None + 219: 246, # None + 220: 247, # None + 221: 248, # None + 222: 249, # None + 223: 250, # None + 224: 9, # 'א' + 225: 8, # 'ב' + 226: 20, # 'ג' + 227: 16, # 'ד' + 228: 3, # 'ה' + 229: 2, # 'ו' + 230: 24, # 'ז' + 231: 14, # 'ח' + 232: 22, # 'ט' + 233: 1, # 'י' + 234: 25, # 'ך' + 235: 15, # 'כ' + 236: 4, # 'ל' + 237: 11, # 'ם' + 238: 6, # 'מ' + 239: 23, # 'ן' + 240: 12, # 'נ' + 241: 19, # 'ס' + 242: 13, # 'ע' + 243: 26, # 'ף' + 244: 18, # 'פ' + 245: 27, # 'ץ' + 246: 21, # 'צ' + 247: 17, # 'ק' + 248: 7, # 'ר' + 249: 10, # 'ש' + 250: 5, # 'ת' + 251: 251, # None + 252: 252, # None + 253: 128, # '\u200e' + 254: 96, # '\u200f' + 255: 253, # None } -WINDOWS_1255_HEBREW_MODEL = SingleByteCharSetModel(charset_name='windows-1255', - language='Hebrew', - char_to_order_map=WINDOWS_1255_HEBREW_CHAR_TO_ORDER, - language_model=HEBREW_LANG_MODEL, - typical_positive_ratio=0.984004, - keep_ascii_letters=False, - alphabet='אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ') - +WINDOWS_1255_HEBREW_MODEL = SingleByteCharSetModel( + charset_name="windows-1255", + language="Hebrew", + char_to_order_map=WINDOWS_1255_HEBREW_CHAR_TO_ORDER, + language_model=HEBREW_LANG_MODEL, + typical_positive_ratio=0.984004, + keep_ascii_letters=False, + alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ", +) diff --git a/lib/chardet/langhungarianmodel.py b/lib/chardet/langhungarianmodel.py index aff0b349..23f9029f 100644 --- a/lib/chardet/langhungarianmodel.py +++ b/lib/chardet/langhungarianmodel.py @@ -1,9 +1,5 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -4115,536 +4111,539 @@ HUNGARIAN_LANG_MODEL = { # Character Mapping Table(s): WINDOWS_1250_HUNGARIAN_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 28, # 'A' - 66: 40, # 'B' - 67: 54, # 'C' - 68: 45, # 'D' - 69: 32, # 'E' - 70: 50, # 'F' - 71: 49, # 'G' - 72: 38, # 'H' - 73: 39, # 'I' - 74: 53, # 'J' - 75: 36, # 'K' - 76: 41, # 'L' - 77: 34, # 'M' - 78: 35, # 'N' - 79: 47, # 'O' - 80: 46, # 'P' - 81: 72, # 'Q' - 82: 43, # 'R' - 83: 33, # 'S' - 84: 37, # 'T' - 85: 57, # 'U' - 86: 48, # 'V' - 87: 64, # 'W' - 88: 68, # 'X' - 89: 55, # 'Y' - 90: 52, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 2, # 'a' - 98: 18, # 'b' - 99: 26, # 'c' - 100: 17, # 'd' - 101: 1, # 'e' - 102: 27, # 'f' - 103: 12, # 'g' - 104: 20, # 'h' - 105: 9, # 'i' - 106: 22, # 'j' - 107: 7, # 'k' - 108: 6, # 'l' - 109: 13, # 'm' - 110: 4, # 'n' - 111: 8, # 'o' - 112: 23, # 'p' - 113: 67, # 'q' - 114: 10, # 'r' - 115: 5, # 's' - 116: 3, # 't' - 117: 21, # 'u' - 118: 19, # 'v' - 119: 65, # 'w' - 120: 62, # 'x' - 121: 16, # 'y' - 122: 11, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 161, # '€' - 129: 162, # None - 130: 163, # '‚' - 131: 164, # None - 132: 165, # '„' - 133: 166, # '…' - 134: 167, # '†' - 135: 168, # '‡' - 136: 169, # None - 137: 170, # '‰' - 138: 171, # 'Š' - 139: 172, # '‹' - 140: 173, # 'Ś' - 141: 174, # 'Ť' - 142: 175, # 'Ž' - 143: 176, # 'Ź' - 144: 177, # None - 145: 178, # '‘' - 146: 179, # '’' - 147: 180, # '“' - 148: 78, # '”' - 149: 181, # '•' - 150: 69, # '–' - 151: 182, # '—' - 152: 183, # None - 153: 184, # '™' - 154: 185, # 'š' - 155: 186, # '›' - 156: 187, # 'ś' - 157: 188, # 'ť' - 158: 189, # 'ž' - 159: 190, # 'ź' - 160: 191, # '\xa0' - 161: 192, # 'ˇ' - 162: 193, # '˘' - 163: 194, # 'Ł' - 164: 195, # '¤' - 165: 196, # 'Ą' - 166: 197, # '¦' - 167: 76, # '§' - 168: 198, # '¨' - 169: 199, # '©' - 170: 200, # 'Ş' - 171: 201, # '«' - 172: 202, # '¬' - 173: 203, # '\xad' - 174: 204, # '®' - 175: 205, # 'Ż' - 176: 81, # '°' - 177: 206, # '±' - 178: 207, # '˛' - 179: 208, # 'ł' - 180: 209, # '´' - 181: 210, # 'µ' - 182: 211, # '¶' - 183: 212, # '·' - 184: 213, # '¸' - 185: 214, # 'ą' - 186: 215, # 'ş' - 187: 216, # '»' - 188: 217, # 'Ľ' - 189: 218, # '˝' - 190: 219, # 'ľ' - 191: 220, # 'ż' - 192: 221, # 'Ŕ' - 193: 51, # 'Á' - 194: 83, # 'Â' - 195: 222, # 'Ă' - 196: 80, # 'Ä' - 197: 223, # 'Ĺ' - 198: 224, # 'Ć' - 199: 225, # 'Ç' - 200: 226, # 'Č' - 201: 44, # 'É' - 202: 227, # 'Ę' - 203: 228, # 'Ë' - 204: 229, # 'Ě' - 205: 61, # 'Í' - 206: 230, # 'Î' - 207: 231, # 'Ď' - 208: 232, # 'Đ' - 209: 233, # 'Ń' - 210: 234, # 'Ň' - 211: 58, # 'Ó' - 212: 235, # 'Ô' - 213: 66, # 'Ő' - 214: 59, # 'Ö' - 215: 236, # '×' - 216: 237, # 'Ř' - 217: 238, # 'Ů' - 218: 60, # 'Ú' - 219: 70, # 'Ű' - 220: 63, # 'Ü' - 221: 239, # 'Ý' - 222: 240, # 'Ţ' - 223: 241, # 'ß' - 224: 84, # 'ŕ' - 225: 14, # 'á' - 226: 75, # 'â' - 227: 242, # 'ă' - 228: 71, # 'ä' - 229: 82, # 'ĺ' - 230: 243, # 'ć' - 231: 73, # 'ç' - 232: 244, # 'č' - 233: 15, # 'é' - 234: 85, # 'ę' - 235: 79, # 'ë' - 236: 86, # 'ě' - 237: 30, # 'í' - 238: 77, # 'î' - 239: 87, # 'ď' - 240: 245, # 'đ' - 241: 246, # 'ń' - 242: 247, # 'ň' - 243: 25, # 'ó' - 244: 74, # 'ô' - 245: 42, # 'ő' - 246: 24, # 'ö' - 247: 248, # '÷' - 248: 249, # 'ř' - 249: 250, # 'ů' - 250: 31, # 'ú' - 251: 56, # 'ű' - 252: 29, # 'ü' - 253: 251, # 'ý' - 254: 252, # 'ţ' - 255: 253, # '˙' + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 28, # 'A' + 66: 40, # 'B' + 67: 54, # 'C' + 68: 45, # 'D' + 69: 32, # 'E' + 70: 50, # 'F' + 71: 49, # 'G' + 72: 38, # 'H' + 73: 39, # 'I' + 74: 53, # 'J' + 75: 36, # 'K' + 76: 41, # 'L' + 77: 34, # 'M' + 78: 35, # 'N' + 79: 47, # 'O' + 80: 46, # 'P' + 81: 72, # 'Q' + 82: 43, # 'R' + 83: 33, # 'S' + 84: 37, # 'T' + 85: 57, # 'U' + 86: 48, # 'V' + 87: 64, # 'W' + 88: 68, # 'X' + 89: 55, # 'Y' + 90: 52, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 2, # 'a' + 98: 18, # 'b' + 99: 26, # 'c' + 100: 17, # 'd' + 101: 1, # 'e' + 102: 27, # 'f' + 103: 12, # 'g' + 104: 20, # 'h' + 105: 9, # 'i' + 106: 22, # 'j' + 107: 7, # 'k' + 108: 6, # 'l' + 109: 13, # 'm' + 110: 4, # 'n' + 111: 8, # 'o' + 112: 23, # 'p' + 113: 67, # 'q' + 114: 10, # 'r' + 115: 5, # 's' + 116: 3, # 't' + 117: 21, # 'u' + 118: 19, # 'v' + 119: 65, # 'w' + 120: 62, # 'x' + 121: 16, # 'y' + 122: 11, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 161, # '€' + 129: 162, # None + 130: 163, # '‚' + 131: 164, # None + 132: 165, # '„' + 133: 166, # '…' + 134: 167, # '†' + 135: 168, # '‡' + 136: 169, # None + 137: 170, # '‰' + 138: 171, # 'Š' + 139: 172, # '‹' + 140: 173, # 'Ś' + 141: 174, # 'Ť' + 142: 175, # 'Ž' + 143: 176, # 'Ź' + 144: 177, # None + 145: 178, # '‘' + 146: 179, # '’' + 147: 180, # '“' + 148: 78, # '”' + 149: 181, # '•' + 150: 69, # '–' + 151: 182, # '—' + 152: 183, # None + 153: 184, # '™' + 154: 185, # 'š' + 155: 186, # '›' + 156: 187, # 'ś' + 157: 188, # 'ť' + 158: 189, # 'ž' + 159: 190, # 'ź' + 160: 191, # '\xa0' + 161: 192, # 'ˇ' + 162: 193, # '˘' + 163: 194, # 'Ł' + 164: 195, # '¤' + 165: 196, # 'Ą' + 166: 197, # '¦' + 167: 76, # '§' + 168: 198, # '¨' + 169: 199, # '©' + 170: 200, # 'Ş' + 171: 201, # '«' + 172: 202, # '¬' + 173: 203, # '\xad' + 174: 204, # '®' + 175: 205, # 'Ż' + 176: 81, # '°' + 177: 206, # '±' + 178: 207, # '˛' + 179: 208, # 'ł' + 180: 209, # '´' + 181: 210, # 'µ' + 182: 211, # '¶' + 183: 212, # '·' + 184: 213, # '¸' + 185: 214, # 'ą' + 186: 215, # 'ş' + 187: 216, # '»' + 188: 217, # 'Ľ' + 189: 218, # '˝' + 190: 219, # 'ľ' + 191: 220, # 'ż' + 192: 221, # 'Ŕ' + 193: 51, # 'Á' + 194: 83, # 'Â' + 195: 222, # 'Ă' + 196: 80, # 'Ä' + 197: 223, # 'Ĺ' + 198: 224, # 'Ć' + 199: 225, # 'Ç' + 200: 226, # 'Č' + 201: 44, # 'É' + 202: 227, # 'Ę' + 203: 228, # 'Ë' + 204: 229, # 'Ě' + 205: 61, # 'Í' + 206: 230, # 'Î' + 207: 231, # 'Ď' + 208: 232, # 'Đ' + 209: 233, # 'Ń' + 210: 234, # 'Ň' + 211: 58, # 'Ó' + 212: 235, # 'Ô' + 213: 66, # 'Ő' + 214: 59, # 'Ö' + 215: 236, # '×' + 216: 237, # 'Ř' + 217: 238, # 'Ů' + 218: 60, # 'Ú' + 219: 70, # 'Ű' + 220: 63, # 'Ü' + 221: 239, # 'Ý' + 222: 240, # 'Ţ' + 223: 241, # 'ß' + 224: 84, # 'ŕ' + 225: 14, # 'á' + 226: 75, # 'â' + 227: 242, # 'ă' + 228: 71, # 'ä' + 229: 82, # 'ĺ' + 230: 243, # 'ć' + 231: 73, # 'ç' + 232: 244, # 'č' + 233: 15, # 'é' + 234: 85, # 'ę' + 235: 79, # 'ë' + 236: 86, # 'ě' + 237: 30, # 'í' + 238: 77, # 'î' + 239: 87, # 'ď' + 240: 245, # 'đ' + 241: 246, # 'ń' + 242: 247, # 'ň' + 243: 25, # 'ó' + 244: 74, # 'ô' + 245: 42, # 'ő' + 246: 24, # 'ö' + 247: 248, # '÷' + 248: 249, # 'ř' + 249: 250, # 'ů' + 250: 31, # 'ú' + 251: 56, # 'ű' + 252: 29, # 'ü' + 253: 251, # 'ý' + 254: 252, # 'ţ' + 255: 253, # '˙' } -WINDOWS_1250_HUNGARIAN_MODEL = SingleByteCharSetModel(charset_name='windows-1250', - language='Hungarian', - char_to_order_map=WINDOWS_1250_HUNGARIAN_CHAR_TO_ORDER, - language_model=HUNGARIAN_LANG_MODEL, - typical_positive_ratio=0.947368, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű') +WINDOWS_1250_HUNGARIAN_MODEL = SingleByteCharSetModel( + charset_name="windows-1250", + language="Hungarian", + char_to_order_map=WINDOWS_1250_HUNGARIAN_CHAR_TO_ORDER, + language_model=HUNGARIAN_LANG_MODEL, + typical_positive_ratio=0.947368, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű", +) ISO_8859_2_HUNGARIAN_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 28, # 'A' - 66: 40, # 'B' - 67: 54, # 'C' - 68: 45, # 'D' - 69: 32, # 'E' - 70: 50, # 'F' - 71: 49, # 'G' - 72: 38, # 'H' - 73: 39, # 'I' - 74: 53, # 'J' - 75: 36, # 'K' - 76: 41, # 'L' - 77: 34, # 'M' - 78: 35, # 'N' - 79: 47, # 'O' - 80: 46, # 'P' - 81: 71, # 'Q' - 82: 43, # 'R' - 83: 33, # 'S' - 84: 37, # 'T' - 85: 57, # 'U' - 86: 48, # 'V' - 87: 64, # 'W' - 88: 68, # 'X' - 89: 55, # 'Y' - 90: 52, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 2, # 'a' - 98: 18, # 'b' - 99: 26, # 'c' - 100: 17, # 'd' - 101: 1, # 'e' - 102: 27, # 'f' - 103: 12, # 'g' - 104: 20, # 'h' - 105: 9, # 'i' - 106: 22, # 'j' - 107: 7, # 'k' - 108: 6, # 'l' - 109: 13, # 'm' - 110: 4, # 'n' - 111: 8, # 'o' - 112: 23, # 'p' - 113: 67, # 'q' - 114: 10, # 'r' - 115: 5, # 's' - 116: 3, # 't' - 117: 21, # 'u' - 118: 19, # 'v' - 119: 65, # 'w' - 120: 62, # 'x' - 121: 16, # 'y' - 122: 11, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 159, # '\x80' - 129: 160, # '\x81' - 130: 161, # '\x82' - 131: 162, # '\x83' - 132: 163, # '\x84' - 133: 164, # '\x85' - 134: 165, # '\x86' - 135: 166, # '\x87' - 136: 167, # '\x88' - 137: 168, # '\x89' - 138: 169, # '\x8a' - 139: 170, # '\x8b' - 140: 171, # '\x8c' - 141: 172, # '\x8d' - 142: 173, # '\x8e' - 143: 174, # '\x8f' - 144: 175, # '\x90' - 145: 176, # '\x91' - 146: 177, # '\x92' - 147: 178, # '\x93' - 148: 179, # '\x94' - 149: 180, # '\x95' - 150: 181, # '\x96' - 151: 182, # '\x97' - 152: 183, # '\x98' - 153: 184, # '\x99' - 154: 185, # '\x9a' - 155: 186, # '\x9b' - 156: 187, # '\x9c' - 157: 188, # '\x9d' - 158: 189, # '\x9e' - 159: 190, # '\x9f' - 160: 191, # '\xa0' - 161: 192, # 'Ą' - 162: 193, # '˘' - 163: 194, # 'Ł' - 164: 195, # '¤' - 165: 196, # 'Ľ' - 166: 197, # 'Ś' - 167: 75, # '§' - 168: 198, # '¨' - 169: 199, # 'Š' - 170: 200, # 'Ş' - 171: 201, # 'Ť' - 172: 202, # 'Ź' - 173: 203, # '\xad' - 174: 204, # 'Ž' - 175: 205, # 'Ż' - 176: 79, # '°' - 177: 206, # 'ą' - 178: 207, # '˛' - 179: 208, # 'ł' - 180: 209, # '´' - 181: 210, # 'ľ' - 182: 211, # 'ś' - 183: 212, # 'ˇ' - 184: 213, # '¸' - 185: 214, # 'š' - 186: 215, # 'ş' - 187: 216, # 'ť' - 188: 217, # 'ź' - 189: 218, # '˝' - 190: 219, # 'ž' - 191: 220, # 'ż' - 192: 221, # 'Ŕ' - 193: 51, # 'Á' - 194: 81, # 'Â' - 195: 222, # 'Ă' - 196: 78, # 'Ä' - 197: 223, # 'Ĺ' - 198: 224, # 'Ć' - 199: 225, # 'Ç' - 200: 226, # 'Č' - 201: 44, # 'É' - 202: 227, # 'Ę' - 203: 228, # 'Ë' - 204: 229, # 'Ě' - 205: 61, # 'Í' - 206: 230, # 'Î' - 207: 231, # 'Ď' - 208: 232, # 'Đ' - 209: 233, # 'Ń' - 210: 234, # 'Ň' - 211: 58, # 'Ó' - 212: 235, # 'Ô' - 213: 66, # 'Ő' - 214: 59, # 'Ö' - 215: 236, # '×' - 216: 237, # 'Ř' - 217: 238, # 'Ů' - 218: 60, # 'Ú' - 219: 69, # 'Ű' - 220: 63, # 'Ü' - 221: 239, # 'Ý' - 222: 240, # 'Ţ' - 223: 241, # 'ß' - 224: 82, # 'ŕ' - 225: 14, # 'á' - 226: 74, # 'â' - 227: 242, # 'ă' - 228: 70, # 'ä' - 229: 80, # 'ĺ' - 230: 243, # 'ć' - 231: 72, # 'ç' - 232: 244, # 'č' - 233: 15, # 'é' - 234: 83, # 'ę' - 235: 77, # 'ë' - 236: 84, # 'ě' - 237: 30, # 'í' - 238: 76, # 'î' - 239: 85, # 'ď' - 240: 245, # 'đ' - 241: 246, # 'ń' - 242: 247, # 'ň' - 243: 25, # 'ó' - 244: 73, # 'ô' - 245: 42, # 'ő' - 246: 24, # 'ö' - 247: 248, # '÷' - 248: 249, # 'ř' - 249: 250, # 'ů' - 250: 31, # 'ú' - 251: 56, # 'ű' - 252: 29, # 'ü' - 253: 251, # 'ý' - 254: 252, # 'ţ' - 255: 253, # '˙' + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 28, # 'A' + 66: 40, # 'B' + 67: 54, # 'C' + 68: 45, # 'D' + 69: 32, # 'E' + 70: 50, # 'F' + 71: 49, # 'G' + 72: 38, # 'H' + 73: 39, # 'I' + 74: 53, # 'J' + 75: 36, # 'K' + 76: 41, # 'L' + 77: 34, # 'M' + 78: 35, # 'N' + 79: 47, # 'O' + 80: 46, # 'P' + 81: 71, # 'Q' + 82: 43, # 'R' + 83: 33, # 'S' + 84: 37, # 'T' + 85: 57, # 'U' + 86: 48, # 'V' + 87: 64, # 'W' + 88: 68, # 'X' + 89: 55, # 'Y' + 90: 52, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 2, # 'a' + 98: 18, # 'b' + 99: 26, # 'c' + 100: 17, # 'd' + 101: 1, # 'e' + 102: 27, # 'f' + 103: 12, # 'g' + 104: 20, # 'h' + 105: 9, # 'i' + 106: 22, # 'j' + 107: 7, # 'k' + 108: 6, # 'l' + 109: 13, # 'm' + 110: 4, # 'n' + 111: 8, # 'o' + 112: 23, # 'p' + 113: 67, # 'q' + 114: 10, # 'r' + 115: 5, # 's' + 116: 3, # 't' + 117: 21, # 'u' + 118: 19, # 'v' + 119: 65, # 'w' + 120: 62, # 'x' + 121: 16, # 'y' + 122: 11, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 159, # '\x80' + 129: 160, # '\x81' + 130: 161, # '\x82' + 131: 162, # '\x83' + 132: 163, # '\x84' + 133: 164, # '\x85' + 134: 165, # '\x86' + 135: 166, # '\x87' + 136: 167, # '\x88' + 137: 168, # '\x89' + 138: 169, # '\x8a' + 139: 170, # '\x8b' + 140: 171, # '\x8c' + 141: 172, # '\x8d' + 142: 173, # '\x8e' + 143: 174, # '\x8f' + 144: 175, # '\x90' + 145: 176, # '\x91' + 146: 177, # '\x92' + 147: 178, # '\x93' + 148: 179, # '\x94' + 149: 180, # '\x95' + 150: 181, # '\x96' + 151: 182, # '\x97' + 152: 183, # '\x98' + 153: 184, # '\x99' + 154: 185, # '\x9a' + 155: 186, # '\x9b' + 156: 187, # '\x9c' + 157: 188, # '\x9d' + 158: 189, # '\x9e' + 159: 190, # '\x9f' + 160: 191, # '\xa0' + 161: 192, # 'Ą' + 162: 193, # '˘' + 163: 194, # 'Ł' + 164: 195, # '¤' + 165: 196, # 'Ľ' + 166: 197, # 'Ś' + 167: 75, # '§' + 168: 198, # '¨' + 169: 199, # 'Š' + 170: 200, # 'Ş' + 171: 201, # 'Ť' + 172: 202, # 'Ź' + 173: 203, # '\xad' + 174: 204, # 'Ž' + 175: 205, # 'Ż' + 176: 79, # '°' + 177: 206, # 'ą' + 178: 207, # '˛' + 179: 208, # 'ł' + 180: 209, # '´' + 181: 210, # 'ľ' + 182: 211, # 'ś' + 183: 212, # 'ˇ' + 184: 213, # '¸' + 185: 214, # 'š' + 186: 215, # 'ş' + 187: 216, # 'ť' + 188: 217, # 'ź' + 189: 218, # '˝' + 190: 219, # 'ž' + 191: 220, # 'ż' + 192: 221, # 'Ŕ' + 193: 51, # 'Á' + 194: 81, # 'Â' + 195: 222, # 'Ă' + 196: 78, # 'Ä' + 197: 223, # 'Ĺ' + 198: 224, # 'Ć' + 199: 225, # 'Ç' + 200: 226, # 'Č' + 201: 44, # 'É' + 202: 227, # 'Ę' + 203: 228, # 'Ë' + 204: 229, # 'Ě' + 205: 61, # 'Í' + 206: 230, # 'Î' + 207: 231, # 'Ď' + 208: 232, # 'Đ' + 209: 233, # 'Ń' + 210: 234, # 'Ň' + 211: 58, # 'Ó' + 212: 235, # 'Ô' + 213: 66, # 'Ő' + 214: 59, # 'Ö' + 215: 236, # '×' + 216: 237, # 'Ř' + 217: 238, # 'Ů' + 218: 60, # 'Ú' + 219: 69, # 'Ű' + 220: 63, # 'Ü' + 221: 239, # 'Ý' + 222: 240, # 'Ţ' + 223: 241, # 'ß' + 224: 82, # 'ŕ' + 225: 14, # 'á' + 226: 74, # 'â' + 227: 242, # 'ă' + 228: 70, # 'ä' + 229: 80, # 'ĺ' + 230: 243, # 'ć' + 231: 72, # 'ç' + 232: 244, # 'č' + 233: 15, # 'é' + 234: 83, # 'ę' + 235: 77, # 'ë' + 236: 84, # 'ě' + 237: 30, # 'í' + 238: 76, # 'î' + 239: 85, # 'ď' + 240: 245, # 'đ' + 241: 246, # 'ń' + 242: 247, # 'ň' + 243: 25, # 'ó' + 244: 73, # 'ô' + 245: 42, # 'ő' + 246: 24, # 'ö' + 247: 248, # '÷' + 248: 249, # 'ř' + 249: 250, # 'ů' + 250: 31, # 'ú' + 251: 56, # 'ű' + 252: 29, # 'ü' + 253: 251, # 'ý' + 254: 252, # 'ţ' + 255: 253, # '˙' } -ISO_8859_2_HUNGARIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-2', - language='Hungarian', - char_to_order_map=ISO_8859_2_HUNGARIAN_CHAR_TO_ORDER, - language_model=HUNGARIAN_LANG_MODEL, - typical_positive_ratio=0.947368, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű') - +ISO_8859_2_HUNGARIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-2", + language="Hungarian", + char_to_order_map=ISO_8859_2_HUNGARIAN_CHAR_TO_ORDER, + language_model=HUNGARIAN_LANG_MODEL, + typical_positive_ratio=0.947368, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű", +) diff --git a/lib/chardet/langrussianmodel.py b/lib/chardet/langrussianmodel.py index 1ef61a3f..9ea28a23 100644 --- a/lib/chardet/langrussianmodel.py +++ b/lib/chardet/langrussianmodel.py @@ -1,9 +1,5 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -4115,1604 +4111,1615 @@ RUSSIAN_LANG_MODEL = { # Character Mapping Table(s): IBM866_RUSSIAN_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 142, # 'A' - 66: 143, # 'B' - 67: 144, # 'C' - 68: 145, # 'D' - 69: 146, # 'E' - 70: 147, # 'F' - 71: 148, # 'G' - 72: 149, # 'H' - 73: 150, # 'I' - 74: 151, # 'J' - 75: 152, # 'K' - 76: 74, # 'L' - 77: 153, # 'M' - 78: 75, # 'N' - 79: 154, # 'O' - 80: 155, # 'P' - 81: 156, # 'Q' - 82: 157, # 'R' - 83: 158, # 'S' - 84: 159, # 'T' - 85: 160, # 'U' - 86: 161, # 'V' - 87: 162, # 'W' - 88: 163, # 'X' - 89: 164, # 'Y' - 90: 165, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 71, # 'a' - 98: 172, # 'b' - 99: 66, # 'c' - 100: 173, # 'd' - 101: 65, # 'e' - 102: 174, # 'f' - 103: 76, # 'g' - 104: 175, # 'h' - 105: 64, # 'i' - 106: 176, # 'j' - 107: 177, # 'k' - 108: 77, # 'l' - 109: 72, # 'm' - 110: 178, # 'n' - 111: 69, # 'o' - 112: 67, # 'p' - 113: 179, # 'q' - 114: 78, # 'r' - 115: 73, # 's' - 116: 180, # 't' - 117: 181, # 'u' - 118: 79, # 'v' - 119: 182, # 'w' - 120: 183, # 'x' - 121: 184, # 'y' - 122: 185, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 37, # 'А' - 129: 44, # 'Б' - 130: 33, # 'В' - 131: 46, # 'Г' - 132: 41, # 'Д' - 133: 48, # 'Е' - 134: 56, # 'Ж' - 135: 51, # 'З' - 136: 42, # 'И' - 137: 60, # 'Й' - 138: 36, # 'К' - 139: 49, # 'Л' - 140: 38, # 'М' - 141: 31, # 'Н' - 142: 34, # 'О' - 143: 35, # 'П' - 144: 45, # 'Р' - 145: 32, # 'С' - 146: 40, # 'Т' - 147: 52, # 'У' - 148: 53, # 'Ф' - 149: 55, # 'Х' - 150: 58, # 'Ц' - 151: 50, # 'Ч' - 152: 57, # 'Ш' - 153: 63, # 'Щ' - 154: 70, # 'Ъ' - 155: 62, # 'Ы' - 156: 61, # 'Ь' - 157: 47, # 'Э' - 158: 59, # 'Ю' - 159: 43, # 'Я' - 160: 3, # 'а' - 161: 21, # 'б' - 162: 10, # 'в' - 163: 19, # 'г' - 164: 13, # 'д' - 165: 2, # 'е' - 166: 24, # 'ж' - 167: 20, # 'з' - 168: 4, # 'и' - 169: 23, # 'й' - 170: 11, # 'к' - 171: 8, # 'л' - 172: 12, # 'м' - 173: 5, # 'н' - 174: 1, # 'о' - 175: 15, # 'п' - 176: 191, # '░' - 177: 192, # '▒' - 178: 193, # '▓' - 179: 194, # '│' - 180: 195, # '┤' - 181: 196, # '╡' - 182: 197, # '╢' - 183: 198, # '╖' - 184: 199, # '╕' - 185: 200, # '╣' - 186: 201, # '║' - 187: 202, # '╗' - 188: 203, # '╝' - 189: 204, # '╜' - 190: 205, # '╛' - 191: 206, # '┐' - 192: 207, # '└' - 193: 208, # '┴' - 194: 209, # '┬' - 195: 210, # '├' - 196: 211, # '─' - 197: 212, # '┼' - 198: 213, # '╞' - 199: 214, # '╟' - 200: 215, # '╚' - 201: 216, # '╔' - 202: 217, # '╩' - 203: 218, # '╦' - 204: 219, # '╠' - 205: 220, # '═' - 206: 221, # '╬' - 207: 222, # '╧' - 208: 223, # '╨' - 209: 224, # '╤' - 210: 225, # '╥' - 211: 226, # '╙' - 212: 227, # '╘' - 213: 228, # '╒' - 214: 229, # '╓' - 215: 230, # '╫' - 216: 231, # '╪' - 217: 232, # '┘' - 218: 233, # '┌' - 219: 234, # '█' - 220: 235, # '▄' - 221: 236, # '▌' - 222: 237, # '▐' - 223: 238, # '▀' - 224: 9, # 'р' - 225: 7, # 'с' - 226: 6, # 'т' - 227: 14, # 'у' - 228: 39, # 'ф' - 229: 26, # 'х' - 230: 28, # 'ц' - 231: 22, # 'ч' - 232: 25, # 'ш' - 233: 29, # 'щ' - 234: 54, # 'ъ' - 235: 18, # 'ы' - 236: 17, # 'ь' - 237: 30, # 'э' - 238: 27, # 'ю' - 239: 16, # 'я' - 240: 239, # 'Ё' - 241: 68, # 'ё' - 242: 240, # 'Є' - 243: 241, # 'є' - 244: 242, # 'Ї' - 245: 243, # 'ї' - 246: 244, # 'Ў' - 247: 245, # 'ў' - 248: 246, # '°' - 249: 247, # '∙' - 250: 248, # '·' - 251: 249, # '√' - 252: 250, # '№' - 253: 251, # '¤' - 254: 252, # '■' - 255: 255, # '\xa0' + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 142, # 'A' + 66: 143, # 'B' + 67: 144, # 'C' + 68: 145, # 'D' + 69: 146, # 'E' + 70: 147, # 'F' + 71: 148, # 'G' + 72: 149, # 'H' + 73: 150, # 'I' + 74: 151, # 'J' + 75: 152, # 'K' + 76: 74, # 'L' + 77: 153, # 'M' + 78: 75, # 'N' + 79: 154, # 'O' + 80: 155, # 'P' + 81: 156, # 'Q' + 82: 157, # 'R' + 83: 158, # 'S' + 84: 159, # 'T' + 85: 160, # 'U' + 86: 161, # 'V' + 87: 162, # 'W' + 88: 163, # 'X' + 89: 164, # 'Y' + 90: 165, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 71, # 'a' + 98: 172, # 'b' + 99: 66, # 'c' + 100: 173, # 'd' + 101: 65, # 'e' + 102: 174, # 'f' + 103: 76, # 'g' + 104: 175, # 'h' + 105: 64, # 'i' + 106: 176, # 'j' + 107: 177, # 'k' + 108: 77, # 'l' + 109: 72, # 'm' + 110: 178, # 'n' + 111: 69, # 'o' + 112: 67, # 'p' + 113: 179, # 'q' + 114: 78, # 'r' + 115: 73, # 's' + 116: 180, # 't' + 117: 181, # 'u' + 118: 79, # 'v' + 119: 182, # 'w' + 120: 183, # 'x' + 121: 184, # 'y' + 122: 185, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 37, # 'А' + 129: 44, # 'Б' + 130: 33, # 'В' + 131: 46, # 'Г' + 132: 41, # 'Д' + 133: 48, # 'Е' + 134: 56, # 'Ж' + 135: 51, # 'З' + 136: 42, # 'И' + 137: 60, # 'Й' + 138: 36, # 'К' + 139: 49, # 'Л' + 140: 38, # 'М' + 141: 31, # 'Н' + 142: 34, # 'О' + 143: 35, # 'П' + 144: 45, # 'Р' + 145: 32, # 'С' + 146: 40, # 'Т' + 147: 52, # 'У' + 148: 53, # 'Ф' + 149: 55, # 'Х' + 150: 58, # 'Ц' + 151: 50, # 'Ч' + 152: 57, # 'Ш' + 153: 63, # 'Щ' + 154: 70, # 'Ъ' + 155: 62, # 'Ы' + 156: 61, # 'Ь' + 157: 47, # 'Э' + 158: 59, # 'Ю' + 159: 43, # 'Я' + 160: 3, # 'а' + 161: 21, # 'б' + 162: 10, # 'в' + 163: 19, # 'г' + 164: 13, # 'д' + 165: 2, # 'е' + 166: 24, # 'ж' + 167: 20, # 'з' + 168: 4, # 'и' + 169: 23, # 'й' + 170: 11, # 'к' + 171: 8, # 'л' + 172: 12, # 'м' + 173: 5, # 'н' + 174: 1, # 'о' + 175: 15, # 'п' + 176: 191, # '░' + 177: 192, # '▒' + 178: 193, # '▓' + 179: 194, # '│' + 180: 195, # '┤' + 181: 196, # '╡' + 182: 197, # '╢' + 183: 198, # '╖' + 184: 199, # '╕' + 185: 200, # '╣' + 186: 201, # '║' + 187: 202, # '╗' + 188: 203, # '╝' + 189: 204, # '╜' + 190: 205, # '╛' + 191: 206, # '┐' + 192: 207, # '└' + 193: 208, # '┴' + 194: 209, # '┬' + 195: 210, # '├' + 196: 211, # '─' + 197: 212, # '┼' + 198: 213, # '╞' + 199: 214, # '╟' + 200: 215, # '╚' + 201: 216, # '╔' + 202: 217, # '╩' + 203: 218, # '╦' + 204: 219, # '╠' + 205: 220, # '═' + 206: 221, # '╬' + 207: 222, # '╧' + 208: 223, # '╨' + 209: 224, # '╤' + 210: 225, # '╥' + 211: 226, # '╙' + 212: 227, # '╘' + 213: 228, # '╒' + 214: 229, # '╓' + 215: 230, # '╫' + 216: 231, # '╪' + 217: 232, # '┘' + 218: 233, # '┌' + 219: 234, # '█' + 220: 235, # '▄' + 221: 236, # '▌' + 222: 237, # '▐' + 223: 238, # '▀' + 224: 9, # 'р' + 225: 7, # 'с' + 226: 6, # 'т' + 227: 14, # 'у' + 228: 39, # 'ф' + 229: 26, # 'х' + 230: 28, # 'ц' + 231: 22, # 'ч' + 232: 25, # 'ш' + 233: 29, # 'щ' + 234: 54, # 'ъ' + 235: 18, # 'ы' + 236: 17, # 'ь' + 237: 30, # 'э' + 238: 27, # 'ю' + 239: 16, # 'я' + 240: 239, # 'Ё' + 241: 68, # 'ё' + 242: 240, # 'Є' + 243: 241, # 'є' + 244: 242, # 'Ї' + 245: 243, # 'ї' + 246: 244, # 'Ў' + 247: 245, # 'ў' + 248: 246, # '°' + 249: 247, # '∙' + 250: 248, # '·' + 251: 249, # '√' + 252: 250, # '№' + 253: 251, # '¤' + 254: 252, # '■' + 255: 255, # '\xa0' } -IBM866_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='IBM866', - language='Russian', - char_to_order_map=IBM866_RUSSIAN_CHAR_TO_ORDER, - language_model=RUSSIAN_LANG_MODEL, - typical_positive_ratio=0.976601, - keep_ascii_letters=False, - alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё') +IBM866_RUSSIAN_MODEL = SingleByteCharSetModel( + charset_name="IBM866", + language="Russian", + char_to_order_map=IBM866_RUSSIAN_CHAR_TO_ORDER, + language_model=RUSSIAN_LANG_MODEL, + typical_positive_ratio=0.976601, + keep_ascii_letters=False, + alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё", +) WINDOWS_1251_RUSSIAN_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 142, # 'A' - 66: 143, # 'B' - 67: 144, # 'C' - 68: 145, # 'D' - 69: 146, # 'E' - 70: 147, # 'F' - 71: 148, # 'G' - 72: 149, # 'H' - 73: 150, # 'I' - 74: 151, # 'J' - 75: 152, # 'K' - 76: 74, # 'L' - 77: 153, # 'M' - 78: 75, # 'N' - 79: 154, # 'O' - 80: 155, # 'P' - 81: 156, # 'Q' - 82: 157, # 'R' - 83: 158, # 'S' - 84: 159, # 'T' - 85: 160, # 'U' - 86: 161, # 'V' - 87: 162, # 'W' - 88: 163, # 'X' - 89: 164, # 'Y' - 90: 165, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 71, # 'a' - 98: 172, # 'b' - 99: 66, # 'c' - 100: 173, # 'd' - 101: 65, # 'e' - 102: 174, # 'f' - 103: 76, # 'g' - 104: 175, # 'h' - 105: 64, # 'i' - 106: 176, # 'j' - 107: 177, # 'k' - 108: 77, # 'l' - 109: 72, # 'm' - 110: 178, # 'n' - 111: 69, # 'o' - 112: 67, # 'p' - 113: 179, # 'q' - 114: 78, # 'r' - 115: 73, # 's' - 116: 180, # 't' - 117: 181, # 'u' - 118: 79, # 'v' - 119: 182, # 'w' - 120: 183, # 'x' - 121: 184, # 'y' - 122: 185, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 191, # 'Ђ' - 129: 192, # 'Ѓ' - 130: 193, # '‚' - 131: 194, # 'ѓ' - 132: 195, # '„' - 133: 196, # '…' - 134: 197, # '†' - 135: 198, # '‡' - 136: 199, # '€' - 137: 200, # '‰' - 138: 201, # 'Љ' - 139: 202, # '‹' - 140: 203, # 'Њ' - 141: 204, # 'Ќ' - 142: 205, # 'Ћ' - 143: 206, # 'Џ' - 144: 207, # 'ђ' - 145: 208, # '‘' - 146: 209, # '’' - 147: 210, # '“' - 148: 211, # '”' - 149: 212, # '•' - 150: 213, # '–' - 151: 214, # '—' - 152: 215, # None - 153: 216, # '™' - 154: 217, # 'љ' - 155: 218, # '›' - 156: 219, # 'њ' - 157: 220, # 'ќ' - 158: 221, # 'ћ' - 159: 222, # 'џ' - 160: 223, # '\xa0' - 161: 224, # 'Ў' - 162: 225, # 'ў' - 163: 226, # 'Ј' - 164: 227, # '¤' - 165: 228, # 'Ґ' - 166: 229, # '¦' - 167: 230, # '§' - 168: 231, # 'Ё' - 169: 232, # '©' - 170: 233, # 'Є' - 171: 234, # '«' - 172: 235, # '¬' - 173: 236, # '\xad' - 174: 237, # '®' - 175: 238, # 'Ї' - 176: 239, # '°' - 177: 240, # '±' - 178: 241, # 'І' - 179: 242, # 'і' - 180: 243, # 'ґ' - 181: 244, # 'µ' - 182: 245, # '¶' - 183: 246, # '·' - 184: 68, # 'ё' - 185: 247, # '№' - 186: 248, # 'є' - 187: 249, # '»' - 188: 250, # 'ј' - 189: 251, # 'Ѕ' - 190: 252, # 'ѕ' - 191: 253, # 'ї' - 192: 37, # 'А' - 193: 44, # 'Б' - 194: 33, # 'В' - 195: 46, # 'Г' - 196: 41, # 'Д' - 197: 48, # 'Е' - 198: 56, # 'Ж' - 199: 51, # 'З' - 200: 42, # 'И' - 201: 60, # 'Й' - 202: 36, # 'К' - 203: 49, # 'Л' - 204: 38, # 'М' - 205: 31, # 'Н' - 206: 34, # 'О' - 207: 35, # 'П' - 208: 45, # 'Р' - 209: 32, # 'С' - 210: 40, # 'Т' - 211: 52, # 'У' - 212: 53, # 'Ф' - 213: 55, # 'Х' - 214: 58, # 'Ц' - 215: 50, # 'Ч' - 216: 57, # 'Ш' - 217: 63, # 'Щ' - 218: 70, # 'Ъ' - 219: 62, # 'Ы' - 220: 61, # 'Ь' - 221: 47, # 'Э' - 222: 59, # 'Ю' - 223: 43, # 'Я' - 224: 3, # 'а' - 225: 21, # 'б' - 226: 10, # 'в' - 227: 19, # 'г' - 228: 13, # 'д' - 229: 2, # 'е' - 230: 24, # 'ж' - 231: 20, # 'з' - 232: 4, # 'и' - 233: 23, # 'й' - 234: 11, # 'к' - 235: 8, # 'л' - 236: 12, # 'м' - 237: 5, # 'н' - 238: 1, # 'о' - 239: 15, # 'п' - 240: 9, # 'р' - 241: 7, # 'с' - 242: 6, # 'т' - 243: 14, # 'у' - 244: 39, # 'ф' - 245: 26, # 'х' - 246: 28, # 'ц' - 247: 22, # 'ч' - 248: 25, # 'ш' - 249: 29, # 'щ' - 250: 54, # 'ъ' - 251: 18, # 'ы' - 252: 17, # 'ь' - 253: 30, # 'э' - 254: 27, # 'ю' - 255: 16, # 'я' + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 142, # 'A' + 66: 143, # 'B' + 67: 144, # 'C' + 68: 145, # 'D' + 69: 146, # 'E' + 70: 147, # 'F' + 71: 148, # 'G' + 72: 149, # 'H' + 73: 150, # 'I' + 74: 151, # 'J' + 75: 152, # 'K' + 76: 74, # 'L' + 77: 153, # 'M' + 78: 75, # 'N' + 79: 154, # 'O' + 80: 155, # 'P' + 81: 156, # 'Q' + 82: 157, # 'R' + 83: 158, # 'S' + 84: 159, # 'T' + 85: 160, # 'U' + 86: 161, # 'V' + 87: 162, # 'W' + 88: 163, # 'X' + 89: 164, # 'Y' + 90: 165, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 71, # 'a' + 98: 172, # 'b' + 99: 66, # 'c' + 100: 173, # 'd' + 101: 65, # 'e' + 102: 174, # 'f' + 103: 76, # 'g' + 104: 175, # 'h' + 105: 64, # 'i' + 106: 176, # 'j' + 107: 177, # 'k' + 108: 77, # 'l' + 109: 72, # 'm' + 110: 178, # 'n' + 111: 69, # 'o' + 112: 67, # 'p' + 113: 179, # 'q' + 114: 78, # 'r' + 115: 73, # 's' + 116: 180, # 't' + 117: 181, # 'u' + 118: 79, # 'v' + 119: 182, # 'w' + 120: 183, # 'x' + 121: 184, # 'y' + 122: 185, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 191, # 'Ђ' + 129: 192, # 'Ѓ' + 130: 193, # '‚' + 131: 194, # 'ѓ' + 132: 195, # '„' + 133: 196, # '…' + 134: 197, # '†' + 135: 198, # '‡' + 136: 199, # '€' + 137: 200, # '‰' + 138: 201, # 'Љ' + 139: 202, # '‹' + 140: 203, # 'Њ' + 141: 204, # 'Ќ' + 142: 205, # 'Ћ' + 143: 206, # 'Џ' + 144: 207, # 'ђ' + 145: 208, # '‘' + 146: 209, # '’' + 147: 210, # '“' + 148: 211, # '”' + 149: 212, # '•' + 150: 213, # '–' + 151: 214, # '—' + 152: 215, # None + 153: 216, # '™' + 154: 217, # 'љ' + 155: 218, # '›' + 156: 219, # 'њ' + 157: 220, # 'ќ' + 158: 221, # 'ћ' + 159: 222, # 'џ' + 160: 223, # '\xa0' + 161: 224, # 'Ў' + 162: 225, # 'ў' + 163: 226, # 'Ј' + 164: 227, # '¤' + 165: 228, # 'Ґ' + 166: 229, # '¦' + 167: 230, # '§' + 168: 231, # 'Ё' + 169: 232, # '©' + 170: 233, # 'Є' + 171: 234, # '«' + 172: 235, # '¬' + 173: 236, # '\xad' + 174: 237, # '®' + 175: 238, # 'Ї' + 176: 239, # '°' + 177: 240, # '±' + 178: 241, # 'І' + 179: 242, # 'і' + 180: 243, # 'ґ' + 181: 244, # 'µ' + 182: 245, # '¶' + 183: 246, # '·' + 184: 68, # 'ё' + 185: 247, # '№' + 186: 248, # 'є' + 187: 249, # '»' + 188: 250, # 'ј' + 189: 251, # 'Ѕ' + 190: 252, # 'ѕ' + 191: 253, # 'ї' + 192: 37, # 'А' + 193: 44, # 'Б' + 194: 33, # 'В' + 195: 46, # 'Г' + 196: 41, # 'Д' + 197: 48, # 'Е' + 198: 56, # 'Ж' + 199: 51, # 'З' + 200: 42, # 'И' + 201: 60, # 'Й' + 202: 36, # 'К' + 203: 49, # 'Л' + 204: 38, # 'М' + 205: 31, # 'Н' + 206: 34, # 'О' + 207: 35, # 'П' + 208: 45, # 'Р' + 209: 32, # 'С' + 210: 40, # 'Т' + 211: 52, # 'У' + 212: 53, # 'Ф' + 213: 55, # 'Х' + 214: 58, # 'Ц' + 215: 50, # 'Ч' + 216: 57, # 'Ш' + 217: 63, # 'Щ' + 218: 70, # 'Ъ' + 219: 62, # 'Ы' + 220: 61, # 'Ь' + 221: 47, # 'Э' + 222: 59, # 'Ю' + 223: 43, # 'Я' + 224: 3, # 'а' + 225: 21, # 'б' + 226: 10, # 'в' + 227: 19, # 'г' + 228: 13, # 'д' + 229: 2, # 'е' + 230: 24, # 'ж' + 231: 20, # 'з' + 232: 4, # 'и' + 233: 23, # 'й' + 234: 11, # 'к' + 235: 8, # 'л' + 236: 12, # 'м' + 237: 5, # 'н' + 238: 1, # 'о' + 239: 15, # 'п' + 240: 9, # 'р' + 241: 7, # 'с' + 242: 6, # 'т' + 243: 14, # 'у' + 244: 39, # 'ф' + 245: 26, # 'х' + 246: 28, # 'ц' + 247: 22, # 'ч' + 248: 25, # 'ш' + 249: 29, # 'щ' + 250: 54, # 'ъ' + 251: 18, # 'ы' + 252: 17, # 'ь' + 253: 30, # 'э' + 254: 27, # 'ю' + 255: 16, # 'я' } -WINDOWS_1251_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='windows-1251', - language='Russian', - char_to_order_map=WINDOWS_1251_RUSSIAN_CHAR_TO_ORDER, - language_model=RUSSIAN_LANG_MODEL, - typical_positive_ratio=0.976601, - keep_ascii_letters=False, - alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё') +WINDOWS_1251_RUSSIAN_MODEL = SingleByteCharSetModel( + charset_name="windows-1251", + language="Russian", + char_to_order_map=WINDOWS_1251_RUSSIAN_CHAR_TO_ORDER, + language_model=RUSSIAN_LANG_MODEL, + typical_positive_ratio=0.976601, + keep_ascii_letters=False, + alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё", +) IBM855_RUSSIAN_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 142, # 'A' - 66: 143, # 'B' - 67: 144, # 'C' - 68: 145, # 'D' - 69: 146, # 'E' - 70: 147, # 'F' - 71: 148, # 'G' - 72: 149, # 'H' - 73: 150, # 'I' - 74: 151, # 'J' - 75: 152, # 'K' - 76: 74, # 'L' - 77: 153, # 'M' - 78: 75, # 'N' - 79: 154, # 'O' - 80: 155, # 'P' - 81: 156, # 'Q' - 82: 157, # 'R' - 83: 158, # 'S' - 84: 159, # 'T' - 85: 160, # 'U' - 86: 161, # 'V' - 87: 162, # 'W' - 88: 163, # 'X' - 89: 164, # 'Y' - 90: 165, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 71, # 'a' - 98: 172, # 'b' - 99: 66, # 'c' - 100: 173, # 'd' - 101: 65, # 'e' - 102: 174, # 'f' - 103: 76, # 'g' - 104: 175, # 'h' - 105: 64, # 'i' - 106: 176, # 'j' - 107: 177, # 'k' - 108: 77, # 'l' - 109: 72, # 'm' - 110: 178, # 'n' - 111: 69, # 'o' - 112: 67, # 'p' - 113: 179, # 'q' - 114: 78, # 'r' - 115: 73, # 's' - 116: 180, # 't' - 117: 181, # 'u' - 118: 79, # 'v' - 119: 182, # 'w' - 120: 183, # 'x' - 121: 184, # 'y' - 122: 185, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 191, # 'ђ' - 129: 192, # 'Ђ' - 130: 193, # 'ѓ' - 131: 194, # 'Ѓ' - 132: 68, # 'ё' - 133: 195, # 'Ё' - 134: 196, # 'є' - 135: 197, # 'Є' - 136: 198, # 'ѕ' - 137: 199, # 'Ѕ' - 138: 200, # 'і' - 139: 201, # 'І' - 140: 202, # 'ї' - 141: 203, # 'Ї' - 142: 204, # 'ј' - 143: 205, # 'Ј' - 144: 206, # 'љ' - 145: 207, # 'Љ' - 146: 208, # 'њ' - 147: 209, # 'Њ' - 148: 210, # 'ћ' - 149: 211, # 'Ћ' - 150: 212, # 'ќ' - 151: 213, # 'Ќ' - 152: 214, # 'ў' - 153: 215, # 'Ў' - 154: 216, # 'џ' - 155: 217, # 'Џ' - 156: 27, # 'ю' - 157: 59, # 'Ю' - 158: 54, # 'ъ' - 159: 70, # 'Ъ' - 160: 3, # 'а' - 161: 37, # 'А' - 162: 21, # 'б' - 163: 44, # 'Б' - 164: 28, # 'ц' - 165: 58, # 'Ц' - 166: 13, # 'д' - 167: 41, # 'Д' - 168: 2, # 'е' - 169: 48, # 'Е' - 170: 39, # 'ф' - 171: 53, # 'Ф' - 172: 19, # 'г' - 173: 46, # 'Г' - 174: 218, # '«' - 175: 219, # '»' - 176: 220, # '░' - 177: 221, # '▒' - 178: 222, # '▓' - 179: 223, # '│' - 180: 224, # '┤' - 181: 26, # 'х' - 182: 55, # 'Х' - 183: 4, # 'и' - 184: 42, # 'И' - 185: 225, # '╣' - 186: 226, # '║' - 187: 227, # '╗' - 188: 228, # '╝' - 189: 23, # 'й' - 190: 60, # 'Й' - 191: 229, # '┐' - 192: 230, # '└' - 193: 231, # '┴' - 194: 232, # '┬' - 195: 233, # '├' - 196: 234, # '─' - 197: 235, # '┼' - 198: 11, # 'к' - 199: 36, # 'К' - 200: 236, # '╚' - 201: 237, # '╔' - 202: 238, # '╩' - 203: 239, # '╦' - 204: 240, # '╠' - 205: 241, # '═' - 206: 242, # '╬' - 207: 243, # '¤' - 208: 8, # 'л' - 209: 49, # 'Л' - 210: 12, # 'м' - 211: 38, # 'М' - 212: 5, # 'н' - 213: 31, # 'Н' - 214: 1, # 'о' - 215: 34, # 'О' - 216: 15, # 'п' - 217: 244, # '┘' - 218: 245, # '┌' - 219: 246, # '█' - 220: 247, # '▄' - 221: 35, # 'П' - 222: 16, # 'я' - 223: 248, # '▀' - 224: 43, # 'Я' - 225: 9, # 'р' - 226: 45, # 'Р' - 227: 7, # 'с' - 228: 32, # 'С' - 229: 6, # 'т' - 230: 40, # 'Т' - 231: 14, # 'у' - 232: 52, # 'У' - 233: 24, # 'ж' - 234: 56, # 'Ж' - 235: 10, # 'в' - 236: 33, # 'В' - 237: 17, # 'ь' - 238: 61, # 'Ь' - 239: 249, # '№' - 240: 250, # '\xad' - 241: 18, # 'ы' - 242: 62, # 'Ы' - 243: 20, # 'з' - 244: 51, # 'З' - 245: 25, # 'ш' - 246: 57, # 'Ш' - 247: 30, # 'э' - 248: 47, # 'Э' - 249: 29, # 'щ' - 250: 63, # 'Щ' - 251: 22, # 'ч' - 252: 50, # 'Ч' - 253: 251, # '§' - 254: 252, # '■' - 255: 255, # '\xa0' + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 142, # 'A' + 66: 143, # 'B' + 67: 144, # 'C' + 68: 145, # 'D' + 69: 146, # 'E' + 70: 147, # 'F' + 71: 148, # 'G' + 72: 149, # 'H' + 73: 150, # 'I' + 74: 151, # 'J' + 75: 152, # 'K' + 76: 74, # 'L' + 77: 153, # 'M' + 78: 75, # 'N' + 79: 154, # 'O' + 80: 155, # 'P' + 81: 156, # 'Q' + 82: 157, # 'R' + 83: 158, # 'S' + 84: 159, # 'T' + 85: 160, # 'U' + 86: 161, # 'V' + 87: 162, # 'W' + 88: 163, # 'X' + 89: 164, # 'Y' + 90: 165, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 71, # 'a' + 98: 172, # 'b' + 99: 66, # 'c' + 100: 173, # 'd' + 101: 65, # 'e' + 102: 174, # 'f' + 103: 76, # 'g' + 104: 175, # 'h' + 105: 64, # 'i' + 106: 176, # 'j' + 107: 177, # 'k' + 108: 77, # 'l' + 109: 72, # 'm' + 110: 178, # 'n' + 111: 69, # 'o' + 112: 67, # 'p' + 113: 179, # 'q' + 114: 78, # 'r' + 115: 73, # 's' + 116: 180, # 't' + 117: 181, # 'u' + 118: 79, # 'v' + 119: 182, # 'w' + 120: 183, # 'x' + 121: 184, # 'y' + 122: 185, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 191, # 'ђ' + 129: 192, # 'Ђ' + 130: 193, # 'ѓ' + 131: 194, # 'Ѓ' + 132: 68, # 'ё' + 133: 195, # 'Ё' + 134: 196, # 'є' + 135: 197, # 'Є' + 136: 198, # 'ѕ' + 137: 199, # 'Ѕ' + 138: 200, # 'і' + 139: 201, # 'І' + 140: 202, # 'ї' + 141: 203, # 'Ї' + 142: 204, # 'ј' + 143: 205, # 'Ј' + 144: 206, # 'љ' + 145: 207, # 'Љ' + 146: 208, # 'њ' + 147: 209, # 'Њ' + 148: 210, # 'ћ' + 149: 211, # 'Ћ' + 150: 212, # 'ќ' + 151: 213, # 'Ќ' + 152: 214, # 'ў' + 153: 215, # 'Ў' + 154: 216, # 'џ' + 155: 217, # 'Џ' + 156: 27, # 'ю' + 157: 59, # 'Ю' + 158: 54, # 'ъ' + 159: 70, # 'Ъ' + 160: 3, # 'а' + 161: 37, # 'А' + 162: 21, # 'б' + 163: 44, # 'Б' + 164: 28, # 'ц' + 165: 58, # 'Ц' + 166: 13, # 'д' + 167: 41, # 'Д' + 168: 2, # 'е' + 169: 48, # 'Е' + 170: 39, # 'ф' + 171: 53, # 'Ф' + 172: 19, # 'г' + 173: 46, # 'Г' + 174: 218, # '«' + 175: 219, # '»' + 176: 220, # '░' + 177: 221, # '▒' + 178: 222, # '▓' + 179: 223, # '│' + 180: 224, # '┤' + 181: 26, # 'х' + 182: 55, # 'Х' + 183: 4, # 'и' + 184: 42, # 'И' + 185: 225, # '╣' + 186: 226, # '║' + 187: 227, # '╗' + 188: 228, # '╝' + 189: 23, # 'й' + 190: 60, # 'Й' + 191: 229, # '┐' + 192: 230, # '└' + 193: 231, # '┴' + 194: 232, # '┬' + 195: 233, # '├' + 196: 234, # '─' + 197: 235, # '┼' + 198: 11, # 'к' + 199: 36, # 'К' + 200: 236, # '╚' + 201: 237, # '╔' + 202: 238, # '╩' + 203: 239, # '╦' + 204: 240, # '╠' + 205: 241, # '═' + 206: 242, # '╬' + 207: 243, # '¤' + 208: 8, # 'л' + 209: 49, # 'Л' + 210: 12, # 'м' + 211: 38, # 'М' + 212: 5, # 'н' + 213: 31, # 'Н' + 214: 1, # 'о' + 215: 34, # 'О' + 216: 15, # 'п' + 217: 244, # '┘' + 218: 245, # '┌' + 219: 246, # '█' + 220: 247, # '▄' + 221: 35, # 'П' + 222: 16, # 'я' + 223: 248, # '▀' + 224: 43, # 'Я' + 225: 9, # 'р' + 226: 45, # 'Р' + 227: 7, # 'с' + 228: 32, # 'С' + 229: 6, # 'т' + 230: 40, # 'Т' + 231: 14, # 'у' + 232: 52, # 'У' + 233: 24, # 'ж' + 234: 56, # 'Ж' + 235: 10, # 'в' + 236: 33, # 'В' + 237: 17, # 'ь' + 238: 61, # 'Ь' + 239: 249, # '№' + 240: 250, # '\xad' + 241: 18, # 'ы' + 242: 62, # 'Ы' + 243: 20, # 'з' + 244: 51, # 'З' + 245: 25, # 'ш' + 246: 57, # 'Ш' + 247: 30, # 'э' + 248: 47, # 'Э' + 249: 29, # 'щ' + 250: 63, # 'Щ' + 251: 22, # 'ч' + 252: 50, # 'Ч' + 253: 251, # '§' + 254: 252, # '■' + 255: 255, # '\xa0' } -IBM855_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='IBM855', - language='Russian', - char_to_order_map=IBM855_RUSSIAN_CHAR_TO_ORDER, - language_model=RUSSIAN_LANG_MODEL, - typical_positive_ratio=0.976601, - keep_ascii_letters=False, - alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё') +IBM855_RUSSIAN_MODEL = SingleByteCharSetModel( + charset_name="IBM855", + language="Russian", + char_to_order_map=IBM855_RUSSIAN_CHAR_TO_ORDER, + language_model=RUSSIAN_LANG_MODEL, + typical_positive_ratio=0.976601, + keep_ascii_letters=False, + alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё", +) KOI8_R_RUSSIAN_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 142, # 'A' - 66: 143, # 'B' - 67: 144, # 'C' - 68: 145, # 'D' - 69: 146, # 'E' - 70: 147, # 'F' - 71: 148, # 'G' - 72: 149, # 'H' - 73: 150, # 'I' - 74: 151, # 'J' - 75: 152, # 'K' - 76: 74, # 'L' - 77: 153, # 'M' - 78: 75, # 'N' - 79: 154, # 'O' - 80: 155, # 'P' - 81: 156, # 'Q' - 82: 157, # 'R' - 83: 158, # 'S' - 84: 159, # 'T' - 85: 160, # 'U' - 86: 161, # 'V' - 87: 162, # 'W' - 88: 163, # 'X' - 89: 164, # 'Y' - 90: 165, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 71, # 'a' - 98: 172, # 'b' - 99: 66, # 'c' - 100: 173, # 'd' - 101: 65, # 'e' - 102: 174, # 'f' - 103: 76, # 'g' - 104: 175, # 'h' - 105: 64, # 'i' - 106: 176, # 'j' - 107: 177, # 'k' - 108: 77, # 'l' - 109: 72, # 'm' - 110: 178, # 'n' - 111: 69, # 'o' - 112: 67, # 'p' - 113: 179, # 'q' - 114: 78, # 'r' - 115: 73, # 's' - 116: 180, # 't' - 117: 181, # 'u' - 118: 79, # 'v' - 119: 182, # 'w' - 120: 183, # 'x' - 121: 184, # 'y' - 122: 185, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 191, # '─' - 129: 192, # '│' - 130: 193, # '┌' - 131: 194, # '┐' - 132: 195, # '└' - 133: 196, # '┘' - 134: 197, # '├' - 135: 198, # '┤' - 136: 199, # '┬' - 137: 200, # '┴' - 138: 201, # '┼' - 139: 202, # '▀' - 140: 203, # '▄' - 141: 204, # '█' - 142: 205, # '▌' - 143: 206, # '▐' - 144: 207, # '░' - 145: 208, # '▒' - 146: 209, # '▓' - 147: 210, # '⌠' - 148: 211, # '■' - 149: 212, # '∙' - 150: 213, # '√' - 151: 214, # '≈' - 152: 215, # '≤' - 153: 216, # '≥' - 154: 217, # '\xa0' - 155: 218, # '⌡' - 156: 219, # '°' - 157: 220, # '²' - 158: 221, # '·' - 159: 222, # '÷' - 160: 223, # '═' - 161: 224, # '║' - 162: 225, # '╒' - 163: 68, # 'ё' - 164: 226, # '╓' - 165: 227, # '╔' - 166: 228, # '╕' - 167: 229, # '╖' - 168: 230, # '╗' - 169: 231, # '╘' - 170: 232, # '╙' - 171: 233, # '╚' - 172: 234, # '╛' - 173: 235, # '╜' - 174: 236, # '╝' - 175: 237, # '╞' - 176: 238, # '╟' - 177: 239, # '╠' - 178: 240, # '╡' - 179: 241, # 'Ё' - 180: 242, # '╢' - 181: 243, # '╣' - 182: 244, # '╤' - 183: 245, # '╥' - 184: 246, # '╦' - 185: 247, # '╧' - 186: 248, # '╨' - 187: 249, # '╩' - 188: 250, # '╪' - 189: 251, # '╫' - 190: 252, # '╬' - 191: 253, # '©' - 192: 27, # 'ю' - 193: 3, # 'а' - 194: 21, # 'б' - 195: 28, # 'ц' - 196: 13, # 'д' - 197: 2, # 'е' - 198: 39, # 'ф' - 199: 19, # 'г' - 200: 26, # 'х' - 201: 4, # 'и' - 202: 23, # 'й' - 203: 11, # 'к' - 204: 8, # 'л' - 205: 12, # 'м' - 206: 5, # 'н' - 207: 1, # 'о' - 208: 15, # 'п' - 209: 16, # 'я' - 210: 9, # 'р' - 211: 7, # 'с' - 212: 6, # 'т' - 213: 14, # 'у' - 214: 24, # 'ж' - 215: 10, # 'в' - 216: 17, # 'ь' - 217: 18, # 'ы' - 218: 20, # 'з' - 219: 25, # 'ш' - 220: 30, # 'э' - 221: 29, # 'щ' - 222: 22, # 'ч' - 223: 54, # 'ъ' - 224: 59, # 'Ю' - 225: 37, # 'А' - 226: 44, # 'Б' - 227: 58, # 'Ц' - 228: 41, # 'Д' - 229: 48, # 'Е' - 230: 53, # 'Ф' - 231: 46, # 'Г' - 232: 55, # 'Х' - 233: 42, # 'И' - 234: 60, # 'Й' - 235: 36, # 'К' - 236: 49, # 'Л' - 237: 38, # 'М' - 238: 31, # 'Н' - 239: 34, # 'О' - 240: 35, # 'П' - 241: 43, # 'Я' - 242: 45, # 'Р' - 243: 32, # 'С' - 244: 40, # 'Т' - 245: 52, # 'У' - 246: 56, # 'Ж' - 247: 33, # 'В' - 248: 61, # 'Ь' - 249: 62, # 'Ы' - 250: 51, # 'З' - 251: 57, # 'Ш' - 252: 47, # 'Э' - 253: 63, # 'Щ' - 254: 50, # 'Ч' - 255: 70, # 'Ъ' + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 142, # 'A' + 66: 143, # 'B' + 67: 144, # 'C' + 68: 145, # 'D' + 69: 146, # 'E' + 70: 147, # 'F' + 71: 148, # 'G' + 72: 149, # 'H' + 73: 150, # 'I' + 74: 151, # 'J' + 75: 152, # 'K' + 76: 74, # 'L' + 77: 153, # 'M' + 78: 75, # 'N' + 79: 154, # 'O' + 80: 155, # 'P' + 81: 156, # 'Q' + 82: 157, # 'R' + 83: 158, # 'S' + 84: 159, # 'T' + 85: 160, # 'U' + 86: 161, # 'V' + 87: 162, # 'W' + 88: 163, # 'X' + 89: 164, # 'Y' + 90: 165, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 71, # 'a' + 98: 172, # 'b' + 99: 66, # 'c' + 100: 173, # 'd' + 101: 65, # 'e' + 102: 174, # 'f' + 103: 76, # 'g' + 104: 175, # 'h' + 105: 64, # 'i' + 106: 176, # 'j' + 107: 177, # 'k' + 108: 77, # 'l' + 109: 72, # 'm' + 110: 178, # 'n' + 111: 69, # 'o' + 112: 67, # 'p' + 113: 179, # 'q' + 114: 78, # 'r' + 115: 73, # 's' + 116: 180, # 't' + 117: 181, # 'u' + 118: 79, # 'v' + 119: 182, # 'w' + 120: 183, # 'x' + 121: 184, # 'y' + 122: 185, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 191, # '─' + 129: 192, # '│' + 130: 193, # '┌' + 131: 194, # '┐' + 132: 195, # '└' + 133: 196, # '┘' + 134: 197, # '├' + 135: 198, # '┤' + 136: 199, # '┬' + 137: 200, # '┴' + 138: 201, # '┼' + 139: 202, # '▀' + 140: 203, # '▄' + 141: 204, # '█' + 142: 205, # '▌' + 143: 206, # '▐' + 144: 207, # '░' + 145: 208, # '▒' + 146: 209, # '▓' + 147: 210, # '⌠' + 148: 211, # '■' + 149: 212, # '∙' + 150: 213, # '√' + 151: 214, # '≈' + 152: 215, # '≤' + 153: 216, # '≥' + 154: 217, # '\xa0' + 155: 218, # '⌡' + 156: 219, # '°' + 157: 220, # '²' + 158: 221, # '·' + 159: 222, # '÷' + 160: 223, # '═' + 161: 224, # '║' + 162: 225, # '╒' + 163: 68, # 'ё' + 164: 226, # '╓' + 165: 227, # '╔' + 166: 228, # '╕' + 167: 229, # '╖' + 168: 230, # '╗' + 169: 231, # '╘' + 170: 232, # '╙' + 171: 233, # '╚' + 172: 234, # '╛' + 173: 235, # '╜' + 174: 236, # '╝' + 175: 237, # '╞' + 176: 238, # '╟' + 177: 239, # '╠' + 178: 240, # '╡' + 179: 241, # 'Ё' + 180: 242, # '╢' + 181: 243, # '╣' + 182: 244, # '╤' + 183: 245, # '╥' + 184: 246, # '╦' + 185: 247, # '╧' + 186: 248, # '╨' + 187: 249, # '╩' + 188: 250, # '╪' + 189: 251, # '╫' + 190: 252, # '╬' + 191: 253, # '©' + 192: 27, # 'ю' + 193: 3, # 'а' + 194: 21, # 'б' + 195: 28, # 'ц' + 196: 13, # 'д' + 197: 2, # 'е' + 198: 39, # 'ф' + 199: 19, # 'г' + 200: 26, # 'х' + 201: 4, # 'и' + 202: 23, # 'й' + 203: 11, # 'к' + 204: 8, # 'л' + 205: 12, # 'м' + 206: 5, # 'н' + 207: 1, # 'о' + 208: 15, # 'п' + 209: 16, # 'я' + 210: 9, # 'р' + 211: 7, # 'с' + 212: 6, # 'т' + 213: 14, # 'у' + 214: 24, # 'ж' + 215: 10, # 'в' + 216: 17, # 'ь' + 217: 18, # 'ы' + 218: 20, # 'з' + 219: 25, # 'ш' + 220: 30, # 'э' + 221: 29, # 'щ' + 222: 22, # 'ч' + 223: 54, # 'ъ' + 224: 59, # 'Ю' + 225: 37, # 'А' + 226: 44, # 'Б' + 227: 58, # 'Ц' + 228: 41, # 'Д' + 229: 48, # 'Е' + 230: 53, # 'Ф' + 231: 46, # 'Г' + 232: 55, # 'Х' + 233: 42, # 'И' + 234: 60, # 'Й' + 235: 36, # 'К' + 236: 49, # 'Л' + 237: 38, # 'М' + 238: 31, # 'Н' + 239: 34, # 'О' + 240: 35, # 'П' + 241: 43, # 'Я' + 242: 45, # 'Р' + 243: 32, # 'С' + 244: 40, # 'Т' + 245: 52, # 'У' + 246: 56, # 'Ж' + 247: 33, # 'В' + 248: 61, # 'Ь' + 249: 62, # 'Ы' + 250: 51, # 'З' + 251: 57, # 'Ш' + 252: 47, # 'Э' + 253: 63, # 'Щ' + 254: 50, # 'Ч' + 255: 70, # 'Ъ' } -KOI8_R_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='KOI8-R', - language='Russian', - char_to_order_map=KOI8_R_RUSSIAN_CHAR_TO_ORDER, - language_model=RUSSIAN_LANG_MODEL, - typical_positive_ratio=0.976601, - keep_ascii_letters=False, - alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё') +KOI8_R_RUSSIAN_MODEL = SingleByteCharSetModel( + charset_name="KOI8-R", + language="Russian", + char_to_order_map=KOI8_R_RUSSIAN_CHAR_TO_ORDER, + language_model=RUSSIAN_LANG_MODEL, + typical_positive_ratio=0.976601, + keep_ascii_letters=False, + alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё", +) MACCYRILLIC_RUSSIAN_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 142, # 'A' - 66: 143, # 'B' - 67: 144, # 'C' - 68: 145, # 'D' - 69: 146, # 'E' - 70: 147, # 'F' - 71: 148, # 'G' - 72: 149, # 'H' - 73: 150, # 'I' - 74: 151, # 'J' - 75: 152, # 'K' - 76: 74, # 'L' - 77: 153, # 'M' - 78: 75, # 'N' - 79: 154, # 'O' - 80: 155, # 'P' - 81: 156, # 'Q' - 82: 157, # 'R' - 83: 158, # 'S' - 84: 159, # 'T' - 85: 160, # 'U' - 86: 161, # 'V' - 87: 162, # 'W' - 88: 163, # 'X' - 89: 164, # 'Y' - 90: 165, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 71, # 'a' - 98: 172, # 'b' - 99: 66, # 'c' - 100: 173, # 'd' - 101: 65, # 'e' - 102: 174, # 'f' - 103: 76, # 'g' - 104: 175, # 'h' - 105: 64, # 'i' - 106: 176, # 'j' - 107: 177, # 'k' - 108: 77, # 'l' - 109: 72, # 'm' - 110: 178, # 'n' - 111: 69, # 'o' - 112: 67, # 'p' - 113: 179, # 'q' - 114: 78, # 'r' - 115: 73, # 's' - 116: 180, # 't' - 117: 181, # 'u' - 118: 79, # 'v' - 119: 182, # 'w' - 120: 183, # 'x' - 121: 184, # 'y' - 122: 185, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 37, # 'А' - 129: 44, # 'Б' - 130: 33, # 'В' - 131: 46, # 'Г' - 132: 41, # 'Д' - 133: 48, # 'Е' - 134: 56, # 'Ж' - 135: 51, # 'З' - 136: 42, # 'И' - 137: 60, # 'Й' - 138: 36, # 'К' - 139: 49, # 'Л' - 140: 38, # 'М' - 141: 31, # 'Н' - 142: 34, # 'О' - 143: 35, # 'П' - 144: 45, # 'Р' - 145: 32, # 'С' - 146: 40, # 'Т' - 147: 52, # 'У' - 148: 53, # 'Ф' - 149: 55, # 'Х' - 150: 58, # 'Ц' - 151: 50, # 'Ч' - 152: 57, # 'Ш' - 153: 63, # 'Щ' - 154: 70, # 'Ъ' - 155: 62, # 'Ы' - 156: 61, # 'Ь' - 157: 47, # 'Э' - 158: 59, # 'Ю' - 159: 43, # 'Я' - 160: 191, # '†' - 161: 192, # '°' - 162: 193, # 'Ґ' - 163: 194, # '£' - 164: 195, # '§' - 165: 196, # '•' - 166: 197, # '¶' - 167: 198, # 'І' - 168: 199, # '®' - 169: 200, # '©' - 170: 201, # '™' - 171: 202, # 'Ђ' - 172: 203, # 'ђ' - 173: 204, # '≠' - 174: 205, # 'Ѓ' - 175: 206, # 'ѓ' - 176: 207, # '∞' - 177: 208, # '±' - 178: 209, # '≤' - 179: 210, # '≥' - 180: 211, # 'і' - 181: 212, # 'µ' - 182: 213, # 'ґ' - 183: 214, # 'Ј' - 184: 215, # 'Є' - 185: 216, # 'є' - 186: 217, # 'Ї' - 187: 218, # 'ї' - 188: 219, # 'Љ' - 189: 220, # 'љ' - 190: 221, # 'Њ' - 191: 222, # 'њ' - 192: 223, # 'ј' - 193: 224, # 'Ѕ' - 194: 225, # '¬' - 195: 226, # '√' - 196: 227, # 'ƒ' - 197: 228, # '≈' - 198: 229, # '∆' - 199: 230, # '«' - 200: 231, # '»' - 201: 232, # '…' - 202: 233, # '\xa0' - 203: 234, # 'Ћ' - 204: 235, # 'ћ' - 205: 236, # 'Ќ' - 206: 237, # 'ќ' - 207: 238, # 'ѕ' - 208: 239, # '–' - 209: 240, # '—' - 210: 241, # '“' - 211: 242, # '”' - 212: 243, # '‘' - 213: 244, # '’' - 214: 245, # '÷' - 215: 246, # '„' - 216: 247, # 'Ў' - 217: 248, # 'ў' - 218: 249, # 'Џ' - 219: 250, # 'џ' - 220: 251, # '№' - 221: 252, # 'Ё' - 222: 68, # 'ё' - 223: 16, # 'я' - 224: 3, # 'а' - 225: 21, # 'б' - 226: 10, # 'в' - 227: 19, # 'г' - 228: 13, # 'д' - 229: 2, # 'е' - 230: 24, # 'ж' - 231: 20, # 'з' - 232: 4, # 'и' - 233: 23, # 'й' - 234: 11, # 'к' - 235: 8, # 'л' - 236: 12, # 'м' - 237: 5, # 'н' - 238: 1, # 'о' - 239: 15, # 'п' - 240: 9, # 'р' - 241: 7, # 'с' - 242: 6, # 'т' - 243: 14, # 'у' - 244: 39, # 'ф' - 245: 26, # 'х' - 246: 28, # 'ц' - 247: 22, # 'ч' - 248: 25, # 'ш' - 249: 29, # 'щ' - 250: 54, # 'ъ' - 251: 18, # 'ы' - 252: 17, # 'ь' - 253: 30, # 'э' - 254: 27, # 'ю' - 255: 255, # '€' + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 142, # 'A' + 66: 143, # 'B' + 67: 144, # 'C' + 68: 145, # 'D' + 69: 146, # 'E' + 70: 147, # 'F' + 71: 148, # 'G' + 72: 149, # 'H' + 73: 150, # 'I' + 74: 151, # 'J' + 75: 152, # 'K' + 76: 74, # 'L' + 77: 153, # 'M' + 78: 75, # 'N' + 79: 154, # 'O' + 80: 155, # 'P' + 81: 156, # 'Q' + 82: 157, # 'R' + 83: 158, # 'S' + 84: 159, # 'T' + 85: 160, # 'U' + 86: 161, # 'V' + 87: 162, # 'W' + 88: 163, # 'X' + 89: 164, # 'Y' + 90: 165, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 71, # 'a' + 98: 172, # 'b' + 99: 66, # 'c' + 100: 173, # 'd' + 101: 65, # 'e' + 102: 174, # 'f' + 103: 76, # 'g' + 104: 175, # 'h' + 105: 64, # 'i' + 106: 176, # 'j' + 107: 177, # 'k' + 108: 77, # 'l' + 109: 72, # 'm' + 110: 178, # 'n' + 111: 69, # 'o' + 112: 67, # 'p' + 113: 179, # 'q' + 114: 78, # 'r' + 115: 73, # 's' + 116: 180, # 't' + 117: 181, # 'u' + 118: 79, # 'v' + 119: 182, # 'w' + 120: 183, # 'x' + 121: 184, # 'y' + 122: 185, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 37, # 'А' + 129: 44, # 'Б' + 130: 33, # 'В' + 131: 46, # 'Г' + 132: 41, # 'Д' + 133: 48, # 'Е' + 134: 56, # 'Ж' + 135: 51, # 'З' + 136: 42, # 'И' + 137: 60, # 'Й' + 138: 36, # 'К' + 139: 49, # 'Л' + 140: 38, # 'М' + 141: 31, # 'Н' + 142: 34, # 'О' + 143: 35, # 'П' + 144: 45, # 'Р' + 145: 32, # 'С' + 146: 40, # 'Т' + 147: 52, # 'У' + 148: 53, # 'Ф' + 149: 55, # 'Х' + 150: 58, # 'Ц' + 151: 50, # 'Ч' + 152: 57, # 'Ш' + 153: 63, # 'Щ' + 154: 70, # 'Ъ' + 155: 62, # 'Ы' + 156: 61, # 'Ь' + 157: 47, # 'Э' + 158: 59, # 'Ю' + 159: 43, # 'Я' + 160: 191, # '†' + 161: 192, # '°' + 162: 193, # 'Ґ' + 163: 194, # '£' + 164: 195, # '§' + 165: 196, # '•' + 166: 197, # '¶' + 167: 198, # 'І' + 168: 199, # '®' + 169: 200, # '©' + 170: 201, # '™' + 171: 202, # 'Ђ' + 172: 203, # 'ђ' + 173: 204, # '≠' + 174: 205, # 'Ѓ' + 175: 206, # 'ѓ' + 176: 207, # '∞' + 177: 208, # '±' + 178: 209, # '≤' + 179: 210, # '≥' + 180: 211, # 'і' + 181: 212, # 'µ' + 182: 213, # 'ґ' + 183: 214, # 'Ј' + 184: 215, # 'Є' + 185: 216, # 'є' + 186: 217, # 'Ї' + 187: 218, # 'ї' + 188: 219, # 'Љ' + 189: 220, # 'љ' + 190: 221, # 'Њ' + 191: 222, # 'њ' + 192: 223, # 'ј' + 193: 224, # 'Ѕ' + 194: 225, # '¬' + 195: 226, # '√' + 196: 227, # 'ƒ' + 197: 228, # '≈' + 198: 229, # '∆' + 199: 230, # '«' + 200: 231, # '»' + 201: 232, # '…' + 202: 233, # '\xa0' + 203: 234, # 'Ћ' + 204: 235, # 'ћ' + 205: 236, # 'Ќ' + 206: 237, # 'ќ' + 207: 238, # 'ѕ' + 208: 239, # '–' + 209: 240, # '—' + 210: 241, # '“' + 211: 242, # '”' + 212: 243, # '‘' + 213: 244, # '’' + 214: 245, # '÷' + 215: 246, # '„' + 216: 247, # 'Ў' + 217: 248, # 'ў' + 218: 249, # 'Џ' + 219: 250, # 'џ' + 220: 251, # '№' + 221: 252, # 'Ё' + 222: 68, # 'ё' + 223: 16, # 'я' + 224: 3, # 'а' + 225: 21, # 'б' + 226: 10, # 'в' + 227: 19, # 'г' + 228: 13, # 'д' + 229: 2, # 'е' + 230: 24, # 'ж' + 231: 20, # 'з' + 232: 4, # 'и' + 233: 23, # 'й' + 234: 11, # 'к' + 235: 8, # 'л' + 236: 12, # 'м' + 237: 5, # 'н' + 238: 1, # 'о' + 239: 15, # 'п' + 240: 9, # 'р' + 241: 7, # 'с' + 242: 6, # 'т' + 243: 14, # 'у' + 244: 39, # 'ф' + 245: 26, # 'х' + 246: 28, # 'ц' + 247: 22, # 'ч' + 248: 25, # 'ш' + 249: 29, # 'щ' + 250: 54, # 'ъ' + 251: 18, # 'ы' + 252: 17, # 'ь' + 253: 30, # 'э' + 254: 27, # 'ю' + 255: 255, # '€' } -MACCYRILLIC_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='MacCyrillic', - language='Russian', - char_to_order_map=MACCYRILLIC_RUSSIAN_CHAR_TO_ORDER, - language_model=RUSSIAN_LANG_MODEL, - typical_positive_ratio=0.976601, - keep_ascii_letters=False, - alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё') +MACCYRILLIC_RUSSIAN_MODEL = SingleByteCharSetModel( + charset_name="MacCyrillic", + language="Russian", + char_to_order_map=MACCYRILLIC_RUSSIAN_CHAR_TO_ORDER, + language_model=RUSSIAN_LANG_MODEL, + typical_positive_ratio=0.976601, + keep_ascii_letters=False, + alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё", +) ISO_8859_5_RUSSIAN_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 142, # 'A' - 66: 143, # 'B' - 67: 144, # 'C' - 68: 145, # 'D' - 69: 146, # 'E' - 70: 147, # 'F' - 71: 148, # 'G' - 72: 149, # 'H' - 73: 150, # 'I' - 74: 151, # 'J' - 75: 152, # 'K' - 76: 74, # 'L' - 77: 153, # 'M' - 78: 75, # 'N' - 79: 154, # 'O' - 80: 155, # 'P' - 81: 156, # 'Q' - 82: 157, # 'R' - 83: 158, # 'S' - 84: 159, # 'T' - 85: 160, # 'U' - 86: 161, # 'V' - 87: 162, # 'W' - 88: 163, # 'X' - 89: 164, # 'Y' - 90: 165, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 71, # 'a' - 98: 172, # 'b' - 99: 66, # 'c' - 100: 173, # 'd' - 101: 65, # 'e' - 102: 174, # 'f' - 103: 76, # 'g' - 104: 175, # 'h' - 105: 64, # 'i' - 106: 176, # 'j' - 107: 177, # 'k' - 108: 77, # 'l' - 109: 72, # 'm' - 110: 178, # 'n' - 111: 69, # 'o' - 112: 67, # 'p' - 113: 179, # 'q' - 114: 78, # 'r' - 115: 73, # 's' - 116: 180, # 't' - 117: 181, # 'u' - 118: 79, # 'v' - 119: 182, # 'w' - 120: 183, # 'x' - 121: 184, # 'y' - 122: 185, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 191, # '\x80' - 129: 192, # '\x81' - 130: 193, # '\x82' - 131: 194, # '\x83' - 132: 195, # '\x84' - 133: 196, # '\x85' - 134: 197, # '\x86' - 135: 198, # '\x87' - 136: 199, # '\x88' - 137: 200, # '\x89' - 138: 201, # '\x8a' - 139: 202, # '\x8b' - 140: 203, # '\x8c' - 141: 204, # '\x8d' - 142: 205, # '\x8e' - 143: 206, # '\x8f' - 144: 207, # '\x90' - 145: 208, # '\x91' - 146: 209, # '\x92' - 147: 210, # '\x93' - 148: 211, # '\x94' - 149: 212, # '\x95' - 150: 213, # '\x96' - 151: 214, # '\x97' - 152: 215, # '\x98' - 153: 216, # '\x99' - 154: 217, # '\x9a' - 155: 218, # '\x9b' - 156: 219, # '\x9c' - 157: 220, # '\x9d' - 158: 221, # '\x9e' - 159: 222, # '\x9f' - 160: 223, # '\xa0' - 161: 224, # 'Ё' - 162: 225, # 'Ђ' - 163: 226, # 'Ѓ' - 164: 227, # 'Є' - 165: 228, # 'Ѕ' - 166: 229, # 'І' - 167: 230, # 'Ї' - 168: 231, # 'Ј' - 169: 232, # 'Љ' - 170: 233, # 'Њ' - 171: 234, # 'Ћ' - 172: 235, # 'Ќ' - 173: 236, # '\xad' - 174: 237, # 'Ў' - 175: 238, # 'Џ' - 176: 37, # 'А' - 177: 44, # 'Б' - 178: 33, # 'В' - 179: 46, # 'Г' - 180: 41, # 'Д' - 181: 48, # 'Е' - 182: 56, # 'Ж' - 183: 51, # 'З' - 184: 42, # 'И' - 185: 60, # 'Й' - 186: 36, # 'К' - 187: 49, # 'Л' - 188: 38, # 'М' - 189: 31, # 'Н' - 190: 34, # 'О' - 191: 35, # 'П' - 192: 45, # 'Р' - 193: 32, # 'С' - 194: 40, # 'Т' - 195: 52, # 'У' - 196: 53, # 'Ф' - 197: 55, # 'Х' - 198: 58, # 'Ц' - 199: 50, # 'Ч' - 200: 57, # 'Ш' - 201: 63, # 'Щ' - 202: 70, # 'Ъ' - 203: 62, # 'Ы' - 204: 61, # 'Ь' - 205: 47, # 'Э' - 206: 59, # 'Ю' - 207: 43, # 'Я' - 208: 3, # 'а' - 209: 21, # 'б' - 210: 10, # 'в' - 211: 19, # 'г' - 212: 13, # 'д' - 213: 2, # 'е' - 214: 24, # 'ж' - 215: 20, # 'з' - 216: 4, # 'и' - 217: 23, # 'й' - 218: 11, # 'к' - 219: 8, # 'л' - 220: 12, # 'м' - 221: 5, # 'н' - 222: 1, # 'о' - 223: 15, # 'п' - 224: 9, # 'р' - 225: 7, # 'с' - 226: 6, # 'т' - 227: 14, # 'у' - 228: 39, # 'ф' - 229: 26, # 'х' - 230: 28, # 'ц' - 231: 22, # 'ч' - 232: 25, # 'ш' - 233: 29, # 'щ' - 234: 54, # 'ъ' - 235: 18, # 'ы' - 236: 17, # 'ь' - 237: 30, # 'э' - 238: 27, # 'ю' - 239: 16, # 'я' - 240: 239, # '№' - 241: 68, # 'ё' - 242: 240, # 'ђ' - 243: 241, # 'ѓ' - 244: 242, # 'є' - 245: 243, # 'ѕ' - 246: 244, # 'і' - 247: 245, # 'ї' - 248: 246, # 'ј' - 249: 247, # 'љ' - 250: 248, # 'њ' - 251: 249, # 'ћ' - 252: 250, # 'ќ' - 253: 251, # '§' - 254: 252, # 'ў' - 255: 255, # 'џ' + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 142, # 'A' + 66: 143, # 'B' + 67: 144, # 'C' + 68: 145, # 'D' + 69: 146, # 'E' + 70: 147, # 'F' + 71: 148, # 'G' + 72: 149, # 'H' + 73: 150, # 'I' + 74: 151, # 'J' + 75: 152, # 'K' + 76: 74, # 'L' + 77: 153, # 'M' + 78: 75, # 'N' + 79: 154, # 'O' + 80: 155, # 'P' + 81: 156, # 'Q' + 82: 157, # 'R' + 83: 158, # 'S' + 84: 159, # 'T' + 85: 160, # 'U' + 86: 161, # 'V' + 87: 162, # 'W' + 88: 163, # 'X' + 89: 164, # 'Y' + 90: 165, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 71, # 'a' + 98: 172, # 'b' + 99: 66, # 'c' + 100: 173, # 'd' + 101: 65, # 'e' + 102: 174, # 'f' + 103: 76, # 'g' + 104: 175, # 'h' + 105: 64, # 'i' + 106: 176, # 'j' + 107: 177, # 'k' + 108: 77, # 'l' + 109: 72, # 'm' + 110: 178, # 'n' + 111: 69, # 'o' + 112: 67, # 'p' + 113: 179, # 'q' + 114: 78, # 'r' + 115: 73, # 's' + 116: 180, # 't' + 117: 181, # 'u' + 118: 79, # 'v' + 119: 182, # 'w' + 120: 183, # 'x' + 121: 184, # 'y' + 122: 185, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 191, # '\x80' + 129: 192, # '\x81' + 130: 193, # '\x82' + 131: 194, # '\x83' + 132: 195, # '\x84' + 133: 196, # '\x85' + 134: 197, # '\x86' + 135: 198, # '\x87' + 136: 199, # '\x88' + 137: 200, # '\x89' + 138: 201, # '\x8a' + 139: 202, # '\x8b' + 140: 203, # '\x8c' + 141: 204, # '\x8d' + 142: 205, # '\x8e' + 143: 206, # '\x8f' + 144: 207, # '\x90' + 145: 208, # '\x91' + 146: 209, # '\x92' + 147: 210, # '\x93' + 148: 211, # '\x94' + 149: 212, # '\x95' + 150: 213, # '\x96' + 151: 214, # '\x97' + 152: 215, # '\x98' + 153: 216, # '\x99' + 154: 217, # '\x9a' + 155: 218, # '\x9b' + 156: 219, # '\x9c' + 157: 220, # '\x9d' + 158: 221, # '\x9e' + 159: 222, # '\x9f' + 160: 223, # '\xa0' + 161: 224, # 'Ё' + 162: 225, # 'Ђ' + 163: 226, # 'Ѓ' + 164: 227, # 'Є' + 165: 228, # 'Ѕ' + 166: 229, # 'І' + 167: 230, # 'Ї' + 168: 231, # 'Ј' + 169: 232, # 'Љ' + 170: 233, # 'Њ' + 171: 234, # 'Ћ' + 172: 235, # 'Ќ' + 173: 236, # '\xad' + 174: 237, # 'Ў' + 175: 238, # 'Џ' + 176: 37, # 'А' + 177: 44, # 'Б' + 178: 33, # 'В' + 179: 46, # 'Г' + 180: 41, # 'Д' + 181: 48, # 'Е' + 182: 56, # 'Ж' + 183: 51, # 'З' + 184: 42, # 'И' + 185: 60, # 'Й' + 186: 36, # 'К' + 187: 49, # 'Л' + 188: 38, # 'М' + 189: 31, # 'Н' + 190: 34, # 'О' + 191: 35, # 'П' + 192: 45, # 'Р' + 193: 32, # 'С' + 194: 40, # 'Т' + 195: 52, # 'У' + 196: 53, # 'Ф' + 197: 55, # 'Х' + 198: 58, # 'Ц' + 199: 50, # 'Ч' + 200: 57, # 'Ш' + 201: 63, # 'Щ' + 202: 70, # 'Ъ' + 203: 62, # 'Ы' + 204: 61, # 'Ь' + 205: 47, # 'Э' + 206: 59, # 'Ю' + 207: 43, # 'Я' + 208: 3, # 'а' + 209: 21, # 'б' + 210: 10, # 'в' + 211: 19, # 'г' + 212: 13, # 'д' + 213: 2, # 'е' + 214: 24, # 'ж' + 215: 20, # 'з' + 216: 4, # 'и' + 217: 23, # 'й' + 218: 11, # 'к' + 219: 8, # 'л' + 220: 12, # 'м' + 221: 5, # 'н' + 222: 1, # 'о' + 223: 15, # 'п' + 224: 9, # 'р' + 225: 7, # 'с' + 226: 6, # 'т' + 227: 14, # 'у' + 228: 39, # 'ф' + 229: 26, # 'х' + 230: 28, # 'ц' + 231: 22, # 'ч' + 232: 25, # 'ш' + 233: 29, # 'щ' + 234: 54, # 'ъ' + 235: 18, # 'ы' + 236: 17, # 'ь' + 237: 30, # 'э' + 238: 27, # 'ю' + 239: 16, # 'я' + 240: 239, # '№' + 241: 68, # 'ё' + 242: 240, # 'ђ' + 243: 241, # 'ѓ' + 244: 242, # 'є' + 245: 243, # 'ѕ' + 246: 244, # 'і' + 247: 245, # 'ї' + 248: 246, # 'ј' + 249: 247, # 'љ' + 250: 248, # 'њ' + 251: 249, # 'ћ' + 252: 250, # 'ќ' + 253: 251, # '§' + 254: 252, # 'ў' + 255: 255, # 'џ' } -ISO_8859_5_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-5', - language='Russian', - char_to_order_map=ISO_8859_5_RUSSIAN_CHAR_TO_ORDER, - language_model=RUSSIAN_LANG_MODEL, - typical_positive_ratio=0.976601, - keep_ascii_letters=False, - alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё') - +ISO_8859_5_RUSSIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-5", + language="Russian", + char_to_order_map=ISO_8859_5_RUSSIAN_CHAR_TO_ORDER, + language_model=RUSSIAN_LANG_MODEL, + typical_positive_ratio=0.976601, + keep_ascii_letters=False, + alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё", +) diff --git a/lib/chardet/langthaimodel.py b/lib/chardet/langthaimodel.py index d8b1095c..d7aedc7c 100644 --- a/lib/chardet/langthaimodel.py +++ b/lib/chardet/langthaimodel.py @@ -1,9 +1,5 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -4115,269 +4111,270 @@ THAI_LANG_MODEL = { # Character Mapping Table(s): TIS_620_THAI_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 254, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 254, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 253, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 182, # 'A' - 66: 106, # 'B' - 67: 107, # 'C' - 68: 100, # 'D' - 69: 183, # 'E' - 70: 184, # 'F' - 71: 185, # 'G' - 72: 101, # 'H' - 73: 94, # 'I' - 74: 186, # 'J' - 75: 187, # 'K' - 76: 108, # 'L' - 77: 109, # 'M' - 78: 110, # 'N' - 79: 111, # 'O' - 80: 188, # 'P' - 81: 189, # 'Q' - 82: 190, # 'R' - 83: 89, # 'S' - 84: 95, # 'T' - 85: 112, # 'U' - 86: 113, # 'V' - 87: 191, # 'W' - 88: 192, # 'X' - 89: 193, # 'Y' - 90: 194, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 64, # 'a' - 98: 72, # 'b' - 99: 73, # 'c' - 100: 114, # 'd' - 101: 74, # 'e' - 102: 115, # 'f' - 103: 116, # 'g' - 104: 102, # 'h' - 105: 81, # 'i' - 106: 201, # 'j' - 107: 117, # 'k' - 108: 90, # 'l' - 109: 103, # 'm' - 110: 78, # 'n' - 111: 82, # 'o' - 112: 96, # 'p' - 113: 202, # 'q' - 114: 91, # 'r' - 115: 79, # 's' - 116: 84, # 't' - 117: 104, # 'u' - 118: 105, # 'v' - 119: 97, # 'w' - 120: 98, # 'x' - 121: 92, # 'y' - 122: 203, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 253, # '\x7f' - 128: 209, # '\x80' - 129: 210, # '\x81' - 130: 211, # '\x82' - 131: 212, # '\x83' - 132: 213, # '\x84' - 133: 88, # '\x85' - 134: 214, # '\x86' - 135: 215, # '\x87' - 136: 216, # '\x88' - 137: 217, # '\x89' - 138: 218, # '\x8a' - 139: 219, # '\x8b' - 140: 220, # '\x8c' - 141: 118, # '\x8d' - 142: 221, # '\x8e' - 143: 222, # '\x8f' - 144: 223, # '\x90' - 145: 224, # '\x91' - 146: 99, # '\x92' - 147: 85, # '\x93' - 148: 83, # '\x94' - 149: 225, # '\x95' - 150: 226, # '\x96' - 151: 227, # '\x97' - 152: 228, # '\x98' - 153: 229, # '\x99' - 154: 230, # '\x9a' - 155: 231, # '\x9b' - 156: 232, # '\x9c' - 157: 233, # '\x9d' - 158: 234, # '\x9e' - 159: 235, # '\x9f' - 160: 236, # None - 161: 5, # 'ก' - 162: 30, # 'ข' - 163: 237, # 'ฃ' - 164: 24, # 'ค' - 165: 238, # 'ฅ' - 166: 75, # 'ฆ' - 167: 8, # 'ง' - 168: 26, # 'จ' - 169: 52, # 'ฉ' - 170: 34, # 'ช' - 171: 51, # 'ซ' - 172: 119, # 'ฌ' - 173: 47, # 'ญ' - 174: 58, # 'ฎ' - 175: 57, # 'ฏ' - 176: 49, # 'ฐ' - 177: 53, # 'ฑ' - 178: 55, # 'ฒ' - 179: 43, # 'ณ' - 180: 20, # 'ด' - 181: 19, # 'ต' - 182: 44, # 'ถ' - 183: 14, # 'ท' - 184: 48, # 'ธ' - 185: 3, # 'น' - 186: 17, # 'บ' - 187: 25, # 'ป' - 188: 39, # 'ผ' - 189: 62, # 'ฝ' - 190: 31, # 'พ' - 191: 54, # 'ฟ' - 192: 45, # 'ภ' - 193: 9, # 'ม' - 194: 16, # 'ย' - 195: 2, # 'ร' - 196: 61, # 'ฤ' - 197: 15, # 'ล' - 198: 239, # 'ฦ' - 199: 12, # 'ว' - 200: 42, # 'ศ' - 201: 46, # 'ษ' - 202: 18, # 'ส' - 203: 21, # 'ห' - 204: 76, # 'ฬ' - 205: 4, # 'อ' - 206: 66, # 'ฮ' - 207: 63, # 'ฯ' - 208: 22, # 'ะ' - 209: 10, # 'ั' - 210: 1, # 'า' - 211: 36, # 'ำ' - 212: 23, # 'ิ' - 213: 13, # 'ี' - 214: 40, # 'ึ' - 215: 27, # 'ื' - 216: 32, # 'ุ' - 217: 35, # 'ู' - 218: 86, # 'ฺ' - 219: 240, # None - 220: 241, # None - 221: 242, # None - 222: 243, # None - 223: 244, # '฿' - 224: 11, # 'เ' - 225: 28, # 'แ' - 226: 41, # 'โ' - 227: 29, # 'ใ' - 228: 33, # 'ไ' - 229: 245, # 'ๅ' - 230: 50, # 'ๆ' - 231: 37, # '็' - 232: 6, # '่' - 233: 7, # '้' - 234: 67, # '๊' - 235: 77, # '๋' - 236: 38, # '์' - 237: 93, # 'ํ' - 238: 246, # '๎' - 239: 247, # '๏' - 240: 68, # '๐' - 241: 56, # '๑' - 242: 59, # '๒' - 243: 65, # '๓' - 244: 69, # '๔' - 245: 60, # '๕' - 246: 70, # '๖' - 247: 80, # '๗' - 248: 71, # '๘' - 249: 87, # '๙' - 250: 248, # '๚' - 251: 249, # '๛' - 252: 250, # None - 253: 251, # None - 254: 252, # None - 255: 253, # None + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 254, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 254, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 253, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 182, # 'A' + 66: 106, # 'B' + 67: 107, # 'C' + 68: 100, # 'D' + 69: 183, # 'E' + 70: 184, # 'F' + 71: 185, # 'G' + 72: 101, # 'H' + 73: 94, # 'I' + 74: 186, # 'J' + 75: 187, # 'K' + 76: 108, # 'L' + 77: 109, # 'M' + 78: 110, # 'N' + 79: 111, # 'O' + 80: 188, # 'P' + 81: 189, # 'Q' + 82: 190, # 'R' + 83: 89, # 'S' + 84: 95, # 'T' + 85: 112, # 'U' + 86: 113, # 'V' + 87: 191, # 'W' + 88: 192, # 'X' + 89: 193, # 'Y' + 90: 194, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 64, # 'a' + 98: 72, # 'b' + 99: 73, # 'c' + 100: 114, # 'd' + 101: 74, # 'e' + 102: 115, # 'f' + 103: 116, # 'g' + 104: 102, # 'h' + 105: 81, # 'i' + 106: 201, # 'j' + 107: 117, # 'k' + 108: 90, # 'l' + 109: 103, # 'm' + 110: 78, # 'n' + 111: 82, # 'o' + 112: 96, # 'p' + 113: 202, # 'q' + 114: 91, # 'r' + 115: 79, # 's' + 116: 84, # 't' + 117: 104, # 'u' + 118: 105, # 'v' + 119: 97, # 'w' + 120: 98, # 'x' + 121: 92, # 'y' + 122: 203, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 253, # '\x7f' + 128: 209, # '\x80' + 129: 210, # '\x81' + 130: 211, # '\x82' + 131: 212, # '\x83' + 132: 213, # '\x84' + 133: 88, # '\x85' + 134: 214, # '\x86' + 135: 215, # '\x87' + 136: 216, # '\x88' + 137: 217, # '\x89' + 138: 218, # '\x8a' + 139: 219, # '\x8b' + 140: 220, # '\x8c' + 141: 118, # '\x8d' + 142: 221, # '\x8e' + 143: 222, # '\x8f' + 144: 223, # '\x90' + 145: 224, # '\x91' + 146: 99, # '\x92' + 147: 85, # '\x93' + 148: 83, # '\x94' + 149: 225, # '\x95' + 150: 226, # '\x96' + 151: 227, # '\x97' + 152: 228, # '\x98' + 153: 229, # '\x99' + 154: 230, # '\x9a' + 155: 231, # '\x9b' + 156: 232, # '\x9c' + 157: 233, # '\x9d' + 158: 234, # '\x9e' + 159: 235, # '\x9f' + 160: 236, # None + 161: 5, # 'ก' + 162: 30, # 'ข' + 163: 237, # 'ฃ' + 164: 24, # 'ค' + 165: 238, # 'ฅ' + 166: 75, # 'ฆ' + 167: 8, # 'ง' + 168: 26, # 'จ' + 169: 52, # 'ฉ' + 170: 34, # 'ช' + 171: 51, # 'ซ' + 172: 119, # 'ฌ' + 173: 47, # 'ญ' + 174: 58, # 'ฎ' + 175: 57, # 'ฏ' + 176: 49, # 'ฐ' + 177: 53, # 'ฑ' + 178: 55, # 'ฒ' + 179: 43, # 'ณ' + 180: 20, # 'ด' + 181: 19, # 'ต' + 182: 44, # 'ถ' + 183: 14, # 'ท' + 184: 48, # 'ธ' + 185: 3, # 'น' + 186: 17, # 'บ' + 187: 25, # 'ป' + 188: 39, # 'ผ' + 189: 62, # 'ฝ' + 190: 31, # 'พ' + 191: 54, # 'ฟ' + 192: 45, # 'ภ' + 193: 9, # 'ม' + 194: 16, # 'ย' + 195: 2, # 'ร' + 196: 61, # 'ฤ' + 197: 15, # 'ล' + 198: 239, # 'ฦ' + 199: 12, # 'ว' + 200: 42, # 'ศ' + 201: 46, # 'ษ' + 202: 18, # 'ส' + 203: 21, # 'ห' + 204: 76, # 'ฬ' + 205: 4, # 'อ' + 206: 66, # 'ฮ' + 207: 63, # 'ฯ' + 208: 22, # 'ะ' + 209: 10, # 'ั' + 210: 1, # 'า' + 211: 36, # 'ำ' + 212: 23, # 'ิ' + 213: 13, # 'ี' + 214: 40, # 'ึ' + 215: 27, # 'ื' + 216: 32, # 'ุ' + 217: 35, # 'ู' + 218: 86, # 'ฺ' + 219: 240, # None + 220: 241, # None + 221: 242, # None + 222: 243, # None + 223: 244, # '฿' + 224: 11, # 'เ' + 225: 28, # 'แ' + 226: 41, # 'โ' + 227: 29, # 'ใ' + 228: 33, # 'ไ' + 229: 245, # 'ๅ' + 230: 50, # 'ๆ' + 231: 37, # '็' + 232: 6, # '่' + 233: 7, # '้' + 234: 67, # '๊' + 235: 77, # '๋' + 236: 38, # '์' + 237: 93, # 'ํ' + 238: 246, # '๎' + 239: 247, # '๏' + 240: 68, # '๐' + 241: 56, # '๑' + 242: 59, # '๒' + 243: 65, # '๓' + 244: 69, # '๔' + 245: 60, # '๕' + 246: 70, # '๖' + 247: 80, # '๗' + 248: 71, # '๘' + 249: 87, # '๙' + 250: 248, # '๚' + 251: 249, # '๛' + 252: 250, # None + 253: 251, # None + 254: 252, # None + 255: 253, # None } -TIS_620_THAI_MODEL = SingleByteCharSetModel(charset_name='TIS-620', - language='Thai', - char_to_order_map=TIS_620_THAI_CHAR_TO_ORDER, - language_model=THAI_LANG_MODEL, - typical_positive_ratio=0.926386, - keep_ascii_letters=False, - alphabet='กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛') - +TIS_620_THAI_MODEL = SingleByteCharSetModel( + charset_name="TIS-620", + language="Thai", + char_to_order_map=TIS_620_THAI_CHAR_TO_ORDER, + language_model=THAI_LANG_MODEL, + typical_positive_ratio=0.926386, + keep_ascii_letters=False, + alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛", +) diff --git a/lib/chardet/langturkishmodel.py b/lib/chardet/langturkishmodel.py index cb4672b1..6c6c09ba 100644 --- a/lib/chardet/langturkishmodel.py +++ b/lib/chardet/langturkishmodel.py @@ -1,9 +1,5 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -4115,269 +4111,270 @@ TURKISH_LANG_MODEL = { # Character Mapping Table(s): ISO_8859_9_TURKISH_CHAR_TO_ORDER = { - 0: 255, # '\x00' - 1: 255, # '\x01' - 2: 255, # '\x02' - 3: 255, # '\x03' - 4: 255, # '\x04' - 5: 255, # '\x05' - 6: 255, # '\x06' - 7: 255, # '\x07' - 8: 255, # '\x08' - 9: 255, # '\t' - 10: 255, # '\n' - 11: 255, # '\x0b' - 12: 255, # '\x0c' - 13: 255, # '\r' - 14: 255, # '\x0e' - 15: 255, # '\x0f' - 16: 255, # '\x10' - 17: 255, # '\x11' - 18: 255, # '\x12' - 19: 255, # '\x13' - 20: 255, # '\x14' - 21: 255, # '\x15' - 22: 255, # '\x16' - 23: 255, # '\x17' - 24: 255, # '\x18' - 25: 255, # '\x19' - 26: 255, # '\x1a' - 27: 255, # '\x1b' - 28: 255, # '\x1c' - 29: 255, # '\x1d' - 30: 255, # '\x1e' - 31: 255, # '\x1f' - 32: 255, # ' ' - 33: 255, # '!' - 34: 255, # '"' - 35: 255, # '#' - 36: 255, # '$' - 37: 255, # '%' - 38: 255, # '&' - 39: 255, # "'" - 40: 255, # '(' - 41: 255, # ')' - 42: 255, # '*' - 43: 255, # '+' - 44: 255, # ',' - 45: 255, # '-' - 46: 255, # '.' - 47: 255, # '/' - 48: 255, # '0' - 49: 255, # '1' - 50: 255, # '2' - 51: 255, # '3' - 52: 255, # '4' - 53: 255, # '5' - 54: 255, # '6' - 55: 255, # '7' - 56: 255, # '8' - 57: 255, # '9' - 58: 255, # ':' - 59: 255, # ';' - 60: 255, # '<' - 61: 255, # '=' - 62: 255, # '>' - 63: 255, # '?' - 64: 255, # '@' - 65: 23, # 'A' - 66: 37, # 'B' - 67: 47, # 'C' - 68: 39, # 'D' - 69: 29, # 'E' - 70: 52, # 'F' - 71: 36, # 'G' - 72: 45, # 'H' - 73: 53, # 'I' - 74: 60, # 'J' - 75: 16, # 'K' - 76: 49, # 'L' - 77: 20, # 'M' - 78: 46, # 'N' - 79: 42, # 'O' - 80: 48, # 'P' - 81: 69, # 'Q' - 82: 44, # 'R' - 83: 35, # 'S' - 84: 31, # 'T' - 85: 51, # 'U' - 86: 38, # 'V' - 87: 62, # 'W' - 88: 65, # 'X' - 89: 43, # 'Y' - 90: 56, # 'Z' - 91: 255, # '[' - 92: 255, # '\\' - 93: 255, # ']' - 94: 255, # '^' - 95: 255, # '_' - 96: 255, # '`' - 97: 1, # 'a' - 98: 21, # 'b' - 99: 28, # 'c' - 100: 12, # 'd' - 101: 2, # 'e' - 102: 18, # 'f' - 103: 27, # 'g' - 104: 25, # 'h' - 105: 3, # 'i' - 106: 24, # 'j' - 107: 10, # 'k' - 108: 5, # 'l' - 109: 13, # 'm' - 110: 4, # 'n' - 111: 15, # 'o' - 112: 26, # 'p' - 113: 64, # 'q' - 114: 7, # 'r' - 115: 8, # 's' - 116: 9, # 't' - 117: 14, # 'u' - 118: 32, # 'v' - 119: 57, # 'w' - 120: 58, # 'x' - 121: 11, # 'y' - 122: 22, # 'z' - 123: 255, # '{' - 124: 255, # '|' - 125: 255, # '}' - 126: 255, # '~' - 127: 255, # '\x7f' - 128: 180, # '\x80' - 129: 179, # '\x81' - 130: 178, # '\x82' - 131: 177, # '\x83' - 132: 176, # '\x84' - 133: 175, # '\x85' - 134: 174, # '\x86' - 135: 173, # '\x87' - 136: 172, # '\x88' - 137: 171, # '\x89' - 138: 170, # '\x8a' - 139: 169, # '\x8b' - 140: 168, # '\x8c' - 141: 167, # '\x8d' - 142: 166, # '\x8e' - 143: 165, # '\x8f' - 144: 164, # '\x90' - 145: 163, # '\x91' - 146: 162, # '\x92' - 147: 161, # '\x93' - 148: 160, # '\x94' - 149: 159, # '\x95' - 150: 101, # '\x96' - 151: 158, # '\x97' - 152: 157, # '\x98' - 153: 156, # '\x99' - 154: 155, # '\x9a' - 155: 154, # '\x9b' - 156: 153, # '\x9c' - 157: 152, # '\x9d' - 158: 151, # '\x9e' - 159: 106, # '\x9f' - 160: 150, # '\xa0' - 161: 149, # '¡' - 162: 148, # '¢' - 163: 147, # '£' - 164: 146, # '¤' - 165: 145, # '¥' - 166: 144, # '¦' - 167: 100, # '§' - 168: 143, # '¨' - 169: 142, # '©' - 170: 141, # 'ª' - 171: 140, # '«' - 172: 139, # '¬' - 173: 138, # '\xad' - 174: 137, # '®' - 175: 136, # '¯' - 176: 94, # '°' - 177: 80, # '±' - 178: 93, # '²' - 179: 135, # '³' - 180: 105, # '´' - 181: 134, # 'µ' - 182: 133, # '¶' - 183: 63, # '·' - 184: 132, # '¸' - 185: 131, # '¹' - 186: 130, # 'º' - 187: 129, # '»' - 188: 128, # '¼' - 189: 127, # '½' - 190: 126, # '¾' - 191: 125, # '¿' - 192: 124, # 'À' - 193: 104, # 'Á' - 194: 73, # 'Â' - 195: 99, # 'Ã' - 196: 79, # 'Ä' - 197: 85, # 'Å' - 198: 123, # 'Æ' - 199: 54, # 'Ç' - 200: 122, # 'È' - 201: 98, # 'É' - 202: 92, # 'Ê' - 203: 121, # 'Ë' - 204: 120, # 'Ì' - 205: 91, # 'Í' - 206: 103, # 'Î' - 207: 119, # 'Ï' - 208: 68, # 'Ğ' - 209: 118, # 'Ñ' - 210: 117, # 'Ò' - 211: 97, # 'Ó' - 212: 116, # 'Ô' - 213: 115, # 'Õ' - 214: 50, # 'Ö' - 215: 90, # '×' - 216: 114, # 'Ø' - 217: 113, # 'Ù' - 218: 112, # 'Ú' - 219: 111, # 'Û' - 220: 55, # 'Ü' - 221: 41, # 'İ' - 222: 40, # 'Ş' - 223: 86, # 'ß' - 224: 89, # 'à' - 225: 70, # 'á' - 226: 59, # 'â' - 227: 78, # 'ã' - 228: 71, # 'ä' - 229: 82, # 'å' - 230: 88, # 'æ' - 231: 33, # 'ç' - 232: 77, # 'è' - 233: 66, # 'é' - 234: 84, # 'ê' - 235: 83, # 'ë' - 236: 110, # 'ì' - 237: 75, # 'í' - 238: 61, # 'î' - 239: 96, # 'ï' - 240: 30, # 'ğ' - 241: 67, # 'ñ' - 242: 109, # 'ò' - 243: 74, # 'ó' - 244: 87, # 'ô' - 245: 102, # 'õ' - 246: 34, # 'ö' - 247: 95, # '÷' - 248: 81, # 'ø' - 249: 108, # 'ù' - 250: 76, # 'ú' - 251: 72, # 'û' - 252: 17, # 'ü' - 253: 6, # 'ı' - 254: 19, # 'ş' - 255: 107, # 'ÿ' + 0: 255, # '\x00' + 1: 255, # '\x01' + 2: 255, # '\x02' + 3: 255, # '\x03' + 4: 255, # '\x04' + 5: 255, # '\x05' + 6: 255, # '\x06' + 7: 255, # '\x07' + 8: 255, # '\x08' + 9: 255, # '\t' + 10: 255, # '\n' + 11: 255, # '\x0b' + 12: 255, # '\x0c' + 13: 255, # '\r' + 14: 255, # '\x0e' + 15: 255, # '\x0f' + 16: 255, # '\x10' + 17: 255, # '\x11' + 18: 255, # '\x12' + 19: 255, # '\x13' + 20: 255, # '\x14' + 21: 255, # '\x15' + 22: 255, # '\x16' + 23: 255, # '\x17' + 24: 255, # '\x18' + 25: 255, # '\x19' + 26: 255, # '\x1a' + 27: 255, # '\x1b' + 28: 255, # '\x1c' + 29: 255, # '\x1d' + 30: 255, # '\x1e' + 31: 255, # '\x1f' + 32: 255, # ' ' + 33: 255, # '!' + 34: 255, # '"' + 35: 255, # '#' + 36: 255, # '$' + 37: 255, # '%' + 38: 255, # '&' + 39: 255, # "'" + 40: 255, # '(' + 41: 255, # ')' + 42: 255, # '*' + 43: 255, # '+' + 44: 255, # ',' + 45: 255, # '-' + 46: 255, # '.' + 47: 255, # '/' + 48: 255, # '0' + 49: 255, # '1' + 50: 255, # '2' + 51: 255, # '3' + 52: 255, # '4' + 53: 255, # '5' + 54: 255, # '6' + 55: 255, # '7' + 56: 255, # '8' + 57: 255, # '9' + 58: 255, # ':' + 59: 255, # ';' + 60: 255, # '<' + 61: 255, # '=' + 62: 255, # '>' + 63: 255, # '?' + 64: 255, # '@' + 65: 23, # 'A' + 66: 37, # 'B' + 67: 47, # 'C' + 68: 39, # 'D' + 69: 29, # 'E' + 70: 52, # 'F' + 71: 36, # 'G' + 72: 45, # 'H' + 73: 53, # 'I' + 74: 60, # 'J' + 75: 16, # 'K' + 76: 49, # 'L' + 77: 20, # 'M' + 78: 46, # 'N' + 79: 42, # 'O' + 80: 48, # 'P' + 81: 69, # 'Q' + 82: 44, # 'R' + 83: 35, # 'S' + 84: 31, # 'T' + 85: 51, # 'U' + 86: 38, # 'V' + 87: 62, # 'W' + 88: 65, # 'X' + 89: 43, # 'Y' + 90: 56, # 'Z' + 91: 255, # '[' + 92: 255, # '\\' + 93: 255, # ']' + 94: 255, # '^' + 95: 255, # '_' + 96: 255, # '`' + 97: 1, # 'a' + 98: 21, # 'b' + 99: 28, # 'c' + 100: 12, # 'd' + 101: 2, # 'e' + 102: 18, # 'f' + 103: 27, # 'g' + 104: 25, # 'h' + 105: 3, # 'i' + 106: 24, # 'j' + 107: 10, # 'k' + 108: 5, # 'l' + 109: 13, # 'm' + 110: 4, # 'n' + 111: 15, # 'o' + 112: 26, # 'p' + 113: 64, # 'q' + 114: 7, # 'r' + 115: 8, # 's' + 116: 9, # 't' + 117: 14, # 'u' + 118: 32, # 'v' + 119: 57, # 'w' + 120: 58, # 'x' + 121: 11, # 'y' + 122: 22, # 'z' + 123: 255, # '{' + 124: 255, # '|' + 125: 255, # '}' + 126: 255, # '~' + 127: 255, # '\x7f' + 128: 180, # '\x80' + 129: 179, # '\x81' + 130: 178, # '\x82' + 131: 177, # '\x83' + 132: 176, # '\x84' + 133: 175, # '\x85' + 134: 174, # '\x86' + 135: 173, # '\x87' + 136: 172, # '\x88' + 137: 171, # '\x89' + 138: 170, # '\x8a' + 139: 169, # '\x8b' + 140: 168, # '\x8c' + 141: 167, # '\x8d' + 142: 166, # '\x8e' + 143: 165, # '\x8f' + 144: 164, # '\x90' + 145: 163, # '\x91' + 146: 162, # '\x92' + 147: 161, # '\x93' + 148: 160, # '\x94' + 149: 159, # '\x95' + 150: 101, # '\x96' + 151: 158, # '\x97' + 152: 157, # '\x98' + 153: 156, # '\x99' + 154: 155, # '\x9a' + 155: 154, # '\x9b' + 156: 153, # '\x9c' + 157: 152, # '\x9d' + 158: 151, # '\x9e' + 159: 106, # '\x9f' + 160: 150, # '\xa0' + 161: 149, # '¡' + 162: 148, # '¢' + 163: 147, # '£' + 164: 146, # '¤' + 165: 145, # '¥' + 166: 144, # '¦' + 167: 100, # '§' + 168: 143, # '¨' + 169: 142, # '©' + 170: 141, # 'ª' + 171: 140, # '«' + 172: 139, # '¬' + 173: 138, # '\xad' + 174: 137, # '®' + 175: 136, # '¯' + 176: 94, # '°' + 177: 80, # '±' + 178: 93, # '²' + 179: 135, # '³' + 180: 105, # '´' + 181: 134, # 'µ' + 182: 133, # '¶' + 183: 63, # '·' + 184: 132, # '¸' + 185: 131, # '¹' + 186: 130, # 'º' + 187: 129, # '»' + 188: 128, # '¼' + 189: 127, # '½' + 190: 126, # '¾' + 191: 125, # '¿' + 192: 124, # 'À' + 193: 104, # 'Á' + 194: 73, # 'Â' + 195: 99, # 'Ã' + 196: 79, # 'Ä' + 197: 85, # 'Å' + 198: 123, # 'Æ' + 199: 54, # 'Ç' + 200: 122, # 'È' + 201: 98, # 'É' + 202: 92, # 'Ê' + 203: 121, # 'Ë' + 204: 120, # 'Ì' + 205: 91, # 'Í' + 206: 103, # 'Î' + 207: 119, # 'Ï' + 208: 68, # 'Ğ' + 209: 118, # 'Ñ' + 210: 117, # 'Ò' + 211: 97, # 'Ó' + 212: 116, # 'Ô' + 213: 115, # 'Õ' + 214: 50, # 'Ö' + 215: 90, # '×' + 216: 114, # 'Ø' + 217: 113, # 'Ù' + 218: 112, # 'Ú' + 219: 111, # 'Û' + 220: 55, # 'Ü' + 221: 41, # 'İ' + 222: 40, # 'Ş' + 223: 86, # 'ß' + 224: 89, # 'à' + 225: 70, # 'á' + 226: 59, # 'â' + 227: 78, # 'ã' + 228: 71, # 'ä' + 229: 82, # 'å' + 230: 88, # 'æ' + 231: 33, # 'ç' + 232: 77, # 'è' + 233: 66, # 'é' + 234: 84, # 'ê' + 235: 83, # 'ë' + 236: 110, # 'ì' + 237: 75, # 'í' + 238: 61, # 'î' + 239: 96, # 'ï' + 240: 30, # 'ğ' + 241: 67, # 'ñ' + 242: 109, # 'ò' + 243: 74, # 'ó' + 244: 87, # 'ô' + 245: 102, # 'õ' + 246: 34, # 'ö' + 247: 95, # '÷' + 248: 81, # 'ø' + 249: 108, # 'ù' + 250: 76, # 'ú' + 251: 72, # 'û' + 252: 17, # 'ü' + 253: 6, # 'ı' + 254: 19, # 'ş' + 255: 107, # 'ÿ' } -ISO_8859_9_TURKISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-9', - language='Turkish', - char_to_order_map=ISO_8859_9_TURKISH_CHAR_TO_ORDER, - language_model=TURKISH_LANG_MODEL, - typical_positive_ratio=0.97029, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş') - +ISO_8859_9_TURKISH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-9", + language="Turkish", + char_to_order_map=ISO_8859_9_TURKISH_CHAR_TO_ORDER, + language_model=TURKISH_LANG_MODEL, + typical_positive_ratio=0.97029, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş", +) diff --git a/lib/chardet/latin1prober.py b/lib/chardet/latin1prober.py index 7c37520b..65f13792 100644 --- a/lib/chardet/latin1prober.py +++ b/lib/chardet/latin1prober.py @@ -26,6 +26,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Union + from .charsetprober import CharSetProber from .enums import ProbingState @@ -41,6 +43,7 @@ ASV = 6 # accent small vowel ASO = 7 # accent small other CLASS_NUM = 8 # total classes +# fmt: off Latin1_CharToClass = ( OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F @@ -91,34 +94,34 @@ Latin1ClassModel = ( 0, 3, 1, 3, 1, 1, 1, 3, # ASV 0, 3, 1, 3, 1, 1, 3, 3, # ASO ) +# fmt: on class Latin1Prober(CharSetProber): - def __init__(self): - super(Latin1Prober, self).__init__() - self._last_char_class = None - self._freq_counter = None + def __init__(self) -> None: + super().__init__() + self._last_char_class = OTH + self._freq_counter: List[int] = [] self.reset() - def reset(self): + def reset(self) -> None: self._last_char_class = OTH self._freq_counter = [0] * FREQ_CAT_NUM - CharSetProber.reset(self) + super().reset() @property - def charset_name(self): + def charset_name(self) -> str: return "ISO-8859-1" @property - def language(self): + def language(self) -> str: return "" - def feed(self, byte_str): - byte_str = self.filter_with_english_letters(byte_str) + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + byte_str = self.remove_xml_tags(byte_str) for c in byte_str: char_class = Latin1_CharToClass[c] - freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM) - + char_class] + freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM) + char_class] if freq == 0: self._state = ProbingState.NOT_ME break @@ -127,19 +130,18 @@ class Latin1Prober(CharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: if self.state == ProbingState.NOT_ME: return 0.01 total = sum(self._freq_counter) - if total < 0.01: - confidence = 0.0 - else: - confidence = ((self._freq_counter[3] - self._freq_counter[1] * 20.0) - / total) - if confidence < 0.0: - confidence = 0.0 + confidence = ( + 0.0 + if total < 0.01 + else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total + ) + confidence = max(confidence, 0.0) # lower the confidence of latin1 so that other more accurate # detector can take priority. - confidence = confidence * 0.73 + confidence *= 0.73 return confidence diff --git a/lib/chardet/macromanprober.py b/lib/chardet/macromanprober.py new file mode 100644 index 00000000..bfcf1126 --- /dev/null +++ b/lib/chardet/macromanprober.py @@ -0,0 +1,162 @@ +######################## BEGIN LICENSE BLOCK ######################## +# This code was modified from latin1prober.py by Rob Speer . +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Rob Speer - adapt to MacRoman encoding +# Mark Pilgrim - port to Python +# Shy Shalom - original C code +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA +######################### END LICENSE BLOCK ######################### + +from typing import List, Union + +from .charsetprober import CharSetProber +from .enums import ProbingState + +FREQ_CAT_NUM = 4 + +UDF = 0 # undefined +OTH = 1 # other +ASC = 2 # ascii capital letter +ASS = 3 # ascii small letter +ACV = 4 # accent capital vowel +ACO = 5 # accent capital other +ASV = 6 # accent small vowel +ASO = 7 # accent small other +ODD = 8 # character that is unlikely to appear +CLASS_NUM = 9 # total classes + +# The change from Latin1 is that we explicitly look for extended characters +# that are infrequently-occurring symbols, and consider them to always be +# improbable. This should let MacRoman get out of the way of more likely +# encodings in most situations. + +# fmt: off +MacRoman_CharToClass = ( + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F + OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47 + ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F + ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57 + ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F + OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67 + ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F + ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77 + ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F + ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV, # 80 - 87 + ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV, # 88 - 8F + ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV, # 90 - 97 + ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # 98 - 9F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO, # A0 - A7 + OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV, # A8 - AF + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7 + OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV, # B8 - BF + OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH, # C0 - C7 + OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV, # C8 - CF + OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD, # D0 - D7 + ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH, # D8 - DF + OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV, # E0 - E7 + ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # E8 - EF + ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD, # F0 - F7 + ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD, # F8 - FF +) + +# 0 : illegal +# 1 : very unlikely +# 2 : normal +# 3 : very likely +MacRomanClassModel = ( +# UDF OTH ASC ASS ACV ACO ASV ASO ODD + 0, 0, 0, 0, 0, 0, 0, 0, 0, # UDF + 0, 3, 3, 3, 3, 3, 3, 3, 1, # OTH + 0, 3, 3, 3, 3, 3, 3, 3, 1, # ASC + 0, 3, 3, 3, 1, 1, 3, 3, 1, # ASS + 0, 3, 3, 3, 1, 2, 1, 2, 1, # ACV + 0, 3, 3, 3, 3, 3, 3, 3, 1, # ACO + 0, 3, 1, 3, 1, 1, 1, 3, 1, # ASV + 0, 3, 1, 3, 1, 1, 3, 3, 1, # ASO + 0, 1, 1, 1, 1, 1, 1, 1, 1, # ODD +) +# fmt: on + + +class MacRomanProber(CharSetProber): + def __init__(self) -> None: + super().__init__() + self._last_char_class = OTH + self._freq_counter: List[int] = [] + self.reset() + + def reset(self) -> None: + self._last_char_class = OTH + self._freq_counter = [0] * FREQ_CAT_NUM + + # express the prior that MacRoman is a somewhat rare encoding; + # this can be done by starting out in a slightly improbable state + # that must be overcome + self._freq_counter[2] = 10 + + super().reset() + + @property + def charset_name(self) -> str: + return "MacRoman" + + @property + def language(self) -> str: + return "" + + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + byte_str = self.remove_xml_tags(byte_str) + for c in byte_str: + char_class = MacRoman_CharToClass[c] + freq = MacRomanClassModel[(self._last_char_class * CLASS_NUM) + char_class] + if freq == 0: + self._state = ProbingState.NOT_ME + break + self._freq_counter[freq] += 1 + self._last_char_class = char_class + + return self.state + + def get_confidence(self) -> float: + if self.state == ProbingState.NOT_ME: + return 0.01 + + total = sum(self._freq_counter) + confidence = ( + 0.0 + if total < 0.01 + else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total + ) + confidence = max(confidence, 0.0) + # lower the confidence of MacRoman so that other more accurate + # detector can take priority. + confidence *= 0.73 + return confidence diff --git a/lib/chardet/mbcharsetprober.py b/lib/chardet/mbcharsetprober.py index 46091543..df8d3569 100644 --- a/lib/chardet/mbcharsetprober.py +++ b/lib/chardet/mbcharsetprober.py @@ -27,8 +27,12 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Optional, Union + +from .chardistribution import CharDistributionAnalysis from .charsetprober import CharSetProber -from .enums import ProbingState, MachineState +from .codingstatemachine import CodingStateMachine +from .enums import LanguageFilter, MachineState, ProbingState class MultiByteCharSetProber(CharSetProber): @@ -36,56 +40,56 @@ class MultiByteCharSetProber(CharSetProber): MultiByteCharSetProber """ - def __init__(self, lang_filter=None): - super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter) - self.distribution_analyzer = None - self.coding_sm = None - self._last_char = [0, 0] + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: + super().__init__(lang_filter=lang_filter) + self.distribution_analyzer: Optional[CharDistributionAnalysis] = None + self.coding_sm: Optional[CodingStateMachine] = None + self._last_char = bytearray(b"\0\0") - def reset(self): - super(MultiByteCharSetProber, self).reset() + def reset(self) -> None: + super().reset() if self.coding_sm: self.coding_sm.reset() if self.distribution_analyzer: self.distribution_analyzer.reset() - self._last_char = [0, 0] + self._last_char = bytearray(b"\0\0") - @property - def charset_name(self): - raise NotImplementedError + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + assert self.coding_sm is not None + assert self.distribution_analyzer is not None - @property - def language(self): - raise NotImplementedError - - def feed(self, byte_str): - for i in range(len(byte_str)): - coding_state = self.coding_sm.next_state(byte_str[i]) + for i, byte in enumerate(byte_str): + coding_state = self.coding_sm.next_state(byte) if coding_state == MachineState.ERROR: - self.logger.debug('%s %s prober hit error at byte %s', - self.charset_name, self.language, i) + self.logger.debug( + "%s %s prober hit error at byte %s", + self.charset_name, + self.language, + i, + ) self._state = ProbingState.NOT_ME break - elif coding_state == MachineState.ITS_ME: + if coding_state == MachineState.ITS_ME: self._state = ProbingState.FOUND_IT break - elif coding_state == MachineState.START: + if coding_state == MachineState.START: char_len = self.coding_sm.get_current_charlen() if i == 0: - self._last_char[1] = byte_str[0] + self._last_char[1] = byte self.distribution_analyzer.feed(self._last_char, char_len) else: - self.distribution_analyzer.feed(byte_str[i - 1:i + 1], - char_len) + self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len) self._last_char[0] = byte_str[-1] if self.state == ProbingState.DETECTING: - if (self.distribution_analyzer.got_enough_data() and - (self.get_confidence() > self.SHORTCUT_THRESHOLD)): + if self.distribution_analyzer.got_enough_data() and ( + self.get_confidence() > self.SHORTCUT_THRESHOLD + ): self._state = ProbingState.FOUND_IT return self.state - def get_confidence(self): + def get_confidence(self) -> float: + assert self.distribution_analyzer is not None return self.distribution_analyzer.get_confidence() diff --git a/lib/chardet/mbcsgroupprober.py b/lib/chardet/mbcsgroupprober.py index 4b049293..b648dc21 100644 --- a/lib/chardet/mbcsgroupprober.py +++ b/lib/chardet/mbcsgroupprober.py @@ -27,20 +27,22 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from .charsetgroupprober import CharSetGroupProber -from .utf8prober import UTF8Prober -from .sjisprober import SJISProber -from .eucjpprober import EUCJPProber -from .gb2312prober import GB2312Prober -from .euckrprober import EUCKRProber -from .cp949prober import CP949Prober from .big5prober import Big5Prober +from .charsetgroupprober import CharSetGroupProber +from .cp949prober import CP949Prober +from .enums import LanguageFilter +from .eucjpprober import EUCJPProber +from .euckrprober import EUCKRProber from .euctwprober import EUCTWProber +from .gb2312prober import GB2312Prober +from .johabprober import JOHABProber +from .sjisprober import SJISProber +from .utf8prober import UTF8Prober class MBCSGroupProber(CharSetGroupProber): - def __init__(self, lang_filter=None): - super(MBCSGroupProber, self).__init__(lang_filter=lang_filter) + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: + super().__init__(lang_filter=lang_filter) self.probers = [ UTF8Prober(), SJISProber(), @@ -49,6 +51,7 @@ class MBCSGroupProber(CharSetGroupProber): EUCKRProber(), CP949Prober(), Big5Prober(), - EUCTWProber() + EUCTWProber(), + JOHABProber(), ] self.reset() diff --git a/lib/chardet/mbcssm.py b/lib/chardet/mbcssm.py index d68f6f6c..bec529fb 100644 --- a/lib/chardet/mbcssm.py +++ b/lib/chardet/mbcssm.py @@ -25,43 +25,45 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from .codingstatemachinedict import CodingStateMachineDict from .enums import MachineState # BIG5 +# fmt: off BIG5_CLS = ( - 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 1,1,1,1,1,1,1,1, # 30 - 37 - 1,1,1,1,1,1,1,1, # 38 - 3f - 2,2,2,2,2,2,2,2, # 40 - 47 - 2,2,2,2,2,2,2,2, # 48 - 4f - 2,2,2,2,2,2,2,2, # 50 - 57 - 2,2,2,2,2,2,2,2, # 58 - 5f - 2,2,2,2,2,2,2,2, # 60 - 67 - 2,2,2,2,2,2,2,2, # 68 - 6f - 2,2,2,2,2,2,2,2, # 70 - 77 - 2,2,2,2,2,2,2,1, # 78 - 7f - 4,4,4,4,4,4,4,4, # 80 - 87 - 4,4,4,4,4,4,4,4, # 88 - 8f - 4,4,4,4,4,4,4,4, # 90 - 97 - 4,4,4,4,4,4,4,4, # 98 - 9f - 4,3,3,3,3,3,3,3, # a0 - a7 - 3,3,3,3,3,3,3,3, # a8 - af - 3,3,3,3,3,3,3,3, # b0 - b7 - 3,3,3,3,3,3,3,3, # b8 - bf - 3,3,3,3,3,3,3,3, # c0 - c7 - 3,3,3,3,3,3,3,3, # c8 - cf - 3,3,3,3,3,3,3,3, # d0 - d7 - 3,3,3,3,3,3,3,3, # d8 - df - 3,3,3,3,3,3,3,3, # e0 - e7 - 3,3,3,3,3,3,3,3, # e8 - ef - 3,3,3,3,3,3,3,3, # f0 - f7 - 3,3,3,3,3,3,3,0 # f8 - ff + 1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 #allow 0x00 as legal value + 1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f + 1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17 + 1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f + 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27 + 1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f + 1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37 + 1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f + 2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47 + 2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f + 2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57 + 2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f + 2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67 + 2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f + 2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77 + 2, 2, 2, 2, 2, 2, 2, 1, # 78 - 7f + 4, 4, 4, 4, 4, 4, 4, 4, # 80 - 87 + 4, 4, 4, 4, 4, 4, 4, 4, # 88 - 8f + 4, 4, 4, 4, 4, 4, 4, 4, # 90 - 97 + 4, 4, 4, 4, 4, 4, 4, 4, # 98 - 9f + 4, 3, 3, 3, 3, 3, 3, 3, # a0 - a7 + 3, 3, 3, 3, 3, 3, 3, 3, # a8 - af + 3, 3, 3, 3, 3, 3, 3, 3, # b0 - b7 + 3, 3, 3, 3, 3, 3, 3, 3, # b8 - bf + 3, 3, 3, 3, 3, 3, 3, 3, # c0 - c7 + 3, 3, 3, 3, 3, 3, 3, 3, # c8 - cf + 3, 3, 3, 3, 3, 3, 3, 3, # d0 - d7 + 3, 3, 3, 3, 3, 3, 3, 3, # d8 - df + 3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7 + 3, 3, 3, 3, 3, 3, 3, 3, # e8 - ef + 3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7 + 3, 3, 3, 3, 3, 3, 3, 0 # f8 - ff ) BIG5_ST = ( @@ -69,34 +71,37 @@ BIG5_ST = ( MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17 ) +# fmt: on BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0) -BIG5_SM_MODEL = {'class_table': BIG5_CLS, - 'class_factor': 5, - 'state_table': BIG5_ST, - 'char_len_table': BIG5_CHAR_LEN_TABLE, - 'name': 'Big5'} +BIG5_SM_MODEL: CodingStateMachineDict = { + "class_table": BIG5_CLS, + "class_factor": 5, + "state_table": BIG5_ST, + "char_len_table": BIG5_CHAR_LEN_TABLE, + "name": "Big5", +} # CP949 - +# fmt: off CP949_CLS = ( - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f - 1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f - 1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f - 4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f - 1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f - 5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f - 0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f - 6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f - 6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af - 7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf - 7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf - 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df - 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef - 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, # 00 - 0f + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, # 10 - 1f + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 2f + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 30 - 3f + 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, # 40 - 4f + 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, # 50 - 5f + 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, # 60 - 6f + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, # 70 - 7f + 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, # 80 - 8f + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, # 90 - 9f + 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, # a0 - af + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, # b0 - bf + 7, 7, 7, 7, 7, 7, 9, 2, 2, 3, 2, 2, 2, 2, 2, 2, # c0 - cf + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # d0 - df + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # e0 - ef + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, # f0 - ff ) CP949_ST = ( @@ -109,50 +114,53 @@ CP949_ST = ( MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5 MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6 ) +# fmt: on CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2) -CP949_SM_MODEL = {'class_table': CP949_CLS, - 'class_factor': 10, - 'state_table': CP949_ST, - 'char_len_table': CP949_CHAR_LEN_TABLE, - 'name': 'CP949'} +CP949_SM_MODEL: CodingStateMachineDict = { + "class_table": CP949_CLS, + "class_factor": 10, + "state_table": CP949_ST, + "char_len_table": CP949_CHAR_LEN_TABLE, + "name": "CP949", +} # EUC-JP - +# fmt: off EUCJP_CLS = ( - 4,4,4,4,4,4,4,4, # 00 - 07 - 4,4,4,4,4,4,5,5, # 08 - 0f - 4,4,4,4,4,4,4,4, # 10 - 17 - 4,4,4,5,4,4,4,4, # 18 - 1f - 4,4,4,4,4,4,4,4, # 20 - 27 - 4,4,4,4,4,4,4,4, # 28 - 2f - 4,4,4,4,4,4,4,4, # 30 - 37 - 4,4,4,4,4,4,4,4, # 38 - 3f - 4,4,4,4,4,4,4,4, # 40 - 47 - 4,4,4,4,4,4,4,4, # 48 - 4f - 4,4,4,4,4,4,4,4, # 50 - 57 - 4,4,4,4,4,4,4,4, # 58 - 5f - 4,4,4,4,4,4,4,4, # 60 - 67 - 4,4,4,4,4,4,4,4, # 68 - 6f - 4,4,4,4,4,4,4,4, # 70 - 77 - 4,4,4,4,4,4,4,4, # 78 - 7f - 5,5,5,5,5,5,5,5, # 80 - 87 - 5,5,5,5,5,5,1,3, # 88 - 8f - 5,5,5,5,5,5,5,5, # 90 - 97 - 5,5,5,5,5,5,5,5, # 98 - 9f - 5,2,2,2,2,2,2,2, # a0 - a7 - 2,2,2,2,2,2,2,2, # a8 - af - 2,2,2,2,2,2,2,2, # b0 - b7 - 2,2,2,2,2,2,2,2, # b8 - bf - 2,2,2,2,2,2,2,2, # c0 - c7 - 2,2,2,2,2,2,2,2, # c8 - cf - 2,2,2,2,2,2,2,2, # d0 - d7 - 2,2,2,2,2,2,2,2, # d8 - df - 0,0,0,0,0,0,0,0, # e0 - e7 - 0,0,0,0,0,0,0,0, # e8 - ef - 0,0,0,0,0,0,0,0, # f0 - f7 - 0,0,0,0,0,0,0,5 # f8 - ff + 4, 4, 4, 4, 4, 4, 4, 4, # 00 - 07 + 4, 4, 4, 4, 4, 4, 5, 5, # 08 - 0f + 4, 4, 4, 4, 4, 4, 4, 4, # 10 - 17 + 4, 4, 4, 5, 4, 4, 4, 4, # 18 - 1f + 4, 4, 4, 4, 4, 4, 4, 4, # 20 - 27 + 4, 4, 4, 4, 4, 4, 4, 4, # 28 - 2f + 4, 4, 4, 4, 4, 4, 4, 4, # 30 - 37 + 4, 4, 4, 4, 4, 4, 4, 4, # 38 - 3f + 4, 4, 4, 4, 4, 4, 4, 4, # 40 - 47 + 4, 4, 4, 4, 4, 4, 4, 4, # 48 - 4f + 4, 4, 4, 4, 4, 4, 4, 4, # 50 - 57 + 4, 4, 4, 4, 4, 4, 4, 4, # 58 - 5f + 4, 4, 4, 4, 4, 4, 4, 4, # 60 - 67 + 4, 4, 4, 4, 4, 4, 4, 4, # 68 - 6f + 4, 4, 4, 4, 4, 4, 4, 4, # 70 - 77 + 4, 4, 4, 4, 4, 4, 4, 4, # 78 - 7f + 5, 5, 5, 5, 5, 5, 5, 5, # 80 - 87 + 5, 5, 5, 5, 5, 5, 1, 3, # 88 - 8f + 5, 5, 5, 5, 5, 5, 5, 5, # 90 - 97 + 5, 5, 5, 5, 5, 5, 5, 5, # 98 - 9f + 5, 2, 2, 2, 2, 2, 2, 2, # a0 - a7 + 2, 2, 2, 2, 2, 2, 2, 2, # a8 - af + 2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7 + 2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf + 2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7 + 2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf + 2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7 + 2, 2, 2, 2, 2, 2, 2, 2, # d8 - df + 0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7 + 0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef + 0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7 + 0, 0, 0, 0, 0, 0, 0, 5 # f8 - ff ) EUCJP_ST = ( @@ -162,100 +170,163 @@ EUCJP_ST = ( MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27 ) +# fmt: on EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0) -EUCJP_SM_MODEL = {'class_table': EUCJP_CLS, - 'class_factor': 6, - 'state_table': EUCJP_ST, - 'char_len_table': EUCJP_CHAR_LEN_TABLE, - 'name': 'EUC-JP'} +EUCJP_SM_MODEL: CodingStateMachineDict = { + "class_table": EUCJP_CLS, + "class_factor": 6, + "state_table": EUCJP_ST, + "char_len_table": EUCJP_CHAR_LEN_TABLE, + "name": "EUC-JP", +} # EUC-KR - +# fmt: off EUCKR_CLS = ( - 1,1,1,1,1,1,1,1, # 00 - 07 - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 1,1,1,1,1,1,1,1, # 30 - 37 - 1,1,1,1,1,1,1,1, # 38 - 3f - 1,1,1,1,1,1,1,1, # 40 - 47 - 1,1,1,1,1,1,1,1, # 48 - 4f - 1,1,1,1,1,1,1,1, # 50 - 57 - 1,1,1,1,1,1,1,1, # 58 - 5f - 1,1,1,1,1,1,1,1, # 60 - 67 - 1,1,1,1,1,1,1,1, # 68 - 6f - 1,1,1,1,1,1,1,1, # 70 - 77 - 1,1,1,1,1,1,1,1, # 78 - 7f - 0,0,0,0,0,0,0,0, # 80 - 87 - 0,0,0,0,0,0,0,0, # 88 - 8f - 0,0,0,0,0,0,0,0, # 90 - 97 - 0,0,0,0,0,0,0,0, # 98 - 9f - 0,2,2,2,2,2,2,2, # a0 - a7 - 2,2,2,2,2,3,3,3, # a8 - af - 2,2,2,2,2,2,2,2, # b0 - b7 - 2,2,2,2,2,2,2,2, # b8 - bf - 2,2,2,2,2,2,2,2, # c0 - c7 - 2,3,2,2,2,2,2,2, # c8 - cf - 2,2,2,2,2,2,2,2, # d0 - d7 - 2,2,2,2,2,2,2,2, # d8 - df - 2,2,2,2,2,2,2,2, # e0 - e7 - 2,2,2,2,2,2,2,2, # e8 - ef - 2,2,2,2,2,2,2,2, # f0 - f7 - 2,2,2,2,2,2,2,0 # f8 - ff + 1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 + 1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f + 1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17 + 1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f + 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27 + 1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f + 1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37 + 1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f + 1, 1, 1, 1, 1, 1, 1, 1, # 40 - 47 + 1, 1, 1, 1, 1, 1, 1, 1, # 48 - 4f + 1, 1, 1, 1, 1, 1, 1, 1, # 50 - 57 + 1, 1, 1, 1, 1, 1, 1, 1, # 58 - 5f + 1, 1, 1, 1, 1, 1, 1, 1, # 60 - 67 + 1, 1, 1, 1, 1, 1, 1, 1, # 68 - 6f + 1, 1, 1, 1, 1, 1, 1, 1, # 70 - 77 + 1, 1, 1, 1, 1, 1, 1, 1, # 78 - 7f + 0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87 + 0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f + 0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97 + 0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f + 0, 2, 2, 2, 2, 2, 2, 2, # a0 - a7 + 2, 2, 2, 2, 2, 3, 3, 3, # a8 - af + 2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7 + 2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf + 2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7 + 2, 3, 2, 2, 2, 2, 2, 2, # c8 - cf + 2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7 + 2, 2, 2, 2, 2, 2, 2, 2, # d8 - df + 2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7 + 2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef + 2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7 + 2, 2, 2, 2, 2, 2, 2, 0 # f8 - ff ) EUCKR_ST = ( MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f ) +# fmt: on EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0) -EUCKR_SM_MODEL = {'class_table': EUCKR_CLS, - 'class_factor': 4, - 'state_table': EUCKR_ST, - 'char_len_table': EUCKR_CHAR_LEN_TABLE, - 'name': 'EUC-KR'} +EUCKR_SM_MODEL: CodingStateMachineDict = { + "class_table": EUCKR_CLS, + "class_factor": 4, + "state_table": EUCKR_ST, + "char_len_table": EUCKR_CHAR_LEN_TABLE, + "name": "EUC-KR", +} + +# JOHAB +# fmt: off +JOHAB_CLS = ( + 4,4,4,4,4,4,4,4, # 00 - 07 + 4,4,4,4,4,4,0,0, # 08 - 0f + 4,4,4,4,4,4,4,4, # 10 - 17 + 4,4,4,0,4,4,4,4, # 18 - 1f + 4,4,4,4,4,4,4,4, # 20 - 27 + 4,4,4,4,4,4,4,4, # 28 - 2f + 4,3,3,3,3,3,3,3, # 30 - 37 + 3,3,3,3,3,3,3,3, # 38 - 3f + 3,1,1,1,1,1,1,1, # 40 - 47 + 1,1,1,1,1,1,1,1, # 48 - 4f + 1,1,1,1,1,1,1,1, # 50 - 57 + 1,1,1,1,1,1,1,1, # 58 - 5f + 1,1,1,1,1,1,1,1, # 60 - 67 + 1,1,1,1,1,1,1,1, # 68 - 6f + 1,1,1,1,1,1,1,1, # 70 - 77 + 1,1,1,1,1,1,1,2, # 78 - 7f + 6,6,6,6,8,8,8,8, # 80 - 87 + 8,8,8,8,8,8,8,8, # 88 - 8f + 8,7,7,7,7,7,7,7, # 90 - 97 + 7,7,7,7,7,7,7,7, # 98 - 9f + 7,7,7,7,7,7,7,7, # a0 - a7 + 7,7,7,7,7,7,7,7, # a8 - af + 7,7,7,7,7,7,7,7, # b0 - b7 + 7,7,7,7,7,7,7,7, # b8 - bf + 7,7,7,7,7,7,7,7, # c0 - c7 + 7,7,7,7,7,7,7,7, # c8 - cf + 7,7,7,7,5,5,5,5, # d0 - d7 + 5,9,9,9,9,9,9,5, # d8 - df + 9,9,9,9,9,9,9,9, # e0 - e7 + 9,9,9,9,9,9,9,9, # e8 - ef + 9,9,9,9,9,9,9,9, # f0 - f7 + 9,9,5,5,5,5,5,0 # f8 - ff +) + +JOHAB_ST = ( +# cls = 0 1 2 3 4 5 6 7 8 9 + MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.ERROR ,MachineState.ERROR ,3 ,3 ,4 , # MachineState.START + MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # MachineState.ITS_ME + MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR , # MachineState.ERROR + MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.ERROR ,MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START , # 3 + MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START , # 4 +) +# fmt: on + +JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2) + +JOHAB_SM_MODEL: CodingStateMachineDict = { + "class_table": JOHAB_CLS, + "class_factor": 10, + "state_table": JOHAB_ST, + "char_len_table": JOHAB_CHAR_LEN_TABLE, + "name": "Johab", +} # EUC-TW - +# fmt: off EUCTW_CLS = ( - 2,2,2,2,2,2,2,2, # 00 - 07 - 2,2,2,2,2,2,0,0, # 08 - 0f - 2,2,2,2,2,2,2,2, # 10 - 17 - 2,2,2,0,2,2,2,2, # 18 - 1f - 2,2,2,2,2,2,2,2, # 20 - 27 - 2,2,2,2,2,2,2,2, # 28 - 2f - 2,2,2,2,2,2,2,2, # 30 - 37 - 2,2,2,2,2,2,2,2, # 38 - 3f - 2,2,2,2,2,2,2,2, # 40 - 47 - 2,2,2,2,2,2,2,2, # 48 - 4f - 2,2,2,2,2,2,2,2, # 50 - 57 - 2,2,2,2,2,2,2,2, # 58 - 5f - 2,2,2,2,2,2,2,2, # 60 - 67 - 2,2,2,2,2,2,2,2, # 68 - 6f - 2,2,2,2,2,2,2,2, # 70 - 77 - 2,2,2,2,2,2,2,2, # 78 - 7f - 0,0,0,0,0,0,0,0, # 80 - 87 - 0,0,0,0,0,0,6,0, # 88 - 8f - 0,0,0,0,0,0,0,0, # 90 - 97 - 0,0,0,0,0,0,0,0, # 98 - 9f - 0,3,4,4,4,4,4,4, # a0 - a7 - 5,5,1,1,1,1,1,1, # a8 - af - 1,1,1,1,1,1,1,1, # b0 - b7 - 1,1,1,1,1,1,1,1, # b8 - bf - 1,1,3,1,3,3,3,3, # c0 - c7 - 3,3,3,3,3,3,3,3, # c8 - cf - 3,3,3,3,3,3,3,3, # d0 - d7 - 3,3,3,3,3,3,3,3, # d8 - df - 3,3,3,3,3,3,3,3, # e0 - e7 - 3,3,3,3,3,3,3,3, # e8 - ef - 3,3,3,3,3,3,3,3, # f0 - f7 - 3,3,3,3,3,3,3,0 # f8 - ff + 2, 2, 2, 2, 2, 2, 2, 2, # 00 - 07 + 2, 2, 2, 2, 2, 2, 0, 0, # 08 - 0f + 2, 2, 2, 2, 2, 2, 2, 2, # 10 - 17 + 2, 2, 2, 0, 2, 2, 2, 2, # 18 - 1f + 2, 2, 2, 2, 2, 2, 2, 2, # 20 - 27 + 2, 2, 2, 2, 2, 2, 2, 2, # 28 - 2f + 2, 2, 2, 2, 2, 2, 2, 2, # 30 - 37 + 2, 2, 2, 2, 2, 2, 2, 2, # 38 - 3f + 2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47 + 2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f + 2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57 + 2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f + 2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67 + 2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f + 2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77 + 2, 2, 2, 2, 2, 2, 2, 2, # 78 - 7f + 0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87 + 0, 0, 0, 0, 0, 0, 6, 0, # 88 - 8f + 0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97 + 0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f + 0, 3, 4, 4, 4, 4, 4, 4, # a0 - a7 + 5, 5, 1, 1, 1, 1, 1, 1, # a8 - af + 1, 1, 1, 1, 1, 1, 1, 1, # b0 - b7 + 1, 1, 1, 1, 1, 1, 1, 1, # b8 - bf + 1, 1, 3, 1, 3, 3, 3, 3, # c0 - c7 + 3, 3, 3, 3, 3, 3, 3, 3, # c8 - cf + 3, 3, 3, 3, 3, 3, 3, 3, # d0 - d7 + 3, 3, 3, 3, 3, 3, 3, 3, # d8 - df + 3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7 + 3, 3, 3, 3, 3, 3, 3, 3, # e8 - ef + 3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7 + 3, 3, 3, 3, 3, 3, 3, 0 # f8 - ff ) EUCTW_ST = ( @@ -266,50 +337,53 @@ EUCTW_ST = ( 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27 MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f ) +# fmt: on EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3) -EUCTW_SM_MODEL = {'class_table': EUCTW_CLS, - 'class_factor': 7, - 'state_table': EUCTW_ST, - 'char_len_table': EUCTW_CHAR_LEN_TABLE, - 'name': 'x-euc-tw'} +EUCTW_SM_MODEL: CodingStateMachineDict = { + "class_table": EUCTW_CLS, + "class_factor": 7, + "state_table": EUCTW_ST, + "char_len_table": EUCTW_CHAR_LEN_TABLE, + "name": "x-euc-tw", +} # GB2312 - +# fmt: off GB2312_CLS = ( - 1,1,1,1,1,1,1,1, # 00 - 07 - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 3,3,3,3,3,3,3,3, # 30 - 37 - 3,3,1,1,1,1,1,1, # 38 - 3f - 2,2,2,2,2,2,2,2, # 40 - 47 - 2,2,2,2,2,2,2,2, # 48 - 4f - 2,2,2,2,2,2,2,2, # 50 - 57 - 2,2,2,2,2,2,2,2, # 58 - 5f - 2,2,2,2,2,2,2,2, # 60 - 67 - 2,2,2,2,2,2,2,2, # 68 - 6f - 2,2,2,2,2,2,2,2, # 70 - 77 - 2,2,2,2,2,2,2,4, # 78 - 7f - 5,6,6,6,6,6,6,6, # 80 - 87 - 6,6,6,6,6,6,6,6, # 88 - 8f - 6,6,6,6,6,6,6,6, # 90 - 97 - 6,6,6,6,6,6,6,6, # 98 - 9f - 6,6,6,6,6,6,6,6, # a0 - a7 - 6,6,6,6,6,6,6,6, # a8 - af - 6,6,6,6,6,6,6,6, # b0 - b7 - 6,6,6,6,6,6,6,6, # b8 - bf - 6,6,6,6,6,6,6,6, # c0 - c7 - 6,6,6,6,6,6,6,6, # c8 - cf - 6,6,6,6,6,6,6,6, # d0 - d7 - 6,6,6,6,6,6,6,6, # d8 - df - 6,6,6,6,6,6,6,6, # e0 - e7 - 6,6,6,6,6,6,6,6, # e8 - ef - 6,6,6,6,6,6,6,6, # f0 - f7 - 6,6,6,6,6,6,6,0 # f8 - ff + 1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 + 1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f + 1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17 + 1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f + 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27 + 1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f + 3, 3, 3, 3, 3, 3, 3, 3, # 30 - 37 + 3, 3, 1, 1, 1, 1, 1, 1, # 38 - 3f + 2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47 + 2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f + 2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57 + 2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f + 2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67 + 2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f + 2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77 + 2, 2, 2, 2, 2, 2, 2, 4, # 78 - 7f + 5, 6, 6, 6, 6, 6, 6, 6, # 80 - 87 + 6, 6, 6, 6, 6, 6, 6, 6, # 88 - 8f + 6, 6, 6, 6, 6, 6, 6, 6, # 90 - 97 + 6, 6, 6, 6, 6, 6, 6, 6, # 98 - 9f + 6, 6, 6, 6, 6, 6, 6, 6, # a0 - a7 + 6, 6, 6, 6, 6, 6, 6, 6, # a8 - af + 6, 6, 6, 6, 6, 6, 6, 6, # b0 - b7 + 6, 6, 6, 6, 6, 6, 6, 6, # b8 - bf + 6, 6, 6, 6, 6, 6, 6, 6, # c0 - c7 + 6, 6, 6, 6, 6, 6, 6, 6, # c8 - cf + 6, 6, 6, 6, 6, 6, 6, 6, # d0 - d7 + 6, 6, 6, 6, 6, 6, 6, 6, # d8 - df + 6, 6, 6, 6, 6, 6, 6, 6, # e0 - e7 + 6, 6, 6, 6, 6, 6, 6, 6, # e8 - ef + 6, 6, 6, 6, 6, 6, 6, 6, # f0 - f7 + 6, 6, 6, 6, 6, 6, 6, 0 # f8 - ff ) GB2312_ST = ( @@ -320,6 +394,7 @@ GB2312_ST = ( MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27 MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f ) +# fmt: on # To be accurate, the length of class 6 can be either 2 or 4. # But it is not necessary to discriminate between the two since @@ -328,100 +403,105 @@ GB2312_ST = ( # 2 here. GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2) -GB2312_SM_MODEL = {'class_table': GB2312_CLS, - 'class_factor': 7, - 'state_table': GB2312_ST, - 'char_len_table': GB2312_CHAR_LEN_TABLE, - 'name': 'GB2312'} +GB2312_SM_MODEL: CodingStateMachineDict = { + "class_table": GB2312_CLS, + "class_factor": 7, + "state_table": GB2312_ST, + "char_len_table": GB2312_CHAR_LEN_TABLE, + "name": "GB2312", +} # Shift_JIS - +# fmt: off SJIS_CLS = ( - 1,1,1,1,1,1,1,1, # 00 - 07 - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 1,1,1,1,1,1,1,1, # 30 - 37 - 1,1,1,1,1,1,1,1, # 38 - 3f - 2,2,2,2,2,2,2,2, # 40 - 47 - 2,2,2,2,2,2,2,2, # 48 - 4f - 2,2,2,2,2,2,2,2, # 50 - 57 - 2,2,2,2,2,2,2,2, # 58 - 5f - 2,2,2,2,2,2,2,2, # 60 - 67 - 2,2,2,2,2,2,2,2, # 68 - 6f - 2,2,2,2,2,2,2,2, # 70 - 77 - 2,2,2,2,2,2,2,1, # 78 - 7f - 3,3,3,3,3,2,2,3, # 80 - 87 - 3,3,3,3,3,3,3,3, # 88 - 8f - 3,3,3,3,3,3,3,3, # 90 - 97 - 3,3,3,3,3,3,3,3, # 98 - 9f + 1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 + 1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f + 1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17 + 1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f + 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27 + 1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f + 1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37 + 1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f + 2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47 + 2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f + 2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57 + 2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f + 2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67 + 2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f + 2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77 + 2, 2, 2, 2, 2, 2, 2, 1, # 78 - 7f + 3, 3, 3, 3, 3, 2, 2, 3, # 80 - 87 + 3, 3, 3, 3, 3, 3, 3, 3, # 88 - 8f + 3, 3, 3, 3, 3, 3, 3, 3, # 90 - 97 + 3, 3, 3, 3, 3, 3, 3, 3, # 98 - 9f #0xa0 is illegal in sjis encoding, but some pages does #contain such byte. We need to be more error forgiven. - 2,2,2,2,2,2,2,2, # a0 - a7 - 2,2,2,2,2,2,2,2, # a8 - af - 2,2,2,2,2,2,2,2, # b0 - b7 - 2,2,2,2,2,2,2,2, # b8 - bf - 2,2,2,2,2,2,2,2, # c0 - c7 - 2,2,2,2,2,2,2,2, # c8 - cf - 2,2,2,2,2,2,2,2, # d0 - d7 - 2,2,2,2,2,2,2,2, # d8 - df - 3,3,3,3,3,3,3,3, # e0 - e7 - 3,3,3,3,3,4,4,4, # e8 - ef - 3,3,3,3,3,3,3,3, # f0 - f7 - 3,3,3,3,3,0,0,0) # f8 - ff - + 2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7 + 2, 2, 2, 2, 2, 2, 2, 2, # a8 - af + 2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7 + 2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf + 2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7 + 2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf + 2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7 + 2, 2, 2, 2, 2, 2, 2, 2, # d8 - df + 3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7 + 3, 3, 3, 3, 3, 4, 4, 4, # e8 - ef + 3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7 + 3, 3, 3, 3, 3, 0, 0, 0, # f8 - ff +) SJIS_ST = ( MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17 ) +# fmt: on SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0) -SJIS_SM_MODEL = {'class_table': SJIS_CLS, - 'class_factor': 6, - 'state_table': SJIS_ST, - 'char_len_table': SJIS_CHAR_LEN_TABLE, - 'name': 'Shift_JIS'} +SJIS_SM_MODEL: CodingStateMachineDict = { + "class_table": SJIS_CLS, + "class_factor": 6, + "state_table": SJIS_ST, + "char_len_table": SJIS_CHAR_LEN_TABLE, + "name": "Shift_JIS", +} # UCS2-BE - +# fmt: off UCS2BE_CLS = ( - 0,0,0,0,0,0,0,0, # 00 - 07 - 0,0,1,0,0,2,0,0, # 08 - 0f - 0,0,0,0,0,0,0,0, # 10 - 17 - 0,0,0,3,0,0,0,0, # 18 - 1f - 0,0,0,0,0,0,0,0, # 20 - 27 - 0,3,3,3,3,3,0,0, # 28 - 2f - 0,0,0,0,0,0,0,0, # 30 - 37 - 0,0,0,0,0,0,0,0, # 38 - 3f - 0,0,0,0,0,0,0,0, # 40 - 47 - 0,0,0,0,0,0,0,0, # 48 - 4f - 0,0,0,0,0,0,0,0, # 50 - 57 - 0,0,0,0,0,0,0,0, # 58 - 5f - 0,0,0,0,0,0,0,0, # 60 - 67 - 0,0,0,0,0,0,0,0, # 68 - 6f - 0,0,0,0,0,0,0,0, # 70 - 77 - 0,0,0,0,0,0,0,0, # 78 - 7f - 0,0,0,0,0,0,0,0, # 80 - 87 - 0,0,0,0,0,0,0,0, # 88 - 8f - 0,0,0,0,0,0,0,0, # 90 - 97 - 0,0,0,0,0,0,0,0, # 98 - 9f - 0,0,0,0,0,0,0,0, # a0 - a7 - 0,0,0,0,0,0,0,0, # a8 - af - 0,0,0,0,0,0,0,0, # b0 - b7 - 0,0,0,0,0,0,0,0, # b8 - bf - 0,0,0,0,0,0,0,0, # c0 - c7 - 0,0,0,0,0,0,0,0, # c8 - cf - 0,0,0,0,0,0,0,0, # d0 - d7 - 0,0,0,0,0,0,0,0, # d8 - df - 0,0,0,0,0,0,0,0, # e0 - e7 - 0,0,0,0,0,0,0,0, # e8 - ef - 0,0,0,0,0,0,0,0, # f0 - f7 - 0,0,0,0,0,0,4,5 # f8 - ff + 0, 0, 0, 0, 0, 0, 0, 0, # 00 - 07 + 0, 0, 1, 0, 0, 2, 0, 0, # 08 - 0f + 0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17 + 0, 0, 0, 3, 0, 0, 0, 0, # 18 - 1f + 0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27 + 0, 3, 3, 3, 3, 3, 0, 0, # 28 - 2f + 0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37 + 0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f + 0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47 + 0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f + 0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57 + 0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f + 0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67 + 0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f + 0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77 + 0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f + 0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87 + 0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f + 0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97 + 0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f + 0, 0, 0, 0, 0, 0, 0, 0, # a0 - a7 + 0, 0, 0, 0, 0, 0, 0, 0, # a8 - af + 0, 0, 0, 0, 0, 0, 0, 0, # b0 - b7 + 0, 0, 0, 0, 0, 0, 0, 0, # b8 - bf + 0, 0, 0, 0, 0, 0, 0, 0, # c0 - c7 + 0, 0, 0, 0, 0, 0, 0, 0, # c8 - cf + 0, 0, 0, 0, 0, 0, 0, 0, # d0 - d7 + 0, 0, 0, 0, 0, 0, 0, 0, # d8 - df + 0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7 + 0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef + 0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7 + 0, 0, 0, 0, 0, 0, 4, 5 # f8 - ff ) UCS2BE_ST = ( @@ -433,50 +513,53 @@ UCS2BE_ST = ( 5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f 6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37 ) +# fmt: on UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2) -UCS2BE_SM_MODEL = {'class_table': UCS2BE_CLS, - 'class_factor': 6, - 'state_table': UCS2BE_ST, - 'char_len_table': UCS2BE_CHAR_LEN_TABLE, - 'name': 'UTF-16BE'} +UCS2BE_SM_MODEL: CodingStateMachineDict = { + "class_table": UCS2BE_CLS, + "class_factor": 6, + "state_table": UCS2BE_ST, + "char_len_table": UCS2BE_CHAR_LEN_TABLE, + "name": "UTF-16BE", +} # UCS2-LE - +# fmt: off UCS2LE_CLS = ( - 0,0,0,0,0,0,0,0, # 00 - 07 - 0,0,1,0,0,2,0,0, # 08 - 0f - 0,0,0,0,0,0,0,0, # 10 - 17 - 0,0,0,3,0,0,0,0, # 18 - 1f - 0,0,0,0,0,0,0,0, # 20 - 27 - 0,3,3,3,3,3,0,0, # 28 - 2f - 0,0,0,0,0,0,0,0, # 30 - 37 - 0,0,0,0,0,0,0,0, # 38 - 3f - 0,0,0,0,0,0,0,0, # 40 - 47 - 0,0,0,0,0,0,0,0, # 48 - 4f - 0,0,0,0,0,0,0,0, # 50 - 57 - 0,0,0,0,0,0,0,0, # 58 - 5f - 0,0,0,0,0,0,0,0, # 60 - 67 - 0,0,0,0,0,0,0,0, # 68 - 6f - 0,0,0,0,0,0,0,0, # 70 - 77 - 0,0,0,0,0,0,0,0, # 78 - 7f - 0,0,0,0,0,0,0,0, # 80 - 87 - 0,0,0,0,0,0,0,0, # 88 - 8f - 0,0,0,0,0,0,0,0, # 90 - 97 - 0,0,0,0,0,0,0,0, # 98 - 9f - 0,0,0,0,0,0,0,0, # a0 - a7 - 0,0,0,0,0,0,0,0, # a8 - af - 0,0,0,0,0,0,0,0, # b0 - b7 - 0,0,0,0,0,0,0,0, # b8 - bf - 0,0,0,0,0,0,0,0, # c0 - c7 - 0,0,0,0,0,0,0,0, # c8 - cf - 0,0,0,0,0,0,0,0, # d0 - d7 - 0,0,0,0,0,0,0,0, # d8 - df - 0,0,0,0,0,0,0,0, # e0 - e7 - 0,0,0,0,0,0,0,0, # e8 - ef - 0,0,0,0,0,0,0,0, # f0 - f7 - 0,0,0,0,0,0,4,5 # f8 - ff + 0, 0, 0, 0, 0, 0, 0, 0, # 00 - 07 + 0, 0, 1, 0, 0, 2, 0, 0, # 08 - 0f + 0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17 + 0, 0, 0, 3, 0, 0, 0, 0, # 18 - 1f + 0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27 + 0, 3, 3, 3, 3, 3, 0, 0, # 28 - 2f + 0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37 + 0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f + 0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47 + 0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f + 0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57 + 0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f + 0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67 + 0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f + 0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77 + 0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f + 0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87 + 0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f + 0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97 + 0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f + 0, 0, 0, 0, 0, 0, 0, 0, # a0 - a7 + 0, 0, 0, 0, 0, 0, 0, 0, # a8 - af + 0, 0, 0, 0, 0, 0, 0, 0, # b0 - b7 + 0, 0, 0, 0, 0, 0, 0, 0, # b8 - bf + 0, 0, 0, 0, 0, 0, 0, 0, # c0 - c7 + 0, 0, 0, 0, 0, 0, 0, 0, # c8 - cf + 0, 0, 0, 0, 0, 0, 0, 0, # d0 - d7 + 0, 0, 0, 0, 0, 0, 0, 0, # d8 - df + 0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7 + 0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef + 0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7 + 0, 0, 0, 0, 0, 0, 4, 5 # f8 - ff ) UCS2LE_ST = ( @@ -488,50 +571,53 @@ UCS2LE_ST = ( 5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f 5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37 ) +# fmt: on UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2) -UCS2LE_SM_MODEL = {'class_table': UCS2LE_CLS, - 'class_factor': 6, - 'state_table': UCS2LE_ST, - 'char_len_table': UCS2LE_CHAR_LEN_TABLE, - 'name': 'UTF-16LE'} +UCS2LE_SM_MODEL: CodingStateMachineDict = { + "class_table": UCS2LE_CLS, + "class_factor": 6, + "state_table": UCS2LE_ST, + "char_len_table": UCS2LE_CHAR_LEN_TABLE, + "name": "UTF-16LE", +} # UTF-8 - +# fmt: off UTF8_CLS = ( - 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 1,1,1,1,1,1,1,1, # 30 - 37 - 1,1,1,1,1,1,1,1, # 38 - 3f - 1,1,1,1,1,1,1,1, # 40 - 47 - 1,1,1,1,1,1,1,1, # 48 - 4f - 1,1,1,1,1,1,1,1, # 50 - 57 - 1,1,1,1,1,1,1,1, # 58 - 5f - 1,1,1,1,1,1,1,1, # 60 - 67 - 1,1,1,1,1,1,1,1, # 68 - 6f - 1,1,1,1,1,1,1,1, # 70 - 77 - 1,1,1,1,1,1,1,1, # 78 - 7f - 2,2,2,2,3,3,3,3, # 80 - 87 - 4,4,4,4,4,4,4,4, # 88 - 8f - 4,4,4,4,4,4,4,4, # 90 - 97 - 4,4,4,4,4,4,4,4, # 98 - 9f - 5,5,5,5,5,5,5,5, # a0 - a7 - 5,5,5,5,5,5,5,5, # a8 - af - 5,5,5,5,5,5,5,5, # b0 - b7 - 5,5,5,5,5,5,5,5, # b8 - bf - 0,0,6,6,6,6,6,6, # c0 - c7 - 6,6,6,6,6,6,6,6, # c8 - cf - 6,6,6,6,6,6,6,6, # d0 - d7 - 6,6,6,6,6,6,6,6, # d8 - df - 7,8,8,8,8,8,8,8, # e0 - e7 - 8,8,8,8,8,9,8,8, # e8 - ef - 10,11,11,11,11,11,11,11, # f0 - f7 - 12,13,13,13,14,15,0,0 # f8 - ff + 1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 #allow 0x00 as a legal value + 1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f + 1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17 + 1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f + 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27 + 1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f + 1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37 + 1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f + 1, 1, 1, 1, 1, 1, 1, 1, # 40 - 47 + 1, 1, 1, 1, 1, 1, 1, 1, # 48 - 4f + 1, 1, 1, 1, 1, 1, 1, 1, # 50 - 57 + 1, 1, 1, 1, 1, 1, 1, 1, # 58 - 5f + 1, 1, 1, 1, 1, 1, 1, 1, # 60 - 67 + 1, 1, 1, 1, 1, 1, 1, 1, # 68 - 6f + 1, 1, 1, 1, 1, 1, 1, 1, # 70 - 77 + 1, 1, 1, 1, 1, 1, 1, 1, # 78 - 7f + 2, 2, 2, 2, 3, 3, 3, 3, # 80 - 87 + 4, 4, 4, 4, 4, 4, 4, 4, # 88 - 8f + 4, 4, 4, 4, 4, 4, 4, 4, # 90 - 97 + 4, 4, 4, 4, 4, 4, 4, 4, # 98 - 9f + 5, 5, 5, 5, 5, 5, 5, 5, # a0 - a7 + 5, 5, 5, 5, 5, 5, 5, 5, # a8 - af + 5, 5, 5, 5, 5, 5, 5, 5, # b0 - b7 + 5, 5, 5, 5, 5, 5, 5, 5, # b8 - bf + 0, 0, 6, 6, 6, 6, 6, 6, # c0 - c7 + 6, 6, 6, 6, 6, 6, 6, 6, # c8 - cf + 6, 6, 6, 6, 6, 6, 6, 6, # d0 - d7 + 6, 6, 6, 6, 6, 6, 6, 6, # d8 - df + 7, 8, 8, 8, 8, 8, 8, 8, # e0 - e7 + 8, 8, 8, 8, 8, 9, 8, 8, # e8 - ef + 10, 11, 11, 11, 11, 11, 11, 11, # f0 - f7 + 12, 13, 13, 13, 14, 15, 0, 0 # f8 - ff ) UTF8_ST = ( @@ -562,11 +648,14 @@ UTF8_ST = ( MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf ) +# fmt: on UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6) -UTF8_SM_MODEL = {'class_table': UTF8_CLS, - 'class_factor': 16, - 'state_table': UTF8_ST, - 'char_len_table': UTF8_CHAR_LEN_TABLE, - 'name': 'UTF-8'} +UTF8_SM_MODEL: CodingStateMachineDict = { + "class_table": UTF8_CLS, + "class_factor": 16, + "state_table": UTF8_ST, + "char_len_table": UTF8_CHAR_LEN_TABLE, + "name": "UTF-8", +} diff --git a/lib/chardet/metadata/languages.py b/lib/chardet/metadata/languages.py index bf63a847..57e27438 100644 --- a/lib/chardet/metadata/languages.py +++ b/lib/chardet/metadata/languages.py @@ -1,19 +1,17 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- """ Metadata about languages used by our model training code for our SingleByteCharSetProbers. Could be used for other things in the future. This code is based on the language metadata from the uchardet project. """ -from __future__ import absolute_import, print_function from string import ascii_letters +from typing import List, Optional + +# TODO: Add Ukrainian (KOI8-U) -# TODO: Add Ukranian (KOI8-U) - -class Language(object): +class Language: """Metadata about a language useful for training models :ivar name: The human name for the language, in English. @@ -33,9 +31,17 @@ class Language(object): Wikipedia for training data. :type wiki_start_pages: list of str """ - def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None, - alphabet=None, wiki_start_pages=None): - super(Language, self).__init__() + + def __init__( + self, + name: Optional[str] = None, + iso_code: Optional[str] = None, + use_ascii: bool = True, + charsets: Optional[List[str]] = None, + alphabet: Optional[str] = None, + wiki_start_pages: Optional[List[str]] = None, + ) -> None: + super().__init__() self.name = name self.iso_code = iso_code self.use_ascii = use_ascii @@ -46,265 +52,301 @@ class Language(object): else: alphabet = ascii_letters elif not alphabet: - raise ValueError('Must supply alphabet if use_ascii is False') - self.alphabet = ''.join(sorted(set(alphabet))) if alphabet else None + raise ValueError("Must supply alphabet if use_ascii is False") + self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None self.wiki_start_pages = wiki_start_pages - def __repr__(self): - return '{}({})'.format(self.__class__.__name__, - ', '.join('{}={!r}'.format(k, v) - for k, v in self.__dict__.items() - if not k.startswith('_'))) + def __repr__(self) -> str: + param_str = ", ".join( + f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_") + ) + return f"{self.__class__.__name__}({param_str})" -LANGUAGES = {'Arabic': Language(name='Arabic', - iso_code='ar', - use_ascii=False, - # We only support encodings that use isolated - # forms, because the current recommendation is - # that the rendering system handles presentation - # forms. This means we purposefully skip IBM864. - charsets=['ISO-8859-6', 'WINDOWS-1256', - 'CP720', 'CP864'], - alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ', - wiki_start_pages=[u'الصفحة_الرئيسية']), - 'Belarusian': Language(name='Belarusian', - iso_code='be', - use_ascii=False, - charsets=['ISO-8859-5', 'WINDOWS-1251', - 'IBM866', 'MacCyrillic'], - alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ' - u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'), - wiki_start_pages=[u'Галоўная_старонка']), - 'Bulgarian': Language(name='Bulgarian', - iso_code='bg', - use_ascii=False, - charsets=['ISO-8859-5', 'WINDOWS-1251', - 'IBM855'], - alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ' - u'абвгдежзийклмнопрстуфхцчшщъьюя'), - wiki_start_pages=[u'Начална_страница']), - 'Czech': Language(name='Czech', - iso_code='cz', - use_ascii=True, - charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ', - wiki_start_pages=[u'Hlavní_strana']), - 'Danish': Language(name='Danish', - iso_code='da', - use_ascii=True, - charsets=['ISO-8859-1', 'ISO-8859-15', - 'WINDOWS-1252'], - alphabet=u'æøåÆØÅ', - wiki_start_pages=[u'Forside']), - 'German': Language(name='German', - iso_code='de', - use_ascii=True, - charsets=['ISO-8859-1', 'WINDOWS-1252'], - alphabet=u'äöüßÄÖÜ', - wiki_start_pages=[u'Wikipedia:Hauptseite']), - 'Greek': Language(name='Greek', - iso_code='el', - use_ascii=False, - charsets=['ISO-8859-7', 'WINDOWS-1253'], - alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ' - u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'), - wiki_start_pages=[u'Πύλη:Κύρια']), - 'English': Language(name='English', - iso_code='en', - use_ascii=True, - charsets=['ISO-8859-1', 'WINDOWS-1252'], - wiki_start_pages=[u'Main_Page']), - 'Esperanto': Language(name='Esperanto', - iso_code='eo', - # Q, W, X, and Y not used at all - use_ascii=False, - charsets=['ISO-8859-3'], - alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz' - u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'), - wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']), - 'Spanish': Language(name='Spanish', - iso_code='es', - use_ascii=True, - charsets=['ISO-8859-1', 'ISO-8859-15', - 'WINDOWS-1252'], - alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ', - wiki_start_pages=[u'Wikipedia:Portada']), - 'Estonian': Language(name='Estonian', - iso_code='et', - use_ascii=False, - charsets=['ISO-8859-4', 'ISO-8859-13', - 'WINDOWS-1257'], - # C, F, Š, Q, W, X, Y, Z, Ž are only for - # loanwords - alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ' - u'abdeghijklmnoprstuvõäöü'), - wiki_start_pages=[u'Esileht']), - 'Finnish': Language(name='Finnish', - iso_code='fi', - use_ascii=True, - charsets=['ISO-8859-1', 'ISO-8859-15', - 'WINDOWS-1252'], - alphabet=u'ÅÄÖŠŽåäöšž', - wiki_start_pages=[u'Wikipedia:Etusivu']), - 'French': Language(name='French', - iso_code='fr', - use_ascii=True, - charsets=['ISO-8859-1', 'ISO-8859-15', - 'WINDOWS-1252'], - alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ', - wiki_start_pages=[u'Wikipédia:Accueil_principal', - u'Bœuf (animal)']), - 'Hebrew': Language(name='Hebrew', - iso_code='he', - use_ascii=False, - charsets=['ISO-8859-8', 'WINDOWS-1255'], - alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ', - wiki_start_pages=[u'עמוד_ראשי']), - 'Croatian': Language(name='Croatian', - iso_code='hr', - # Q, W, X, Y are only used for foreign words. - use_ascii=False, - charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=(u'abcčćdđefghijklmnoprsštuvzž' - u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'), - wiki_start_pages=[u'Glavna_stranica']), - 'Hungarian': Language(name='Hungarian', - iso_code='hu', - # Q, W, X, Y are only used for foreign words. - use_ascii=False, - charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű' - u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'), - wiki_start_pages=[u'Kezdőlap']), - 'Italian': Language(name='Italian', - iso_code='it', - use_ascii=True, - charsets=['ISO-8859-1', 'ISO-8859-15', - 'WINDOWS-1252'], - alphabet=u'ÀÈÉÌÒÓÙàèéìòóù', - wiki_start_pages=[u'Pagina_principale']), - 'Lithuanian': Language(name='Lithuanian', - iso_code='lt', - use_ascii=False, - charsets=['ISO-8859-13', 'WINDOWS-1257', - 'ISO-8859-4'], - # Q, W, and X not used at all - alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ' - u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'), - wiki_start_pages=[u'Pagrindinis_puslapis']), - 'Latvian': Language(name='Latvian', - iso_code='lv', - use_ascii=False, - charsets=['ISO-8859-13', 'WINDOWS-1257', - 'ISO-8859-4'], - # Q, W, X, Y are only for loanwords - alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ' - u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'), - wiki_start_pages=[u'Sākumlapa']), - 'Macedonian': Language(name='Macedonian', - iso_code='mk', - use_ascii=False, - charsets=['ISO-8859-5', 'WINDOWS-1251', - 'MacCyrillic', 'IBM855'], - alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ' - u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'), - wiki_start_pages=[u'Главна_страница']), - 'Dutch': Language(name='Dutch', - iso_code='nl', - use_ascii=True, - charsets=['ISO-8859-1', 'WINDOWS-1252'], - wiki_start_pages=[u'Hoofdpagina']), - 'Polish': Language(name='Polish', - iso_code='pl', - # Q and X are only used for foreign words. - use_ascii=False, - charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ' - u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'), - wiki_start_pages=[u'Wikipedia:Strona_główna']), - 'Portuguese': Language(name='Portuguese', - iso_code='pt', - use_ascii=True, - charsets=['ISO-8859-1', 'ISO-8859-15', - 'WINDOWS-1252'], - alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú', - wiki_start_pages=[u'Wikipédia:Página_principal']), - 'Romanian': Language(name='Romanian', - iso_code='ro', - use_ascii=True, - charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=u'ăâîșțĂÂÎȘȚ', - wiki_start_pages=[u'Pagina_principală']), - 'Russian': Language(name='Russian', - iso_code='ru', - use_ascii=False, - charsets=['ISO-8859-5', 'WINDOWS-1251', - 'KOI8-R', 'MacCyrillic', 'IBM866', - 'IBM855'], - alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя' - u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'), - wiki_start_pages=[u'Заглавная_страница']), - 'Slovak': Language(name='Slovak', - iso_code='sk', - use_ascii=True, - charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ', - wiki_start_pages=[u'Hlavná_stránka']), - 'Slovene': Language(name='Slovene', - iso_code='sl', - # Q, W, X, Y are only used for foreign words. - use_ascii=False, - charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=(u'abcčdefghijklmnoprsštuvzž' - u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'), - wiki_start_pages=[u'Glavna_stran']), - # Serbian can be written in both Latin and Cyrillic, but there's no - # simple way to get the Latin alphabet pages from Wikipedia through - # the API, so for now we just support Cyrillic. - 'Serbian': Language(name='Serbian', - iso_code='sr', - alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ' - u'абвгдђежзијклљмнњопрстћуфхцчџш'), - charsets=['ISO-8859-5', 'WINDOWS-1251', - 'MacCyrillic', 'IBM855'], - wiki_start_pages=[u'Главна_страна']), - 'Thai': Language(name='Thai', - iso_code='th', - use_ascii=False, - charsets=['ISO-8859-11', 'TIS-620', 'CP874'], - alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛', - wiki_start_pages=[u'หน้าหลัก']), - 'Turkish': Language(name='Turkish', - iso_code='tr', - # Q, W, and X are not used by Turkish - use_ascii=False, - charsets=['ISO-8859-3', 'ISO-8859-9', - 'WINDOWS-1254'], - alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû' - u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'), - wiki_start_pages=[u'Ana_Sayfa']), - 'Vietnamese': Language(name='Vietnamese', - iso_code='vi', - use_ascii=False, - # Windows-1258 is the only common 8-bit - # Vietnamese encoding supported by Python. - # From Wikipedia: - # For systems that lack support for Unicode, - # dozens of 8-bit Vietnamese code pages are - # available.[1] The most common are VISCII - # (TCVN 5712:1993), VPS, and Windows-1258.[3] - # Where ASCII is required, such as when - # ensuring readability in plain text e-mail, - # Vietnamese letters are often encoded - # according to Vietnamese Quoted-Readable - # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4] - # though usage of either variable-width - # scheme has declined dramatically following - # the adoption of Unicode on the World Wide - # Web. - charsets=['WINDOWS-1258'], - alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy' - u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'), - wiki_start_pages=[u'Chữ_Quốc_ngữ']), - } +LANGUAGES = { + "Arabic": Language( + name="Arabic", + iso_code="ar", + use_ascii=False, + # We only support encodings that use isolated + # forms, because the current recommendation is + # that the rendering system handles presentation + # forms. This means we purposefully skip IBM864. + charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"], + alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ", + wiki_start_pages=["الصفحة_الرئيسية"], + ), + "Belarusian": Language( + name="Belarusian", + iso_code="be", + use_ascii=False, + charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"], + alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ", + wiki_start_pages=["Галоўная_старонка"], + ), + "Bulgarian": Language( + name="Bulgarian", + iso_code="bg", + use_ascii=False, + charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"], + alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя", + wiki_start_pages=["Начална_страница"], + ), + "Czech": Language( + name="Czech", + iso_code="cz", + use_ascii=True, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ", + wiki_start_pages=["Hlavní_strana"], + ), + "Danish": Language( + name="Danish", + iso_code="da", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], + alphabet="æøåÆØÅ", + wiki_start_pages=["Forside"], + ), + "German": Language( + name="German", + iso_code="de", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], + alphabet="äöüßẞÄÖÜ", + wiki_start_pages=["Wikipedia:Hauptseite"], + ), + "Greek": Language( + name="Greek", + iso_code="el", + use_ascii=False, + charsets=["ISO-8859-7", "WINDOWS-1253"], + alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ", + wiki_start_pages=["Πύλη:Κύρια"], + ), + "English": Language( + name="English", + iso_code="en", + use_ascii=True, + charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"], + wiki_start_pages=["Main_Page"], + ), + "Esperanto": Language( + name="Esperanto", + iso_code="eo", + # Q, W, X, and Y not used at all + use_ascii=False, + charsets=["ISO-8859-3"], + alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ", + wiki_start_pages=["Vikipedio:Ĉefpaĝo"], + ), + "Spanish": Language( + name="Spanish", + iso_code="es", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], + alphabet="ñáéíóúüÑÁÉÍÓÚÜ", + wiki_start_pages=["Wikipedia:Portada"], + ), + "Estonian": Language( + name="Estonian", + iso_code="et", + use_ascii=False, + charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"], + # C, F, Š, Q, W, X, Y, Z, Ž are only for + # loanwords + alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü", + wiki_start_pages=["Esileht"], + ), + "Finnish": Language( + name="Finnish", + iso_code="fi", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], + alphabet="ÅÄÖŠŽåäöšž", + wiki_start_pages=["Wikipedia:Etusivu"], + ), + "French": Language( + name="French", + iso_code="fr", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], + alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ", + wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"], + ), + "Hebrew": Language( + name="Hebrew", + iso_code="he", + use_ascii=False, + charsets=["ISO-8859-8", "WINDOWS-1255"], + alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ", + wiki_start_pages=["עמוד_ראשי"], + ), + "Croatian": Language( + name="Croatian", + iso_code="hr", + # Q, W, X, Y are only used for foreign words. + use_ascii=False, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ", + wiki_start_pages=["Glavna_stranica"], + ), + "Hungarian": Language( + name="Hungarian", + iso_code="hu", + # Q, W, X, Y are only used for foreign words. + use_ascii=False, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ", + wiki_start_pages=["Kezdőlap"], + ), + "Italian": Language( + name="Italian", + iso_code="it", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], + alphabet="ÀÈÉÌÒÓÙàèéìòóù", + wiki_start_pages=["Pagina_principale"], + ), + "Lithuanian": Language( + name="Lithuanian", + iso_code="lt", + use_ascii=False, + charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"], + # Q, W, and X not used at all + alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž", + wiki_start_pages=["Pagrindinis_puslapis"], + ), + "Latvian": Language( + name="Latvian", + iso_code="lv", + use_ascii=False, + charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"], + # Q, W, X, Y are only for loanwords + alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž", + wiki_start_pages=["Sākumlapa"], + ), + "Macedonian": Language( + name="Macedonian", + iso_code="mk", + use_ascii=False, + charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"], + alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш", + wiki_start_pages=["Главна_страница"], + ), + "Dutch": Language( + name="Dutch", + iso_code="nl", + use_ascii=True, + charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"], + wiki_start_pages=["Hoofdpagina"], + ), + "Polish": Language( + name="Polish", + iso_code="pl", + # Q and X are only used for foreign words. + use_ascii=False, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż", + wiki_start_pages=["Wikipedia:Strona_główna"], + ), + "Portuguese": Language( + name="Portuguese", + iso_code="pt", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], + alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú", + wiki_start_pages=["Wikipédia:Página_principal"], + ), + "Romanian": Language( + name="Romanian", + iso_code="ro", + use_ascii=True, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="ăâîșțĂÂÎȘȚ", + wiki_start_pages=["Pagina_principală"], + ), + "Russian": Language( + name="Russian", + iso_code="ru", + use_ascii=False, + charsets=[ + "ISO-8859-5", + "WINDOWS-1251", + "KOI8-R", + "MacCyrillic", + "IBM866", + "IBM855", + ], + alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", + wiki_start_pages=["Заглавная_страница"], + ), + "Slovak": Language( + name="Slovak", + iso_code="sk", + use_ascii=True, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ", + wiki_start_pages=["Hlavná_stránka"], + ), + "Slovene": Language( + name="Slovene", + iso_code="sl", + # Q, W, X, Y are only used for foreign words. + use_ascii=False, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ", + wiki_start_pages=["Glavna_stran"], + ), + # Serbian can be written in both Latin and Cyrillic, but there's no + # simple way to get the Latin alphabet pages from Wikipedia through + # the API, so for now we just support Cyrillic. + "Serbian": Language( + name="Serbian", + iso_code="sr", + alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш", + charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"], + wiki_start_pages=["Главна_страна"], + ), + "Thai": Language( + name="Thai", + iso_code="th", + use_ascii=False, + charsets=["ISO-8859-11", "TIS-620", "CP874"], + alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛", + wiki_start_pages=["หน้าหลัก"], + ), + "Turkish": Language( + name="Turkish", + iso_code="tr", + # Q, W, and X are not used by Turkish + use_ascii=False, + charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"], + alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ", + wiki_start_pages=["Ana_Sayfa"], + ), + "Vietnamese": Language( + name="Vietnamese", + iso_code="vi", + use_ascii=False, + # Windows-1258 is the only common 8-bit + # Vietnamese encoding supported by Python. + # From Wikipedia: + # For systems that lack support for Unicode, + # dozens of 8-bit Vietnamese code pages are + # available.[1] The most common are VISCII + # (TCVN 5712:1993), VPS, and Windows-1258.[3] + # Where ASCII is required, such as when + # ensuring readability in plain text e-mail, + # Vietnamese letters are often encoded + # according to Vietnamese Quoted-Readable + # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4] + # though usage of either variable-width + # scheme has declined dramatically following + # the adoption of Unicode on the World Wide + # Web. + charsets=["WINDOWS-1258"], + alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY", + wiki_start_pages=["Chữ_Quốc_ngữ"], + ), +} diff --git a/lib/chardet/resultdict.py b/lib/chardet/resultdict.py new file mode 100644 index 00000000..f028ba0b --- /dev/null +++ b/lib/chardet/resultdict.py @@ -0,0 +1,16 @@ +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + # TypedDict was introduced in Python 3.8. + # + # TODO: Remove the else block and TYPE_CHECKING check when dropping support + # for Python 3.7. + from typing import TypedDict + + class ResultDict(TypedDict): + encoding: Optional[str] + confidence: float + language: Optional[str] + +else: + ResultDict = dict diff --git a/lib/chardet/sbcharsetprober.py b/lib/chardet/sbcharsetprober.py index 186c0da1..864e0ee9 100644 --- a/lib/chardet/sbcharsetprober.py +++ b/lib/chardet/sbcharsetprober.py @@ -26,70 +26,77 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from collections import namedtuple +from typing import Dict, List, NamedTuple, Optional, Union from .charsetprober import CharSetProber from .enums import CharacterCategory, ProbingState, SequenceLikelihood -SingleByteCharSetModel = namedtuple('SingleByteCharSetModel', - ['charset_name', - 'language', - 'char_to_order_map', - 'language_model', - 'typical_positive_ratio', - 'keep_ascii_letters', - 'alphabet']) +class SingleByteCharSetModel(NamedTuple): + charset_name: str + language: str + char_to_order_map: Dict[int, int] + language_model: Dict[int, Dict[int, int]] + typical_positive_ratio: float + keep_ascii_letters: bool + alphabet: str class SingleByteCharSetProber(CharSetProber): SAMPLE_SIZE = 64 - SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2 + SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2 POSITIVE_SHORTCUT_THRESHOLD = 0.95 NEGATIVE_SHORTCUT_THRESHOLD = 0.05 - def __init__(self, model, reversed=False, name_prober=None): - super(SingleByteCharSetProber, self).__init__() + def __init__( + self, + model: SingleByteCharSetModel, + is_reversed: bool = False, + name_prober: Optional[CharSetProber] = None, + ) -> None: + super().__init__() self._model = model # TRUE if we need to reverse every pair in the model lookup - self._reversed = reversed + self._reversed = is_reversed # Optional auxiliary prober for name decision self._name_prober = name_prober - self._last_order = None - self._seq_counters = None - self._total_seqs = None - self._total_char = None - self._freq_char = None + self._last_order = 255 + self._seq_counters: List[int] = [] + self._total_seqs = 0 + self._total_char = 0 + self._control_char = 0 + self._freq_char = 0 self.reset() - def reset(self): - super(SingleByteCharSetProber, self).reset() + def reset(self) -> None: + super().reset() # char order of last character self._last_order = 255 self._seq_counters = [0] * SequenceLikelihood.get_num_categories() self._total_seqs = 0 self._total_char = 0 + self._control_char = 0 # characters that fall in our sampling range self._freq_char = 0 @property - def charset_name(self): + def charset_name(self) -> Optional[str]: if self._name_prober: return self._name_prober.charset_name - else: - return self._model.charset_name + return self._model.charset_name @property - def language(self): + def language(self) -> Optional[str]: if self._name_prober: return self._name_prober.language - else: - return self._model.language + return self._model.language - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: # TODO: Make filter_international_words keep things in self.alphabet if not self._model.keep_ascii_letters: byte_str = self.filter_international_words(byte_str) + else: + byte_str = self.remove_xml_tags(byte_str) if not byte_str: return self.state char_to_order_map = self._model.char_to_order_map @@ -103,9 +110,6 @@ class SingleByteCharSetProber(CharSetProber): # _total_char purposes. if order < CharacterCategory.CONTROL: self._total_char += 1 - # TODO: Follow uchardet's lead and discount confidence for frequent - # control characters. - # See https://github.com/BYVoid/uchardet/commit/55b4f23971db61 if order < self.SAMPLE_SIZE: self._freq_char += 1 if self._last_order < self.SAMPLE_SIZE: @@ -122,23 +126,36 @@ class SingleByteCharSetProber(CharSetProber): if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD: confidence = self.get_confidence() if confidence > self.POSITIVE_SHORTCUT_THRESHOLD: - self.logger.debug('%s confidence = %s, we have a winner', - charset_name, confidence) + self.logger.debug( + "%s confidence = %s, we have a winner", charset_name, confidence + ) self._state = ProbingState.FOUND_IT elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD: - self.logger.debug('%s confidence = %s, below negative ' - 'shortcut threshhold %s', charset_name, - confidence, - self.NEGATIVE_SHORTCUT_THRESHOLD) + self.logger.debug( + "%s confidence = %s, below negative shortcut threshold %s", + charset_name, + confidence, + self.NEGATIVE_SHORTCUT_THRESHOLD, + ) self._state = ProbingState.NOT_ME return self.state - def get_confidence(self): + def get_confidence(self) -> float: r = 0.01 if self._total_seqs > 0: - r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) / - self._total_seqs / self._model.typical_positive_ratio) + r = ( + ( + self._seq_counters[SequenceLikelihood.POSITIVE] + + 0.25 * self._seq_counters[SequenceLikelihood.LIKELY] + ) + / self._total_seqs + / self._model.typical_positive_ratio + ) + # The more control characters (proportionnaly to the size + # of the text), the less confident we become in the current + # charset. + r = r * (self._total_char - self._control_char) / self._total_char r = r * self._freq_char / self._total_char if r >= 1.0: r = 0.99 diff --git a/lib/chardet/sbcsgroupprober.py b/lib/chardet/sbcsgroupprober.py index c6c91d6b..b131978e 100644 --- a/lib/chardet/sbcsgroupprober.py +++ b/lib/chardet/sbcsgroupprober.py @@ -28,33 +28,38 @@ from .charsetgroupprober import CharSetGroupProber from .hebrewprober import HebrewProber -from .langbulgarianmodel import (ISO_8859_5_BULGARIAN_MODEL, - WINDOWS_1251_BULGARIAN_MODEL) +from .langbulgarianmodel import ISO_8859_5_BULGARIAN_MODEL, WINDOWS_1251_BULGARIAN_MODEL from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL + # from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL, # WINDOWS_1250_HUNGARIAN_MODEL) -from .langrussianmodel import (IBM855_RUSSIAN_MODEL, IBM866_RUSSIAN_MODEL, - ISO_8859_5_RUSSIAN_MODEL, KOI8_R_RUSSIAN_MODEL, - MACCYRILLIC_RUSSIAN_MODEL, - WINDOWS_1251_RUSSIAN_MODEL) +from .langrussianmodel import ( + IBM855_RUSSIAN_MODEL, + IBM866_RUSSIAN_MODEL, + ISO_8859_5_RUSSIAN_MODEL, + KOI8_R_RUSSIAN_MODEL, + MACCYRILLIC_RUSSIAN_MODEL, + WINDOWS_1251_RUSSIAN_MODEL, +) from .langthaimodel import TIS_620_THAI_MODEL from .langturkishmodel import ISO_8859_9_TURKISH_MODEL from .sbcharsetprober import SingleByteCharSetProber class SBCSGroupProber(CharSetGroupProber): - def __init__(self): - super(SBCSGroupProber, self).__init__() + def __init__(self) -> None: + super().__init__() hebrew_prober = HebrewProber() - logical_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL, - False, hebrew_prober) + logical_hebrew_prober = SingleByteCharSetProber( + WINDOWS_1255_HEBREW_MODEL, is_reversed=False, name_prober=hebrew_prober + ) # TODO: See if using ISO-8859-8 Hebrew model works better here, since # it's actually the visual one - visual_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL, - True, hebrew_prober) - hebrew_prober.set_model_probers(logical_hebrew_prober, - visual_hebrew_prober) + visual_hebrew_prober = SingleByteCharSetProber( + WINDOWS_1255_HEBREW_MODEL, is_reversed=True, name_prober=hebrew_prober + ) + hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober) # TODO: ORDER MATTERS HERE. I changed the order vs what was in master # and several tests failed that did not before. Some thought # should be put into the ordering, and we should consider making diff --git a/lib/chardet/sjisprober.py b/lib/chardet/sjisprober.py index 683add02..2fc2864a 100644 --- a/lib/chardet/sjisprober.py +++ b/lib/chardet/sjisprober.py @@ -25,68 +25,81 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from .mbcharsetprober import MultiByteCharSetProber -from .codingstatemachine import CodingStateMachine +from typing import Union + from .chardistribution import SJISDistributionAnalysis +from .codingstatemachine import CodingStateMachine +from .enums import MachineState, ProbingState from .jpcntx import SJISContextAnalysis +from .mbcharsetprober import MultiByteCharSetProber from .mbcssm import SJIS_SM_MODEL -from .enums import ProbingState, MachineState class SJISProber(MultiByteCharSetProber): - def __init__(self): - super(SJISProber, self).__init__() + def __init__(self) -> None: + super().__init__() self.coding_sm = CodingStateMachine(SJIS_SM_MODEL) self.distribution_analyzer = SJISDistributionAnalysis() self.context_analyzer = SJISContextAnalysis() self.reset() - def reset(self): - super(SJISProber, self).reset() + def reset(self) -> None: + super().reset() self.context_analyzer.reset() @property - def charset_name(self): + def charset_name(self) -> str: return self.context_analyzer.charset_name @property - def language(self): + def language(self) -> str: return "Japanese" - def feed(self, byte_str): - for i in range(len(byte_str)): - coding_state = self.coding_sm.next_state(byte_str[i]) + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + assert self.coding_sm is not None + assert self.distribution_analyzer is not None + + for i, byte in enumerate(byte_str): + coding_state = self.coding_sm.next_state(byte) if coding_state == MachineState.ERROR: - self.logger.debug('%s %s prober hit error at byte %s', - self.charset_name, self.language, i) + self.logger.debug( + "%s %s prober hit error at byte %s", + self.charset_name, + self.language, + i, + ) self._state = ProbingState.NOT_ME break - elif coding_state == MachineState.ITS_ME: + if coding_state == MachineState.ITS_ME: self._state = ProbingState.FOUND_IT break - elif coding_state == MachineState.START: + if coding_state == MachineState.START: char_len = self.coding_sm.get_current_charlen() if i == 0: - self._last_char[1] = byte_str[0] - self.context_analyzer.feed(self._last_char[2 - char_len:], - char_len) + self._last_char[1] = byte + self.context_analyzer.feed( + self._last_char[2 - char_len :], char_len + ) self.distribution_analyzer.feed(self._last_char, char_len) else: - self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3 - - char_len], char_len) - self.distribution_analyzer.feed(byte_str[i - 1:i + 1], - char_len) + self.context_analyzer.feed( + byte_str[i + 1 - char_len : i + 3 - char_len], char_len + ) + self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len) self._last_char[0] = byte_str[-1] if self.state == ProbingState.DETECTING: - if (self.context_analyzer.got_enough_data() and - (self.get_confidence() > self.SHORTCUT_THRESHOLD)): + if self.context_analyzer.got_enough_data() and ( + self.get_confidence() > self.SHORTCUT_THRESHOLD + ): self._state = ProbingState.FOUND_IT return self.state - def get_confidence(self): + def get_confidence(self) -> float: + assert self.distribution_analyzer is not None + context_conf = self.context_analyzer.get_confidence() distrib_conf = self.distribution_analyzer.get_confidence() return max(context_conf, distrib_conf) diff --git a/lib/chardet/universaldetector.py b/lib/chardet/universaldetector.py index 3f44e6d0..5f5b69dc 100644 --- a/lib/chardet/universaldetector.py +++ b/lib/chardet/universaldetector.py @@ -39,16 +39,21 @@ class a user of ``chardet`` should use. import codecs import logging import re +from typing import List, Optional, Union from .charsetgroupprober import CharSetGroupProber +from .charsetprober import CharSetProber from .enums import InputState, LanguageFilter, ProbingState from .escprober import EscCharSetProber from .latin1prober import Latin1Prober +from .macromanprober import MacRomanProber from .mbcsgroupprober import MBCSGroupProber +from .resultdict import ResultDict from .sbcsgroupprober import SBCSGroupProber +from .utf1632prober import UTF1632Prober -class UniversalDetector(object): +class UniversalDetector: """ The ``UniversalDetector`` class underlies the ``chardet.detect`` function and coordinates all of the different charset probers. @@ -66,49 +71,87 @@ class UniversalDetector(object): """ MINIMUM_THRESHOLD = 0.20 - HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]') - ESC_DETECTOR = re.compile(b'(\033|~{)') - WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]') - ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252', - 'iso-8859-2': 'Windows-1250', - 'iso-8859-5': 'Windows-1251', - 'iso-8859-6': 'Windows-1256', - 'iso-8859-7': 'Windows-1253', - 'iso-8859-8': 'Windows-1255', - 'iso-8859-9': 'Windows-1254', - 'iso-8859-13': 'Windows-1257'} + HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xFF]") + ESC_DETECTOR = re.compile(b"(\033|~{)") + WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9F]") + ISO_WIN_MAP = { + "iso-8859-1": "Windows-1252", + "iso-8859-2": "Windows-1250", + "iso-8859-5": "Windows-1251", + "iso-8859-6": "Windows-1256", + "iso-8859-7": "Windows-1253", + "iso-8859-8": "Windows-1255", + "iso-8859-9": "Windows-1254", + "iso-8859-13": "Windows-1257", + } + # Based on https://encoding.spec.whatwg.org/#names-and-labels + # but altered to match Python names for encodings and remove mappings + # that break tests. + LEGACY_MAP = { + "ascii": "Windows-1252", + "iso-8859-1": "Windows-1252", + "tis-620": "ISO-8859-11", + "iso-8859-9": "Windows-1254", + "gb2312": "GB18030", + "euc-kr": "CP949", + "utf-16le": "UTF-16", + } - def __init__(self, lang_filter=LanguageFilter.ALL): - self._esc_charset_prober = None - self._charset_probers = [] - self.result = None - self.done = None - self._got_data = None - self._input_state = None - self._last_char = None + def __init__( + self, + lang_filter: LanguageFilter = LanguageFilter.ALL, + should_rename_legacy: bool = False, + ) -> None: + self._esc_charset_prober: Optional[EscCharSetProber] = None + self._utf1632_prober: Optional[UTF1632Prober] = None + self._charset_probers: List[CharSetProber] = [] + self.result: ResultDict = { + "encoding": None, + "confidence": 0.0, + "language": None, + } + self.done = False + self._got_data = False + self._input_state = InputState.PURE_ASCII + self._last_char = b"" self.lang_filter = lang_filter self.logger = logging.getLogger(__name__) - self._has_win_bytes = None + self._has_win_bytes = False + self.should_rename_legacy = should_rename_legacy self.reset() - def reset(self): + @property + def input_state(self) -> int: + return self._input_state + + @property + def has_win_bytes(self) -> bool: + return self._has_win_bytes + + @property + def charset_probers(self) -> List[CharSetProber]: + return self._charset_probers + + def reset(self) -> None: """ Reset the UniversalDetector and all of its probers back to their initial states. This is called by ``__init__``, so you only need to call this directly in between analyses of different documents. """ - self.result = {'encoding': None, 'confidence': 0.0, 'language': None} + self.result = {"encoding": None, "confidence": 0.0, "language": None} self.done = False self._got_data = False self._has_win_bytes = False self._input_state = InputState.PURE_ASCII - self._last_char = b'' + self._last_char = b"" if self._esc_charset_prober: self._esc_charset_prober.reset() + if self._utf1632_prober: + self._utf1632_prober.reset() for prober in self._charset_probers: prober.reset() - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> None: """ Takes a chunk of a document and feeds it through all of the relevant charset probers. @@ -125,7 +168,7 @@ class UniversalDetector(object): if self.done: return - if not len(byte_str): + if not byte_str: return if not isinstance(byte_str, bytearray): @@ -136,35 +179,38 @@ class UniversalDetector(object): # If the data starts with BOM, we know it is UTF if byte_str.startswith(codecs.BOM_UTF8): # EF BB BF UTF-8 with BOM - self.result = {'encoding': "UTF-8-SIG", - 'confidence': 1.0, - 'language': ''} - elif byte_str.startswith((codecs.BOM_UTF32_LE, - codecs.BOM_UTF32_BE)): + self.result = { + "encoding": "UTF-8-SIG", + "confidence": 1.0, + "language": "", + } + elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)): # FF FE 00 00 UTF-32, little-endian BOM # 00 00 FE FF UTF-32, big-endian BOM - self.result = {'encoding': "UTF-32", - 'confidence': 1.0, - 'language': ''} - elif byte_str.startswith(b'\xFE\xFF\x00\x00'): + self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""} + elif byte_str.startswith(b"\xFE\xFF\x00\x00"): # FE FF 00 00 UCS-4, unusual octet order BOM (3412) - self.result = {'encoding': "X-ISO-10646-UCS-4-3412", - 'confidence': 1.0, - 'language': ''} - elif byte_str.startswith(b'\x00\x00\xFF\xFE'): + self.result = { + # TODO: This encoding is not supported by Python. Should remove? + "encoding": "X-ISO-10646-UCS-4-3412", + "confidence": 1.0, + "language": "", + } + elif byte_str.startswith(b"\x00\x00\xFF\xFE"): # 00 00 FF FE UCS-4, unusual octet order BOM (2143) - self.result = {'encoding': "X-ISO-10646-UCS-4-2143", - 'confidence': 1.0, - 'language': ''} + self.result = { + # TODO: This encoding is not supported by Python. Should remove? + "encoding": "X-ISO-10646-UCS-4-2143", + "confidence": 1.0, + "language": "", + } elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)): # FF FE UTF-16, little endian BOM # FE FF UTF-16, big endian BOM - self.result = {'encoding': "UTF-16", - 'confidence': 1.0, - 'language': ''} + self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""} self._got_data = True - if self.result['encoding'] is not None: + if self.result["encoding"] is not None: self.done = True return @@ -173,12 +219,29 @@ class UniversalDetector(object): if self._input_state == InputState.PURE_ASCII: if self.HIGH_BYTE_DETECTOR.search(byte_str): self._input_state = InputState.HIGH_BYTE - elif self._input_state == InputState.PURE_ASCII and \ - self.ESC_DETECTOR.search(self._last_char + byte_str): + elif ( + self._input_state == InputState.PURE_ASCII + and self.ESC_DETECTOR.search(self._last_char + byte_str) + ): self._input_state = InputState.ESC_ASCII self._last_char = byte_str[-1:] + # next we will look to see if it is appears to be either a UTF-16 or + # UTF-32 encoding + if not self._utf1632_prober: + self._utf1632_prober = UTF1632Prober() + + if self._utf1632_prober.state == ProbingState.DETECTING: + if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT: + self.result = { + "encoding": self._utf1632_prober.charset_name, + "confidence": self._utf1632_prober.get_confidence(), + "language": "", + } + self.done = True + return + # If we've seen escape sequences, use the EscCharSetProber, which # uses a simple state machine to check for known escape sequences in # HZ and ISO-2022 encodings, since those are the only encodings that @@ -187,12 +250,11 @@ class UniversalDetector(object): if not self._esc_charset_prober: self._esc_charset_prober = EscCharSetProber(self.lang_filter) if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT: - self.result = {'encoding': - self._esc_charset_prober.charset_name, - 'confidence': - self._esc_charset_prober.get_confidence(), - 'language': - self._esc_charset_prober.language} + self.result = { + "encoding": self._esc_charset_prober.charset_name, + "confidence": self._esc_charset_prober.get_confidence(), + "language": self._esc_charset_prober.language, + } self.done = True # If we've seen high bytes (i.e., those with values greater than 127), # we need to do more complicated checks using all our multi-byte and @@ -207,17 +269,20 @@ class UniversalDetector(object): if self.lang_filter & LanguageFilter.NON_CJK: self._charset_probers.append(SBCSGroupProber()) self._charset_probers.append(Latin1Prober()) + self._charset_probers.append(MacRomanProber()) for prober in self._charset_probers: if prober.feed(byte_str) == ProbingState.FOUND_IT: - self.result = {'encoding': prober.charset_name, - 'confidence': prober.get_confidence(), - 'language': prober.language} + self.result = { + "encoding": prober.charset_name, + "confidence": prober.get_confidence(), + "language": prober.language, + } self.done = True break if self.WIN_BYTE_DETECTOR.search(byte_str): self._has_win_bytes = True - def close(self): + def close(self) -> ResultDict: """ Stop analyzing the current document and come up with a final prediction. @@ -231,13 +296,11 @@ class UniversalDetector(object): self.done = True if not self._got_data: - self.logger.debug('no data received!') + self.logger.debug("no data received!") # Default to ASCII if it is all we've seen so far elif self._input_state == InputState.PURE_ASCII: - self.result = {'encoding': 'ascii', - 'confidence': 1.0, - 'language': ''} + self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""} # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD elif self._input_state == InputState.HIGH_BYTE: @@ -253,34 +316,47 @@ class UniversalDetector(object): max_prober = prober if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD): charset_name = max_prober.charset_name - lower_charset_name = max_prober.charset_name.lower() + assert charset_name is not None + lower_charset_name = charset_name.lower() confidence = max_prober.get_confidence() # Use Windows encoding name instead of ISO-8859 if we saw any # extra Windows-specific bytes - if lower_charset_name.startswith('iso-8859'): + if lower_charset_name.startswith("iso-8859"): if self._has_win_bytes: - charset_name = self.ISO_WIN_MAP.get(lower_charset_name, - charset_name) - self.result = {'encoding': charset_name, - 'confidence': confidence, - 'language': max_prober.language} + charset_name = self.ISO_WIN_MAP.get( + lower_charset_name, charset_name + ) + # Rename legacy encodings with superset encodings if asked + if self.should_rename_legacy: + charset_name = self.LEGACY_MAP.get( + (charset_name or "").lower(), charset_name + ) + self.result = { + "encoding": charset_name, + "confidence": confidence, + "language": max_prober.language, + } # Log all prober confidences if none met MINIMUM_THRESHOLD if self.logger.getEffectiveLevel() <= logging.DEBUG: - if self.result['encoding'] is None: - self.logger.debug('no probers hit minimum threshold') + if self.result["encoding"] is None: + self.logger.debug("no probers hit minimum threshold") for group_prober in self._charset_probers: if not group_prober: continue if isinstance(group_prober, CharSetGroupProber): for prober in group_prober.probers: - self.logger.debug('%s %s confidence = %s', - prober.charset_name, - prober.language, - prober.get_confidence()) + self.logger.debug( + "%s %s confidence = %s", + prober.charset_name, + prober.language, + prober.get_confidence(), + ) else: - self.logger.debug('%s %s confidence = %s', - prober.charset_name, - prober.language, - prober.get_confidence()) + self.logger.debug( + "%s %s confidence = %s", + group_prober.charset_name, + group_prober.language, + group_prober.get_confidence(), + ) return self.result diff --git a/lib/chardet/utf1632prober.py b/lib/chardet/utf1632prober.py new file mode 100644 index 00000000..0a624479 --- /dev/null +++ b/lib/chardet/utf1632prober.py @@ -0,0 +1,225 @@ +######################## BEGIN LICENSE BLOCK ######################## +# +# Contributor(s): +# Jason Zavaglia +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA +######################### END LICENSE BLOCK ######################### +from typing import List, Union + +from .charsetprober import CharSetProber +from .enums import ProbingState + + +class UTF1632Prober(CharSetProber): + """ + This class simply looks for occurrences of zero bytes, and infers + whether the file is UTF16 or UTF32 (low-endian or big-endian) + For instance, files looking like ( \0 \0 \0 [nonzero] )+ + have a good probability to be UTF32BE. Files looking like ( \0 [nonzero] )+ + may be guessed to be UTF16BE, and inversely for little-endian varieties. + """ + + # how many logical characters to scan before feeling confident of prediction + MIN_CHARS_FOR_DETECTION = 20 + # a fixed constant ratio of expected zeros or non-zeros in modulo-position. + EXPECTED_RATIO = 0.94 + + def __init__(self) -> None: + super().__init__() + self.position = 0 + self.zeros_at_mod = [0] * 4 + self.nonzeros_at_mod = [0] * 4 + self._state = ProbingState.DETECTING + self.quad = [0, 0, 0, 0] + self.invalid_utf16be = False + self.invalid_utf16le = False + self.invalid_utf32be = False + self.invalid_utf32le = False + self.first_half_surrogate_pair_detected_16be = False + self.first_half_surrogate_pair_detected_16le = False + self.reset() + + def reset(self) -> None: + super().reset() + self.position = 0 + self.zeros_at_mod = [0] * 4 + self.nonzeros_at_mod = [0] * 4 + self._state = ProbingState.DETECTING + self.invalid_utf16be = False + self.invalid_utf16le = False + self.invalid_utf32be = False + self.invalid_utf32le = False + self.first_half_surrogate_pair_detected_16be = False + self.first_half_surrogate_pair_detected_16le = False + self.quad = [0, 0, 0, 0] + + @property + def charset_name(self) -> str: + if self.is_likely_utf32be(): + return "utf-32be" + if self.is_likely_utf32le(): + return "utf-32le" + if self.is_likely_utf16be(): + return "utf-16be" + if self.is_likely_utf16le(): + return "utf-16le" + # default to something valid + return "utf-16" + + @property + def language(self) -> str: + return "" + + def approx_32bit_chars(self) -> float: + return max(1.0, self.position / 4.0) + + def approx_16bit_chars(self) -> float: + return max(1.0, self.position / 2.0) + + def is_likely_utf32be(self) -> bool: + approx_chars = self.approx_32bit_chars() + return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( + self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO + and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO + and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO + and self.nonzeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO + and not self.invalid_utf32be + ) + + def is_likely_utf32le(self) -> bool: + approx_chars = self.approx_32bit_chars() + return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( + self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO + and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO + and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO + and self.zeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO + and not self.invalid_utf32le + ) + + def is_likely_utf16be(self) -> bool: + approx_chars = self.approx_16bit_chars() + return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( + (self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars + > self.EXPECTED_RATIO + and (self.zeros_at_mod[0] + self.zeros_at_mod[2]) / approx_chars + > self.EXPECTED_RATIO + and not self.invalid_utf16be + ) + + def is_likely_utf16le(self) -> bool: + approx_chars = self.approx_16bit_chars() + return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( + (self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars + > self.EXPECTED_RATIO + and (self.zeros_at_mod[1] + self.zeros_at_mod[3]) / approx_chars + > self.EXPECTED_RATIO + and not self.invalid_utf16le + ) + + def validate_utf32_characters(self, quad: List[int]) -> None: + """ + Validate if the quad of bytes is valid UTF-32. + + UTF-32 is valid in the range 0x00000000 - 0x0010FFFF + excluding 0x0000D800 - 0x0000DFFF + + https://en.wikipedia.org/wiki/UTF-32 + """ + if ( + quad[0] != 0 + or quad[1] > 0x10 + or (quad[0] == 0 and quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF) + ): + self.invalid_utf32be = True + if ( + quad[3] != 0 + or quad[2] > 0x10 + or (quad[3] == 0 and quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF) + ): + self.invalid_utf32le = True + + def validate_utf16_characters(self, pair: List[int]) -> None: + """ + Validate if the pair of bytes is valid UTF-16. + + UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF + with an exception for surrogate pairs, which must be in the range + 0xD800-0xDBFF followed by 0xDC00-0xDFFF + + https://en.wikipedia.org/wiki/UTF-16 + """ + if not self.first_half_surrogate_pair_detected_16be: + if 0xD8 <= pair[0] <= 0xDB: + self.first_half_surrogate_pair_detected_16be = True + elif 0xDC <= pair[0] <= 0xDF: + self.invalid_utf16be = True + else: + if 0xDC <= pair[0] <= 0xDF: + self.first_half_surrogate_pair_detected_16be = False + else: + self.invalid_utf16be = True + + if not self.first_half_surrogate_pair_detected_16le: + if 0xD8 <= pair[1] <= 0xDB: + self.first_half_surrogate_pair_detected_16le = True + elif 0xDC <= pair[1] <= 0xDF: + self.invalid_utf16le = True + else: + if 0xDC <= pair[1] <= 0xDF: + self.first_half_surrogate_pair_detected_16le = False + else: + self.invalid_utf16le = True + + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + for c in byte_str: + mod4 = self.position % 4 + self.quad[mod4] = c + if mod4 == 3: + self.validate_utf32_characters(self.quad) + self.validate_utf16_characters(self.quad[0:2]) + self.validate_utf16_characters(self.quad[2:4]) + if c == 0: + self.zeros_at_mod[mod4] += 1 + else: + self.nonzeros_at_mod[mod4] += 1 + self.position += 1 + return self.state + + @property + def state(self) -> ProbingState: + if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}: + # terminal, decided states + return self._state + if self.get_confidence() > 0.80: + self._state = ProbingState.FOUND_IT + elif self.position > 4 * 1024: + # if we get to 4kb into the file, and we can't conclude it's UTF, + # let's give up + self._state = ProbingState.NOT_ME + return self._state + + def get_confidence(self) -> float: + return ( + 0.85 + if ( + self.is_likely_utf16le() + or self.is_likely_utf16be() + or self.is_likely_utf32le() + or self.is_likely_utf32be() + ) + else 0.00 + ) diff --git a/lib/chardet/utf8prober.py b/lib/chardet/utf8prober.py index 45732679..cf299f59 100644 --- a/lib/chardet/utf8prober.py +++ b/lib/chardet/utf8prober.py @@ -25,45 +25,46 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from .charsetprober import CharSetProber -from .enums import ProbingState, MachineState -from .codingstatemachine import CodingStateMachine -from .mbcssm import UTF8_SM_MODEL +from typing import Union +from .charsetprober import CharSetProber +from .codingstatemachine import CodingStateMachine +from .enums import MachineState, ProbingState +from .mbcssm import UTF8_SM_MODEL class UTF8Prober(CharSetProber): ONE_CHAR_PROB = 0.5 - def __init__(self): - super(UTF8Prober, self).__init__() + def __init__(self) -> None: + super().__init__() self.coding_sm = CodingStateMachine(UTF8_SM_MODEL) - self._num_mb_chars = None + self._num_mb_chars = 0 self.reset() - def reset(self): - super(UTF8Prober, self).reset() + def reset(self) -> None: + super().reset() self.coding_sm.reset() self._num_mb_chars = 0 @property - def charset_name(self): + def charset_name(self) -> str: return "utf-8" @property - def language(self): + def language(self) -> str: return "" - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: for c in byte_str: coding_state = self.coding_sm.next_state(c) if coding_state == MachineState.ERROR: self._state = ProbingState.NOT_ME break - elif coding_state == MachineState.ITS_ME: + if coding_state == MachineState.ITS_ME: self._state = ProbingState.FOUND_IT break - elif coding_state == MachineState.START: + if coding_state == MachineState.START: if self.coding_sm.get_current_charlen() >= 2: self._num_mb_chars += 1 @@ -73,10 +74,9 @@ class UTF8Prober(CharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: unlike = 0.99 if self._num_mb_chars < 6: - unlike *= self.ONE_CHAR_PROB ** self._num_mb_chars + unlike *= self.ONE_CHAR_PROB**self._num_mb_chars return 1.0 - unlike - else: - return unlike + return unlike diff --git a/lib/chardet/version.py b/lib/chardet/version.py index 73283cdf..6db882ed 100644 --- a/lib/chardet/version.py +++ b/lib/chardet/version.py @@ -1,9 +1,9 @@ """ This module exists only to simplify retrieving the version number of chardet -from within setup.py and from chardet subpackages. +from within setuptools and from chardet subpackages. :author: Dan Blanchard (dan.blanchard@gmail.com) """ -__version__ = "4.0.0" -VERSION = __version__.split('.') +__version__ = "5.1.0" +VERSION = __version__.split(".")