mirror of
https://github.com/SickGear/SickGear.git
synced 2024-12-19 09:13:37 +00:00
Merge branch 'feature/UpdateChardet' into dev
This commit is contained in:
commit
eacfd57a85
49 changed files with 9067 additions and 5845 deletions
|
@ -6,6 +6,7 @@
|
||||||
* Remove lockfile no longer used by cachecontrol
|
* Remove lockfile no longer used by cachecontrol
|
||||||
* Update Msgpack 1.0.0 (fa7d744) to 1.0.4 (b5acfd5)
|
* Update Msgpack 1.0.0 (fa7d744) to 1.0.4 (b5acfd5)
|
||||||
* Update certifi 2022.09.24 to 2022.12.07
|
* Update certifi 2022.09.24 to 2022.12.07
|
||||||
|
* Update chardet packages 4.0.0 (b3d867a) to 5.1.0 (8087f00)
|
||||||
* Update diskcache 5.1.0 (40ce0de) to 5.4.0 (1cb1425)
|
* Update diskcache 5.1.0 (40ce0de) to 5.4.0 (1cb1425)
|
||||||
* Update feedparser 6.0.1 (98d189fa) to 6.0.10 (5fcb3ae)
|
* Update feedparser 6.0.1 (98d189fa) to 6.0.10 (5fcb3ae)
|
||||||
* Update humanize 3.5.0 (b6b0ea5) to 4.0.0 (a1514eb)
|
* Update humanize 3.5.0 (b6b0ea5) to 4.0.0 (a1514eb)
|
||||||
|
|
|
@ -15,68 +15,101 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
from .universaldetector import UniversalDetector
|
from .charsetgroupprober import CharSetGroupProber
|
||||||
|
from .charsetprober import CharSetProber
|
||||||
from .enums import InputState
|
from .enums import InputState
|
||||||
from .version import __version__, VERSION
|
from .resultdict import ResultDict
|
||||||
|
from .universaldetector import UniversalDetector
|
||||||
|
from .version import VERSION, __version__
|
||||||
|
|
||||||
|
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']
|
def detect(
|
||||||
|
byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
|
||||||
|
) -> ResultDict:
|
||||||
def detect(byte_str):
|
|
||||||
"""
|
"""
|
||||||
Detect the encoding of the given byte string.
|
Detect the encoding of the given byte string.
|
||||||
|
|
||||||
:param byte_str: The byte sequence to examine.
|
:param byte_str: The byte sequence to examine.
|
||||||
:type byte_str: ``bytes`` or ``bytearray``
|
:type byte_str: ``bytes`` or ``bytearray``
|
||||||
|
:param should_rename_legacy: Should we rename legacy encodings
|
||||||
|
to their more modern equivalents?
|
||||||
|
:type should_rename_legacy: ``bool``
|
||||||
"""
|
"""
|
||||||
if not isinstance(byte_str, bytearray):
|
if not isinstance(byte_str, bytearray):
|
||||||
if not isinstance(byte_str, bytes):
|
if not isinstance(byte_str, bytes):
|
||||||
raise TypeError('Expected object of type bytes or bytearray, got: '
|
raise TypeError(
|
||||||
'{0}'.format(type(byte_str)))
|
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
|
||||||
else:
|
)
|
||||||
byte_str = bytearray(byte_str)
|
byte_str = bytearray(byte_str)
|
||||||
detector = UniversalDetector()
|
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
|
||||||
detector.feed(byte_str)
|
detector.feed(byte_str)
|
||||||
return detector.close()
|
return detector.close()
|
||||||
|
|
||||||
|
|
||||||
def detect_all(byte_str):
|
def detect_all(
|
||||||
|
byte_str: Union[bytes, bytearray],
|
||||||
|
ignore_threshold: bool = False,
|
||||||
|
should_rename_legacy: bool = False,
|
||||||
|
) -> List[ResultDict]:
|
||||||
"""
|
"""
|
||||||
Detect all the possible encodings of the given byte string.
|
Detect all the possible encodings of the given byte string.
|
||||||
|
|
||||||
:param byte_str: The byte sequence to examine.
|
:param byte_str: The byte sequence to examine.
|
||||||
:type byte_str: ``bytes`` or ``bytearray``
|
:type byte_str: ``bytes`` or ``bytearray``
|
||||||
|
:param ignore_threshold: Include encodings that are below
|
||||||
|
``UniversalDetector.MINIMUM_THRESHOLD``
|
||||||
|
in results.
|
||||||
|
:type ignore_threshold: ``bool``
|
||||||
|
:param should_rename_legacy: Should we rename legacy encodings
|
||||||
|
to their more modern equivalents?
|
||||||
|
:type should_rename_legacy: ``bool``
|
||||||
"""
|
"""
|
||||||
if not isinstance(byte_str, bytearray):
|
if not isinstance(byte_str, bytearray):
|
||||||
if not isinstance(byte_str, bytes):
|
if not isinstance(byte_str, bytes):
|
||||||
raise TypeError('Expected object of type bytes or bytearray, got: '
|
raise TypeError(
|
||||||
'{0}'.format(type(byte_str)))
|
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
|
||||||
else:
|
)
|
||||||
byte_str = bytearray(byte_str)
|
byte_str = bytearray(byte_str)
|
||||||
|
|
||||||
detector = UniversalDetector()
|
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
|
||||||
detector.feed(byte_str)
|
detector.feed(byte_str)
|
||||||
detector.close()
|
detector.close()
|
||||||
|
|
||||||
if detector._input_state == InputState.HIGH_BYTE:
|
if detector.input_state == InputState.HIGH_BYTE:
|
||||||
results = []
|
results: List[ResultDict] = []
|
||||||
for prober in detector._charset_probers:
|
probers: List[CharSetProber] = []
|
||||||
if prober.get_confidence() > detector.MINIMUM_THRESHOLD:
|
for prober in detector.charset_probers:
|
||||||
charset_name = prober.charset_name
|
if isinstance(prober, CharSetGroupProber):
|
||||||
lower_charset_name = prober.charset_name.lower()
|
probers.extend(p for p in prober.probers)
|
||||||
|
else:
|
||||||
|
probers.append(prober)
|
||||||
|
for prober in probers:
|
||||||
|
if ignore_threshold or prober.get_confidence() > detector.MINIMUM_THRESHOLD:
|
||||||
|
charset_name = prober.charset_name or ""
|
||||||
|
lower_charset_name = charset_name.lower()
|
||||||
# Use Windows encoding name instead of ISO-8859 if we saw any
|
# Use Windows encoding name instead of ISO-8859 if we saw any
|
||||||
# extra Windows-specific bytes
|
# extra Windows-specific bytes
|
||||||
if lower_charset_name.startswith('iso-8859'):
|
if lower_charset_name.startswith("iso-8859") and detector.has_win_bytes:
|
||||||
if detector._has_win_bytes:
|
charset_name = detector.ISO_WIN_MAP.get(
|
||||||
charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
|
lower_charset_name, charset_name
|
||||||
charset_name)
|
)
|
||||||
results.append({
|
# Rename legacy encodings with superset encodings if asked
|
||||||
'encoding': charset_name,
|
if should_rename_legacy:
|
||||||
'confidence': prober.get_confidence()
|
charset_name = detector.LEGACY_MAP.get(
|
||||||
})
|
charset_name.lower(), charset_name
|
||||||
|
)
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
"encoding": charset_name,
|
||||||
|
"confidence": prober.get_confidence(),
|
||||||
|
"language": prober.language,
|
||||||
|
}
|
||||||
|
)
|
||||||
if len(results) > 0:
|
if len(results) > 0:
|
||||||
return sorted(results, key=lambda result: -result['confidence'])
|
return sorted(results, key=lambda result: -result["confidence"])
|
||||||
|
|
||||||
return [detector.result]
|
return [detector.result]
|
||||||
|
|
|
@ -42,9 +42,9 @@
|
||||||
|
|
||||||
BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
||||||
|
|
||||||
#Char to FreqOrder table
|
# Char to FreqOrder table
|
||||||
BIG5_TABLE_SIZE = 5376
|
BIG5_TABLE_SIZE = 5376
|
||||||
|
# fmt: off
|
||||||
BIG5_CHAR_TO_FREQ_ORDER = (
|
BIG5_CHAR_TO_FREQ_ORDER = (
|
||||||
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
|
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
|
||||||
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
|
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
|
||||||
|
@ -383,4 +383,4 @@ BIG5_CHAR_TO_FREQ_ORDER = (
|
||||||
890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, # 5360
|
890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, # 5360
|
||||||
2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, # 5376
|
2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, # 5376
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
|
@ -25,23 +25,23 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .mbcharsetprober import MultiByteCharSetProber
|
|
||||||
from .codingstatemachine import CodingStateMachine
|
|
||||||
from .chardistribution import Big5DistributionAnalysis
|
from .chardistribution import Big5DistributionAnalysis
|
||||||
|
from .codingstatemachine import CodingStateMachine
|
||||||
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
from .mbcssm import BIG5_SM_MODEL
|
from .mbcssm import BIG5_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class Big5Prober(MultiByteCharSetProber):
|
class Big5Prober(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(Big5Prober, self).__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
|
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
|
||||||
self.distribution_analyzer = Big5DistributionAnalysis()
|
self.distribution_analyzer = Big5DistributionAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "Big5"
|
return "Big5"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Chinese"
|
return "Chinese"
|
||||||
|
|
|
@ -25,40 +25,58 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .euctwfreq import (EUCTW_CHAR_TO_FREQ_ORDER, EUCTW_TABLE_SIZE,
|
from typing import Tuple, Union
|
||||||
EUCTW_TYPICAL_DISTRIBUTION_RATIO)
|
|
||||||
from .euckrfreq import (EUCKR_CHAR_TO_FREQ_ORDER, EUCKR_TABLE_SIZE,
|
from .big5freq import (
|
||||||
EUCKR_TYPICAL_DISTRIBUTION_RATIO)
|
BIG5_CHAR_TO_FREQ_ORDER,
|
||||||
from .gb2312freq import (GB2312_CHAR_TO_FREQ_ORDER, GB2312_TABLE_SIZE,
|
BIG5_TABLE_SIZE,
|
||||||
GB2312_TYPICAL_DISTRIBUTION_RATIO)
|
BIG5_TYPICAL_DISTRIBUTION_RATIO,
|
||||||
from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE,
|
)
|
||||||
BIG5_TYPICAL_DISTRIBUTION_RATIO)
|
from .euckrfreq import (
|
||||||
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
|
EUCKR_CHAR_TO_FREQ_ORDER,
|
||||||
JIS_TYPICAL_DISTRIBUTION_RATIO)
|
EUCKR_TABLE_SIZE,
|
||||||
|
EUCKR_TYPICAL_DISTRIBUTION_RATIO,
|
||||||
|
)
|
||||||
|
from .euctwfreq import (
|
||||||
|
EUCTW_CHAR_TO_FREQ_ORDER,
|
||||||
|
EUCTW_TABLE_SIZE,
|
||||||
|
EUCTW_TYPICAL_DISTRIBUTION_RATIO,
|
||||||
|
)
|
||||||
|
from .gb2312freq import (
|
||||||
|
GB2312_CHAR_TO_FREQ_ORDER,
|
||||||
|
GB2312_TABLE_SIZE,
|
||||||
|
GB2312_TYPICAL_DISTRIBUTION_RATIO,
|
||||||
|
)
|
||||||
|
from .jisfreq import (
|
||||||
|
JIS_CHAR_TO_FREQ_ORDER,
|
||||||
|
JIS_TABLE_SIZE,
|
||||||
|
JIS_TYPICAL_DISTRIBUTION_RATIO,
|
||||||
|
)
|
||||||
|
from .johabfreq import JOHAB_TO_EUCKR_ORDER_TABLE
|
||||||
|
|
||||||
|
|
||||||
class CharDistributionAnalysis(object):
|
class CharDistributionAnalysis:
|
||||||
ENOUGH_DATA_THRESHOLD = 1024
|
ENOUGH_DATA_THRESHOLD = 1024
|
||||||
SURE_YES = 0.99
|
SURE_YES = 0.99
|
||||||
SURE_NO = 0.01
|
SURE_NO = 0.01
|
||||||
MINIMUM_DATA_THRESHOLD = 3
|
MINIMUM_DATA_THRESHOLD = 3
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
# Mapping table to get frequency order from char order (get from
|
# Mapping table to get frequency order from char order (get from
|
||||||
# GetOrder())
|
# GetOrder())
|
||||||
self._char_to_freq_order = None
|
self._char_to_freq_order: Tuple[int, ...] = tuple()
|
||||||
self._table_size = None # Size of above table
|
self._table_size = 0 # Size of above table
|
||||||
# This is a constant value which varies from language to language,
|
# This is a constant value which varies from language to language,
|
||||||
# used in calculating confidence. See
|
# used in calculating confidence. See
|
||||||
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
|
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
|
||||||
# for further detail.
|
# for further detail.
|
||||||
self.typical_distribution_ratio = None
|
self.typical_distribution_ratio = 0.0
|
||||||
self._done = None
|
self._done = False
|
||||||
self._total_chars = None
|
self._total_chars = 0
|
||||||
self._freq_chars = None
|
self._freq_chars = 0
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
"""reset analyser, clear any state"""
|
"""reset analyser, clear any state"""
|
||||||
# If this flag is set to True, detection is done and conclusion has
|
# If this flag is set to True, detection is done and conclusion has
|
||||||
# been made
|
# been made
|
||||||
|
@ -67,7 +85,7 @@ class CharDistributionAnalysis(object):
|
||||||
# The number of characters whose frequency order is less than 512
|
# The number of characters whose frequency order is less than 512
|
||||||
self._freq_chars = 0
|
self._freq_chars = 0
|
||||||
|
|
||||||
def feed(self, char, char_len):
|
def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
|
||||||
"""feed a character with known length"""
|
"""feed a character with known length"""
|
||||||
if char_len == 2:
|
if char_len == 2:
|
||||||
# we only care about 2-byte characters in our distribution analysis
|
# we only care about 2-byte characters in our distribution analysis
|
||||||
|
@ -81,7 +99,7 @@ class CharDistributionAnalysis(object):
|
||||||
if 512 > self._char_to_freq_order[order]:
|
if 512 > self._char_to_freq_order[order]:
|
||||||
self._freq_chars += 1
|
self._freq_chars += 1
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
"""return confidence based on existing data"""
|
"""return confidence based on existing data"""
|
||||||
# if we didn't receive any character in our consideration range,
|
# if we didn't receive any character in our consideration range,
|
||||||
# return negative answer
|
# return negative answer
|
||||||
|
@ -89,20 +107,21 @@ class CharDistributionAnalysis(object):
|
||||||
return self.SURE_NO
|
return self.SURE_NO
|
||||||
|
|
||||||
if self._total_chars != self._freq_chars:
|
if self._total_chars != self._freq_chars:
|
||||||
r = (self._freq_chars / ((self._total_chars - self._freq_chars)
|
r = self._freq_chars / (
|
||||||
* self.typical_distribution_ratio))
|
(self._total_chars - self._freq_chars) * self.typical_distribution_ratio
|
||||||
|
)
|
||||||
if r < self.SURE_YES:
|
if r < self.SURE_YES:
|
||||||
return r
|
return r
|
||||||
|
|
||||||
# normalize confidence (we don't want to be 100% sure)
|
# normalize confidence (we don't want to be 100% sure)
|
||||||
return self.SURE_YES
|
return self.SURE_YES
|
||||||
|
|
||||||
def got_enough_data(self):
|
def got_enough_data(self) -> bool:
|
||||||
# It is not necessary to receive all data to draw conclusion.
|
# It is not necessary to receive all data to draw conclusion.
|
||||||
# For charset detection, certain amount of data is enough
|
# For charset detection, certain amount of data is enough
|
||||||
return self._total_chars > self.ENOUGH_DATA_THRESHOLD
|
return self._total_chars > self.ENOUGH_DATA_THRESHOLD
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, _: Union[bytes, bytearray]) -> int:
|
||||||
# We do not handle characters based on the original encoding string,
|
# We do not handle characters based on the original encoding string,
|
||||||
# but convert this encoding string to a number, here called order.
|
# but convert this encoding string to a number, here called order.
|
||||||
# This allows multiple encodings of a language to share one frequency
|
# This allows multiple encodings of a language to share one frequency
|
||||||
|
@ -111,13 +130,13 @@ class CharDistributionAnalysis(object):
|
||||||
|
|
||||||
|
|
||||||
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(EUCTWDistributionAnalysis, self).__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = EUCTW_TABLE_SIZE
|
self._table_size = EUCTW_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
# for euc-TW encoding, we are interested
|
# for euc-TW encoding, we are interested
|
||||||
# first byte range: 0xc4 -- 0xfe
|
# first byte range: 0xc4 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
|
@ -125,18 +144,17 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
||||||
first_char = byte_str[0]
|
first_char = byte_str[0]
|
||||||
if first_char >= 0xC4:
|
if first_char >= 0xC4:
|
||||||
return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
|
return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
|
||||||
else:
|
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
|
||||||
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(EUCKRDistributionAnalysis, self).__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = EUCKR_TABLE_SIZE
|
self._table_size = EUCKR_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
# for euc-KR encoding, we are interested
|
# for euc-KR encoding, we are interested
|
||||||
# first byte range: 0xb0 -- 0xfe
|
# first byte range: 0xb0 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
|
@ -144,18 +162,32 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
||||||
first_char = byte_str[0]
|
first_char = byte_str[0]
|
||||||
if first_char >= 0xB0:
|
if first_char >= 0xB0:
|
||||||
return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
|
return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
|
||||||
else:
|
return -1
|
||||||
|
|
||||||
|
|
||||||
|
class JOHABDistributionAnalysis(CharDistributionAnalysis):
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
|
||||||
|
self._table_size = EUCKR_TABLE_SIZE
|
||||||
|
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
|
first_char = byte_str[0]
|
||||||
|
if 0x88 <= first_char < 0xD4:
|
||||||
|
code = first_char * 256 + byte_str[1]
|
||||||
|
return JOHAB_TO_EUCKR_ORDER_TABLE.get(code, -1)
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
|
||||||
class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(GB2312DistributionAnalysis, self).__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = GB2312_TABLE_SIZE
|
self._table_size = GB2312_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
# for GB2312 encoding, we are interested
|
# for GB2312 encoding, we are interested
|
||||||
# first byte range: 0xb0 -- 0xfe
|
# first byte range: 0xb0 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
|
@ -163,18 +195,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
||||||
first_char, second_char = byte_str[0], byte_str[1]
|
first_char, second_char = byte_str[0], byte_str[1]
|
||||||
if (first_char >= 0xB0) and (second_char >= 0xA1):
|
if (first_char >= 0xB0) and (second_char >= 0xA1):
|
||||||
return 94 * (first_char - 0xB0) + second_char - 0xA1
|
return 94 * (first_char - 0xB0) + second_char - 0xA1
|
||||||
else:
|
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
|
||||||
class Big5DistributionAnalysis(CharDistributionAnalysis):
|
class Big5DistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(Big5DistributionAnalysis, self).__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = BIG5_TABLE_SIZE
|
self._table_size = BIG5_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
# for big5 encoding, we are interested
|
# for big5 encoding, we are interested
|
||||||
# first byte range: 0xa4 -- 0xfe
|
# first byte range: 0xa4 -- 0xfe
|
||||||
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
||||||
|
@ -183,28 +214,26 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
|
||||||
if first_char >= 0xA4:
|
if first_char >= 0xA4:
|
||||||
if second_char >= 0xA1:
|
if second_char >= 0xA1:
|
||||||
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
|
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
|
||||||
else:
|
|
||||||
return 157 * (first_char - 0xA4) + second_char - 0x40
|
return 157 * (first_char - 0xA4) + second_char - 0x40
|
||||||
else:
|
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
|
||||||
class SJISDistributionAnalysis(CharDistributionAnalysis):
|
class SJISDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(SJISDistributionAnalysis, self).__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = JIS_TABLE_SIZE
|
self._table_size = JIS_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
# for sjis encoding, we are interested
|
# for sjis encoding, we are interested
|
||||||
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
||||||
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
|
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
first_char, second_char = byte_str[0], byte_str[1]
|
first_char, second_char = byte_str[0], byte_str[1]
|
||||||
if (first_char >= 0x81) and (first_char <= 0x9F):
|
if 0x81 <= first_char <= 0x9F:
|
||||||
order = 188 * (first_char - 0x81)
|
order = 188 * (first_char - 0x81)
|
||||||
elif (first_char >= 0xE0) and (first_char <= 0xEF):
|
elif 0xE0 <= first_char <= 0xEF:
|
||||||
order = 188 * (first_char - 0xE0 + 31)
|
order = 188 * (first_char - 0xE0 + 31)
|
||||||
else:
|
else:
|
||||||
return -1
|
return -1
|
||||||
|
@ -215,19 +244,18 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
|
||||||
|
|
||||||
|
|
||||||
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(EUCJPDistributionAnalysis, self).__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = JIS_TABLE_SIZE
|
self._table_size = JIS_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
# for euc-JP encoding, we are interested
|
# for euc-JP encoding, we are interested
|
||||||
# first byte range: 0xa0 -- 0xfe
|
# first byte range: 0xa0 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
char = byte_str[0]
|
char = byte_str[0]
|
||||||
if char >= 0xA0:
|
if char >= 0xA0:
|
||||||
return 94 * (char - 0xA1) + byte_str[1] - 0xa1
|
return 94 * (char - 0xA1) + byte_str[1] - 0xA1
|
||||||
else:
|
|
||||||
return -1
|
return -1
|
||||||
|
|
|
@ -25,29 +25,30 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .enums import ProbingState
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
|
from .enums import LanguageFilter, ProbingState
|
||||||
|
|
||||||
|
|
||||||
class CharSetGroupProber(CharSetProber):
|
class CharSetGroupProber(CharSetProber):
|
||||||
def __init__(self, lang_filter=None):
|
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||||
super(CharSetGroupProber, self).__init__(lang_filter=lang_filter)
|
super().__init__(lang_filter=lang_filter)
|
||||||
self._active_num = 0
|
self._active_num = 0
|
||||||
self.probers = []
|
self.probers: List[CharSetProber] = []
|
||||||
self._best_guess_prober = None
|
self._best_guess_prober: Optional[CharSetProber] = None
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super(CharSetGroupProber, self).reset()
|
super().reset()
|
||||||
self._active_num = 0
|
self._active_num = 0
|
||||||
for prober in self.probers:
|
for prober in self.probers:
|
||||||
if prober:
|
|
||||||
prober.reset()
|
prober.reset()
|
||||||
prober.active = True
|
prober.active = True
|
||||||
self._active_num += 1
|
self._active_num += 1
|
||||||
self._best_guess_prober = None
|
self._best_guess_prober = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> Optional[str]:
|
||||||
if not self._best_guess_prober:
|
if not self._best_guess_prober:
|
||||||
self.get_confidence()
|
self.get_confidence()
|
||||||
if not self._best_guess_prober:
|
if not self._best_guess_prober:
|
||||||
|
@ -55,17 +56,15 @@ class CharSetGroupProber(CharSetProber):
|
||||||
return self._best_guess_prober.charset_name
|
return self._best_guess_prober.charset_name
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> Optional[str]:
|
||||||
if not self._best_guess_prober:
|
if not self._best_guess_prober:
|
||||||
self.get_confidence()
|
self.get_confidence()
|
||||||
if not self._best_guess_prober:
|
if not self._best_guess_prober:
|
||||||
return None
|
return None
|
||||||
return self._best_guess_prober.language
|
return self._best_guess_prober.language
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
for prober in self.probers:
|
for prober in self.probers:
|
||||||
if not prober:
|
|
||||||
continue
|
|
||||||
if not prober.active:
|
if not prober.active:
|
||||||
continue
|
continue
|
||||||
state = prober.feed(byte_str)
|
state = prober.feed(byte_str)
|
||||||
|
@ -73,8 +72,9 @@ class CharSetGroupProber(CharSetProber):
|
||||||
continue
|
continue
|
||||||
if state == ProbingState.FOUND_IT:
|
if state == ProbingState.FOUND_IT:
|
||||||
self._best_guess_prober = prober
|
self._best_guess_prober = prober
|
||||||
|
self._state = ProbingState.FOUND_IT
|
||||||
return self.state
|
return self.state
|
||||||
elif state == ProbingState.NOT_ME:
|
if state == ProbingState.NOT_ME:
|
||||||
prober.active = False
|
prober.active = False
|
||||||
self._active_num -= 1
|
self._active_num -= 1
|
||||||
if self._active_num <= 0:
|
if self._active_num <= 0:
|
||||||
|
@ -82,22 +82,22 @@ class CharSetGroupProber(CharSetProber):
|
||||||
return self.state
|
return self.state
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
state = self.state
|
state = self.state
|
||||||
if state == ProbingState.FOUND_IT:
|
if state == ProbingState.FOUND_IT:
|
||||||
return 0.99
|
return 0.99
|
||||||
elif state == ProbingState.NOT_ME:
|
if state == ProbingState.NOT_ME:
|
||||||
return 0.01
|
return 0.01
|
||||||
best_conf = 0.0
|
best_conf = 0.0
|
||||||
self._best_guess_prober = None
|
self._best_guess_prober = None
|
||||||
for prober in self.probers:
|
for prober in self.probers:
|
||||||
if not prober:
|
|
||||||
continue
|
|
||||||
if not prober.active:
|
if not prober.active:
|
||||||
self.logger.debug('%s not active', prober.charset_name)
|
self.logger.debug("%s not active", prober.charset_name)
|
||||||
continue
|
continue
|
||||||
conf = prober.get_confidence()
|
conf = prober.get_confidence()
|
||||||
self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
|
self.logger.debug(
|
||||||
|
"%s %s confidence = %s", prober.charset_name, prober.language, conf
|
||||||
|
)
|
||||||
if best_conf < conf:
|
if best_conf < conf:
|
||||||
best_conf = conf
|
best_conf = conf
|
||||||
self._best_guess_prober = prober
|
self._best_guess_prober = prober
|
||||||
|
|
|
@ -28,54 +28,62 @@
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
from .enums import ProbingState
|
from .enums import LanguageFilter, ProbingState
|
||||||
|
|
||||||
|
INTERNATIONAL_WORDS_PATTERN = re.compile(
|
||||||
|
b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class CharSetProber(object):
|
class CharSetProber:
|
||||||
|
|
||||||
SHORTCUT_THRESHOLD = 0.95
|
SHORTCUT_THRESHOLD = 0.95
|
||||||
|
|
||||||
def __init__(self, lang_filter=None):
|
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||||
self._state = None
|
self._state = ProbingState.DETECTING
|
||||||
|
self.active = True
|
||||||
self.lang_filter = lang_filter
|
self.lang_filter = lang_filter
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self._state = ProbingState.DETECTING
|
self._state = ProbingState.DETECTING
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> Optional[str]:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def feed(self, buf):
|
@property
|
||||||
pass
|
def language(self) -> Optional[str]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def state(self):
|
def state(self) -> ProbingState:
|
||||||
return self._state
|
return self._state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def filter_high_byte_only(buf):
|
def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
|
||||||
buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
|
buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
|
||||||
return buf
|
return buf
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def filter_international_words(buf):
|
def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
|
||||||
"""
|
"""
|
||||||
We define three types of bytes:
|
We define three types of bytes:
|
||||||
alphabet: english alphabets [a-zA-Z]
|
alphabet: english alphabets [a-zA-Z]
|
||||||
international: international characters [\x80-\xFF]
|
international: international characters [\x80-\xFF]
|
||||||
marker: everything else [^a-zA-Z\x80-\xFF]
|
marker: everything else [^a-zA-Z\x80-\xFF]
|
||||||
|
|
||||||
The input buffer can be thought to contain a series of words delimited
|
The input buffer can be thought to contain a series of words delimited
|
||||||
by markers. This function works to filter all words that contain at
|
by markers. This function works to filter all words that contain at
|
||||||
least one international character. All contiguous sequences of markers
|
least one international character. All contiguous sequences of markers
|
||||||
are replaced by a single space ascii character.
|
are replaced by a single space ascii character.
|
||||||
|
|
||||||
This filter applies to all scripts which do not use English characters.
|
This filter applies to all scripts which do not use English characters.
|
||||||
"""
|
"""
|
||||||
filtered = bytearray()
|
filtered = bytearray()
|
||||||
|
@ -83,8 +91,7 @@ class CharSetProber(object):
|
||||||
# This regex expression filters out only words that have at-least one
|
# This regex expression filters out only words that have at-least one
|
||||||
# international character. The word may include one marker character at
|
# international character. The word may include one marker character at
|
||||||
# the end.
|
# the end.
|
||||||
words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
|
words = INTERNATIONAL_WORDS_PATTERN.findall(buf)
|
||||||
buf)
|
|
||||||
|
|
||||||
for word in words:
|
for word in words:
|
||||||
filtered.extend(word[:-1])
|
filtered.extend(word[:-1])
|
||||||
|
@ -94,20 +101,17 @@ class CharSetProber(object):
|
||||||
# similarly across all languages and may thus have similar
|
# similarly across all languages and may thus have similar
|
||||||
# frequencies).
|
# frequencies).
|
||||||
last_char = word[-1:]
|
last_char = word[-1:]
|
||||||
if not last_char.isalpha() and last_char < b'\x80':
|
if not last_char.isalpha() and last_char < b"\x80":
|
||||||
last_char = b' '
|
last_char = b" "
|
||||||
filtered.extend(last_char)
|
filtered.extend(last_char)
|
||||||
|
|
||||||
return filtered
|
return filtered
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def filter_with_english_letters(buf):
|
def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
|
||||||
"""
|
"""
|
||||||
Returns a copy of ``buf`` that retains only the sequences of English
|
Returns a copy of ``buf`` that retains only the sequences of English
|
||||||
alphabet and high byte characters that are not between <> characters.
|
alphabet and high byte characters that are not between <> characters.
|
||||||
Also retains English alphabet and high byte characters immediately
|
|
||||||
before occurrences of >.
|
|
||||||
|
|
||||||
This filter can be applied to all scripts which contain both English
|
This filter can be applied to all scripts which contain both English
|
||||||
characters and extended ASCII characters, but is currently only used by
|
characters and extended ASCII characters, but is currently only used by
|
||||||
``Latin1Prober``.
|
``Latin1Prober``.
|
||||||
|
@ -115,26 +119,24 @@ class CharSetProber(object):
|
||||||
filtered = bytearray()
|
filtered = bytearray()
|
||||||
in_tag = False
|
in_tag = False
|
||||||
prev = 0
|
prev = 0
|
||||||
|
buf = memoryview(buf).cast("c")
|
||||||
|
|
||||||
for curr in range(len(buf)):
|
for curr, buf_char in enumerate(buf):
|
||||||
# Slice here to get bytes instead of an int with Python 3
|
# Check if we're coming out of or entering an XML tag
|
||||||
buf_char = buf[curr:curr + 1]
|
|
||||||
# Check if we're coming out of or entering an HTML tag
|
# https://github.com/python/typeshed/issues/8182
|
||||||
if buf_char == b'>':
|
if buf_char == b">": # type: ignore[comparison-overlap]
|
||||||
|
prev = curr + 1
|
||||||
in_tag = False
|
in_tag = False
|
||||||
elif buf_char == b'<':
|
# https://github.com/python/typeshed/issues/8182
|
||||||
in_tag = True
|
elif buf_char == b"<": # type: ignore[comparison-overlap]
|
||||||
|
|
||||||
# If current character is not extended-ASCII and not alphabetic...
|
|
||||||
if buf_char < b'\x80' and not buf_char.isalpha():
|
|
||||||
# ...and we're not in a tag
|
|
||||||
if curr > prev and not in_tag:
|
if curr > prev and not in_tag:
|
||||||
# Keep everything after last non-extended-ASCII,
|
# Keep everything after last non-extended-ASCII,
|
||||||
# non-alphabetic character
|
# non-alphabetic character
|
||||||
filtered.extend(buf[prev:curr])
|
filtered.extend(buf[prev:curr])
|
||||||
# Output a space to delimit stretch we kept
|
# Output a space to delimit stretch we kept
|
||||||
filtered.extend(b' ')
|
filtered.extend(b" ")
|
||||||
prev = curr + 1
|
in_tag = True
|
||||||
|
|
||||||
# If we're not in a tag...
|
# If we're not in a tag...
|
||||||
if not in_tag:
|
if not in_tag:
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
"""
|
"""
|
||||||
Script which takes one or more file paths and reports on their detected
|
Script which takes one or more file paths and reports on their detected
|
||||||
encodings
|
encodings
|
||||||
|
@ -13,17 +12,21 @@ If no paths are provided, it takes its input from stdin.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import absolute_import, print_function, unicode_literals
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
|
from typing import Iterable, List, Optional
|
||||||
|
|
||||||
from chardet import __version__
|
from .. import __version__
|
||||||
from chardet.compat import PY2
|
from ..universaldetector import UniversalDetector
|
||||||
from chardet.universaldetector import UniversalDetector
|
|
||||||
|
|
||||||
|
|
||||||
def description_of(lines, name='stdin'):
|
def description_of(
|
||||||
|
lines: Iterable[bytes],
|
||||||
|
name: str = "stdin",
|
||||||
|
minimal: bool = False,
|
||||||
|
should_rename_legacy: bool = False,
|
||||||
|
) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Return a string describing the probable encoding of a file or
|
Return a string describing the probable encoding of a file or
|
||||||
list of strings.
|
list of strings.
|
||||||
|
@ -32,8 +35,11 @@ def description_of(lines, name='stdin'):
|
||||||
:type lines: Iterable of bytes
|
:type lines: Iterable of bytes
|
||||||
:param name: Name of file or collection of lines
|
:param name: Name of file or collection of lines
|
||||||
:type name: str
|
:type name: str
|
||||||
|
:param should_rename_legacy: Should we rename legacy encodings to
|
||||||
|
their more modern equivalents?
|
||||||
|
:type should_rename_legacy: ``bool``
|
||||||
"""
|
"""
|
||||||
u = UniversalDetector()
|
u = UniversalDetector(should_rename_legacy=should_rename_legacy)
|
||||||
for line in lines:
|
for line in lines:
|
||||||
line = bytearray(line)
|
line = bytearray(line)
|
||||||
u.feed(line)
|
u.feed(line)
|
||||||
|
@ -42,16 +48,14 @@ def description_of(lines, name='stdin'):
|
||||||
break
|
break
|
||||||
u.close()
|
u.close()
|
||||||
result = u.result
|
result = u.result
|
||||||
if PY2:
|
if minimal:
|
||||||
name = name.decode(sys.getfilesystemencoding(), 'ignore')
|
return result["encoding"]
|
||||||
if result['encoding']:
|
if result["encoding"]:
|
||||||
return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
|
return f'{name}: {result["encoding"]} with confidence {result["confidence"]}'
|
||||||
result['confidence'])
|
return f"{name}: no result"
|
||||||
else:
|
|
||||||
return '{0}: no result'.format(name)
|
|
||||||
|
|
||||||
|
|
||||||
def main(argv=None):
|
def main(argv: Optional[List[str]] = None) -> None:
|
||||||
"""
|
"""
|
||||||
Handles command line arguments and gets things started.
|
Handles command line arguments and gets things started.
|
||||||
|
|
||||||
|
@ -61,25 +65,48 @@ def main(argv=None):
|
||||||
"""
|
"""
|
||||||
# Get command line arguments
|
# Get command line arguments
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Takes one or more file paths and reports their detected \
|
description=(
|
||||||
encodings")
|
"Takes one or more file paths and reports their detected encodings"
|
||||||
parser.add_argument('input',
|
)
|
||||||
help='File whose encoding we would like to determine. \
|
)
|
||||||
(default: stdin)',
|
parser.add_argument(
|
||||||
type=argparse.FileType('rb'), nargs='*',
|
"input",
|
||||||
default=[sys.stdin if PY2 else sys.stdin.buffer])
|
help="File whose encoding we would like to determine. (default: stdin)",
|
||||||
parser.add_argument('--version', action='version',
|
type=argparse.FileType("rb"),
|
||||||
version='%(prog)s {0}'.format(__version__))
|
nargs="*",
|
||||||
|
default=[sys.stdin.buffer],
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--minimal",
|
||||||
|
help="Print only the encoding to standard output",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-l",
|
||||||
|
"--legacy",
|
||||||
|
help="Rename legacy encodings to more modern ones.",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--version", action="version", version=f"%(prog)s {__version__}"
|
||||||
|
)
|
||||||
args = parser.parse_args(argv)
|
args = parser.parse_args(argv)
|
||||||
|
|
||||||
for f in args.input:
|
for f in args.input:
|
||||||
if f.isatty():
|
if f.isatty():
|
||||||
print("You are running chardetect interactively. Press " +
|
print(
|
||||||
"CTRL-D twice at the start of a blank line to signal the " +
|
"You are running chardetect interactively. Press "
|
||||||
"end of your input. If you want help, run chardetect " +
|
"CTRL-D twice at the start of a blank line to signal the "
|
||||||
"--help\n", file=sys.stderr)
|
"end of your input. If you want help, run chardetect "
|
||||||
print(description_of(f, f.name))
|
"--help\n",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
description_of(
|
||||||
|
f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
|
@ -27,10 +27,11 @@
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
from .codingstatemachinedict import CodingStateMachineDict
|
||||||
from .enums import MachineState
|
from .enums import MachineState
|
||||||
|
|
||||||
|
|
||||||
class CodingStateMachine(object):
|
class CodingStateMachine:
|
||||||
"""
|
"""
|
||||||
A state machine to verify a byte sequence for a particular encoding. For
|
A state machine to verify a byte sequence for a particular encoding. For
|
||||||
each byte the detector receives, it will feed that byte to every active
|
each byte the detector receives, it will feed that byte to every active
|
||||||
|
@ -52,37 +53,38 @@ class CodingStateMachine(object):
|
||||||
negative answer for this encoding. Detector will exclude this
|
negative answer for this encoding. Detector will exclude this
|
||||||
encoding from consideration from here on.
|
encoding from consideration from here on.
|
||||||
"""
|
"""
|
||||||
def __init__(self, sm):
|
|
||||||
|
def __init__(self, sm: CodingStateMachineDict) -> None:
|
||||||
self._model = sm
|
self._model = sm
|
||||||
self._curr_byte_pos = 0
|
self._curr_byte_pos = 0
|
||||||
self._curr_char_len = 0
|
self._curr_char_len = 0
|
||||||
self._curr_state = None
|
self._curr_state = MachineState.START
|
||||||
|
self.active = True
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self._curr_state = MachineState.START
|
self._curr_state = MachineState.START
|
||||||
|
|
||||||
def next_state(self, c):
|
def next_state(self, c: int) -> int:
|
||||||
# for each byte we get its class
|
# for each byte we get its class
|
||||||
# if it is first byte, we also get byte length
|
# if it is first byte, we also get byte length
|
||||||
byte_class = self._model['class_table'][c]
|
byte_class = self._model["class_table"][c]
|
||||||
if self._curr_state == MachineState.START:
|
if self._curr_state == MachineState.START:
|
||||||
self._curr_byte_pos = 0
|
self._curr_byte_pos = 0
|
||||||
self._curr_char_len = self._model['char_len_table'][byte_class]
|
self._curr_char_len = self._model["char_len_table"][byte_class]
|
||||||
# from byte's class and state_table, we get its next state
|
# from byte's class and state_table, we get its next state
|
||||||
curr_state = (self._curr_state * self._model['class_factor']
|
curr_state = self._curr_state * self._model["class_factor"] + byte_class
|
||||||
+ byte_class)
|
self._curr_state = self._model["state_table"][curr_state]
|
||||||
self._curr_state = self._model['state_table'][curr_state]
|
|
||||||
self._curr_byte_pos += 1
|
self._curr_byte_pos += 1
|
||||||
return self._curr_state
|
return self._curr_state
|
||||||
|
|
||||||
def get_current_charlen(self):
|
def get_current_charlen(self) -> int:
|
||||||
return self._curr_char_len
|
return self._curr_char_len
|
||||||
|
|
||||||
def get_coding_state_machine(self):
|
def get_coding_state_machine(self) -> str:
|
||||||
return self._model['name']
|
return self._model["name"]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return self._model['language']
|
return self._model["language"]
|
||||||
|
|
19
lib/chardet/codingstatemachinedict.py
Normal file
19
lib/chardet/codingstatemachinedict.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
from typing import TYPE_CHECKING, Tuple
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
# TypedDict was introduced in Python 3.8.
|
||||||
|
#
|
||||||
|
# TODO: Remove the else block and TYPE_CHECKING check when dropping support
|
||||||
|
# for Python 3.7.
|
||||||
|
from typing import TypedDict
|
||||||
|
|
||||||
|
class CodingStateMachineDict(TypedDict, total=False):
|
||||||
|
class_table: Tuple[int, ...]
|
||||||
|
class_factor: int
|
||||||
|
state_table: Tuple[int, ...]
|
||||||
|
char_len_table: Tuple[int, ...]
|
||||||
|
name: str
|
||||||
|
language: str # Optional key
|
||||||
|
|
||||||
|
else:
|
||||||
|
CodingStateMachineDict = dict
|
|
@ -1,36 +0,0 @@
|
||||||
######################## BEGIN LICENSE BLOCK ########################
|
|
||||||
# Contributor(s):
|
|
||||||
# Dan Blanchard
|
|
||||||
# Ian Cordasco
|
|
||||||
#
|
|
||||||
# This library is free software; you can redistribute it and/or
|
|
||||||
# modify it under the terms of the GNU Lesser General Public
|
|
||||||
# License as published by the Free Software Foundation; either
|
|
||||||
# version 2.1 of the License, or (at your option) any later version.
|
|
||||||
#
|
|
||||||
# This library is distributed in the hope that it will be useful,
|
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
# Lesser General Public License for more details.
|
|
||||||
#
|
|
||||||
# You should have received a copy of the GNU Lesser General Public
|
|
||||||
# License along with this library; if not, write to the Free Software
|
|
||||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
|
||||||
# 02110-1301 USA
|
|
||||||
######################### END LICENSE BLOCK #########################
|
|
||||||
|
|
||||||
import sys
|
|
||||||
|
|
||||||
|
|
||||||
if sys.version_info < (3, 0):
|
|
||||||
PY2 = True
|
|
||||||
PY3 = False
|
|
||||||
string_types = (str, unicode)
|
|
||||||
text_type = unicode
|
|
||||||
iteritems = dict.iteritems
|
|
||||||
else:
|
|
||||||
PY2 = False
|
|
||||||
PY3 = True
|
|
||||||
string_types = (bytes, str)
|
|
||||||
text_type = str
|
|
||||||
iteritems = dict.items
|
|
|
@ -32,8 +32,8 @@ from .mbcssm import CP949_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class CP949Prober(MultiByteCharSetProber):
|
class CP949Prober(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(CP949Prober, self).__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
|
self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
|
||||||
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
|
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
|
||||||
# not different.
|
# not different.
|
||||||
|
@ -41,9 +41,9 @@ class CP949Prober(MultiByteCharSetProber):
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "CP949"
|
return "CP949"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Korean"
|
return "Korean"
|
||||||
|
|
|
@ -4,21 +4,26 @@ All of the Enums that are used throughout the chardet package.
|
||||||
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from enum import Enum, Flag
|
||||||
|
|
||||||
class InputState(object):
|
|
||||||
|
class InputState:
|
||||||
"""
|
"""
|
||||||
This enum represents the different states a universal detector can be in.
|
This enum represents the different states a universal detector can be in.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
PURE_ASCII = 0
|
PURE_ASCII = 0
|
||||||
ESC_ASCII = 1
|
ESC_ASCII = 1
|
||||||
HIGH_BYTE = 2
|
HIGH_BYTE = 2
|
||||||
|
|
||||||
|
|
||||||
class LanguageFilter(object):
|
class LanguageFilter(Flag):
|
||||||
"""
|
"""
|
||||||
This enum represents the different language filters we can apply to a
|
This enum represents the different language filters we can apply to a
|
||||||
``UniversalDetector``.
|
``UniversalDetector``.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
NONE = 0x00
|
||||||
CHINESE_SIMPLIFIED = 0x01
|
CHINESE_SIMPLIFIED = 0x01
|
||||||
CHINESE_TRADITIONAL = 0x02
|
CHINESE_TRADITIONAL = 0x02
|
||||||
JAPANESE = 0x04
|
JAPANESE = 0x04
|
||||||
|
@ -29,46 +34,50 @@ class LanguageFilter(object):
|
||||||
CJK = CHINESE | JAPANESE | KOREAN
|
CJK = CHINESE | JAPANESE | KOREAN
|
||||||
|
|
||||||
|
|
||||||
class ProbingState(object):
|
class ProbingState(Enum):
|
||||||
"""
|
"""
|
||||||
This enum represents the different states a prober can be in.
|
This enum represents the different states a prober can be in.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
DETECTING = 0
|
DETECTING = 0
|
||||||
FOUND_IT = 1
|
FOUND_IT = 1
|
||||||
NOT_ME = 2
|
NOT_ME = 2
|
||||||
|
|
||||||
|
|
||||||
class MachineState(object):
|
class MachineState:
|
||||||
"""
|
"""
|
||||||
This enum represents the different states a state machine can be in.
|
This enum represents the different states a state machine can be in.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
START = 0
|
START = 0
|
||||||
ERROR = 1
|
ERROR = 1
|
||||||
ITS_ME = 2
|
ITS_ME = 2
|
||||||
|
|
||||||
|
|
||||||
class SequenceLikelihood(object):
|
class SequenceLikelihood:
|
||||||
"""
|
"""
|
||||||
This enum represents the likelihood of a character following the previous one.
|
This enum represents the likelihood of a character following the previous one.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
NEGATIVE = 0
|
NEGATIVE = 0
|
||||||
UNLIKELY = 1
|
UNLIKELY = 1
|
||||||
LIKELY = 2
|
LIKELY = 2
|
||||||
POSITIVE = 3
|
POSITIVE = 3
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_num_categories(cls):
|
def get_num_categories(cls) -> int:
|
||||||
""":returns: The number of likelihood categories in the enum."""
|
""":returns: The number of likelihood categories in the enum."""
|
||||||
return 4
|
return 4
|
||||||
|
|
||||||
|
|
||||||
class CharacterCategory(object):
|
class CharacterCategory:
|
||||||
"""
|
"""
|
||||||
This enum represents the different categories language models for
|
This enum represents the different categories language models for
|
||||||
``SingleByteCharsetProber`` put characters into.
|
``SingleByteCharsetProber`` put characters into.
|
||||||
|
|
||||||
Anything less than CONTROL is considered a letter.
|
Anything less than CONTROL is considered a letter.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
UNDEFINED = 255
|
UNDEFINED = 255
|
||||||
LINE_BREAK = 254
|
LINE_BREAK = 254
|
||||||
SYMBOL = 253
|
SYMBOL = 253
|
||||||
|
|
|
@ -25,11 +25,17 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .codingstatemachine import CodingStateMachine
|
from .codingstatemachine import CodingStateMachine
|
||||||
from .enums import LanguageFilter, ProbingState, MachineState
|
from .enums import LanguageFilter, MachineState, ProbingState
|
||||||
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
|
from .escsm import (
|
||||||
ISO2022KR_SM_MODEL)
|
HZ_SM_MODEL,
|
||||||
|
ISO2022CN_SM_MODEL,
|
||||||
|
ISO2022JP_SM_MODEL,
|
||||||
|
ISO2022KR_SM_MODEL,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class EscCharSetProber(CharSetProber):
|
class EscCharSetProber(CharSetProber):
|
||||||
|
@ -39,8 +45,8 @@ class EscCharSetProber(CharSetProber):
|
||||||
identify these encodings.
|
identify these encodings.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, lang_filter=None):
|
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||||
super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
|
super().__init__(lang_filter=lang_filter)
|
||||||
self.coding_sm = []
|
self.coding_sm = []
|
||||||
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
|
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
|
||||||
self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
|
self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
|
||||||
|
@ -49,17 +55,15 @@ class EscCharSetProber(CharSetProber):
|
||||||
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
|
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
|
||||||
if self.lang_filter & LanguageFilter.KOREAN:
|
if self.lang_filter & LanguageFilter.KOREAN:
|
||||||
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
|
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
|
||||||
self.active_sm_count = None
|
self.active_sm_count = 0
|
||||||
self._detected_charset = None
|
self._detected_charset: Optional[str] = None
|
||||||
self._detected_language = None
|
self._detected_language: Optional[str] = None
|
||||||
self._state = None
|
self._state = ProbingState.DETECTING
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super(EscCharSetProber, self).reset()
|
super().reset()
|
||||||
for coding_sm in self.coding_sm:
|
for coding_sm in self.coding_sm:
|
||||||
if not coding_sm:
|
|
||||||
continue
|
|
||||||
coding_sm.active = True
|
coding_sm.active = True
|
||||||
coding_sm.reset()
|
coding_sm.reset()
|
||||||
self.active_sm_count = len(self.coding_sm)
|
self.active_sm_count = len(self.coding_sm)
|
||||||
|
@ -67,23 +71,20 @@ class EscCharSetProber(CharSetProber):
|
||||||
self._detected_language = None
|
self._detected_language = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> Optional[str]:
|
||||||
return self._detected_charset
|
return self._detected_charset
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> Optional[str]:
|
||||||
return self._detected_language
|
return self._detected_language
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
if self._detected_charset:
|
return 0.99 if self._detected_charset else 0.00
|
||||||
return 0.99
|
|
||||||
else:
|
|
||||||
return 0.00
|
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
for c in byte_str:
|
for c in byte_str:
|
||||||
for coding_sm in self.coding_sm:
|
for coding_sm in self.coding_sm:
|
||||||
if not coding_sm or not coding_sm.active:
|
if not coding_sm.active:
|
||||||
continue
|
continue
|
||||||
coding_state = coding_sm.next_state(c)
|
coding_state = coding_sm.next_state(c)
|
||||||
if coding_state == MachineState.ERROR:
|
if coding_state == MachineState.ERROR:
|
||||||
|
|
|
@ -25,222 +25,237 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from .codingstatemachinedict import CodingStateMachineDict
|
||||||
from .enums import MachineState
|
from .enums import MachineState
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
HZ_CLS = (
|
HZ_CLS = (
|
||||||
1,0,0,0,0,0,0,0, # 00 - 07
|
1, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
|
||||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
0, 0, 0, 0, 0, 0, 0, 0, # 08 - 0f
|
||||||
0,0,0,0,0,0,0,0, # 10 - 17
|
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
|
||||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f
|
||||||
0,0,0,0,0,0,0,0, # 20 - 27
|
0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27
|
||||||
0,0,0,0,0,0,0,0, # 28 - 2f
|
0, 0, 0, 0, 0, 0, 0, 0, # 28 - 2f
|
||||||
0,0,0,0,0,0,0,0, # 30 - 37
|
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
|
||||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
|
||||||
0,0,0,0,0,0,0,0, # 40 - 47
|
0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47
|
||||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
|
||||||
0,0,0,0,0,0,0,0, # 50 - 57
|
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
|
||||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
|
||||||
0,0,0,0,0,0,0,0, # 60 - 67
|
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
|
||||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
|
||||||
0,0,0,0,0,0,0,0, # 70 - 77
|
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
|
||||||
0,0,0,4,0,5,2,0, # 78 - 7f
|
0, 0, 0, 4, 0, 5, 2, 0, # 78 - 7f
|
||||||
1,1,1,1,1,1,1,1, # 80 - 87
|
1, 1, 1, 1, 1, 1, 1, 1, # 80 - 87
|
||||||
1,1,1,1,1,1,1,1, # 88 - 8f
|
1, 1, 1, 1, 1, 1, 1, 1, # 88 - 8f
|
||||||
1,1,1,1,1,1,1,1, # 90 - 97
|
1, 1, 1, 1, 1, 1, 1, 1, # 90 - 97
|
||||||
1,1,1,1,1,1,1,1, # 98 - 9f
|
1, 1, 1, 1, 1, 1, 1, 1, # 98 - 9f
|
||||||
1,1,1,1,1,1,1,1, # a0 - a7
|
1, 1, 1, 1, 1, 1, 1, 1, # a0 - a7
|
||||||
1,1,1,1,1,1,1,1, # a8 - af
|
1, 1, 1, 1, 1, 1, 1, 1, # a8 - af
|
||||||
1,1,1,1,1,1,1,1, # b0 - b7
|
1, 1, 1, 1, 1, 1, 1, 1, # b0 - b7
|
||||||
1,1,1,1,1,1,1,1, # b8 - bf
|
1, 1, 1, 1, 1, 1, 1, 1, # b8 - bf
|
||||||
1,1,1,1,1,1,1,1, # c0 - c7
|
1, 1, 1, 1, 1, 1, 1, 1, # c0 - c7
|
||||||
1,1,1,1,1,1,1,1, # c8 - cf
|
1, 1, 1, 1, 1, 1, 1, 1, # c8 - cf
|
||||||
1,1,1,1,1,1,1,1, # d0 - d7
|
1, 1, 1, 1, 1, 1, 1, 1, # d0 - d7
|
||||||
1,1,1,1,1,1,1,1, # d8 - df
|
1, 1, 1, 1, 1, 1, 1, 1, # d8 - df
|
||||||
1,1,1,1,1,1,1,1, # e0 - e7
|
1, 1, 1, 1, 1, 1, 1, 1, # e0 - e7
|
||||||
1,1,1,1,1,1,1,1, # e8 - ef
|
1, 1, 1, 1, 1, 1, 1, 1, # e8 - ef
|
||||||
1,1,1,1,1,1,1,1, # f0 - f7
|
1, 1, 1, 1, 1, 1, 1, 1, # f0 - f7
|
||||||
1,1,1,1,1,1,1,1, # f8 - ff
|
1, 1, 1, 1, 1, 1, 1, 1, # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
HZ_ST = (
|
HZ_ST = (
|
||||||
MachineState.START,MachineState.ERROR, 3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
|
MachineState.START, MachineState.ERROR, 3, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, # 00-07
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
|
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 08-0f
|
||||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START, 4,MachineState.ERROR,# 10-17
|
MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, 4, MachineState.ERROR, # 10-17
|
||||||
5,MachineState.ERROR, 6,MachineState.ERROR, 5, 5, 4,MachineState.ERROR,# 18-1f
|
5, MachineState.ERROR, 6, MachineState.ERROR, 5, 5, 4, MachineState.ERROR, # 18-1f
|
||||||
4,MachineState.ERROR, 4, 4, 4,MachineState.ERROR, 4,MachineState.ERROR,# 20-27
|
4, MachineState.ERROR, 4, 4, 4, MachineState.ERROR, 4, MachineState.ERROR, # 20-27
|
||||||
4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f
|
4, MachineState.ITS_ME, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 28-2f
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
||||||
|
|
||||||
HZ_SM_MODEL = {'class_table': HZ_CLS,
|
HZ_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 6,
|
"class_table": HZ_CLS,
|
||||||
'state_table': HZ_ST,
|
"class_factor": 6,
|
||||||
'char_len_table': HZ_CHAR_LEN_TABLE,
|
"state_table": HZ_ST,
|
||||||
'name': "HZ-GB-2312",
|
"char_len_table": HZ_CHAR_LEN_TABLE,
|
||||||
'language': 'Chinese'}
|
"name": "HZ-GB-2312",
|
||||||
|
"language": "Chinese",
|
||||||
|
}
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
ISO2022CN_CLS = (
|
ISO2022CN_CLS = (
|
||||||
2,0,0,0,0,0,0,0, # 00 - 07
|
2, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
|
||||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
0, 0, 0, 0, 0, 0, 0, 0, # 08 - 0f
|
||||||
0,0,0,0,0,0,0,0, # 10 - 17
|
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
|
||||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f
|
||||||
0,0,0,0,0,0,0,0, # 20 - 27
|
0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27
|
||||||
0,3,0,0,0,0,0,0, # 28 - 2f
|
0, 3, 0, 0, 0, 0, 0, 0, # 28 - 2f
|
||||||
0,0,0,0,0,0,0,0, # 30 - 37
|
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
|
||||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
|
||||||
0,0,0,4,0,0,0,0, # 40 - 47
|
0, 0, 0, 4, 0, 0, 0, 0, # 40 - 47
|
||||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
|
||||||
0,0,0,0,0,0,0,0, # 50 - 57
|
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
|
||||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
|
||||||
0,0,0,0,0,0,0,0, # 60 - 67
|
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
|
||||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
|
||||||
0,0,0,0,0,0,0,0, # 70 - 77
|
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
|
||||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
|
||||||
2,2,2,2,2,2,2,2, # 80 - 87
|
2, 2, 2, 2, 2, 2, 2, 2, # 80 - 87
|
||||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
2, 2, 2, 2, 2, 2, 2, 2, # 88 - 8f
|
||||||
2,2,2,2,2,2,2,2, # 90 - 97
|
2, 2, 2, 2, 2, 2, 2, 2, # 90 - 97
|
||||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
2, 2, 2, 2, 2, 2, 2, 2, # 98 - 9f
|
||||||
2,2,2,2,2,2,2,2, # a0 - a7
|
2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
|
||||||
2,2,2,2,2,2,2,2, # a8 - af
|
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
|
||||||
2,2,2,2,2,2,2,2, # b0 - b7
|
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
|
||||||
2,2,2,2,2,2,2,2, # b8 - bf
|
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
|
||||||
2,2,2,2,2,2,2,2, # c0 - c7
|
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
|
||||||
2,2,2,2,2,2,2,2, # c8 - cf
|
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
|
||||||
2,2,2,2,2,2,2,2, # d0 - d7
|
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
|
||||||
2,2,2,2,2,2,2,2, # d8 - df
|
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
|
||||||
2,2,2,2,2,2,2,2, # e0 - e7
|
2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7
|
||||||
2,2,2,2,2,2,2,2, # e8 - ef
|
2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef
|
||||||
2,2,2,2,2,2,2,2, # f0 - f7
|
2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7
|
||||||
2,2,2,2,2,2,2,2, # f8 - ff
|
2, 2, 2, 2, 2, 2, 2, 2, # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
ISO2022CN_ST = (
|
ISO2022CN_ST = (
|
||||||
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
|
MachineState.START, 3, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 00-07
|
||||||
MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
|
MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 08-0f
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
|
MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 10-17
|
||||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,# 18-1f
|
MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, # 18-1f
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27
|
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 20-27
|
||||||
5, 6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f
|
5, 6, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 28-2f
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37
|
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 30-37
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f
|
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.START, # 38-3f
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
|
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||||
|
|
||||||
ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS,
|
ISO2022CN_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 9,
|
"class_table": ISO2022CN_CLS,
|
||||||
'state_table': ISO2022CN_ST,
|
"class_factor": 9,
|
||||||
'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
|
"state_table": ISO2022CN_ST,
|
||||||
'name': "ISO-2022-CN",
|
"char_len_table": ISO2022CN_CHAR_LEN_TABLE,
|
||||||
'language': 'Chinese'}
|
"name": "ISO-2022-CN",
|
||||||
|
"language": "Chinese",
|
||||||
|
}
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
ISO2022JP_CLS = (
|
ISO2022JP_CLS = (
|
||||||
2,0,0,0,0,0,0,0, # 00 - 07
|
2, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
|
||||||
0,0,0,0,0,0,2,2, # 08 - 0f
|
0, 0, 0, 0, 0, 0, 2, 2, # 08 - 0f
|
||||||
0,0,0,0,0,0,0,0, # 10 - 17
|
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
|
||||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f
|
||||||
0,0,0,0,7,0,0,0, # 20 - 27
|
0, 0, 0, 0, 7, 0, 0, 0, # 20 - 27
|
||||||
3,0,0,0,0,0,0,0, # 28 - 2f
|
3, 0, 0, 0, 0, 0, 0, 0, # 28 - 2f
|
||||||
0,0,0,0,0,0,0,0, # 30 - 37
|
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
|
||||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
|
||||||
6,0,4,0,8,0,0,0, # 40 - 47
|
6, 0, 4, 0, 8, 0, 0, 0, # 40 - 47
|
||||||
0,9,5,0,0,0,0,0, # 48 - 4f
|
0, 9, 5, 0, 0, 0, 0, 0, # 48 - 4f
|
||||||
0,0,0,0,0,0,0,0, # 50 - 57
|
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
|
||||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
|
||||||
0,0,0,0,0,0,0,0, # 60 - 67
|
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
|
||||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
|
||||||
0,0,0,0,0,0,0,0, # 70 - 77
|
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
|
||||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
|
||||||
2,2,2,2,2,2,2,2, # 80 - 87
|
2, 2, 2, 2, 2, 2, 2, 2, # 80 - 87
|
||||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
2, 2, 2, 2, 2, 2, 2, 2, # 88 - 8f
|
||||||
2,2,2,2,2,2,2,2, # 90 - 97
|
2, 2, 2, 2, 2, 2, 2, 2, # 90 - 97
|
||||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
2, 2, 2, 2, 2, 2, 2, 2, # 98 - 9f
|
||||||
2,2,2,2,2,2,2,2, # a0 - a7
|
2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
|
||||||
2,2,2,2,2,2,2,2, # a8 - af
|
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
|
||||||
2,2,2,2,2,2,2,2, # b0 - b7
|
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
|
||||||
2,2,2,2,2,2,2,2, # b8 - bf
|
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
|
||||||
2,2,2,2,2,2,2,2, # c0 - c7
|
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
|
||||||
2,2,2,2,2,2,2,2, # c8 - cf
|
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
|
||||||
2,2,2,2,2,2,2,2, # d0 - d7
|
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
|
||||||
2,2,2,2,2,2,2,2, # d8 - df
|
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
|
||||||
2,2,2,2,2,2,2,2, # e0 - e7
|
2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7
|
||||||
2,2,2,2,2,2,2,2, # e8 - ef
|
2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef
|
||||||
2,2,2,2,2,2,2,2, # f0 - f7
|
2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7
|
||||||
2,2,2,2,2,2,2,2, # f8 - ff
|
2, 2, 2, 2, 2, 2, 2, 2, # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
ISO2022JP_ST = (
|
ISO2022JP_ST = (
|
||||||
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
|
MachineState.START, 3, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 00-07
|
||||||
MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
|
MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 08-0f
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
|
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 10-17
|
||||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f
|
MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, # 18-1f
|
||||||
MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 20-27
|
MachineState.ERROR, 5, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, MachineState.ERROR, # 20-27
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f
|
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 6, MachineState.ITS_ME, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, # 28-2f
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37
|
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, # 30-37
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f
|
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 38-3f
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47
|
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.START, MachineState.START, # 40-47
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||||
|
|
||||||
ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS,
|
ISO2022JP_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 10,
|
"class_table": ISO2022JP_CLS,
|
||||||
'state_table': ISO2022JP_ST,
|
"class_factor": 10,
|
||||||
'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
|
"state_table": ISO2022JP_ST,
|
||||||
'name': "ISO-2022-JP",
|
"char_len_table": ISO2022JP_CHAR_LEN_TABLE,
|
||||||
'language': 'Japanese'}
|
"name": "ISO-2022-JP",
|
||||||
|
"language": "Japanese",
|
||||||
|
}
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
ISO2022KR_CLS = (
|
ISO2022KR_CLS = (
|
||||||
2,0,0,0,0,0,0,0, # 00 - 07
|
2, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
|
||||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
0, 0, 0, 0, 0, 0, 0, 0, # 08 - 0f
|
||||||
0,0,0,0,0,0,0,0, # 10 - 17
|
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
|
||||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f
|
||||||
0,0,0,0,3,0,0,0, # 20 - 27
|
0, 0, 0, 0, 3, 0, 0, 0, # 20 - 27
|
||||||
0,4,0,0,0,0,0,0, # 28 - 2f
|
0, 4, 0, 0, 0, 0, 0, 0, # 28 - 2f
|
||||||
0,0,0,0,0,0,0,0, # 30 - 37
|
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
|
||||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
|
||||||
0,0,0,5,0,0,0,0, # 40 - 47
|
0, 0, 0, 5, 0, 0, 0, 0, # 40 - 47
|
||||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
|
||||||
0,0,0,0,0,0,0,0, # 50 - 57
|
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
|
||||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
|
||||||
0,0,0,0,0,0,0,0, # 60 - 67
|
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
|
||||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
|
||||||
0,0,0,0,0,0,0,0, # 70 - 77
|
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
|
||||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
|
||||||
2,2,2,2,2,2,2,2, # 80 - 87
|
2, 2, 2, 2, 2, 2, 2, 2, # 80 - 87
|
||||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
2, 2, 2, 2, 2, 2, 2, 2, # 88 - 8f
|
||||||
2,2,2,2,2,2,2,2, # 90 - 97
|
2, 2, 2, 2, 2, 2, 2, 2, # 90 - 97
|
||||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
2, 2, 2, 2, 2, 2, 2, 2, # 98 - 9f
|
||||||
2,2,2,2,2,2,2,2, # a0 - a7
|
2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
|
||||||
2,2,2,2,2,2,2,2, # a8 - af
|
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
|
||||||
2,2,2,2,2,2,2,2, # b0 - b7
|
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
|
||||||
2,2,2,2,2,2,2,2, # b8 - bf
|
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
|
||||||
2,2,2,2,2,2,2,2, # c0 - c7
|
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
|
||||||
2,2,2,2,2,2,2,2, # c8 - cf
|
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
|
||||||
2,2,2,2,2,2,2,2, # d0 - d7
|
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
|
||||||
2,2,2,2,2,2,2,2, # d8 - df
|
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
|
||||||
2,2,2,2,2,2,2,2, # e0 - e7
|
2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7
|
||||||
2,2,2,2,2,2,2,2, # e8 - ef
|
2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef
|
||||||
2,2,2,2,2,2,2,2, # f0 - f7
|
2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7
|
||||||
2,2,2,2,2,2,2,2, # f8 - ff
|
2, 2, 2, 2, 2, 2, 2, 2, # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
ISO2022KR_ST = (
|
ISO2022KR_ST = (
|
||||||
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
|
MachineState.START, 3, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, # 00-07
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
|
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 08-0f
|
||||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 10-17
|
MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, MachineState.ERROR, # 10-17
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f
|
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 5, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 18-1f
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27
|
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 20-27
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
||||||
|
|
||||||
ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS,
|
ISO2022KR_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 6,
|
"class_table": ISO2022KR_CLS,
|
||||||
'state_table': ISO2022KR_ST,
|
"class_factor": 6,
|
||||||
'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
|
"state_table": ISO2022KR_ST,
|
||||||
'name': "ISO-2022-KR",
|
"char_len_table": ISO2022KR_CHAR_LEN_TABLE,
|
||||||
'language': 'Korean'}
|
"name": "ISO-2022-KR",
|
||||||
|
"language": "Korean",
|
||||||
|
}
|
||||||
|
|
|
@ -25,68 +25,78 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .enums import ProbingState, MachineState
|
from typing import Union
|
||||||
from .mbcharsetprober import MultiByteCharSetProber
|
|
||||||
from .codingstatemachine import CodingStateMachine
|
|
||||||
from .chardistribution import EUCJPDistributionAnalysis
|
from .chardistribution import EUCJPDistributionAnalysis
|
||||||
|
from .codingstatemachine import CodingStateMachine
|
||||||
|
from .enums import MachineState, ProbingState
|
||||||
from .jpcntx import EUCJPContextAnalysis
|
from .jpcntx import EUCJPContextAnalysis
|
||||||
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
from .mbcssm import EUCJP_SM_MODEL
|
from .mbcssm import EUCJP_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class EUCJPProber(MultiByteCharSetProber):
|
class EUCJPProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(EUCJPProber, self).__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
|
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
|
||||||
self.distribution_analyzer = EUCJPDistributionAnalysis()
|
self.distribution_analyzer = EUCJPDistributionAnalysis()
|
||||||
self.context_analyzer = EUCJPContextAnalysis()
|
self.context_analyzer = EUCJPContextAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super(EUCJPProber, self).reset()
|
super().reset()
|
||||||
self.context_analyzer.reset()
|
self.context_analyzer.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "EUC-JP"
|
return "EUC-JP"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Japanese"
|
return "Japanese"
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
for i in range(len(byte_str)):
|
assert self.coding_sm is not None
|
||||||
# PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
|
assert self.distribution_analyzer is not None
|
||||||
coding_state = self.coding_sm.next_state(byte_str[i])
|
|
||||||
|
for i, byte in enumerate(byte_str):
|
||||||
|
# PY3K: byte_str is a byte array, so byte is an int, not a byte
|
||||||
|
coding_state = self.coding_sm.next_state(byte)
|
||||||
if coding_state == MachineState.ERROR:
|
if coding_state == MachineState.ERROR:
|
||||||
self.logger.debug('%s %s prober hit error at byte %s',
|
self.logger.debug(
|
||||||
self.charset_name, self.language, i)
|
"%s %s prober hit error at byte %s",
|
||||||
|
self.charset_name,
|
||||||
|
self.language,
|
||||||
|
i,
|
||||||
|
)
|
||||||
self._state = ProbingState.NOT_ME
|
self._state = ProbingState.NOT_ME
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.ITS_ME:
|
if coding_state == MachineState.ITS_ME:
|
||||||
self._state = ProbingState.FOUND_IT
|
self._state = ProbingState.FOUND_IT
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.START:
|
if coding_state == MachineState.START:
|
||||||
char_len = self.coding_sm.get_current_charlen()
|
char_len = self.coding_sm.get_current_charlen()
|
||||||
if i == 0:
|
if i == 0:
|
||||||
self._last_char[1] = byte_str[0]
|
self._last_char[1] = byte
|
||||||
self.context_analyzer.feed(self._last_char, char_len)
|
self.context_analyzer.feed(self._last_char, char_len)
|
||||||
self.distribution_analyzer.feed(self._last_char, char_len)
|
self.distribution_analyzer.feed(self._last_char, char_len)
|
||||||
else:
|
else:
|
||||||
self.context_analyzer.feed(byte_str[i - 1:i + 1],
|
self.context_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
|
||||||
char_len)
|
self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
|
||||||
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
|
||||||
char_len)
|
|
||||||
|
|
||||||
self._last_char[0] = byte_str[-1]
|
self._last_char[0] = byte_str[-1]
|
||||||
|
|
||||||
if self.state == ProbingState.DETECTING:
|
if self.state == ProbingState.DETECTING:
|
||||||
if (self.context_analyzer.got_enough_data() and
|
if self.context_analyzer.got_enough_data() and (
|
||||||
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
self.get_confidence() > self.SHORTCUT_THRESHOLD
|
||||||
|
):
|
||||||
self._state = ProbingState.FOUND_IT
|
self._state = ProbingState.FOUND_IT
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
|
assert self.distribution_analyzer is not None
|
||||||
|
|
||||||
context_conf = self.context_analyzer.get_confidence()
|
context_conf = self.context_analyzer.get_confidence()
|
||||||
distrib_conf = self.distribution_analyzer.get_confidence()
|
distrib_conf = self.distribution_analyzer.get_confidence()
|
||||||
return max(context_conf, distrib_conf)
|
return max(context_conf, distrib_conf)
|
||||||
|
|
|
@ -43,6 +43,7 @@ EUCKR_TYPICAL_DISTRIBUTION_RATIO = 6.0
|
||||||
EUCKR_TABLE_SIZE = 2352
|
EUCKR_TABLE_SIZE = 2352
|
||||||
|
|
||||||
# Char to FreqOrder table ,
|
# Char to FreqOrder table ,
|
||||||
|
# fmt: off
|
||||||
EUCKR_CHAR_TO_FREQ_ORDER = (
|
EUCKR_CHAR_TO_FREQ_ORDER = (
|
||||||
13, 130, 120,1396, 481,1719,1720, 328, 609, 212,1721, 707, 400, 299,1722, 87,
|
13, 130, 120,1396, 481,1719,1720, 328, 609, 212,1721, 707, 400, 299,1722, 87,
|
||||||
1397,1723, 104, 536,1117,1203,1724,1267, 685,1268, 508,1725,1726,1727,1728,1398,
|
1397,1723, 104, 536,1117,1203,1724,1267, 685,1268, 508,1725,1726,1727,1728,1398,
|
||||||
|
@ -192,4 +193,4 @@ EUCKR_CHAR_TO_FREQ_ORDER = (
|
||||||
2629,2630,2631, 924, 648, 863, 603,2632,2633, 934,1540, 864, 865,2634, 642,1042,
|
2629,2630,2631, 924, 648, 863, 603,2632,2633, 934,1540, 864, 865,2634, 642,1042,
|
||||||
670,1190,2635,2636,2637,2638, 168,2639, 652, 873, 542,1054,1541,2640,2641,2642, # 512, 256
|
670,1190,2635,2636,2637,2638, 168,2639, 652, 873, 542,1054,1541,2640,2641,2642, # 512, 256
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
|
@ -25,23 +25,23 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .mbcharsetprober import MultiByteCharSetProber
|
|
||||||
from .codingstatemachine import CodingStateMachine
|
|
||||||
from .chardistribution import EUCKRDistributionAnalysis
|
from .chardistribution import EUCKRDistributionAnalysis
|
||||||
|
from .codingstatemachine import CodingStateMachine
|
||||||
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
from .mbcssm import EUCKR_SM_MODEL
|
from .mbcssm import EUCKR_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class EUCKRProber(MultiByteCharSetProber):
|
class EUCKRProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(EUCKRProber, self).__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
|
self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
|
||||||
self.distribution_analyzer = EUCKRDistributionAnalysis()
|
self.distribution_analyzer = EUCKRDistributionAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "EUC-KR"
|
return "EUC-KR"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Korean"
|
return "Korean"
|
||||||
|
|
|
@ -43,345 +43,346 @@
|
||||||
|
|
||||||
EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
||||||
|
|
||||||
# Char to FreqOrder table ,
|
# Char to FreqOrder table
|
||||||
EUCTW_TABLE_SIZE = 5376
|
EUCTW_TABLE_SIZE = 5376
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
EUCTW_CHAR_TO_FREQ_ORDER = (
|
EUCTW_CHAR_TO_FREQ_ORDER = (
|
||||||
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
|
1, 1800, 1506, 255, 1431, 198, 9, 82, 6, 7310, 177, 202, 3615, 1256, 2808, 110, # 2742
|
||||||
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758
|
3735, 33, 3241, 261, 76, 44, 2113, 16, 2931, 2184, 1176, 659, 3868, 26, 3404, 2643, # 2758
|
||||||
1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774
|
1198, 3869, 3313, 4060, 410, 2211, 302, 590, 361, 1963, 8, 204, 58, 4296, 7311, 1931, # 2774
|
||||||
63,7312,7313, 317,1614, 75, 222, 159,4061,2412,1480,7314,3500,3068, 224,2809, # 2790
|
63, 7312, 7313, 317, 1614, 75, 222, 159, 4061, 2412, 1480, 7314, 3500, 3068, 224, 2809, # 2790
|
||||||
3616, 3, 10,3870,1471, 29,2774,1135,2852,1939, 873, 130,3242,1123, 312,7315, # 2806
|
3616, 3, 10, 3870, 1471, 29, 2774, 1135, 2852, 1939, 873, 130, 3242, 1123, 312, 7315, # 2806
|
||||||
4297,2051, 507, 252, 682,7316, 142,1914, 124, 206,2932, 34,3501,3173, 64, 604, # 2822
|
4297, 2051, 507, 252, 682, 7316, 142, 1914, 124, 206, 2932, 34, 3501, 3173, 64, 604, # 2822
|
||||||
7317,2494,1976,1977, 155,1990, 645, 641,1606,7318,3405, 337, 72, 406,7319, 80, # 2838
|
7317, 2494, 1976, 1977, 155, 1990, 645, 641, 1606, 7318, 3405, 337, 72, 406, 7319, 80, # 2838
|
||||||
630, 238,3174,1509, 263, 939,1092,2644, 756,1440,1094,3406, 449, 69,2969, 591, # 2854
|
630, 238, 3174, 1509, 263, 939, 1092, 2644, 756, 1440, 1094, 3406, 449, 69, 2969, 591, # 2854
|
||||||
179,2095, 471, 115,2034,1843, 60, 50,2970, 134, 806,1868, 734,2035,3407, 180, # 2870
|
179, 2095, 471, 115, 2034, 1843, 60, 50, 2970, 134, 806, 1868, 734, 2035, 3407, 180, # 2870
|
||||||
995,1607, 156, 537,2893, 688,7320, 319,1305, 779,2144, 514,2374, 298,4298, 359, # 2886
|
995, 1607, 156, 537, 2893, 688, 7320, 319, 1305, 779, 2144, 514, 2374, 298, 4298, 359, # 2886
|
||||||
2495, 90,2707,1338, 663, 11, 906,1099,2545, 20,2436, 182, 532,1716,7321, 732, # 2902
|
2495, 90, 2707, 1338, 663, 11, 906, 1099, 2545, 20, 2436, 182, 532, 1716, 7321, 732, # 2902
|
||||||
1376,4062,1311,1420,3175, 25,2312,1056, 113, 399, 382,1949, 242,3408,2467, 529, # 2918
|
1376, 4062, 1311, 1420, 3175, 25, 2312, 1056, 113, 399, 382, 1949, 242, 3408, 2467, 529, # 2918
|
||||||
3243, 475,1447,3617,7322, 117, 21, 656, 810,1297,2295,2329,3502,7323, 126,4063, # 2934
|
3243, 475, 1447, 3617, 7322, 117, 21, 656, 810, 1297, 2295, 2329, 3502, 7323, 126, 4063, # 2934
|
||||||
706, 456, 150, 613,4299, 71,1118,2036,4064, 145,3069, 85, 835, 486,2114,1246, # 2950
|
706, 456, 150, 613, 4299, 71, 1118, 2036, 4064, 145, 3069, 85, 835, 486, 2114, 1246, # 2950
|
||||||
1426, 428, 727,1285,1015, 800, 106, 623, 303,1281,7324,2127,2354, 347,3736, 221, # 2966
|
1426, 428, 727, 1285, 1015, 800, 106, 623, 303, 1281, 7324, 2127, 2354, 347, 3736, 221, # 2966
|
||||||
3503,3110,7325,1955,1153,4065, 83, 296,1199,3070, 192, 624, 93,7326, 822,1897, # 2982
|
3503, 3110, 7325, 1955, 1153, 4065, 83, 296, 1199, 3070, 192, 624, 93, 7326, 822, 1897, # 2982
|
||||||
2810,3111, 795,2064, 991,1554,1542,1592, 27, 43,2853, 859, 139,1456, 860,4300, # 2998
|
2810, 3111, 795, 2064, 991, 1554, 1542, 1592, 27, 43, 2853, 859, 139, 1456, 860, 4300, # 2998
|
||||||
437, 712,3871, 164,2392,3112, 695, 211,3017,2096, 195,3872,1608,3504,3505,3618, # 3014
|
437, 712, 3871, 164, 2392, 3112, 695, 211, 3017, 2096, 195, 3872, 1608, 3504, 3505, 3618, # 3014
|
||||||
3873, 234, 811,2971,2097,3874,2229,1441,3506,1615,2375, 668,2076,1638, 305, 228, # 3030
|
3873, 234, 811, 2971, 2097, 3874, 2229, 1441, 3506, 1615, 2375, 668, 2076, 1638, 305, 228, # 3030
|
||||||
1664,4301, 467, 415,7327, 262,2098,1593, 239, 108, 300, 200,1033, 512,1247,2077, # 3046
|
1664, 4301, 467, 415, 7327, 262, 2098, 1593, 239, 108, 300, 200, 1033, 512, 1247, 2077, # 3046
|
||||||
7328,7329,2173,3176,3619,2673, 593, 845,1062,3244, 88,1723,2037,3875,1950, 212, # 3062
|
7328, 7329, 2173, 3176, 3619, 2673, 593, 845, 1062, 3244, 88, 1723, 2037, 3875, 1950, 212, # 3062
|
||||||
266, 152, 149, 468,1898,4066,4302, 77, 187,7330,3018, 37, 5,2972,7331,3876, # 3078
|
266, 152, 149, 468, 1898, 4066, 4302, 77, 187, 7330, 3018, 37, 5, 2972, 7331, 3876, # 3078
|
||||||
7332,7333, 39,2517,4303,2894,3177,2078, 55, 148, 74,4304, 545, 483,1474,1029, # 3094
|
7332, 7333, 39, 2517, 4303, 2894, 3177, 2078, 55, 148, 74, 4304, 545, 483, 1474, 1029, # 3094
|
||||||
1665, 217,1869,1531,3113,1104,2645,4067, 24, 172,3507, 900,3877,3508,3509,4305, # 3110
|
1665, 217, 1869, 1531, 3113, 1104, 2645, 4067, 24, 172, 3507, 900, 3877, 3508, 3509, 4305, # 3110
|
||||||
32,1408,2811,1312, 329, 487,2355,2247,2708, 784,2674, 4,3019,3314,1427,1788, # 3126
|
32, 1408, 2811, 1312, 329, 487, 2355, 2247, 2708, 784, 2674, 4, 3019, 3314, 1427, 1788, # 3126
|
||||||
188, 109, 499,7334,3620,1717,1789, 888,1217,3020,4306,7335,3510,7336,3315,1520, # 3142
|
188, 109, 499, 7334, 3620, 1717, 1789, 888, 1217, 3020, 4306, 7335, 3510, 7336, 3315, 1520, # 3142
|
||||||
3621,3878, 196,1034, 775,7337,7338, 929,1815, 249, 439, 38,7339,1063,7340, 794, # 3158
|
3621, 3878, 196, 1034, 775, 7337, 7338, 929, 1815, 249, 439, 38, 7339, 1063, 7340, 794, # 3158
|
||||||
3879,1435,2296, 46, 178,3245,2065,7341,2376,7342, 214,1709,4307, 804, 35, 707, # 3174
|
3879, 1435, 2296, 46, 178, 3245, 2065, 7341, 2376, 7342, 214, 1709, 4307, 804, 35, 707, # 3174
|
||||||
324,3622,1601,2546, 140, 459,4068,7343,7344,1365, 839, 272, 978,2257,2572,3409, # 3190
|
324, 3622, 1601, 2546, 140, 459, 4068, 7343, 7344, 1365, 839, 272, 978, 2257, 2572, 3409, # 3190
|
||||||
2128,1363,3623,1423, 697, 100,3071, 48, 70,1231, 495,3114,2193,7345,1294,7346, # 3206
|
2128, 1363, 3623, 1423, 697, 100, 3071, 48, 70, 1231, 495, 3114, 2193, 7345, 1294, 7346, # 3206
|
||||||
2079, 462, 586,1042,3246, 853, 256, 988, 185,2377,3410,1698, 434,1084,7347,3411, # 3222
|
2079, 462, 586, 1042, 3246, 853, 256, 988, 185, 2377, 3410, 1698, 434, 1084, 7347, 3411, # 3222
|
||||||
314,2615,2775,4308,2330,2331, 569,2280, 637,1816,2518, 757,1162,1878,1616,3412, # 3238
|
314, 2615, 2775, 4308, 2330, 2331, 569, 2280, 637, 1816, 2518, 757, 1162, 1878, 1616, 3412, # 3238
|
||||||
287,1577,2115, 768,4309,1671,2854,3511,2519,1321,3737, 909,2413,7348,4069, 933, # 3254
|
287, 1577, 2115, 768, 4309, 1671, 2854, 3511, 2519, 1321, 3737, 909, 2413, 7348, 4069, 933, # 3254
|
||||||
3738,7349,2052,2356,1222,4310, 765,2414,1322, 786,4311,7350,1919,1462,1677,2895, # 3270
|
3738, 7349, 2052, 2356, 1222, 4310, 765, 2414, 1322, 786, 4311, 7350, 1919, 1462, 1677, 2895, # 3270
|
||||||
1699,7351,4312,1424,2437,3115,3624,2590,3316,1774,1940,3413,3880,4070, 309,1369, # 3286
|
1699, 7351, 4312, 1424, 2437, 3115, 3624, 2590, 3316, 1774, 1940, 3413, 3880, 4070, 309, 1369, # 3286
|
||||||
1130,2812, 364,2230,1653,1299,3881,3512,3882,3883,2646, 525,1085,3021, 902,2000, # 3302
|
1130, 2812, 364, 2230, 1653, 1299, 3881, 3512, 3882, 3883, 2646, 525, 1085, 3021, 902, 2000, # 3302
|
||||||
1475, 964,4313, 421,1844,1415,1057,2281, 940,1364,3116, 376,4314,4315,1381, 7, # 3318
|
1475, 964, 4313, 421, 1844, 1415, 1057, 2281, 940, 1364, 3116, 376, 4314, 4315, 1381, 7, # 3318
|
||||||
2520, 983,2378, 336,1710,2675,1845, 321,3414, 559,1131,3022,2742,1808,1132,1313, # 3334
|
2520, 983, 2378, 336, 1710, 2675, 1845, 321, 3414, 559, 1131, 3022, 2742, 1808, 1132, 1313, # 3334
|
||||||
265,1481,1857,7352, 352,1203,2813,3247, 167,1089, 420,2814, 776, 792,1724,3513, # 3350
|
265, 1481, 1857, 7352, 352, 1203, 2813, 3247, 167, 1089, 420, 2814, 776, 792, 1724, 3513, # 3350
|
||||||
4071,2438,3248,7353,4072,7354, 446, 229, 333,2743, 901,3739,1200,1557,4316,2647, # 3366
|
4071, 2438, 3248, 7353, 4072, 7354, 446, 229, 333, 2743, 901, 3739, 1200, 1557, 4316, 2647, # 3366
|
||||||
1920, 395,2744,2676,3740,4073,1835, 125, 916,3178,2616,4317,7355,7356,3741,7357, # 3382
|
1920, 395, 2744, 2676, 3740, 4073, 1835, 125, 916, 3178, 2616, 4317, 7355, 7356, 3741, 7357, # 3382
|
||||||
7358,7359,4318,3117,3625,1133,2547,1757,3415,1510,2313,1409,3514,7360,2145, 438, # 3398
|
7358, 7359, 4318, 3117, 3625, 1133, 2547, 1757, 3415, 1510, 2313, 1409, 3514, 7360, 2145, 438, # 3398
|
||||||
2591,2896,2379,3317,1068, 958,3023, 461, 311,2855,2677,4074,1915,3179,4075,1978, # 3414
|
2591, 2896, 2379, 3317, 1068, 958, 3023, 461, 311, 2855, 2677, 4074, 1915, 3179, 4075, 1978, # 3414
|
||||||
383, 750,2745,2617,4076, 274, 539, 385,1278,1442,7361,1154,1964, 384, 561, 210, # 3430
|
383, 750, 2745, 2617, 4076, 274, 539, 385, 1278, 1442, 7361, 1154, 1964, 384, 561, 210, # 3430
|
||||||
98,1295,2548,3515,7362,1711,2415,1482,3416,3884,2897,1257, 129,7363,3742, 642, # 3446
|
98, 1295, 2548, 3515, 7362, 1711, 2415, 1482, 3416, 3884, 2897, 1257, 129, 7363, 3742, 642, # 3446
|
||||||
523,2776,2777,2648,7364, 141,2231,1333, 68, 176, 441, 876, 907,4077, 603,2592, # 3462
|
523, 2776, 2777, 2648, 7364, 141, 2231, 1333, 68, 176, 441, 876, 907, 4077, 603, 2592, # 3462
|
||||||
710, 171,3417, 404, 549, 18,3118,2393,1410,3626,1666,7365,3516,4319,2898,4320, # 3478
|
710, 171, 3417, 404, 549, 18, 3118, 2393, 1410, 3626, 1666, 7365, 3516, 4319, 2898, 4320, # 3478
|
||||||
7366,2973, 368,7367, 146, 366, 99, 871,3627,1543, 748, 807,1586,1185, 22,2258, # 3494
|
7366, 2973, 368, 7367, 146, 366, 99, 871, 3627, 1543, 748, 807, 1586, 1185, 22, 2258, # 3494
|
||||||
379,3743,3180,7368,3181, 505,1941,2618,1991,1382,2314,7369, 380,2357, 218, 702, # 3510
|
379, 3743, 3180, 7368, 3181, 505, 1941, 2618, 1991, 1382, 2314, 7369, 380, 2357, 218, 702, # 3510
|
||||||
1817,1248,3418,3024,3517,3318,3249,7370,2974,3628, 930,3250,3744,7371, 59,7372, # 3526
|
1817, 1248, 3418, 3024, 3517, 3318, 3249, 7370, 2974, 3628, 930, 3250, 3744, 7371, 59, 7372, # 3526
|
||||||
585, 601,4078, 497,3419,1112,1314,4321,1801,7373,1223,1472,2174,7374, 749,1836, # 3542
|
585, 601, 4078, 497, 3419, 1112, 1314, 4321, 1801, 7373, 1223, 1472, 2174, 7374, 749, 1836, # 3542
|
||||||
690,1899,3745,1772,3885,1476, 429,1043,1790,2232,2116, 917,4079, 447,1086,1629, # 3558
|
690, 1899, 3745, 1772, 3885, 1476, 429, 1043, 1790, 2232, 2116, 917, 4079, 447, 1086, 1629, # 3558
|
||||||
7375, 556,7376,7377,2020,1654, 844,1090, 105, 550, 966,1758,2815,1008,1782, 686, # 3574
|
7375, 556, 7376, 7377, 2020, 1654, 844, 1090, 105, 550, 966, 1758, 2815, 1008, 1782, 686, # 3574
|
||||||
1095,7378,2282, 793,1602,7379,3518,2593,4322,4080,2933,2297,4323,3746, 980,2496, # 3590
|
1095, 7378, 2282, 793, 1602, 7379, 3518, 2593, 4322, 4080, 2933, 2297, 4323, 3746, 980, 2496, # 3590
|
||||||
544, 353, 527,4324, 908,2678,2899,7380, 381,2619,1942,1348,7381,1341,1252, 560, # 3606
|
544, 353, 527, 4324, 908, 2678, 2899, 7380, 381, 2619, 1942, 1348, 7381, 1341, 1252, 560, # 3606
|
||||||
3072,7382,3420,2856,7383,2053, 973, 886,2080, 143,4325,7384,7385, 157,3886, 496, # 3622
|
3072, 7382, 3420, 2856, 7383, 2053, 973, 886, 2080, 143, 4325, 7384, 7385, 157, 3886, 496, # 3622
|
||||||
4081, 57, 840, 540,2038,4326,4327,3421,2117,1445, 970,2259,1748,1965,2081,4082, # 3638
|
4081, 57, 840, 540, 2038, 4326, 4327, 3421, 2117, 1445, 970, 2259, 1748, 1965, 2081, 4082, # 3638
|
||||||
3119,1234,1775,3251,2816,3629, 773,1206,2129,1066,2039,1326,3887,1738,1725,4083, # 3654
|
3119, 1234, 1775, 3251, 2816, 3629, 773, 1206, 2129, 1066, 2039, 1326, 3887, 1738, 1725, 4083, # 3654
|
||||||
279,3120, 51,1544,2594, 423,1578,2130,2066, 173,4328,1879,7386,7387,1583, 264, # 3670
|
279, 3120, 51, 1544, 2594, 423, 1578, 2130, 2066, 173, 4328, 1879, 7386, 7387, 1583, 264, # 3670
|
||||||
610,3630,4329,2439, 280, 154,7388,7389,7390,1739, 338,1282,3073, 693,2857,1411, # 3686
|
610, 3630, 4329, 2439, 280, 154, 7388, 7389, 7390, 1739, 338, 1282, 3073, 693, 2857, 1411, # 3686
|
||||||
1074,3747,2440,7391,4330,7392,7393,1240, 952,2394,7394,2900,1538,2679, 685,1483, # 3702
|
1074, 3747, 2440, 7391, 4330, 7392, 7393, 1240, 952, 2394, 7394, 2900, 1538, 2679, 685, 1483, # 3702
|
||||||
4084,2468,1436, 953,4085,2054,4331, 671,2395, 79,4086,2441,3252, 608, 567,2680, # 3718
|
4084, 2468, 1436, 953, 4085, 2054, 4331, 671, 2395, 79, 4086, 2441, 3252, 608, 567, 2680, # 3718
|
||||||
3422,4087,4088,1691, 393,1261,1791,2396,7395,4332,7396,7397,7398,7399,1383,1672, # 3734
|
3422, 4087, 4088, 1691, 393, 1261, 1791, 2396, 7395, 4332, 7396, 7397, 7398, 7399, 1383, 1672, # 3734
|
||||||
3748,3182,1464, 522,1119, 661,1150, 216, 675,4333,3888,1432,3519, 609,4334,2681, # 3750
|
3748, 3182, 1464, 522, 1119, 661, 1150, 216, 675, 4333, 3888, 1432, 3519, 609, 4334, 2681, # 3750
|
||||||
2397,7400,7401,7402,4089,3025, 0,7403,2469, 315, 231,2442, 301,3319,4335,2380, # 3766
|
2397, 7400, 7401, 7402, 4089, 3025, 0, 7403, 2469, 315, 231, 2442, 301, 3319, 4335, 2380, # 3766
|
||||||
7404, 233,4090,3631,1818,4336,4337,7405, 96,1776,1315,2082,7406, 257,7407,1809, # 3782
|
7404, 233, 4090, 3631, 1818, 4336, 4337, 7405, 96, 1776, 1315, 2082, 7406, 257, 7407, 1809, # 3782
|
||||||
3632,2709,1139,1819,4091,2021,1124,2163,2778,1777,2649,7408,3074, 363,1655,3183, # 3798
|
3632, 2709, 1139, 1819, 4091, 2021, 1124, 2163, 2778, 1777, 2649, 7408, 3074, 363, 1655, 3183, # 3798
|
||||||
7409,2975,7410,7411,7412,3889,1567,3890, 718, 103,3184, 849,1443, 341,3320,2934, # 3814
|
7409, 2975, 7410, 7411, 7412, 3889, 1567, 3890, 718, 103, 3184, 849, 1443, 341, 3320, 2934, # 3814
|
||||||
1484,7413,1712, 127, 67, 339,4092,2398, 679,1412, 821,7414,7415, 834, 738, 351, # 3830
|
1484, 7413, 1712, 127, 67, 339, 4092, 2398, 679, 1412, 821, 7414, 7415, 834, 738, 351, # 3830
|
||||||
2976,2146, 846, 235,1497,1880, 418,1992,3749,2710, 186,1100,2147,2746,3520,1545, # 3846
|
2976, 2146, 846, 235, 1497, 1880, 418, 1992, 3749, 2710, 186, 1100, 2147, 2746, 3520, 1545, # 3846
|
||||||
1355,2935,2858,1377, 583,3891,4093,2573,2977,7416,1298,3633,1078,2549,3634,2358, # 3862
|
1355, 2935, 2858, 1377, 583, 3891, 4093, 2573, 2977, 7416, 1298, 3633, 1078, 2549, 3634, 2358, # 3862
|
||||||
78,3750,3751, 267,1289,2099,2001,1594,4094, 348, 369,1274,2194,2175,1837,4338, # 3878
|
78, 3750, 3751, 267, 1289, 2099, 2001, 1594, 4094, 348, 369, 1274, 2194, 2175, 1837, 4338, # 3878
|
||||||
1820,2817,3635,2747,2283,2002,4339,2936,2748, 144,3321, 882,4340,3892,2749,3423, # 3894
|
1820, 2817, 3635, 2747, 2283, 2002, 4339, 2936, 2748, 144, 3321, 882, 4340, 3892, 2749, 3423, # 3894
|
||||||
4341,2901,7417,4095,1726, 320,7418,3893,3026, 788,2978,7419,2818,1773,1327,2859, # 3910
|
4341, 2901, 7417, 4095, 1726, 320, 7418, 3893, 3026, 788, 2978, 7419, 2818, 1773, 1327, 2859, # 3910
|
||||||
3894,2819,7420,1306,4342,2003,1700,3752,3521,2359,2650, 787,2022, 506, 824,3636, # 3926
|
3894, 2819, 7420, 1306, 4342, 2003, 1700, 3752, 3521, 2359, 2650, 787, 2022, 506, 824, 3636, # 3926
|
||||||
534, 323,4343,1044,3322,2023,1900, 946,3424,7421,1778,1500,1678,7422,1881,4344, # 3942
|
534, 323, 4343, 1044, 3322, 2023, 1900, 946, 3424, 7421, 1778, 1500, 1678, 7422, 1881, 4344, # 3942
|
||||||
165, 243,4345,3637,2521, 123, 683,4096, 764,4346, 36,3895,1792, 589,2902, 816, # 3958
|
165, 243, 4345, 3637, 2521, 123, 683, 4096, 764, 4346, 36, 3895, 1792, 589, 2902, 816, # 3958
|
||||||
626,1667,3027,2233,1639,1555,1622,3753,3896,7423,3897,2860,1370,1228,1932, 891, # 3974
|
626, 1667, 3027, 2233, 1639, 1555, 1622, 3753, 3896, 7423, 3897, 2860, 1370, 1228, 1932, 891, # 3974
|
||||||
2083,2903, 304,4097,7424, 292,2979,2711,3522, 691,2100,4098,1115,4347, 118, 662, # 3990
|
2083, 2903, 304, 4097, 7424, 292, 2979, 2711, 3522, 691, 2100, 4098, 1115, 4347, 118, 662, # 3990
|
||||||
7425, 611,1156, 854,2381,1316,2861, 2, 386, 515,2904,7426,7427,3253, 868,2234, # 4006
|
7425, 611, 1156, 854, 2381, 1316, 2861, 2, 386, 515, 2904, 7426, 7427, 3253, 868, 2234, # 4006
|
||||||
1486, 855,2651, 785,2212,3028,7428,1040,3185,3523,7429,3121, 448,7430,1525,7431, # 4022
|
1486, 855, 2651, 785, 2212, 3028, 7428, 1040, 3185, 3523, 7429, 3121, 448, 7430, 1525, 7431, # 4022
|
||||||
2164,4348,7432,3754,7433,4099,2820,3524,3122, 503, 818,3898,3123,1568, 814, 676, # 4038
|
2164, 4348, 7432, 3754, 7433, 4099, 2820, 3524, 3122, 503, 818, 3898, 3123, 1568, 814, 676, # 4038
|
||||||
1444, 306,1749,7434,3755,1416,1030, 197,1428, 805,2821,1501,4349,7435,7436,7437, # 4054
|
1444, 306, 1749, 7434, 3755, 1416, 1030, 197, 1428, 805, 2821, 1501, 4349, 7435, 7436, 7437, # 4054
|
||||||
1993,7438,4350,7439,7440,2195, 13,2779,3638,2980,3124,1229,1916,7441,3756,2131, # 4070
|
1993, 7438, 4350, 7439, 7440, 2195, 13, 2779, 3638, 2980, 3124, 1229, 1916, 7441, 3756, 2131, # 4070
|
||||||
7442,4100,4351,2399,3525,7443,2213,1511,1727,1120,7444,7445, 646,3757,2443, 307, # 4086
|
7442, 4100, 4351, 2399, 3525, 7443, 2213, 1511, 1727, 1120, 7444, 7445, 646, 3757, 2443, 307, # 4086
|
||||||
7446,7447,1595,3186,7448,7449,7450,3639,1113,1356,3899,1465,2522,2523,7451, 519, # 4102
|
7446, 7447, 1595, 3186, 7448, 7449, 7450, 3639, 1113, 1356, 3899, 1465, 2522, 2523, 7451, 519, # 4102
|
||||||
7452, 128,2132, 92,2284,1979,7453,3900,1512, 342,3125,2196,7454,2780,2214,1980, # 4118
|
7452, 128, 2132, 92, 2284, 1979, 7453, 3900, 1512, 342, 3125, 2196, 7454, 2780, 2214, 1980, # 4118
|
||||||
3323,7455, 290,1656,1317, 789, 827,2360,7456,3758,4352, 562, 581,3901,7457, 401, # 4134
|
3323, 7455, 290, 1656, 1317, 789, 827, 2360, 7456, 3758, 4352, 562, 581, 3901, 7457, 401, # 4134
|
||||||
4353,2248, 94,4354,1399,2781,7458,1463,2024,4355,3187,1943,7459, 828,1105,4101, # 4150
|
4353, 2248, 94, 4354, 1399, 2781, 7458, 1463, 2024, 4355, 3187, 1943, 7459, 828, 1105, 4101, # 4150
|
||||||
1262,1394,7460,4102, 605,4356,7461,1783,2862,7462,2822, 819,2101, 578,2197,2937, # 4166
|
1262, 1394, 7460, 4102, 605, 4356, 7461, 1783, 2862, 7462, 2822, 819, 2101, 578, 2197, 2937, # 4166
|
||||||
7463,1502, 436,3254,4103,3255,2823,3902,2905,3425,3426,7464,2712,2315,7465,7466, # 4182
|
7463, 1502, 436, 3254, 4103, 3255, 2823, 3902, 2905, 3425, 3426, 7464, 2712, 2315, 7465, 7466, # 4182
|
||||||
2332,2067, 23,4357, 193, 826,3759,2102, 699,1630,4104,3075, 390,1793,1064,3526, # 4198
|
2332, 2067, 23, 4357, 193, 826, 3759, 2102, 699, 1630, 4104, 3075, 390, 1793, 1064, 3526, # 4198
|
||||||
7467,1579,3076,3077,1400,7468,4105,1838,1640,2863,7469,4358,4359, 137,4106, 598, # 4214
|
7467, 1579, 3076, 3077, 1400, 7468, 4105, 1838, 1640, 2863, 7469, 4358, 4359, 137, 4106, 598, # 4214
|
||||||
3078,1966, 780, 104, 974,2938,7470, 278, 899, 253, 402, 572, 504, 493,1339,7471, # 4230
|
3078, 1966, 780, 104, 974, 2938, 7470, 278, 899, 253, 402, 572, 504, 493, 1339, 7471, # 4230
|
||||||
3903,1275,4360,2574,2550,7472,3640,3029,3079,2249, 565,1334,2713, 863, 41,7473, # 4246
|
3903, 1275, 4360, 2574, 2550, 7472, 3640, 3029, 3079, 2249, 565, 1334, 2713, 863, 41, 7473, # 4246
|
||||||
7474,4361,7475,1657,2333, 19, 463,2750,4107, 606,7476,2981,3256,1087,2084,1323, # 4262
|
7474, 4361, 7475, 1657, 2333, 19, 463, 2750, 4107, 606, 7476, 2981, 3256, 1087, 2084, 1323, # 4262
|
||||||
2652,2982,7477,1631,1623,1750,4108,2682,7478,2864, 791,2714,2653,2334, 232,2416, # 4278
|
2652, 2982, 7477, 1631, 1623, 1750, 4108, 2682, 7478, 2864, 791, 2714, 2653, 2334, 232, 2416, # 4278
|
||||||
7479,2983,1498,7480,2654,2620, 755,1366,3641,3257,3126,2025,1609, 119,1917,3427, # 4294
|
7479, 2983, 1498, 7480, 2654, 2620, 755, 1366, 3641, 3257, 3126, 2025, 1609, 119, 1917, 3427, # 4294
|
||||||
862,1026,4109,7481,3904,3760,4362,3905,4363,2260,1951,2470,7482,1125, 817,4110, # 4310
|
862, 1026, 4109, 7481, 3904, 3760, 4362, 3905, 4363, 2260, 1951, 2470, 7482, 1125, 817, 4110, # 4310
|
||||||
4111,3906,1513,1766,2040,1487,4112,3030,3258,2824,3761,3127,7483,7484,1507,7485, # 4326
|
4111, 3906, 1513, 1766, 2040, 1487, 4112, 3030, 3258, 2824, 3761, 3127, 7483, 7484, 1507, 7485, # 4326
|
||||||
2683, 733, 40,1632,1106,2865, 345,4113, 841,2524, 230,4364,2984,1846,3259,3428, # 4342
|
2683, 733, 40, 1632, 1106, 2865, 345, 4113, 841, 2524, 230, 4364, 2984, 1846, 3259, 3428, # 4342
|
||||||
7486,1263, 986,3429,7487, 735, 879, 254,1137, 857, 622,1300,1180,1388,1562,3907, # 4358
|
7486, 1263, 986, 3429, 7487, 735, 879, 254, 1137, 857, 622, 1300, 1180, 1388, 1562, 3907, # 4358
|
||||||
3908,2939, 967,2751,2655,1349, 592,2133,1692,3324,2985,1994,4114,1679,3909,1901, # 4374
|
3908, 2939, 967, 2751, 2655, 1349, 592, 2133, 1692, 3324, 2985, 1994, 4114, 1679, 3909, 1901, # 4374
|
||||||
2185,7488, 739,3642,2715,1296,1290,7489,4115,2198,2199,1921,1563,2595,2551,1870, # 4390
|
2185, 7488, 739, 3642, 2715, 1296, 1290, 7489, 4115, 2198, 2199, 1921, 1563, 2595, 2551, 1870, # 4390
|
||||||
2752,2986,7490, 435,7491, 343,1108, 596, 17,1751,4365,2235,3430,3643,7492,4366, # 4406
|
2752, 2986, 7490, 435, 7491, 343, 1108, 596, 17, 1751, 4365, 2235, 3430, 3643, 7492, 4366, # 4406
|
||||||
294,3527,2940,1693, 477, 979, 281,2041,3528, 643,2042,3644,2621,2782,2261,1031, # 4422
|
294, 3527, 2940, 1693, 477, 979, 281, 2041, 3528, 643, 2042, 3644, 2621, 2782, 2261, 1031, # 4422
|
||||||
2335,2134,2298,3529,4367, 367,1249,2552,7493,3530,7494,4368,1283,3325,2004, 240, # 4438
|
2335, 2134, 2298, 3529, 4367, 367, 1249, 2552, 7493, 3530, 7494, 4368, 1283, 3325, 2004, 240, # 4438
|
||||||
1762,3326,4369,4370, 836,1069,3128, 474,7495,2148,2525, 268,3531,7496,3188,1521, # 4454
|
1762, 3326, 4369, 4370, 836, 1069, 3128, 474, 7495, 2148, 2525, 268, 3531, 7496, 3188, 1521, # 4454
|
||||||
1284,7497,1658,1546,4116,7498,3532,3533,7499,4117,3327,2684,1685,4118, 961,1673, # 4470
|
1284, 7497, 1658, 1546, 4116, 7498, 3532, 3533, 7499, 4117, 3327, 2684, 1685, 4118, 961, 1673, # 4470
|
||||||
2622, 190,2005,2200,3762,4371,4372,7500, 570,2497,3645,1490,7501,4373,2623,3260, # 4486
|
2622, 190, 2005, 2200, 3762, 4371, 4372, 7500, 570, 2497, 3645, 1490, 7501, 4373, 2623, 3260, # 4486
|
||||||
1956,4374, 584,1514, 396,1045,1944,7502,4375,1967,2444,7503,7504,4376,3910, 619, # 4502
|
1956, 4374, 584, 1514, 396, 1045, 1944, 7502, 4375, 1967, 2444, 7503, 7504, 4376, 3910, 619, # 4502
|
||||||
7505,3129,3261, 215,2006,2783,2553,3189,4377,3190,4378, 763,4119,3763,4379,7506, # 4518
|
7505, 3129, 3261, 215, 2006, 2783, 2553, 3189, 4377, 3190, 4378, 763, 4119, 3763, 4379, 7506, # 4518
|
||||||
7507,1957,1767,2941,3328,3646,1174, 452,1477,4380,3329,3130,7508,2825,1253,2382, # 4534
|
7507, 1957, 1767, 2941, 3328, 3646, 1174, 452, 1477, 4380, 3329, 3130, 7508, 2825, 1253, 2382, # 4534
|
||||||
2186,1091,2285,4120, 492,7509, 638,1169,1824,2135,1752,3911, 648, 926,1021,1324, # 4550
|
2186, 1091, 2285, 4120, 492, 7509, 638, 1169, 1824, 2135, 1752, 3911, 648, 926, 1021, 1324, # 4550
|
||||||
4381, 520,4382, 997, 847,1007, 892,4383,3764,2262,1871,3647,7510,2400,1784,4384, # 4566
|
4381, 520, 4382, 997, 847, 1007, 892, 4383, 3764, 2262, 1871, 3647, 7510, 2400, 1784, 4384, # 4566
|
||||||
1952,2942,3080,3191,1728,4121,2043,3648,4385,2007,1701,3131,1551, 30,2263,4122, # 4582
|
1952, 2942, 3080, 3191, 1728, 4121, 2043, 3648, 4385, 2007, 1701, 3131, 1551, 30, 2263, 4122, # 4582
|
||||||
7511,2026,4386,3534,7512, 501,7513,4123, 594,3431,2165,1821,3535,3432,3536,3192, # 4598
|
7511, 2026, 4386, 3534, 7512, 501, 7513, 4123, 594, 3431, 2165, 1821, 3535, 3432, 3536, 3192, # 4598
|
||||||
829,2826,4124,7514,1680,3132,1225,4125,7515,3262,4387,4126,3133,2336,7516,4388, # 4614
|
829, 2826, 4124, 7514, 1680, 3132, 1225, 4125, 7515, 3262, 4387, 4126, 3133, 2336, 7516, 4388, # 4614
|
||||||
4127,7517,3912,3913,7518,1847,2383,2596,3330,7519,4389, 374,3914, 652,4128,4129, # 4630
|
4127, 7517, 3912, 3913, 7518, 1847, 2383, 2596, 3330, 7519, 4389, 374, 3914, 652, 4128, 4129, # 4630
|
||||||
375,1140, 798,7520,7521,7522,2361,4390,2264, 546,1659, 138,3031,2445,4391,7523, # 4646
|
375, 1140, 798, 7520, 7521, 7522, 2361, 4390, 2264, 546, 1659, 138, 3031, 2445, 4391, 7523, # 4646
|
||||||
2250, 612,1848, 910, 796,3765,1740,1371, 825,3766,3767,7524,2906,2554,7525, 692, # 4662
|
2250, 612, 1848, 910, 796, 3765, 1740, 1371, 825, 3766, 3767, 7524, 2906, 2554, 7525, 692, # 4662
|
||||||
444,3032,2624, 801,4392,4130,7526,1491, 244,1053,3033,4131,4132, 340,7527,3915, # 4678
|
444, 3032, 2624, 801, 4392, 4130, 7526, 1491, 244, 1053, 3033, 4131, 4132, 340, 7527, 3915, # 4678
|
||||||
1041,2987, 293,1168, 87,1357,7528,1539, 959,7529,2236, 721, 694,4133,3768, 219, # 4694
|
1041, 2987, 293, 1168, 87, 1357, 7528, 1539, 959, 7529, 2236, 721, 694, 4133, 3768, 219, # 4694
|
||||||
1478, 644,1417,3331,2656,1413,1401,1335,1389,3916,7530,7531,2988,2362,3134,1825, # 4710
|
1478, 644, 1417, 3331, 2656, 1413, 1401, 1335, 1389, 3916, 7530, 7531, 2988, 2362, 3134, 1825, # 4710
|
||||||
730,1515, 184,2827, 66,4393,7532,1660,2943, 246,3332, 378,1457, 226,3433, 975, # 4726
|
730, 1515, 184, 2827, 66, 4393, 7532, 1660, 2943, 246, 3332, 378, 1457, 226, 3433, 975, # 4726
|
||||||
3917,2944,1264,3537, 674, 696,7533, 163,7534,1141,2417,2166, 713,3538,3333,4394, # 4742
|
3917, 2944, 1264, 3537, 674, 696, 7533, 163, 7534, 1141, 2417, 2166, 713, 3538, 3333, 4394, # 4742
|
||||||
3918,7535,7536,1186, 15,7537,1079,1070,7538,1522,3193,3539, 276,1050,2716, 758, # 4758
|
3918, 7535, 7536, 1186, 15, 7537, 1079, 1070, 7538, 1522, 3193, 3539, 276, 1050, 2716, 758, # 4758
|
||||||
1126, 653,2945,3263,7539,2337, 889,3540,3919,3081,2989, 903,1250,4395,3920,3434, # 4774
|
1126, 653, 2945, 3263, 7539, 2337, 889, 3540, 3919, 3081, 2989, 903, 1250, 4395, 3920, 3434, # 4774
|
||||||
3541,1342,1681,1718, 766,3264, 286, 89,2946,3649,7540,1713,7541,2597,3334,2990, # 4790
|
3541, 1342, 1681, 1718, 766, 3264, 286, 89, 2946, 3649, 7540, 1713, 7541, 2597, 3334, 2990, # 4790
|
||||||
7542,2947,2215,3194,2866,7543,4396,2498,2526, 181, 387,1075,3921, 731,2187,3335, # 4806
|
7542, 2947, 2215, 3194, 2866, 7543, 4396, 2498, 2526, 181, 387, 1075, 3921, 731, 2187, 3335, # 4806
|
||||||
7544,3265, 310, 313,3435,2299, 770,4134, 54,3034, 189,4397,3082,3769,3922,7545, # 4822
|
7544, 3265, 310, 313, 3435, 2299, 770, 4134, 54, 3034, 189, 4397, 3082, 3769, 3922, 7545, # 4822
|
||||||
1230,1617,1849, 355,3542,4135,4398,3336, 111,4136,3650,1350,3135,3436,3035,4137, # 4838
|
1230, 1617, 1849, 355, 3542, 4135, 4398, 3336, 111, 4136, 3650, 1350, 3135, 3436, 3035, 4137, # 4838
|
||||||
2149,3266,3543,7546,2784,3923,3924,2991, 722,2008,7547,1071, 247,1207,2338,2471, # 4854
|
2149, 3266, 3543, 7546, 2784, 3923, 3924, 2991, 722, 2008, 7547, 1071, 247, 1207, 2338, 2471, # 4854
|
||||||
1378,4399,2009, 864,1437,1214,4400, 373,3770,1142,2216, 667,4401, 442,2753,2555, # 4870
|
1378, 4399, 2009, 864, 1437, 1214, 4400, 373, 3770, 1142, 2216, 667, 4401, 442, 2753, 2555, # 4870
|
||||||
3771,3925,1968,4138,3267,1839, 837, 170,1107, 934,1336,1882,7548,7549,2118,4139, # 4886
|
3771, 3925, 1968, 4138, 3267, 1839, 837, 170, 1107, 934, 1336, 1882, 7548, 7549, 2118, 4139, # 4886
|
||||||
2828, 743,1569,7550,4402,4140, 582,2384,1418,3437,7551,1802,7552, 357,1395,1729, # 4902
|
2828, 743, 1569, 7550, 4402, 4140, 582, 2384, 1418, 3437, 7551, 1802, 7552, 357, 1395, 1729, # 4902
|
||||||
3651,3268,2418,1564,2237,7553,3083,3772,1633,4403,1114,2085,4141,1532,7554, 482, # 4918
|
3651, 3268, 2418, 1564, 2237, 7553, 3083, 3772, 1633, 4403, 1114, 2085, 4141, 1532, 7554, 482, # 4918
|
||||||
2446,4404,7555,7556,1492, 833,1466,7557,2717,3544,1641,2829,7558,1526,1272,3652, # 4934
|
2446, 4404, 7555, 7556, 1492, 833, 1466, 7557, 2717, 3544, 1641, 2829, 7558, 1526, 1272, 3652, # 4934
|
||||||
4142,1686,1794, 416,2556,1902,1953,1803,7559,3773,2785,3774,1159,2316,7560,2867, # 4950
|
4142, 1686, 1794, 416, 2556, 1902, 1953, 1803, 7559, 3773, 2785, 3774, 1159, 2316, 7560, 2867, # 4950
|
||||||
4405,1610,1584,3036,2419,2754, 443,3269,1163,3136,7561,7562,3926,7563,4143,2499, # 4966
|
4405, 1610, 1584, 3036, 2419, 2754, 443, 3269, 1163, 3136, 7561, 7562, 3926, 7563, 4143, 2499, # 4966
|
||||||
3037,4406,3927,3137,2103,1647,3545,2010,1872,4144,7564,4145, 431,3438,7565, 250, # 4982
|
3037, 4406, 3927, 3137, 2103, 1647, 3545, 2010, 1872, 4144, 7564, 4145, 431, 3438, 7565, 250, # 4982
|
||||||
97, 81,4146,7566,1648,1850,1558, 160, 848,7567, 866, 740,1694,7568,2201,2830, # 4998
|
97, 81, 4146, 7566, 1648, 1850, 1558, 160, 848, 7567, 866, 740, 1694, 7568, 2201, 2830, # 4998
|
||||||
3195,4147,4407,3653,1687, 950,2472, 426, 469,3196,3654,3655,3928,7569,7570,1188, # 5014
|
3195, 4147, 4407, 3653, 1687, 950, 2472, 426, 469, 3196, 3654, 3655, 3928, 7569, 7570, 1188, # 5014
|
||||||
424,1995, 861,3546,4148,3775,2202,2685, 168,1235,3547,4149,7571,2086,1674,4408, # 5030
|
424, 1995, 861, 3546, 4148, 3775, 2202, 2685, 168, 1235, 3547, 4149, 7571, 2086, 1674, 4408, # 5030
|
||||||
3337,3270, 220,2557,1009,7572,3776, 670,2992, 332,1208, 717,7573,7574,3548,2447, # 5046
|
3337, 3270, 220, 2557, 1009, 7572, 3776, 670, 2992, 332, 1208, 717, 7573, 7574, 3548, 2447, # 5046
|
||||||
3929,3338,7575, 513,7576,1209,2868,3339,3138,4409,1080,7577,7578,7579,7580,2527, # 5062
|
3929, 3338, 7575, 513, 7576, 1209, 2868, 3339, 3138, 4409, 1080, 7577, 7578, 7579, 7580, 2527, # 5062
|
||||||
3656,3549, 815,1587,3930,3931,7581,3550,3439,3777,1254,4410,1328,3038,1390,3932, # 5078
|
3656, 3549, 815, 1587, 3930, 3931, 7581, 3550, 3439, 3777, 1254, 4410, 1328, 3038, 1390, 3932, # 5078
|
||||||
1741,3933,3778,3934,7582, 236,3779,2448,3271,7583,7584,3657,3780,1273,3781,4411, # 5094
|
1741, 3933, 3778, 3934, 7582, 236, 3779, 2448, 3271, 7583, 7584, 3657, 3780, 1273, 3781, 4411, # 5094
|
||||||
7585, 308,7586,4412, 245,4413,1851,2473,1307,2575, 430, 715,2136,2449,7587, 270, # 5110
|
7585, 308, 7586, 4412, 245, 4413, 1851, 2473, 1307, 2575, 430, 715, 2136, 2449, 7587, 270, # 5110
|
||||||
199,2869,3935,7588,3551,2718,1753, 761,1754, 725,1661,1840,4414,3440,3658,7589, # 5126
|
199, 2869, 3935, 7588, 3551, 2718, 1753, 761, 1754, 725, 1661, 1840, 4414, 3440, 3658, 7589, # 5126
|
||||||
7590, 587, 14,3272, 227,2598, 326, 480,2265, 943,2755,3552, 291, 650,1883,7591, # 5142
|
7590, 587, 14, 3272, 227, 2598, 326, 480, 2265, 943, 2755, 3552, 291, 650, 1883, 7591, # 5142
|
||||||
1702,1226, 102,1547, 62,3441, 904,4415,3442,1164,4150,7592,7593,1224,1548,2756, # 5158
|
1702, 1226, 102, 1547, 62, 3441, 904, 4415, 3442, 1164, 4150, 7592, 7593, 1224, 1548, 2756, # 5158
|
||||||
391, 498,1493,7594,1386,1419,7595,2055,1177,4416, 813, 880,1081,2363, 566,1145, # 5174
|
391, 498, 1493, 7594, 1386, 1419, 7595, 2055, 1177, 4416, 813, 880, 1081, 2363, 566, 1145, # 5174
|
||||||
4417,2286,1001,1035,2558,2599,2238, 394,1286,7596,7597,2068,7598, 86,1494,1730, # 5190
|
4417, 2286, 1001, 1035, 2558, 2599, 2238, 394, 1286, 7596, 7597, 2068, 7598, 86, 1494, 1730, # 5190
|
||||||
3936, 491,1588, 745, 897,2948, 843,3340,3937,2757,2870,3273,1768, 998,2217,2069, # 5206
|
3936, 491, 1588, 745, 897, 2948, 843, 3340, 3937, 2757, 2870, 3273, 1768, 998, 2217, 2069, # 5206
|
||||||
397,1826,1195,1969,3659,2993,3341, 284,7599,3782,2500,2137,2119,1903,7600,3938, # 5222
|
397, 1826, 1195, 1969, 3659, 2993, 3341, 284, 7599, 3782, 2500, 2137, 2119, 1903, 7600, 3938, # 5222
|
||||||
2150,3939,4151,1036,3443,1904, 114,2559,4152, 209,1527,7601,7602,2949,2831,2625, # 5238
|
2150, 3939, 4151, 1036, 3443, 1904, 114, 2559, 4152, 209, 1527, 7601, 7602, 2949, 2831, 2625, # 5238
|
||||||
2385,2719,3139, 812,2560,7603,3274,7604,1559, 737,1884,3660,1210, 885, 28,2686, # 5254
|
2385, 2719, 3139, 812, 2560, 7603, 3274, 7604, 1559, 737, 1884, 3660, 1210, 885, 28, 2686, # 5254
|
||||||
3553,3783,7605,4153,1004,1779,4418,7606, 346,1981,2218,2687,4419,3784,1742, 797, # 5270
|
3553, 3783, 7605, 4153, 1004, 1779, 4418, 7606, 346, 1981, 2218, 2687, 4419, 3784, 1742, 797, # 5270
|
||||||
1642,3940,1933,1072,1384,2151, 896,3941,3275,3661,3197,2871,3554,7607,2561,1958, # 5286
|
1642, 3940, 1933, 1072, 1384, 2151, 896, 3941, 3275, 3661, 3197, 2871, 3554, 7607, 2561, 1958, # 5286
|
||||||
4420,2450,1785,7608,7609,7610,3942,4154,1005,1308,3662,4155,2720,4421,4422,1528, # 5302
|
4420, 2450, 1785, 7608, 7609, 7610, 3942, 4154, 1005, 1308, 3662, 4155, 2720, 4421, 4422, 1528, # 5302
|
||||||
2600, 161,1178,4156,1982, 987,4423,1101,4157, 631,3943,1157,3198,2420,1343,1241, # 5318
|
2600, 161, 1178, 4156, 1982, 987, 4423, 1101, 4157, 631, 3943, 1157, 3198, 2420, 1343, 1241, # 5318
|
||||||
1016,2239,2562, 372, 877,2339,2501,1160, 555,1934, 911,3944,7611, 466,1170, 169, # 5334
|
1016, 2239, 2562, 372, 877, 2339, 2501, 1160, 555, 1934, 911, 3944, 7611, 466, 1170, 169, # 5334
|
||||||
1051,2907,2688,3663,2474,2994,1182,2011,2563,1251,2626,7612, 992,2340,3444,1540, # 5350
|
1051, 2907, 2688, 3663, 2474, 2994, 1182, 2011, 2563, 1251, 2626, 7612, 992, 2340, 3444, 1540, # 5350
|
||||||
2721,1201,2070,2401,1996,2475,7613,4424, 528,1922,2188,1503,1873,1570,2364,3342, # 5366
|
2721, 1201, 2070, 2401, 1996, 2475, 7613, 4424, 528, 1922, 2188, 1503, 1873, 1570, 2364, 3342, # 5366
|
||||||
3276,7614, 557,1073,7615,1827,3445,2087,2266,3140,3039,3084, 767,3085,2786,4425, # 5382
|
3276, 7614, 557, 1073, 7615, 1827, 3445, 2087, 2266, 3140, 3039, 3084, 767, 3085, 2786, 4425, # 5382
|
||||||
1006,4158,4426,2341,1267,2176,3664,3199, 778,3945,3200,2722,1597,2657,7616,4427, # 5398
|
1006, 4158, 4426, 2341, 1267, 2176, 3664, 3199, 778, 3945, 3200, 2722, 1597, 2657, 7616, 4427, # 5398
|
||||||
7617,3446,7618,7619,7620,3277,2689,1433,3278, 131, 95,1504,3946, 723,4159,3141, # 5414
|
7617, 3446, 7618, 7619, 7620, 3277, 2689, 1433, 3278, 131, 95, 1504, 3946, 723, 4159, 3141, # 5414
|
||||||
1841,3555,2758,2189,3947,2027,2104,3665,7621,2995,3948,1218,7622,3343,3201,3949, # 5430
|
1841, 3555, 2758, 2189, 3947, 2027, 2104, 3665, 7621, 2995, 3948, 1218, 7622, 3343, 3201, 3949, # 5430
|
||||||
4160,2576, 248,1634,3785, 912,7623,2832,3666,3040,3786, 654, 53,7624,2996,7625, # 5446
|
4160, 2576, 248, 1634, 3785, 912, 7623, 2832, 3666, 3040, 3786, 654, 53, 7624, 2996, 7625, # 5446
|
||||||
1688,4428, 777,3447,1032,3950,1425,7626, 191, 820,2120,2833, 971,4429, 931,3202, # 5462
|
1688, 4428, 777, 3447, 1032, 3950, 1425, 7626, 191, 820, 2120, 2833, 971, 4429, 931, 3202, # 5462
|
||||||
135, 664, 783,3787,1997, 772,2908,1935,3951,3788,4430,2909,3203, 282,2723, 640, # 5478
|
135, 664, 783, 3787, 1997, 772, 2908, 1935, 3951, 3788, 4430, 2909, 3203, 282, 2723, 640, # 5478
|
||||||
1372,3448,1127, 922, 325,3344,7627,7628, 711,2044,7629,7630,3952,2219,2787,1936, # 5494
|
1372, 3448, 1127, 922, 325, 3344, 7627, 7628, 711, 2044, 7629, 7630, 3952, 2219, 2787, 1936, # 5494
|
||||||
3953,3345,2220,2251,3789,2300,7631,4431,3790,1258,3279,3954,3204,2138,2950,3955, # 5510
|
3953, 3345, 2220, 2251, 3789, 2300, 7631, 4431, 3790, 1258, 3279, 3954, 3204, 2138, 2950, 3955, # 5510
|
||||||
3956,7632,2221, 258,3205,4432, 101,1227,7633,3280,1755,7634,1391,3281,7635,2910, # 5526
|
3956, 7632, 2221, 258, 3205, 4432, 101, 1227, 7633, 3280, 1755, 7634, 1391, 3281, 7635, 2910, # 5526
|
||||||
2056, 893,7636,7637,7638,1402,4161,2342,7639,7640,3206,3556,7641,7642, 878,1325, # 5542
|
2056, 893, 7636, 7637, 7638, 1402, 4161, 2342, 7639, 7640, 3206, 3556, 7641, 7642, 878, 1325, # 5542
|
||||||
1780,2788,4433, 259,1385,2577, 744,1183,2267,4434,7643,3957,2502,7644, 684,1024, # 5558
|
1780, 2788, 4433, 259, 1385, 2577, 744, 1183, 2267, 4434, 7643, 3957, 2502, 7644, 684, 1024, # 5558
|
||||||
4162,7645, 472,3557,3449,1165,3282,3958,3959, 322,2152, 881, 455,1695,1152,1340, # 5574
|
4162, 7645, 472, 3557, 3449, 1165, 3282, 3958, 3959, 322, 2152, 881, 455, 1695, 1152, 1340, # 5574
|
||||||
660, 554,2153,4435,1058,4436,4163, 830,1065,3346,3960,4437,1923,7646,1703,1918, # 5590
|
660, 554, 2153, 4435, 1058, 4436, 4163, 830, 1065, 3346, 3960, 4437, 1923, 7646, 1703, 1918, # 5590
|
||||||
7647, 932,2268, 122,7648,4438, 947, 677,7649,3791,2627, 297,1905,1924,2269,4439, # 5606
|
7647, 932, 2268, 122, 7648, 4438, 947, 677, 7649, 3791, 2627, 297, 1905, 1924, 2269, 4439, # 5606
|
||||||
2317,3283,7650,7651,4164,7652,4165, 84,4166, 112, 989,7653, 547,1059,3961, 701, # 5622
|
2317, 3283, 7650, 7651, 4164, 7652, 4165, 84, 4166, 112, 989, 7653, 547, 1059, 3961, 701, # 5622
|
||||||
3558,1019,7654,4167,7655,3450, 942, 639, 457,2301,2451, 993,2951, 407, 851, 494, # 5638
|
3558, 1019, 7654, 4167, 7655, 3450, 942, 639, 457, 2301, 2451, 993, 2951, 407, 851, 494, # 5638
|
||||||
4440,3347, 927,7656,1237,7657,2421,3348, 573,4168, 680, 921,2911,1279,1874, 285, # 5654
|
4440, 3347, 927, 7656, 1237, 7657, 2421, 3348, 573, 4168, 680, 921, 2911, 1279, 1874, 285, # 5654
|
||||||
790,1448,1983, 719,2167,7658,7659,4441,3962,3963,1649,7660,1541, 563,7661,1077, # 5670
|
790, 1448, 1983, 719, 2167, 7658, 7659, 4441, 3962, 3963, 1649, 7660, 1541, 563, 7661, 1077, # 5670
|
||||||
7662,3349,3041,3451, 511,2997,3964,3965,3667,3966,1268,2564,3350,3207,4442,4443, # 5686
|
7662, 3349, 3041, 3451, 511, 2997, 3964, 3965, 3667, 3966, 1268, 2564, 3350, 3207, 4442, 4443, # 5686
|
||||||
7663, 535,1048,1276,1189,2912,2028,3142,1438,1373,2834,2952,1134,2012,7664,4169, # 5702
|
7663, 535, 1048, 1276, 1189, 2912, 2028, 3142, 1438, 1373, 2834, 2952, 1134, 2012, 7664, 4169, # 5702
|
||||||
1238,2578,3086,1259,7665, 700,7666,2953,3143,3668,4170,7667,4171,1146,1875,1906, # 5718
|
1238, 2578, 3086, 1259, 7665, 700, 7666, 2953, 3143, 3668, 4170, 7667, 4171, 1146, 1875, 1906, # 5718
|
||||||
4444,2601,3967, 781,2422, 132,1589, 203, 147, 273,2789,2402, 898,1786,2154,3968, # 5734
|
4444, 2601, 3967, 781, 2422, 132, 1589, 203, 147, 273, 2789, 2402, 898, 1786, 2154, 3968, # 5734
|
||||||
3969,7668,3792,2790,7669,7670,4445,4446,7671,3208,7672,1635,3793, 965,7673,1804, # 5750
|
3969, 7668, 3792, 2790, 7669, 7670, 4445, 4446, 7671, 3208, 7672, 1635, 3793, 965, 7673, 1804, # 5750
|
||||||
2690,1516,3559,1121,1082,1329,3284,3970,1449,3794, 65,1128,2835,2913,2759,1590, # 5766
|
2690, 1516, 3559, 1121, 1082, 1329, 3284, 3970, 1449, 3794, 65, 1128, 2835, 2913, 2759, 1590, # 5766
|
||||||
3795,7674,7675, 12,2658, 45, 976,2579,3144,4447, 517,2528,1013,1037,3209,7676, # 5782
|
3795, 7674, 7675, 12, 2658, 45, 976, 2579, 3144, 4447, 517, 2528, 1013, 1037, 3209, 7676, # 5782
|
||||||
3796,2836,7677,3797,7678,3452,7679,2602, 614,1998,2318,3798,3087,2724,2628,7680, # 5798
|
3796, 2836, 7677, 3797, 7678, 3452, 7679, 2602, 614, 1998, 2318, 3798, 3087, 2724, 2628, 7680, # 5798
|
||||||
2580,4172, 599,1269,7681,1810,3669,7682,2691,3088, 759,1060, 489,1805,3351,3285, # 5814
|
2580, 4172, 599, 1269, 7681, 1810, 3669, 7682, 2691, 3088, 759, 1060, 489, 1805, 3351, 3285, # 5814
|
||||||
1358,7683,7684,2386,1387,1215,2629,2252, 490,7685,7686,4173,1759,2387,2343,7687, # 5830
|
1358, 7683, 7684, 2386, 1387, 1215, 2629, 2252, 490, 7685, 7686, 4173, 1759, 2387, 2343, 7687, # 5830
|
||||||
4448,3799,1907,3971,2630,1806,3210,4449,3453,3286,2760,2344, 874,7688,7689,3454, # 5846
|
4448, 3799, 1907, 3971, 2630, 1806, 3210, 4449, 3453, 3286, 2760, 2344, 874, 7688, 7689, 3454, # 5846
|
||||||
3670,1858, 91,2914,3671,3042,3800,4450,7690,3145,3972,2659,7691,3455,1202,1403, # 5862
|
3670, 1858, 91, 2914, 3671, 3042, 3800, 4450, 7690, 3145, 3972, 2659, 7691, 3455, 1202, 1403, # 5862
|
||||||
3801,2954,2529,1517,2503,4451,3456,2504,7692,4452,7693,2692,1885,1495,1731,3973, # 5878
|
3801, 2954, 2529, 1517, 2503, 4451, 3456, 2504, 7692, 4452, 7693, 2692, 1885, 1495, 1731, 3973, # 5878
|
||||||
2365,4453,7694,2029,7695,7696,3974,2693,1216, 237,2581,4174,2319,3975,3802,4454, # 5894
|
2365, 4453, 7694, 2029, 7695, 7696, 3974, 2693, 1216, 237, 2581, 4174, 2319, 3975, 3802, 4454, # 5894
|
||||||
4455,2694,3560,3457, 445,4456,7697,7698,7699,7700,2761, 61,3976,3672,1822,3977, # 5910
|
4455, 2694, 3560, 3457, 445, 4456, 7697, 7698, 7699, 7700, 2761, 61, 3976, 3672, 1822, 3977, # 5910
|
||||||
7701, 687,2045, 935, 925, 405,2660, 703,1096,1859,2725,4457,3978,1876,1367,2695, # 5926
|
7701, 687, 2045, 935, 925, 405, 2660, 703, 1096, 1859, 2725, 4457, 3978, 1876, 1367, 2695, # 5926
|
||||||
3352, 918,2105,1781,2476, 334,3287,1611,1093,4458, 564,3146,3458,3673,3353, 945, # 5942
|
3352, 918, 2105, 1781, 2476, 334, 3287, 1611, 1093, 4458, 564, 3146, 3458, 3673, 3353, 945, # 5942
|
||||||
2631,2057,4459,7702,1925, 872,4175,7703,3459,2696,3089, 349,4176,3674,3979,4460, # 5958
|
2631, 2057, 4459, 7702, 1925, 872, 4175, 7703, 3459, 2696, 3089, 349, 4176, 3674, 3979, 4460, # 5958
|
||||||
3803,4177,3675,2155,3980,4461,4462,4178,4463,2403,2046, 782,3981, 400, 251,4179, # 5974
|
3803, 4177, 3675, 2155, 3980, 4461, 4462, 4178, 4463, 2403, 2046, 782, 3981, 400, 251, 4179, # 5974
|
||||||
1624,7704,7705, 277,3676, 299,1265, 476,1191,3804,2121,4180,4181,1109, 205,7706, # 5990
|
1624, 7704, 7705, 277, 3676, 299, 1265, 476, 1191, 3804, 2121, 4180, 4181, 1109, 205, 7706, # 5990
|
||||||
2582,1000,2156,3561,1860,7707,7708,7709,4464,7710,4465,2565, 107,2477,2157,3982, # 6006
|
2582, 1000, 2156, 3561, 1860, 7707, 7708, 7709, 4464, 7710, 4465, 2565, 107, 2477, 2157, 3982, # 6006
|
||||||
3460,3147,7711,1533, 541,1301, 158, 753,4182,2872,3562,7712,1696, 370,1088,4183, # 6022
|
3460, 3147, 7711, 1533, 541, 1301, 158, 753, 4182, 2872, 3562, 7712, 1696, 370, 1088, 4183, # 6022
|
||||||
4466,3563, 579, 327, 440, 162,2240, 269,1937,1374,3461, 968,3043, 56,1396,3090, # 6038
|
4466, 3563, 579, 327, 440, 162, 2240, 269, 1937, 1374, 3461, 968, 3043, 56, 1396, 3090, # 6038
|
||||||
2106,3288,3354,7713,1926,2158,4467,2998,7714,3564,7715,7716,3677,4468,2478,7717, # 6054
|
2106, 3288, 3354, 7713, 1926, 2158, 4467, 2998, 7714, 3564, 7715, 7716, 3677, 4468, 2478, 7717, # 6054
|
||||||
2791,7718,1650,4469,7719,2603,7720,7721,3983,2661,3355,1149,3356,3984,3805,3985, # 6070
|
2791, 7718, 1650, 4469, 7719, 2603, 7720, 7721, 3983, 2661, 3355, 1149, 3356, 3984, 3805, 3985, # 6070
|
||||||
7722,1076, 49,7723, 951,3211,3289,3290, 450,2837, 920,7724,1811,2792,2366,4184, # 6086
|
7722, 1076, 49, 7723, 951, 3211, 3289, 3290, 450, 2837, 920, 7724, 1811, 2792, 2366, 4184, # 6086
|
||||||
1908,1138,2367,3806,3462,7725,3212,4470,1909,1147,1518,2423,4471,3807,7726,4472, # 6102
|
1908, 1138, 2367, 3806, 3462, 7725, 3212, 4470, 1909, 1147, 1518, 2423, 4471, 3807, 7726, 4472, # 6102
|
||||||
2388,2604, 260,1795,3213,7727,7728,3808,3291, 708,7729,3565,1704,7730,3566,1351, # 6118
|
2388, 2604, 260, 1795, 3213, 7727, 7728, 3808, 3291, 708, 7729, 3565, 1704, 7730, 3566, 1351, # 6118
|
||||||
1618,3357,2999,1886, 944,4185,3358,4186,3044,3359,4187,7731,3678, 422, 413,1714, # 6134
|
1618, 3357, 2999, 1886, 944, 4185, 3358, 4186, 3044, 3359, 4187, 7731, 3678, 422, 413, 1714, # 6134
|
||||||
3292, 500,2058,2345,4188,2479,7732,1344,1910, 954,7733,1668,7734,7735,3986,2404, # 6150
|
3292, 500, 2058, 2345, 4188, 2479, 7732, 1344, 1910, 954, 7733, 1668, 7734, 7735, 3986, 2404, # 6150
|
||||||
4189,3567,3809,4190,7736,2302,1318,2505,3091, 133,3092,2873,4473, 629, 31,2838, # 6166
|
4189, 3567, 3809, 4190, 7736, 2302, 1318, 2505, 3091, 133, 3092, 2873, 4473, 629, 31, 2838, # 6166
|
||||||
2697,3810,4474, 850, 949,4475,3987,2955,1732,2088,4191,1496,1852,7737,3988, 620, # 6182
|
2697, 3810, 4474, 850, 949, 4475, 3987, 2955, 1732, 2088, 4191, 1496, 1852, 7737, 3988, 620, # 6182
|
||||||
3214, 981,1242,3679,3360,1619,3680,1643,3293,2139,2452,1970,1719,3463,2168,7738, # 6198
|
3214, 981, 1242, 3679, 3360, 1619, 3680, 1643, 3293, 2139, 2452, 1970, 1719, 3463, 2168, 7738, # 6198
|
||||||
3215,7739,7740,3361,1828,7741,1277,4476,1565,2047,7742,1636,3568,3093,7743, 869, # 6214
|
3215, 7739, 7740, 3361, 1828, 7741, 1277, 4476, 1565, 2047, 7742, 1636, 3568, 3093, 7743, 869, # 6214
|
||||||
2839, 655,3811,3812,3094,3989,3000,3813,1310,3569,4477,7744,7745,7746,1733, 558, # 6230
|
2839, 655, 3811, 3812, 3094, 3989, 3000, 3813, 1310, 3569, 4477, 7744, 7745, 7746, 1733, 558, # 6230
|
||||||
4478,3681, 335,1549,3045,1756,4192,3682,1945,3464,1829,1291,1192, 470,2726,2107, # 6246
|
4478, 3681, 335, 1549, 3045, 1756, 4192, 3682, 1945, 3464, 1829, 1291, 1192, 470, 2726, 2107, # 6246
|
||||||
2793, 913,1054,3990,7747,1027,7748,3046,3991,4479, 982,2662,3362,3148,3465,3216, # 6262
|
2793, 913, 1054, 3990, 7747, 1027, 7748, 3046, 3991, 4479, 982, 2662, 3362, 3148, 3465, 3216, # 6262
|
||||||
3217,1946,2794,7749, 571,4480,7750,1830,7751,3570,2583,1523,2424,7752,2089, 984, # 6278
|
3217, 1946, 2794, 7749, 571, 4480, 7750, 1830, 7751, 3570, 2583, 1523, 2424, 7752, 2089, 984, # 6278
|
||||||
4481,3683,1959,7753,3684, 852, 923,2795,3466,3685, 969,1519, 999,2048,2320,1705, # 6294
|
4481, 3683, 1959, 7753, 3684, 852, 923, 2795, 3466, 3685, 969, 1519, 999, 2048, 2320, 1705, # 6294
|
||||||
7754,3095, 615,1662, 151, 597,3992,2405,2321,1049, 275,4482,3686,4193, 568,3687, # 6310
|
7754, 3095, 615, 1662, 151, 597, 3992, 2405, 2321, 1049, 275, 4482, 3686, 4193, 568, 3687, # 6310
|
||||||
3571,2480,4194,3688,7755,2425,2270, 409,3218,7756,1566,2874,3467,1002, 769,2840, # 6326
|
3571, 2480, 4194, 3688, 7755, 2425, 2270, 409, 3218, 7756, 1566, 2874, 3467, 1002, 769, 2840, # 6326
|
||||||
194,2090,3149,3689,2222,3294,4195, 628,1505,7757,7758,1763,2177,3001,3993, 521, # 6342
|
194, 2090, 3149, 3689, 2222, 3294, 4195, 628, 1505, 7757, 7758, 1763, 2177, 3001, 3993, 521, # 6342
|
||||||
1161,2584,1787,2203,2406,4483,3994,1625,4196,4197, 412, 42,3096, 464,7759,2632, # 6358
|
1161, 2584, 1787, 2203, 2406, 4483, 3994, 1625, 4196, 4197, 412, 42, 3096, 464, 7759, 2632, # 6358
|
||||||
4484,3363,1760,1571,2875,3468,2530,1219,2204,3814,2633,2140,2368,4485,4486,3295, # 6374
|
4484, 3363, 1760, 1571, 2875, 3468, 2530, 1219, 2204, 3814, 2633, 2140, 2368, 4485, 4486, 3295, # 6374
|
||||||
1651,3364,3572,7760,7761,3573,2481,3469,7762,3690,7763,7764,2271,2091, 460,7765, # 6390
|
1651, 3364, 3572, 7760, 7761, 3573, 2481, 3469, 7762, 3690, 7763, 7764, 2271, 2091, 460, 7765, # 6390
|
||||||
4487,7766,3002, 962, 588,3574, 289,3219,2634,1116, 52,7767,3047,1796,7768,7769, # 6406
|
4487, 7766, 3002, 962, 588, 3574, 289, 3219, 2634, 1116, 52, 7767, 3047, 1796, 7768, 7769, # 6406
|
||||||
7770,1467,7771,1598,1143,3691,4198,1984,1734,1067,4488,1280,3365, 465,4489,1572, # 6422
|
7770, 1467, 7771, 1598, 1143, 3691, 4198, 1984, 1734, 1067, 4488, 1280, 3365, 465, 4489, 1572, # 6422
|
||||||
510,7772,1927,2241,1812,1644,3575,7773,4490,3692,7774,7775,2663,1573,1534,7776, # 6438
|
510, 7772, 1927, 2241, 1812, 1644, 3575, 7773, 4490, 3692, 7774, 7775, 2663, 1573, 1534, 7776, # 6438
|
||||||
7777,4199, 536,1807,1761,3470,3815,3150,2635,7778,7779,7780,4491,3471,2915,1911, # 6454
|
7777, 4199, 536, 1807, 1761, 3470, 3815, 3150, 2635, 7778, 7779, 7780, 4491, 3471, 2915, 1911, # 6454
|
||||||
2796,7781,3296,1122, 377,3220,7782, 360,7783,7784,4200,1529, 551,7785,2059,3693, # 6470
|
2796, 7781, 3296, 1122, 377, 3220, 7782, 360, 7783, 7784, 4200, 1529, 551, 7785, 2059, 3693, # 6470
|
||||||
1769,2426,7786,2916,4201,3297,3097,2322,2108,2030,4492,1404, 136,1468,1479, 672, # 6486
|
1769, 2426, 7786, 2916, 4201, 3297, 3097, 2322, 2108, 2030, 4492, 1404, 136, 1468, 1479, 672, # 6486
|
||||||
1171,3221,2303, 271,3151,7787,2762,7788,2049, 678,2727, 865,1947,4493,7789,2013, # 6502
|
1171, 3221, 2303, 271, 3151, 7787, 2762, 7788, 2049, 678, 2727, 865, 1947, 4493, 7789, 2013, # 6502
|
||||||
3995,2956,7790,2728,2223,1397,3048,3694,4494,4495,1735,2917,3366,3576,7791,3816, # 6518
|
3995, 2956, 7790, 2728, 2223, 1397, 3048, 3694, 4494, 4495, 1735, 2917, 3366, 3576, 7791, 3816, # 6518
|
||||||
509,2841,2453,2876,3817,7792,7793,3152,3153,4496,4202,2531,4497,2304,1166,1010, # 6534
|
509, 2841, 2453, 2876, 3817, 7792, 7793, 3152, 3153, 4496, 4202, 2531, 4497, 2304, 1166, 1010, # 6534
|
||||||
552, 681,1887,7794,7795,2957,2958,3996,1287,1596,1861,3154, 358, 453, 736, 175, # 6550
|
552, 681, 1887, 7794, 7795, 2957, 2958, 3996, 1287, 1596, 1861, 3154, 358, 453, 736, 175, # 6550
|
||||||
478,1117, 905,1167,1097,7796,1853,1530,7797,1706,7798,2178,3472,2287,3695,3473, # 6566
|
478, 1117, 905, 1167, 1097, 7796, 1853, 1530, 7797, 1706, 7798, 2178, 3472, 2287, 3695, 3473, # 6566
|
||||||
3577,4203,2092,4204,7799,3367,1193,2482,4205,1458,2190,2205,1862,1888,1421,3298, # 6582
|
3577, 4203, 2092, 4204, 7799, 3367, 1193, 2482, 4205, 1458, 2190, 2205, 1862, 1888, 1421, 3298, # 6582
|
||||||
2918,3049,2179,3474, 595,2122,7800,3997,7801,7802,4206,1707,2636, 223,3696,1359, # 6598
|
2918, 3049, 2179, 3474, 595, 2122, 7800, 3997, 7801, 7802, 4206, 1707, 2636, 223, 3696, 1359, # 6598
|
||||||
751,3098, 183,3475,7803,2797,3003, 419,2369, 633, 704,3818,2389, 241,7804,7805, # 6614
|
751, 3098, 183, 3475, 7803, 2797, 3003, 419, 2369, 633, 704, 3818, 2389, 241, 7804, 7805, # 6614
|
||||||
7806, 838,3004,3697,2272,2763,2454,3819,1938,2050,3998,1309,3099,2242,1181,7807, # 6630
|
7806, 838, 3004, 3697, 2272, 2763, 2454, 3819, 1938, 2050, 3998, 1309, 3099, 2242, 1181, 7807, # 6630
|
||||||
1136,2206,3820,2370,1446,4207,2305,4498,7808,7809,4208,1055,2605, 484,3698,7810, # 6646
|
1136, 2206, 3820, 2370, 1446, 4207, 2305, 4498, 7808, 7809, 4208, 1055, 2605, 484, 3698, 7810, # 6646
|
||||||
3999, 625,4209,2273,3368,1499,4210,4000,7811,4001,4211,3222,2274,2275,3476,7812, # 6662
|
3999, 625, 4209, 2273, 3368, 1499, 4210, 4000, 7811, 4001, 4211, 3222, 2274, 2275, 3476, 7812, # 6662
|
||||||
7813,2764, 808,2606,3699,3369,4002,4212,3100,2532, 526,3370,3821,4213, 955,7814, # 6678
|
7813, 2764, 808, 2606, 3699, 3369, 4002, 4212, 3100, 2532, 526, 3370, 3821, 4213, 955, 7814, # 6678
|
||||||
1620,4214,2637,2427,7815,1429,3700,1669,1831, 994, 928,7816,3578,1260,7817,7818, # 6694
|
1620, 4214, 2637, 2427, 7815, 1429, 3700, 1669, 1831, 994, 928, 7816, 3578, 1260, 7817, 7818, # 6694
|
||||||
7819,1948,2288, 741,2919,1626,4215,2729,2455, 867,1184, 362,3371,1392,7820,7821, # 6710
|
7819, 1948, 2288, 741, 2919, 1626, 4215, 2729, 2455, 867, 1184, 362, 3371, 1392, 7820, 7821, # 6710
|
||||||
4003,4216,1770,1736,3223,2920,4499,4500,1928,2698,1459,1158,7822,3050,3372,2877, # 6726
|
4003, 4216, 1770, 1736, 3223, 2920, 4499, 4500, 1928, 2698, 1459, 1158, 7822, 3050, 3372, 2877, # 6726
|
||||||
1292,1929,2506,2842,3701,1985,1187,2071,2014,2607,4217,7823,2566,2507,2169,3702, # 6742
|
1292, 1929, 2506, 2842, 3701, 1985, 1187, 2071, 2014, 2607, 4217, 7823, 2566, 2507, 2169, 3702, # 6742
|
||||||
2483,3299,7824,3703,4501,7825,7826, 666,1003,3005,1022,3579,4218,7827,4502,1813, # 6758
|
2483, 3299, 7824, 3703, 4501, 7825, 7826, 666, 1003, 3005, 1022, 3579, 4218, 7827, 4502, 1813, # 6758
|
||||||
2253, 574,3822,1603, 295,1535, 705,3823,4219, 283, 858, 417,7828,7829,3224,4503, # 6774
|
2253, 574, 3822, 1603, 295, 1535, 705, 3823, 4219, 283, 858, 417, 7828, 7829, 3224, 4503, # 6774
|
||||||
4504,3051,1220,1889,1046,2276,2456,4004,1393,1599, 689,2567, 388,4220,7830,2484, # 6790
|
4504, 3051, 1220, 1889, 1046, 2276, 2456, 4004, 1393, 1599, 689, 2567, 388, 4220, 7830, 2484, # 6790
|
||||||
802,7831,2798,3824,2060,1405,2254,7832,4505,3825,2109,1052,1345,3225,1585,7833, # 6806
|
802, 7831, 2798, 3824, 2060, 1405, 2254, 7832, 4505, 3825, 2109, 1052, 1345, 3225, 1585, 7833, # 6806
|
||||||
809,7834,7835,7836, 575,2730,3477, 956,1552,1469,1144,2323,7837,2324,1560,2457, # 6822
|
809, 7834, 7835, 7836, 575, 2730, 3477, 956, 1552, 1469, 1144, 2323, 7837, 2324, 1560, 2457, # 6822
|
||||||
3580,3226,4005, 616,2207,3155,2180,2289,7838,1832,7839,3478,4506,7840,1319,3704, # 6838
|
3580, 3226, 4005, 616, 2207, 3155, 2180, 2289, 7838, 1832, 7839, 3478, 4506, 7840, 1319, 3704, # 6838
|
||||||
3705,1211,3581,1023,3227,1293,2799,7841,7842,7843,3826, 607,2306,3827, 762,2878, # 6854
|
3705, 1211, 3581, 1023, 3227, 1293, 2799, 7841, 7842, 7843, 3826, 607, 2306, 3827, 762, 2878, # 6854
|
||||||
1439,4221,1360,7844,1485,3052,7845,4507,1038,4222,1450,2061,2638,4223,1379,4508, # 6870
|
1439, 4221, 1360, 7844, 1485, 3052, 7845, 4507, 1038, 4222, 1450, 2061, 2638, 4223, 1379, 4508, # 6870
|
||||||
2585,7846,7847,4224,1352,1414,2325,2921,1172,7848,7849,3828,3829,7850,1797,1451, # 6886
|
2585, 7846, 7847, 4224, 1352, 1414, 2325, 2921, 1172, 7848, 7849, 3828, 3829, 7850, 1797, 1451, # 6886
|
||||||
7851,7852,7853,7854,2922,4006,4007,2485,2346, 411,4008,4009,3582,3300,3101,4509, # 6902
|
7851, 7852, 7853, 7854, 2922, 4006, 4007, 2485, 2346, 411, 4008, 4009, 3582, 3300, 3101, 4509, # 6902
|
||||||
1561,2664,1452,4010,1375,7855,7856, 47,2959, 316,7857,1406,1591,2923,3156,7858, # 6918
|
1561, 2664, 1452, 4010, 1375, 7855, 7856, 47, 2959, 316, 7857, 1406, 1591, 2923, 3156, 7858, # 6918
|
||||||
1025,2141,3102,3157, 354,2731, 884,2224,4225,2407, 508,3706, 726,3583, 996,2428, # 6934
|
1025, 2141, 3102, 3157, 354, 2731, 884, 2224, 4225, 2407, 508, 3706, 726, 3583, 996, 2428, # 6934
|
||||||
3584, 729,7859, 392,2191,1453,4011,4510,3707,7860,7861,2458,3585,2608,1675,2800, # 6950
|
3584, 729, 7859, 392, 2191, 1453, 4011, 4510, 3707, 7860, 7861, 2458, 3585, 2608, 1675, 2800, # 6950
|
||||||
919,2347,2960,2348,1270,4511,4012, 73,7862,7863, 647,7864,3228,2843,2255,1550, # 6966
|
919, 2347, 2960, 2348, 1270, 4511, 4012, 73, 7862, 7863, 647, 7864, 3228, 2843, 2255, 1550, # 6966
|
||||||
1346,3006,7865,1332, 883,3479,7866,7867,7868,7869,3301,2765,7870,1212, 831,1347, # 6982
|
1346, 3006, 7865, 1332, 883, 3479, 7866, 7867, 7868, 7869, 3301, 2765, 7870, 1212, 831, 1347, # 6982
|
||||||
4226,4512,2326,3830,1863,3053, 720,3831,4513,4514,3832,7871,4227,7872,7873,4515, # 6998
|
4226, 4512, 2326, 3830, 1863, 3053, 720, 3831, 4513, 4514, 3832, 7871, 4227, 7872, 7873, 4515, # 6998
|
||||||
7874,7875,1798,4516,3708,2609,4517,3586,1645,2371,7876,7877,2924, 669,2208,2665, # 7014
|
7874, 7875, 1798, 4516, 3708, 2609, 4517, 3586, 1645, 2371, 7876, 7877, 2924, 669, 2208, 2665, # 7014
|
||||||
2429,7878,2879,7879,7880,1028,3229,7881,4228,2408,7882,2256,1353,7883,7884,4518, # 7030
|
2429, 7878, 2879, 7879, 7880, 1028, 3229, 7881, 4228, 2408, 7882, 2256, 1353, 7883, 7884, 4518, # 7030
|
||||||
3158, 518,7885,4013,7886,4229,1960,7887,2142,4230,7888,7889,3007,2349,2350,3833, # 7046
|
3158, 518, 7885, 4013, 7886, 4229, 1960, 7887, 2142, 4230, 7888, 7889, 3007, 2349, 2350, 3833, # 7046
|
||||||
516,1833,1454,4014,2699,4231,4519,2225,2610,1971,1129,3587,7890,2766,7891,2961, # 7062
|
516, 1833, 1454, 4014, 2699, 4231, 4519, 2225, 2610, 1971, 1129, 3587, 7890, 2766, 7891, 2961, # 7062
|
||||||
1422, 577,1470,3008,1524,3373,7892,7893, 432,4232,3054,3480,7894,2586,1455,2508, # 7078
|
1422, 577, 1470, 3008, 1524, 3373, 7892, 7893, 432, 4232, 3054, 3480, 7894, 2586, 1455, 2508, # 7078
|
||||||
2226,1972,1175,7895,1020,2732,4015,3481,4520,7896,2733,7897,1743,1361,3055,3482, # 7094
|
2226, 1972, 1175, 7895, 1020, 2732, 4015, 3481, 4520, 7896, 2733, 7897, 1743, 1361, 3055, 3482, # 7094
|
||||||
2639,4016,4233,4521,2290, 895, 924,4234,2170, 331,2243,3056, 166,1627,3057,1098, # 7110
|
2639, 4016, 4233, 4521, 2290, 895, 924, 4234, 2170, 331, 2243, 3056, 166, 1627, 3057, 1098, # 7110
|
||||||
7898,1232,2880,2227,3374,4522, 657, 403,1196,2372, 542,3709,3375,1600,4235,3483, # 7126
|
7898, 1232, 2880, 2227, 3374, 4522, 657, 403, 1196, 2372, 542, 3709, 3375, 1600, 4235, 3483, # 7126
|
||||||
7899,4523,2767,3230, 576, 530,1362,7900,4524,2533,2666,3710,4017,7901, 842,3834, # 7142
|
7899, 4523, 2767, 3230, 576, 530, 1362, 7900, 4524, 2533, 2666, 3710, 4017, 7901, 842, 3834, # 7142
|
||||||
7902,2801,2031,1014,4018, 213,2700,3376, 665, 621,4236,7903,3711,2925,2430,7904, # 7158
|
7902, 2801, 2031, 1014, 4018, 213, 2700, 3376, 665, 621, 4236, 7903, 3711, 2925, 2430, 7904, # 7158
|
||||||
2431,3302,3588,3377,7905,4237,2534,4238,4525,3589,1682,4239,3484,1380,7906, 724, # 7174
|
2431, 3302, 3588, 3377, 7905, 4237, 2534, 4238, 4525, 3589, 1682, 4239, 3484, 1380, 7906, 724, # 7174
|
||||||
2277, 600,1670,7907,1337,1233,4526,3103,2244,7908,1621,4527,7909, 651,4240,7910, # 7190
|
2277, 600, 1670, 7907, 1337, 1233, 4526, 3103, 2244, 7908, 1621, 4527, 7909, 651, 4240, 7910, # 7190
|
||||||
1612,4241,2611,7911,2844,7912,2734,2307,3058,7913, 716,2459,3059, 174,1255,2701, # 7206
|
1612, 4241, 2611, 7911, 2844, 7912, 2734, 2307, 3058, 7913, 716, 2459, 3059, 174, 1255, 2701, # 7206
|
||||||
4019,3590, 548,1320,1398, 728,4020,1574,7914,1890,1197,3060,4021,7915,3061,3062, # 7222
|
4019, 3590, 548, 1320, 1398, 728, 4020, 1574, 7914, 1890, 1197, 3060, 4021, 7915, 3061, 3062, # 7222
|
||||||
3712,3591,3713, 747,7916, 635,4242,4528,7917,7918,7919,4243,7920,7921,4529,7922, # 7238
|
3712, 3591, 3713, 747, 7916, 635, 4242, 4528, 7917, 7918, 7919, 4243, 7920, 7921, 4529, 7922, # 7238
|
||||||
3378,4530,2432, 451,7923,3714,2535,2072,4244,2735,4245,4022,7924,1764,4531,7925, # 7254
|
3378, 4530, 2432, 451, 7923, 3714, 2535, 2072, 4244, 2735, 4245, 4022, 7924, 1764, 4531, 7925, # 7254
|
||||||
4246, 350,7926,2278,2390,2486,7927,4247,4023,2245,1434,4024, 488,4532, 458,4248, # 7270
|
4246, 350, 7926, 2278, 2390, 2486, 7927, 4247, 4023, 2245, 1434, 4024, 488, 4532, 458, 4248, # 7270
|
||||||
4025,3715, 771,1330,2391,3835,2568,3159,2159,2409,1553,2667,3160,4249,7928,2487, # 7286
|
4025, 3715, 771, 1330, 2391, 3835, 2568, 3159, 2159, 2409, 1553, 2667, 3160, 4249, 7928, 2487, # 7286
|
||||||
2881,2612,1720,2702,4250,3379,4533,7929,2536,4251,7930,3231,4252,2768,7931,2015, # 7302
|
2881, 2612, 1720, 2702, 4250, 3379, 4533, 7929, 2536, 4251, 7930, 3231, 4252, 2768, 7931, 2015, # 7302
|
||||||
2736,7932,1155,1017,3716,3836,7933,3303,2308, 201,1864,4253,1430,7934,4026,7935, # 7318
|
2736, 7932, 1155, 1017, 3716, 3836, 7933, 3303, 2308, 201, 1864, 4253, 1430, 7934, 4026, 7935, # 7318
|
||||||
7936,7937,7938,7939,4254,1604,7940, 414,1865, 371,2587,4534,4535,3485,2016,3104, # 7334
|
7936, 7937, 7938, 7939, 4254, 1604, 7940, 414, 1865, 371, 2587, 4534, 4535, 3485, 2016, 3104, # 7334
|
||||||
4536,1708, 960,4255, 887, 389,2171,1536,1663,1721,7941,2228,4027,2351,2926,1580, # 7350
|
4536, 1708, 960, 4255, 887, 389, 2171, 1536, 1663, 1721, 7941, 2228, 4027, 2351, 2926, 1580, # 7350
|
||||||
7942,7943,7944,1744,7945,2537,4537,4538,7946,4539,7947,2073,7948,7949,3592,3380, # 7366
|
7942, 7943, 7944, 1744, 7945, 2537, 4537, 4538, 7946, 4539, 7947, 2073, 7948, 7949, 3592, 3380, # 7366
|
||||||
2882,4256,7950,4257,2640,3381,2802, 673,2703,2460, 709,3486,4028,3593,4258,7951, # 7382
|
2882, 4256, 7950, 4257, 2640, 3381, 2802, 673, 2703, 2460, 709, 3486, 4028, 3593, 4258, 7951, # 7382
|
||||||
1148, 502, 634,7952,7953,1204,4540,3594,1575,4541,2613,3717,7954,3718,3105, 948, # 7398
|
1148, 502, 634, 7952, 7953, 1204, 4540, 3594, 1575, 4541, 2613, 3717, 7954, 3718, 3105, 948, # 7398
|
||||||
3232, 121,1745,3837,1110,7955,4259,3063,2509,3009,4029,3719,1151,1771,3838,1488, # 7414
|
3232, 121, 1745, 3837, 1110, 7955, 4259, 3063, 2509, 3009, 4029, 3719, 1151, 1771, 3838, 1488, # 7414
|
||||||
4030,1986,7956,2433,3487,7957,7958,2093,7959,4260,3839,1213,1407,2803, 531,2737, # 7430
|
4030, 1986, 7956, 2433, 3487, 7957, 7958, 2093, 7959, 4260, 3839, 1213, 1407, 2803, 531, 2737, # 7430
|
||||||
2538,3233,1011,1537,7960,2769,4261,3106,1061,7961,3720,3721,1866,2883,7962,2017, # 7446
|
2538, 3233, 1011, 1537, 7960, 2769, 4261, 3106, 1061, 7961, 3720, 3721, 1866, 2883, 7962, 2017, # 7446
|
||||||
120,4262,4263,2062,3595,3234,2309,3840,2668,3382,1954,4542,7963,7964,3488,1047, # 7462
|
120, 4262, 4263, 2062, 3595, 3234, 2309, 3840, 2668, 3382, 1954, 4542, 7963, 7964, 3488, 1047, # 7462
|
||||||
2704,1266,7965,1368,4543,2845, 649,3383,3841,2539,2738,1102,2846,2669,7966,7967, # 7478
|
2704, 1266, 7965, 1368, 4543, 2845, 649, 3383, 3841, 2539, 2738, 1102, 2846, 2669, 7966, 7967, # 7478
|
||||||
1999,7968,1111,3596,2962,7969,2488,3842,3597,2804,1854,3384,3722,7970,7971,3385, # 7494
|
1999, 7968, 1111, 3596, 2962, 7969, 2488, 3842, 3597, 2804, 1854, 3384, 3722, 7970, 7971, 3385, # 7494
|
||||||
2410,2884,3304,3235,3598,7972,2569,7973,3599,2805,4031,1460, 856,7974,3600,7975, # 7510
|
2410, 2884, 3304, 3235, 3598, 7972, 2569, 7973, 3599, 2805, 4031, 1460, 856, 7974, 3600, 7975, # 7510
|
||||||
2885,2963,7976,2886,3843,7977,4264, 632,2510, 875,3844,1697,3845,2291,7978,7979, # 7526
|
2885, 2963, 7976, 2886, 3843, 7977, 4264, 632, 2510, 875, 3844, 1697, 3845, 2291, 7978, 7979, # 7526
|
||||||
4544,3010,1239, 580,4545,4265,7980, 914, 936,2074,1190,4032,1039,2123,7981,7982, # 7542
|
4544, 3010, 1239, 580, 4545, 4265, 7980, 914, 936, 2074, 1190, 4032, 1039, 2123, 7981, 7982, # 7542
|
||||||
7983,3386,1473,7984,1354,4266,3846,7985,2172,3064,4033, 915,3305,4267,4268,3306, # 7558
|
7983, 3386, 1473, 7984, 1354, 4266, 3846, 7985, 2172, 3064, 4033, 915, 3305, 4267, 4268, 3306, # 7558
|
||||||
1605,1834,7986,2739, 398,3601,4269,3847,4034, 328,1912,2847,4035,3848,1331,4270, # 7574
|
1605, 1834, 7986, 2739, 398, 3601, 4269, 3847, 4034, 328, 1912, 2847, 4035, 3848, 1331, 4270, # 7574
|
||||||
3011, 937,4271,7987,3602,4036,4037,3387,2160,4546,3388, 524, 742, 538,3065,1012, # 7590
|
3011, 937, 4271, 7987, 3602, 4036, 4037, 3387, 2160, 4546, 3388, 524, 742, 538, 3065, 1012, # 7590
|
||||||
7988,7989,3849,2461,7990, 658,1103, 225,3850,7991,7992,4547,7993,4548,7994,3236, # 7606
|
7988, 7989, 3849, 2461, 7990, 658, 1103, 225, 3850, 7991, 7992, 4547, 7993, 4548, 7994, 3236, # 7606
|
||||||
1243,7995,4038, 963,2246,4549,7996,2705,3603,3161,7997,7998,2588,2327,7999,4550, # 7622
|
1243, 7995, 4038, 963, 2246, 4549, 7996, 2705, 3603, 3161, 7997, 7998, 2588, 2327, 7999, 4550, # 7622
|
||||||
8000,8001,8002,3489,3307, 957,3389,2540,2032,1930,2927,2462, 870,2018,3604,1746, # 7638
|
8000, 8001, 8002, 3489, 3307, 957, 3389, 2540, 2032, 1930, 2927, 2462, 870, 2018, 3604, 1746, # 7638
|
||||||
2770,2771,2434,2463,8003,3851,8004,3723,3107,3724,3490,3390,3725,8005,1179,3066, # 7654
|
2770, 2771, 2434, 2463, 8003, 3851, 8004, 3723, 3107, 3724, 3490, 3390, 3725, 8005, 1179, 3066, # 7654
|
||||||
8006,3162,2373,4272,3726,2541,3163,3108,2740,4039,8007,3391,1556,2542,2292, 977, # 7670
|
8006, 3162, 2373, 4272, 3726, 2541, 3163, 3108, 2740, 4039, 8007, 3391, 1556, 2542, 2292, 977, # 7670
|
||||||
2887,2033,4040,1205,3392,8008,1765,3393,3164,2124,1271,1689, 714,4551,3491,8009, # 7686
|
2887, 2033, 4040, 1205, 3392, 8008, 1765, 3393, 3164, 2124, 1271, 1689, 714, 4551, 3491, 8009, # 7686
|
||||||
2328,3852, 533,4273,3605,2181, 617,8010,2464,3308,3492,2310,8011,8012,3165,8013, # 7702
|
2328, 3852, 533, 4273, 3605, 2181, 617, 8010, 2464, 3308, 3492, 2310, 8011, 8012, 3165, 8013, # 7702
|
||||||
8014,3853,1987, 618, 427,2641,3493,3394,8015,8016,1244,1690,8017,2806,4274,4552, # 7718
|
8014, 3853, 1987, 618, 427, 2641, 3493, 3394, 8015, 8016, 1244, 1690, 8017, 2806, 4274, 4552, # 7718
|
||||||
8018,3494,8019,8020,2279,1576, 473,3606,4275,3395, 972,8021,3607,8022,3067,8023, # 7734
|
8018, 3494, 8019, 8020, 2279, 1576, 473, 3606, 4275, 3395, 972, 8021, 3607, 8022, 3067, 8023, # 7734
|
||||||
8024,4553,4554,8025,3727,4041,4042,8026, 153,4555, 356,8027,1891,2888,4276,2143, # 7750
|
8024, 4553, 4554, 8025, 3727, 4041, 4042, 8026, 153, 4555, 356, 8027, 1891, 2888, 4276, 2143, # 7750
|
||||||
408, 803,2352,8028,3854,8029,4277,1646,2570,2511,4556,4557,3855,8030,3856,4278, # 7766
|
408, 803, 2352, 8028, 3854, 8029, 4277, 1646, 2570, 2511, 4556, 4557, 3855, 8030, 3856, 4278, # 7766
|
||||||
8031,2411,3396, 752,8032,8033,1961,2964,8034, 746,3012,2465,8035,4279,3728, 698, # 7782
|
8031, 2411, 3396, 752, 8032, 8033, 1961, 2964, 8034, 746, 3012, 2465, 8035, 4279, 3728, 698, # 7782
|
||||||
4558,1892,4280,3608,2543,4559,3609,3857,8036,3166,3397,8037,1823,1302,4043,2706, # 7798
|
4558, 1892, 4280, 3608, 2543, 4559, 3609, 3857, 8036, 3166, 3397, 8037, 1823, 1302, 4043, 2706, # 7798
|
||||||
3858,1973,4281,8038,4282,3167, 823,1303,1288,1236,2848,3495,4044,3398, 774,3859, # 7814
|
3858, 1973, 4281, 8038, 4282, 3167, 823, 1303, 1288, 1236, 2848, 3495, 4044, 3398, 774, 3859, # 7814
|
||||||
8039,1581,4560,1304,2849,3860,4561,8040,2435,2161,1083,3237,4283,4045,4284, 344, # 7830
|
8039, 1581, 4560, 1304, 2849, 3860, 4561, 8040, 2435, 2161, 1083, 3237, 4283, 4045, 4284, 344, # 7830
|
||||||
1173, 288,2311, 454,1683,8041,8042,1461,4562,4046,2589,8043,8044,4563, 985, 894, # 7846
|
1173, 288, 2311, 454, 1683, 8041, 8042, 1461, 4562, 4046, 2589, 8043, 8044, 4563, 985, 894, # 7846
|
||||||
8045,3399,3168,8046,1913,2928,3729,1988,8047,2110,1974,8048,4047,8049,2571,1194, # 7862
|
8045, 3399, 3168, 8046, 1913, 2928, 3729, 1988, 8047, 2110, 1974, 8048, 4047, 8049, 2571, 1194, # 7862
|
||||||
425,8050,4564,3169,1245,3730,4285,8051,8052,2850,8053, 636,4565,1855,3861, 760, # 7878
|
425, 8050, 4564, 3169, 1245, 3730, 4285, 8051, 8052, 2850, 8053, 636, 4565, 1855, 3861, 760, # 7878
|
||||||
1799,8054,4286,2209,1508,4566,4048,1893,1684,2293,8055,8056,8057,4287,4288,2210, # 7894
|
1799, 8054, 4286, 2209, 1508, 4566, 4048, 1893, 1684, 2293, 8055, 8056, 8057, 4287, 4288, 2210, # 7894
|
||||||
479,8058,8059, 832,8060,4049,2489,8061,2965,2490,3731, 990,3109, 627,1814,2642, # 7910
|
479, 8058, 8059, 832, 8060, 4049, 2489, 8061, 2965, 2490, 3731, 990, 3109, 627, 1814, 2642, # 7910
|
||||||
4289,1582,4290,2125,2111,3496,4567,8062, 799,4291,3170,8063,4568,2112,1737,3013, # 7926
|
4289, 1582, 4290, 2125, 2111, 3496, 4567, 8062, 799, 4291, 3170, 8063, 4568, 2112, 1737, 3013, # 7926
|
||||||
1018, 543, 754,4292,3309,1676,4569,4570,4050,8064,1489,8065,3497,8066,2614,2889, # 7942
|
1018, 543, 754, 4292, 3309, 1676, 4569, 4570, 4050, 8064, 1489, 8065, 3497, 8066, 2614, 2889, # 7942
|
||||||
4051,8067,8068,2966,8069,8070,8071,8072,3171,4571,4572,2182,1722,8073,3238,3239, # 7958
|
4051, 8067, 8068, 2966, 8069, 8070, 8071, 8072, 3171, 4571, 4572, 2182, 1722, 8073, 3238, 3239, # 7958
|
||||||
1842,3610,1715, 481, 365,1975,1856,8074,8075,1962,2491,4573,8076,2126,3611,3240, # 7974
|
1842, 3610, 1715, 481, 365, 1975, 1856, 8074, 8075, 1962, 2491, 4573, 8076, 2126, 3611, 3240, # 7974
|
||||||
433,1894,2063,2075,8077, 602,2741,8078,8079,8080,8081,8082,3014,1628,3400,8083, # 7990
|
433, 1894, 2063, 2075, 8077, 602, 2741, 8078, 8079, 8080, 8081, 8082, 3014, 1628, 3400, 8083, # 7990
|
||||||
3172,4574,4052,2890,4575,2512,8084,2544,2772,8085,8086,8087,3310,4576,2891,8088, # 8006
|
3172, 4574, 4052, 2890, 4575, 2512, 8084, 2544, 2772, 8085, 8086, 8087, 3310, 4576, 2891, 8088, # 8006
|
||||||
4577,8089,2851,4578,4579,1221,2967,4053,2513,8090,8091,8092,1867,1989,8093,8094, # 8022
|
4577, 8089, 2851, 4578, 4579, 1221, 2967, 4053, 2513, 8090, 8091, 8092, 1867, 1989, 8093, 8094, # 8022
|
||||||
8095,1895,8096,8097,4580,1896,4054, 318,8098,2094,4055,4293,8099,8100, 485,8101, # 8038
|
8095, 1895, 8096, 8097, 4580, 1896, 4054, 318, 8098, 2094, 4055, 4293, 8099, 8100, 485, 8101, # 8038
|
||||||
938,3862, 553,2670, 116,8102,3863,3612,8103,3498,2671,2773,3401,3311,2807,8104, # 8054
|
938, 3862, 553, 2670, 116, 8102, 3863, 3612, 8103, 3498, 2671, 2773, 3401, 3311, 2807, 8104, # 8054
|
||||||
3613,2929,4056,1747,2930,2968,8105,8106, 207,8107,8108,2672,4581,2514,8109,3015, # 8070
|
3613, 2929, 4056, 1747, 2930, 2968, 8105, 8106, 207, 8107, 8108, 2672, 4581, 2514, 8109, 3015, # 8070
|
||||||
890,3614,3864,8110,1877,3732,3402,8111,2183,2353,3403,1652,8112,8113,8114, 941, # 8086
|
890, 3614, 3864, 8110, 1877, 3732, 3402, 8111, 2183, 2353, 3403, 1652, 8112, 8113, 8114, 941, # 8086
|
||||||
2294, 208,3499,4057,2019, 330,4294,3865,2892,2492,3733,4295,8115,8116,8117,8118, # 8102
|
2294, 208, 3499, 4057, 2019, 330, 4294, 3865, 2892, 2492, 3733, 4295, 8115, 8116, 8117, 8118, # 8102
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
|
@ -25,22 +25,23 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .mbcharsetprober import MultiByteCharSetProber
|
|
||||||
from .codingstatemachine import CodingStateMachine
|
|
||||||
from .chardistribution import EUCTWDistributionAnalysis
|
from .chardistribution import EUCTWDistributionAnalysis
|
||||||
|
from .codingstatemachine import CodingStateMachine
|
||||||
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
from .mbcssm import EUCTW_SM_MODEL
|
from .mbcssm import EUCTW_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class EUCTWProber(MultiByteCharSetProber):
|
class EUCTWProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(EUCTWProber, self).__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
|
self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
|
||||||
self.distribution_analyzer = EUCTWDistributionAnalysis()
|
self.distribution_analyzer = EUCTWDistributionAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "EUC-TW"
|
return "EUC-TW"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Taiwan"
|
return "Taiwan"
|
||||||
|
|
|
@ -43,6 +43,7 @@ GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9
|
||||||
|
|
||||||
GB2312_TABLE_SIZE = 3760
|
GB2312_TABLE_SIZE = 3760
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
GB2312_CHAR_TO_FREQ_ORDER = (
|
GB2312_CHAR_TO_FREQ_ORDER = (
|
||||||
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
|
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
|
||||||
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
|
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
|
||||||
|
@ -280,4 +281,4 @@ GB2312_CHAR_TO_FREQ_ORDER = (
|
||||||
381,1638,4592,1020, 516,3214, 458, 947,4575,1432, 211,1514,2926,1865,2142, 189,
|
381,1638,4592,1020, 516,3214, 458, 947,4575,1432, 211,1514,2926,1865,2142, 189,
|
||||||
852,1221,1400,1486, 882,2299,4036, 351, 28,1122, 700,6479,6480,6481,6482,6483, #last 512
|
852,1221,1400,1486, 882,2299,4036, 351, 28,1122, 700,6479,6480,6481,6482,6483, #last 512
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
|
@ -25,22 +25,23 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .mbcharsetprober import MultiByteCharSetProber
|
|
||||||
from .codingstatemachine import CodingStateMachine
|
|
||||||
from .chardistribution import GB2312DistributionAnalysis
|
from .chardistribution import GB2312DistributionAnalysis
|
||||||
|
from .codingstatemachine import CodingStateMachine
|
||||||
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
from .mbcssm import GB2312_SM_MODEL
|
from .mbcssm import GB2312_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class GB2312Prober(MultiByteCharSetProber):
|
class GB2312Prober(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(GB2312Prober, self).__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
|
self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
|
||||||
self.distribution_analyzer = GB2312DistributionAnalysis()
|
self.distribution_analyzer = GB2312DistributionAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "GB2312"
|
return "GB2312"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Chinese"
|
return "Chinese"
|
||||||
|
|
|
@ -25,8 +25,11 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .enums import ProbingState
|
from .enums import ProbingState
|
||||||
|
from .sbcharsetprober import SingleByteCharSetProber
|
||||||
|
|
||||||
# This prober doesn't actually recognize a language or a charset.
|
# This prober doesn't actually recognize a language or a charset.
|
||||||
# It is a helper prober for the use of the Hebrew model probers
|
# It is a helper prober for the use of the Hebrew model probers
|
||||||
|
@ -125,18 +128,20 @@ from .enums import ProbingState
|
||||||
# model probers scores. The answer is returned in the form of the name of the
|
# model probers scores. The answer is returned in the form of the name of the
|
||||||
# charset identified, either "windows-1255" or "ISO-8859-8".
|
# charset identified, either "windows-1255" or "ISO-8859-8".
|
||||||
|
|
||||||
|
|
||||||
class HebrewProber(CharSetProber):
|
class HebrewProber(CharSetProber):
|
||||||
|
SPACE = 0x20
|
||||||
# windows-1255 / ISO-8859-8 code points of interest
|
# windows-1255 / ISO-8859-8 code points of interest
|
||||||
FINAL_KAF = 0xea
|
FINAL_KAF = 0xEA
|
||||||
NORMAL_KAF = 0xeb
|
NORMAL_KAF = 0xEB
|
||||||
FINAL_MEM = 0xed
|
FINAL_MEM = 0xED
|
||||||
NORMAL_MEM = 0xee
|
NORMAL_MEM = 0xEE
|
||||||
FINAL_NUN = 0xef
|
FINAL_NUN = 0xEF
|
||||||
NORMAL_NUN = 0xf0
|
NORMAL_NUN = 0xF0
|
||||||
FINAL_PE = 0xf3
|
FINAL_PE = 0xF3
|
||||||
NORMAL_PE = 0xf4
|
NORMAL_PE = 0xF4
|
||||||
FINAL_TSADI = 0xf5
|
FINAL_TSADI = 0xF5
|
||||||
NORMAL_TSADI = 0xf6
|
NORMAL_TSADI = 0xF6
|
||||||
|
|
||||||
# Minimum Visual vs Logical final letter score difference.
|
# Minimum Visual vs Logical final letter score difference.
|
||||||
# If the difference is below this, don't rely solely on the final letter score
|
# If the difference is below this, don't rely solely on the final letter score
|
||||||
|
@ -151,35 +156,44 @@ class HebrewProber(CharSetProber):
|
||||||
VISUAL_HEBREW_NAME = "ISO-8859-8"
|
VISUAL_HEBREW_NAME = "ISO-8859-8"
|
||||||
LOGICAL_HEBREW_NAME = "windows-1255"
|
LOGICAL_HEBREW_NAME = "windows-1255"
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(HebrewProber, self).__init__()
|
super().__init__()
|
||||||
self._final_char_logical_score = None
|
self._final_char_logical_score = 0
|
||||||
self._final_char_visual_score = None
|
self._final_char_visual_score = 0
|
||||||
self._prev = None
|
self._prev = self.SPACE
|
||||||
self._before_prev = None
|
self._before_prev = self.SPACE
|
||||||
self._logical_prober = None
|
self._logical_prober: Optional[SingleByteCharSetProber] = None
|
||||||
self._visual_prober = None
|
self._visual_prober: Optional[SingleByteCharSetProber] = None
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self._final_char_logical_score = 0
|
self._final_char_logical_score = 0
|
||||||
self._final_char_visual_score = 0
|
self._final_char_visual_score = 0
|
||||||
# The two last characters seen in the previous buffer,
|
# The two last characters seen in the previous buffer,
|
||||||
# mPrev and mBeforePrev are initialized to space in order to simulate
|
# mPrev and mBeforePrev are initialized to space in order to simulate
|
||||||
# a word delimiter at the beginning of the data
|
# a word delimiter at the beginning of the data
|
||||||
self._prev = ' '
|
self._prev = self.SPACE
|
||||||
self._before_prev = ' '
|
self._before_prev = self.SPACE
|
||||||
# These probers are owned by the group prober.
|
# These probers are owned by the group prober.
|
||||||
|
|
||||||
def set_model_probers(self, logicalProber, visualProber):
|
def set_model_probers(
|
||||||
self._logical_prober = logicalProber
|
self,
|
||||||
self._visual_prober = visualProber
|
logical_prober: SingleByteCharSetProber,
|
||||||
|
visual_prober: SingleByteCharSetProber,
|
||||||
|
) -> None:
|
||||||
|
self._logical_prober = logical_prober
|
||||||
|
self._visual_prober = visual_prober
|
||||||
|
|
||||||
def is_final(self, c):
|
def is_final(self, c: int) -> bool:
|
||||||
return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
|
return c in [
|
||||||
self.FINAL_PE, self.FINAL_TSADI]
|
self.FINAL_KAF,
|
||||||
|
self.FINAL_MEM,
|
||||||
|
self.FINAL_NUN,
|
||||||
|
self.FINAL_PE,
|
||||||
|
self.FINAL_TSADI,
|
||||||
|
]
|
||||||
|
|
||||||
def is_non_final(self, c):
|
def is_non_final(self, c: int) -> bool:
|
||||||
# The normal Tsadi is not a good Non-Final letter due to words like
|
# The normal Tsadi is not a good Non-Final letter due to words like
|
||||||
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
||||||
# apostrophe is converted to a space in FilterWithoutEnglishLetters
|
# apostrophe is converted to a space in FilterWithoutEnglishLetters
|
||||||
|
@ -190,10 +204,9 @@ class HebrewProber(CharSetProber):
|
||||||
# for example legally end with a Non-Final Pe or Kaf. However, the
|
# for example legally end with a Non-Final Pe or Kaf. However, the
|
||||||
# benefit of these letters as Non-Final letters outweighs the damage
|
# benefit of these letters as Non-Final letters outweighs the damage
|
||||||
# since these words are quite rare.
|
# since these words are quite rare.
|
||||||
return c in [self.NORMAL_KAF, self.NORMAL_MEM,
|
return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE]
|
||||||
self.NORMAL_NUN, self.NORMAL_PE]
|
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
# Final letter analysis for logical-visual decision.
|
# Final letter analysis for logical-visual decision.
|
||||||
# Look for evidence that the received buffer is either logical Hebrew
|
# Look for evidence that the received buffer is either logical Hebrew
|
||||||
# or visual Hebrew.
|
# or visual Hebrew.
|
||||||
|
@ -227,9 +240,9 @@ class HebrewProber(CharSetProber):
|
||||||
byte_str = self.filter_high_byte_only(byte_str)
|
byte_str = self.filter_high_byte_only(byte_str)
|
||||||
|
|
||||||
for cur in byte_str:
|
for cur in byte_str:
|
||||||
if cur == ' ':
|
if cur == self.SPACE:
|
||||||
# We stand on a space - a word just ended
|
# We stand on a space - a word just ended
|
||||||
if self._before_prev != ' ':
|
if self._before_prev != self.SPACE:
|
||||||
# next-to-last char was not a space so self._prev is not a
|
# next-to-last char was not a space so self._prev is not a
|
||||||
# 1 letter word
|
# 1 letter word
|
||||||
if self.is_final(self._prev):
|
if self.is_final(self._prev):
|
||||||
|
@ -241,8 +254,11 @@ class HebrewProber(CharSetProber):
|
||||||
self._final_char_visual_score += 1
|
self._final_char_visual_score += 1
|
||||||
else:
|
else:
|
||||||
# Not standing on a space
|
# Not standing on a space
|
||||||
if ((self._before_prev == ' ') and
|
if (
|
||||||
(self.is_final(self._prev)) and (cur != ' ')):
|
(self._before_prev == self.SPACE)
|
||||||
|
and (self.is_final(self._prev))
|
||||||
|
and (cur != self.SPACE)
|
||||||
|
):
|
||||||
# case (3) [-2:space][-1:final letter][cur:not space]
|
# case (3) [-2:space][-1:final letter][cur:not space]
|
||||||
self._final_char_visual_score += 1
|
self._final_char_visual_score += 1
|
||||||
self._before_prev = self._prev
|
self._before_prev = self._prev
|
||||||
|
@ -253,7 +269,10 @@ class HebrewProber(CharSetProber):
|
||||||
return ProbingState.DETECTING
|
return ProbingState.DETECTING
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
|
assert self._logical_prober is not None
|
||||||
|
assert self._visual_prober is not None
|
||||||
|
|
||||||
# Make the decision: is it Logical or Visual?
|
# Make the decision: is it Logical or Visual?
|
||||||
# If the final letter score distance is dominant enough, rely on it.
|
# If the final letter score distance is dominant enough, rely on it.
|
||||||
finalsub = self._final_char_logical_score - self._final_char_visual_score
|
finalsub = self._final_char_logical_score - self._final_char_visual_score
|
||||||
|
@ -263,8 +282,9 @@ class HebrewProber(CharSetProber):
|
||||||
return self.VISUAL_HEBREW_NAME
|
return self.VISUAL_HEBREW_NAME
|
||||||
|
|
||||||
# It's not dominant enough, try to rely on the model scores instead.
|
# It's not dominant enough, try to rely on the model scores instead.
|
||||||
modelsub = (self._logical_prober.get_confidence()
|
modelsub = (
|
||||||
- self._visual_prober.get_confidence())
|
self._logical_prober.get_confidence() - self._visual_prober.get_confidence()
|
||||||
|
)
|
||||||
if modelsub > self.MIN_MODEL_DISTANCE:
|
if modelsub > self.MIN_MODEL_DISTANCE:
|
||||||
return self.LOGICAL_HEBREW_NAME
|
return self.LOGICAL_HEBREW_NAME
|
||||||
if modelsub < -self.MIN_MODEL_DISTANCE:
|
if modelsub < -self.MIN_MODEL_DISTANCE:
|
||||||
|
@ -280,13 +300,17 @@ class HebrewProber(CharSetProber):
|
||||||
return self.LOGICAL_HEBREW_NAME
|
return self.LOGICAL_HEBREW_NAME
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return 'Hebrew'
|
return "Hebrew"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def state(self):
|
def state(self) -> ProbingState:
|
||||||
|
assert self._logical_prober is not None
|
||||||
|
assert self._visual_prober is not None
|
||||||
|
|
||||||
# Remain active as long as any of the model probers are active.
|
# Remain active as long as any of the model probers are active.
|
||||||
if (self._logical_prober.state == ProbingState.NOT_ME) and \
|
if (self._logical_prober.state == ProbingState.NOT_ME) and (
|
||||||
(self._visual_prober.state == ProbingState.NOT_ME):
|
self._visual_prober.state == ProbingState.NOT_ME
|
||||||
|
):
|
||||||
return ProbingState.NOT_ME
|
return ProbingState.NOT_ME
|
||||||
return ProbingState.DETECTING
|
return ProbingState.DETECTING
|
||||||
|
|
|
@ -46,6 +46,7 @@ JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
|
||||||
# Char to FreqOrder table ,
|
# Char to FreqOrder table ,
|
||||||
JIS_TABLE_SIZE = 4368
|
JIS_TABLE_SIZE = 4368
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
JIS_CHAR_TO_FREQ_ORDER = (
|
JIS_CHAR_TO_FREQ_ORDER = (
|
||||||
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
|
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
|
||||||
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
|
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
|
||||||
|
@ -321,5 +322,4 @@ JIS_CHAR_TO_FREQ_ORDER = (
|
||||||
1444,1698,2385,2251,3729,1365,2281,2235,1717,6188, 864,3841,2515, 444, 527,2767, # 4352
|
1444,1698,2385,2251,3729,1365,2281,2235,1717,6188, 864,3841,2515, 444, 527,2767, # 4352
|
||||||
2922,3625, 544, 461,6189, 566, 209,2437,3398,2098,1065,2068,3331,3626,3257,2137, # 4368 #last 512
|
2922,3625, 544, 461,6189, 566, 209,2437,3398,2098,1065,2068,3331,3626,3257,2137, # 4368 #last 512
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
|
2382
lib/chardet/johabfreq.py
Normal file
2382
lib/chardet/johabfreq.py
Normal file
File diff suppressed because it is too large
Load diff
47
lib/chardet/johabprober.py
Normal file
47
lib/chardet/johabprober.py
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
######################## BEGIN LICENSE BLOCK ########################
|
||||||
|
# The Original Code is mozilla.org code.
|
||||||
|
#
|
||||||
|
# The Initial Developer of the Original Code is
|
||||||
|
# Netscape Communications Corporation.
|
||||||
|
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||||
|
# the Initial Developer. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Contributor(s):
|
||||||
|
# Mark Pilgrim - port to Python
|
||||||
|
#
|
||||||
|
# This library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with this library; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||||
|
# 02110-1301 USA
|
||||||
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from .chardistribution import JOHABDistributionAnalysis
|
||||||
|
from .codingstatemachine import CodingStateMachine
|
||||||
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
|
from .mbcssm import JOHAB_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
|
class JOHABProber(MultiByteCharSetProber):
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL)
|
||||||
|
self.distribution_analyzer = JOHABDistributionAnalysis()
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def charset_name(self) -> str:
|
||||||
|
return "Johab"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self) -> str:
|
||||||
|
return "Korean"
|
|
@ -25,110 +25,114 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import List, Tuple, Union
|
||||||
|
|
||||||
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||||
jp2CharContext = (
|
# fmt: off
|
||||||
(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1),
|
jp2_char_context = (
|
||||||
(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4),
|
(0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),
|
||||||
(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2),
|
(2, 4, 0, 4, 0, 3, 0, 4, 0, 3, 4, 4, 4, 2, 4, 3, 3, 4, 3, 2, 3, 3, 4, 2, 3, 3, 3, 2, 4, 1, 4, 3, 3, 1, 5, 4, 3, 4, 3, 4, 3, 5, 3, 0, 3, 5, 4, 2, 0, 3, 1, 0, 3, 3, 0, 3, 3, 0, 1, 1, 0, 4, 3, 0, 3, 3, 0, 4, 0, 2, 0, 3, 5, 5, 5, 5, 4, 0, 4, 1, 0, 3, 4),
|
||||||
(0,4,0,5,0,5,0,4,0,4,5,4,4,3,5,3,5,1,5,3,4,3,4,4,3,4,3,3,4,3,5,4,4,3,5,5,3,5,5,5,3,5,5,3,4,5,5,3,1,3,2,0,3,4,0,4,2,0,4,2,1,5,3,2,3,5,0,4,0,2,0,5,4,4,5,4,5,0,4,0,0,4,4),
|
(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2),
|
||||||
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
|
(0, 4, 0, 5, 0, 5, 0, 4, 0, 4, 5, 4, 4, 3, 5, 3, 5, 1, 5, 3, 4, 3, 4, 4, 3, 4, 3, 3, 4, 3, 5, 4, 4, 3, 5, 5, 3, 5, 5, 5, 3, 5, 5, 3, 4, 5, 5, 3, 1, 3, 2, 0, 3, 4, 0, 4, 2, 0, 4, 2, 1, 5, 3, 2, 3, 5, 0, 4, 0, 2, 0, 5, 4, 4, 5, 4, 5, 0, 4, 0, 0, 4, 4),
|
||||||
(0,3,0,4,0,3,0,3,0,4,5,4,3,3,3,3,4,3,5,4,4,3,5,4,4,3,4,3,4,4,4,4,5,3,4,4,3,4,5,5,4,5,5,1,4,5,4,3,0,3,3,1,3,3,0,4,4,0,3,3,1,5,3,3,3,5,0,4,0,3,0,4,4,3,4,3,3,0,4,1,1,3,4),
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
||||||
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
|
(0, 3, 0, 4, 0, 3, 0, 3, 0, 4, 5, 4, 3, 3, 3, 3, 4, 3, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 4, 4, 4, 4, 5, 3, 4, 4, 3, 4, 5, 5, 4, 5, 5, 1, 4, 5, 4, 3, 0, 3, 3, 1, 3, 3, 0, 4, 4, 0, 3, 3, 1, 5, 3, 3, 3, 5, 0, 4, 0, 3, 0, 4, 4, 3, 4, 3, 3, 0, 4, 1, 1, 3, 4),
|
||||||
(0,4,0,3,0,3,0,4,0,3,4,4,3,2,2,1,2,1,3,1,3,3,3,3,3,4,3,1,3,3,5,3,3,0,4,3,0,5,4,3,3,5,4,4,3,4,4,5,0,1,2,0,1,2,0,2,2,0,1,0,0,5,2,2,1,4,0,3,0,1,0,4,4,3,5,4,3,0,2,1,0,4,3),
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
||||||
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
|
(0, 4, 0, 3, 0, 3, 0, 4, 0, 3, 4, 4, 3, 2, 2, 1, 2, 1, 3, 1, 3, 3, 3, 3, 3, 4, 3, 1, 3, 3, 5, 3, 3, 0, 4, 3, 0, 5, 4, 3, 3, 5, 4, 4, 3, 4, 4, 5, 0, 1, 2, 0, 1, 2, 0, 2, 2, 0, 1, 0, 0, 5, 2, 2, 1, 4, 0, 3, 0, 1, 0, 4, 4, 3, 5, 4, 3, 0, 2, 1, 0, 4, 3),
|
||||||
(0,3,0,5,0,4,0,2,1,4,4,2,4,1,4,2,4,2,4,3,3,3,4,3,3,3,3,1,4,2,3,3,3,1,4,4,1,1,1,4,3,3,2,0,2,4,3,2,0,3,3,0,3,1,1,0,0,0,3,3,0,4,2,2,3,4,0,4,0,3,0,4,4,5,3,4,4,0,3,0,0,1,4),
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
||||||
(1,4,0,4,0,4,0,4,0,3,5,4,4,3,4,3,5,4,3,3,4,3,5,4,4,4,4,3,4,2,4,3,3,1,5,4,3,2,4,5,4,5,5,4,4,5,4,4,0,3,2,2,3,3,0,4,3,1,3,2,1,4,3,3,4,5,0,3,0,2,0,4,5,5,4,5,4,0,4,0,0,5,4),
|
(0, 3, 0, 5, 0, 4, 0, 2, 1, 4, 4, 2, 4, 1, 4, 2, 4, 2, 4, 3, 3, 3, 4, 3, 3, 3, 3, 1, 4, 2, 3, 3, 3, 1, 4, 4, 1, 1, 1, 4, 3, 3, 2, 0, 2, 4, 3, 2, 0, 3, 3, 0, 3, 1, 1, 0, 0, 0, 3, 3, 0, 4, 2, 2, 3, 4, 0, 4, 0, 3, 0, 4, 4, 5, 3, 4, 4, 0, 3, 0, 0, 1, 4),
|
||||||
(0,5,0,5,0,4,0,3,0,4,4,3,4,3,3,3,4,0,4,4,4,3,4,3,4,3,3,1,4,2,4,3,4,0,5,4,1,4,5,4,4,5,3,2,4,3,4,3,2,4,1,3,3,3,2,3,2,0,4,3,3,4,3,3,3,4,0,4,0,3,0,4,5,4,4,4,3,0,4,1,0,1,3),
|
(1, 4, 0, 4, 0, 4, 0, 4, 0, 3, 5, 4, 4, 3, 4, 3, 5, 4, 3, 3, 4, 3, 5, 4, 4, 4, 4, 3, 4, 2, 4, 3, 3, 1, 5, 4, 3, 2, 4, 5, 4, 5, 5, 4, 4, 5, 4, 4, 0, 3, 2, 2, 3, 3, 0, 4, 3, 1, 3, 2, 1, 4, 3, 3, 4, 5, 0, 3, 0, 2, 0, 4, 5, 5, 4, 5, 4, 0, 4, 0, 0, 5, 4),
|
||||||
(0,3,1,4,0,3,0,2,0,3,4,4,3,1,4,2,3,3,4,3,4,3,4,3,4,4,3,2,3,1,5,4,4,1,4,4,3,5,4,4,3,5,5,4,3,4,4,3,1,2,3,1,2,2,0,3,2,0,3,1,0,5,3,3,3,4,3,3,3,3,4,4,4,4,5,4,2,0,3,3,2,4,3),
|
(0, 5, 0, 5, 0, 4, 0, 3, 0, 4, 4, 3, 4, 3, 3, 3, 4, 0, 4, 4, 4, 3, 4, 3, 4, 3, 3, 1, 4, 2, 4, 3, 4, 0, 5, 4, 1, 4, 5, 4, 4, 5, 3, 2, 4, 3, 4, 3, 2, 4, 1, 3, 3, 3, 2, 3, 2, 0, 4, 3, 3, 4, 3, 3, 3, 4, 0, 4, 0, 3, 0, 4, 5, 4, 4, 4, 3, 0, 4, 1, 0, 1, 3),
|
||||||
(0,2,0,3,0,1,0,1,0,0,3,2,0,0,2,0,1,0,2,1,3,3,3,1,2,3,1,0,1,0,4,2,1,1,3,3,0,4,3,3,1,4,3,3,0,3,3,2,0,0,0,0,1,0,0,2,0,0,0,0,0,4,1,0,2,3,2,2,2,1,3,3,3,4,4,3,2,0,3,1,0,3,3),
|
(0, 3, 1, 4, 0, 3, 0, 2, 0, 3, 4, 4, 3, 1, 4, 2, 3, 3, 4, 3, 4, 3, 4, 3, 4, 4, 3, 2, 3, 1, 5, 4, 4, 1, 4, 4, 3, 5, 4, 4, 3, 5, 5, 4, 3, 4, 4, 3, 1, 2, 3, 1, 2, 2, 0, 3, 2, 0, 3, 1, 0, 5, 3, 3, 3, 4, 3, 3, 3, 3, 4, 4, 4, 4, 5, 4, 2, 0, 3, 3, 2, 4, 3),
|
||||||
(0,4,0,4,0,3,0,3,0,4,4,4,3,3,3,3,3,3,4,3,4,2,4,3,4,3,3,2,4,3,4,5,4,1,4,5,3,5,4,5,3,5,4,0,3,5,5,3,1,3,3,2,2,3,0,3,4,1,3,3,2,4,3,3,3,4,0,4,0,3,0,4,5,4,4,5,3,0,4,1,0,3,4),
|
(0, 2, 0, 3, 0, 1, 0, 1, 0, 0, 3, 2, 0, 0, 2, 0, 1, 0, 2, 1, 3, 3, 3, 1, 2, 3, 1, 0, 1, 0, 4, 2, 1, 1, 3, 3, 0, 4, 3, 3, 1, 4, 3, 3, 0, 3, 3, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 4, 1, 0, 2, 3, 2, 2, 2, 1, 3, 3, 3, 4, 4, 3, 2, 0, 3, 1, 0, 3, 3),
|
||||||
(0,2,0,3,0,3,0,0,0,2,2,2,1,0,1,0,0,0,3,0,3,0,3,0,1,3,1,0,3,1,3,3,3,1,3,3,3,0,1,3,1,3,4,0,0,3,1,1,0,3,2,0,0,0,0,1,3,0,1,0,0,3,3,2,0,3,0,0,0,0,0,3,4,3,4,3,3,0,3,0,0,2,3),
|
(0, 4, 0, 4, 0, 3, 0, 3, 0, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 3, 4, 2, 4, 3, 4, 3, 3, 2, 4, 3, 4, 5, 4, 1, 4, 5, 3, 5, 4, 5, 3, 5, 4, 0, 3, 5, 5, 3, 1, 3, 3, 2, 2, 3, 0, 3, 4, 1, 3, 3, 2, 4, 3, 3, 3, 4, 0, 4, 0, 3, 0, 4, 5, 4, 4, 5, 3, 0, 4, 1, 0, 3, 4),
|
||||||
(2,3,0,3,0,2,0,1,0,3,3,4,3,1,3,1,1,1,3,1,4,3,4,3,3,3,0,0,3,1,5,4,3,1,4,3,2,5,5,4,4,4,4,3,3,4,4,4,0,2,1,1,3,2,0,1,2,0,0,1,0,4,1,3,3,3,0,3,0,1,0,4,4,4,5,5,3,0,2,0,0,4,4),
|
(0, 2, 0, 3, 0, 3, 0, 0, 0, 2, 2, 2, 1, 0, 1, 0, 0, 0, 3, 0, 3, 0, 3, 0, 1, 3, 1, 0, 3, 1, 3, 3, 3, 1, 3, 3, 3, 0, 1, 3, 1, 3, 4, 0, 0, 3, 1, 1, 0, 3, 2, 0, 0, 0, 0, 1, 3, 0, 1, 0, 0, 3, 3, 2, 0, 3, 0, 0, 0, 0, 0, 3, 4, 3, 4, 3, 3, 0, 3, 0, 0, 2, 3),
|
||||||
(0,2,0,1,0,3,1,3,0,2,3,3,3,0,3,1,0,0,3,0,3,2,3,1,3,2,1,1,0,0,4,2,1,0,2,3,1,4,3,2,0,4,4,3,1,3,1,3,0,1,0,0,1,0,0,0,1,0,0,0,0,4,1,1,1,2,0,3,0,0,0,3,4,2,4,3,2,0,1,0,0,3,3),
|
(2, 3, 0, 3, 0, 2, 0, 1, 0, 3, 3, 4, 3, 1, 3, 1, 1, 1, 3, 1, 4, 3, 4, 3, 3, 3, 0, 0, 3, 1, 5, 4, 3, 1, 4, 3, 2, 5, 5, 4, 4, 4, 4, 3, 3, 4, 4, 4, 0, 2, 1, 1, 3, 2, 0, 1, 2, 0, 0, 1, 0, 4, 1, 3, 3, 3, 0, 3, 0, 1, 0, 4, 4, 4, 5, 5, 3, 0, 2, 0, 0, 4, 4),
|
||||||
(0,1,0,4,0,5,0,4,0,2,4,4,2,3,3,2,3,3,5,3,3,3,4,3,4,2,3,0,4,3,3,3,4,1,4,3,2,1,5,5,3,4,5,1,3,5,4,2,0,3,3,0,1,3,0,4,2,0,1,3,1,4,3,3,3,3,0,3,0,1,0,3,4,4,4,5,5,0,3,0,1,4,5),
|
(0, 2, 0, 1, 0, 3, 1, 3, 0, 2, 3, 3, 3, 0, 3, 1, 0, 0, 3, 0, 3, 2, 3, 1, 3, 2, 1, 1, 0, 0, 4, 2, 1, 0, 2, 3, 1, 4, 3, 2, 0, 4, 4, 3, 1, 3, 1, 3, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 1, 1, 1, 2, 0, 3, 0, 0, 0, 3, 4, 2, 4, 3, 2, 0, 1, 0, 0, 3, 3),
|
||||||
(0,2,0,3,0,3,0,0,0,2,3,1,3,0,4,0,1,1,3,0,3,4,3,2,3,1,0,3,3,2,3,1,3,0,2,3,0,2,1,4,1,2,2,0,0,3,3,0,0,2,0,0,0,1,0,0,0,0,2,2,0,3,2,1,3,3,0,2,0,2,0,0,3,3,1,2,4,0,3,0,2,2,3),
|
(0, 1, 0, 4, 0, 5, 0, 4, 0, 2, 4, 4, 2, 3, 3, 2, 3, 3, 5, 3, 3, 3, 4, 3, 4, 2, 3, 0, 4, 3, 3, 3, 4, 1, 4, 3, 2, 1, 5, 5, 3, 4, 5, 1, 3, 5, 4, 2, 0, 3, 3, 0, 1, 3, 0, 4, 2, 0, 1, 3, 1, 4, 3, 3, 3, 3, 0, 3, 0, 1, 0, 3, 4, 4, 4, 5, 5, 0, 3, 0, 1, 4, 5),
|
||||||
(2,4,0,5,0,4,0,4,0,2,4,4,4,3,4,3,3,3,1,2,4,3,4,3,4,4,5,0,3,3,3,3,2,0,4,3,1,4,3,4,1,4,4,3,3,4,4,3,1,2,3,0,4,2,0,4,1,0,3,3,0,4,3,3,3,4,0,4,0,2,0,3,5,3,4,5,2,0,3,0,0,4,5),
|
(0, 2, 0, 3, 0, 3, 0, 0, 0, 2, 3, 1, 3, 0, 4, 0, 1, 1, 3, 0, 3, 4, 3, 2, 3, 1, 0, 3, 3, 2, 3, 1, 3, 0, 2, 3, 0, 2, 1, 4, 1, 2, 2, 0, 0, 3, 3, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 3, 2, 1, 3, 3, 0, 2, 0, 2, 0, 0, 3, 3, 1, 2, 4, 0, 3, 0, 2, 2, 3),
|
||||||
(0,3,0,4,0,1,0,1,0,1,3,2,2,1,3,0,3,0,2,0,2,0,3,0,2,0,0,0,1,0,1,1,0,0,3,1,0,0,0,4,0,3,1,0,2,1,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,2,2,3,1,0,3,0,0,0,1,4,4,4,3,0,0,4,0,0,1,4),
|
(2, 4, 0, 5, 0, 4, 0, 4, 0, 2, 4, 4, 4, 3, 4, 3, 3, 3, 1, 2, 4, 3, 4, 3, 4, 4, 5, 0, 3, 3, 3, 3, 2, 0, 4, 3, 1, 4, 3, 4, 1, 4, 4, 3, 3, 4, 4, 3, 1, 2, 3, 0, 4, 2, 0, 4, 1, 0, 3, 3, 0, 4, 3, 3, 3, 4, 0, 4, 0, 2, 0, 3, 5, 3, 4, 5, 2, 0, 3, 0, 0, 4, 5),
|
||||||
(1,4,1,5,0,3,0,3,0,4,5,4,4,3,5,3,3,4,4,3,4,1,3,3,3,3,2,1,4,1,5,4,3,1,4,4,3,5,4,4,3,5,4,3,3,4,4,4,0,3,3,1,2,3,0,3,1,0,3,3,0,5,4,4,4,4,4,4,3,3,5,4,4,3,3,5,4,0,3,2,0,4,4),
|
(0, 3, 0, 4, 0, 1, 0, 1, 0, 1, 3, 2, 2, 1, 3, 0, 3, 0, 2, 0, 2, 0, 3, 0, 2, 0, 0, 0, 1, 0, 1, 1, 0, 0, 3, 1, 0, 0, 0, 4, 0, 3, 1, 0, 2, 1, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 3, 1, 0, 3, 0, 0, 0, 1, 4, 4, 4, 3, 0, 0, 4, 0, 0, 1, 4),
|
||||||
(0,2,0,3,0,1,0,0,0,1,3,3,3,2,4,1,3,0,3,1,3,0,2,2,1,1,0,0,2,0,4,3,1,0,4,3,0,4,4,4,1,4,3,1,1,3,3,1,0,2,0,0,1,3,0,0,0,0,2,0,0,4,3,2,4,3,5,4,3,3,3,4,3,3,4,3,3,0,2,1,0,3,3),
|
(1, 4, 1, 5, 0, 3, 0, 3, 0, 4, 5, 4, 4, 3, 5, 3, 3, 4, 4, 3, 4, 1, 3, 3, 3, 3, 2, 1, 4, 1, 5, 4, 3, 1, 4, 4, 3, 5, 4, 4, 3, 5, 4, 3, 3, 4, 4, 4, 0, 3, 3, 1, 2, 3, 0, 3, 1, 0, 3, 3, 0, 5, 4, 4, 4, 4, 4, 4, 3, 3, 5, 4, 4, 3, 3, 5, 4, 0, 3, 2, 0, 4, 4),
|
||||||
(0,2,0,4,0,3,0,2,0,2,5,5,3,4,4,4,4,1,4,3,3,0,4,3,4,3,1,3,3,2,4,3,0,3,4,3,0,3,4,4,2,4,4,0,4,5,3,3,2,2,1,1,1,2,0,1,5,0,3,3,2,4,3,3,3,4,0,3,0,2,0,4,4,3,5,5,0,0,3,0,2,3,3),
|
(0, 2, 0, 3, 0, 1, 0, 0, 0, 1, 3, 3, 3, 2, 4, 1, 3, 0, 3, 1, 3, 0, 2, 2, 1, 1, 0, 0, 2, 0, 4, 3, 1, 0, 4, 3, 0, 4, 4, 4, 1, 4, 3, 1, 1, 3, 3, 1, 0, 2, 0, 0, 1, 3, 0, 0, 0, 0, 2, 0, 0, 4, 3, 2, 4, 3, 5, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 0, 2, 1, 0, 3, 3),
|
||||||
(0,3,0,4,0,3,0,1,0,3,4,3,3,1,3,3,3,0,3,1,3,0,4,3,3,1,1,0,3,0,3,3,0,0,4,4,0,1,5,4,3,3,5,0,3,3,4,3,0,2,0,1,1,1,0,1,3,0,1,2,1,3,3,2,3,3,0,3,0,1,0,1,3,3,4,4,1,0,1,2,2,1,3),
|
(0, 2, 0, 4, 0, 3, 0, 2, 0, 2, 5, 5, 3, 4, 4, 4, 4, 1, 4, 3, 3, 0, 4, 3, 4, 3, 1, 3, 3, 2, 4, 3, 0, 3, 4, 3, 0, 3, 4, 4, 2, 4, 4, 0, 4, 5, 3, 3, 2, 2, 1, 1, 1, 2, 0, 1, 5, 0, 3, 3, 2, 4, 3, 3, 3, 4, 0, 3, 0, 2, 0, 4, 4, 3, 5, 5, 0, 0, 3, 0, 2, 3, 3),
|
||||||
(0,1,0,4,0,4,0,3,0,1,3,3,3,2,3,1,1,0,3,0,3,3,4,3,2,4,2,0,1,0,4,3,2,0,4,3,0,5,3,3,2,4,4,4,3,3,3,4,0,1,3,0,0,1,0,0,1,0,0,0,0,4,2,3,3,3,0,3,0,0,0,4,4,4,5,3,2,0,3,3,0,3,5),
|
(0, 3, 0, 4, 0, 3, 0, 1, 0, 3, 4, 3, 3, 1, 3, 3, 3, 0, 3, 1, 3, 0, 4, 3, 3, 1, 1, 0, 3, 0, 3, 3, 0, 0, 4, 4, 0, 1, 5, 4, 3, 3, 5, 0, 3, 3, 4, 3, 0, 2, 0, 1, 1, 1, 0, 1, 3, 0, 1, 2, 1, 3, 3, 2, 3, 3, 0, 3, 0, 1, 0, 1, 3, 3, 4, 4, 1, 0, 1, 2, 2, 1, 3),
|
||||||
(0,2,0,3,0,0,0,3,0,1,3,0,2,0,0,0,1,0,3,1,1,3,3,0,0,3,0,0,3,0,2,3,1,0,3,1,0,3,3,2,0,4,2,2,0,2,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,2,1,2,0,1,0,1,0,0,0,1,3,1,2,0,0,0,1,0,0,1,4),
|
(0, 1, 0, 4, 0, 4, 0, 3, 0, 1, 3, 3, 3, 2, 3, 1, 1, 0, 3, 0, 3, 3, 4, 3, 2, 4, 2, 0, 1, 0, 4, 3, 2, 0, 4, 3, 0, 5, 3, 3, 2, 4, 4, 4, 3, 3, 3, 4, 0, 1, 3, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 4, 2, 3, 3, 3, 0, 3, 0, 0, 0, 4, 4, 4, 5, 3, 2, 0, 3, 3, 0, 3, 5),
|
||||||
(0,3,0,3,0,5,0,1,0,2,4,3,1,3,3,2,1,1,5,2,1,0,5,1,2,0,0,0,3,3,2,2,3,2,4,3,0,0,3,3,1,3,3,0,2,5,3,4,0,3,3,0,1,2,0,2,2,0,3,2,0,2,2,3,3,3,0,2,0,1,0,3,4,4,2,5,4,0,3,0,0,3,5),
|
(0, 2, 0, 3, 0, 0, 0, 3, 0, 1, 3, 0, 2, 0, 0, 0, 1, 0, 3, 1, 1, 3, 3, 0, 0, 3, 0, 0, 3, 0, 2, 3, 1, 0, 3, 1, 0, 3, 3, 2, 0, 4, 2, 2, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 0, 1, 0, 1, 0, 0, 0, 1, 3, 1, 2, 0, 0, 0, 1, 0, 0, 1, 4),
|
||||||
(0,3,0,3,0,3,0,1,0,3,3,3,3,0,3,0,2,0,2,1,1,0,2,0,1,0,0,0,2,1,0,0,1,0,3,2,0,0,3,3,1,2,3,1,0,3,3,0,0,1,0,0,0,0,0,2,0,0,0,0,0,2,3,1,2,3,0,3,0,1,0,3,2,1,0,4,3,0,1,1,0,3,3),
|
(0, 3, 0, 3, 0, 5, 0, 1, 0, 2, 4, 3, 1, 3, 3, 2, 1, 1, 5, 2, 1, 0, 5, 1, 2, 0, 0, 0, 3, 3, 2, 2, 3, 2, 4, 3, 0, 0, 3, 3, 1, 3, 3, 0, 2, 5, 3, 4, 0, 3, 3, 0, 1, 2, 0, 2, 2, 0, 3, 2, 0, 2, 2, 3, 3, 3, 0, 2, 0, 1, 0, 3, 4, 4, 2, 5, 4, 0, 3, 0, 0, 3, 5),
|
||||||
(0,4,0,5,0,3,0,3,0,4,5,5,4,3,5,3,4,3,5,3,3,2,5,3,4,4,4,3,4,3,4,5,5,3,4,4,3,4,4,5,4,4,4,3,4,5,5,4,2,3,4,2,3,4,0,3,3,1,4,3,2,4,3,3,5,5,0,3,0,3,0,5,5,5,5,4,4,0,4,0,1,4,4),
|
(0, 3, 0, 3, 0, 3, 0, 1, 0, 3, 3, 3, 3, 0, 3, 0, 2, 0, 2, 1, 1, 0, 2, 0, 1, 0, 0, 0, 2, 1, 0, 0, 1, 0, 3, 2, 0, 0, 3, 3, 1, 2, 3, 1, 0, 3, 3, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 3, 1, 2, 3, 0, 3, 0, 1, 0, 3, 2, 1, 0, 4, 3, 0, 1, 1, 0, 3, 3),
|
||||||
(0,4,0,4,0,3,0,3,0,3,5,4,4,2,3,2,5,1,3,2,5,1,4,2,3,2,3,3,4,3,3,3,3,2,5,4,1,3,3,5,3,4,4,0,4,4,3,1,1,3,1,0,2,3,0,2,3,0,3,0,0,4,3,1,3,4,0,3,0,2,0,4,4,4,3,4,5,0,4,0,0,3,4),
|
(0, 4, 0, 5, 0, 3, 0, 3, 0, 4, 5, 5, 4, 3, 5, 3, 4, 3, 5, 3, 3, 2, 5, 3, 4, 4, 4, 3, 4, 3, 4, 5, 5, 3, 4, 4, 3, 4, 4, 5, 4, 4, 4, 3, 4, 5, 5, 4, 2, 3, 4, 2, 3, 4, 0, 3, 3, 1, 4, 3, 2, 4, 3, 3, 5, 5, 0, 3, 0, 3, 0, 5, 5, 5, 5, 4, 4, 0, 4, 0, 1, 4, 4),
|
||||||
(0,3,0,3,0,3,1,2,0,3,4,4,3,3,3,0,2,2,4,3,3,1,3,3,3,1,1,0,3,1,4,3,2,3,4,4,2,4,4,4,3,4,4,3,2,4,4,3,1,3,3,1,3,3,0,4,1,0,2,2,1,4,3,2,3,3,5,4,3,3,5,4,4,3,3,0,4,0,3,2,2,4,4),
|
(0, 4, 0, 4, 0, 3, 0, 3, 0, 3, 5, 4, 4, 2, 3, 2, 5, 1, 3, 2, 5, 1, 4, 2, 3, 2, 3, 3, 4, 3, 3, 3, 3, 2, 5, 4, 1, 3, 3, 5, 3, 4, 4, 0, 4, 4, 3, 1, 1, 3, 1, 0, 2, 3, 0, 2, 3, 0, 3, 0, 0, 4, 3, 1, 3, 4, 0, 3, 0, 2, 0, 4, 4, 4, 3, 4, 5, 0, 4, 0, 0, 3, 4),
|
||||||
(0,2,0,1,0,0,0,0,0,1,2,1,3,0,0,0,0,0,2,0,1,2,1,0,0,1,0,0,0,0,3,0,0,1,0,1,1,3,1,0,0,0,1,1,0,1,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,1,2,2,0,3,4,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1),
|
(0, 3, 0, 3, 0, 3, 1, 2, 0, 3, 4, 4, 3, 3, 3, 0, 2, 2, 4, 3, 3, 1, 3, 3, 3, 1, 1, 0, 3, 1, 4, 3, 2, 3, 4, 4, 2, 4, 4, 4, 3, 4, 4, 3, 2, 4, 4, 3, 1, 3, 3, 1, 3, 3, 0, 4, 1, 0, 2, 2, 1, 4, 3, 2, 3, 3, 5, 4, 3, 3, 5, 4, 4, 3, 3, 0, 4, 0, 3, 2, 2, 4, 4),
|
||||||
(0,1,0,0,0,1,0,0,0,0,4,0,4,1,4,0,3,0,4,0,3,0,4,0,3,0,3,0,4,1,5,1,4,0,0,3,0,5,0,5,2,0,1,0,0,0,2,1,4,0,1,3,0,0,3,0,0,3,1,1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0),
|
(0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 2, 1, 3, 0, 0, 0, 0, 0, 2, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 0, 3, 0, 0, 1, 0, 1, 1, 3, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0, 3, 4, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1),
|
||||||
(1,4,0,5,0,3,0,2,0,3,5,4,4,3,4,3,5,3,4,3,3,0,4,3,3,3,3,3,3,2,4,4,3,1,3,4,4,5,4,4,3,4,4,1,3,5,4,3,3,3,1,2,2,3,3,1,3,1,3,3,3,5,3,3,4,5,0,3,0,3,0,3,4,3,4,4,3,0,3,0,2,4,3),
|
(0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 4, 1, 4, 0, 3, 0, 4, 0, 3, 0, 4, 0, 3, 0, 3, 0, 4, 1, 5, 1, 4, 0, 0, 3, 0, 5, 0, 5, 2, 0, 1, 0, 0, 0, 2, 1, 4, 0, 1, 3, 0, 0, 3, 0, 0, 3, 1, 1, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0),
|
||||||
(0,1,0,4,0,0,0,0,0,1,4,0,4,1,4,2,4,0,3,0,1,0,1,0,0,0,0,0,2,0,3,1,1,1,0,3,0,0,0,1,2,1,0,0,1,1,1,1,0,1,0,0,0,1,0,0,3,0,0,0,0,3,2,0,2,2,0,1,0,0,0,2,3,2,3,3,0,0,0,0,2,1,0),
|
(1, 4, 0, 5, 0, 3, 0, 2, 0, 3, 5, 4, 4, 3, 4, 3, 5, 3, 4, 3, 3, 0, 4, 3, 3, 3, 3, 3, 3, 2, 4, 4, 3, 1, 3, 4, 4, 5, 4, 4, 3, 4, 4, 1, 3, 5, 4, 3, 3, 3, 1, 2, 2, 3, 3, 1, 3, 1, 3, 3, 3, 5, 3, 3, 4, 5, 0, 3, 0, 3, 0, 3, 4, 3, 4, 4, 3, 0, 3, 0, 2, 4, 3),
|
||||||
(0,5,1,5,0,3,0,3,0,5,4,4,5,1,5,3,3,0,4,3,4,3,5,3,4,3,3,2,4,3,4,3,3,0,3,3,1,4,4,3,4,4,4,3,4,5,5,3,2,3,1,1,3,3,1,3,1,1,3,3,2,4,5,3,3,5,0,4,0,3,0,4,4,3,5,3,3,0,3,4,0,4,3),
|
(0, 1, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 4, 1, 4, 2, 4, 0, 3, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0, 3, 1, 1, 1, 0, 3, 0, 0, 0, 1, 2, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 3, 2, 0, 2, 2, 0, 1, 0, 0, 0, 2, 3, 2, 3, 3, 0, 0, 0, 0, 2, 1, 0),
|
||||||
(0,5,0,5,0,3,0,2,0,4,4,3,5,2,4,3,3,3,4,4,4,3,5,3,5,3,3,1,4,0,4,3,3,0,3,3,0,4,4,4,4,5,4,3,3,5,5,3,2,3,1,2,3,2,0,1,0,0,3,2,2,4,4,3,1,5,0,4,0,3,0,4,3,1,3,2,1,0,3,3,0,3,3),
|
(0, 5, 1, 5, 0, 3, 0, 3, 0, 5, 4, 4, 5, 1, 5, 3, 3, 0, 4, 3, 4, 3, 5, 3, 4, 3, 3, 2, 4, 3, 4, 3, 3, 0, 3, 3, 1, 4, 4, 3, 4, 4, 4, 3, 4, 5, 5, 3, 2, 3, 1, 1, 3, 3, 1, 3, 1, 1, 3, 3, 2, 4, 5, 3, 3, 5, 0, 4, 0, 3, 0, 4, 4, 3, 5, 3, 3, 0, 3, 4, 0, 4, 3),
|
||||||
(0,4,0,5,0,5,0,4,0,4,5,5,5,3,4,3,3,2,5,4,4,3,5,3,5,3,4,0,4,3,4,4,3,2,4,4,3,4,5,4,4,5,5,0,3,5,5,4,1,3,3,2,3,3,1,3,1,0,4,3,1,4,4,3,4,5,0,4,0,2,0,4,3,4,4,3,3,0,4,0,0,5,5),
|
(0, 5, 0, 5, 0, 3, 0, 2, 0, 4, 4, 3, 5, 2, 4, 3, 3, 3, 4, 4, 4, 3, 5, 3, 5, 3, 3, 1, 4, 0, 4, 3, 3, 0, 3, 3, 0, 4, 4, 4, 4, 5, 4, 3, 3, 5, 5, 3, 2, 3, 1, 2, 3, 2, 0, 1, 0, 0, 3, 2, 2, 4, 4, 3, 1, 5, 0, 4, 0, 3, 0, 4, 3, 1, 3, 2, 1, 0, 3, 3, 0, 3, 3),
|
||||||
(0,4,0,4,0,5,0,1,1,3,3,4,4,3,4,1,3,0,5,1,3,0,3,1,3,1,1,0,3,0,3,3,4,0,4,3,0,4,4,4,3,4,4,0,3,5,4,1,0,3,0,0,2,3,0,3,1,0,3,1,0,3,2,1,3,5,0,3,0,1,0,3,2,3,3,4,4,0,2,2,0,4,4),
|
(0, 4, 0, 5, 0, 5, 0, 4, 0, 4, 5, 5, 5, 3, 4, 3, 3, 2, 5, 4, 4, 3, 5, 3, 5, 3, 4, 0, 4, 3, 4, 4, 3, 2, 4, 4, 3, 4, 5, 4, 4, 5, 5, 0, 3, 5, 5, 4, 1, 3, 3, 2, 3, 3, 1, 3, 1, 0, 4, 3, 1, 4, 4, 3, 4, 5, 0, 4, 0, 2, 0, 4, 3, 4, 4, 3, 3, 0, 4, 0, 0, 5, 5),
|
||||||
(2,4,0,5,0,4,0,3,0,4,5,5,4,3,5,3,5,3,5,3,5,2,5,3,4,3,3,4,3,4,5,3,2,1,5,4,3,2,3,4,5,3,4,1,2,5,4,3,0,3,3,0,3,2,0,2,3,0,4,1,0,3,4,3,3,5,0,3,0,1,0,4,5,5,5,4,3,0,4,2,0,3,5),
|
(0, 4, 0, 4, 0, 5, 0, 1, 1, 3, 3, 4, 4, 3, 4, 1, 3, 0, 5, 1, 3, 0, 3, 1, 3, 1, 1, 0, 3, 0, 3, 3, 4, 0, 4, 3, 0, 4, 4, 4, 3, 4, 4, 0, 3, 5, 4, 1, 0, 3, 0, 0, 2, 3, 0, 3, 1, 0, 3, 1, 0, 3, 2, 1, 3, 5, 0, 3, 0, 1, 0, 3, 2, 3, 3, 4, 4, 0, 2, 2, 0, 4, 4),
|
||||||
(0,5,0,4,0,4,0,2,0,5,4,3,4,3,4,3,3,3,4,3,4,2,5,3,5,3,4,1,4,3,4,4,4,0,3,5,0,4,4,4,4,5,3,1,3,4,5,3,3,3,3,3,3,3,0,2,2,0,3,3,2,4,3,3,3,5,3,4,1,3,3,5,3,2,0,0,0,0,4,3,1,3,3),
|
(2, 4, 0, 5, 0, 4, 0, 3, 0, 4, 5, 5, 4, 3, 5, 3, 5, 3, 5, 3, 5, 2, 5, 3, 4, 3, 3, 4, 3, 4, 5, 3, 2, 1, 5, 4, 3, 2, 3, 4, 5, 3, 4, 1, 2, 5, 4, 3, 0, 3, 3, 0, 3, 2, 0, 2, 3, 0, 4, 1, 0, 3, 4, 3, 3, 5, 0, 3, 0, 1, 0, 4, 5, 5, 5, 4, 3, 0, 4, 2, 0, 3, 5),
|
||||||
(0,1,0,3,0,3,0,1,0,1,3,3,3,2,3,3,3,0,3,0,0,0,3,1,3,0,0,0,2,2,2,3,0,0,3,2,0,1,2,4,1,3,3,0,0,3,3,3,0,1,0,0,2,1,0,0,3,0,3,1,0,3,0,0,1,3,0,2,0,1,0,3,3,1,3,3,0,0,1,1,0,3,3),
|
(0, 5, 0, 4, 0, 4, 0, 2, 0, 5, 4, 3, 4, 3, 4, 3, 3, 3, 4, 3, 4, 2, 5, 3, 5, 3, 4, 1, 4, 3, 4, 4, 4, 0, 3, 5, 0, 4, 4, 4, 4, 5, 3, 1, 3, 4, 5, 3, 3, 3, 3, 3, 3, 3, 0, 2, 2, 0, 3, 3, 2, 4, 3, 3, 3, 5, 3, 4, 1, 3, 3, 5, 3, 2, 0, 0, 0, 0, 4, 3, 1, 3, 3),
|
||||||
(0,2,0,3,0,2,1,4,0,2,2,3,1,1,3,1,1,0,2,0,3,1,2,3,1,3,0,0,1,0,4,3,2,3,3,3,1,4,2,3,3,3,3,1,0,3,1,4,0,1,1,0,1,2,0,1,1,0,1,1,0,3,1,3,2,2,0,1,0,0,0,2,3,3,3,1,0,0,0,0,0,2,3),
|
(0, 1, 0, 3, 0, 3, 0, 1, 0, 1, 3, 3, 3, 2, 3, 3, 3, 0, 3, 0, 0, 0, 3, 1, 3, 0, 0, 0, 2, 2, 2, 3, 0, 0, 3, 2, 0, 1, 2, 4, 1, 3, 3, 0, 0, 3, 3, 3, 0, 1, 0, 0, 2, 1, 0, 0, 3, 0, 3, 1, 0, 3, 0, 0, 1, 3, 0, 2, 0, 1, 0, 3, 3, 1, 3, 3, 0, 0, 1, 1, 0, 3, 3),
|
||||||
(0,5,0,4,0,5,0,2,0,4,5,5,3,3,4,3,3,1,5,4,4,2,4,4,4,3,4,2,4,3,5,5,4,3,3,4,3,3,5,5,4,5,5,1,3,4,5,3,1,4,3,1,3,3,0,3,3,1,4,3,1,4,5,3,3,5,0,4,0,3,0,5,3,3,1,4,3,0,4,0,1,5,3),
|
(0, 2, 0, 3, 0, 2, 1, 4, 0, 2, 2, 3, 1, 1, 3, 1, 1, 0, 2, 0, 3, 1, 2, 3, 1, 3, 0, 0, 1, 0, 4, 3, 2, 3, 3, 3, 1, 4, 2, 3, 3, 3, 3, 1, 0, 3, 1, 4, 0, 1, 1, 0, 1, 2, 0, 1, 1, 0, 1, 1, 0, 3, 1, 3, 2, 2, 0, 1, 0, 0, 0, 2, 3, 3, 3, 1, 0, 0, 0, 0, 0, 2, 3),
|
||||||
(0,5,0,5,0,4,0,2,0,4,4,3,4,3,3,3,3,3,5,4,4,4,4,4,4,5,3,3,5,2,4,4,4,3,4,4,3,3,4,4,5,5,3,3,4,3,4,3,3,4,3,3,3,3,1,2,2,1,4,3,3,5,4,4,3,4,0,4,0,3,0,4,4,4,4,4,1,0,4,2,0,2,4),
|
(0, 5, 0, 4, 0, 5, 0, 2, 0, 4, 5, 5, 3, 3, 4, 3, 3, 1, 5, 4, 4, 2, 4, 4, 4, 3, 4, 2, 4, 3, 5, 5, 4, 3, 3, 4, 3, 3, 5, 5, 4, 5, 5, 1, 3, 4, 5, 3, 1, 4, 3, 1, 3, 3, 0, 3, 3, 1, 4, 3, 1, 4, 5, 3, 3, 5, 0, 4, 0, 3, 0, 5, 3, 3, 1, 4, 3, 0, 4, 0, 1, 5, 3),
|
||||||
(0,4,0,4,0,3,0,1,0,3,5,2,3,0,3,0,2,1,4,2,3,3,4,1,4,3,3,2,4,1,3,3,3,0,3,3,0,0,3,3,3,5,3,3,3,3,3,2,0,2,0,0,2,0,0,2,0,0,1,0,0,3,1,2,2,3,0,3,0,2,0,4,4,3,3,4,1,0,3,0,0,2,4),
|
(0, 5, 0, 5, 0, 4, 0, 2, 0, 4, 4, 3, 4, 3, 3, 3, 3, 3, 5, 4, 4, 4, 4, 4, 4, 5, 3, 3, 5, 2, 4, 4, 4, 3, 4, 4, 3, 3, 4, 4, 5, 5, 3, 3, 4, 3, 4, 3, 3, 4, 3, 3, 3, 3, 1, 2, 2, 1, 4, 3, 3, 5, 4, 4, 3, 4, 0, 4, 0, 3, 0, 4, 4, 4, 4, 4, 1, 0, 4, 2, 0, 2, 4),
|
||||||
(0,0,0,4,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,1,0,2,0,1,0,0,0,0,0,3,1,3,0,3,2,0,0,0,1,0,3,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,4,0,2,0,0,0,0,0,0,2),
|
(0, 4, 0, 4, 0, 3, 0, 1, 0, 3, 5, 2, 3, 0, 3, 0, 2, 1, 4, 2, 3, 3, 4, 1, 4, 3, 3, 2, 4, 1, 3, 3, 3, 0, 3, 3, 0, 0, 3, 3, 3, 5, 3, 3, 3, 3, 3, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 1, 0, 0, 3, 1, 2, 2, 3, 0, 3, 0, 2, 0, 4, 4, 3, 3, 4, 1, 0, 3, 0, 0, 2, 4),
|
||||||
(0,2,1,3,0,2,0,2,0,3,3,3,3,1,3,1,3,3,3,3,3,3,4,2,2,1,2,1,4,0,4,3,1,3,3,3,2,4,3,5,4,3,3,3,3,3,3,3,0,1,3,0,2,0,0,1,0,0,1,0,0,4,2,0,2,3,0,3,3,0,3,3,4,2,3,1,4,0,1,2,0,2,3),
|
(0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 3, 1, 3, 0, 3, 2, 0, 0, 0, 1, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 2, 0, 0, 0, 0, 0, 0, 2),
|
||||||
(0,3,0,3,0,1,0,3,0,2,3,3,3,0,3,1,2,0,3,3,2,3,3,2,3,2,3,1,3,0,4,3,2,0,3,3,1,4,3,3,2,3,4,3,1,3,3,1,1,0,1,1,0,1,0,1,0,1,0,0,0,4,1,1,0,3,0,3,1,0,2,3,3,3,3,3,1,0,0,2,0,3,3),
|
(0, 2, 1, 3, 0, 2, 0, 2, 0, 3, 3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3, 4, 2, 2, 1, 2, 1, 4, 0, 4, 3, 1, 3, 3, 3, 2, 4, 3, 5, 4, 3, 3, 3, 3, 3, 3, 3, 0, 1, 3, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 2, 3, 0, 3, 3, 0, 3, 3, 4, 2, 3, 1, 4, 0, 1, 2, 0, 2, 3),
|
||||||
(0,0,0,0,0,0,0,0,0,0,3,0,2,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,3,0,3,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,2,3,0,0,0,0,0,0,0,0,3),
|
(0, 3, 0, 3, 0, 1, 0, 3, 0, 2, 3, 3, 3, 0, 3, 1, 2, 0, 3, 3, 2, 3, 3, 2, 3, 2, 3, 1, 3, 0, 4, 3, 2, 0, 3, 3, 1, 4, 3, 3, 2, 3, 4, 3, 1, 3, 3, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 4, 1, 1, 0, 3, 0, 3, 1, 0, 2, 3, 3, 3, 3, 3, 1, 0, 0, 2, 0, 3, 3),
|
||||||
(0,2,0,3,1,3,0,3,0,2,3,3,3,1,3,1,3,1,3,1,3,3,3,1,3,0,2,3,1,1,4,3,3,2,3,3,1,2,2,4,1,3,3,0,1,4,2,3,0,1,3,0,3,0,0,1,3,0,2,0,0,3,3,2,1,3,0,3,0,2,0,3,4,4,4,3,1,0,3,0,0,3,3),
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 3, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3),
|
||||||
(0,2,0,1,0,2,0,0,0,1,3,2,2,1,3,0,1,1,3,0,3,2,3,1,2,0,2,0,1,1,3,3,3,0,3,3,1,1,2,3,2,3,3,1,2,3,2,0,0,1,0,0,0,0,0,0,3,0,1,0,0,2,1,2,1,3,0,3,0,0,0,3,4,4,4,3,2,0,2,0,0,2,4),
|
(0, 2, 0, 3, 1, 3, 0, 3, 0, 2, 3, 3, 3, 1, 3, 1, 3, 1, 3, 1, 3, 3, 3, 1, 3, 0, 2, 3, 1, 1, 4, 3, 3, 2, 3, 3, 1, 2, 2, 4, 1, 3, 3, 0, 1, 4, 2, 3, 0, 1, 3, 0, 3, 0, 0, 1, 3, 0, 2, 0, 0, 3, 3, 2, 1, 3, 0, 3, 0, 2, 0, 3, 4, 4, 4, 3, 1, 0, 3, 0, 0, 3, 3),
|
||||||
(0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,0,0,3),
|
(0, 2, 0, 1, 0, 2, 0, 0, 0, 1, 3, 2, 2, 1, 3, 0, 1, 1, 3, 0, 3, 2, 3, 1, 2, 0, 2, 0, 1, 1, 3, 3, 3, 0, 3, 3, 1, 1, 2, 3, 2, 3, 3, 1, 2, 3, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 2, 1, 2, 1, 3, 0, 3, 0, 0, 0, 3, 4, 4, 4, 3, 2, 0, 2, 0, 0, 2, 4),
|
||||||
(0,3,0,3,0,2,0,3,0,3,3,3,2,3,2,2,2,0,3,1,3,3,3,2,3,3,0,0,3,0,3,2,2,0,2,3,1,4,3,4,3,3,2,3,1,5,4,4,0,3,1,2,1,3,0,3,1,1,2,0,2,3,1,3,1,3,0,3,0,1,0,3,3,4,4,2,1,0,2,1,0,2,4),
|
(0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 3, 1, 0, 0, 0, 0, 0, 0, 0, 3),
|
||||||
(0,1,0,3,0,1,0,2,0,1,4,2,5,1,4,0,2,0,2,1,3,1,4,0,2,1,0,0,2,1,4,1,1,0,3,3,0,5,1,3,2,3,3,1,0,3,2,3,0,1,0,0,0,0,0,0,1,0,0,0,0,4,0,1,0,3,0,2,0,1,0,3,3,3,4,3,3,0,0,0,0,2,3),
|
(0, 3, 0, 3, 0, 2, 0, 3, 0, 3, 3, 3, 2, 3, 2, 2, 2, 0, 3, 1, 3, 3, 3, 2, 3, 3, 0, 0, 3, 0, 3, 2, 2, 0, 2, 3, 1, 4, 3, 4, 3, 3, 2, 3, 1, 5, 4, 4, 0, 3, 1, 2, 1, 3, 0, 3, 1, 1, 2, 0, 2, 3, 1, 3, 1, 3, 0, 3, 0, 1, 0, 3, 3, 4, 4, 2, 1, 0, 2, 1, 0, 2, 4),
|
||||||
(0,0,0,1,0,0,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,1,0,0,0,0,0,3),
|
(0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 4, 2, 5, 1, 4, 0, 2, 0, 2, 1, 3, 1, 4, 0, 2, 1, 0, 0, 2, 1, 4, 1, 1, 0, 3, 3, 0, 5, 1, 3, 2, 3, 3, 1, 0, 3, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 1, 0, 3, 0, 2, 0, 1, 0, 3, 3, 3, 4, 3, 3, 0, 0, 0, 0, 2, 3),
|
||||||
(0,1,0,3,0,4,0,3,0,2,4,3,1,0,3,2,2,1,3,1,2,2,3,1,1,1,2,1,3,0,1,2,0,1,3,2,1,3,0,5,5,1,0,0,1,3,2,1,0,3,0,0,1,0,0,0,0,0,3,4,0,1,1,1,3,2,0,2,0,1,0,2,3,3,1,2,3,0,1,0,1,0,4),
|
(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 3),
|
||||||
(0,0,0,1,0,3,0,3,0,2,2,1,0,0,4,0,3,0,3,1,3,0,3,0,3,0,1,0,3,0,3,1,3,0,3,3,0,0,1,2,1,1,1,0,1,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,2,2,1,2,0,0,2,0,0,0,0,2,3,3,3,3,0,0,0,0,1,4),
|
(0, 1, 0, 3, 0, 4, 0, 3, 0, 2, 4, 3, 1, 0, 3, 2, 2, 1, 3, 1, 2, 2, 3, 1, 1, 1, 2, 1, 3, 0, 1, 2, 0, 1, 3, 2, 1, 3, 0, 5, 5, 1, 0, 0, 1, 3, 2, 1, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 3, 4, 0, 1, 1, 1, 3, 2, 0, 2, 0, 1, 0, 2, 3, 3, 1, 2, 3, 0, 1, 0, 1, 0, 4),
|
||||||
(0,0,0,3,0,3,0,0,0,0,3,1,1,0,3,0,1,0,2,0,1,0,0,0,0,0,0,0,1,0,3,0,2,0,2,3,0,0,2,2,3,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,2,3),
|
(0, 0, 0, 1, 0, 3, 0, 3, 0, 2, 2, 1, 0, 0, 4, 0, 3, 0, 3, 1, 3, 0, 3, 0, 3, 0, 1, 0, 3, 0, 3, 1, 3, 0, 3, 3, 0, 0, 1, 2, 1, 1, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 2, 0, 0, 2, 0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 0, 0, 0, 1, 4),
|
||||||
(2,4,0,5,0,5,0,4,0,3,4,3,3,3,4,3,3,3,4,3,4,4,5,4,5,5,5,2,3,0,5,5,4,1,5,4,3,1,5,4,3,4,4,3,3,4,3,3,0,3,2,0,2,3,0,3,0,0,3,3,0,5,3,2,3,3,0,3,0,3,0,3,4,5,4,5,3,0,4,3,0,3,4),
|
(0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 3, 1, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 2, 0, 2, 3, 0, 0, 2, 2, 3, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, 2, 3),
|
||||||
(0,3,0,3,0,3,0,3,0,3,3,4,3,2,3,2,3,0,4,3,3,3,3,3,3,3,3,0,3,2,4,3,3,1,3,4,3,4,4,4,3,4,4,3,2,4,4,1,0,2,0,0,1,1,0,2,0,0,3,1,0,5,3,2,1,3,0,3,0,1,2,4,3,2,4,3,3,0,3,2,0,4,4),
|
(2, 4, 0, 5, 0, 5, 0, 4, 0, 3, 4, 3, 3, 3, 4, 3, 3, 3, 4, 3, 4, 4, 5, 4, 5, 5, 5, 2, 3, 0, 5, 5, 4, 1, 5, 4, 3, 1, 5, 4, 3, 4, 4, 3, 3, 4, 3, 3, 0, 3, 2, 0, 2, 3, 0, 3, 0, 0, 3, 3, 0, 5, 3, 2, 3, 3, 0, 3, 0, 3, 0, 3, 4, 5, 4, 5, 3, 0, 4, 3, 0, 3, 4),
|
||||||
(0,3,0,3,0,1,0,0,0,1,4,3,3,2,3,1,3,1,4,2,3,2,4,2,3,4,3,0,2,2,3,3,3,0,3,3,3,0,3,4,1,3,3,0,3,4,3,3,0,1,1,0,1,0,0,0,4,0,3,0,0,3,1,2,1,3,0,4,0,1,0,4,3,3,4,3,3,0,2,0,0,3,3),
|
(0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 3, 4, 3, 2, 3, 2, 3, 0, 4, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 2, 4, 3, 3, 1, 3, 4, 3, 4, 4, 4, 3, 4, 4, 3, 2, 4, 4, 1, 0, 2, 0, 0, 1, 1, 0, 2, 0, 0, 3, 1, 0, 5, 3, 2, 1, 3, 0, 3, 0, 1, 2, 4, 3, 2, 4, 3, 3, 0, 3, 2, 0, 4, 4),
|
||||||
(0,3,0,4,0,1,0,3,0,3,4,3,3,0,3,3,3,1,3,1,3,3,4,3,3,3,0,0,3,1,5,3,3,1,3,3,2,5,4,3,3,4,5,3,2,5,3,4,0,1,0,0,0,0,0,2,0,0,1,1,0,4,2,2,1,3,0,3,0,2,0,4,4,3,5,3,2,0,1,1,0,3,4),
|
(0, 3, 0, 3, 0, 1, 0, 0, 0, 1, 4, 3, 3, 2, 3, 1, 3, 1, 4, 2, 3, 2, 4, 2, 3, 4, 3, 0, 2, 2, 3, 3, 3, 0, 3, 3, 3, 0, 3, 4, 1, 3, 3, 0, 3, 4, 3, 3, 0, 1, 1, 0, 1, 0, 0, 0, 4, 0, 3, 0, 0, 3, 1, 2, 1, 3, 0, 4, 0, 1, 0, 4, 3, 3, 4, 3, 3, 0, 2, 0, 0, 3, 3),
|
||||||
(0,5,0,4,0,5,0,2,0,4,4,3,3,2,3,3,3,1,4,3,4,1,5,3,4,3,4,0,4,2,4,3,4,1,5,4,0,4,4,4,4,5,4,1,3,5,4,2,1,4,1,1,3,2,0,3,1,0,3,2,1,4,3,3,3,4,0,4,0,3,0,4,4,4,3,3,3,0,4,2,0,3,4),
|
(0, 3, 0, 4, 0, 1, 0, 3, 0, 3, 4, 3, 3, 0, 3, 3, 3, 1, 3, 1, 3, 3, 4, 3, 3, 3, 0, 0, 3, 1, 5, 3, 3, 1, 3, 3, 2, 5, 4, 3, 3, 4, 5, 3, 2, 5, 3, 4, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 0, 4, 2, 2, 1, 3, 0, 3, 0, 2, 0, 4, 4, 3, 5, 3, 2, 0, 1, 1, 0, 3, 4),
|
||||||
(1,4,0,4,0,3,0,1,0,3,3,3,1,1,3,3,2,2,3,3,1,0,3,2,2,1,2,0,3,1,2,1,2,0,3,2,0,2,2,3,3,4,3,0,3,3,1,2,0,1,1,3,1,2,0,0,3,0,1,1,0,3,2,2,3,3,0,3,0,0,0,2,3,3,4,3,3,0,1,0,0,1,4),
|
(0, 5, 0, 4, 0, 5, 0, 2, 0, 4, 4, 3, 3, 2, 3, 3, 3, 1, 4, 3, 4, 1, 5, 3, 4, 3, 4, 0, 4, 2, 4, 3, 4, 1, 5, 4, 0, 4, 4, 4, 4, 5, 4, 1, 3, 5, 4, 2, 1, 4, 1, 1, 3, 2, 0, 3, 1, 0, 3, 2, 1, 4, 3, 3, 3, 4, 0, 4, 0, 3, 0, 4, 4, 4, 3, 3, 3, 0, 4, 2, 0, 3, 4),
|
||||||
(0,4,0,4,0,4,0,0,0,3,4,4,3,1,4,2,3,2,3,3,3,1,4,3,4,0,3,0,4,2,3,3,2,2,5,4,2,1,3,4,3,4,3,1,3,3,4,2,0,2,1,0,3,3,0,0,2,0,3,1,0,4,4,3,4,3,0,4,0,1,0,2,4,4,4,4,4,0,3,2,0,3,3),
|
(1, 4, 0, 4, 0, 3, 0, 1, 0, 3, 3, 3, 1, 1, 3, 3, 2, 2, 3, 3, 1, 0, 3, 2, 2, 1, 2, 0, 3, 1, 2, 1, 2, 0, 3, 2, 0, 2, 2, 3, 3, 4, 3, 0, 3, 3, 1, 2, 0, 1, 1, 3, 1, 2, 0, 0, 3, 0, 1, 1, 0, 3, 2, 2, 3, 3, 0, 3, 0, 0, 0, 2, 3, 3, 4, 3, 3, 0, 1, 0, 0, 1, 4),
|
||||||
(0,0,0,1,0,4,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,3,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2),
|
(0, 4, 0, 4, 0, 4, 0, 0, 0, 3, 4, 4, 3, 1, 4, 2, 3, 2, 3, 3, 3, 1, 4, 3, 4, 0, 3, 0, 4, 2, 3, 3, 2, 2, 5, 4, 2, 1, 3, 4, 3, 4, 3, 1, 3, 3, 4, 2, 0, 2, 1, 0, 3, 3, 0, 0, 2, 0, 3, 1, 0, 4, 4, 3, 4, 3, 0, 4, 0, 1, 0, 2, 4, 4, 4, 4, 4, 0, 3, 2, 0, 3, 3),
|
||||||
(0,2,0,3,0,4,0,4,0,1,3,3,3,0,4,0,2,1,2,1,1,1,2,0,3,1,1,0,1,0,3,1,0,0,3,3,2,0,1,1,0,0,0,0,0,1,0,2,0,2,2,0,3,1,0,0,1,0,1,1,0,1,2,0,3,0,0,0,0,1,0,0,3,3,4,3,1,0,1,0,3,0,2),
|
(0, 0, 0, 1, 0, 4, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2),
|
||||||
(0,0,0,3,0,5,0,0,0,0,1,0,2,0,3,1,0,1,3,0,0,0,2,0,0,0,1,0,0,0,1,1,0,0,4,0,0,0,2,3,0,1,4,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,3),
|
(0, 2, 0, 3, 0, 4, 0, 4, 0, 1, 3, 3, 3, 0, 4, 0, 2, 1, 2, 1, 1, 1, 2, 0, 3, 1, 1, 0, 1, 0, 3, 1, 0, 0, 3, 3, 2, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 2, 2, 0, 3, 1, 0, 0, 1, 0, 1, 1, 0, 1, 2, 0, 3, 0, 0, 0, 0, 1, 0, 0, 3, 3, 4, 3, 1, 0, 1, 0, 3, 0, 2),
|
||||||
(0,2,0,5,0,5,0,1,0,2,4,3,3,2,5,1,3,2,3,3,3,0,4,1,2,0,3,0,4,0,2,2,1,1,5,3,0,0,1,4,2,3,2,0,3,3,3,2,0,2,4,1,1,2,0,1,1,0,3,1,0,1,3,1,2,3,0,2,0,0,0,1,3,5,4,4,4,0,3,0,0,1,3),
|
(0, 0, 0, 3, 0, 5, 0, 0, 0, 0, 1, 0, 2, 0, 3, 1, 0, 1, 3, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 4, 0, 0, 0, 2, 3, 0, 1, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 3),
|
||||||
(0,4,0,5,0,4,0,4,0,4,5,4,3,3,4,3,3,3,4,3,4,4,5,3,4,5,4,2,4,2,3,4,3,1,4,4,1,3,5,4,4,5,5,4,4,5,5,5,2,3,3,1,4,3,1,3,3,0,3,3,1,4,3,4,4,4,0,3,0,4,0,3,3,4,4,5,0,0,4,3,0,4,5),
|
(0, 2, 0, 5, 0, 5, 0, 1, 0, 2, 4, 3, 3, 2, 5, 1, 3, 2, 3, 3, 3, 0, 4, 1, 2, 0, 3, 0, 4, 0, 2, 2, 1, 1, 5, 3, 0, 0, 1, 4, 2, 3, 2, 0, 3, 3, 3, 2, 0, 2, 4, 1, 1, 2, 0, 1, 1, 0, 3, 1, 0, 1, 3, 1, 2, 3, 0, 2, 0, 0, 0, 1, 3, 5, 4, 4, 4, 0, 3, 0, 0, 1, 3),
|
||||||
(0,4,0,4,0,3,0,3,0,3,4,4,4,3,3,2,4,3,4,3,4,3,5,3,4,3,2,1,4,2,4,4,3,1,3,4,2,4,5,5,3,4,5,4,1,5,4,3,0,3,2,2,3,2,1,3,1,0,3,3,3,5,3,3,3,5,4,4,2,3,3,4,3,3,3,2,1,0,3,2,1,4,3),
|
(0, 4, 0, 5, 0, 4, 0, 4, 0, 4, 5, 4, 3, 3, 4, 3, 3, 3, 4, 3, 4, 4, 5, 3, 4, 5, 4, 2, 4, 2, 3, 4, 3, 1, 4, 4, 1, 3, 5, 4, 4, 5, 5, 4, 4, 5, 5, 5, 2, 3, 3, 1, 4, 3, 1, 3, 3, 0, 3, 3, 1, 4, 3, 4, 4, 4, 0, 3, 0, 4, 0, 3, 3, 4, 4, 5, 0, 0, 4, 3, 0, 4, 5),
|
||||||
(0,4,0,5,0,4,0,3,0,3,5,5,3,2,4,3,4,0,5,4,4,1,4,4,4,3,3,3,4,3,5,5,2,3,3,4,1,2,5,5,3,5,5,2,3,5,5,4,0,3,2,0,3,3,1,1,5,1,4,1,0,4,3,2,3,5,0,4,0,3,0,5,4,3,4,3,0,0,4,1,0,4,4),
|
(0, 4, 0, 4, 0, 3, 0, 3, 0, 3, 4, 4, 4, 3, 3, 2, 4, 3, 4, 3, 4, 3, 5, 3, 4, 3, 2, 1, 4, 2, 4, 4, 3, 1, 3, 4, 2, 4, 5, 5, 3, 4, 5, 4, 1, 5, 4, 3, 0, 3, 2, 2, 3, 2, 1, 3, 1, 0, 3, 3, 3, 5, 3, 3, 3, 5, 4, 4, 2, 3, 3, 4, 3, 3, 3, 2, 1, 0, 3, 2, 1, 4, 3),
|
||||||
(1,3,0,4,0,2,0,2,0,2,5,5,3,3,3,3,3,0,4,2,3,4,4,4,3,4,0,0,3,4,5,4,3,3,3,3,2,5,5,4,5,5,5,4,3,5,5,5,1,3,1,0,1,0,0,3,2,0,4,2,0,5,2,3,2,4,1,3,0,3,0,4,5,4,5,4,3,0,4,2,0,5,4),
|
(0, 4, 0, 5, 0, 4, 0, 3, 0, 3, 5, 5, 3, 2, 4, 3, 4, 0, 5, 4, 4, 1, 4, 4, 4, 3, 3, 3, 4, 3, 5, 5, 2, 3, 3, 4, 1, 2, 5, 5, 3, 5, 5, 2, 3, 5, 5, 4, 0, 3, 2, 0, 3, 3, 1, 1, 5, 1, 4, 1, 0, 4, 3, 2, 3, 5, 0, 4, 0, 3, 0, 5, 4, 3, 4, 3, 0, 0, 4, 1, 0, 4, 4),
|
||||||
(0,3,0,4,0,5,0,3,0,3,4,4,3,2,3,2,3,3,3,3,3,2,4,3,3,2,2,0,3,3,3,3,3,1,3,3,3,0,4,4,3,4,4,1,1,4,4,2,0,3,1,0,1,1,0,4,1,0,2,3,1,3,3,1,3,4,0,3,0,1,0,3,1,3,0,0,1,0,2,0,0,4,4),
|
(1, 3, 0, 4, 0, 2, 0, 2, 0, 2, 5, 5, 3, 3, 3, 3, 3, 0, 4, 2, 3, 4, 4, 4, 3, 4, 0, 0, 3, 4, 5, 4, 3, 3, 3, 3, 2, 5, 5, 4, 5, 5, 5, 4, 3, 5, 5, 5, 1, 3, 1, 0, 1, 0, 0, 3, 2, 0, 4, 2, 0, 5, 2, 3, 2, 4, 1, 3, 0, 3, 0, 4, 5, 4, 5, 4, 3, 0, 4, 2, 0, 5, 4),
|
||||||
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
|
(0, 3, 0, 4, 0, 5, 0, 3, 0, 3, 4, 4, 3, 2, 3, 2, 3, 3, 3, 3, 3, 2, 4, 3, 3, 2, 2, 0, 3, 3, 3, 3, 3, 1, 3, 3, 3, 0, 4, 4, 3, 4, 4, 1, 1, 4, 4, 2, 0, 3, 1, 0, 1, 1, 0, 4, 1, 0, 2, 3, 1, 3, 3, 1, 3, 4, 0, 3, 0, 1, 0, 3, 1, 3, 0, 0, 1, 0, 2, 0, 0, 4, 4),
|
||||||
(0,3,0,3,0,2,0,3,0,1,5,4,3,3,3,1,4,2,1,2,3,4,4,2,4,4,5,0,3,1,4,3,4,0,4,3,3,3,2,3,2,5,3,4,3,2,2,3,0,0,3,0,2,1,0,1,2,0,0,0,0,2,1,1,3,1,0,2,0,4,0,3,4,4,4,5,2,0,2,0,0,1,3),
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
||||||
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,1,0,0,0,4,2,1,1,0,1,0,3,2,0,0,3,1,1,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,2,0,0,0,1,4,0,4,2,1,0,0,0,0,0,1),
|
(0, 3, 0, 3, 0, 2, 0, 3, 0, 1, 5, 4, 3, 3, 3, 1, 4, 2, 1, 2, 3, 4, 4, 2, 4, 4, 5, 0, 3, 1, 4, 3, 4, 0, 4, 3, 3, 3, 2, 3, 2, 5, 3, 4, 3, 2, 2, 3, 0, 0, 3, 0, 2, 1, 0, 1, 2, 0, 0, 0, 0, 2, 1, 1, 3, 1, 0, 2, 0, 4, 0, 3, 4, 4, 4, 5, 2, 0, 2, 0, 0, 1, 3),
|
||||||
(0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,3,1,0,0,0,2,0,2,1,0,0,1,2,1,0,1,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,3,1,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,2),
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 4, 2, 1, 1, 0, 1, 0, 3, 2, 0, 0, 3, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1, 4, 0, 4, 2, 1, 0, 0, 0, 0, 0, 1),
|
||||||
(0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3),
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 0, 2, 1, 0, 0, 1, 2, 1, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2),
|
||||||
(0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1),
|
(0, 4, 0, 4, 0, 4, 0, 3, 0, 4, 4, 3, 4, 2, 4, 3, 2, 0, 4, 4, 4, 3, 5, 3, 5, 3, 3, 2, 4, 2, 4, 3, 4, 3, 1, 4, 0, 2, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4, 3, 4, 1, 3, 4, 3, 2, 1, 2, 1, 3, 3, 3, 4, 4, 3, 3, 5, 0, 4, 0, 3, 0, 4, 3, 3, 3, 2, 1, 0, 3, 0, 0, 3, 3),
|
||||||
|
(0, 4, 0, 3, 0, 3, 0, 3, 0, 3, 5, 5, 3, 3, 3, 3, 4, 3, 4, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 4, 3, 5, 3, 3, 1, 3, 2, 4, 5, 5, 5, 5, 4, 3, 4, 5, 5, 3, 2, 2, 3, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 4, 3, 3, 3, 4, 0, 4, 0, 2, 0, 4, 3, 2, 2, 1, 2, 0, 3, 0, 0, 4, 1),
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
class JapaneseContextAnalysis(object):
|
|
||||||
|
class JapaneseContextAnalysis:
|
||||||
NUM_OF_CATEGORY = 6
|
NUM_OF_CATEGORY = 6
|
||||||
DONT_KNOW = -1
|
DONT_KNOW = -1
|
||||||
ENOUGH_REL_THRESHOLD = 100
|
ENOUGH_REL_THRESHOLD = 100
|
||||||
MAX_REL_THRESHOLD = 1000
|
MAX_REL_THRESHOLD = 1000
|
||||||
MINIMUM_DATA_THRESHOLD = 4
|
MINIMUM_DATA_THRESHOLD = 4
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
self._total_rel = None
|
self._total_rel = 0
|
||||||
self._rel_sample = None
|
self._rel_sample: List[int] = []
|
||||||
self._need_to_skip_char_num = None
|
self._need_to_skip_char_num = 0
|
||||||
self._last_char_order = None
|
self._last_char_order = -1
|
||||||
self._done = None
|
self._done = False
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self._total_rel = 0 # total sequence received
|
self._total_rel = 0 # total sequence received
|
||||||
# category counters, each integer counts sequence in its category
|
# category counters, each integer counts sequence in its category
|
||||||
self._rel_sample = [0] * self.NUM_OF_CATEGORY
|
self._rel_sample = [0] * self.NUM_OF_CATEGORY
|
||||||
|
@ -140,7 +144,7 @@ class JapaneseContextAnalysis(object):
|
||||||
# been made
|
# been made
|
||||||
self._done = False
|
self._done = False
|
||||||
|
|
||||||
def feed(self, byte_str, num_bytes):
|
def feed(self, byte_str: Union[bytes, bytearray], num_bytes: int) -> None:
|
||||||
if self._done:
|
if self._done:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -153,7 +157,7 @@ class JapaneseContextAnalysis(object):
|
||||||
# this character will simply our logic and improve performance.
|
# this character will simply our logic and improve performance.
|
||||||
i = self._need_to_skip_char_num
|
i = self._need_to_skip_char_num
|
||||||
while i < num_bytes:
|
while i < num_bytes:
|
||||||
order, char_len = self.get_order(byte_str[i:i + 2])
|
order, char_len = self.get_order(byte_str[i : i + 2])
|
||||||
i += char_len
|
i += char_len
|
||||||
if i > num_bytes:
|
if i > num_bytes:
|
||||||
self._need_to_skip_char_num = i - num_bytes
|
self._need_to_skip_char_num = i - num_bytes
|
||||||
|
@ -164,32 +168,34 @@ class JapaneseContextAnalysis(object):
|
||||||
if self._total_rel > self.MAX_REL_THRESHOLD:
|
if self._total_rel > self.MAX_REL_THRESHOLD:
|
||||||
self._done = True
|
self._done = True
|
||||||
break
|
break
|
||||||
self._rel_sample[jp2CharContext[self._last_char_order][order]] += 1
|
self._rel_sample[
|
||||||
|
jp2_char_context[self._last_char_order][order]
|
||||||
|
] += 1
|
||||||
self._last_char_order = order
|
self._last_char_order = order
|
||||||
|
|
||||||
def got_enough_data(self):
|
def got_enough_data(self) -> bool:
|
||||||
return self._total_rel > self.ENOUGH_REL_THRESHOLD
|
return self._total_rel > self.ENOUGH_REL_THRESHOLD
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
# This is just one way to calculate confidence. It works well for me.
|
# This is just one way to calculate confidence. It works well for me.
|
||||||
if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
|
if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
|
||||||
return (self._total_rel - self._rel_sample[0]) / self._total_rel
|
return (self._total_rel - self._rel_sample[0]) / self._total_rel
|
||||||
else:
|
|
||||||
return self.DONT_KNOW
|
return self.DONT_KNOW
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, _: Union[bytes, bytearray]) -> Tuple[int, int]:
|
||||||
return -1, 1
|
return -1, 1
|
||||||
|
|
||||||
|
|
||||||
class SJISContextAnalysis(JapaneseContextAnalysis):
|
class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(SJISContextAnalysis, self).__init__()
|
super().__init__()
|
||||||
self._charset_name = "SHIFT_JIS"
|
self._charset_name = "SHIFT_JIS"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return self._charset_name
|
return self._charset_name
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
|
||||||
if not byte_str:
|
if not byte_str:
|
||||||
return -1, 1
|
return -1, 1
|
||||||
# find out current char's byte length
|
# find out current char's byte length
|
||||||
|
@ -209,8 +215,9 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||||
|
|
||||||
return -1, char_len
|
return -1, char_len
|
||||||
|
|
||||||
|
|
||||||
class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
|
||||||
if not byte_str:
|
if not byte_str:
|
||||||
return -1, 1
|
return -1, 1
|
||||||
# find out current char's byte length
|
# find out current char's byte length
|
||||||
|
@ -229,5 +236,3 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
||||||
return second_char - 0xA1, char_len
|
return second_char - 0xA1, char_len
|
||||||
|
|
||||||
return -1, char_len
|
return -1, char_len
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,5 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
from chardet.sbcharsetprober import SingleByteCharSetModel
|
from chardet.sbcharsetprober import SingleByteCharSetModel
|
||||||
|
|
||||||
|
|
||||||
# 3: Positive
|
# 3: Positive
|
||||||
# 2: Likely
|
# 2: Likely
|
||||||
# 1: Unlikely
|
# 1: Unlikely
|
||||||
|
@ -4373,13 +4369,15 @@ ISO_8859_5_BULGARIAN_CHAR_TO_ORDER = {
|
||||||
255: 253, # 'џ'
|
255: 253, # 'џ'
|
||||||
}
|
}
|
||||||
|
|
||||||
ISO_8859_5_BULGARIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-5',
|
ISO_8859_5_BULGARIAN_MODEL = SingleByteCharSetModel(
|
||||||
language='Bulgairan',
|
charset_name="ISO-8859-5",
|
||||||
|
language="Bulgarian",
|
||||||
char_to_order_map=ISO_8859_5_BULGARIAN_CHAR_TO_ORDER,
|
char_to_order_map=ISO_8859_5_BULGARIAN_CHAR_TO_ORDER,
|
||||||
language_model=BULGARIAN_LANG_MODEL,
|
language_model=BULGARIAN_LANG_MODEL,
|
||||||
typical_positive_ratio=0.969392,
|
typical_positive_ratio=0.969392,
|
||||||
keep_ascii_letters=False,
|
keep_ascii_letters=False,
|
||||||
alphabet='АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя')
|
alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
|
||||||
|
)
|
||||||
|
|
||||||
WINDOWS_1251_BULGARIAN_CHAR_TO_ORDER = {
|
WINDOWS_1251_BULGARIAN_CHAR_TO_ORDER = {
|
||||||
0: 255, # '\x00'
|
0: 255, # '\x00'
|
||||||
|
@ -4640,11 +4638,12 @@ WINDOWS_1251_BULGARIAN_CHAR_TO_ORDER = {
|
||||||
255: 16, # 'я'
|
255: 16, # 'я'
|
||||||
}
|
}
|
||||||
|
|
||||||
WINDOWS_1251_BULGARIAN_MODEL = SingleByteCharSetModel(charset_name='windows-1251',
|
WINDOWS_1251_BULGARIAN_MODEL = SingleByteCharSetModel(
|
||||||
language='Bulgarian',
|
charset_name="windows-1251",
|
||||||
|
language="Bulgarian",
|
||||||
char_to_order_map=WINDOWS_1251_BULGARIAN_CHAR_TO_ORDER,
|
char_to_order_map=WINDOWS_1251_BULGARIAN_CHAR_TO_ORDER,
|
||||||
language_model=BULGARIAN_LANG_MODEL,
|
language_model=BULGARIAN_LANG_MODEL,
|
||||||
typical_positive_ratio=0.969392,
|
typical_positive_ratio=0.969392,
|
||||||
keep_ascii_letters=False,
|
keep_ascii_letters=False,
|
||||||
alphabet='АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя')
|
alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
|
||||||
|
)
|
||||||
|
|
|
@ -1,9 +1,5 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
from chardet.sbcharsetprober import SingleByteCharSetModel
|
from chardet.sbcharsetprober import SingleByteCharSetModel
|
||||||
|
|
||||||
|
|
||||||
# 3: Positive
|
# 3: Positive
|
||||||
# 2: Likely
|
# 2: Likely
|
||||||
# 1: Unlikely
|
# 1: Unlikely
|
||||||
|
@ -4121,13 +4117,15 @@ WINDOWS_1253_GREEK_CHAR_TO_ORDER = {
|
||||||
255: 253, # None
|
255: 253, # None
|
||||||
}
|
}
|
||||||
|
|
||||||
WINDOWS_1253_GREEK_MODEL = SingleByteCharSetModel(charset_name='windows-1253',
|
WINDOWS_1253_GREEK_MODEL = SingleByteCharSetModel(
|
||||||
language='Greek',
|
charset_name="windows-1253",
|
||||||
|
language="Greek",
|
||||||
char_to_order_map=WINDOWS_1253_GREEK_CHAR_TO_ORDER,
|
char_to_order_map=WINDOWS_1253_GREEK_CHAR_TO_ORDER,
|
||||||
language_model=GREEK_LANG_MODEL,
|
language_model=GREEK_LANG_MODEL,
|
||||||
typical_positive_ratio=0.982851,
|
typical_positive_ratio=0.982851,
|
||||||
keep_ascii_letters=False,
|
keep_ascii_letters=False,
|
||||||
alphabet='ΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ')
|
alphabet="ΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ",
|
||||||
|
)
|
||||||
|
|
||||||
ISO_8859_7_GREEK_CHAR_TO_ORDER = {
|
ISO_8859_7_GREEK_CHAR_TO_ORDER = {
|
||||||
0: 255, # '\x00'
|
0: 255, # '\x00'
|
||||||
|
@ -4388,11 +4386,12 @@ ISO_8859_7_GREEK_CHAR_TO_ORDER = {
|
||||||
255: 253, # None
|
255: 253, # None
|
||||||
}
|
}
|
||||||
|
|
||||||
ISO_8859_7_GREEK_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-7',
|
ISO_8859_7_GREEK_MODEL = SingleByteCharSetModel(
|
||||||
language='Greek',
|
charset_name="ISO-8859-7",
|
||||||
|
language="Greek",
|
||||||
char_to_order_map=ISO_8859_7_GREEK_CHAR_TO_ORDER,
|
char_to_order_map=ISO_8859_7_GREEK_CHAR_TO_ORDER,
|
||||||
language_model=GREEK_LANG_MODEL,
|
language_model=GREEK_LANG_MODEL,
|
||||||
typical_positive_ratio=0.982851,
|
typical_positive_ratio=0.982851,
|
||||||
keep_ascii_letters=False,
|
keep_ascii_letters=False,
|
||||||
alphabet='ΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ')
|
alphabet="ΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ",
|
||||||
|
)
|
||||||
|
|
|
@ -1,9 +1,5 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
from chardet.sbcharsetprober import SingleByteCharSetModel
|
from chardet.sbcharsetprober import SingleByteCharSetModel
|
||||||
|
|
||||||
|
|
||||||
# 3: Positive
|
# 3: Positive
|
||||||
# 2: Likely
|
# 2: Likely
|
||||||
# 1: Unlikely
|
# 1: Unlikely
|
||||||
|
@ -4373,11 +4369,12 @@ WINDOWS_1255_HEBREW_CHAR_TO_ORDER = {
|
||||||
255: 253, # None
|
255: 253, # None
|
||||||
}
|
}
|
||||||
|
|
||||||
WINDOWS_1255_HEBREW_MODEL = SingleByteCharSetModel(charset_name='windows-1255',
|
WINDOWS_1255_HEBREW_MODEL = SingleByteCharSetModel(
|
||||||
language='Hebrew',
|
charset_name="windows-1255",
|
||||||
|
language="Hebrew",
|
||||||
char_to_order_map=WINDOWS_1255_HEBREW_CHAR_TO_ORDER,
|
char_to_order_map=WINDOWS_1255_HEBREW_CHAR_TO_ORDER,
|
||||||
language_model=HEBREW_LANG_MODEL,
|
language_model=HEBREW_LANG_MODEL,
|
||||||
typical_positive_ratio=0.984004,
|
typical_positive_ratio=0.984004,
|
||||||
keep_ascii_letters=False,
|
keep_ascii_letters=False,
|
||||||
alphabet='אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ')
|
alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
|
||||||
|
)
|
||||||
|
|
|
@ -1,9 +1,5 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
from chardet.sbcharsetprober import SingleByteCharSetModel
|
from chardet.sbcharsetprober import SingleByteCharSetModel
|
||||||
|
|
||||||
|
|
||||||
# 3: Positive
|
# 3: Positive
|
||||||
# 2: Likely
|
# 2: Likely
|
||||||
# 1: Unlikely
|
# 1: Unlikely
|
||||||
|
@ -4373,13 +4369,15 @@ WINDOWS_1250_HUNGARIAN_CHAR_TO_ORDER = {
|
||||||
255: 253, # '˙'
|
255: 253, # '˙'
|
||||||
}
|
}
|
||||||
|
|
||||||
WINDOWS_1250_HUNGARIAN_MODEL = SingleByteCharSetModel(charset_name='windows-1250',
|
WINDOWS_1250_HUNGARIAN_MODEL = SingleByteCharSetModel(
|
||||||
language='Hungarian',
|
charset_name="windows-1250",
|
||||||
|
language="Hungarian",
|
||||||
char_to_order_map=WINDOWS_1250_HUNGARIAN_CHAR_TO_ORDER,
|
char_to_order_map=WINDOWS_1250_HUNGARIAN_CHAR_TO_ORDER,
|
||||||
language_model=HUNGARIAN_LANG_MODEL,
|
language_model=HUNGARIAN_LANG_MODEL,
|
||||||
typical_positive_ratio=0.947368,
|
typical_positive_ratio=0.947368,
|
||||||
keep_ascii_letters=True,
|
keep_ascii_letters=True,
|
||||||
alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű')
|
alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű",
|
||||||
|
)
|
||||||
|
|
||||||
ISO_8859_2_HUNGARIAN_CHAR_TO_ORDER = {
|
ISO_8859_2_HUNGARIAN_CHAR_TO_ORDER = {
|
||||||
0: 255, # '\x00'
|
0: 255, # '\x00'
|
||||||
|
@ -4640,11 +4638,12 @@ ISO_8859_2_HUNGARIAN_CHAR_TO_ORDER = {
|
||||||
255: 253, # '˙'
|
255: 253, # '˙'
|
||||||
}
|
}
|
||||||
|
|
||||||
ISO_8859_2_HUNGARIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-2',
|
ISO_8859_2_HUNGARIAN_MODEL = SingleByteCharSetModel(
|
||||||
language='Hungarian',
|
charset_name="ISO-8859-2",
|
||||||
|
language="Hungarian",
|
||||||
char_to_order_map=ISO_8859_2_HUNGARIAN_CHAR_TO_ORDER,
|
char_to_order_map=ISO_8859_2_HUNGARIAN_CHAR_TO_ORDER,
|
||||||
language_model=HUNGARIAN_LANG_MODEL,
|
language_model=HUNGARIAN_LANG_MODEL,
|
||||||
typical_positive_ratio=0.947368,
|
typical_positive_ratio=0.947368,
|
||||||
keep_ascii_letters=True,
|
keep_ascii_letters=True,
|
||||||
alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű')
|
alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű",
|
||||||
|
)
|
||||||
|
|
|
@ -1,9 +1,5 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
from chardet.sbcharsetprober import SingleByteCharSetModel
|
from chardet.sbcharsetprober import SingleByteCharSetModel
|
||||||
|
|
||||||
|
|
||||||
# 3: Positive
|
# 3: Positive
|
||||||
# 2: Likely
|
# 2: Likely
|
||||||
# 1: Unlikely
|
# 1: Unlikely
|
||||||
|
@ -4373,13 +4369,15 @@ IBM866_RUSSIAN_CHAR_TO_ORDER = {
|
||||||
255: 255, # '\xa0'
|
255: 255, # '\xa0'
|
||||||
}
|
}
|
||||||
|
|
||||||
IBM866_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='IBM866',
|
IBM866_RUSSIAN_MODEL = SingleByteCharSetModel(
|
||||||
language='Russian',
|
charset_name="IBM866",
|
||||||
|
language="Russian",
|
||||||
char_to_order_map=IBM866_RUSSIAN_CHAR_TO_ORDER,
|
char_to_order_map=IBM866_RUSSIAN_CHAR_TO_ORDER,
|
||||||
language_model=RUSSIAN_LANG_MODEL,
|
language_model=RUSSIAN_LANG_MODEL,
|
||||||
typical_positive_ratio=0.976601,
|
typical_positive_ratio=0.976601,
|
||||||
keep_ascii_letters=False,
|
keep_ascii_letters=False,
|
||||||
alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё')
|
alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
|
||||||
|
)
|
||||||
|
|
||||||
WINDOWS_1251_RUSSIAN_CHAR_TO_ORDER = {
|
WINDOWS_1251_RUSSIAN_CHAR_TO_ORDER = {
|
||||||
0: 255, # '\x00'
|
0: 255, # '\x00'
|
||||||
|
@ -4640,13 +4638,15 @@ WINDOWS_1251_RUSSIAN_CHAR_TO_ORDER = {
|
||||||
255: 16, # 'я'
|
255: 16, # 'я'
|
||||||
}
|
}
|
||||||
|
|
||||||
WINDOWS_1251_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='windows-1251',
|
WINDOWS_1251_RUSSIAN_MODEL = SingleByteCharSetModel(
|
||||||
language='Russian',
|
charset_name="windows-1251",
|
||||||
|
language="Russian",
|
||||||
char_to_order_map=WINDOWS_1251_RUSSIAN_CHAR_TO_ORDER,
|
char_to_order_map=WINDOWS_1251_RUSSIAN_CHAR_TO_ORDER,
|
||||||
language_model=RUSSIAN_LANG_MODEL,
|
language_model=RUSSIAN_LANG_MODEL,
|
||||||
typical_positive_ratio=0.976601,
|
typical_positive_ratio=0.976601,
|
||||||
keep_ascii_letters=False,
|
keep_ascii_letters=False,
|
||||||
alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё')
|
alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
|
||||||
|
)
|
||||||
|
|
||||||
IBM855_RUSSIAN_CHAR_TO_ORDER = {
|
IBM855_RUSSIAN_CHAR_TO_ORDER = {
|
||||||
0: 255, # '\x00'
|
0: 255, # '\x00'
|
||||||
|
@ -4907,13 +4907,15 @@ IBM855_RUSSIAN_CHAR_TO_ORDER = {
|
||||||
255: 255, # '\xa0'
|
255: 255, # '\xa0'
|
||||||
}
|
}
|
||||||
|
|
||||||
IBM855_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='IBM855',
|
IBM855_RUSSIAN_MODEL = SingleByteCharSetModel(
|
||||||
language='Russian',
|
charset_name="IBM855",
|
||||||
|
language="Russian",
|
||||||
char_to_order_map=IBM855_RUSSIAN_CHAR_TO_ORDER,
|
char_to_order_map=IBM855_RUSSIAN_CHAR_TO_ORDER,
|
||||||
language_model=RUSSIAN_LANG_MODEL,
|
language_model=RUSSIAN_LANG_MODEL,
|
||||||
typical_positive_ratio=0.976601,
|
typical_positive_ratio=0.976601,
|
||||||
keep_ascii_letters=False,
|
keep_ascii_letters=False,
|
||||||
alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё')
|
alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
|
||||||
|
)
|
||||||
|
|
||||||
KOI8_R_RUSSIAN_CHAR_TO_ORDER = {
|
KOI8_R_RUSSIAN_CHAR_TO_ORDER = {
|
||||||
0: 255, # '\x00'
|
0: 255, # '\x00'
|
||||||
|
@ -5174,13 +5176,15 @@ KOI8_R_RUSSIAN_CHAR_TO_ORDER = {
|
||||||
255: 70, # 'Ъ'
|
255: 70, # 'Ъ'
|
||||||
}
|
}
|
||||||
|
|
||||||
KOI8_R_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='KOI8-R',
|
KOI8_R_RUSSIAN_MODEL = SingleByteCharSetModel(
|
||||||
language='Russian',
|
charset_name="KOI8-R",
|
||||||
|
language="Russian",
|
||||||
char_to_order_map=KOI8_R_RUSSIAN_CHAR_TO_ORDER,
|
char_to_order_map=KOI8_R_RUSSIAN_CHAR_TO_ORDER,
|
||||||
language_model=RUSSIAN_LANG_MODEL,
|
language_model=RUSSIAN_LANG_MODEL,
|
||||||
typical_positive_ratio=0.976601,
|
typical_positive_ratio=0.976601,
|
||||||
keep_ascii_letters=False,
|
keep_ascii_letters=False,
|
||||||
alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё')
|
alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
|
||||||
|
)
|
||||||
|
|
||||||
MACCYRILLIC_RUSSIAN_CHAR_TO_ORDER = {
|
MACCYRILLIC_RUSSIAN_CHAR_TO_ORDER = {
|
||||||
0: 255, # '\x00'
|
0: 255, # '\x00'
|
||||||
|
@ -5441,13 +5445,15 @@ MACCYRILLIC_RUSSIAN_CHAR_TO_ORDER = {
|
||||||
255: 255, # '€'
|
255: 255, # '€'
|
||||||
}
|
}
|
||||||
|
|
||||||
MACCYRILLIC_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='MacCyrillic',
|
MACCYRILLIC_RUSSIAN_MODEL = SingleByteCharSetModel(
|
||||||
language='Russian',
|
charset_name="MacCyrillic",
|
||||||
|
language="Russian",
|
||||||
char_to_order_map=MACCYRILLIC_RUSSIAN_CHAR_TO_ORDER,
|
char_to_order_map=MACCYRILLIC_RUSSIAN_CHAR_TO_ORDER,
|
||||||
language_model=RUSSIAN_LANG_MODEL,
|
language_model=RUSSIAN_LANG_MODEL,
|
||||||
typical_positive_ratio=0.976601,
|
typical_positive_ratio=0.976601,
|
||||||
keep_ascii_letters=False,
|
keep_ascii_letters=False,
|
||||||
alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё')
|
alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
|
||||||
|
)
|
||||||
|
|
||||||
ISO_8859_5_RUSSIAN_CHAR_TO_ORDER = {
|
ISO_8859_5_RUSSIAN_CHAR_TO_ORDER = {
|
||||||
0: 255, # '\x00'
|
0: 255, # '\x00'
|
||||||
|
@ -5708,11 +5714,12 @@ ISO_8859_5_RUSSIAN_CHAR_TO_ORDER = {
|
||||||
255: 255, # 'џ'
|
255: 255, # 'џ'
|
||||||
}
|
}
|
||||||
|
|
||||||
ISO_8859_5_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-5',
|
ISO_8859_5_RUSSIAN_MODEL = SingleByteCharSetModel(
|
||||||
language='Russian',
|
charset_name="ISO-8859-5",
|
||||||
|
language="Russian",
|
||||||
char_to_order_map=ISO_8859_5_RUSSIAN_CHAR_TO_ORDER,
|
char_to_order_map=ISO_8859_5_RUSSIAN_CHAR_TO_ORDER,
|
||||||
language_model=RUSSIAN_LANG_MODEL,
|
language_model=RUSSIAN_LANG_MODEL,
|
||||||
typical_positive_ratio=0.976601,
|
typical_positive_ratio=0.976601,
|
||||||
keep_ascii_letters=False,
|
keep_ascii_letters=False,
|
||||||
alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё')
|
alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
|
||||||
|
)
|
||||||
|
|
|
@ -1,9 +1,5 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
from chardet.sbcharsetprober import SingleByteCharSetModel
|
from chardet.sbcharsetprober import SingleByteCharSetModel
|
||||||
|
|
||||||
|
|
||||||
# 3: Positive
|
# 3: Positive
|
||||||
# 2: Likely
|
# 2: Likely
|
||||||
# 1: Unlikely
|
# 1: Unlikely
|
||||||
|
@ -4373,11 +4369,12 @@ TIS_620_THAI_CHAR_TO_ORDER = {
|
||||||
255: 253, # None
|
255: 253, # None
|
||||||
}
|
}
|
||||||
|
|
||||||
TIS_620_THAI_MODEL = SingleByteCharSetModel(charset_name='TIS-620',
|
TIS_620_THAI_MODEL = SingleByteCharSetModel(
|
||||||
language='Thai',
|
charset_name="TIS-620",
|
||||||
|
language="Thai",
|
||||||
char_to_order_map=TIS_620_THAI_CHAR_TO_ORDER,
|
char_to_order_map=TIS_620_THAI_CHAR_TO_ORDER,
|
||||||
language_model=THAI_LANG_MODEL,
|
language_model=THAI_LANG_MODEL,
|
||||||
typical_positive_ratio=0.926386,
|
typical_positive_ratio=0.926386,
|
||||||
keep_ascii_letters=False,
|
keep_ascii_letters=False,
|
||||||
alphabet='กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛')
|
alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
|
||||||
|
)
|
||||||
|
|
|
@ -1,9 +1,5 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
from chardet.sbcharsetprober import SingleByteCharSetModel
|
from chardet.sbcharsetprober import SingleByteCharSetModel
|
||||||
|
|
||||||
|
|
||||||
# 3: Positive
|
# 3: Positive
|
||||||
# 2: Likely
|
# 2: Likely
|
||||||
# 1: Unlikely
|
# 1: Unlikely
|
||||||
|
@ -4373,11 +4369,12 @@ ISO_8859_9_TURKISH_CHAR_TO_ORDER = {
|
||||||
255: 107, # 'ÿ'
|
255: 107, # 'ÿ'
|
||||||
}
|
}
|
||||||
|
|
||||||
ISO_8859_9_TURKISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-9',
|
ISO_8859_9_TURKISH_MODEL = SingleByteCharSetModel(
|
||||||
language='Turkish',
|
charset_name="ISO-8859-9",
|
||||||
|
language="Turkish",
|
||||||
char_to_order_map=ISO_8859_9_TURKISH_CHAR_TO_ORDER,
|
char_to_order_map=ISO_8859_9_TURKISH_CHAR_TO_ORDER,
|
||||||
language_model=TURKISH_LANG_MODEL,
|
language_model=TURKISH_LANG_MODEL,
|
||||||
typical_positive_ratio=0.97029,
|
typical_positive_ratio=0.97029,
|
||||||
keep_ascii_letters=True,
|
keep_ascii_letters=True,
|
||||||
alphabet='ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş')
|
alphabet="ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş",
|
||||||
|
)
|
||||||
|
|
|
@ -26,6 +26,8 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .enums import ProbingState
|
from .enums import ProbingState
|
||||||
|
|
||||||
|
@ -41,6 +43,7 @@ ASV = 6 # accent small vowel
|
||||||
ASO = 7 # accent small other
|
ASO = 7 # accent small other
|
||||||
CLASS_NUM = 8 # total classes
|
CLASS_NUM = 8 # total classes
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
Latin1_CharToClass = (
|
Latin1_CharToClass = (
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
|
||||||
|
@ -91,34 +94,34 @@ Latin1ClassModel = (
|
||||||
0, 3, 1, 3, 1, 1, 1, 3, # ASV
|
0, 3, 1, 3, 1, 1, 1, 3, # ASV
|
||||||
0, 3, 1, 3, 1, 1, 3, 3, # ASO
|
0, 3, 1, 3, 1, 1, 3, 3, # ASO
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
class Latin1Prober(CharSetProber):
|
class Latin1Prober(CharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(Latin1Prober, self).__init__()
|
super().__init__()
|
||||||
self._last_char_class = None
|
self._last_char_class = OTH
|
||||||
self._freq_counter = None
|
self._freq_counter: List[int] = []
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self._last_char_class = OTH
|
self._last_char_class = OTH
|
||||||
self._freq_counter = [0] * FREQ_CAT_NUM
|
self._freq_counter = [0] * FREQ_CAT_NUM
|
||||||
CharSetProber.reset(self)
|
super().reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "ISO-8859-1"
|
return "ISO-8859-1"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
byte_str = self.filter_with_english_letters(byte_str)
|
byte_str = self.remove_xml_tags(byte_str)
|
||||||
for c in byte_str:
|
for c in byte_str:
|
||||||
char_class = Latin1_CharToClass[c]
|
char_class = Latin1_CharToClass[c]
|
||||||
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
|
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM) + char_class]
|
||||||
+ char_class]
|
|
||||||
if freq == 0:
|
if freq == 0:
|
||||||
self._state = ProbingState.NOT_ME
|
self._state = ProbingState.NOT_ME
|
||||||
break
|
break
|
||||||
|
@ -127,19 +130,18 @@ class Latin1Prober(CharSetProber):
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
if self.state == ProbingState.NOT_ME:
|
if self.state == ProbingState.NOT_ME:
|
||||||
return 0.01
|
return 0.01
|
||||||
|
|
||||||
total = sum(self._freq_counter)
|
total = sum(self._freq_counter)
|
||||||
if total < 0.01:
|
confidence = (
|
||||||
confidence = 0.0
|
0.0
|
||||||
else:
|
if total < 0.01
|
||||||
confidence = ((self._freq_counter[3] - self._freq_counter[1] * 20.0)
|
else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total
|
||||||
/ total)
|
)
|
||||||
if confidence < 0.0:
|
confidence = max(confidence, 0.0)
|
||||||
confidence = 0.0
|
|
||||||
# lower the confidence of latin1 so that other more accurate
|
# lower the confidence of latin1 so that other more accurate
|
||||||
# detector can take priority.
|
# detector can take priority.
|
||||||
confidence = confidence * 0.73
|
confidence *= 0.73
|
||||||
return confidence
|
return confidence
|
||||||
|
|
162
lib/chardet/macromanprober.py
Normal file
162
lib/chardet/macromanprober.py
Normal file
|
@ -0,0 +1,162 @@
|
||||||
|
######################## BEGIN LICENSE BLOCK ########################
|
||||||
|
# This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>.
|
||||||
|
# The Original Code is Mozilla Universal charset detector code.
|
||||||
|
#
|
||||||
|
# The Initial Developer of the Original Code is
|
||||||
|
# Netscape Communications Corporation.
|
||||||
|
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||||
|
# the Initial Developer. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Contributor(s):
|
||||||
|
# Rob Speer - adapt to MacRoman encoding
|
||||||
|
# Mark Pilgrim - port to Python
|
||||||
|
# Shy Shalom - original C code
|
||||||
|
#
|
||||||
|
# This library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with this library; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||||
|
# 02110-1301 USA
|
||||||
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
|
from .charsetprober import CharSetProber
|
||||||
|
from .enums import ProbingState
|
||||||
|
|
||||||
|
# Number of likelihood categories tallied by the prober; the categories are
# the values 0-3 stored in MacRomanClassModel below.
FREQ_CAT_NUM = 4

# Character classes assigned to each byte value by MacRoman_CharToClass.
UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
ODD = 8  # character that is unlikely to appear
CLASS_NUM = 9  # total classes

# The change from Latin1 is that we explicitly look for extended characters
# that are infrequently-occurring symbols, and consider them to always be
# improbable. This should let MacRoman get out of the way of more likely
# encodings in most situations.

# fmt: off
# Indexed by byte value (0x00-0xFF); each entry is one of the classes above.
MacRoman_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV,  # 80 - 87
    ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV,  # 88 - 8F
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV,  # 90 - 97
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO,  # A0 - A7
    OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV,  # B8 - BF
    OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH,  # C0 - C7
    OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV,  # C8 - CF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD,  # D0 - D7
    ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH,  # D8 - DF
    OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV,  # E0 - E7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # E8 - EF
    ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD,  # F0 - F7
    ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD,  # F8 - FF
)

# Likelihood of a (previous class, current class) pair, stored as a flattened
# CLASS_NUM x CLASS_NUM table: the row is the previous class, the column is
# the current class, so the entry lives at index previous * CLASS_NUM + current.
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
MacRomanClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO ODD
    0,  0,  0,  0,  0,  0,  0,  0,  0,  # UDF
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # OTH
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ASC
    0,  3,  3,  3,  1,  1,  3,  3,  1,  # ASS
    0,  3,  3,  3,  1,  2,  1,  2,  1,  # ACV
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ACO
    0,  3,  1,  3,  1,  1,  1,  3,  1,  # ASV
    0,  3,  1,  3,  1,  1,  3,  3,  1,  # ASO
    0,  1,  1,  1,  1,  1,  1,  1,  1,  # ODD
)
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
|
class MacRomanProber(CharSetProber):
    """Prober that scores how plausible the MacRoman encoding is for the input.

    Each byte is mapped to a character class through ``MacRoman_CharToClass``.
    The (previous class, current class) pair is then looked up in
    ``MacRomanClassModel`` and the resulting likelihood category (1-3) is
    tallied in ``_freq_counter``. A category of 0 marks the pair as illegal
    and rules the encoding out.
    """

    def __init__(self) -> None:
        super().__init__()
        self._last_char_class = OTH
        self._freq_counter: List[int] = []
        self.reset()

    def reset(self) -> None:
        """Return the prober to its initial scoring state."""
        self._last_char_class = OTH
        self._freq_counter = [0] * FREQ_CAT_NUM

        # express the prior that MacRoman is a somewhat rare encoding;
        # this can be done by starting out in a slightly improbable state
        # that must be overcome (the seeded "normal" bucket inflates the
        # total used as the denominator in get_confidence)
        self._freq_counter[2] = 10

        super().reset()

    @property
    def charset_name(self) -> str:
        """Name reported for this encoding."""
        return "MacRoman"

    @property
    def language(self) -> str:
        """No language is associated with this prober."""
        return ""

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Update the class-pair tallies with ``byte_str``.

        The input is first passed through the inherited ``remove_xml_tags``
        helper. Processing stops at the first illegal class pair, which sets
        the state to ``ProbingState.NOT_ME``.

        Returns:
            The prober state after consuming the data.
        """
        byte_str = self.remove_xml_tags(byte_str)
        for c in byte_str:
            char_class = MacRoman_CharToClass[c]
            freq = MacRomanClassModel[(self._last_char_class * CLASS_NUM) + char_class]
            if freq == 0:
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[freq] += 1
            self._last_char_class = char_class

        return self.state

    def get_confidence(self) -> float:
        """Return the confidence that the data seen so far is MacRoman.

        Returns 0.01 once the prober has been ruled out. Otherwise the score
        is the count of "very likely" pairs minus 20 times the count of
        "very unlikely" pairs, divided by the total tally, floored at 0.0
        and scaled by 0.73.
        """
        if self.state == ProbingState.NOT_ME:
            return 0.01

        total = sum(self._freq_counter)
        confidence = (
            0.0
            if total < 0.01
            else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total
        )
        confidence = max(confidence, 0.0)
        # lower the confidence of MacRoman so that other, more accurate
        # detectors can take priority.
        confidence *= 0.73
        return confidence
|
|
@ -27,8 +27,12 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from .chardistribution import CharDistributionAnalysis
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .enums import ProbingState, MachineState
|
from .codingstatemachine import CodingStateMachine
|
||||||
|
from .enums import LanguageFilter, MachineState, ProbingState
|
||||||
|
|
||||||
|
|
||||||
class MultiByteCharSetProber(CharSetProber):
|
class MultiByteCharSetProber(CharSetProber):
|
||||||
|
@ -36,56 +40,56 @@ class MultiByteCharSetProber(CharSetProber):
|
||||||
MultiByteCharSetProber
|
MultiByteCharSetProber
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, lang_filter=None):
|
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||||
super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter)
|
super().__init__(lang_filter=lang_filter)
|
||||||
self.distribution_analyzer = None
|
self.distribution_analyzer: Optional[CharDistributionAnalysis] = None
|
||||||
self.coding_sm = None
|
self.coding_sm: Optional[CodingStateMachine] = None
|
||||||
self._last_char = [0, 0]
|
self._last_char = bytearray(b"\0\0")
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super(MultiByteCharSetProber, self).reset()
|
super().reset()
|
||||||
if self.coding_sm:
|
if self.coding_sm:
|
||||||
self.coding_sm.reset()
|
self.coding_sm.reset()
|
||||||
if self.distribution_analyzer:
|
if self.distribution_analyzer:
|
||||||
self.distribution_analyzer.reset()
|
self.distribution_analyzer.reset()
|
||||||
self._last_char = [0, 0]
|
self._last_char = bytearray(b"\0\0")
|
||||||
|
|
||||||
@property
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
def charset_name(self):
|
assert self.coding_sm is not None
|
||||||
raise NotImplementedError
|
assert self.distribution_analyzer is not None
|
||||||
|
|
||||||
@property
|
for i, byte in enumerate(byte_str):
|
||||||
def language(self):
|
coding_state = self.coding_sm.next_state(byte)
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def feed(self, byte_str):
|
|
||||||
for i in range(len(byte_str)):
|
|
||||||
coding_state = self.coding_sm.next_state(byte_str[i])
|
|
||||||
if coding_state == MachineState.ERROR:
|
if coding_state == MachineState.ERROR:
|
||||||
self.logger.debug('%s %s prober hit error at byte %s',
|
self.logger.debug(
|
||||||
self.charset_name, self.language, i)
|
"%s %s prober hit error at byte %s",
|
||||||
|
self.charset_name,
|
||||||
|
self.language,
|
||||||
|
i,
|
||||||
|
)
|
||||||
self._state = ProbingState.NOT_ME
|
self._state = ProbingState.NOT_ME
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.ITS_ME:
|
if coding_state == MachineState.ITS_ME:
|
||||||
self._state = ProbingState.FOUND_IT
|
self._state = ProbingState.FOUND_IT
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.START:
|
if coding_state == MachineState.START:
|
||||||
char_len = self.coding_sm.get_current_charlen()
|
char_len = self.coding_sm.get_current_charlen()
|
||||||
if i == 0:
|
if i == 0:
|
||||||
self._last_char[1] = byte_str[0]
|
self._last_char[1] = byte
|
||||||
self.distribution_analyzer.feed(self._last_char, char_len)
|
self.distribution_analyzer.feed(self._last_char, char_len)
|
||||||
else:
|
else:
|
||||||
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
|
||||||
char_len)
|
|
||||||
|
|
||||||
self._last_char[0] = byte_str[-1]
|
self._last_char[0] = byte_str[-1]
|
||||||
|
|
||||||
if self.state == ProbingState.DETECTING:
|
if self.state == ProbingState.DETECTING:
|
||||||
if (self.distribution_analyzer.got_enough_data() and
|
if self.distribution_analyzer.got_enough_data() and (
|
||||||
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
self.get_confidence() > self.SHORTCUT_THRESHOLD
|
||||||
|
):
|
||||||
self._state = ProbingState.FOUND_IT
|
self._state = ProbingState.FOUND_IT
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
|
assert self.distribution_analyzer is not None
|
||||||
return self.distribution_analyzer.get_confidence()
|
return self.distribution_analyzer.get_confidence()
|
||||||
|
|
|
@ -27,20 +27,22 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .charsetgroupprober import CharSetGroupProber
|
|
||||||
from .utf8prober import UTF8Prober
|
|
||||||
from .sjisprober import SJISProber
|
|
||||||
from .eucjpprober import EUCJPProber
|
|
||||||
from .gb2312prober import GB2312Prober
|
|
||||||
from .euckrprober import EUCKRProber
|
|
||||||
from .cp949prober import CP949Prober
|
|
||||||
from .big5prober import Big5Prober
|
from .big5prober import Big5Prober
|
||||||
|
from .charsetgroupprober import CharSetGroupProber
|
||||||
|
from .cp949prober import CP949Prober
|
||||||
|
from .enums import LanguageFilter
|
||||||
|
from .eucjpprober import EUCJPProber
|
||||||
|
from .euckrprober import EUCKRProber
|
||||||
from .euctwprober import EUCTWProber
|
from .euctwprober import EUCTWProber
|
||||||
|
from .gb2312prober import GB2312Prober
|
||||||
|
from .johabprober import JOHABProber
|
||||||
|
from .sjisprober import SJISProber
|
||||||
|
from .utf8prober import UTF8Prober
|
||||||
|
|
||||||
|
|
||||||
class MBCSGroupProber(CharSetGroupProber):
|
class MBCSGroupProber(CharSetGroupProber):
|
||||||
def __init__(self, lang_filter=None):
|
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||||
super(MBCSGroupProber, self).__init__(lang_filter=lang_filter)
|
super().__init__(lang_filter=lang_filter)
|
||||||
self.probers = [
|
self.probers = [
|
||||||
UTF8Prober(),
|
UTF8Prober(),
|
||||||
SJISProber(),
|
SJISProber(),
|
||||||
|
@ -49,6 +51,7 @@ class MBCSGroupProber(CharSetGroupProber):
|
||||||
EUCKRProber(),
|
EUCKRProber(),
|
||||||
CP949Prober(),
|
CP949Prober(),
|
||||||
Big5Prober(),
|
Big5Prober(),
|
||||||
EUCTWProber()
|
EUCTWProber(),
|
||||||
|
JOHABProber(),
|
||||||
]
|
]
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
|
@ -25,43 +25,45 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from .codingstatemachinedict import CodingStateMachineDict
|
||||||
from .enums import MachineState
|
from .enums import MachineState
|
||||||
|
|
||||||
# BIG5
|
# BIG5
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
BIG5_CLS = (
|
BIG5_CLS = (
|
||||||
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
|
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 #allow 0x00 as legal value
|
||||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
|
||||||
1,1,1,1,1,1,1,1, # 10 - 17
|
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
|
||||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
|
||||||
1,1,1,1,1,1,1,1, # 20 - 27
|
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
|
||||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
|
||||||
1,1,1,1,1,1,1,1, # 30 - 37
|
1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37
|
||||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f
|
||||||
2,2,2,2,2,2,2,2, # 40 - 47
|
2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47
|
||||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f
|
||||||
2,2,2,2,2,2,2,2, # 50 - 57
|
2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57
|
||||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f
|
||||||
2,2,2,2,2,2,2,2, # 60 - 67
|
2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67
|
||||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f
|
||||||
2,2,2,2,2,2,2,2, # 70 - 77
|
2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77
|
||||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
2, 2, 2, 2, 2, 2, 2, 1, # 78 - 7f
|
||||||
4,4,4,4,4,4,4,4, # 80 - 87
|
4, 4, 4, 4, 4, 4, 4, 4, # 80 - 87
|
||||||
4,4,4,4,4,4,4,4, # 88 - 8f
|
4, 4, 4, 4, 4, 4, 4, 4, # 88 - 8f
|
||||||
4,4,4,4,4,4,4,4, # 90 - 97
|
4, 4, 4, 4, 4, 4, 4, 4, # 90 - 97
|
||||||
4,4,4,4,4,4,4,4, # 98 - 9f
|
4, 4, 4, 4, 4, 4, 4, 4, # 98 - 9f
|
||||||
4,3,3,3,3,3,3,3, # a0 - a7
|
4, 3, 3, 3, 3, 3, 3, 3, # a0 - a7
|
||||||
3,3,3,3,3,3,3,3, # a8 - af
|
3, 3, 3, 3, 3, 3, 3, 3, # a8 - af
|
||||||
3,3,3,3,3,3,3,3, # b0 - b7
|
3, 3, 3, 3, 3, 3, 3, 3, # b0 - b7
|
||||||
3,3,3,3,3,3,3,3, # b8 - bf
|
3, 3, 3, 3, 3, 3, 3, 3, # b8 - bf
|
||||||
3,3,3,3,3,3,3,3, # c0 - c7
|
3, 3, 3, 3, 3, 3, 3, 3, # c0 - c7
|
||||||
3,3,3,3,3,3,3,3, # c8 - cf
|
3, 3, 3, 3, 3, 3, 3, 3, # c8 - cf
|
||||||
3,3,3,3,3,3,3,3, # d0 - d7
|
3, 3, 3, 3, 3, 3, 3, 3, # d0 - d7
|
||||||
3,3,3,3,3,3,3,3, # d8 - df
|
3, 3, 3, 3, 3, 3, 3, 3, # d8 - df
|
||||||
3,3,3,3,3,3,3,3, # e0 - e7
|
3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7
|
||||||
3,3,3,3,3,3,3,3, # e8 - ef
|
3, 3, 3, 3, 3, 3, 3, 3, # e8 - ef
|
||||||
3,3,3,3,3,3,3,3, # f0 - f7
|
3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7
|
||||||
3,3,3,3,3,3,3,0 # f8 - ff
|
3, 3, 3, 3, 3, 3, 3, 0 # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
BIG5_ST = (
|
BIG5_ST = (
|
||||||
|
@ -69,34 +71,37 @@ BIG5_ST = (
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f
|
||||||
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17
|
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
|
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
|
||||||
|
|
||||||
BIG5_SM_MODEL = {'class_table': BIG5_CLS,
|
BIG5_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 5,
|
"class_table": BIG5_CLS,
|
||||||
'state_table': BIG5_ST,
|
"class_factor": 5,
|
||||||
'char_len_table': BIG5_CHAR_LEN_TABLE,
|
"state_table": BIG5_ST,
|
||||||
'name': 'Big5'}
|
"char_len_table": BIG5_CHAR_LEN_TABLE,
|
||||||
|
"name": "Big5",
|
||||||
|
}
|
||||||
|
|
||||||
# CP949
|
# CP949
|
||||||
|
# fmt: off
|
||||||
CP949_CLS = (
|
CP949_CLS = (
|
||||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, # 00 - 0f
|
||||||
1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, # 10 - 1f
|
||||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 2f
|
||||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 30 - 3f
|
||||||
1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
|
1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, # 40 - 4f
|
||||||
4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
|
4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, # 50 - 5f
|
||||||
1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
|
1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, # 60 - 6f
|
||||||
5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
|
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, # 70 - 7f
|
||||||
0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
|
0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, # 80 - 8f
|
||||||
6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
|
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, # 90 - 9f
|
||||||
6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
|
6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, # a0 - af
|
||||||
7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
|
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, # b0 - bf
|
||||||
7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
|
7, 7, 7, 7, 7, 7, 9, 2, 2, 3, 2, 2, 2, 2, 2, 2, # c0 - cf
|
||||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # d0 - df
|
||||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # e0 - ef
|
||||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, # f0 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
CP949_ST = (
|
CP949_ST = (
|
||||||
|
@ -109,50 +114,53 @@ CP949_ST = (
|
||||||
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5
|
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5
|
||||||
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6
|
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
|
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
|
||||||
|
|
||||||
CP949_SM_MODEL = {'class_table': CP949_CLS,
|
CP949_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 10,
|
"class_table": CP949_CLS,
|
||||||
'state_table': CP949_ST,
|
"class_factor": 10,
|
||||||
'char_len_table': CP949_CHAR_LEN_TABLE,
|
"state_table": CP949_ST,
|
||||||
'name': 'CP949'}
|
"char_len_table": CP949_CHAR_LEN_TABLE,
|
||||||
|
"name": "CP949",
|
||||||
|
}
|
||||||
|
|
||||||
# EUC-JP
|
# EUC-JP
|
||||||
|
# fmt: off
|
||||||
EUCJP_CLS = (
|
EUCJP_CLS = (
|
||||||
4,4,4,4,4,4,4,4, # 00 - 07
|
4, 4, 4, 4, 4, 4, 4, 4, # 00 - 07
|
||||||
4,4,4,4,4,4,5,5, # 08 - 0f
|
4, 4, 4, 4, 4, 4, 5, 5, # 08 - 0f
|
||||||
4,4,4,4,4,4,4,4, # 10 - 17
|
4, 4, 4, 4, 4, 4, 4, 4, # 10 - 17
|
||||||
4,4,4,5,4,4,4,4, # 18 - 1f
|
4, 4, 4, 5, 4, 4, 4, 4, # 18 - 1f
|
||||||
4,4,4,4,4,4,4,4, # 20 - 27
|
4, 4, 4, 4, 4, 4, 4, 4, # 20 - 27
|
||||||
4,4,4,4,4,4,4,4, # 28 - 2f
|
4, 4, 4, 4, 4, 4, 4, 4, # 28 - 2f
|
||||||
4,4,4,4,4,4,4,4, # 30 - 37
|
4, 4, 4, 4, 4, 4, 4, 4, # 30 - 37
|
||||||
4,4,4,4,4,4,4,4, # 38 - 3f
|
4, 4, 4, 4, 4, 4, 4, 4, # 38 - 3f
|
||||||
4,4,4,4,4,4,4,4, # 40 - 47
|
4, 4, 4, 4, 4, 4, 4, 4, # 40 - 47
|
||||||
4,4,4,4,4,4,4,4, # 48 - 4f
|
4, 4, 4, 4, 4, 4, 4, 4, # 48 - 4f
|
||||||
4,4,4,4,4,4,4,4, # 50 - 57
|
4, 4, 4, 4, 4, 4, 4, 4, # 50 - 57
|
||||||
4,4,4,4,4,4,4,4, # 58 - 5f
|
4, 4, 4, 4, 4, 4, 4, 4, # 58 - 5f
|
||||||
4,4,4,4,4,4,4,4, # 60 - 67
|
4, 4, 4, 4, 4, 4, 4, 4, # 60 - 67
|
||||||
4,4,4,4,4,4,4,4, # 68 - 6f
|
4, 4, 4, 4, 4, 4, 4, 4, # 68 - 6f
|
||||||
4,4,4,4,4,4,4,4, # 70 - 77
|
4, 4, 4, 4, 4, 4, 4, 4, # 70 - 77
|
||||||
4,4,4,4,4,4,4,4, # 78 - 7f
|
4, 4, 4, 4, 4, 4, 4, 4, # 78 - 7f
|
||||||
5,5,5,5,5,5,5,5, # 80 - 87
|
5, 5, 5, 5, 5, 5, 5, 5, # 80 - 87
|
||||||
5,5,5,5,5,5,1,3, # 88 - 8f
|
5, 5, 5, 5, 5, 5, 1, 3, # 88 - 8f
|
||||||
5,5,5,5,5,5,5,5, # 90 - 97
|
5, 5, 5, 5, 5, 5, 5, 5, # 90 - 97
|
||||||
5,5,5,5,5,5,5,5, # 98 - 9f
|
5, 5, 5, 5, 5, 5, 5, 5, # 98 - 9f
|
||||||
5,2,2,2,2,2,2,2, # a0 - a7
|
5, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
|
||||||
2,2,2,2,2,2,2,2, # a8 - af
|
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
|
||||||
2,2,2,2,2,2,2,2, # b0 - b7
|
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
|
||||||
2,2,2,2,2,2,2,2, # b8 - bf
|
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
|
||||||
2,2,2,2,2,2,2,2, # c0 - c7
|
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
|
||||||
2,2,2,2,2,2,2,2, # c8 - cf
|
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
|
||||||
2,2,2,2,2,2,2,2, # d0 - d7
|
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
|
||||||
2,2,2,2,2,2,2,2, # d8 - df
|
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
|
||||||
0,0,0,0,0,0,0,0, # e0 - e7
|
0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7
|
||||||
0,0,0,0,0,0,0,0, # e8 - ef
|
0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef
|
||||||
0,0,0,0,0,0,0,0, # f0 - f7
|
0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7
|
||||||
0,0,0,0,0,0,0,5 # f8 - ff
|
0, 0, 0, 0, 0, 0, 0, 5 # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
EUCJP_ST = (
|
EUCJP_ST = (
|
||||||
|
@ -162,100 +170,163 @@ EUCJP_ST = (
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f
|
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f
|
||||||
3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27
|
3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
|
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
|
||||||
|
|
||||||
EUCJP_SM_MODEL = {'class_table': EUCJP_CLS,
|
EUCJP_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 6,
|
"class_table": EUCJP_CLS,
|
||||||
'state_table': EUCJP_ST,
|
"class_factor": 6,
|
||||||
'char_len_table': EUCJP_CHAR_LEN_TABLE,
|
"state_table": EUCJP_ST,
|
||||||
'name': 'EUC-JP'}
|
"char_len_table": EUCJP_CHAR_LEN_TABLE,
|
||||||
|
"name": "EUC-JP",
|
||||||
|
}
|
||||||
|
|
||||||
# EUC-KR
|
# EUC-KR
|
||||||
|
# fmt: off
|
||||||
EUCKR_CLS = (
|
EUCKR_CLS = (
|
||||||
1,1,1,1,1,1,1,1, # 00 - 07
|
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07
|
||||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
|
||||||
1,1,1,1,1,1,1,1, # 10 - 17
|
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
|
||||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
|
||||||
1,1,1,1,1,1,1,1, # 20 - 27
|
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
|
||||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
|
||||||
1,1,1,1,1,1,1,1, # 30 - 37
|
1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37
|
||||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f
|
||||||
1,1,1,1,1,1,1,1, # 40 - 47
|
1, 1, 1, 1, 1, 1, 1, 1, # 40 - 47
|
||||||
1,1,1,1,1,1,1,1, # 48 - 4f
|
1, 1, 1, 1, 1, 1, 1, 1, # 48 - 4f
|
||||||
1,1,1,1,1,1,1,1, # 50 - 57
|
1, 1, 1, 1, 1, 1, 1, 1, # 50 - 57
|
||||||
1,1,1,1,1,1,1,1, # 58 - 5f
|
1, 1, 1, 1, 1, 1, 1, 1, # 58 - 5f
|
||||||
1,1,1,1,1,1,1,1, # 60 - 67
|
1, 1, 1, 1, 1, 1, 1, 1, # 60 - 67
|
||||||
1,1,1,1,1,1,1,1, # 68 - 6f
|
1, 1, 1, 1, 1, 1, 1, 1, # 68 - 6f
|
||||||
1,1,1,1,1,1,1,1, # 70 - 77
|
1, 1, 1, 1, 1, 1, 1, 1, # 70 - 77
|
||||||
1,1,1,1,1,1,1,1, # 78 - 7f
|
1, 1, 1, 1, 1, 1, 1, 1, # 78 - 7f
|
||||||
0,0,0,0,0,0,0,0, # 80 - 87
|
0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87
|
||||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f
|
||||||
0,0,0,0,0,0,0,0, # 90 - 97
|
0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97
|
||||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f
|
||||||
0,2,2,2,2,2,2,2, # a0 - a7
|
0, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
|
||||||
2,2,2,2,2,3,3,3, # a8 - af
|
2, 2, 2, 2, 2, 3, 3, 3, # a8 - af
|
||||||
2,2,2,2,2,2,2,2, # b0 - b7
|
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
|
||||||
2,2,2,2,2,2,2,2, # b8 - bf
|
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
|
||||||
2,2,2,2,2,2,2,2, # c0 - c7
|
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
|
||||||
2,3,2,2,2,2,2,2, # c8 - cf
|
2, 3, 2, 2, 2, 2, 2, 2, # c8 - cf
|
||||||
2,2,2,2,2,2,2,2, # d0 - d7
|
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
|
||||||
2,2,2,2,2,2,2,2, # d8 - df
|
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
|
||||||
2,2,2,2,2,2,2,2, # e0 - e7
|
2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7
|
||||||
2,2,2,2,2,2,2,2, # e8 - ef
|
2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef
|
||||||
2,2,2,2,2,2,2,2, # f0 - f7
|
2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7
|
||||||
2,2,2,2,2,2,2,0 # f8 - ff
|
2, 2, 2, 2, 2, 2, 2, 0 # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
EUCKR_ST = (
|
EUCKR_ST = (
|
||||||
MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
|
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
|
||||||
|
|
||||||
EUCKR_SM_MODEL = {'class_table': EUCKR_CLS,
|
EUCKR_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 4,
|
"class_table": EUCKR_CLS,
|
||||||
'state_table': EUCKR_ST,
|
"class_factor": 4,
|
||||||
'char_len_table': EUCKR_CHAR_LEN_TABLE,
|
"state_table": EUCKR_ST,
|
||||||
'name': 'EUC-KR'}
|
"char_len_table": EUCKR_CHAR_LEN_TABLE,
|
||||||
|
"name": "EUC-KR",
|
||||||
|
}
|
||||||
|
|
||||||
|
# JOHAB
|
||||||
|
# fmt: off
|
||||||
|
# Byte-to-class table for Johab: indexed by byte value (0x00-0xff), each
# entry is a class number in 0-9 (the "cls" columns of JOHAB_ST).
JOHAB_CLS = (
    4,4,4,4,4,4,4,4,  # 00 - 07
    4,4,4,4,4,4,0,0,  # 08 - 0f
    4,4,4,4,4,4,4,4,  # 10 - 17
    4,4,4,0,4,4,4,4,  # 18 - 1f
    4,4,4,4,4,4,4,4,  # 20 - 27
    4,4,4,4,4,4,4,4,  # 28 - 2f
    4,3,3,3,3,3,3,3,  # 30 - 37
    3,3,3,3,3,3,3,3,  # 38 - 3f
    3,1,1,1,1,1,1,1,  # 40 - 47
    1,1,1,1,1,1,1,1,  # 48 - 4f
    1,1,1,1,1,1,1,1,  # 50 - 57
    1,1,1,1,1,1,1,1,  # 58 - 5f
    1,1,1,1,1,1,1,1,  # 60 - 67
    1,1,1,1,1,1,1,1,  # 68 - 6f
    1,1,1,1,1,1,1,1,  # 70 - 77
    1,1,1,1,1,1,1,2,  # 78 - 7f
    6,6,6,6,8,8,8,8,  # 80 - 87
    8,8,8,8,8,8,8,8,  # 88 - 8f
    8,7,7,7,7,7,7,7,  # 90 - 97
    7,7,7,7,7,7,7,7,  # 98 - 9f
    7,7,7,7,7,7,7,7,  # a0 - a7
    7,7,7,7,7,7,7,7,  # a8 - af
    7,7,7,7,7,7,7,7,  # b0 - b7
    7,7,7,7,7,7,7,7,  # b8 - bf
    7,7,7,7,7,7,7,7,  # c0 - c7
    7,7,7,7,7,7,7,7,  # c8 - cf
    7,7,7,7,5,5,5,5,  # d0 - d7
    5,9,9,9,9,9,9,5,  # d8 - df
    9,9,9,9,9,9,9,9,  # e0 - e7
    9,9,9,9,9,9,9,9,  # e8 - ef
    9,9,9,9,9,9,9,9,  # f0 - f7
    9,9,5,5,5,5,5,0   # f8 - ff
)
|
||||||
|
|
||||||
|
# State table for Johab, flattened row by row: one row per current state and
# one column per byte class (10 columns, matching "class_factor" below).
# NOTE(review): the two terminal-state rows appear as ITS_ME then ERROR;
# confirm this order matches MachineState's numeric values (not visible here).
JOHAB_ST = (
# cls = 0                   1                    2                    3                    4                    5                    6                    7                    8                    9
    MachineState.ERROR,  MachineState.START,  MachineState.START,  MachineState.START,  MachineState.START,  MachineState.ERROR,  MachineState.ERROR,  3,                   3,                   4,                    # MachineState.START
    MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME,  # MachineState.ITS_ME
    MachineState.ERROR,  MachineState.ERROR,  MachineState.ERROR,  MachineState.ERROR,  MachineState.ERROR,  MachineState.ERROR,  MachineState.ERROR,  MachineState.ERROR,  MachineState.ERROR,  MachineState.ERROR,   # MachineState.ERROR
    MachineState.ERROR,  MachineState.START,  MachineState.START,  MachineState.ERROR,  MachineState.ERROR,  MachineState.START,  MachineState.START,  MachineState.START,  MachineState.START,  MachineState.START,   # 3
    MachineState.ERROR,  MachineState.START,  MachineState.ERROR,  MachineState.START,  MachineState.ERROR,  MachineState.START,  MachineState.ERROR,  MachineState.START,  MachineState.ERROR,  MachineState.START,   # 4
)
# fmt: on

# One entry per byte class (0-9); presumably the character length in bytes
# that a lead byte of that class starts -- verify against CodingStateMachine.
JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2)

# Bundle of the Johab tables in the same shape as the other *_SM_MODEL dicts.
JOHAB_SM_MODEL: CodingStateMachineDict = {
    "class_table": JOHAB_CLS,
    "class_factor": 10,
    "state_table": JOHAB_ST,
    "char_len_table": JOHAB_CHAR_LEN_TABLE,
    "name": "Johab",
}
|
||||||
|
|
||||||
# EUC-TW
|
# EUC-TW
|
||||||
|
# fmt: off
|
||||||
EUCTW_CLS = (
|
EUCTW_CLS = (
|
||||||
2,2,2,2,2,2,2,2, # 00 - 07
|
2, 2, 2, 2, 2, 2, 2, 2, # 00 - 07
|
||||||
2,2,2,2,2,2,0,0, # 08 - 0f
|
2, 2, 2, 2, 2, 2, 0, 0, # 08 - 0f
|
||||||
2,2,2,2,2,2,2,2, # 10 - 17
|
2, 2, 2, 2, 2, 2, 2, 2, # 10 - 17
|
||||||
2,2,2,0,2,2,2,2, # 18 - 1f
|
2, 2, 2, 0, 2, 2, 2, 2, # 18 - 1f
|
||||||
2,2,2,2,2,2,2,2, # 20 - 27
|
2, 2, 2, 2, 2, 2, 2, 2, # 20 - 27
|
||||||
2,2,2,2,2,2,2,2, # 28 - 2f
|
2, 2, 2, 2, 2, 2, 2, 2, # 28 - 2f
|
||||||
2,2,2,2,2,2,2,2, # 30 - 37
|
2, 2, 2, 2, 2, 2, 2, 2, # 30 - 37
|
||||||
2,2,2,2,2,2,2,2, # 38 - 3f
|
2, 2, 2, 2, 2, 2, 2, 2, # 38 - 3f
|
||||||
2,2,2,2,2,2,2,2, # 40 - 47
|
2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47
|
||||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f
|
||||||
2,2,2,2,2,2,2,2, # 50 - 57
|
2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57
|
||||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f
|
||||||
2,2,2,2,2,2,2,2, # 60 - 67
|
2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67
|
||||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f
|
||||||
2,2,2,2,2,2,2,2, # 70 - 77
|
2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77
|
||||||
2,2,2,2,2,2,2,2, # 78 - 7f
|
2, 2, 2, 2, 2, 2, 2, 2, # 78 - 7f
|
||||||
0,0,0,0,0,0,0,0, # 80 - 87
|
0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87
|
||||||
0,0,0,0,0,0,6,0, # 88 - 8f
|
0, 0, 0, 0, 0, 0, 6, 0, # 88 - 8f
|
||||||
0,0,0,0,0,0,0,0, # 90 - 97
|
0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97
|
||||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f
|
||||||
0,3,4,4,4,4,4,4, # a0 - a7
|
0, 3, 4, 4, 4, 4, 4, 4, # a0 - a7
|
||||||
5,5,1,1,1,1,1,1, # a8 - af
|
5, 5, 1, 1, 1, 1, 1, 1, # a8 - af
|
||||||
1,1,1,1,1,1,1,1, # b0 - b7
|
1, 1, 1, 1, 1, 1, 1, 1, # b0 - b7
|
||||||
1,1,1,1,1,1,1,1, # b8 - bf
|
1, 1, 1, 1, 1, 1, 1, 1, # b8 - bf
|
||||||
1,1,3,1,3,3,3,3, # c0 - c7
|
1, 1, 3, 1, 3, 3, 3, 3, # c0 - c7
|
||||||
3,3,3,3,3,3,3,3, # c8 - cf
|
3, 3, 3, 3, 3, 3, 3, 3, # c8 - cf
|
||||||
3,3,3,3,3,3,3,3, # d0 - d7
|
3, 3, 3, 3, 3, 3, 3, 3, # d0 - d7
|
||||||
3,3,3,3,3,3,3,3, # d8 - df
|
3, 3, 3, 3, 3, 3, 3, 3, # d8 - df
|
||||||
3,3,3,3,3,3,3,3, # e0 - e7
|
3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7
|
||||||
3,3,3,3,3,3,3,3, # e8 - ef
|
3, 3, 3, 3, 3, 3, 3, 3, # e8 - ef
|
||||||
3,3,3,3,3,3,3,3, # f0 - f7
|
3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7
|
||||||
3,3,3,3,3,3,3,0 # f8 - ff
|
3, 3, 3, 3, 3, 3, 3, 0 # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
EUCTW_ST = (
|
EUCTW_ST = (
|
||||||
|
@ -266,50 +337,53 @@ EUCTW_ST = (
|
||||||
5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27
|
5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27
|
||||||
MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
|
MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
|
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
|
||||||
|
|
||||||
EUCTW_SM_MODEL = {'class_table': EUCTW_CLS,
|
EUCTW_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 7,
|
"class_table": EUCTW_CLS,
|
||||||
'state_table': EUCTW_ST,
|
"class_factor": 7,
|
||||||
'char_len_table': EUCTW_CHAR_LEN_TABLE,
|
"state_table": EUCTW_ST,
|
||||||
'name': 'x-euc-tw'}
|
"char_len_table": EUCTW_CHAR_LEN_TABLE,
|
||||||
|
"name": "x-euc-tw",
|
||||||
|
}
|
||||||
|
|
||||||
# GB2312
|
# GB2312
|
||||||
|
# fmt: off
|
||||||
GB2312_CLS = (
|
GB2312_CLS = (
|
||||||
1,1,1,1,1,1,1,1, # 00 - 07
|
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07
|
||||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
|
||||||
1,1,1,1,1,1,1,1, # 10 - 17
|
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
|
||||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
|
||||||
1,1,1,1,1,1,1,1, # 20 - 27
|
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
|
||||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
|
||||||
3,3,3,3,3,3,3,3, # 30 - 37
|
3, 3, 3, 3, 3, 3, 3, 3, # 30 - 37
|
||||||
3,3,1,1,1,1,1,1, # 38 - 3f
|
3, 3, 1, 1, 1, 1, 1, 1, # 38 - 3f
|
||||||
2,2,2,2,2,2,2,2, # 40 - 47
|
2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47
|
||||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f
|
||||||
2,2,2,2,2,2,2,2, # 50 - 57
|
2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57
|
||||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f
|
||||||
2,2,2,2,2,2,2,2, # 60 - 67
|
2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67
|
||||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f
|
||||||
2,2,2,2,2,2,2,2, # 70 - 77
|
2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77
|
||||||
2,2,2,2,2,2,2,4, # 78 - 7f
|
2, 2, 2, 2, 2, 2, 2, 4, # 78 - 7f
|
||||||
5,6,6,6,6,6,6,6, # 80 - 87
|
5, 6, 6, 6, 6, 6, 6, 6, # 80 - 87
|
||||||
6,6,6,6,6,6,6,6, # 88 - 8f
|
6, 6, 6, 6, 6, 6, 6, 6, # 88 - 8f
|
||||||
6,6,6,6,6,6,6,6, # 90 - 97
|
6, 6, 6, 6, 6, 6, 6, 6, # 90 - 97
|
||||||
6,6,6,6,6,6,6,6, # 98 - 9f
|
6, 6, 6, 6, 6, 6, 6, 6, # 98 - 9f
|
||||||
6,6,6,6,6,6,6,6, # a0 - a7
|
6, 6, 6, 6, 6, 6, 6, 6, # a0 - a7
|
||||||
6,6,6,6,6,6,6,6, # a8 - af
|
6, 6, 6, 6, 6, 6, 6, 6, # a8 - af
|
||||||
6,6,6,6,6,6,6,6, # b0 - b7
|
6, 6, 6, 6, 6, 6, 6, 6, # b0 - b7
|
||||||
6,6,6,6,6,6,6,6, # b8 - bf
|
6, 6, 6, 6, 6, 6, 6, 6, # b8 - bf
|
||||||
6,6,6,6,6,6,6,6, # c0 - c7
|
6, 6, 6, 6, 6, 6, 6, 6, # c0 - c7
|
||||||
6,6,6,6,6,6,6,6, # c8 - cf
|
6, 6, 6, 6, 6, 6, 6, 6, # c8 - cf
|
||||||
6,6,6,6,6,6,6,6, # d0 - d7
|
6, 6, 6, 6, 6, 6, 6, 6, # d0 - d7
|
||||||
6,6,6,6,6,6,6,6, # d8 - df
|
6, 6, 6, 6, 6, 6, 6, 6, # d8 - df
|
||||||
6,6,6,6,6,6,6,6, # e0 - e7
|
6, 6, 6, 6, 6, 6, 6, 6, # e0 - e7
|
||||||
6,6,6,6,6,6,6,6, # e8 - ef
|
6, 6, 6, 6, 6, 6, 6, 6, # e8 - ef
|
||||||
6,6,6,6,6,6,6,6, # f0 - f7
|
6, 6, 6, 6, 6, 6, 6, 6, # f0 - f7
|
||||||
6,6,6,6,6,6,6,0 # f8 - ff
|
6, 6, 6, 6, 6, 6, 6, 0 # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
GB2312_ST = (
|
GB2312_ST = (
|
||||||
|
@ -320,6 +394,7 @@ GB2312_ST = (
|
||||||
MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27
|
MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
|
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
# To be accurate, the length of class 6 can be either 2 or 4.
|
# To be accurate, the length of class 6 can be either 2 or 4.
|
||||||
# But it is not necessary to discriminate between the two since
|
# But it is not necessary to discriminate between the two since
|
||||||
|
@ -328,100 +403,105 @@ GB2312_ST = (
|
||||||
# 2 here.
|
# 2 here.
|
||||||
GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)
|
GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)
|
||||||
|
|
||||||
GB2312_SM_MODEL = {'class_table': GB2312_CLS,
|
GB2312_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 7,
|
"class_table": GB2312_CLS,
|
||||||
'state_table': GB2312_ST,
|
"class_factor": 7,
|
||||||
'char_len_table': GB2312_CHAR_LEN_TABLE,
|
"state_table": GB2312_ST,
|
||||||
'name': 'GB2312'}
|
"char_len_table": GB2312_CHAR_LEN_TABLE,
|
||||||
|
"name": "GB2312",
|
||||||
|
}
|
||||||
|
|
||||||
# Shift_JIS
|
# Shift_JIS
|
||||||
|
# fmt: off
|
||||||
SJIS_CLS = (
|
SJIS_CLS = (
|
||||||
1,1,1,1,1,1,1,1, # 00 - 07
|
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07
|
||||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
|
||||||
1,1,1,1,1,1,1,1, # 10 - 17
|
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
|
||||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
|
||||||
1,1,1,1,1,1,1,1, # 20 - 27
|
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
|
||||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
|
||||||
1,1,1,1,1,1,1,1, # 30 - 37
|
1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37
|
||||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f
|
||||||
2,2,2,2,2,2,2,2, # 40 - 47
|
2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47
|
||||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f
|
||||||
2,2,2,2,2,2,2,2, # 50 - 57
|
2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57
|
||||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f
|
||||||
2,2,2,2,2,2,2,2, # 60 - 67
|
2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67
|
||||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f
|
||||||
2,2,2,2,2,2,2,2, # 70 - 77
|
2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77
|
||||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
2, 2, 2, 2, 2, 2, 2, 1, # 78 - 7f
|
||||||
3,3,3,3,3,2,2,3, # 80 - 87
|
3, 3, 3, 3, 3, 2, 2, 3, # 80 - 87
|
||||||
3,3,3,3,3,3,3,3, # 88 - 8f
|
3, 3, 3, 3, 3, 3, 3, 3, # 88 - 8f
|
||||||
3,3,3,3,3,3,3,3, # 90 - 97
|
3, 3, 3, 3, 3, 3, 3, 3, # 90 - 97
|
||||||
3,3,3,3,3,3,3,3, # 98 - 9f
|
3, 3, 3, 3, 3, 3, 3, 3, # 98 - 9f
|
||||||
#0xa0 is illegal in sjis encoding, but some pages does
|
#0xa0 is illegal in sjis encoding, but some pages does
|
||||||
#contain such byte. We need to be more error forgiven.
|
#contain such byte. We need to be more error forgiven.
|
||||||
2,2,2,2,2,2,2,2, # a0 - a7
|
2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
|
||||||
2,2,2,2,2,2,2,2, # a8 - af
|
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
|
||||||
2,2,2,2,2,2,2,2, # b0 - b7
|
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
|
||||||
2,2,2,2,2,2,2,2, # b8 - bf
|
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
|
||||||
2,2,2,2,2,2,2,2, # c0 - c7
|
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
|
||||||
2,2,2,2,2,2,2,2, # c8 - cf
|
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
|
||||||
2,2,2,2,2,2,2,2, # d0 - d7
|
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
|
||||||
2,2,2,2,2,2,2,2, # d8 - df
|
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
|
||||||
3,3,3,3,3,3,3,3, # e0 - e7
|
3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7
|
||||||
3,3,3,3,3,4,4,4, # e8 - ef
|
3, 3, 3, 3, 3, 4, 4, 4, # e8 - ef
|
||||||
3,3,3,3,3,3,3,3, # f0 - f7
|
3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7
|
||||||
3,3,3,3,3,0,0,0) # f8 - ff
|
3, 3, 3, 3, 3, 0, 0, 0, # f8 - ff
|
||||||
|
)
|
||||||
|
|
||||||
SJIS_ST = (
|
SJIS_ST = (
|
||||||
MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17
|
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
|
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
|
||||||
|
|
||||||
SJIS_SM_MODEL = {'class_table': SJIS_CLS,
|
SJIS_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 6,
|
"class_table": SJIS_CLS,
|
||||||
'state_table': SJIS_ST,
|
"class_factor": 6,
|
||||||
'char_len_table': SJIS_CHAR_LEN_TABLE,
|
"state_table": SJIS_ST,
|
||||||
'name': 'Shift_JIS'}
|
"char_len_table": SJIS_CHAR_LEN_TABLE,
|
||||||
|
"name": "Shift_JIS",
|
||||||
|
}
|
||||||
|
|
||||||
# UCS2-BE
|
# UCS2-BE
|
||||||
|
# fmt: off
|
||||||
UCS2BE_CLS = (
|
UCS2BE_CLS = (
|
||||||
0,0,0,0,0,0,0,0, # 00 - 07
|
0, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
|
||||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
0, 0, 1, 0, 0, 2, 0, 0, # 08 - 0f
|
||||||
0,0,0,0,0,0,0,0, # 10 - 17
|
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
|
||||||
0,0,0,3,0,0,0,0, # 18 - 1f
|
0, 0, 0, 3, 0, 0, 0, 0, # 18 - 1f
|
||||||
0,0,0,0,0,0,0,0, # 20 - 27
|
0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27
|
||||||
0,3,3,3,3,3,0,0, # 28 - 2f
|
0, 3, 3, 3, 3, 3, 0, 0, # 28 - 2f
|
||||||
0,0,0,0,0,0,0,0, # 30 - 37
|
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
|
||||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
|
||||||
0,0,0,0,0,0,0,0, # 40 - 47
|
0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47
|
||||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
|
||||||
0,0,0,0,0,0,0,0, # 50 - 57
|
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
|
||||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
|
||||||
0,0,0,0,0,0,0,0, # 60 - 67
|
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
|
||||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
|
||||||
0,0,0,0,0,0,0,0, # 70 - 77
|
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
|
||||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
|
||||||
0,0,0,0,0,0,0,0, # 80 - 87
|
0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87
|
||||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f
|
||||||
0,0,0,0,0,0,0,0, # 90 - 97
|
0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97
|
||||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f
|
||||||
0,0,0,0,0,0,0,0, # a0 - a7
|
0, 0, 0, 0, 0, 0, 0, 0, # a0 - a7
|
||||||
0,0,0,0,0,0,0,0, # a8 - af
|
0, 0, 0, 0, 0, 0, 0, 0, # a8 - af
|
||||||
0,0,0,0,0,0,0,0, # b0 - b7
|
0, 0, 0, 0, 0, 0, 0, 0, # b0 - b7
|
||||||
0,0,0,0,0,0,0,0, # b8 - bf
|
0, 0, 0, 0, 0, 0, 0, 0, # b8 - bf
|
||||||
0,0,0,0,0,0,0,0, # c0 - c7
|
0, 0, 0, 0, 0, 0, 0, 0, # c0 - c7
|
||||||
0,0,0,0,0,0,0,0, # c8 - cf
|
0, 0, 0, 0, 0, 0, 0, 0, # c8 - cf
|
||||||
0,0,0,0,0,0,0,0, # d0 - d7
|
0, 0, 0, 0, 0, 0, 0, 0, # d0 - d7
|
||||||
0,0,0,0,0,0,0,0, # d8 - df
|
0, 0, 0, 0, 0, 0, 0, 0, # d8 - df
|
||||||
0,0,0,0,0,0,0,0, # e0 - e7
|
0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7
|
||||||
0,0,0,0,0,0,0,0, # e8 - ef
|
0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef
|
||||||
0,0,0,0,0,0,0,0, # f0 - f7
|
0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7
|
||||||
0,0,0,0,0,0,4,5 # f8 - ff
|
0, 0, 0, 0, 0, 0, 4, 5 # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
UCS2BE_ST = (
|
UCS2BE_ST = (
|
||||||
|
@ -433,50 +513,53 @@ UCS2BE_ST = (
|
||||||
5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f
|
5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f
|
||||||
6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37
|
6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
|
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
|
||||||
|
|
||||||
UCS2BE_SM_MODEL = {'class_table': UCS2BE_CLS,
|
UCS2BE_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 6,
|
"class_table": UCS2BE_CLS,
|
||||||
'state_table': UCS2BE_ST,
|
"class_factor": 6,
|
||||||
'char_len_table': UCS2BE_CHAR_LEN_TABLE,
|
"state_table": UCS2BE_ST,
|
||||||
'name': 'UTF-16BE'}
|
"char_len_table": UCS2BE_CHAR_LEN_TABLE,
|
||||||
|
"name": "UTF-16BE",
|
||||||
|
}
|
||||||
|
|
||||||
# UCS2-LE
|
# UCS2-LE
|
||||||
|
# fmt: off
|
||||||
UCS2LE_CLS = (
|
UCS2LE_CLS = (
|
||||||
0,0,0,0,0,0,0,0, # 00 - 07
|
0, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
|
||||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
0, 0, 1, 0, 0, 2, 0, 0, # 08 - 0f
|
||||||
0,0,0,0,0,0,0,0, # 10 - 17
|
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
|
||||||
0,0,0,3,0,0,0,0, # 18 - 1f
|
0, 0, 0, 3, 0, 0, 0, 0, # 18 - 1f
|
||||||
0,0,0,0,0,0,0,0, # 20 - 27
|
0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27
|
||||||
0,3,3,3,3,3,0,0, # 28 - 2f
|
0, 3, 3, 3, 3, 3, 0, 0, # 28 - 2f
|
||||||
0,0,0,0,0,0,0,0, # 30 - 37
|
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
|
||||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
|
||||||
0,0,0,0,0,0,0,0, # 40 - 47
|
0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47
|
||||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
|
||||||
0,0,0,0,0,0,0,0, # 50 - 57
|
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
|
||||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
|
||||||
0,0,0,0,0,0,0,0, # 60 - 67
|
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
|
||||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
|
||||||
0,0,0,0,0,0,0,0, # 70 - 77
|
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
|
||||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
|
||||||
0,0,0,0,0,0,0,0, # 80 - 87
|
0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87
|
||||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f
|
||||||
0,0,0,0,0,0,0,0, # 90 - 97
|
0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97
|
||||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f
|
||||||
0,0,0,0,0,0,0,0, # a0 - a7
|
0, 0, 0, 0, 0, 0, 0, 0, # a0 - a7
|
||||||
0,0,0,0,0,0,0,0, # a8 - af
|
0, 0, 0, 0, 0, 0, 0, 0, # a8 - af
|
||||||
0,0,0,0,0,0,0,0, # b0 - b7
|
0, 0, 0, 0, 0, 0, 0, 0, # b0 - b7
|
||||||
0,0,0,0,0,0,0,0, # b8 - bf
|
0, 0, 0, 0, 0, 0, 0, 0, # b8 - bf
|
||||||
0,0,0,0,0,0,0,0, # c0 - c7
|
0, 0, 0, 0, 0, 0, 0, 0, # c0 - c7
|
||||||
0,0,0,0,0,0,0,0, # c8 - cf
|
0, 0, 0, 0, 0, 0, 0, 0, # c8 - cf
|
||||||
0,0,0,0,0,0,0,0, # d0 - d7
|
0, 0, 0, 0, 0, 0, 0, 0, # d0 - d7
|
||||||
0,0,0,0,0,0,0,0, # d8 - df
|
0, 0, 0, 0, 0, 0, 0, 0, # d8 - df
|
||||||
0,0,0,0,0,0,0,0, # e0 - e7
|
0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7
|
||||||
0,0,0,0,0,0,0,0, # e8 - ef
|
0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef
|
||||||
0,0,0,0,0,0,0,0, # f0 - f7
|
0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7
|
||||||
0,0,0,0,0,0,4,5 # f8 - ff
|
0, 0, 0, 0, 0, 0, 4, 5 # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
UCS2LE_ST = (
|
UCS2LE_ST = (
|
||||||
|
@ -488,50 +571,53 @@ UCS2LE_ST = (
|
||||||
5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f
|
5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f
|
||||||
5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37
|
5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
|
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
|
||||||
|
|
||||||
UCS2LE_SM_MODEL = {'class_table': UCS2LE_CLS,
|
UCS2LE_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 6,
|
"class_table": UCS2LE_CLS,
|
||||||
'state_table': UCS2LE_ST,
|
"class_factor": 6,
|
||||||
'char_len_table': UCS2LE_CHAR_LEN_TABLE,
|
"state_table": UCS2LE_ST,
|
||||||
'name': 'UTF-16LE'}
|
"char_len_table": UCS2LE_CHAR_LEN_TABLE,
|
||||||
|
"name": "UTF-16LE",
|
||||||
|
}
|
||||||
|
|
||||||
# UTF-8
|
# UTF-8
|
||||||
|
# fmt: off
|
||||||
UTF8_CLS = (
|
UTF8_CLS = (
|
||||||
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
|
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 #allow 0x00 as a legal value
|
||||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
|
||||||
1,1,1,1,1,1,1,1, # 10 - 17
|
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
|
||||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
|
||||||
1,1,1,1,1,1,1,1, # 20 - 27
|
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
|
||||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
|
||||||
1,1,1,1,1,1,1,1, # 30 - 37
|
1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37
|
||||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f
|
||||||
1,1,1,1,1,1,1,1, # 40 - 47
|
1, 1, 1, 1, 1, 1, 1, 1, # 40 - 47
|
||||||
1,1,1,1,1,1,1,1, # 48 - 4f
|
1, 1, 1, 1, 1, 1, 1, 1, # 48 - 4f
|
||||||
1,1,1,1,1,1,1,1, # 50 - 57
|
1, 1, 1, 1, 1, 1, 1, 1, # 50 - 57
|
||||||
1,1,1,1,1,1,1,1, # 58 - 5f
|
1, 1, 1, 1, 1, 1, 1, 1, # 58 - 5f
|
||||||
1,1,1,1,1,1,1,1, # 60 - 67
|
1, 1, 1, 1, 1, 1, 1, 1, # 60 - 67
|
||||||
1,1,1,1,1,1,1,1, # 68 - 6f
|
1, 1, 1, 1, 1, 1, 1, 1, # 68 - 6f
|
||||||
1,1,1,1,1,1,1,1, # 70 - 77
|
1, 1, 1, 1, 1, 1, 1, 1, # 70 - 77
|
||||||
1,1,1,1,1,1,1,1, # 78 - 7f
|
1, 1, 1, 1, 1, 1, 1, 1, # 78 - 7f
|
||||||
2,2,2,2,3,3,3,3, # 80 - 87
|
2, 2, 2, 2, 3, 3, 3, 3, # 80 - 87
|
||||||
4,4,4,4,4,4,4,4, # 88 - 8f
|
4, 4, 4, 4, 4, 4, 4, 4, # 88 - 8f
|
||||||
4,4,4,4,4,4,4,4, # 90 - 97
|
4, 4, 4, 4, 4, 4, 4, 4, # 90 - 97
|
||||||
4,4,4,4,4,4,4,4, # 98 - 9f
|
4, 4, 4, 4, 4, 4, 4, 4, # 98 - 9f
|
||||||
5,5,5,5,5,5,5,5, # a0 - a7
|
5, 5, 5, 5, 5, 5, 5, 5, # a0 - a7
|
||||||
5,5,5,5,5,5,5,5, # a8 - af
|
5, 5, 5, 5, 5, 5, 5, 5, # a8 - af
|
||||||
5,5,5,5,5,5,5,5, # b0 - b7
|
5, 5, 5, 5, 5, 5, 5, 5, # b0 - b7
|
||||||
5,5,5,5,5,5,5,5, # b8 - bf
|
5, 5, 5, 5, 5, 5, 5, 5, # b8 - bf
|
||||||
0,0,6,6,6,6,6,6, # c0 - c7
|
0, 0, 6, 6, 6, 6, 6, 6, # c0 - c7
|
||||||
6,6,6,6,6,6,6,6, # c8 - cf
|
6, 6, 6, 6, 6, 6, 6, 6, # c8 - cf
|
||||||
6,6,6,6,6,6,6,6, # d0 - d7
|
6, 6, 6, 6, 6, 6, 6, 6, # d0 - d7
|
||||||
6,6,6,6,6,6,6,6, # d8 - df
|
6, 6, 6, 6, 6, 6, 6, 6, # d8 - df
|
||||||
7,8,8,8,8,8,8,8, # e0 - e7
|
7, 8, 8, 8, 8, 8, 8, 8, # e0 - e7
|
||||||
8,8,8,8,8,9,8,8, # e8 - ef
|
8, 8, 8, 8, 8, 9, 8, 8, # e8 - ef
|
||||||
10,11,11,11,11,11,11,11, # f0 - f7
|
10, 11, 11, 11, 11, 11, 11, 11, # f0 - f7
|
||||||
12,13,13,13,14,15,0,0 # f8 - ff
|
12, 13, 13, 13, 14, 15, 0, 0 # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
UTF8_ST = (
|
UTF8_ST = (
|
||||||
|
@ -562,11 +648,14 @@ UTF8_ST = (
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7
|
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7
|
||||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf
|
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
|
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
|
||||||
|
|
||||||
UTF8_SM_MODEL = {'class_table': UTF8_CLS,
|
UTF8_SM_MODEL: CodingStateMachineDict = {
|
||||||
'class_factor': 16,
|
"class_table": UTF8_CLS,
|
||||||
'state_table': UTF8_ST,
|
"class_factor": 16,
|
||||||
'char_len_table': UTF8_CHAR_LEN_TABLE,
|
"state_table": UTF8_ST,
|
||||||
'name': 'UTF-8'}
|
"char_len_table": UTF8_CHAR_LEN_TABLE,
|
||||||
|
"name": "UTF-8",
|
||||||
|
}
|
||||||
|
|
|
@ -1,19 +1,17 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
"""
|
||||||
Metadata about languages used by our model training code for our
|
Metadata about languages used by our model training code for our
|
||||||
SingleByteCharSetProbers. Could be used for other things in the future.
|
SingleByteCharSetProbers. Could be used for other things in the future.
|
||||||
|
|
||||||
This code is based on the language metadata from the uchardet project.
|
This code is based on the language metadata from the uchardet project.
|
||||||
"""
|
"""
|
||||||
from __future__ import absolute_import, print_function
|
|
||||||
|
|
||||||
from string import ascii_letters
|
from string import ascii_letters
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
# TODO: Add Ukrainian (KOI8-U)
|
||||||
|
|
||||||
|
|
||||||
# TODO: Add Ukranian (KOI8-U)
|
class Language:
|
||||||
|
|
||||||
class Language(object):
|
|
||||||
"""Metadata about a language useful for training models
|
"""Metadata about a language useful for training models
|
||||||
|
|
||||||
:ivar name: The human name for the language, in English.
|
:ivar name: The human name for the language, in English.
|
||||||
|
@ -33,9 +31,17 @@ class Language(object):
|
||||||
Wikipedia for training data.
|
Wikipedia for training data.
|
||||||
:type wiki_start_pages: list of str
|
:type wiki_start_pages: list of str
|
||||||
"""
|
"""
|
||||||
def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
|
|
||||||
alphabet=None, wiki_start_pages=None):
|
def __init__(
|
||||||
super(Language, self).__init__()
|
self,
|
||||||
|
name: Optional[str] = None,
|
||||||
|
iso_code: Optional[str] = None,
|
||||||
|
use_ascii: bool = True,
|
||||||
|
charsets: Optional[List[str]] = None,
|
||||||
|
alphabet: Optional[str] = None,
|
||||||
|
wiki_start_pages: Optional[List[str]] = None,
|
||||||
|
) -> None:
|
||||||
|
super().__init__()
|
||||||
self.name = name
|
self.name = name
|
||||||
self.iso_code = iso_code
|
self.iso_code = iso_code
|
||||||
self.use_ascii = use_ascii
|
self.use_ascii = use_ascii
|
||||||
|
@ -46,246 +52,282 @@ class Language(object):
|
||||||
else:
|
else:
|
||||||
alphabet = ascii_letters
|
alphabet = ascii_letters
|
||||||
elif not alphabet:
|
elif not alphabet:
|
||||||
raise ValueError('Must supply alphabet if use_ascii is False')
|
raise ValueError("Must supply alphabet if use_ascii is False")
|
||||||
self.alphabet = ''.join(sorted(set(alphabet))) if alphabet else None
|
self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
|
||||||
self.wiki_start_pages = wiki_start_pages
|
self.wiki_start_pages = wiki_start_pages
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '{}({})'.format(self.__class__.__name__,
|
param_str = ", ".join(
|
||||||
', '.join('{}={!r}'.format(k, v)
|
f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
|
||||||
for k, v in self.__dict__.items()
|
)
|
||||||
if not k.startswith('_')))
|
return f"{self.__class__.__name__}({param_str})"
|
||||||
|
|
||||||
|
|
||||||
LANGUAGES = {'Arabic': Language(name='Arabic',
|
LANGUAGES = {
|
||||||
iso_code='ar',
|
"Arabic": Language(
|
||||||
|
name="Arabic",
|
||||||
|
iso_code="ar",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
# We only support encodings that use isolated
|
# We only support encodings that use isolated
|
||||||
# forms, because the current recommendation is
|
# forms, because the current recommendation is
|
||||||
# that the rendering system handles presentation
|
# that the rendering system handles presentation
|
||||||
# forms. This means we purposefully skip IBM864.
|
# forms. This means we purposefully skip IBM864.
|
||||||
charsets=['ISO-8859-6', 'WINDOWS-1256',
|
charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
|
||||||
'CP720', 'CP864'],
|
alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
|
||||||
alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
|
wiki_start_pages=["الصفحة_الرئيسية"],
|
||||||
wiki_start_pages=[u'الصفحة_الرئيسية']),
|
),
|
||||||
'Belarusian': Language(name='Belarusian',
|
"Belarusian": Language(
|
||||||
iso_code='be',
|
name="Belarusian",
|
||||||
|
iso_code="be",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
|
||||||
'IBM866', 'MacCyrillic'],
|
alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
|
||||||
alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
|
wiki_start_pages=["Галоўная_старонка"],
|
||||||
u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
|
),
|
||||||
wiki_start_pages=[u'Галоўная_старонка']),
|
"Bulgarian": Language(
|
||||||
'Bulgarian': Language(name='Bulgarian',
|
name="Bulgarian",
|
||||||
iso_code='bg',
|
iso_code="bg",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
|
||||||
'IBM855'],
|
alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
|
||||||
alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
|
wiki_start_pages=["Начална_страница"],
|
||||||
u'абвгдежзийклмнопрстуфхцчшщъьюя'),
|
),
|
||||||
wiki_start_pages=[u'Начална_страница']),
|
"Czech": Language(
|
||||||
'Czech': Language(name='Czech',
|
name="Czech",
|
||||||
iso_code='cz',
|
iso_code="cz",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||||
alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
|
alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
|
||||||
wiki_start_pages=[u'Hlavní_strana']),
|
wiki_start_pages=["Hlavní_strana"],
|
||||||
'Danish': Language(name='Danish',
|
),
|
||||||
iso_code='da',
|
"Danish": Language(
|
||||||
|
name="Danish",
|
||||||
|
iso_code="da",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
'WINDOWS-1252'],
|
alphabet="æøåÆØÅ",
|
||||||
alphabet=u'æøåÆØÅ',
|
wiki_start_pages=["Forside"],
|
||||||
wiki_start_pages=[u'Forside']),
|
),
|
||||||
'German': Language(name='German',
|
"German": Language(
|
||||||
iso_code='de',
|
name="German",
|
||||||
|
iso_code="de",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
alphabet=u'äöüßÄÖÜ',
|
alphabet="äöüßẞÄÖÜ",
|
||||||
wiki_start_pages=[u'Wikipedia:Hauptseite']),
|
wiki_start_pages=["Wikipedia:Hauptseite"],
|
||||||
'Greek': Language(name='Greek',
|
),
|
||||||
iso_code='el',
|
"Greek": Language(
|
||||||
|
name="Greek",
|
||||||
|
iso_code="el",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-7', 'WINDOWS-1253'],
|
charsets=["ISO-8859-7", "WINDOWS-1253"],
|
||||||
alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
|
alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
|
||||||
u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
|
wiki_start_pages=["Πύλη:Κύρια"],
|
||||||
wiki_start_pages=[u'Πύλη:Κύρια']),
|
),
|
||||||
'English': Language(name='English',
|
"English": Language(
|
||||||
iso_code='en',
|
name="English",
|
||||||
|
iso_code="en",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
|
||||||
wiki_start_pages=[u'Main_Page']),
|
wiki_start_pages=["Main_Page"],
|
||||||
'Esperanto': Language(name='Esperanto',
|
),
|
||||||
iso_code='eo',
|
"Esperanto": Language(
|
||||||
|
name="Esperanto",
|
||||||
|
iso_code="eo",
|
||||||
# Q, W, X, and Y not used at all
|
# Q, W, X, and Y not used at all
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-3'],
|
charsets=["ISO-8859-3"],
|
||||||
alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
|
alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
|
||||||
u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
|
wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
|
||||||
wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
|
),
|
||||||
'Spanish': Language(name='Spanish',
|
"Spanish": Language(
|
||||||
iso_code='es',
|
name="Spanish",
|
||||||
|
iso_code="es",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
'WINDOWS-1252'],
|
alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
|
||||||
alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
|
wiki_start_pages=["Wikipedia:Portada"],
|
||||||
wiki_start_pages=[u'Wikipedia:Portada']),
|
),
|
||||||
'Estonian': Language(name='Estonian',
|
"Estonian": Language(
|
||||||
iso_code='et',
|
name="Estonian",
|
||||||
|
iso_code="et",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-4', 'ISO-8859-13',
|
charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
|
||||||
'WINDOWS-1257'],
|
|
||||||
# C, F, Š, Q, W, X, Y, Z, Ž are only for
|
# C, F, Š, Q, W, X, Y, Z, Ž are only for
|
||||||
# loanwords
|
# loanwords
|
||||||
alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
|
alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
|
||||||
u'abdeghijklmnoprstuvõäöü'),
|
wiki_start_pages=["Esileht"],
|
||||||
wiki_start_pages=[u'Esileht']),
|
),
|
||||||
'Finnish': Language(name='Finnish',
|
"Finnish": Language(
|
||||||
iso_code='fi',
|
name="Finnish",
|
||||||
|
iso_code="fi",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
'WINDOWS-1252'],
|
alphabet="ÅÄÖŠŽåäöšž",
|
||||||
alphabet=u'ÅÄÖŠŽåäöšž',
|
wiki_start_pages=["Wikipedia:Etusivu"],
|
||||||
wiki_start_pages=[u'Wikipedia:Etusivu']),
|
),
|
||||||
'French': Language(name='French',
|
"French": Language(
|
||||||
iso_code='fr',
|
name="French",
|
||||||
|
iso_code="fr",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
'WINDOWS-1252'],
|
alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
|
||||||
alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
|
wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
|
||||||
wiki_start_pages=[u'Wikipédia:Accueil_principal',
|
),
|
||||||
u'Bœuf (animal)']),
|
"Hebrew": Language(
|
||||||
'Hebrew': Language(name='Hebrew',
|
name="Hebrew",
|
||||||
iso_code='he',
|
iso_code="he",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-8', 'WINDOWS-1255'],
|
charsets=["ISO-8859-8", "WINDOWS-1255"],
|
||||||
alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
|
alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
|
||||||
wiki_start_pages=[u'עמוד_ראשי']),
|
wiki_start_pages=["עמוד_ראשי"],
|
||||||
'Croatian': Language(name='Croatian',
|
),
|
||||||
iso_code='hr',
|
"Croatian": Language(
|
||||||
|
name="Croatian",
|
||||||
|
iso_code="hr",
|
||||||
# Q, W, X, Y are only used for foreign words.
|
# Q, W, X, Y are only used for foreign words.
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||||
alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
|
alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
|
||||||
u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
|
wiki_start_pages=["Glavna_stranica"],
|
||||||
wiki_start_pages=[u'Glavna_stranica']),
|
),
|
||||||
'Hungarian': Language(name='Hungarian',
|
"Hungarian": Language(
|
||||||
iso_code='hu',
|
name="Hungarian",
|
||||||
|
iso_code="hu",
|
||||||
# Q, W, X, Y are only used for foreign words.
|
# Q, W, X, Y are only used for foreign words.
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||||
alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
|
alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
|
||||||
u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
|
wiki_start_pages=["Kezdőlap"],
|
||||||
wiki_start_pages=[u'Kezdőlap']),
|
),
|
||||||
'Italian': Language(name='Italian',
|
"Italian": Language(
|
||||||
iso_code='it',
|
name="Italian",
|
||||||
|
iso_code="it",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
'WINDOWS-1252'],
|
alphabet="ÀÈÉÌÒÓÙàèéìòóù",
|
||||||
alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
|
wiki_start_pages=["Pagina_principale"],
|
||||||
wiki_start_pages=[u'Pagina_principale']),
|
),
|
||||||
'Lithuanian': Language(name='Lithuanian',
|
"Lithuanian": Language(
|
||||||
iso_code='lt',
|
name="Lithuanian",
|
||||||
|
iso_code="lt",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
|
||||||
'ISO-8859-4'],
|
|
||||||
# Q, W, and X not used at all
|
# Q, W, and X not used at all
|
||||||
alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
|
alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
|
||||||
u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
|
wiki_start_pages=["Pagrindinis_puslapis"],
|
||||||
wiki_start_pages=[u'Pagrindinis_puslapis']),
|
),
|
||||||
'Latvian': Language(name='Latvian',
|
"Latvian": Language(
|
||||||
iso_code='lv',
|
name="Latvian",
|
||||||
|
iso_code="lv",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
|
||||||
'ISO-8859-4'],
|
|
||||||
# Q, W, X, Y are only for loanwords
|
# Q, W, X, Y are only for loanwords
|
||||||
alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
|
alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
|
||||||
u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
|
wiki_start_pages=["Sākumlapa"],
|
||||||
wiki_start_pages=[u'Sākumlapa']),
|
),
|
||||||
'Macedonian': Language(name='Macedonian',
|
"Macedonian": Language(
|
||||||
iso_code='mk',
|
name="Macedonian",
|
||||||
|
iso_code="mk",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
|
||||||
'MacCyrillic', 'IBM855'],
|
alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
|
||||||
alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
|
wiki_start_pages=["Главна_страница"],
|
||||||
u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
|
),
|
||||||
wiki_start_pages=[u'Главна_страница']),
|
"Dutch": Language(
|
||||||
'Dutch': Language(name='Dutch',
|
name="Dutch",
|
||||||
iso_code='nl',
|
iso_code="nl",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
|
||||||
wiki_start_pages=[u'Hoofdpagina']),
|
wiki_start_pages=["Hoofdpagina"],
|
||||||
'Polish': Language(name='Polish',
|
),
|
||||||
iso_code='pl',
|
"Polish": Language(
|
||||||
|
name="Polish",
|
||||||
|
iso_code="pl",
|
||||||
# Q and X are only used for foreign words.
|
# Q and X are only used for foreign words.
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||||
alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
|
alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
|
||||||
u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
|
wiki_start_pages=["Wikipedia:Strona_główna"],
|
||||||
wiki_start_pages=[u'Wikipedia:Strona_główna']),
|
),
|
||||||
'Portuguese': Language(name='Portuguese',
|
"Portuguese": Language(
|
||||||
iso_code='pt',
|
name="Portuguese",
|
||||||
|
iso_code="pt",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
'WINDOWS-1252'],
|
alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
|
||||||
alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
|
wiki_start_pages=["Wikipédia:Página_principal"],
|
||||||
wiki_start_pages=[u'Wikipédia:Página_principal']),
|
),
|
||||||
'Romanian': Language(name='Romanian',
|
"Romanian": Language(
|
||||||
iso_code='ro',
|
name="Romanian",
|
||||||
|
iso_code="ro",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||||
alphabet=u'ăâîșțĂÂÎȘȚ',
|
alphabet="ăâîșțĂÂÎȘȚ",
|
||||||
wiki_start_pages=[u'Pagina_principală']),
|
wiki_start_pages=["Pagina_principală"],
|
||||||
'Russian': Language(name='Russian',
|
),
|
||||||
iso_code='ru',
|
"Russian": Language(
|
||||||
|
name="Russian",
|
||||||
|
iso_code="ru",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
charsets=[
|
||||||
'KOI8-R', 'MacCyrillic', 'IBM866',
|
"ISO-8859-5",
|
||||||
'IBM855'],
|
"WINDOWS-1251",
|
||||||
alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
|
"KOI8-R",
|
||||||
u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
|
"MacCyrillic",
|
||||||
wiki_start_pages=[u'Заглавная_страница']),
|
"IBM866",
|
||||||
'Slovak': Language(name='Slovak',
|
"IBM855",
|
||||||
iso_code='sk',
|
],
|
||||||
|
alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
|
||||||
|
wiki_start_pages=["Заглавная_страница"],
|
||||||
|
),
|
||||||
|
"Slovak": Language(
|
||||||
|
name="Slovak",
|
||||||
|
iso_code="sk",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||||
alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
|
alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
|
||||||
wiki_start_pages=[u'Hlavná_stránka']),
|
wiki_start_pages=["Hlavná_stránka"],
|
||||||
'Slovene': Language(name='Slovene',
|
),
|
||||||
iso_code='sl',
|
"Slovene": Language(
|
||||||
|
name="Slovene",
|
||||||
|
iso_code="sl",
|
||||||
# Q, W, X, Y are only used for foreign words.
|
# Q, W, X, Y are only used for foreign words.
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||||
alphabet=(u'abcčdefghijklmnoprsštuvzž'
|
alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
|
||||||
u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
|
wiki_start_pages=["Glavna_stran"],
|
||||||
wiki_start_pages=[u'Glavna_stran']),
|
),
|
||||||
# Serbian can be written in both Latin and Cyrillic, but there's no
|
# Serbian can be written in both Latin and Cyrillic, but there's no
|
||||||
# simple way to get the Latin alphabet pages from Wikipedia through
|
# simple way to get the Latin alphabet pages from Wikipedia through
|
||||||
# the API, so for now we just support Cyrillic.
|
# the API, so for now we just support Cyrillic.
|
||||||
'Serbian': Language(name='Serbian',
|
"Serbian": Language(
|
||||||
iso_code='sr',
|
name="Serbian",
|
||||||
alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
|
iso_code="sr",
|
||||||
u'абвгдђежзијклљмнњопрстћуфхцчџш'),
|
alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
|
||||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
|
||||||
'MacCyrillic', 'IBM855'],
|
wiki_start_pages=["Главна_страна"],
|
||||||
wiki_start_pages=[u'Главна_страна']),
|
),
|
||||||
'Thai': Language(name='Thai',
|
"Thai": Language(
|
||||||
iso_code='th',
|
name="Thai",
|
||||||
|
iso_code="th",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
|
charsets=["ISO-8859-11", "TIS-620", "CP874"],
|
||||||
alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
|
alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
|
||||||
wiki_start_pages=[u'หน้าหลัก']),
|
wiki_start_pages=["หน้าหลัก"],
|
||||||
'Turkish': Language(name='Turkish',
|
),
|
||||||
iso_code='tr',
|
"Turkish": Language(
|
||||||
|
name="Turkish",
|
||||||
|
iso_code="tr",
|
||||||
# Q, W, and X are not used by Turkish
|
# Q, W, and X are not used by Turkish
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-3', 'ISO-8859-9',
|
charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
|
||||||
'WINDOWS-1254'],
|
alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
|
||||||
alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
|
wiki_start_pages=["Ana_Sayfa"],
|
||||||
u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
|
),
|
||||||
wiki_start_pages=[u'Ana_Sayfa']),
|
"Vietnamese": Language(
|
||||||
'Vietnamese': Language(name='Vietnamese',
|
name="Vietnamese",
|
||||||
iso_code='vi',
|
iso_code="vi",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
# Windows-1258 is the only common 8-bit
|
# Windows-1258 is the only common 8-bit
|
||||||
# Vietnamese encoding supported by Python.
|
# Vietnamese encoding supported by Python.
|
||||||
|
@ -303,8 +345,8 @@ LANGUAGES = {'Arabic': Language(name='Arabic',
|
||||||
# scheme has declined dramatically following
|
# scheme has declined dramatically following
|
||||||
# the adoption of Unicode on the World Wide
|
# the adoption of Unicode on the World Wide
|
||||||
# Web.
|
# Web.
|
||||||
charsets=['WINDOWS-1258'],
|
charsets=["WINDOWS-1258"],
|
||||||
alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
|
alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
|
||||||
u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
|
wiki_start_pages=["Chữ_Quốc_ngữ"],
|
||||||
wiki_start_pages=[u'Chữ_Quốc_ngữ']),
|
),
|
||||||
}
|
}
|
||||||
|
|
16
lib/chardet/resultdict.py
Normal file
16
lib/chardet/resultdict.py
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
from typing import TYPE_CHECKING, Optional
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
# TypedDict was introduced in Python 3.8.
|
||||||
|
#
|
||||||
|
# TODO: Remove the else block and TYPE_CHECKING check when dropping support
|
||||||
|
# for Python 3.7.
|
||||||
|
from typing import TypedDict
|
||||||
|
|
||||||
|
class ResultDict(TypedDict):
|
||||||
|
encoding: Optional[str]
|
||||||
|
confidence: float
|
||||||
|
language: Optional[str]
|
||||||
|
|
||||||
|
else:
|
||||||
|
ResultDict = dict
|
|
@ -26,20 +26,20 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from collections import namedtuple
|
from typing import Dict, List, NamedTuple, Optional, Union
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
|
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
|
||||||
|
|
||||||
|
|
||||||
SingleByteCharSetModel = namedtuple('SingleByteCharSetModel',
|
class SingleByteCharSetModel(NamedTuple):
|
||||||
['charset_name',
|
charset_name: str
|
||||||
'language',
|
language: str
|
||||||
'char_to_order_map',
|
char_to_order_map: Dict[int, int]
|
||||||
'language_model',
|
language_model: Dict[int, Dict[int, int]]
|
||||||
'typical_positive_ratio',
|
typical_positive_ratio: float
|
||||||
'keep_ascii_letters',
|
keep_ascii_letters: bool
|
||||||
'alphabet'])
|
alphabet: str
|
||||||
|
|
||||||
|
|
||||||
class SingleByteCharSetProber(CharSetProber):
|
class SingleByteCharSetProber(CharSetProber):
|
||||||
|
@ -48,48 +48,55 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
POSITIVE_SHORTCUT_THRESHOLD = 0.95
|
POSITIVE_SHORTCUT_THRESHOLD = 0.95
|
||||||
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
|
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
|
||||||
|
|
||||||
def __init__(self, model, reversed=False, name_prober=None):
|
def __init__(
|
||||||
super(SingleByteCharSetProber, self).__init__()
|
self,
|
||||||
|
model: SingleByteCharSetModel,
|
||||||
|
is_reversed: bool = False,
|
||||||
|
name_prober: Optional[CharSetProber] = None,
|
||||||
|
) -> None:
|
||||||
|
super().__init__()
|
||||||
self._model = model
|
self._model = model
|
||||||
# TRUE if we need to reverse every pair in the model lookup
|
# TRUE if we need to reverse every pair in the model lookup
|
||||||
self._reversed = reversed
|
self._reversed = is_reversed
|
||||||
# Optional auxiliary prober for name decision
|
# Optional auxiliary prober for name decision
|
||||||
self._name_prober = name_prober
|
self._name_prober = name_prober
|
||||||
self._last_order = None
|
self._last_order = 255
|
||||||
self._seq_counters = None
|
self._seq_counters: List[int] = []
|
||||||
self._total_seqs = None
|
self._total_seqs = 0
|
||||||
self._total_char = None
|
self._total_char = 0
|
||||||
self._freq_char = None
|
self._control_char = 0
|
||||||
|
self._freq_char = 0
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super(SingleByteCharSetProber, self).reset()
|
super().reset()
|
||||||
# char order of last character
|
# char order of last character
|
||||||
self._last_order = 255
|
self._last_order = 255
|
||||||
self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
|
self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
|
||||||
self._total_seqs = 0
|
self._total_seqs = 0
|
||||||
self._total_char = 0
|
self._total_char = 0
|
||||||
|
self._control_char = 0
|
||||||
# characters that fall in our sampling range
|
# characters that fall in our sampling range
|
||||||
self._freq_char = 0
|
self._freq_char = 0
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> Optional[str]:
|
||||||
if self._name_prober:
|
if self._name_prober:
|
||||||
return self._name_prober.charset_name
|
return self._name_prober.charset_name
|
||||||
else:
|
|
||||||
return self._model.charset_name
|
return self._model.charset_name
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> Optional[str]:
|
||||||
if self._name_prober:
|
if self._name_prober:
|
||||||
return self._name_prober.language
|
return self._name_prober.language
|
||||||
else:
|
|
||||||
return self._model.language
|
return self._model.language
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
# TODO: Make filter_international_words keep things in self.alphabet
|
# TODO: Make filter_international_words keep things in self.alphabet
|
||||||
if not self._model.keep_ascii_letters:
|
if not self._model.keep_ascii_letters:
|
||||||
byte_str = self.filter_international_words(byte_str)
|
byte_str = self.filter_international_words(byte_str)
|
||||||
|
else:
|
||||||
|
byte_str = self.remove_xml_tags(byte_str)
|
||||||
if not byte_str:
|
if not byte_str:
|
||||||
return self.state
|
return self.state
|
||||||
char_to_order_map = self._model.char_to_order_map
|
char_to_order_map = self._model.char_to_order_map
|
||||||
|
@ -103,9 +110,6 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
# _total_char purposes.
|
# _total_char purposes.
|
||||||
if order < CharacterCategory.CONTROL:
|
if order < CharacterCategory.CONTROL:
|
||||||
self._total_char += 1
|
self._total_char += 1
|
||||||
# TODO: Follow uchardet's lead and discount confidence for frequent
|
|
||||||
# control characters.
|
|
||||||
# See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
|
|
||||||
if order < self.SAMPLE_SIZE:
|
if order < self.SAMPLE_SIZE:
|
||||||
self._freq_char += 1
|
self._freq_char += 1
|
||||||
if self._last_order < self.SAMPLE_SIZE:
|
if self._last_order < self.SAMPLE_SIZE:
|
||||||
|
@ -122,23 +126,36 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
|
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
|
||||||
confidence = self.get_confidence()
|
confidence = self.get_confidence()
|
||||||
if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
|
if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
|
||||||
self.logger.debug('%s confidence = %s, we have a winner',
|
self.logger.debug(
|
||||||
charset_name, confidence)
|
"%s confidence = %s, we have a winner", charset_name, confidence
|
||||||
|
)
|
||||||
self._state = ProbingState.FOUND_IT
|
self._state = ProbingState.FOUND_IT
|
||||||
elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
|
elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
|
||||||
self.logger.debug('%s confidence = %s, below negative '
|
self.logger.debug(
|
||||||
'shortcut threshhold %s', charset_name,
|
"%s confidence = %s, below negative shortcut threshold %s",
|
||||||
|
charset_name,
|
||||||
confidence,
|
confidence,
|
||||||
self.NEGATIVE_SHORTCUT_THRESHOLD)
|
self.NEGATIVE_SHORTCUT_THRESHOLD,
|
||||||
|
)
|
||||||
self._state = ProbingState.NOT_ME
|
self._state = ProbingState.NOT_ME
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
r = 0.01
|
r = 0.01
|
||||||
if self._total_seqs > 0:
|
if self._total_seqs > 0:
|
||||||
r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
|
r = (
|
||||||
self._total_seqs / self._model.typical_positive_ratio)
|
(
|
||||||
|
self._seq_counters[SequenceLikelihood.POSITIVE]
|
||||||
|
+ 0.25 * self._seq_counters[SequenceLikelihood.LIKELY]
|
||||||
|
)
|
||||||
|
/ self._total_seqs
|
||||||
|
/ self._model.typical_positive_ratio
|
||||||
|
)
|
||||||
|
# The more control characters (proportionally to the size
|
||||||
|
# of the text), the less confident we become in the current
|
||||||
|
# charset.
|
||||||
|
r = r * (self._total_char - self._control_char) / self._total_char
|
||||||
r = r * self._freq_char / self._total_char
|
r = r * self._freq_char / self._total_char
|
||||||
if r >= 1.0:
|
if r >= 1.0:
|
||||||
r = 0.99
|
r = 0.99
|
||||||
|
|
|
@ -28,33 +28,38 @@
|
||||||
|
|
||||||
from .charsetgroupprober import CharSetGroupProber
|
from .charsetgroupprober import CharSetGroupProber
|
||||||
from .hebrewprober import HebrewProber
|
from .hebrewprober import HebrewProber
|
||||||
from .langbulgarianmodel import (ISO_8859_5_BULGARIAN_MODEL,
|
from .langbulgarianmodel import ISO_8859_5_BULGARIAN_MODEL, WINDOWS_1251_BULGARIAN_MODEL
|
||||||
WINDOWS_1251_BULGARIAN_MODEL)
|
|
||||||
from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL
|
from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL
|
||||||
from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
|
from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
|
||||||
|
|
||||||
# from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL,
|
# from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL,
|
||||||
# WINDOWS_1250_HUNGARIAN_MODEL)
|
# WINDOWS_1250_HUNGARIAN_MODEL)
|
||||||
from .langrussianmodel import (IBM855_RUSSIAN_MODEL, IBM866_RUSSIAN_MODEL,
|
from .langrussianmodel import (
|
||||||
ISO_8859_5_RUSSIAN_MODEL, KOI8_R_RUSSIAN_MODEL,
|
IBM855_RUSSIAN_MODEL,
|
||||||
|
IBM866_RUSSIAN_MODEL,
|
||||||
|
ISO_8859_5_RUSSIAN_MODEL,
|
||||||
|
KOI8_R_RUSSIAN_MODEL,
|
||||||
MACCYRILLIC_RUSSIAN_MODEL,
|
MACCYRILLIC_RUSSIAN_MODEL,
|
||||||
WINDOWS_1251_RUSSIAN_MODEL)
|
WINDOWS_1251_RUSSIAN_MODEL,
|
||||||
|
)
|
||||||
from .langthaimodel import TIS_620_THAI_MODEL
|
from .langthaimodel import TIS_620_THAI_MODEL
|
||||||
from .langturkishmodel import ISO_8859_9_TURKISH_MODEL
|
from .langturkishmodel import ISO_8859_9_TURKISH_MODEL
|
||||||
from .sbcharsetprober import SingleByteCharSetProber
|
from .sbcharsetprober import SingleByteCharSetProber
|
||||||
|
|
||||||
|
|
||||||
class SBCSGroupProber(CharSetGroupProber):
|
class SBCSGroupProber(CharSetGroupProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(SBCSGroupProber, self).__init__()
|
super().__init__()
|
||||||
hebrew_prober = HebrewProber()
|
hebrew_prober = HebrewProber()
|
||||||
logical_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
|
logical_hebrew_prober = SingleByteCharSetProber(
|
||||||
False, hebrew_prober)
|
WINDOWS_1255_HEBREW_MODEL, is_reversed=False, name_prober=hebrew_prober
|
||||||
|
)
|
||||||
# TODO: See if using ISO-8859-8 Hebrew model works better here, since
|
# TODO: See if using ISO-8859-8 Hebrew model works better here, since
|
||||||
# it's actually the visual one
|
# it's actually the visual one
|
||||||
visual_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
|
visual_hebrew_prober = SingleByteCharSetProber(
|
||||||
True, hebrew_prober)
|
WINDOWS_1255_HEBREW_MODEL, is_reversed=True, name_prober=hebrew_prober
|
||||||
hebrew_prober.set_model_probers(logical_hebrew_prober,
|
)
|
||||||
visual_hebrew_prober)
|
hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)
|
||||||
# TODO: ORDER MATTERS HERE. I changed the order vs what was in master
|
# TODO: ORDER MATTERS HERE. I changed the order vs what was in master
|
||||||
# and several tests failed that did not before. Some thought
|
# and several tests failed that did not before. Some thought
|
||||||
# should be put into the ordering, and we should consider making
|
# should be put into the ordering, and we should consider making
|
||||||
|
|
|
@ -25,68 +25,81 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .mbcharsetprober import MultiByteCharSetProber
|
from typing import Union
|
||||||
from .codingstatemachine import CodingStateMachine
|
|
||||||
from .chardistribution import SJISDistributionAnalysis
|
from .chardistribution import SJISDistributionAnalysis
|
||||||
|
from .codingstatemachine import CodingStateMachine
|
||||||
|
from .enums import MachineState, ProbingState
|
||||||
from .jpcntx import SJISContextAnalysis
|
from .jpcntx import SJISContextAnalysis
|
||||||
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
from .mbcssm import SJIS_SM_MODEL
|
from .mbcssm import SJIS_SM_MODEL
|
||||||
from .enums import ProbingState, MachineState
|
|
||||||
|
|
||||||
|
|
||||||
class SJISProber(MultiByteCharSetProber):
|
class SJISProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(SJISProber, self).__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
|
self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
|
||||||
self.distribution_analyzer = SJISDistributionAnalysis()
|
self.distribution_analyzer = SJISDistributionAnalysis()
|
||||||
self.context_analyzer = SJISContextAnalysis()
|
self.context_analyzer = SJISContextAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super(SJISProber, self).reset()
|
super().reset()
|
||||||
self.context_analyzer.reset()
|
self.context_analyzer.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return self.context_analyzer.charset_name
|
return self.context_analyzer.charset_name
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Japanese"
|
return "Japanese"
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
for i in range(len(byte_str)):
|
assert self.coding_sm is not None
|
||||||
coding_state = self.coding_sm.next_state(byte_str[i])
|
assert self.distribution_analyzer is not None
|
||||||
|
|
||||||
|
for i, byte in enumerate(byte_str):
|
||||||
|
coding_state = self.coding_sm.next_state(byte)
|
||||||
if coding_state == MachineState.ERROR:
|
if coding_state == MachineState.ERROR:
|
||||||
self.logger.debug('%s %s prober hit error at byte %s',
|
self.logger.debug(
|
||||||
self.charset_name, self.language, i)
|
"%s %s prober hit error at byte %s",
|
||||||
|
self.charset_name,
|
||||||
|
self.language,
|
||||||
|
i,
|
||||||
|
)
|
||||||
self._state = ProbingState.NOT_ME
|
self._state = ProbingState.NOT_ME
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.ITS_ME:
|
if coding_state == MachineState.ITS_ME:
|
||||||
self._state = ProbingState.FOUND_IT
|
self._state = ProbingState.FOUND_IT
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.START:
|
if coding_state == MachineState.START:
|
||||||
char_len = self.coding_sm.get_current_charlen()
|
char_len = self.coding_sm.get_current_charlen()
|
||||||
if i == 0:
|
if i == 0:
|
||||||
self._last_char[1] = byte_str[0]
|
self._last_char[1] = byte
|
||||||
self.context_analyzer.feed(self._last_char[2 - char_len:],
|
self.context_analyzer.feed(
|
||||||
char_len)
|
self._last_char[2 - char_len :], char_len
|
||||||
|
)
|
||||||
self.distribution_analyzer.feed(self._last_char, char_len)
|
self.distribution_analyzer.feed(self._last_char, char_len)
|
||||||
else:
|
else:
|
||||||
self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3
|
self.context_analyzer.feed(
|
||||||
- char_len], char_len)
|
byte_str[i + 1 - char_len : i + 3 - char_len], char_len
|
||||||
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
)
|
||||||
char_len)
|
self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
|
||||||
|
|
||||||
self._last_char[0] = byte_str[-1]
|
self._last_char[0] = byte_str[-1]
|
||||||
|
|
||||||
if self.state == ProbingState.DETECTING:
|
if self.state == ProbingState.DETECTING:
|
||||||
if (self.context_analyzer.got_enough_data() and
|
if self.context_analyzer.got_enough_data() and (
|
||||||
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
self.get_confidence() > self.SHORTCUT_THRESHOLD
|
||||||
|
):
|
||||||
self._state = ProbingState.FOUND_IT
|
self._state = ProbingState.FOUND_IT
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
|
assert self.distribution_analyzer is not None
|
||||||
|
|
||||||
context_conf = self.context_analyzer.get_confidence()
|
context_conf = self.context_analyzer.get_confidence()
|
||||||
distrib_conf = self.distribution_analyzer.get_confidence()
|
distrib_conf = self.distribution_analyzer.get_confidence()
|
||||||
return max(context_conf, distrib_conf)
|
return max(context_conf, distrib_conf)
|
||||||
|
|
|
@ -39,16 +39,21 @@ class a user of ``chardet`` should use.
|
||||||
import codecs
|
import codecs
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from .charsetgroupprober import CharSetGroupProber
|
from .charsetgroupprober import CharSetGroupProber
|
||||||
|
from .charsetprober import CharSetProber
|
||||||
from .enums import InputState, LanguageFilter, ProbingState
|
from .enums import InputState, LanguageFilter, ProbingState
|
||||||
from .escprober import EscCharSetProber
|
from .escprober import EscCharSetProber
|
||||||
from .latin1prober import Latin1Prober
|
from .latin1prober import Latin1Prober
|
||||||
|
from .macromanprober import MacRomanProber
|
||||||
from .mbcsgroupprober import MBCSGroupProber
|
from .mbcsgroupprober import MBCSGroupProber
|
||||||
|
from .resultdict import ResultDict
|
||||||
from .sbcsgroupprober import SBCSGroupProber
|
from .sbcsgroupprober import SBCSGroupProber
|
||||||
|
from .utf1632prober import UTF1632Prober
|
||||||
|
|
||||||
|
|
||||||
class UniversalDetector(object):
|
class UniversalDetector:
|
||||||
"""
|
"""
|
||||||
The ``UniversalDetector`` class underlies the ``chardet.detect`` function
|
The ``UniversalDetector`` class underlies the ``chardet.detect`` function
|
||||||
and coordinates all of the different charset probers.
|
and coordinates all of the different charset probers.
|
||||||
|
@ -66,49 +71,87 @@ class UniversalDetector(object):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
MINIMUM_THRESHOLD = 0.20
|
MINIMUM_THRESHOLD = 0.20
|
||||||
HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
|
HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xFF]")
|
||||||
ESC_DETECTOR = re.compile(b'(\033|~{)')
|
ESC_DETECTOR = re.compile(b"(\033|~{)")
|
||||||
WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
|
WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9F]")
|
||||||
ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
|
ISO_WIN_MAP = {
|
||||||
'iso-8859-2': 'Windows-1250',
|
"iso-8859-1": "Windows-1252",
|
||||||
'iso-8859-5': 'Windows-1251',
|
"iso-8859-2": "Windows-1250",
|
||||||
'iso-8859-6': 'Windows-1256',
|
"iso-8859-5": "Windows-1251",
|
||||||
'iso-8859-7': 'Windows-1253',
|
"iso-8859-6": "Windows-1256",
|
||||||
'iso-8859-8': 'Windows-1255',
|
"iso-8859-7": "Windows-1253",
|
||||||
'iso-8859-9': 'Windows-1254',
|
"iso-8859-8": "Windows-1255",
|
||||||
'iso-8859-13': 'Windows-1257'}
|
"iso-8859-9": "Windows-1254",
|
||||||
|
"iso-8859-13": "Windows-1257",
|
||||||
|
}
|
||||||
|
# Based on https://encoding.spec.whatwg.org/#names-and-labels
|
||||||
|
# but altered to match Python names for encodings and remove mappings
|
||||||
|
# that break tests.
|
||||||
|
LEGACY_MAP = {
|
||||||
|
"ascii": "Windows-1252",
|
||||||
|
"iso-8859-1": "Windows-1252",
|
||||||
|
"tis-620": "ISO-8859-11",
|
||||||
|
"iso-8859-9": "Windows-1254",
|
||||||
|
"gb2312": "GB18030",
|
||||||
|
"euc-kr": "CP949",
|
||||||
|
"utf-16le": "UTF-16",
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self, lang_filter=LanguageFilter.ALL):
|
def __init__(
|
||||||
self._esc_charset_prober = None
|
self,
|
||||||
self._charset_probers = []
|
lang_filter: LanguageFilter = LanguageFilter.ALL,
|
||||||
self.result = None
|
should_rename_legacy: bool = False,
|
||||||
self.done = None
|
) -> None:
|
||||||
self._got_data = None
|
self._esc_charset_prober: Optional[EscCharSetProber] = None
|
||||||
self._input_state = None
|
self._utf1632_prober: Optional[UTF1632Prober] = None
|
||||||
self._last_char = None
|
self._charset_probers: List[CharSetProber] = []
|
||||||
|
self.result: ResultDict = {
|
||||||
|
"encoding": None,
|
||||||
|
"confidence": 0.0,
|
||||||
|
"language": None,
|
||||||
|
}
|
||||||
|
self.done = False
|
||||||
|
self._got_data = False
|
||||||
|
self._input_state = InputState.PURE_ASCII
|
||||||
|
self._last_char = b""
|
||||||
self.lang_filter = lang_filter
|
self.lang_filter = lang_filter
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
self._has_win_bytes = None
|
self._has_win_bytes = False
|
||||||
|
self.should_rename_legacy = should_rename_legacy
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
@property
|
||||||
|
def input_state(self) -> int:
|
||||||
|
return self._input_state
|
||||||
|
|
||||||
|
@property
|
||||||
|
def has_win_bytes(self) -> bool:
|
||||||
|
return self._has_win_bytes
|
||||||
|
|
||||||
|
@property
|
||||||
|
def charset_probers(self) -> List[CharSetProber]:
|
||||||
|
return self._charset_probers
|
||||||
|
|
||||||
|
def reset(self) -> None:
|
||||||
"""
|
"""
|
||||||
Reset the UniversalDetector and all of its probers back to their
|
Reset the UniversalDetector and all of its probers back to their
|
||||||
initial states. This is called by ``__init__``, so you only need to
|
initial states. This is called by ``__init__``, so you only need to
|
||||||
call this directly in between analyses of different documents.
|
call this directly in between analyses of different documents.
|
||||||
"""
|
"""
|
||||||
self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
|
self.result = {"encoding": None, "confidence": 0.0, "language": None}
|
||||||
self.done = False
|
self.done = False
|
||||||
self._got_data = False
|
self._got_data = False
|
||||||
self._has_win_bytes = False
|
self._has_win_bytes = False
|
||||||
self._input_state = InputState.PURE_ASCII
|
self._input_state = InputState.PURE_ASCII
|
||||||
self._last_char = b''
|
self._last_char = b""
|
||||||
if self._esc_charset_prober:
|
if self._esc_charset_prober:
|
||||||
self._esc_charset_prober.reset()
|
self._esc_charset_prober.reset()
|
||||||
|
if self._utf1632_prober:
|
||||||
|
self._utf1632_prober.reset()
|
||||||
for prober in self._charset_probers:
|
for prober in self._charset_probers:
|
||||||
prober.reset()
|
prober.reset()
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> None:
|
||||||
"""
|
"""
|
||||||
Takes a chunk of a document and feeds it through all of the relevant
|
Takes a chunk of a document and feeds it through all of the relevant
|
||||||
charset probers.
|
charset probers.
|
||||||
|
@ -125,7 +168,7 @@ class UniversalDetector(object):
|
||||||
if self.done:
|
if self.done:
|
||||||
return
|
return
|
||||||
|
|
||||||
if not len(byte_str):
|
if not byte_str:
|
||||||
return
|
return
|
||||||
|
|
||||||
if not isinstance(byte_str, bytearray):
|
if not isinstance(byte_str, bytearray):
|
||||||
|
@ -136,35 +179,38 @@ class UniversalDetector(object):
|
||||||
# If the data starts with BOM, we know it is UTF
|
# If the data starts with BOM, we know it is UTF
|
||||||
if byte_str.startswith(codecs.BOM_UTF8):
|
if byte_str.startswith(codecs.BOM_UTF8):
|
||||||
# EF BB BF UTF-8 with BOM
|
# EF BB BF UTF-8 with BOM
|
||||||
self.result = {'encoding': "UTF-8-SIG",
|
self.result = {
|
||||||
'confidence': 1.0,
|
"encoding": "UTF-8-SIG",
|
||||||
'language': ''}
|
"confidence": 1.0,
|
||||||
elif byte_str.startswith((codecs.BOM_UTF32_LE,
|
"language": "",
|
||||||
codecs.BOM_UTF32_BE)):
|
}
|
||||||
|
elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
|
||||||
# FF FE 00 00 UTF-32, little-endian BOM
|
# FF FE 00 00 UTF-32, little-endian BOM
|
||||||
# 00 00 FE FF UTF-32, big-endian BOM
|
# 00 00 FE FF UTF-32, big-endian BOM
|
||||||
self.result = {'encoding': "UTF-32",
|
self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
|
||||||
'confidence': 1.0,
|
elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
|
||||||
'language': ''}
|
|
||||||
elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
|
|
||||||
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||||
self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
|
self.result = {
|
||||||
'confidence': 1.0,
|
# TODO: This encoding is not supported by Python. Should remove?
|
||||||
'language': ''}
|
"encoding": "X-ISO-10646-UCS-4-3412",
|
||||||
elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
|
"confidence": 1.0,
|
||||||
|
"language": "",
|
||||||
|
}
|
||||||
|
elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
|
||||||
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||||
self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
|
self.result = {
|
||||||
'confidence': 1.0,
|
# TODO: This encoding is not supported by Python. Should remove?
|
||||||
'language': ''}
|
"encoding": "X-ISO-10646-UCS-4-2143",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"language": "",
|
||||||
|
}
|
||||||
elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
|
elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
|
||||||
# FF FE UTF-16, little endian BOM
|
# FF FE UTF-16, little endian BOM
|
||||||
# FE FF UTF-16, big endian BOM
|
# FE FF UTF-16, big endian BOM
|
||||||
self.result = {'encoding': "UTF-16",
|
self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""}
|
||||||
'confidence': 1.0,
|
|
||||||
'language': ''}
|
|
||||||
|
|
||||||
self._got_data = True
|
self._got_data = True
|
||||||
if self.result['encoding'] is not None:
|
if self.result["encoding"] is not None:
|
||||||
self.done = True
|
self.done = True
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -173,12 +219,29 @@ class UniversalDetector(object):
|
||||||
if self._input_state == InputState.PURE_ASCII:
|
if self._input_state == InputState.PURE_ASCII:
|
||||||
if self.HIGH_BYTE_DETECTOR.search(byte_str):
|
if self.HIGH_BYTE_DETECTOR.search(byte_str):
|
||||||
self._input_state = InputState.HIGH_BYTE
|
self._input_state = InputState.HIGH_BYTE
|
||||||
elif self._input_state == InputState.PURE_ASCII and \
|
elif (
|
||||||
self.ESC_DETECTOR.search(self._last_char + byte_str):
|
self._input_state == InputState.PURE_ASCII
|
||||||
|
and self.ESC_DETECTOR.search(self._last_char + byte_str)
|
||||||
|
):
|
||||||
self._input_state = InputState.ESC_ASCII
|
self._input_state = InputState.ESC_ASCII
|
||||||
|
|
||||||
self._last_char = byte_str[-1:]
|
self._last_char = byte_str[-1:]
|
||||||
|
|
||||||
|
# next we will look to see if it appears to be either a UTF-16 or
|
||||||
|
# UTF-32 encoding
|
||||||
|
if not self._utf1632_prober:
|
||||||
|
self._utf1632_prober = UTF1632Prober()
|
||||||
|
|
||||||
|
if self._utf1632_prober.state == ProbingState.DETECTING:
|
||||||
|
if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
|
||||||
|
self.result = {
|
||||||
|
"encoding": self._utf1632_prober.charset_name,
|
||||||
|
"confidence": self._utf1632_prober.get_confidence(),
|
||||||
|
"language": "",
|
||||||
|
}
|
||||||
|
self.done = True
|
||||||
|
return
|
||||||
|
|
||||||
# If we've seen escape sequences, use the EscCharSetProber, which
|
# If we've seen escape sequences, use the EscCharSetProber, which
|
||||||
# uses a simple state machine to check for known escape sequences in
|
# uses a simple state machine to check for known escape sequences in
|
||||||
# HZ and ISO-2022 encodings, since those are the only encodings that
|
# HZ and ISO-2022 encodings, since those are the only encodings that
|
||||||
|
@ -187,12 +250,11 @@ class UniversalDetector(object):
|
||||||
if not self._esc_charset_prober:
|
if not self._esc_charset_prober:
|
||||||
self._esc_charset_prober = EscCharSetProber(self.lang_filter)
|
self._esc_charset_prober = EscCharSetProber(self.lang_filter)
|
||||||
if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
|
if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
|
||||||
self.result = {'encoding':
|
self.result = {
|
||||||
self._esc_charset_prober.charset_name,
|
"encoding": self._esc_charset_prober.charset_name,
|
||||||
'confidence':
|
"confidence": self._esc_charset_prober.get_confidence(),
|
||||||
self._esc_charset_prober.get_confidence(),
|
"language": self._esc_charset_prober.language,
|
||||||
'language':
|
}
|
||||||
self._esc_charset_prober.language}
|
|
||||||
self.done = True
|
self.done = True
|
||||||
# If we've seen high bytes (i.e., those with values greater than 127),
|
# If we've seen high bytes (i.e., those with values greater than 127),
|
||||||
# we need to do more complicated checks using all our multi-byte and
|
# we need to do more complicated checks using all our multi-byte and
|
||||||
|
@ -207,17 +269,20 @@ class UniversalDetector(object):
|
||||||
if self.lang_filter & LanguageFilter.NON_CJK:
|
if self.lang_filter & LanguageFilter.NON_CJK:
|
||||||
self._charset_probers.append(SBCSGroupProber())
|
self._charset_probers.append(SBCSGroupProber())
|
||||||
self._charset_probers.append(Latin1Prober())
|
self._charset_probers.append(Latin1Prober())
|
||||||
|
self._charset_probers.append(MacRomanProber())
|
||||||
for prober in self._charset_probers:
|
for prober in self._charset_probers:
|
||||||
if prober.feed(byte_str) == ProbingState.FOUND_IT:
|
if prober.feed(byte_str) == ProbingState.FOUND_IT:
|
||||||
self.result = {'encoding': prober.charset_name,
|
self.result = {
|
||||||
'confidence': prober.get_confidence(),
|
"encoding": prober.charset_name,
|
||||||
'language': prober.language}
|
"confidence": prober.get_confidence(),
|
||||||
|
"language": prober.language,
|
||||||
|
}
|
||||||
self.done = True
|
self.done = True
|
||||||
break
|
break
|
||||||
if self.WIN_BYTE_DETECTOR.search(byte_str):
|
if self.WIN_BYTE_DETECTOR.search(byte_str):
|
||||||
self._has_win_bytes = True
|
self._has_win_bytes = True
|
||||||
|
|
||||||
def close(self):
|
def close(self) -> ResultDict:
|
||||||
"""
|
"""
|
||||||
Stop analyzing the current document and come up with a final
|
Stop analyzing the current document and come up with a final
|
||||||
prediction.
|
prediction.
|
||||||
|
@ -231,13 +296,11 @@ class UniversalDetector(object):
|
||||||
self.done = True
|
self.done = True
|
||||||
|
|
||||||
if not self._got_data:
|
if not self._got_data:
|
||||||
self.logger.debug('no data received!')
|
self.logger.debug("no data received!")
|
||||||
|
|
||||||
# Default to ASCII if it is all we've seen so far
|
# Default to ASCII if it is all we've seen so far
|
||||||
elif self._input_state == InputState.PURE_ASCII:
|
elif self._input_state == InputState.PURE_ASCII:
|
||||||
self.result = {'encoding': 'ascii',
|
self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}
|
||||||
'confidence': 1.0,
|
|
||||||
'language': ''}
|
|
||||||
|
|
||||||
# If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
|
# If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
|
||||||
elif self._input_state == InputState.HIGH_BYTE:
|
elif self._input_state == InputState.HIGH_BYTE:
|
||||||
|
@ -253,34 +316,47 @@ class UniversalDetector(object):
|
||||||
max_prober = prober
|
max_prober = prober
|
||||||
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
|
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
|
||||||
charset_name = max_prober.charset_name
|
charset_name = max_prober.charset_name
|
||||||
lower_charset_name = max_prober.charset_name.lower()
|
assert charset_name is not None
|
||||||
|
lower_charset_name = charset_name.lower()
|
||||||
confidence = max_prober.get_confidence()
|
confidence = max_prober.get_confidence()
|
||||||
# Use Windows encoding name instead of ISO-8859 if we saw any
|
# Use Windows encoding name instead of ISO-8859 if we saw any
|
||||||
# extra Windows-specific bytes
|
# extra Windows-specific bytes
|
||||||
if lower_charset_name.startswith('iso-8859'):
|
if lower_charset_name.startswith("iso-8859"):
|
||||||
if self._has_win_bytes:
|
if self._has_win_bytes:
|
||||||
charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
|
charset_name = self.ISO_WIN_MAP.get(
|
||||||
charset_name)
|
lower_charset_name, charset_name
|
||||||
self.result = {'encoding': charset_name,
|
)
|
||||||
'confidence': confidence,
|
# Rename legacy encodings with superset encodings if asked
|
||||||
'language': max_prober.language}
|
if self.should_rename_legacy:
|
||||||
|
charset_name = self.LEGACY_MAP.get(
|
||||||
|
(charset_name or "").lower(), charset_name
|
||||||
|
)
|
||||||
|
self.result = {
|
||||||
|
"encoding": charset_name,
|
||||||
|
"confidence": confidence,
|
||||||
|
"language": max_prober.language,
|
||||||
|
}
|
||||||
|
|
||||||
# Log all prober confidences if none met MINIMUM_THRESHOLD
|
# Log all prober confidences if none met MINIMUM_THRESHOLD
|
||||||
if self.logger.getEffectiveLevel() <= logging.DEBUG:
|
if self.logger.getEffectiveLevel() <= logging.DEBUG:
|
||||||
if self.result['encoding'] is None:
|
if self.result["encoding"] is None:
|
||||||
self.logger.debug('no probers hit minimum threshold')
|
self.logger.debug("no probers hit minimum threshold")
|
||||||
for group_prober in self._charset_probers:
|
for group_prober in self._charset_probers:
|
||||||
if not group_prober:
|
if not group_prober:
|
||||||
continue
|
continue
|
||||||
if isinstance(group_prober, CharSetGroupProber):
|
if isinstance(group_prober, CharSetGroupProber):
|
||||||
for prober in group_prober.probers:
|
for prober in group_prober.probers:
|
||||||
self.logger.debug('%s %s confidence = %s',
|
self.logger.debug(
|
||||||
|
"%s %s confidence = %s",
|
||||||
prober.charset_name,
|
prober.charset_name,
|
||||||
prober.language,
|
prober.language,
|
||||||
prober.get_confidence())
|
prober.get_confidence(),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
self.logger.debug('%s %s confidence = %s',
|
self.logger.debug(
|
||||||
prober.charset_name,
|
"%s %s confidence = %s",
|
||||||
prober.language,
|
group_prober.charset_name,
|
||||||
prober.get_confidence())
|
group_prober.language,
|
||||||
|
group_prober.get_confidence(),
|
||||||
|
)
|
||||||
return self.result
|
return self.result
|
||||||
|
|
225
lib/chardet/utf1632prober.py
Normal file
225
lib/chardet/utf1632prober.py
Normal file
|
@ -0,0 +1,225 @@
|
||||||
|
######################## BEGIN LICENSE BLOCK ########################
|
||||||
|
#
|
||||||
|
# Contributor(s):
|
||||||
|
# Jason Zavaglia
|
||||||
|
#
|
||||||
|
# This library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with this library; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||||
|
# 02110-1301 USA
|
||||||
|
######################### END LICENSE BLOCK #########################
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
|
from .charsetprober import CharSetProber
|
||||||
|
from .enums import ProbingState
|
||||||
|
|
||||||
|
|
||||||
|
class UTF1632Prober(CharSetProber):
|
||||||
|
"""
|
||||||
|
This class simply looks for occurrences of zero bytes, and infers
|
||||||
|
whether the file is UTF16 or UTF32 (little-endian or big-endian).
|
||||||
|
For instance, files looking like ( \0 \0 \0 [nonzero] )+
|
||||||
|
have a good probability to be UTF32BE. Files looking like ( \0 [nonzero] )+
|
||||||
|
may be guessed to be UTF16BE, and inversely for little-endian varieties.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# how many logical characters to scan before feeling confident of prediction
|
||||||
|
MIN_CHARS_FOR_DETECTION = 20
|
||||||
|
# a fixed constant ratio of expected zeros or non-zeros in modulo-position.
|
||||||
|
EXPECTED_RATIO = 0.94
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
self.position = 0
|
||||||
|
self.zeros_at_mod = [0] * 4
|
||||||
|
self.nonzeros_at_mod = [0] * 4
|
||||||
|
self._state = ProbingState.DETECTING
|
||||||
|
self.quad = [0, 0, 0, 0]
|
||||||
|
self.invalid_utf16be = False
|
||||||
|
self.invalid_utf16le = False
|
||||||
|
self.invalid_utf32be = False
|
||||||
|
self.invalid_utf32le = False
|
||||||
|
self.first_half_surrogate_pair_detected_16be = False
|
||||||
|
self.first_half_surrogate_pair_detected_16le = False
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
def reset(self) -> None:
|
||||||
|
super().reset()
|
||||||
|
self.position = 0
|
||||||
|
self.zeros_at_mod = [0] * 4
|
||||||
|
self.nonzeros_at_mod = [0] * 4
|
||||||
|
self._state = ProbingState.DETECTING
|
||||||
|
self.invalid_utf16be = False
|
||||||
|
self.invalid_utf16le = False
|
||||||
|
self.invalid_utf32be = False
|
||||||
|
self.invalid_utf32le = False
|
||||||
|
self.first_half_surrogate_pair_detected_16be = False
|
||||||
|
self.first_half_surrogate_pair_detected_16le = False
|
||||||
|
self.quad = [0, 0, 0, 0]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def charset_name(self) -> str:
|
||||||
|
if self.is_likely_utf32be():
|
||||||
|
return "utf-32be"
|
||||||
|
if self.is_likely_utf32le():
|
||||||
|
return "utf-32le"
|
||||||
|
if self.is_likely_utf16be():
|
||||||
|
return "utf-16be"
|
||||||
|
if self.is_likely_utf16le():
|
||||||
|
return "utf-16le"
|
||||||
|
# default to something valid
|
||||||
|
return "utf-16"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language(self) -> str:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def approx_32bit_chars(self) -> float:
|
||||||
|
return max(1.0, self.position / 4.0)
|
||||||
|
|
||||||
|
def approx_16bit_chars(self) -> float:
|
||||||
|
return max(1.0, self.position / 2.0)
|
||||||
|
|
||||||
|
def is_likely_utf32be(self) -> bool:
|
||||||
|
approx_chars = self.approx_32bit_chars()
|
||||||
|
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
||||||
|
self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
|
||||||
|
and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
|
||||||
|
and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
|
||||||
|
and self.nonzeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
|
||||||
|
and not self.invalid_utf32be
|
||||||
|
)
|
||||||
|
|
||||||
|
def is_likely_utf32le(self) -> bool:
|
||||||
|
approx_chars = self.approx_32bit_chars()
|
||||||
|
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
||||||
|
self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
|
||||||
|
and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
|
||||||
|
and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
|
||||||
|
and self.zeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
|
||||||
|
and not self.invalid_utf32le
|
||||||
|
)
|
||||||
|
|
||||||
|
def is_likely_utf16be(self) -> bool:
|
||||||
|
approx_chars = self.approx_16bit_chars()
|
||||||
|
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
||||||
|
(self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
|
||||||
|
> self.EXPECTED_RATIO
|
||||||
|
and (self.zeros_at_mod[0] + self.zeros_at_mod[2]) / approx_chars
|
||||||
|
> self.EXPECTED_RATIO
|
||||||
|
and not self.invalid_utf16be
|
||||||
|
)
|
||||||
|
|
||||||
|
def is_likely_utf16le(self) -> bool:
|
||||||
|
approx_chars = self.approx_16bit_chars()
|
||||||
|
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
||||||
|
(self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
|
||||||
|
> self.EXPECTED_RATIO
|
||||||
|
and (self.zeros_at_mod[1] + self.zeros_at_mod[3]) / approx_chars
|
||||||
|
> self.EXPECTED_RATIO
|
||||||
|
and not self.invalid_utf16le
|
||||||
|
)
|
||||||
|
|
||||||
|
def validate_utf32_characters(self, quad: List[int]) -> None:
|
||||||
|
"""
|
||||||
|
Validate if the quad of bytes is valid UTF-32.
|
||||||
|
|
||||||
|
UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
|
||||||
|
excluding 0x0000D800 - 0x0000DFFF
|
||||||
|
|
||||||
|
https://en.wikipedia.org/wiki/UTF-32
|
||||||
|
"""
|
||||||
|
if (
|
||||||
|
quad[0] != 0
|
||||||
|
or quad[1] > 0x10
|
||||||
|
or (quad[0] == 0 and quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF)
|
||||||
|
):
|
||||||
|
self.invalid_utf32be = True
|
||||||
|
if (
|
||||||
|
quad[3] != 0
|
||||||
|
or quad[2] > 0x10
|
||||||
|
or (quad[3] == 0 and quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF)
|
||||||
|
):
|
||||||
|
self.invalid_utf32le = True
|
||||||
|
|
||||||
|
def validate_utf16_characters(self, pair: List[int]) -> None:
|
||||||
|
"""
|
||||||
|
Validate if the pair of bytes is valid UTF-16.
|
||||||
|
|
||||||
|
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xDFFF
|
||||||
|
with an exception for surrogate pairs, which must be in the range
|
||||||
|
0xD800-0xDBFF followed by 0xDC00-0xDFFF
|
||||||
|
|
||||||
|
https://en.wikipedia.org/wiki/UTF-16
|
||||||
|
"""
|
||||||
|
if not self.first_half_surrogate_pair_detected_16be:
|
||||||
|
if 0xD8 <= pair[0] <= 0xDB:
|
||||||
|
self.first_half_surrogate_pair_detected_16be = True
|
||||||
|
elif 0xDC <= pair[0] <= 0xDF:
|
||||||
|
self.invalid_utf16be = True
|
||||||
|
else:
|
||||||
|
if 0xDC <= pair[0] <= 0xDF:
|
||||||
|
self.first_half_surrogate_pair_detected_16be = False
|
||||||
|
else:
|
||||||
|
self.invalid_utf16be = True
|
||||||
|
|
||||||
|
if not self.first_half_surrogate_pair_detected_16le:
|
||||||
|
if 0xD8 <= pair[1] <= 0xDB:
|
||||||
|
self.first_half_surrogate_pair_detected_16le = True
|
||||||
|
elif 0xDC <= pair[1] <= 0xDF:
|
||||||
|
self.invalid_utf16le = True
|
||||||
|
else:
|
||||||
|
if 0xDC <= pair[1] <= 0xDF:
|
||||||
|
self.first_half_surrogate_pair_detected_16le = False
|
||||||
|
else:
|
||||||
|
self.invalid_utf16le = True
|
||||||
|
|
||||||
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
|
for c in byte_str:
|
||||||
|
mod4 = self.position % 4
|
||||||
|
self.quad[mod4] = c
|
||||||
|
if mod4 == 3:
|
||||||
|
self.validate_utf32_characters(self.quad)
|
||||||
|
self.validate_utf16_characters(self.quad[0:2])
|
||||||
|
self.validate_utf16_characters(self.quad[2:4])
|
||||||
|
if c == 0:
|
||||||
|
self.zeros_at_mod[mod4] += 1
|
||||||
|
else:
|
||||||
|
self.nonzeros_at_mod[mod4] += 1
|
||||||
|
self.position += 1
|
||||||
|
return self.state
|
||||||
|
|
||||||
|
@property
|
||||||
|
def state(self) -> ProbingState:
|
||||||
|
if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
|
||||||
|
# terminal, decided states
|
||||||
|
return self._state
|
||||||
|
if self.get_confidence() > 0.80:
|
||||||
|
self._state = ProbingState.FOUND_IT
|
||||||
|
elif self.position > 4 * 1024:
|
||||||
|
# if we get to 4kb into the file, and we can't conclude it's UTF,
|
||||||
|
# let's give up
|
||||||
|
self._state = ProbingState.NOT_ME
|
||||||
|
return self._state
|
||||||
|
|
||||||
|
def get_confidence(self) -> float:
|
||||||
|
return (
|
||||||
|
0.85
|
||||||
|
if (
|
||||||
|
self.is_likely_utf16le()
|
||||||
|
or self.is_likely_utf16be()
|
||||||
|
or self.is_likely_utf32le()
|
||||||
|
or self.is_likely_utf32be()
|
||||||
|
)
|
||||||
|
else 0.00
|
||||||
|
)
|
|
@ -25,45 +25,46 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from typing import Union
|
||||||
from .enums import ProbingState, MachineState
|
|
||||||
from .codingstatemachine import CodingStateMachine
|
|
||||||
from .mbcssm import UTF8_SM_MODEL
|
|
||||||
|
|
||||||
|
from .charsetprober import CharSetProber
|
||||||
|
from .codingstatemachine import CodingStateMachine
|
||||||
|
from .enums import MachineState, ProbingState
|
||||||
|
from .mbcssm import UTF8_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class UTF8Prober(CharSetProber):
|
class UTF8Prober(CharSetProber):
|
||||||
ONE_CHAR_PROB = 0.5
|
ONE_CHAR_PROB = 0.5
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super(UTF8Prober, self).__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
|
self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
|
||||||
self._num_mb_chars = None
|
self._num_mb_chars = 0
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super(UTF8Prober, self).reset()
|
super().reset()
|
||||||
self.coding_sm.reset()
|
self.coding_sm.reset()
|
||||||
self._num_mb_chars = 0
|
self._num_mb_chars = 0
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "utf-8"
|
return "utf-8"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
for c in byte_str:
|
for c in byte_str:
|
||||||
coding_state = self.coding_sm.next_state(c)
|
coding_state = self.coding_sm.next_state(c)
|
||||||
if coding_state == MachineState.ERROR:
|
if coding_state == MachineState.ERROR:
|
||||||
self._state = ProbingState.NOT_ME
|
self._state = ProbingState.NOT_ME
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.ITS_ME:
|
if coding_state == MachineState.ITS_ME:
|
||||||
self._state = ProbingState.FOUND_IT
|
self._state = ProbingState.FOUND_IT
|
||||||
break
|
break
|
||||||
elif coding_state == MachineState.START:
|
if coding_state == MachineState.START:
|
||||||
if self.coding_sm.get_current_charlen() >= 2:
|
if self.coding_sm.get_current_charlen() >= 2:
|
||||||
self._num_mb_chars += 1
|
self._num_mb_chars += 1
|
||||||
|
|
||||||
|
@ -73,10 +74,9 @@ class UTF8Prober(CharSetProber):
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
unlike = 0.99
|
unlike = 0.99
|
||||||
if self._num_mb_chars < 6:
|
if self._num_mb_chars < 6:
|
||||||
unlike *= self.ONE_CHAR_PROB ** self._num_mb_chars
|
unlike *= self.ONE_CHAR_PROB**self._num_mb_chars
|
||||||
return 1.0 - unlike
|
return 1.0 - unlike
|
||||||
else:
|
|
||||||
return unlike
|
return unlike
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
"""
|
"""
|
||||||
This module exists only to simplify retrieving the version number of chardet
|
This module exists only to simplify retrieving the version number of chardet
|
||||||
from within setup.py and from chardet subpackages.
|
from within setuptools and from chardet subpackages.
|
||||||
|
|
||||||
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__version__ = "4.0.0"
|
__version__ = "5.1.0"
|
||||||
VERSION = __version__.split('.')
|
VERSION = __version__.split(".")
|
||||||
|
|
Loading…
Reference in a new issue