mirror of
https://github.com/SickGear/SickGear.git
synced 2024-12-01 00:43:37 +00:00
243 lines
9.9 KiB
Python
243 lines
9.9 KiB
Python
|
import json
|
||
|
import xml.etree.ElementTree as ET
|
||
|
from langcodes.util import data_filename
|
||
|
from langcodes.registry_parser import parse_registry
|
||
|
|
||
|
|
||
|
def read_cldr_supplemental(dataname):
    """
    Load one table from the CLDR 'supplemental' JSON data.

    `dataname` is the base name (without '.json') of a file under the
    'cldr-json/cldr-json/cldr-core/supplemental' data path, as resolved by
    `data_filename`.

    Returns the part of the parsed JSON that holds the requested table:
    `['supplemental']['metadata']['alias']` when `dataname` is 'aliases',
    and `['supplemental'][dataname]` for any other name.
    """
    cldr_supp_path = data_filename('cldr-json/cldr-json/cldr-core/supplemental')
    filename = data_filename(f'{cldr_supp_path}/{dataname}.json')
    # Use a context manager so the file handle is closed as soon as the JSON
    # has been parsed, instead of being left open until garbage collection.
    with open(filename, encoding='utf-8') as infile:
        fulldata = json.load(infile)
    if dataname == 'aliases':
        # The alias table is nested one level deeper than the other tables.
        data = fulldata['supplemental']['metadata']['alias']
    else:
        data = fulldata['supplemental'][dataname]
    return data
|
||
|
|
||
|
|
||
|
def read_iana_registry_suppress_scripts():
    """
    Map each language subtag in the registry to its 'Suppress-Script' value.

    Only entries whose 'Type' is 'language' and that carry a
    'Suppress-Script' field are included.
    """
    return {
        record['Subtag']: record['Suppress-Script']
        for record in parse_registry()
        if record['Type'] == 'language' and 'Suppress-Script' in record
    }
|
||
|
|
||
|
|
||
|
def read_iana_registry_scripts():
    """
    Return the set of subtags of all registry entries whose 'Type' is 'script'.
    """
    return {
        record['Subtag']
        for record in parse_registry()
        if record['Type'] == 'script'
    }
|
||
|
|
||
|
|
||
|
def read_iana_registry_macrolanguages():
    """
    Map each language subtag in the registry to its 'Macrolanguage' value.

    Only entries whose 'Type' is 'language' and that carry a 'Macrolanguage'
    field are included.
    """
    return {
        record['Subtag']: record['Macrolanguage']
        for record in parse_registry()
        if record['Type'] == 'language' and 'Macrolanguage' in record
    }
|
||
|
|
||
|
|
||
|
def read_iana_registry_replacements():
    """
    Collect the 'Preferred-Value' replacements listed in the registry.

    Language entries are keyed by their 'Subtag'; any other entry that has a
    'Tag' field is keyed by that whole tag, lowercased. Entries without a
    'Preferred-Value' contribute nothing.
    """
    preferred_values = {}
    for record in parse_registry():
        is_language = record['Type'] == 'language'
        if 'Preferred-Value' not in record:
            continue
        preferred = record['Preferred-Value']
        if is_language:
            # Replacements for language codes
            preferred_values[record['Subtag']] = preferred
        elif 'Tag' in record:
            # Replacements for entire tags
            preferred_values[record['Tag'].lower()] = preferred
    return preferred_values
|
||
|
|
||
|
|
||
|
def write_python_dict(outfile, name, d):
    """
    Write `d` to `outfile` as Python source of the form `name = {...}`.

    Entries are emitted one per line, ordered by sorted key, with the key
    and the value both rendered using `repr`.
    """
    entry_lines = [f" {key!r}: {d[key]!r}," for key in sorted(d)]
    source_lines = [f"{name} = {{", *entry_lines, "}"]
    print("\n".join(source_lines), file=outfile)
|
||
|
|
||
|
|
||
|
def write_python_set(outfile, name, s):
    """
    Write `s` to `outfile` as Python source of the form `name = {...}`.

    Duplicate members are collapsed, and the remaining members are emitted
    one per line in sorted order, each rendered using `repr`.
    """
    member_lines = [f" {member!r}," for member in sorted(set(s))]
    source_lines = [f"{name} = {{", *member_lines, "}"]
    print("\n".join(source_lines), file=outfile)
|
||
|
|
||
|
|
||
|
# Comment line that build_data() prints at the very top of the generated
# data_dicts.py, marking that file as machine-generated.
GENERATED_HEADER = "# This file is generated by build_data.py."
|
||
|
|
||
|
|
||
|
def read_validity_regex():
    """
    Build the source of a regular expression from CLDR's validity files.

    Reads the 'language', 'region', 'script' and 'variant' XML files under
    'cldr/common/validity' and collects every whitespace-separated item found
    in their `idValidity/id` elements. An item of the form `<prefix>X~Y`
    (a range over the final character) becomes the character class
    `<prefix>[X-Y]`; every other item is used as-is.

    Returns a string of the form `^(option1|option2|...)$`.

    Raises ValueError if an item contains '~' anywhere other than as its
    second-to-last character.
    """
    validity_options = []
    for codetype in ('language', 'region', 'script', 'variant'):
        validity_path = data_filename(f'cldr/common/validity/{codetype}.xml')
        # ET.parse opens and closes the file itself and decodes it from
        # bytes, so no file handle is leaked and the result does not depend
        # on the locale's default text encoding.
        root = ET.parse(validity_path).getroot()
        matches = root.findall('./idValidity/id')
        for match in matches:
            # An empty <id/> element has text None; treat it as no items.
            for item in (match.text or '').strip().split():
                if '~' in item:
                    # A range needs at least 'X~Y'; validate explicitly
                    # rather than with `assert`, which is stripped under -O.
                    if len(item) < 3 or item[-2] != '~':
                        raise ValueError(
                            f"Unexpected range syntax in {validity_path}: {item!r}"
                        )
                    prefix = item[:-3]
                    range_start = item[-3]
                    range_end = item[-1]
                    option = f"{prefix}[{range_start}-{range_end}]"
                    validity_options.append(option)
                else:
                    validity_options.append(item)
    options = '|'.join(validity_options)
    return f'^({options})$'
|
||
|
|
||
|
|
||
|
def read_language_distances():
    """
    Read the 'written_new' language-matching distances from CLDR.

    Parses 'cldr/common/supplemental/languageInfo.xml' and returns a nested
    dict, `{desired: {supported: distance}}`, with integer distances. Only
    `languageMatch` elements whose 'desired' attribute has fewer than three
    '_'-separated parts are used. Unless a match is marked `oneway="true"`,
    the distance is recorded in both directions.
    """
    language_info_path = data_filename('cldr/common/supplemental/languageInfo.xml')
    # ET.parse opens and closes the file itself and decodes it from bytes,
    # so no file handle is leaked and the result does not depend on the
    # locale's default text encoding.
    root = ET.parse(language_info_path).getroot()
    matches = root.findall(
        './languageMatching/languageMatches[@type="written_new"]/languageMatch'
    )
    tag_distances = {}
    for match in matches:
        attribs = match.attrib
        n_parts = attribs['desired'].count('_') + 1
        if n_parts < 3:
            if attribs.get('oneway') == 'true':
                pairs = [(attribs['desired'], attribs['supported'])]
            else:
                pairs = [
                    (attribs['desired'], attribs['supported']),
                    (attribs['supported'], attribs['desired']),
                ]
            for (desired, supported) in pairs:
                desired_distance = tag_distances.setdefault(desired, {})
                desired_distance[supported] = int(attribs['distance'])

                # The 'languageInfo' data file contains distances for the unnormalized
                # tag 'sh', but we work mostly with normalized tags, and they don't
                # describe at all how to cope with this.
                #
                # 'sh' normalizes to 'sr-Latn', and when we're matching languages we
                # aren't matching scripts yet, so when 'sh' appears we'll add a
                # corresponding match for 'sr'.
                #
                # Then because we're kind of making this plan up, add 1 to the distance
                # so it's a worse match than ones that are actually clearly defined
                # in languageInfo.
                if desired == 'sh' or supported == 'sh':
                    if desired == 'sh':
                        desired = 'sr'
                    if supported == 'sh':
                        supported = 'sr'
                    if desired != supported:
                        # don't try to define a non-zero distance for sr <=> sr
                        desired_distance = tag_distances.setdefault(desired, {})
                        desired_distance[supported] = int(attribs['distance']) + 1

    return tag_distances
|
||
|
|
||
|
|
||
|
def build_data():
    """
    Regenerate 'data_dicts.py' from the IANA registry and CLDR data.

    Gathers the tables produced by the reader functions in this module,
    merges CLDR's language/script/territory aliases into replacement tables,
    derives alpha3 and macrolanguage mappings from the language aliases, and
    writes everything as Python source to 'data_dicts.py' (a relative path,
    so the file lands in the current working directory).

    Raises ValueError if an 'overlong' language alias would assign an alpha3
    code to a language that already has one in `alpha3_mapping`.
    """
    lang_scripts = read_iana_registry_suppress_scripts()
    all_scripts = read_iana_registry_scripts()
    macrolanguages = read_iana_registry_macrolanguages()
    iana_replacements = read_iana_registry_replacements()
    language_distances = read_language_distances()

    alias_data = read_cldr_supplemental('aliases')
    likely_subtags = read_cldr_supplemental('likelySubtags')
    # Maps each alias type ('languageAlias', 'scriptAlias', 'territoryAlias')
    # to its {lowercased code: replacement} table.
    replacements = {}

    # Aliased codes can still have alpha3 codes, and there's no unified source
    # about what they are. It depends on whether the alias predates or postdates
    # ISO 639-2, which nobody should have to care about. So let's set all the
    # alpha3 codes for aliased alpha2 codes here.
    alpha3_mapping = {
        'tl': 'tgl',  # even though it normalizes to 'fil'
        'in': 'ind',
        'iw': 'heb',
        'ji': 'yid',
        'jw': 'jav',
        'sh': 'hbs',
    }
    alpha3_biblio = {}
    norm_macrolanguages = {}
    for alias_type in ['languageAlias', 'scriptAlias', 'territoryAlias']:
        aliases = alias_data[alias_type]
        # Initially populate 'languageAlias' with the aliases from the IANA file
        if alias_type == 'languageAlias':
            # This stores the same dict object as `iana_replacements` (no
            # copy), so the CLDR aliases below are added to, and may
            # overwrite, the IANA entries.
            replacements[alias_type] = iana_replacements
            replacements[alias_type]['root'] = 'und'
        else:
            replacements[alias_type] = {}
        for code, value in aliases.items():
            # Make all keys lowercase so they can be looked up
            # case-insensitively
            code = code.lower()

            # If there are multiple replacements, take the first one. For example,
            # we just replace the Soviet Union (SU) with Russia (RU), instead of
            # trying to do something context-sensitive and poorly standardized
            # that selects one of the successor countries to the Soviet Union.
            replacement = value['_replacement'].split()[0]
            if value['_reason'] == 'macrolanguage':
                # Macrolanguage aliases go into their own table and are not
                # added to the replacement table for this alias type.
                norm_macrolanguages[code] = replacement
            else:
                # CLDR tries to oversimplify some codes as it assigns aliases.
                # For example, 'nor' is the ISO alpha3 code for 'no', but CLDR
                # would prefer you use 'nb' over 'no', so it makes 'nor' an
                # alias of 'nb'. But 'nb' already has an alpha3 code, 'nob'.
                #
                # We undo this oversimplification so that we can get a
                # canonical mapping between alpha2 and alpha3 codes.
                if code == 'nor':
                    replacement = 'no'
                elif code == 'mol':
                    replacement = 'mo'
                elif code == 'twi':
                    replacement = 'tw'
                elif code == 'bih':
                    replacement = 'bh'

                replacements[alias_type][code] = replacement
                if alias_type == 'languageAlias':
                    if value['_reason'] == 'overlong':
                        # An 'overlong' alias is recorded as the alpha3 code
                        # of its replacement; refuse to overwrite one that is
                        # already known.
                        if replacement in alpha3_mapping:
                            raise ValueError(
                                "{code!r} is an alpha3 for {replacement!r}, which"
                                " already has an alpha3: {orig!r}".format(
                                    code=code,
                                    replacement=replacement,
                                    orig=alpha3_mapping[replacement],
                                )
                            )
                        alpha3_mapping[replacement] = code
                    elif value['_reason'] == 'bibliographic':
                        alpha3_biblio[replacement] = code

    validity_regex = read_validity_regex()

    # Write the contents of data_dicts.py.
    with open('data_dicts.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        # The generated module needs `re` for the VALIDITY pattern written
        # on its last line.
        print("import re\n", file=outfile)
        write_python_dict(outfile, 'DEFAULT_SCRIPTS', lang_scripts)
        write_python_dict(
            outfile, 'LANGUAGE_REPLACEMENTS', replacements['languageAlias']
        )
        write_python_dict(outfile, 'LANGUAGE_ALPHA3', alpha3_mapping)
        write_python_dict(outfile, 'LANGUAGE_ALPHA3_BIBLIOGRAPHIC', alpha3_biblio)
        write_python_dict(outfile, 'SCRIPT_REPLACEMENTS', replacements['scriptAlias'])
        write_python_set(outfile, 'ALL_SCRIPTS', all_scripts)
        write_python_dict(
            outfile, 'TERRITORY_REPLACEMENTS', replacements['territoryAlias']
        )
        write_python_dict(outfile, 'MACROLANGUAGES', macrolanguages)
        write_python_dict(outfile, 'NORMALIZED_MACROLANGUAGES', norm_macrolanguages)
        write_python_dict(outfile, 'LIKELY_SUBTAGS', likely_subtags)
        write_python_dict(outfile, 'LANGUAGE_DISTANCES', language_distances)
        print(f"VALIDITY = re.compile({validity_regex!r})", file=outfile)
|
||
|
|
||
|
|
||
|
# Regenerate data_dicts.py only when this file is run as a script, not when
# it is imported.
if __name__ == '__main__':
    build_data()
|