import json
import xml.etree.ElementTree as ET

from langcodes.util import data_filename
from langcodes.registry_parser import parse_registry


def read_cldr_supplemental(dataname):
    cldr_supp_path = data_filename('cldr-json/cldr-json/cldr-core/supplemental')
    filename = data_filename(f'{cldr_supp_path}/{dataname}.json')
    with open(filename, encoding='utf-8') as infile:
        fulldata = json.load(infile)
    if dataname == 'aliases':
        data = fulldata['supplemental']['metadata']['alias']
    else:
        data = fulldata['supplemental'][dataname]
    return data


def read_iana_registry_suppress_scripts():
    scripts = {}
    for entry in parse_registry():
        if entry['Type'] == 'language' and 'Suppress-Script' in entry:
            scripts[entry['Subtag']] = entry['Suppress-Script']
    return scripts


def read_iana_registry_scripts():
    scripts = set()
    for entry in parse_registry():
        if entry['Type'] == 'script':
            scripts.add(entry['Subtag'])
    return scripts


def read_iana_registry_macrolanguages():
    macros = {}
    for entry in parse_registry():
        if entry['Type'] == 'language' and 'Macrolanguage' in entry:
            macros[entry['Subtag']] = entry['Macrolanguage']
    return macros


def read_iana_registry_replacements():
    replacements = {}
    for entry in parse_registry():
        if entry['Type'] == 'language' and 'Preferred-Value' in entry:
            # Replacements for language codes
            replacements[entry['Subtag']] = entry['Preferred-Value']
        elif 'Tag' in entry and 'Preferred-Value' in entry:
            # Replacements for entire tags
            replacements[entry['Tag'].lower()] = entry['Preferred-Value']
    return replacements


def write_python_dict(outfile, name, d):
    print(f"{name} = {{", file=outfile)
    for key in sorted(d):
        value = d[key]
        print(f"    {key!r}: {value!r},", file=outfile)
    print("}", file=outfile)


def write_python_set(outfile, name, s):
    print(f"{name} = {{", file=outfile)
    for key in sorted(set(s)):
        print(f"    {key!r},", file=outfile)
    print("}", file=outfile)


GENERATED_HEADER = "# This file is generated by build_data.py."


def read_validity_regex():
    validity_options = []
    for codetype in ('language', 'region', 'script', 'variant'):
        validity_path = data_filename(f'cldr/common/validity/{codetype}.xml')
        with open(validity_path, encoding='utf-8') as infile:
            root = ET.fromstring(infile.read())
        matches = root.findall('./idValidity/id')
        for match in matches:
            for item in match.text.strip().split():
                if '~' in item:
                    # CLDR abbreviates a run of codes with '~': an entry
                    # like 'qaa~z' stands for the codes qaa through qaz.
                    # Expand it into a regex character class.
                    assert item[-2] == '~'
                    prefix = item[:-3]
                    range_start = item[-3]
                    range_end = item[-1]
                    option = f"{prefix}[{range_start}-{range_end}]"
                    validity_options.append(option)
                else:
                    validity_options.append(item)
    options = '|'.join(validity_options)
    return f'^({options})$'
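# A sketch of what the pattern from read_validity_regex() looks like, assuming
# the CLDR validity files resemble current releases (the exact alternatives
# vary by release). A '~' entry such as 'qaa~z' becomes the character class
# 'qa[a-z]', and everything is joined into one anchored alternation:
#
#     >>> import re
#     >>> pattern = re.compile('^(aa|ab|qa[a-z])$')  # toy subset of the output
#     >>> bool(pattern.match('qab'))
#     True
#     >>> bool(pattern.match('xyzzy'))
#     False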
def read_language_distances():
    language_info_path = data_filename('cldr/common/supplemental/languageInfo.xml')
    with open(language_info_path, encoding='utf-8') as infile:
        root = ET.fromstring(infile.read())
    matches = root.findall(
        './languageMatching/languageMatches[@type="written_new"]/languageMatch'
    )
    tag_distances = {}
    for match in matches:
        attribs = match.attrib
        n_parts = attribs['desired'].count('_') + 1
        if n_parts < 3:
            if attribs.get('oneway') == 'true':
                pairs = [(attribs['desired'], attribs['supported'])]
            else:
                pairs = [
                    (attribs['desired'], attribs['supported']),
                    (attribs['supported'], attribs['desired']),
                ]
            for (desired, supported) in pairs:
                desired_distance = tag_distances.setdefault(desired, {})
                desired_distance[supported] = int(attribs['distance'])

                # The 'languageInfo' data file contains distances for the
                # unnormalized tag 'sh', but we work mostly with normalized
                # tags, and the data file doesn't describe how to cope with
                # that mismatch.
                #
                # 'sh' normalizes to 'sr-Latn', and when we're matching
                # languages we aren't matching scripts yet, so when 'sh'
                # appears we add a corresponding match for 'sr'.
                #
                # Because we're making this plan up ourselves, we add 1 to the
                # distance so it's a worse match than the ones that are
                # actually defined in languageInfo.
                if desired == 'sh' or supported == 'sh':
                    if desired == 'sh':
                        desired = 'sr'
                    if supported == 'sh':
                        supported = 'sr'
                    if desired != supported:
                        # Don't define a non-zero distance for sr <=> sr.
                        desired_distance = tag_distances.setdefault(desired, {})
                        desired_distance[supported] = int(attribs['distance']) + 1

    return tag_distances
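# An illustrative sketch (hypothetical subtags and values, not the real data)
# of the nested dict that read_language_distances() returns: each desired
# language subtag maps to the supported subtags it can match, with the cost of
# each match. Smaller distances mean closer matches; the actual entries come
# from CLDR's languageInfo data and vary by release:
#
#     {
#         'nn': {'nb': 1, ...},   # hypothetical values
#         'sr': {'hr': 5, ...},   # 'sh' matches get re-filed under 'sr' above
#     }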
def build_data():
    lang_scripts = read_iana_registry_suppress_scripts()
    all_scripts = read_iana_registry_scripts()
    macrolanguages = read_iana_registry_macrolanguages()
    iana_replacements = read_iana_registry_replacements()
    language_distances = read_language_distances()

    alias_data = read_cldr_supplemental('aliases')
    likely_subtags = read_cldr_supplemental('likelySubtags')
    replacements = {}

    # Aliased codes can still have alpha3 codes, and there's no unified source
    # about what they are. It depends on whether the alias predates or
    # postdates ISO 639-2, which nobody should have to care about. So let's
    # set all the alpha3 codes for aliased alpha2 codes here.
    alpha3_mapping = {
        'tl': 'tgl',  # even though it normalizes to 'fil'
        'in': 'ind',
        'iw': 'heb',
        'ji': 'yid',
        'jw': 'jav',
        'sh': 'hbs',
    }
    alpha3_biblio = {}
    norm_macrolanguages = {}
    for alias_type in ['languageAlias', 'scriptAlias', 'territoryAlias']:
        aliases = alias_data[alias_type]
        # Initially populate 'languageAlias' with the aliases from the IANA
        # file.
        if alias_type == 'languageAlias':
            replacements[alias_type] = iana_replacements
            replacements[alias_type]['root'] = 'und'
        else:
            replacements[alias_type] = {}
        for code, value in aliases.items():
            # Make all keys lowercase so they can be looked up
            # case-insensitively.
            code = code.lower()

            # If there are multiple replacements, take the first one. For
            # example, we just replace the Soviet Union (SU) with Russia (RU),
            # instead of trying to do something context-sensitive and poorly
            # standardized that selects one of the successor countries to the
            # Soviet Union.
            replacement = value['_replacement'].split()[0]
            if value['_reason'] == 'macrolanguage':
                norm_macrolanguages[code] = replacement
            else:
                # CLDR tries to oversimplify some codes as it assigns aliases.
                # For example, 'nor' is the ISO alpha3 code for 'no', but CLDR
                # would prefer you use 'nb' over 'no', so it makes 'nor' an
                # alias of 'nb'. But 'nb' already has an alpha3 code, 'nob'.
                #
                # We undo this oversimplification so that we can get a
                # canonical mapping between alpha2 and alpha3 codes.
                if code == 'nor':
                    replacement = 'no'
                elif code == 'mol':
                    replacement = 'mo'
                elif code == 'twi':
                    replacement = 'tw'
                elif code == 'bih':
                    replacement = 'bh'

                replacements[alias_type][code] = replacement

                if alias_type == 'languageAlias':
                    if value['_reason'] == 'overlong':
                        if replacement in alpha3_mapping:
                            raise ValueError(
                                "{code!r} is an alpha3 for {replacement!r}, which"
                                " already has an alpha3: {orig!r}".format(
                                    code=code,
                                    replacement=replacement,
                                    orig=alpha3_mapping[replacement],
                                )
                            )
                        alpha3_mapping[replacement] = code
                    elif value['_reason'] == 'bibliographic':
                        alpha3_biblio[replacement] = code

    validity_regex = read_validity_regex()

    # Write the contents of data_dicts.py.
    with open('data_dicts.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        print("import re\n", file=outfile)
        write_python_dict(outfile, 'DEFAULT_SCRIPTS', lang_scripts)
        write_python_dict(
            outfile, 'LANGUAGE_REPLACEMENTS', replacements['languageAlias']
        )
        write_python_dict(outfile, 'LANGUAGE_ALPHA3', alpha3_mapping)
        write_python_dict(outfile, 'LANGUAGE_ALPHA3_BIBLIOGRAPHIC', alpha3_biblio)
        write_python_dict(outfile, 'SCRIPT_REPLACEMENTS', replacements['scriptAlias'])
        write_python_set(outfile, 'ALL_SCRIPTS', all_scripts)
        write_python_dict(
            outfile, 'TERRITORY_REPLACEMENTS', replacements['territoryAlias']
        )
        write_python_dict(outfile, 'MACROLANGUAGES', macrolanguages)
        write_python_dict(outfile, 'NORMALIZED_MACROLANGUAGES', norm_macrolanguages)
        write_python_dict(outfile, 'LIKELY_SUBTAGS', likely_subtags)
        write_python_dict(outfile, 'LANGUAGE_DISTANCES', language_distances)
        print(f"VALIDITY = re.compile({validity_regex!r})", file=outfile)


if __name__ == '__main__':
    build_data()
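# Running `python build_data.py` writes data_dicts.py to the current working
# directory. As a rough sketch, the generated module has this shape (the
# entries shown are illustrative; the real ones come from the IANA registry
# and CLDR data):
#
#     # This file is generated by build_data.py.
#     import re
#
#     DEFAULT_SCRIPTS = {
#         'ab': 'Cyrl',
#         ...
#     }
#     ...
#     VALIDITY = re.compile('^(aa|ab|...)$')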