Update language-data 1.1 → 1.3 (vendored from `pip install langcodes[data]`).

Change: remove the marisa-trie requirement (ref: HACKS.txt).
This commit is contained in:
JackDandy 2025-06-03 14:03:44 +01:00
parent 382bfcdcb2
commit be3ae7529b
7 changed files with 2722 additions and 1701 deletions

View file

@ -9,6 +9,8 @@
* Update feedparser 6.0.11 (efcb89b) to 6.0.11 (ff59549)
* Update hachoir 3.2.0 (38d759f) to 3.3.0 (48b478f)
* Update idna library 3.7 (1d365e1) to 3.10 (729225d)
* Update langcodes 3.3.0 to 3.5.0
* Update language-data 1.1 to 1.3
* Update Msgpack 1.0.6 (e1d3d5d) to 1.1.0 (0eeabfb)
* Update deprecated pkg_resources 24.0 to Packaging 25.0 (f585376)
* Update profilehooks module 1.13.0.dev0 (99f8a31) to 1.13.1.dev0 (824fcd4)

View file

@ -0,0 +1,498 @@
import marisa_trie
import json
import xml.etree.ElementTree as ET
import os
from pathlib import Path
from collections import defaultdict, Counter
from language_data.names import normalize_name
from language_data.util import data_filename
from language_data.registry_parser import parse_registry
# Naming things is hard, especially languages
# ===========================================
#
# CLDR is supposed to avoid ambiguous language names, particularly among its
# core languages. But it seems that languages are incompletely disambiguated.
#
# It's convenient to be able to get a language by its name, without having to
# also refer to the language that the name is in. In most cases, we can do this
# unambiguously. With the disambiguations and overrides here, this will work
# in a lot of cases. However, some names such as 'Dongo', 'Fala', 'Malayo', and
# 'Tonga' are ambiguous in ways that can only be disambiguated by specifying
# the language the name is in.
#
# Ambiguous names can arise from:
#
# - Ambiguities in the scope of a name. These tend to span languages, and the
# data files mask the fact that these names are generally ambiguous *within*
# a language. This is why we have codes.
#
# - Names that just happen to be ambiguous between different things with
# different etymologies.
#
# Most doubly-claimed language names have standard ways to disambiguate
# them in CLDR, but names such as 'Tonga' and 'Fala' have complex
# inter-language ambiguities.
#
# Our approach is:
#
# - Fix conflicts that seem to arise simply from errors in the data, by
# overriding the data.
#
# - Fix ambiguities in scope by preferring one scope over another. For example,
# "North America" could refer to a territory that includes Central America or
# a territory that doesn't. In any such conflict, we choose to include Central
# America.
#
# - Avoid ambiguities between different sources of data, by using an order
# of precedence. CLDR data takes priority over IANA data, which takes priority
# over Wiktionary data.
#
# - When ambiguity remains, that name is not resolvable to a language code.
# Resolving the name might require a more specific name, or specifying the
# language that the name is in.
# Preference table consulted by resolve_name() when one normalized name is
# claimed by several codes at the same priority.  A key (the preferred code)
# wins the name only if every competing code appears in its value set.
AMBIGUOUS_PREFERENCES = {
    # Prefer 'Micronesia' to refer to the Federated States of Micronesia -
    # this seems to be poorly disambiguated in many languages, but we can't
    # do much with a code for the general region of Micronesia
    'FM': {'057'},
    # Prefer the country of South Africa over the general region of southern
    # Africa, in languages that don't distinguish them
    'ZA': {'018'},
    # Prefer territory 003 for 'North America', which includes Central America
    # and the Caribbean, over territory 021, which excludes them
    '003': {'021'},
    # Prefer territory 005 for 'Lulli-Amerihkká' (South America), over territory
    # 419, which includes Central America
    '005': {'419'},
    # If a name like "Amerika" is ambiguous between the Americas and the United
    # States of America, choose the Americas
    '019': {'US'},
    # Prefer 'Swiss German' to be a specific language
    'gsw': {'de-CH'},
    # Of the two countries named 'Congo', prefer the one with Kinshasa
    'CD': {'CG'},
    # Prefer Han script to not include bopomofo
    'Hani': {'Hanb'},
    # Prefer the specific language Tagalog over standard Filipino, because
    # the ambiguous name was probably some form of 'Tagalog'
    'tl': {'fil'},
    # Confusion between Ilokano and Hiligaynon
    'ilo': {'hil'},
    # Prefer Central Atlas Tamazight over Standard Moroccan Tamazight
    'tzm': {'zgh'},
    # Prefer the specific definition of Low Saxon
    'nds-NL': {'nds'},
    # Prefer the specific definition of Mandarin Chinese
    'cmn': {'zh'},
    # Prefer the territorially-specific definition of Dari
    'fa-AF': {'prs', 'fa', 'gbz'},
    # Ambiguity in the scope of Korean script (whether to include Han characters)
    'Kore': {'Hang'},
    # This ambiguity is kind of our fault, for adding an autonym for 'zsm'.
    # "Bahasa Malaysia" should still resolve to the more expected 'ms'.
    'ms': {'zsm'},
    # I think that the CLDR data for Mazanderani confuses Latvia and Lithuania,
    # and Wikipedia tells me it means Latvia. I should make this a CLDR issue
    'lv': {'lt'},
    'LV': {'LT'},
}
# Hand-written corrections applied on top of the CLDR and IANA source data.
# Keys are (language-the-name-is-in, code-being-named); a value of None means
# "drop this entry entirely" rather than "rename it".
OVERRIDES = {
    # When I ask Wiktionary, it tells me that "Breatnais" is Scots Gaelic for
    # Welsh, not Breton, which is "Breatannais". This may be one of those
    # things that's not as standardized as it sounds, but let's at least agree
    # with Wiktionary and avoid a name conflict.
    ("gd", "br"): "Breatannais",
    # 'tagaloga' should be 'tl', not 'fil'
    ("eu", "tl"): "Tagaloga",
    ("eu", "fil"): "Filipinera",
    # 'Dakota' should be 'dak', not 'dar', which is "Dargwa"
    ("af", "dar"): "Dargwa",
    ("af-NA", "dar"): "Dargwa",
    # 'интерлингве' should be 'ie', not 'ia', which is 'интерлингва'
    ("az-Cyrl", "ia"): "интерлингва",
    # Don't confuse Samaritan Hebrew with Samaritan Aramaic
    ("en", "smp"): "Samaritan Hebrew",
    # Don't confuse the Mongol language of New Guinea with Mongolian
    ("en", "mgt"): "Mongol (New Guinea)",
    # Don't confuse Romang with Romani over the name 'Roma'
    ("en", "rmm"): "Romang",
    # 'Tai' is a large language family, and it should not refer exclusively and
    # unrelatedly to a language spoken by 900 people in New Guinea
    ("en", "taw"): "Kalam-Tai",
    # The code for Ladin -- the language that's almost certainly being named in
    # Friulian here -- is "lld". The given code of "lad" seems to be an error,
    # pointing to the Judeo-Spanish language Ladino, which would be less likely
    # to be what you mean when speaking Friulian.
    ("fur", "lad"): None,
    # The Amharic data in v39 appears to have switched the words for 'Western'
    # and 'Eastern'.
    ("am", "011"): "ምዕራባዊ አፍሪካ",  # Western Africa
    ("am", "014"): "ምስራቃዊ አፍሪካ",  # Eastern Africa
    ("am", "155"): "ምዕራባዊ አውሮፓ",  # Western Europe
    ("am", "151"): "ምስራቃዊ አውሮፓ",  # Eastern Europe
}
def resolve_name(key, vals, debug=False):
    """
    Pick the single value a name should resolve to, given every
    (voter, value, priority) triple that claims it.

    Resolution strategy:
    - Only votes at the highest priority present are counted.
    - If those votes agree on a single value, that value wins.
    - If they disagree, a value listed in AMBIGUOUS_PREFERENCES wins when
      all of its rivals appear in its preference set.
    - Otherwise the name stays unresolved and None is returned (with a
      diagnostic dump of the votes when debug is on).
    """
    top = max(priority for (_voter, _value, priority) in vals)
    tally = Counter(value for (_voter, value, priority) in vals if priority == top)
    if len(tally) == 1:
        # Unanimous at the top priority
        (winner, _count), = tally.most_common(1)
        return winner
    for candidate in tally:
        if candidate in AMBIGUOUS_PREFERENCES:
            rivals = set(tally) - {candidate}
            if rivals <= AMBIGUOUS_PREFERENCES[candidate]:
                if debug:
                    print("Resolved: {} -> {}".format(key, candidate))
                return candidate
    # In debug mode, show which languages vote for which name
    if debug and top >= 0:
        votes = defaultdict(list)
        for voter, value, priority in vals:
            if priority == top:
                votes[value].append(voter)
        print("{}:".format(key))
        for value, voters in sorted(votes.items()):
            print("\t{}: {}".format(value, ' '.join(voters)))
    # Don't use names that remain ambiguous
    return None
def resolve_names(name_dict, debug=False):
    """
    Resolve every name in a {name: [(voter, value, priority), ...]} mapping,
    keeping only the names that resolve unambiguously.
    """
    winners = (
        (name, resolve_name(name, candidates, debug=debug))
        for name, candidates in sorted(name_dict.items())
    )
    return {name: value for name, value in winners if value is not None}
def read_cldr_names(language, category):
    """
    Read CLDR's names for things in a particular language.

    `category` is a CLDR localenames section such as 'languages', 'scripts',
    or 'territories'.  Raises FileNotFoundError when the locale has no file
    for that category (callers rely on this).
    """
    filename = data_filename(
        'cldr-json/cldr-json/cldr-localenames-full/main/{}/{}.json'.format(language, category)
    )
    # Use a context manager: the original leaked the file handle returned by
    # the bare open() call.
    with open(filename, encoding='utf-8') as file:
        fulldata = json.load(file)
    return fulldata['main'][language]['localeDisplayNames'][category]
def read_cldr_name_file(langcode, category):
    """
    Read one CLDR name file for `langcode` and return a list of
    (name_language, subtag, name, priority) quads, with manual OVERRIDES
    applied and unhelpful entries filtered out.
    """
    quads = []
    data = read_cldr_names(langcode, category)
    for subtag, name in sorted(data.items()):
        if (langcode, subtag) in OVERRIDES:
            name = OVERRIDES[langcode, subtag]
            if name is None:
                # An override of None means "drop this entry entirely"
                continue
        if subtag == name:
            # Default entries that map a language code to itself, which
            # an inattentive annotator just left there
            continue
        if subtag.endswith('-alt-menu') and name == 'mandarin':
            # The -alt-menu entries are supposed to do things like alphabetize
            # "Mandarin Chinese" under "Chinese, Mandarin". A few languages
            # just put the string "mandarin" there, which seems wrong and
            # messes up our name lookups.
            continue
        # CLDR assigns multiple names to one code by adding -alt-* to
        # the end of the code. For example, the English name of 'az' is
        # Azerbaijani, but the English name of 'az-alt-short' is Azeri.
        # Alternate names are kept, but at a lower priority.
        priority = 3
        if '-alt-' in subtag:
            subtag, _variant = subtag.split('-alt-', 1)
            priority = 1
        if normalize_name(name) == normalize_name(subtag):
            # Giving the name "zh (Hans)" to "zh-Hans" is still lazy
            continue
        quads.append((langcode, subtag, name, priority))
    return quads
def read_iana_registry_names():
    """
    Collect English names from the IANA subtag registry.

    Returns three quad lists: (languages, scripts, territories).  Deprecated
    entries get priority 0, live ones priority 2, and OVERRIDES take
    precedence over the registry's own Description fields.
    """
    # IANA's terminology is 'region' where CLDR's is 'territory'
    buckets = {'language': [], 'script': [], 'region': []}
    for entry in parse_registry():
        bucket = buckets.get(entry['Type'])
        if bucket is None:
            continue
        subtag = entry['Subtag']
        priority = 0 if 'Deprecated' in entry else 2
        if ('en', subtag) in OVERRIDES:
            bucket.append(('en', subtag, OVERRIDES['en', subtag], priority))
        else:
            for desc in entry['Description']:
                bucket.append(('en', subtag, desc, priority))
    return buckets['language'], buckets['script'], buckets['region']
def read_iana_registry_macrolanguages():
    """
    Map each IANA language subtag to its macrolanguage, for the entries
    that declare one (e.g. 'yue' -> 'zh').
    """
    return {
        entry['Subtag']: entry['Macrolanguage']
        for entry in parse_registry()
        if entry['Type'] == 'language' and 'Macrolanguage' in entry
    }
def read_iana_registry_replacements():
    """
    Build a mapping from deprecated codes to their IANA Preferred-Value
    replacements: language subtags map as-is, whole tags map lowercased.
    """
    replacements = {}
    for entry in parse_registry():
        preferred = entry.get('Preferred-Value')
        if preferred is None:
            continue
        if entry['Type'] == 'language':
            # Replacements for language codes
            replacements[entry['Subtag']] = preferred
        elif 'Tag' in entry:
            # Replacements for entire tags
            replacements[entry['Tag'].lower()] = preferred
    return replacements
def read_csv_names(filename):
    """
    Read name quads from a CSV-ish file of `lang,code,name` rows, returning
    (lang, code, name, True) tuples.  The final True marks the entry as an
    explicit local addition/override.
    """
    quads = []
    # Use a context manager: the original leaked the file handle.
    with open(filename, encoding='utf-8') as data:
        for line in data:
            # maxsplit=3 keeps any commas inside the last field intact
            quad = line.rstrip().split(',', 3) + [True]
            quads.append(tuple(quad))
    return quads
def read_wiktionary_names(filename, language):
    """
    Read name quads from a tab-separated Wiktionary export.

    Column 0 is the code, column 1 the primary name (priority -1), and
    column 4 (if present and non-empty) a comma-separated list of alternate
    names; the primary or alternate names are added again at priority -2,
    so Wiktionary data ranks below every other source.
    """
    quads = []
    # Use a context manager: the original leaked the file handle.
    with open(filename, encoding='utf-8') as data:
        for line in data:
            parts = line.rstrip().split('\t')
            code = parts[0]
            quads.append((language, code, parts[1], -1))
            names = [parts[1]]
            if len(parts) > 4 and parts[4]:
                names = parts[4].split(', ')
            for name in names:
                quads.append((language, code, name, -2))
    return quads
def update_names(names_fwd, names_rev, name_quads):
    """
    Fold (name_language, referent, name, priority) quads into the forward
    (code -> {language: name}) and reverse (language -> {name: votes})
    tables, mutating both in place.
    """
    for name_language, referent, name, priority in name_quads:
        # Group the reverse table by the base language only, not the
        # territory or script.
        base_lang = name_language.split('-')[0]
        vote = (name_language, referent, priority)
        normalized = normalize_name(name)
        # Record the vote both under the specific language and under the
        # language-neutral 'und' table.
        for table_key in ('und', base_lang):
            names_rev.setdefault(table_key, {}).setdefault(normalized, []).append(vote)
        # For the forward table, the first name seen for a
        # (referent, name_language) pair wins.
        names_fwd.setdefault(referent, {}).setdefault(name_language, name)
def save_trie(mapping, filename):
    """
    Serialize a {str: str} mapping to disk as a marisa BytesTrie, with
    values stored UTF-8 encoded.
    """
    entries = ((key, value.encode('utf-8')) for key, value in sorted(mapping.items()))
    marisa_trie.BytesTrie(entries).save(filename)
def save_reverse_name_tables(category, rev_dict):
    """
    Resolve every per-language reverse name table and write each one out
    as trie/<language>/name_to_<category>.marisa in the data directory.
    """
    for language, lang_dict in rev_dict.items():
        os.makedirs(data_filename(f"trie/{language}"), exist_ok=True)
        resolved = resolve_names(lang_dict, debug=True)
        save_trie(resolved, data_filename(f"trie/{language}/name_to_{category}.marisa"))
def get_name_languages():
    """
    List the CLDR locales that carry name data: every subdirectory of the
    localenames 'main' directory (except 'root') that contains a
    languages.json file and whose code ends in a lowercase ASCII letter.
    """
    main_dir = Path(data_filename("cldr-json/cldr-json/cldr-localenames-full/main"))
    return [
        entry.name
        for entry in sorted(main_dir.iterdir())
        if entry.name != 'root'
        and (entry / 'languages.json').exists()
        and 'a' <= entry.name[-1] <= 'z'
    ]
def get_population_data():
    """
    Read CLDR's supplementalData.xml territory info and return two
    defaultdicts keyed by language-tag strings:
    (speaking_population, writing_population).

    Populations are accumulated at several levels of specificity at once
    (language; language-territory; language-script; language-script-territory)
    so lookups can be as general or specific as needed.
    """
    import langcodes

    filename = data_filename("supplementalData.xml")
    # Use a context manager and explicit UTF-8: the original leaked the file
    # handle and relied on the locale's default encoding.
    with open(filename, encoding='utf-8') as file:
        root = ET.fromstring(file.read())
    territories = root.findall("./territoryInfo/territory")
    language_population = defaultdict(int)
    language_writing_population = defaultdict(int)
    for territory in territories:
        t_code = territory.attrib['type']
        t_population = float(territory.attrib['population'])
        t_literacy_rate = float(territory.attrib['literacyPercent']) / 100
        for language in territory:
            attrs = language.attrib
            l_code = attrs['type'].replace('_', '-')
            l_proportion = float(attrs.get('populationPercent', 0)) / 100
            # Writing proportion: prefer the language-specific writingPercent,
            # then the language-specific literacyPercent, then fall back to
            # the territory's overall literacy rate.
            if 'writingPercent' in attrs:
                writing_prop = float(attrs['writingPercent']) / 100
            elif 'literacyPercent' in attrs:
                writing_prop = float(attrs['literacyPercent']) / 100
            else:
                writing_prop = t_literacy_rate
            l_population = t_population * l_proportion
            l_writing = t_population * l_proportion * writing_prop
            # Distinguish data in different territories, and also in different
            # scripts when necessary, while also accumulating more general data
            # We need to use maximize() on the bare language code, not just
            # assume_script(), because assumed defaults like 'zh-Hans' are unwritten
            # in the data. We need this if we want to count the relative use of
            # Simplified vs. Traditional Chinese, for example.
            written_ls = (
                langcodes.get(l_code).maximize()._filter_attributes(['language', 'script'])
            )
            written_lst = written_ls.update_dict({'territory': t_code})
            spoken_lt = written_lst._filter_attributes(['language', 'territory'])
            spoken_l = written_lst._filter_attributes(['language'])
            written_lt = written_lst._filter_attributes(['language', 'territory'])
            written_l = written_lst._filter_attributes(['language'])
            # Wrap in set() so a tag equal at two levels is only counted once
            for lang in set([spoken_lt, spoken_l]):
                language_population[str(lang)] += int(round(l_population))
            for lang in set([written_lst, written_lt, written_ls, written_l]):
                language_writing_population[str(lang)] += int(round(l_writing))
    return language_population, language_writing_population
def write_python_dict(outfile, name, d):
    """
    Write Python source that defines `name` as the given dictionary,
    one key-value pair per line, sorted by key.
    """
    outfile.write(f"{name} = {{\n")
    for key in sorted(d):
        outfile.write(f"    {key!r}: {d[key]!r},\n")
    outfile.write("}\n")
def write_python_set(outfile, name, s):
    """
    Write Python source that defines `name` as a set literal, one element
    per line, deduplicated and sorted.
    """
    outfile.write(f"{name} = {{\n")
    for element in sorted(set(s)):
        outfile.write(f"    {element!r},\n")
    outfile.write("}\n")
# Header line written at the top of each generated Python module.
GENERATED_HEADER = "# This file is generated by build_data.py."


def build_data():
    """
    Build every generated data artifact: the per-language name-lookup tries
    and the name_data.py / population_data.py modules.

    NOTE(review): the order of the update_names() calls matters — the first
    name recorded for a (referent, language) pair wins in the forward table,
    so CSV overrides are loaded first, then CLDR, then IANA, then Wiktionary.
    Do not reorder.
    """
    language_names_rev = {}
    territory_names_rev = {}
    script_names_rev = {}
    names_fwd = {}
    # Local CSV overrides go first so they take precedence in names_fwd.
    override_language_data = read_csv_names(data_filename('override_language_names.csv'))
    update_names(names_fwd, language_names_rev, override_language_data)
    for langcode in get_name_languages():
        language_data = read_cldr_name_file(langcode, 'languages')
        update_names(names_fwd, language_names_rev, language_data)
        # Not every locale has script/territory files; skip the ones that don't.
        try:
            script_data = read_cldr_name_file(langcode, 'scripts')
            update_names(names_fwd, script_names_rev, script_data)
        except FileNotFoundError:
            pass
        try:
            territory_data = read_cldr_name_file(langcode, 'territories')
            update_names(names_fwd, territory_names_rev, territory_data)
        except FileNotFoundError:
            pass
    iana_languages, iana_scripts, iana_territories = read_iana_registry_names()
    update_names(names_fwd, language_names_rev, iana_languages)
    update_names(names_fwd, script_names_rev, iana_scripts)
    update_names(names_fwd, territory_names_rev, iana_territories)
    wiktionary_data = read_wiktionary_names(data_filename('wiktionary/codes-en.csv'), 'en')
    update_names(names_fwd, language_names_rev, wiktionary_data)
    extra_language_data = read_csv_names(data_filename('extra_language_names.csv'))
    update_names(names_fwd, language_names_rev, extra_language_data)
    # Write the reverse (name -> code) lookup tries to the data directory.
    save_reverse_name_tables('language', language_names_rev)
    save_reverse_name_tables('script', script_names_rev)
    save_reverse_name_tables('territory', territory_names_rev)
    # Get the list of languages where we have any name data. These are base
    # language codes (without scripts or territories) which contain a name for
    # themselves.
    name_languages = [
        langcode
        for langcode in get_name_languages()
        if '-' not in langcode and langcode in names_fwd and langcode in names_fwd[langcode]
    ]
    # Add the languages that have autonyms in extra_language_data, perhaps because
    # we specifically put them there to get their autonyms right
    name_languages += [lang1 for (lang1, lang2, _, _) in extra_language_data if lang1 == lang2]
    # Write the contents of name_data.py.
    with open('name_data.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        write_python_set(outfile, 'LANGUAGES_WITH_NAME_DATA', name_languages)
        print(file=outfile)
        write_python_dict(outfile, 'CODE_TO_NAMES', names_fwd)
    # Write the contents of population_data.py.
    language_population, language_writing_population = get_population_data()
    with open('population_data.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        write_python_dict(outfile, 'LANGUAGE_SPEAKING_POPULATION', language_population)
        write_python_dict(outfile, 'LANGUAGE_WRITING_POPULATION', language_writing_population)


if __name__ == '__main__':
    build_data()

View file

@ -10,11 +10,11 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatching>
<languageMatches type="written_new">
<paradigmLocales locales="en en_GB es es_419 pt_BR pt_PT"/>
<matchVariable id="$enUS" value="AS+GU+MH+MP+PR+UM+US+VI"/>
<matchVariable id="$enUS" value="AS+CA+GU+MH+MP+PH+PR+UM+US+VI"/>
<matchVariable id="$cnsar" value="HK+MO"/>
<matchVariable id="$americas" value="019"/>
<matchVariable id="$maghreb" value="MA+DZ+TN+LY+MR+EH"/>
<languageMatch desired="no" supported="nb" distance="1"/> <!-- no ⇒ nb -->
<languageMatch desired="nb" supported="no" distance="1"/> <!-- nb ⇒ no -->
<!-- languageMatch desired="ku" supported="ckb" distance="4" oneway="true"/ --> <!-- ku ⇒ ckb -->
<!-- languageMatch desired="ckb" supported="ku" percent="8" oneway="true"/ --> <!-- ckb ⇒ ku -->
<languageMatch desired="hr" supported="bs" distance="4"/> <!-- hr ⇒ bs -->
@ -38,18 +38,23 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatch desired="ach" supported="en" distance="30" oneway="true"/> <!-- Acoli (Southern Luo dialect in Uganda): ach ⇒ en -->
<languageMatch desired="af" supported="nl" distance="20" oneway="true"/> <!-- Afrikaans: af ⇒ nl -->
<languageMatch desired="ak" supported="en" distance="30" oneway="true"/> <!-- Akan: ak ⇒ en -->
<languageMatch desired="am" supported="en" distance="30" oneway="true"/> <!-- Amharic ⇒ English -->
<languageMatch desired="ay" supported="es" distance="20" oneway="true"/> <!-- Aymara: ay ⇒ es -->
<languageMatch desired="az" supported="ru" distance="30" oneway="true"/> <!-- Azerbaijani: az ⇒ ru -->
<languageMatch desired="bal" supported="ur" distance="20" oneway="true"/> <!-- Baluchi ⇒ Urdu -->
<languageMatch desired="be" supported="ru" distance="20" oneway="true"/> <!-- Belarusian: be ⇒ ru -->
<languageMatch desired="bem" supported="en" distance="30" oneway="true"/> <!-- Bemba (Zambia): bem ⇒ en -->
<languageMatch desired="bh" supported="hi" distance="30" oneway="true"/> <!-- Bihari languages (gets canonicalized to bho): bh ⇒ hi -->
<languageMatch desired="bn" supported="en" distance="30" oneway="true"/> <!-- Bangla: bn ⇒ en -->
<languageMatch desired="bo" supported="zh" distance="20" oneway="true"/> <!-- Tibetan ⇒ Chinese -->
<languageMatch desired="br" supported="fr" distance="20" oneway="true"/> <!-- Breton: br ⇒ fr -->
<languageMatch desired="ca" supported="es" distance="20" oneway="true"/> <!-- Catalan ⇒ Spanish -->
<languageMatch desired="ceb" supported="fil" distance="30" oneway="true"/> <!-- Cebuano: ceb ⇒ fil -->
<languageMatch desired="chr" supported="en" distance="20" oneway="true"/> <!-- Cherokee: chr ⇒ en -->
<languageMatch desired="ckb" supported="ar" distance="30" oneway="true"/> <!-- Sorani Kurdish: ckb ⇒ ar -->
<languageMatch desired="co" supported="fr" distance="20" oneway="true"/> <!-- Corsican: co ⇒ fr -->
<languageMatch desired="crs" supported="fr" distance="20" oneway="true"/> <!-- Seselwa Creole French: crs ⇒ fr -->
<languageMatch desired="cs" supported="sk" distance="20"/> <!-- Czech ⇔ Slovak -->
<languageMatch desired="cy" supported="en" distance="20" oneway="true"/> <!-- Welsh: cy ⇒ en -->
<languageMatch desired="ee" supported="en" distance="30" oneway="true"/> <!-- Ewe: ee ⇒ en -->
<languageMatch desired="eo" supported="en" distance="30" oneway="true"/> <!-- Esperanto: eo ⇒ en -->
@ -88,9 +93,10 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatch desired="lo" supported="en" distance="30" oneway="true"/> <!-- Lao: lo ⇒ en -->
<languageMatch desired="loz" supported="en" distance="30" oneway="true"/> <!-- Lozi: loz ⇒ en -->
<languageMatch desired="lua" supported="fr" distance="30" oneway="true"/> <!-- Luba-Lulua: lua ⇒ fr -->
<languageMatch desired="mai" supported="hi" distance="20" oneway="true"/> <!-- Maithili ⇒ Hindi -->
<languageMatch desired="mfe" supported="en" distance="30" oneway="true"/> <!-- Morisyen: mfe ⇒ en -->
<languageMatch desired="mg" supported="fr" distance="30" oneway="true"/> <!-- Malagasy: mg ⇒ fr -->
<languageMatch desired="mi" supported="en" distance="20" oneway="true"/> <!-- Maori: mi ⇒ en -->
<languageMatch desired="mi" supported="en" distance="20" oneway="true"/> <!-- Māori: mi ⇒ en -->
<!-- CLDR-13625: Macedonian should not fall back to Bulgarian -->
<!-- languageMatch desired="mk" supported="bg" distance="30" oneway="true"/--> <!-- Macedonian: mk ⇒ bg -->
@ -137,12 +143,14 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatch desired="tt" supported="ru" distance="30" oneway="true"/> <!-- Tatar: tt ⇒ ru -->
<languageMatch desired="tum" supported="en" distance="30" oneway="true"/> <!-- Tumbuka: tum ⇒ en -->
<languageMatch desired="ug" supported="zh" distance="20" oneway="true"/> <!-- Uighur: ug ⇒ zh -->
<languageMatch desired="uk" supported="ru" distance="20" oneway="true"/> <!-- Ukrainian ⇒ Russian -->
<languageMatch desired="ur" supported="en" distance="30" oneway="true"/> <!-- Urdu: ur ⇒ en -->
<languageMatch desired="uz" supported="ru" distance="30" oneway="true"/> <!-- Uzbek: uz ⇒ ru -->
<languageMatch desired="wo" supported="fr" distance="30" oneway="true"/> <!-- Wolof: wo ⇒ fr -->
<languageMatch desired="xh" supported="en" distance="30" oneway="true"/> <!-- Xhosa: xh ⇒ en -->
<languageMatch desired="yi" supported="en" distance="30" oneway="true"/> <!-- Yiddish: yi ⇒ en -->
<languageMatch desired="yo" supported="en" distance="30" oneway="true"/> <!-- Yoruba: yo ⇒ en -->
<languageMatch desired="za" supported="zh" distance="20" oneway="true"/> <!-- Zhuang languages ⇒ Chinese -->
<languageMatch desired="zu" supported="en" distance="30" oneway="true"/> <!-- Zulu: zu ⇒ en -->
<!-- START generated by GenerateLanguageMatches.java: don't manually change -->
@ -159,7 +167,6 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatch desired="aeb" supported="ar" distance="10" oneway="true"/> <!-- Tunisian Arabic -->
<languageMatch desired="aec" supported="ar" distance="10" oneway="true"/> <!-- Saidi Arabic -->
<languageMatch desired="afb" supported="ar" distance="10" oneway="true"/> <!-- Gulf Arabic -->
<languageMatch desired="ajp" supported="ar" distance="10" oneway="true"/> <!-- South Levantine Arabic -->
<languageMatch desired="apc" supported="ar" distance="10" oneway="true"/> <!-- North Levantine Arabic -->
<languageMatch desired="apd" supported="ar" distance="10" oneway="true"/> <!-- Sudanese Arabic -->
<languageMatch desired="arq" supported="ar" distance="10" oneway="true"/> <!-- Algerian Arabic -->
@ -359,8 +366,10 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatch desired="yue" supported="zh" distance="10" oneway="true"/> <!-- Chinese, Cantonese -->
<!-- END generated by GenerateLanguageMatches.java -->
<languageMatch desired="*" supported="*" distance="80"/> <!-- * ⇒ * -->
<languageMatch desired="am_Ethi" supported="en_Latn" distance="10" oneway="true"/>
<languageMatch desired="az_Latn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- az; Latn ⇒ ru; Cyrl -->
<languageMatch desired="bn_Beng" supported="en_Latn" distance="10" oneway="true"/> <!-- bn; Beng ⇒ en; Latn -->
<languageMatch desired="bo_Tibt" supported="zh_Hans" distance="10" oneway="true"/>
<languageMatch desired="hy_Armn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- hy; Armn ⇒ ru; Cyrl -->
<languageMatch desired="ka_Geor" supported="en_Latn" distance="10" oneway="true"/> <!-- ka; Geor ⇒ en; Latn -->
<languageMatch desired="km_Khmr" supported="en_Latn" distance="10" oneway="true"/> <!-- km; Khmr ⇒ en; Latn -->
@ -382,9 +391,8 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatch desired="uz_Latn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- uz; Latn ⇒ ru; Cyrl -->
<languageMatch desired="yi_Hebr" supported="en_Latn" distance="10" oneway="true"/> <!-- yi; Hebr ⇒ en; Latn -->
<languageMatch desired="sr_Latn" supported="sr_Cyrl" distance="5"/> <!-- sr; Latn ⇒ sr; Cyrl -->
<languageMatch desired="zh_Hans" supported="zh_Hant" distance="15" oneway="true"/> <!-- zh; Hans ⇒ zh; Hant -->
<languageMatch desired="zh_Hant" supported="zh_Hans" distance="19" oneway="true"/> <!-- zh; Hant ⇒ zh; Hans -->
<!-- zh_Hani: Slightly bigger distance than zh_Hant->zh_Hans -->
<languageMatch desired="za_Latn" supported="zh_Hans" distance="10" oneway="true"/>
<!-- zh_Hani: Slightly bigger distance than zh_Hant->zh_Hans was before CLDR-14355 -->
<languageMatch desired="zh_Hani" supported="zh_Hans" distance="20" oneway="true"/>
<languageMatch desired="zh_Hani" supported="zh_Hant" distance="20" oneway="true"/>
<!-- Latin transliterations of some languages, initially from CLDR-13577 -->

File diff suppressed because it is too large Load diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load diff

View file

@ -2,14 +2,11 @@
Used for locating a file in the data directory.
"""
from pkg_resources import resource_filename
DATA_ROOT = resource_filename('language_data', 'data')
import os
from importlib.resources import files
def data_filename(filename):
    """
    Given a relative filename, get the full path to that file in the data
    directory.

    Returns an importlib.resources traversable path, so the data works even
    when the package is installed as a zip/wheel resource.
    """
    # The diff render left both the old pkg_resources return and the new
    # importlib.resources return in place, making the second unreachable;
    # only the importlib.resources form belongs here.
    return files('language_data') / 'data' / filename