Update language-data 1.1 → 1.3 (vendored from `pip install langcodes[data]`).

Change: remove the marisa-trie requirement (ref: HACKS.txt).
This commit is contained in:
JackDandy 2025-06-03 14:03:44 +01:00
parent 382bfcdcb2
commit be3ae7529b
7 changed files with 2722 additions and 1701 deletions

View file

@ -9,6 +9,8 @@
* Update feedparser 6.0.11 (efcb89b) to 6.0.11 (ff59549)
* Update hachoir 3.2.0 (38d759f) to 3.3.0 (48b478f)
* Update idna library 3.7 (1d365e1) to 3.10 (729225d)
* Update langcodes 3.3.0 to 3.5.0
* Update language-data 1.1 to 1.3
* Update Msgpack 1.0.6 (e1d3d5d) to 1.1.0 (0eeabfb)
* Update deprecated pkg_resources 24.0 to Packaging 25.0 (f585376)
* Update profilehooks module 1.13.0.dev0 (99f8a31) to 1.13.1.dev0 (824fcd4)

View file

@ -0,0 +1,498 @@
import marisa_trie
import json
import xml.etree.ElementTree as ET
import os
from pathlib import Path
from collections import defaultdict, Counter
from language_data.names import normalize_name
from language_data.util import data_filename
from language_data.registry_parser import parse_registry
# Naming things is hard, especially languages
# ===========================================
#
# CLDR is supposed to avoid ambiguous language names, particularly among its
# core languages. But it seems that languages are incompletely disambiguated.
#
# It's convenient to be able to get a language by its name, without having to
# also refer to the language that the name is in. In most cases, we can do this
# unambiguously. With the disambiguations and overrides here, this will work
# in a lot of cases. However, some names such as 'Dongo', 'Fala', 'Malayo', and
# 'Tonga' are ambiguous in ways that can only be disambiguated by specifying
# the language the name is in.
#
# Ambiguous names can arise from:
#
# - Ambiguities in the scope of a name. These tend to span languages, and the
# data files mask the fact that these names are generally ambiguous *within*
# a language. This is why we have codes.
#
# - Names that just happen to be ambiguous between different things with
# different etymologies.
#
# Most doubly-claimed language names have standard ways to disambiguate
# them in CLDR, but names such as 'Tonga' and 'Fala' have complex
# inter-language ambiguities.
#
# Our approach is:
#
# - Fix conflicts that seem to arise simply from errors in the data, by
# overriding the data.
#
# - Fix ambiguities in scope by preferring one scope over another. For example,
# "North America" could refer to a territory that includes Central America or
# a territory that doesn't. In any such conflict, we choose to include Central
# America.
#
# - Avoid ambiguities between different sources of data, by using an order
# of precedence. CLDR data takes priority over IANA data, which takes priority
# over Wiktionary data.
#
# - When ambiguity remains, that name is not resolvable to a language code.
# Resolving the name might require a more specific name, or specifying the
# language that the name is in.
# Preference table consulted by resolve_name() when one normalized name is
# claimed by several codes at the same priority.  A key (the preferred code)
# wins the name only if every competing code appears in its value set.
AMBIGUOUS_PREFERENCES = {
    # Prefer 'Micronesia' to refer to the Federated States of Micronesia -
    # this seems to be poorly disambiguated in many languages, but we can't
    # do much with a code for the general region of Micronesia
    'FM': {'057'},
    # Prefer the country of South Africa over the general region of southern
    # Africa, in languages that don't distinguish them
    'ZA': {'018'},
    # Prefer territory 003 for 'North America', which includes Central America
    # and the Caribbean, over territory 021, which excludes them
    '003': {'021'},
    # Prefer territory 005 for 'Lulli-Amerihkká' (South America), over territory
    # 419, which includes Central America
    '005': {'419'},
    # If a name like "Amerika" is ambiguous between the Americas and the United
    # States of America, choose the Americas
    '019': {'US'},
    # Prefer 'Swiss German' to be a specific language
    'gsw': {'de-CH'},
    # Of the two countries named 'Congo', prefer the one with Kinshasa
    'CD': {'CG'},
    # Prefer Han script to not include bopomofo
    'Hani': {'Hanb'},
    # Prefer the specific language Tagalog over standard Filipino, because
    # the ambiguous name was probably some form of 'Tagalog'
    'tl': {'fil'},
    # Confusion between Ilokano and Hiligaynon
    'ilo': {'hil'},
    # Prefer Central Atlas Tamazight over Standard Moroccan Tamazight
    'tzm': {'zgh'},
    # Prefer the specific definition of Low Saxon
    'nds-NL': {'nds'},
    # Prefer the specific definition of Mandarin Chinese
    'cmn': {'zh'},
    # Prefer the territorially-specific definition of Dari
    'fa-AF': {'prs', 'fa', 'gbz'},
    # Ambiguity in the scope of Korean script (whether to include Han characters)
    'Kore': {'Hang'},
    # This ambiguity is kind of our fault, for adding an autonym for 'zsm'.
    # "Bahasa Malaysia" should still resolve to the more expected 'ms'.
    'ms': {'zsm'},
    # I think that the CLDR data for Mazanderani confuses Latvia and Lithuania,
    # and Wikipedia tells me it means Latvia. I should make this a CLDR issue
    'lv': {'lt'},
    'LV': {'LT'},
}
# Hand-written corrections applied on top of the CLDR and IANA source data.
# Keys are (language-the-name-is-in, code-being-named); a value of None means
# "drop this entry entirely" rather than "rename it".
OVERRIDES = {
    # When I ask Wiktionary, it tells me that "Breatnais" is Scots Gaelic for
    # Welsh, not Breton, which is "Breatannais". This may be one of those
    # things that's not as standardized as it sounds, but let's at least agree
    # with Wiktionary and avoid a name conflict.
    ("gd", "br"): "Breatannais",
    # 'tagaloga' should be 'tl', not 'fil'
    ("eu", "tl"): "Tagaloga",
    ("eu", "fil"): "Filipinera",
    # 'Dakota' should be 'dak', not 'dar', which is "Dargwa"
    ("af", "dar"): "Dargwa",
    ("af-NA", "dar"): "Dargwa",
    # 'интерлингве' should be 'ie', not 'ia', which is 'интерлингва'
    ("az-Cyrl", "ia"): "интерлингва",
    # Don't confuse Samaritan Hebrew with Samaritan Aramaic
    ("en", "smp"): "Samaritan Hebrew",
    # Don't confuse the Mongol language of New Guinea with Mongolian
    ("en", "mgt"): "Mongol (New Guinea)",
    # Don't confuse Romang with Romani over the name 'Roma'
    ("en", "rmm"): "Romang",
    # 'Tai' is a large language family, and it should not refer exclusively and
    # unrelatedly to a language spoken by 900 people in New Guinea
    ("en", "taw"): "Kalam-Tai",
    # The code for Ladin -- the language that's almost certainly being named in
    # Friulian here -- is "lld". The given code of "lad" seems to be an error,
    # pointing to the Judeo-Spanish language Ladino, which would be less likely
    # to be what you mean when speaking Friulian.
    ("fur", "lad"): None,
    # The Amharic data in v39 appears to have switched the words for 'Western'
    # and 'Eastern'.
    ("am", "011"): "ምዕራባዊ አፍሪካ",  # Western Africa
    ("am", "014"): "ምስራቃዊ አፍሪካ",  # Eastern Africa
    ("am", "155"): "ምዕራባዊ አውሮፓ",  # Western Europe
    ("am", "151"): "ምስራቃዊ አውሮፓ",  # Eastern Europe
}
def resolve_name(key, vals, debug=False):
    """
    Pick the single value a name should resolve to, given every
    (voter, value, priority) triple that claims it.

    Resolution strategy:
    - Only votes at the highest priority present are counted.
    - If those votes agree on a single value, that value wins.
    - If they disagree, a value listed in AMBIGUOUS_PREFERENCES wins when
      all of its rivals appear in its preference set.
    - Otherwise the name stays unresolved and None is returned (with a
      diagnostic dump of the votes when debug is on).
    """
    top = max(priority for (_voter, _value, priority) in vals)
    tally = Counter(value for (_voter, value, priority) in vals if priority == top)
    if len(tally) == 1:
        # Unanimous at the top priority
        (winner, _count), = tally.most_common(1)
        return winner
    for candidate in tally:
        if candidate in AMBIGUOUS_PREFERENCES:
            rivals = set(tally) - {candidate}
            if rivals <= AMBIGUOUS_PREFERENCES[candidate]:
                if debug:
                    print("Resolved: {} -> {}".format(key, candidate))
                return candidate
    # In debug mode, show which languages vote for which name
    if debug and top >= 0:
        votes = defaultdict(list)
        for voter, value, priority in vals:
            if priority == top:
                votes[value].append(voter)
        print("{}:".format(key))
        for value, voters in sorted(votes.items()):
            print("\t{}: {}".format(value, ' '.join(voters)))
    # Don't use names that remain ambiguous
    return None
def resolve_names(name_dict, debug=False):
    """
    Resolve every name in a {name: [(voter, value, priority), ...]} mapping,
    keeping only the names that resolve unambiguously.
    """
    winners = (
        (name, resolve_name(name, candidates, debug=debug))
        for name, candidates in sorted(name_dict.items())
    )
    return {name: value for name, value in winners if value is not None}
def read_cldr_names(language, category):
    """
    Read CLDR's names for things in a particular language.

    `category` is a CLDR localenames section such as 'languages', 'scripts',
    or 'territories'.  Raises FileNotFoundError when the locale has no file
    for that category (callers rely on this).
    """
    filename = data_filename(
        'cldr-json/cldr-json/cldr-localenames-full/main/{}/{}.json'.format(language, category)
    )
    # Use a context manager: the original leaked the file handle returned by
    # the bare open() call.
    with open(filename, encoding='utf-8') as file:
        fulldata = json.load(file)
    return fulldata['main'][language]['localeDisplayNames'][category]
def read_cldr_name_file(langcode, category):
    """
    Read one CLDR name file for `langcode` and return a list of
    (name_language, subtag, name, priority) quads, with manual OVERRIDES
    applied and unhelpful entries filtered out.
    """
    quads = []
    data = read_cldr_names(langcode, category)
    for subtag, name in sorted(data.items()):
        if (langcode, subtag) in OVERRIDES:
            name = OVERRIDES[langcode, subtag]
            if name is None:
                # An override of None means "drop this entry entirely"
                continue
        if subtag == name:
            # Default entries that map a language code to itself, which
            # an inattentive annotator just left there
            continue
        if subtag.endswith('-alt-menu') and name == 'mandarin':
            # The -alt-menu entries are supposed to do things like alphabetize
            # "Mandarin Chinese" under "Chinese, Mandarin". A few languages
            # just put the string "mandarin" there, which seems wrong and
            # messes up our name lookups.
            continue
        # CLDR assigns multiple names to one code by adding -alt-* to
        # the end of the code. For example, the English name of 'az' is
        # Azerbaijani, but the English name of 'az-alt-short' is Azeri.
        # Alternate names are kept, but at a lower priority.
        priority = 3
        if '-alt-' in subtag:
            subtag, _variant = subtag.split('-alt-', 1)
            priority = 1
        if normalize_name(name) == normalize_name(subtag):
            # Giving the name "zh (Hans)" to "zh-Hans" is still lazy
            continue
        quads.append((langcode, subtag, name, priority))
    return quads
def read_iana_registry_names():
    """
    Collect English names from the IANA subtag registry.

    Returns three quad lists: (languages, scripts, territories).  Deprecated
    entries get priority 0, live ones priority 2, and OVERRIDES take
    precedence over the registry's own Description fields.
    """
    # IANA's terminology is 'region' where CLDR's is 'territory'
    buckets = {'language': [], 'script': [], 'region': []}
    for entry in parse_registry():
        bucket = buckets.get(entry['Type'])
        if bucket is None:
            continue
        subtag = entry['Subtag']
        priority = 0 if 'Deprecated' in entry else 2
        if ('en', subtag) in OVERRIDES:
            bucket.append(('en', subtag, OVERRIDES['en', subtag], priority))
        else:
            for desc in entry['Description']:
                bucket.append(('en', subtag, desc, priority))
    return buckets['language'], buckets['script'], buckets['region']
def read_iana_registry_macrolanguages():
    """
    Map each IANA language subtag to its macrolanguage, for the entries
    that declare one (e.g. 'yue' -> 'zh').
    """
    return {
        entry['Subtag']: entry['Macrolanguage']
        for entry in parse_registry()
        if entry['Type'] == 'language' and 'Macrolanguage' in entry
    }
def read_iana_registry_replacements():
    """
    Build a mapping from deprecated codes to their IANA Preferred-Value
    replacements: language subtags map as-is, whole tags map lowercased.
    """
    replacements = {}
    for entry in parse_registry():
        preferred = entry.get('Preferred-Value')
        if preferred is None:
            continue
        if entry['Type'] == 'language':
            # Replacements for language codes
            replacements[entry['Subtag']] = preferred
        elif 'Tag' in entry:
            # Replacements for entire tags
            replacements[entry['Tag'].lower()] = preferred
    return replacements
def read_csv_names(filename):
    """
    Read name quads from a CSV-ish file of `lang,code,name` rows, returning
    (lang, code, name, True) tuples.  The final True marks the entry as an
    explicit local addition/override.
    """
    quads = []
    # Use a context manager: the original leaked the file handle.
    with open(filename, encoding='utf-8') as data:
        for line in data:
            # maxsplit=3 keeps any commas inside the last field intact
            quad = line.rstrip().split(',', 3) + [True]
            quads.append(tuple(quad))
    return quads
def read_wiktionary_names(filename, language):
    """
    Read name quads from a tab-separated Wiktionary export.

    Column 0 is the code, column 1 the primary name (priority -1), and
    column 4 (if present and non-empty) a comma-separated list of alternate
    names; the primary or alternate names are added again at priority -2,
    so Wiktionary data ranks below every other source.
    """
    quads = []
    # Use a context manager: the original leaked the file handle.
    with open(filename, encoding='utf-8') as data:
        for line in data:
            parts = line.rstrip().split('\t')
            code = parts[0]
            quads.append((language, code, parts[1], -1))
            names = [parts[1]]
            if len(parts) > 4 and parts[4]:
                names = parts[4].split(', ')
            for name in names:
                quads.append((language, code, name, -2))
    return quads
def update_names(names_fwd, names_rev, name_quads):
    """
    Fold (name_language, referent, name, priority) quads into the forward
    (code -> {language: name}) and reverse (language -> {name: votes})
    tables, mutating both in place.
    """
    for name_language, referent, name, priority in name_quads:
        # Group the reverse table by the base language only, not the
        # territory or script.
        base_lang = name_language.split('-')[0]
        vote = (name_language, referent, priority)
        normalized = normalize_name(name)
        # Record the vote both under the specific language and under the
        # language-neutral 'und' table.
        for table_key in ('und', base_lang):
            names_rev.setdefault(table_key, {}).setdefault(normalized, []).append(vote)
        # For the forward table, the first name seen for a
        # (referent, name_language) pair wins.
        names_fwd.setdefault(referent, {}).setdefault(name_language, name)
def save_trie(mapping, filename):
    """
    Serialize a {str: str} mapping to disk as a marisa BytesTrie, with
    values stored UTF-8 encoded.
    """
    entries = ((key, value.encode('utf-8')) for key, value in sorted(mapping.items()))
    marisa_trie.BytesTrie(entries).save(filename)
def save_reverse_name_tables(category, rev_dict):
    """
    Resolve every per-language reverse name table and write each one out
    as trie/<language>/name_to_<category>.marisa in the data directory.
    """
    for language, lang_dict in rev_dict.items():
        os.makedirs(data_filename(f"trie/{language}"), exist_ok=True)
        resolved = resolve_names(lang_dict, debug=True)
        save_trie(resolved, data_filename(f"trie/{language}/name_to_{category}.marisa"))
def get_name_languages():
    """
    List the CLDR locales that carry name data: every subdirectory of the
    localenames 'main' directory (except 'root') that contains a
    languages.json file and whose code ends in a lowercase ASCII letter.
    """
    main_dir = Path(data_filename("cldr-json/cldr-json/cldr-localenames-full/main"))
    return [
        entry.name
        for entry in sorted(main_dir.iterdir())
        if entry.name != 'root'
        and (entry / 'languages.json').exists()
        and 'a' <= entry.name[-1] <= 'z'
    ]
def get_population_data():
    """
    Read CLDR's supplementalData.xml territory info and return two
    defaultdicts keyed by language-tag strings:
    (speaking_population, writing_population).

    Populations are accumulated at several levels of specificity at once
    (language; language-territory; language-script; language-script-territory)
    so lookups can be as general or specific as needed.
    """
    import langcodes

    filename = data_filename("supplementalData.xml")
    # Use a context manager and explicit UTF-8: the original leaked the file
    # handle and relied on the locale's default encoding.
    with open(filename, encoding='utf-8') as file:
        root = ET.fromstring(file.read())
    territories = root.findall("./territoryInfo/territory")
    language_population = defaultdict(int)
    language_writing_population = defaultdict(int)
    for territory in territories:
        t_code = territory.attrib['type']
        t_population = float(territory.attrib['population'])
        t_literacy_rate = float(territory.attrib['literacyPercent']) / 100
        for language in territory:
            attrs = language.attrib
            l_code = attrs['type'].replace('_', '-')
            l_proportion = float(attrs.get('populationPercent', 0)) / 100
            # Writing proportion: prefer the language-specific writingPercent,
            # then the language-specific literacyPercent, then fall back to
            # the territory's overall literacy rate.
            if 'writingPercent' in attrs:
                writing_prop = float(attrs['writingPercent']) / 100
            elif 'literacyPercent' in attrs:
                writing_prop = float(attrs['literacyPercent']) / 100
            else:
                writing_prop = t_literacy_rate
            l_population = t_population * l_proportion
            l_writing = t_population * l_proportion * writing_prop
            # Distinguish data in different territories, and also in different
            # scripts when necessary, while also accumulating more general data
            # We need to use maximize() on the bare language code, not just
            # assume_script(), because assumed defaults like 'zh-Hans' are unwritten
            # in the data. We need this if we want to count the relative use of
            # Simplified vs. Traditional Chinese, for example.
            written_ls = (
                langcodes.get(l_code).maximize()._filter_attributes(['language', 'script'])
            )
            written_lst = written_ls.update_dict({'territory': t_code})
            spoken_lt = written_lst._filter_attributes(['language', 'territory'])
            spoken_l = written_lst._filter_attributes(['language'])
            written_lt = written_lst._filter_attributes(['language', 'territory'])
            written_l = written_lst._filter_attributes(['language'])
            # Wrap in set() so a tag equal at two levels is only counted once
            for lang in set([spoken_lt, spoken_l]):
                language_population[str(lang)] += int(round(l_population))
            for lang in set([written_lst, written_lt, written_ls, written_l]):
                language_writing_population[str(lang)] += int(round(l_writing))
    return language_population, language_writing_population
def write_python_dict(outfile, name, d):
    """
    Write Python source that defines `name` as the given dictionary,
    one key-value pair per line, sorted by key.
    """
    outfile.write(f"{name} = {{\n")
    for key in sorted(d):
        outfile.write(f"    {key!r}: {d[key]!r},\n")
    outfile.write("}\n")
def write_python_set(outfile, name, s):
    """
    Write Python source that defines `name` as a set literal, one element
    per line, deduplicated and sorted.
    """
    outfile.write(f"{name} = {{\n")
    for element in sorted(set(s)):
        outfile.write(f"    {element!r},\n")
    outfile.write("}\n")
# Header line written at the top of each generated Python module.
GENERATED_HEADER = "# This file is generated by build_data.py."


def build_data():
    """
    Build every generated data artifact: the per-language name-lookup tries
    and the name_data.py / population_data.py modules.

    NOTE(review): the order of the update_names() calls matters — the first
    name recorded for a (referent, language) pair wins in the forward table,
    so CSV overrides are loaded first, then CLDR, then IANA, then Wiktionary.
    Do not reorder.
    """
    language_names_rev = {}
    territory_names_rev = {}
    script_names_rev = {}
    names_fwd = {}
    # Local CSV overrides go first so they take precedence in names_fwd.
    override_language_data = read_csv_names(data_filename('override_language_names.csv'))
    update_names(names_fwd, language_names_rev, override_language_data)
    for langcode in get_name_languages():
        language_data = read_cldr_name_file(langcode, 'languages')
        update_names(names_fwd, language_names_rev, language_data)
        # Not every locale has script/territory files; skip the ones that don't.
        try:
            script_data = read_cldr_name_file(langcode, 'scripts')
            update_names(names_fwd, script_names_rev, script_data)
        except FileNotFoundError:
            pass
        try:
            territory_data = read_cldr_name_file(langcode, 'territories')
            update_names(names_fwd, territory_names_rev, territory_data)
        except FileNotFoundError:
            pass
    iana_languages, iana_scripts, iana_territories = read_iana_registry_names()
    update_names(names_fwd, language_names_rev, iana_languages)
    update_names(names_fwd, script_names_rev, iana_scripts)
    update_names(names_fwd, territory_names_rev, iana_territories)
    wiktionary_data = read_wiktionary_names(data_filename('wiktionary/codes-en.csv'), 'en')
    update_names(names_fwd, language_names_rev, wiktionary_data)
    extra_language_data = read_csv_names(data_filename('extra_language_names.csv'))
    update_names(names_fwd, language_names_rev, extra_language_data)
    # Write the reverse (name -> code) lookup tries to the data directory.
    save_reverse_name_tables('language', language_names_rev)
    save_reverse_name_tables('script', script_names_rev)
    save_reverse_name_tables('territory', territory_names_rev)
    # Get the list of languages where we have any name data. These are base
    # language codes (without scripts or territories) which contain a name for
    # themselves.
    name_languages = [
        langcode
        for langcode in get_name_languages()
        if '-' not in langcode and langcode in names_fwd and langcode in names_fwd[langcode]
    ]
    # Add the languages that have autonyms in extra_language_data, perhaps because
    # we specifically put them there to get their autonyms right
    name_languages += [lang1 for (lang1, lang2, _, _) in extra_language_data if lang1 == lang2]
    # Write the contents of name_data.py.
    with open('name_data.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        write_python_set(outfile, 'LANGUAGES_WITH_NAME_DATA', name_languages)
        print(file=outfile)
        write_python_dict(outfile, 'CODE_TO_NAMES', names_fwd)
    # Write the contents of population_data.py.
    language_population, language_writing_population = get_population_data()
    with open('population_data.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        write_python_dict(outfile, 'LANGUAGE_SPEAKING_POPULATION', language_population)
        write_python_dict(outfile, 'LANGUAGE_WRITING_POPULATION', language_writing_population)


if __name__ == '__main__':
    build_data()

View file

@ -10,11 +10,11 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatching>
<languageMatches type="written_new">
<paradigmLocales locales="en en_GB es es_419 pt_BR pt_PT"/>
<matchVariable id="$enUS" value="AS+GU+MH+MP+PR+UM+US+VI"/>
<matchVariable id="$enUS" value="AS+CA+GU+MH+MP+PH+PR+UM+US+VI"/>
<matchVariable id="$cnsar" value="HK+MO"/>
<matchVariable id="$americas" value="019"/>
<matchVariable id="$maghreb" value="MA+DZ+TN+LY+MR+EH"/>
<languageMatch desired="no" supported="nb" distance="1"/> <!-- no ⇒ nb -->
<languageMatch desired="nb" supported="no" distance="1"/> <!-- nb ⇒ no -->
<!-- languageMatch desired="ku" supported="ckb" distance="4" oneway="true"/ --> <!-- ku ⇒ ckb -->
<!-- languageMatch desired="ckb" supported="ku" percent="8" oneway="true"/ --> <!-- ckb ⇒ ku -->
<languageMatch desired="hr" supported="bs" distance="4"/> <!-- hr ⇒ bs -->
@ -38,18 +38,23 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatch desired="ach" supported="en" distance="30" oneway="true"/> <!-- Acoli (Southern Luo dialect in Uganda): ach ⇒ en -->
<languageMatch desired="af" supported="nl" distance="20" oneway="true"/> <!-- Afrikaans: af ⇒ nl -->
<languageMatch desired="ak" supported="en" distance="30" oneway="true"/> <!-- Akan: ak ⇒ en -->
<languageMatch desired="am" supported="en" distance="30" oneway="true"/> <!-- Amharic ⇒ English -->
<languageMatch desired="ay" supported="es" distance="20" oneway="true"/> <!-- Aymara: ay ⇒ es -->
<languageMatch desired="az" supported="ru" distance="30" oneway="true"/> <!-- Azerbaijani: az ⇒ ru -->
<languageMatch desired="bal" supported="ur" distance="20" oneway="true"/> <!-- Baluchi ⇒ Urdu -->
<languageMatch desired="be" supported="ru" distance="20" oneway="true"/> <!-- Belarusian: be ⇒ ru -->
<languageMatch desired="bem" supported="en" distance="30" oneway="true"/> <!-- Bemba (Zambia): bem ⇒ en -->
<languageMatch desired="bh" supported="hi" distance="30" oneway="true"/> <!-- Bihari languages (gets canonicalized to bho): bh ⇒ hi -->
<languageMatch desired="bn" supported="en" distance="30" oneway="true"/> <!-- Bangla: bn ⇒ en -->
<languageMatch desired="bo" supported="zh" distance="20" oneway="true"/> <!-- Tibetan ⇒ Chinese -->
<languageMatch desired="br" supported="fr" distance="20" oneway="true"/> <!-- Breton: br ⇒ fr -->
<languageMatch desired="ca" supported="es" distance="20" oneway="true"/> <!-- Catalan ⇒ Spanish -->
<languageMatch desired="ceb" supported="fil" distance="30" oneway="true"/> <!-- Cebuano: ceb ⇒ fil -->
<languageMatch desired="chr" supported="en" distance="20" oneway="true"/> <!-- Cherokee: chr ⇒ en -->
<languageMatch desired="ckb" supported="ar" distance="30" oneway="true"/> <!-- Sorani Kurdish: ckb ⇒ ar -->
<languageMatch desired="co" supported="fr" distance="20" oneway="true"/> <!-- Corsican: co ⇒ fr -->
<languageMatch desired="crs" supported="fr" distance="20" oneway="true"/> <!-- Seselwa Creole French: crs ⇒ fr -->
<languageMatch desired="cs" supported="sk" distance="20"/> <!-- Czech ⇔ Slovak -->
<languageMatch desired="cy" supported="en" distance="20" oneway="true"/> <!-- Welsh: cy ⇒ en -->
<languageMatch desired="ee" supported="en" distance="30" oneway="true"/> <!-- Ewe: ee ⇒ en -->
<languageMatch desired="eo" supported="en" distance="30" oneway="true"/> <!-- Esperanto: eo ⇒ en -->
@ -88,9 +93,10 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatch desired="lo" supported="en" distance="30" oneway="true"/> <!-- Lao: lo ⇒ en -->
<languageMatch desired="loz" supported="en" distance="30" oneway="true"/> <!-- Lozi: loz ⇒ en -->
<languageMatch desired="lua" supported="fr" distance="30" oneway="true"/> <!-- Luba-Lulua: lua ⇒ fr -->
<languageMatch desired="mai" supported="hi" distance="20" oneway="true"/> <!-- Maithili ⇒ Hindi -->
<languageMatch desired="mfe" supported="en" distance="30" oneway="true"/> <!-- Morisyen: mfe ⇒ en -->
<languageMatch desired="mg" supported="fr" distance="30" oneway="true"/> <!-- Malagasy: mg ⇒ fr -->
<languageMatch desired="mi" supported="en" distance="20" oneway="true"/> <!-- Maori: mi ⇒ en -->
<languageMatch desired="mi" supported="en" distance="20" oneway="true"/> <!-- Māori: mi ⇒ en -->
<!-- CLDR-13625: Macedonian should not fall back to Bulgarian -->
<!-- languageMatch desired="mk" supported="bg" distance="30" oneway="true"/--> <!-- Macedonian: mk ⇒ bg -->
@ -137,12 +143,14 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatch desired="tt" supported="ru" distance="30" oneway="true"/> <!-- Tatar: tt ⇒ ru -->
<languageMatch desired="tum" supported="en" distance="30" oneway="true"/> <!-- Tumbuka: tum ⇒ en -->
<languageMatch desired="ug" supported="zh" distance="20" oneway="true"/> <!-- Uighur: ug ⇒ zh -->
<languageMatch desired="uk" supported="ru" distance="20" oneway="true"/> <!-- Ukrainian ⇒ Russian -->
<languageMatch desired="ur" supported="en" distance="30" oneway="true"/> <!-- Urdu: ur ⇒ en -->
<languageMatch desired="uz" supported="ru" distance="30" oneway="true"/> <!-- Uzbek: uz ⇒ ru -->
<languageMatch desired="wo" supported="fr" distance="30" oneway="true"/> <!-- Wolof: wo ⇒ fr -->
<languageMatch desired="xh" supported="en" distance="30" oneway="true"/> <!-- Xhosa: xh ⇒ en -->
<languageMatch desired="yi" supported="en" distance="30" oneway="true"/> <!-- Yiddish: yi ⇒ en -->
<languageMatch desired="yo" supported="en" distance="30" oneway="true"/> <!-- Yoruba: yo ⇒ en -->
<languageMatch desired="za" supported="zh" distance="20" oneway="true"/> <!-- Zhuang languages ⇒ Chinese -->
<languageMatch desired="zu" supported="en" distance="30" oneway="true"/> <!-- Zulu: zu ⇒ en -->
<!-- START generated by GenerateLanguageMatches.java: don't manually change -->
@ -159,7 +167,6 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatch desired="aeb" supported="ar" distance="10" oneway="true"/> <!-- Tunisian Arabic -->
<languageMatch desired="aec" supported="ar" distance="10" oneway="true"/> <!-- Saidi Arabic -->
<languageMatch desired="afb" supported="ar" distance="10" oneway="true"/> <!-- Gulf Arabic -->
<languageMatch desired="ajp" supported="ar" distance="10" oneway="true"/> <!-- South Levantine Arabic -->
<languageMatch desired="apc" supported="ar" distance="10" oneway="true"/> <!-- North Levantine Arabic -->
<languageMatch desired="apd" supported="ar" distance="10" oneway="true"/> <!-- Sudanese Arabic -->
<languageMatch desired="arq" supported="ar" distance="10" oneway="true"/> <!-- Algerian Arabic -->
@ -359,8 +366,10 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatch desired="yue" supported="zh" distance="10" oneway="true"/> <!-- Chinese, Cantonese -->
<!-- END generated by GenerateLanguageMatches.java -->
<languageMatch desired="*" supported="*" distance="80"/> <!-- * ⇒ * -->
<languageMatch desired="am_Ethi" supported="en_Latn" distance="10" oneway="true"/>
<languageMatch desired="az_Latn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- az; Latn ⇒ ru; Cyrl -->
<languageMatch desired="bn_Beng" supported="en_Latn" distance="10" oneway="true"/> <!-- bn; Beng ⇒ en; Latn -->
<languageMatch desired="bo_Tibt" supported="zh_Hans" distance="10" oneway="true"/>
<languageMatch desired="hy_Armn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- hy; Armn ⇒ ru; Cyrl -->
<languageMatch desired="ka_Geor" supported="en_Latn" distance="10" oneway="true"/> <!-- ka; Geor ⇒ en; Latn -->
<languageMatch desired="km_Khmr" supported="en_Latn" distance="10" oneway="true"/> <!-- km; Khmr ⇒ en; Latn -->
@ -382,9 +391,8 @@ For terms of use, see http://www.unicode.org/copyright.html
<languageMatch desired="uz_Latn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- uz; Latn ⇒ ru; Cyrl -->
<languageMatch desired="yi_Hebr" supported="en_Latn" distance="10" oneway="true"/> <!-- yi; Hebr ⇒ en; Latn -->
<languageMatch desired="sr_Latn" supported="sr_Cyrl" distance="5"/> <!-- sr; Latn ⇒ sr; Cyrl -->
<languageMatch desired="zh_Hans" supported="zh_Hant" distance="15" oneway="true"/> <!-- zh; Hans ⇒ zh; Hant -->
<languageMatch desired="zh_Hant" supported="zh_Hans" distance="19" oneway="true"/> <!-- zh; Hant ⇒ zh; Hans -->
<!-- zh_Hani: Slightly bigger distance than zh_Hant->zh_Hans -->
<languageMatch desired="za_Latn" supported="zh_Hans" distance="10" oneway="true"/>
<!-- zh_Hani: Slightly bigger distance than zh_Hant->zh_Hans was before CLDR-14355 -->
<languageMatch desired="zh_Hani" supported="zh_Hans" distance="20" oneway="true"/>
<languageMatch desired="zh_Hani" supported="zh_Hant" distance="20" oneway="true"/>
<!-- Latin transliterations of some languages, initially from CLDR-13577 -->

File diff suppressed because it is too large Load diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load diff

View file

@ -2,14 +2,11 @@
Used for locating a file in the data directory.
"""
from pkg_resources import resource_filename
DATA_ROOT = resource_filename('language_data', 'data')
import os
from importlib.resources import files
def data_filename(filename):
    """
    Given a relative filename, get the full path to that file in the data
    directory.

    Returns an importlib.resources traversable path, so the data works even
    when the package is installed as a zip/wheel resource.
    """
    # The diff render left both the old pkg_resources return and the new
    # importlib.resources return in place, making the second unreachable;
    # only the importlib.resources form belongs here.
    return files('language_data') / 'data' / filename