Add langcodes 3.3.0 and language-data 1.1 (vendor'd from pip install langcodes[data])

Add a Select2 drop-down to `add-shows` and `edit-show`. Select2 enables displaying inline language flag images, a feature that the native `select` drop-down element no longer supports on some browsers. Change run existing TVInfo source language lists through validation (removes ~4 bad items), de-dupe the list, and get the native names, English names, and three-letter abbreviations. Change remove marisa-trie requirement from language_data/names.py because nothing in SG calls a function that requires it. Change update some flags.
2024-11-21 20:35:05 +00:00 · 2023-01-26 03:30:07 +00:00 · 2023-01-26 03:30:07 +00:00 · 9009cc7a7b
commit 9009cc7a7b
parent fce8878fa9
36 changed files with 134485 additions and 32 deletions
--- a/HACKS.txt
+++ b/HACKS.txt
@ -17,6 +17,7 @@ Libs with customisations...
 /lib/hachoir_parser/guess.py
 /lib/hachoir_parser/misc/torrent.py
 /lib/imdbpie
 /lib/language_data/names.py
 /lib/lockfile/mkdirlockfile.py
 /lib/rtorrent
 /lib/scandir/scandir.py
--- a/gui/slick/css/style.css
+++ b/gui/slick/css/style.css
@ -1312,6 +1312,9 @@ div.formpaginate{
 	width:480px;
 	margin-top:0
 }
 #addShowForm #nameToSearch.select2{
 	width:428px;
 }
 #addShowForm #nameToSearch.wide{
 	width:591px;
 }
@ -3790,6 +3793,13 @@ option.flag{
 	background-position:10px 50%
 }
 #select2-infosrc-lang-select-container .flag,
 #select2-infosrc-lang-select-results .flag{
 	padding-left:25px;
 	background-repeat:no-repeat;
 	background-position:0 50%
 }
 /* Anime section for editShow */
 .anigrouplists-wrapper{
 	height:auto;
--- a/gui/slick/images/flags/hy.png
+++ b/gui/slick/images/flags/hy.png
--- a/gui/slick/images/flags/ka.png
+++ b/gui/slick/images/flags/ka.png
--- a/gui/slick/images/flags/nb.png
+++ b/gui/slick/images/flags/nb.png
--- a/gui/slick/images/flags/nn.png
+++ b/gui/slick/images/flags/nn.png
--- a/gui/slick/images/flags/sq.png
+++ b/gui/slick/images/flags/sq.png
--- a/gui/slick/images/flags/ua.png
+++ b/gui/slick/images/flags/ua.png
--- a/gui/slick/images/flags/uk.png
+++ b/gui/slick/images/flags/uk.png
--- a/gui/slick/interfaces/default/editShow.tmpl
+++ b/gui/slick/interfaces/default/editShow.tmpl
@ -28,6 +28,24 @@
 <script type="text/javascript" src="$sbRoot/js/qualityChooser.js?v=$sbPID"></script>
 <script type="text/javascript" src="$sbRoot/js/editShow.js?v=$sbPID"></script>
 <script type="text/javascript" src="$sbRoot/js/livepanel.js?v=$sbPID"></script>
 <script src="$sbRoot/js/lib/select2.full.min.js"></script>
 <link href="$sbRoot/css/lib/select2.css" rel="stylesheet">
 <style>
 .select2-container{height:32px; font-size:12px; margin-right:6px}
 .select2-container .select2-selection--single{height:30px}
 .select2-results__group{color: #eee; background-color: rgb(51,51,51)}
 .select2-results__options .select2-results__option{color: #222; background-color: #ddd}
 .select2-results__options .select2-results__option .ended{color: #888}
 .select2-container--default .select2-results > .select2-results__options{max-height: 300px}
 #select2-infosrc-lang-select-results .select2-results__option,
 #select2-infosrc-lang-select-results .select2-results__group{padding-top: 2px !important; padding-bottom:2px !important}
 #select2-infosrc-lang-select-results .select2-results__option--highlighted.select2-results__option--selectable .ended{color:white}
 #select2-infosrc-lang-select-results .select2-results__option--selected,
 #select2-infosrc-lang-select-results .select2-results__option--selected span{color:rgb(143, 21, 21) !important}
 #select2-infosrc-lang-select-results span.flag{width:100%; height:100%; display:block}
 </style>
 #if $varExists('header')
 	<h1 class="header"><span class="grey-text">Edit&nbsp;</span>$header</h1>
 #else
@ -244,10 +262,10 @@
 					</div>
 					<div class="field-pair">
-						<label for="infosrc-lang-select-edit">
+						<label for="infosrc-lang-select">
 							<span class="component-title">Info language</span>
 							<span class="component-desc">
-								<select name="tvinfo_lang" id="infosrc-lang-select-edit" class="form-control form-control-inline input-sm"></select>
+								<select name="tvinfo_lang" id="infosrc-lang-select" class="form-control form-control-inline input-sm"></select>
 								<span>fetch show information in this language</span>
 							</span>
 						</label>
--- a/gui/slick/interfaces/default/home_newShow.tmpl
+++ b/gui/slick/interfaces/default/home_newShow.tmpl
@ -35,6 +35,23 @@
 <script type="text/javascript" src="$sbRoot/js/qualityChooser.js?v=$sbPID"></script>
 <script type="text/javascript" src="$sbRoot/js/newShow.js?v=$sbPID"></script>
 <script type="text/javascript" src="$sbRoot/js/addShowOptions.js?v=$sbPID"></script>
 <script src="$sbRoot/js/lib/select2.full.min.js"></script>
 <link href="$sbRoot/css/lib/select2.css" rel="stylesheet">
 <style>
 .select2-container{height:32px; font-size:12px}
 .select2-container .select2-selection--single{height:30px}
 .select2-results__group{color: #eee; background-color: rgb(51,51,51)}
 .select2-results__options .select2-results__option{color: #222; background-color: #ddd}
 .select2-results__options .select2-results__option .ended{color: #888}
 .select2-container--default .select2-results > .select2-results__options{max-height: 300px}
 #select2-infosrc-lang-select-results .select2-results__option,
 #select2-infosrc-lang-select-results .select2-results__group{padding-top: 2px !important; padding-bottom:2px !important}
 #select2-infosrc-lang-select-results .select2-results__option--highlighted.select2-results__option--selectable .ended{color:white}
 #select2-infosrc-lang-select-results .select2-results__option--selected,
 #select2-infosrc-lang-select-results .select2-results__option--selected span{color:rgb(143, 21, 21) !important}
 #select2-infosrc-lang-select-results span.flag{width:100%; height:100%; display:block}
 </style>
 #if $varExists('header')
 	<h1 class="header">$header</h1>
--- a/gui/slick/js/editShow.js
+++ b/gui/slick/js/editShow.js
@ -16,31 +16,73 @@ $(document).ready(function () {
 		return ' class="flag" style="background-image:url(' + $.SickGear.Root + '/images/flags/' + lang + '.png)"'
 	}
-	$.getJSON($.SickGear.Root + '/add-shows/get-infosrc-languages', {}, function (data) {
+	function uriFlag(lang) {
-		var result = '', currentLangAdded = '', selected = ' selected="selected"';
+		return $.SickGear.Root + '/images/flags/' + lang + '.png'
 	}
-		if (!data.results.length) {
+	$.getJSON($.SickGear.Root + '/add-shows/get-infosrc-languages', {}, function (data) {
-			result = '<option value="' + config.showLang + '"' + selected + htmlFlag(config.showLang) + '>'
+		var htmlText = '', currentLangAdded = '',
 			selected = ' selected="selected"', htmlSelected = '',
 			elInfosrcLang = $('#infosrc-lang-select'),
 			useSelect2 = 0 < data.results_ext.length, populateItem;
 		if (!data.results.length && !data.results_ext.length) {
 			htmlText = '<option value="' + config.showLang + '"' + selected + htmlFlag(config.showLang) + '>'
 				+ config.showLang + '</option>';
 		} else {
 			currentLangAdded = !1;
-			$.each(data.results, function (index, strLang) {
+			if (useSelect2){
 				// 3 letter abbr object
 				$.each(data.results_ext, function (index, obj) {
-				var htmlSelected = '';
+					htmlSelected = '';
-				if (strLang === config.showLang) {
+					if (obj.std_abbr === config.showLang) {
-					currentLangAdded = !0;
+						currentLangAdded = !0;
-					htmlSelected = selected;
+						htmlSelected = selected;
-				}
+					}
-				result += '<option value="' + strLang + '"' + htmlSelected + htmlFlag(strLang) + '>'
+					htmlText += '<option style="padding-left:25px" value="' + obj.std_abbr + '"'
-					+ strLang + '</option>';
+								+ ' data-abbr="' + obj.abbr + '"'
-			});
+								+ ' data-img="' + uriFlag(obj.std_abbr) + '"'
 								+ ' data-title="' + obj.en + ' (' + obj.orig_abbr + '/' + obj.std_abbr + '/' + obj.abbr + ')' +  '"'
 								+ (!!htmlSelected
 									? htmlSelected + '>&gt; '
 									: '>')
 								+ obj.native
 								+ '</option>';
 				});
 			} else {
 				// legacy 2 letter abbr list
 				$.each(data.results, function (index, strLang) {
 					htmlSelected = '';
 					if (strLang === config.showLang) {
 						currentLangAdded = !0;
 						htmlSelected = selected;
 					}
 					htmlText += '<option value="' + strLang + '"' + htmlSelected + htmlFlag(strLang) + '>'
 						+ strLang + '</option>';
 				});
 			}
 			if (!currentLangAdded)
-				result += '<option value="' + config.showLang + '" ' + selected + '>' + config.showLang + '</option>';
+				htmlText += '<option value="' + config.showLang + '" ' + selected + '>' + config.showLang + '</option>';
 		}
-		$('#infosrc-lang-select-edit').html(result);
+		elInfosrcLang.html(htmlText);
 		if (useSelect2) {
 			populateItem = function (data) {
 				if (!!data.element)
 					return $('<span class="flag"'
 						+ ' style="background-image:url(' + $(data.element).data('img') + ')"'
 						+ ' title="' + $(data.element).data('title') + '">'
 						+ data.text
 						+ '</span>');
 				return data.text;
 			}
 			elInfosrcLang.select2({templateResult: populateItem, templateSelection: populateItem, width: 162});
 		}
 	});
 	function getExceptions() {
--- a/gui/slick/js/newShow.js
+++ b/gui/slick/js/newShow.js
@ -9,35 +9,70 @@ $(document).ready(function () {
 		return ' class="flag" style="background-image:url(' + $.SickGear.Root + '/images/flags/' + lang + '.png)"'
 	}
 	function uriFlag(lang) {
 		return $.SickGear.Root + '/images/flags/' + lang + '.png'
 	}
 	function populateLangSelect() {
 		if (!$('#nameToSearch').length)
 			return;
-		if (1 >= $('#infosrc-lang-select').find('option').length) {
+		if (1 >=  $('#infosrc-lang-select').find('option').length) {
 			$.getJSON(sbRoot + '/add-shows/get-infosrc-languages', {}, function (data) {
-				var resultStr = '', flag,
+				var htmlText = '', flag,
 					selected = ' selected="selected"',
-					elInfosrcLang = $('#infosrc-lang-select');
+					elInfosrcLang = $('#infosrc-lang-select'),
 					useSelect2 = 0 < data.results_ext.length, populateItem;
-				if (0 === data.results.length) {
+				if (0 === data.results.length && 0 === data.results_ext.length) {
-					resultStr = '<option value="en"' + selected + '>&gt; en</option>';
+					htmlText = '<option value="en"' + selected + '>&gt; en</option>';
 				} else {
-					$.each(data.results, function (index, obj) {
+					if (useSelect2) {
-						flag = htmlFlag(obj);
+						$('#nameToSearch').addClass('select2');
-						resultStr += '<option value="' + obj + '"'
+						// 3 letter abbr object
-							+ ('' === resultStr
+						$.each(data.results_ext, function (index, obj) {
-								? flag.replace('"flag', '"flag selected-text') + selected + '>&gt; '
+							htmlText += '<option style="padding-left:25px" value="' + obj.std_abbr + '"'
-								: flag + '>')
+								+ ' data-abbr="' + obj.abbr + '"'
-							+ obj + '</option>';
+								+ ' data-img="' + uriFlag(obj.std_abbr) + '"'
-					});
+								+ ' data-title="' + obj.en + ' (' + obj.orig_abbr + '/' + obj.std_abbr + '/' + obj.abbr + ')' +  '"'
 								+ ('' === htmlText
 									? selected + '>&gt; '
 									: '>')
 								+ obj.native
 								+ '</option>';
 						});
 					} else {
 						// legacy 2 letter abbr list
 						$.each(data.results, function (index, obj) {
 							flag = htmlFlag(obj);
 							htmlText += '<option value="' + obj + '"'
 								+ ('' === htmlText
 									? flag.replace('"flag', '"flag selected-text') + selected + '>&gt; '
 									: flag + '>')
 								+ obj + '</option>';
 						});
 					}
 				}
-				elInfosrcLang.html(resultStr);
+				elInfosrcLang.html(htmlText);
 				elInfosrcLang.change(function () {
 					searchIndexers();
 				});
 				if (useSelect2) {
 					populateItem = function(data) {
 						if (!!data.element)
 							return $('<span class="flag"'
 								+ ' style="background-image:url(' + $(data.element).data('img') + ')"'
 								+ ' title="' + $(data.element).data('title') + '">'
 								+ data.text
 								+ '</span>');
 						return data.text;
 					}
 					elInfosrcLang.select2({templateResult: populateItem, templateSelection: populateItem, width: 155});
 				}
 			});
 		}
 	}
--- a/lib/langcodes/init.py
+++ b/lib/langcodes/init.py
--- a/lib/langcodes/build_data.py
+++ b/lib/langcodes/build_data.py
@ -0,0 +1,242 @@
 import json
 import xml.etree.ElementTree as ET
 from langcodes.util import data_filename
 from langcodes.registry_parser import parse_registry
 def read_cldr_supplemental(dataname):
    """
    Load one CLDR supplemental JSON file and return its payload.

    `dataname` is the base name (without `.json`) of a file in the
    `cldr-core/supplemental` data directory. For 'aliases' the returned
    value is the `supplemental.metadata.alias` object; for any other name
    it is the `supplemental.<dataname>` object.
    """
    cldr_supp_path = data_filename('cldr-json/cldr-json/cldr-core/supplemental')
    filename = data_filename(f'{cldr_supp_path}/{dataname}.json')
    # Use a context manager so the file handle is closed deterministically
    # instead of being left open until garbage collection.
    with open(filename, encoding='utf-8') as infile:
        fulldata = json.load(infile)
    if dataname == 'aliases':
        return fulldata['supplemental']['metadata']['alias']
    return fulldata['supplemental'][dataname]
 def read_iana_registry_suppress_scripts():
    """
    Map each IANA 'language' subtag that declares a 'Suppress-Script' field
    to the value of that field.
    """
    return {
        record['Subtag']: record['Suppress-Script']
        for record in parse_registry()
        if record['Type'] == 'language' and 'Suppress-Script' in record
    }
 def read_iana_registry_scripts():
    """Return the set of subtags of every registry entry whose type is 'script'."""
    return {
        record['Subtag']
        for record in parse_registry()
        if record['Type'] == 'script'
    }
 def read_iana_registry_macrolanguages():
    """
    Map each IANA 'language' subtag that declares a 'Macrolanguage' field to
    the value of that field.
    """
    return {
        record['Subtag']: record['Macrolanguage']
        for record in parse_registry()
        if record['Type'] == 'language' and 'Macrolanguage' in record
    }
 def read_iana_registry_replacements():
    """
    Collect the 'Preferred-Value' replacements declared in the IANA registry.

    Language entries are keyed by their subtag; other entries that carry a
    'Tag' field are keyed by the lowercased whole tag.
    """
    preferred_values = {}
    for record in parse_registry():
        is_language = record['Type'] == 'language'
        if 'Preferred-Value' not in record:
            continue
        preferred = record['Preferred-Value']
        if is_language:
            # Replacement for a single language code
            preferred_values[record['Subtag']] = preferred
        elif 'Tag' in record:
            # Replacement for an entire tag
            preferred_values[record['Tag'].lower()] = preferred
    return preferred_values
 def write_python_dict(outfile, name, d):
    """
    Write `d` to `outfile` as a Python dict literal assigned to `name`, one
    `repr`-formatted entry per line, in sorted key order.
    """
    entry_lines = [f"    {key!r}: {d[key]!r}," for key in sorted(d)]
    literal = [f"{name} = {{", *entry_lines, "}"]
    print("\n".join(literal), file=outfile)
 def write_python_set(outfile, name, s):
    """
    Write the distinct items of `s` to `outfile` as a Python set literal
    assigned to `name`, one `repr`-formatted item per line, in sorted order.
    """
    item_lines = [f"    {item!r}," for item in sorted(set(s))]
    literal = [f"{name} = {{", *item_lines, "}"]
    print("\n".join(literal), file=outfile)
 # First line written to the generated data_dicts.py, marking it as machine-generated.
 GENERATED_HEADER = "# This file is generated by build_data.py."
 def read_validity_regex():
    """
    Build an anchored regular-expression string from the CLDR validity files.

    The `idValidity/id` elements of the language, region, script and variant
    validity files are read in that order. Each whitespace-separated item
    becomes one alternative. An item of the form `<prefix><x>~<y>` is a
    range and is turned into the character class `<prefix>[<x>-<y>]`.

    Returns a string of the form `^(alt1|alt2|...)$`.
    """
    validity_options = []
    for codetype in ('language', 'region', 'script', 'variant'):
        validity_path = data_filename(f'cldr/common/validity/{codetype}.xml')
        # Close the file promptly and decode it as UTF-8 rather than with
        # the platform's default encoding.
        with open(validity_path, encoding='utf-8') as infile:
            root = ET.fromstring(infile.read())
        for match in root.findall('./idValidity/id'):
            for item in match.text.strip().split():
                if '~' in item:
                    # Ranges are expected to span a single final character.
                    assert item[-2] == '~'
                    prefix = item[:-3]
                    range_start = item[-3]
                    range_end = item[-1]
                    validity_options.append(f"{prefix}[{range_start}-{range_end}]")
                else:
                    validity_options.append(item)
    options = '|'.join(validity_options)
    return f'^({options})$'
 def read_language_distances():
    """
    Read the "written_new" language-match table from CLDR's languageInfo.xml.

    Returns a nested dict `{desired: {supported: distance}}`. Only entries
    whose `desired` tag has fewer than three `_`-separated parts are kept.
    Each entry is stored in both directions unless it is marked
    `oneway="true"`.
    """
    language_info_path = data_filename('cldr/common/supplemental/languageInfo.xml')
    # Close the file promptly and decode it as UTF-8 rather than with the
    # platform's default encoding.
    with open(language_info_path, encoding='utf-8') as infile:
        root = ET.fromstring(infile.read())
    matches = root.findall(
        './languageMatching/languageMatches[@type="written_new"]/languageMatch'
    )
    tag_distances = {}
    for match in matches:
        attribs = match.attrib
        n_parts = attribs['desired'].count('_') + 1
        if n_parts < 3:
            if attribs.get('oneway') == 'true':
                pairs = [(attribs['desired'], attribs['supported'])]
            else:
                pairs = [
                    (attribs['desired'], attribs['supported']),
                    (attribs['supported'], attribs['desired']),
                ]
            for (desired, supported) in pairs:
                desired_distance = tag_distances.setdefault(desired, {})
                desired_distance[supported] = int(attribs['distance'])
                # The 'languageInfo' data file contains distances for the unnormalized
                # tag 'sh', but we work mostly with normalized tags, and they don't
                # describe at all how to cope with this.
                #
                # 'sh' normalizes to 'sr-Latn', and when we're matching languages we
                # aren't matching scripts yet, so when 'sh' appears we'll add a
                # corresponding match for 'sr'.
                #
                # Then because we're kind of making this plan up, add 1 to the distance
                # so it's a worse match than ones that are actually clearly defined
                # in languageInfo.
                if desired == 'sh' or supported == 'sh':
                    if desired == 'sh':
                        desired = 'sr'
                    if supported == 'sh':
                        supported = 'sr'
                    if desired != supported:
                        # don't try to define a non-zero distance for sr <=> sr
                        desired_distance = tag_distances.setdefault(desired, {})
                        desired_distance[supported] = int(attribs['distance']) + 1
    return tag_distances
 def build_data():
    """
    Regenerate `data_dicts.py` in the current working directory.

    Reads the IANA subtag registry and the CLDR alias, likely-subtag,
    language-distance and validity data, merges the alias tables, and
    writes every resulting table as a Python literal.
    """
    suppress_scripts = read_iana_registry_suppress_scripts()
    script_codes = read_iana_registry_scripts()
    macrolanguages = read_iana_registry_macrolanguages()
    iana_replacements = read_iana_registry_replacements()
    language_distances = read_language_distances()

    alias_data = read_cldr_supplemental('aliases')
    likely_subtags = read_cldr_supplemental('likelySubtags')

    # Aliased codes can still have alpha3 codes, and there's no unified source
    # about what they are. It depends on whether the alias predates or postdates
    # ISO 639-2, which nobody should have to care about. So all the alpha3
    # codes for aliased alpha2 codes are seeded here.
    alpha3_mapping = {
        'tl': 'tgl',  # even though it normalizes to 'fil'
        'in': 'ind',
        'iw': 'heb',
        'ji': 'yid',
        'jw': 'jav',
        'sh': 'hbs',
    }
    alpha3_biblio = {}
    norm_macrolanguages = {}

    # CLDR oversimplifies some codes as it assigns aliases. For example, 'nor'
    # is the ISO alpha3 code for 'no', but CLDR would prefer 'nb' over 'no',
    # so it makes 'nor' an alias of 'nb' -- which already has its own alpha3
    # code, 'nob'. These overrides undo that so the alpha2 <-> alpha3 mapping
    # stays canonical.
    canonical_alpha2 = {'nor': 'no', 'mol': 'mo', 'twi': 'tw', 'bih': 'bh'}

    replacements = {}
    for alias_type in ('languageAlias', 'scriptAlias', 'territoryAlias'):
        aliases = alias_data[alias_type]
        if alias_type == 'languageAlias':
            # Language aliases start from the replacements in the IANA file
            table = iana_replacements
            table['root'] = 'und'
        else:
            table = {}
        replacements[alias_type] = table

        for raw_code, info in aliases.items():
            # Keys are lowercased so they can be looked up case-insensitively
            code = raw_code.lower()

            # When several replacements are listed, only the first is used.
            # For example, the Soviet Union (SU) is simply replaced with
            # Russia (RU) rather than picking a successor country by context.
            replacement = info['_replacement'].split()[0]
            reason = info['_reason']
            if reason == 'macrolanguage':
                norm_macrolanguages[code] = replacement
                continue

            replacement = canonical_alpha2.get(code, replacement)
            table[code] = replacement
            if alias_type != 'languageAlias':
                continue

            if reason == 'overlong':
                if replacement in alpha3_mapping:
                    raise ValueError(
                        "{code!r} is an alpha3 for {replacement!r}, which"
                        " already has an alpha3: {orig!r}".format(
                            code=code,
                            replacement=replacement,
                            orig=alpha3_mapping[replacement],
                        )
                    )
                alpha3_mapping[replacement] = code
            elif reason == 'bibliographic':
                alpha3_biblio[replacement] = code

    validity_regex = read_validity_regex()

    # Write the contents of data_dicts.py.
    with open('data_dicts.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        print("import re\n", file=outfile)
        sections = (
            (write_python_dict, 'DEFAULT_SCRIPTS', suppress_scripts),
            (write_python_dict, 'LANGUAGE_REPLACEMENTS', replacements['languageAlias']),
            (write_python_dict, 'LANGUAGE_ALPHA3', alpha3_mapping),
            (write_python_dict, 'LANGUAGE_ALPHA3_BIBLIOGRAPHIC', alpha3_biblio),
            (write_python_dict, 'SCRIPT_REPLACEMENTS', replacements['scriptAlias']),
            (write_python_set, 'ALL_SCRIPTS', script_codes),
            (write_python_dict, 'TERRITORY_REPLACEMENTS', replacements['territoryAlias']),
            (write_python_dict, 'MACROLANGUAGES', macrolanguages),
            (write_python_dict, 'NORMALIZED_MACROLANGUAGES', norm_macrolanguages),
            (write_python_dict, 'LIKELY_SUBTAGS', likely_subtags),
            (write_python_dict, 'LANGUAGE_DISTANCES', language_distances),
        )
        for writer, constant_name, payload in sections:
            writer(outfile, constant_name, payload)
        print(f"VALIDITY = re.compile({validity_regex!r})", file=outfile)
 if __name__ == '__main__':
    build_data()
--- a/lib/langcodes/data/language-subtag-registry.txt
+++ b/lib/langcodes/data/language-subtag-registry.txt
--- a/lib/langcodes/data_dicts.py
+++ b/lib/langcodes/data_dicts.py
--- a/lib/langcodes/language_distance.py
+++ b/lib/langcodes/language_distance.py
@ -0,0 +1,188 @@
 from .data_dicts import LANGUAGE_DISTANCES
 from typing import Dict, Tuple
 # A (language, script, territory) triple of subtags.
 TagTriple = Tuple[str, str, str]
 # Memo for tuple_distance_cached(), keyed by the (desired, supported) pair.
 _DISTANCE_CACHE: Dict[Tuple[TagTriple, TagTriple], int] = {}
 # Fallbacks used by _tuple_distance() when LANGUAGE_DISTANCES has no entry
 # for a given language pair or language_script pair.
 DEFAULT_LANGUAGE_DISTANCE = LANGUAGE_DISTANCES["*"]["*"]
 DEFAULT_SCRIPT_DISTANCE = LANGUAGE_DISTANCES["*_*"]["*_*"]
 # Territory distance used by _tuple_distance() when territories differ and
 # none of its special-case rules applies.
 DEFAULT_TERRITORY_DISTANCE = 4
 # Territory clusters used in territory matching:
 # Maghreb (the western Arab world)
 MAGHREB = {"MA", "DZ", "TN", "LY", "MR", "EH"}
 # United States and its territories
 US = {"AS", "GU", "MH", "MP", "PR", "UM", "US", "VI"}
 # Special Autonomous Regions of China
 CNSAR = {"HK", "MO"}
 # Latin America: region code 419 together with the Central and South
 # American region and territory codes listed below.
 LATIN_AMERICA = {
    "419",
    # Central America
    "013",
    "BZ",
    "CR",
    "SV",
    "GT",
    "HN",
    "MX",
    "NI",
    "PA",
    # South America
    "005",
    "AR",
    "BO",
    "BR",
    "CL",
    "CO",
    "EC",
    "FK",
    "GF",
    "GY",
    "PY",
    "PE",
    "SR",
    "UY",
    "VE",
 }
 # North and South America
 AMERICAS = {
    "019",
    # Caribbean
    "029",
    "AI",
    "AG",
    "AW",
    "BS",
    "BB",
    "VG",
    "BQ",
    "KY",
    "CU",
    "CW",
    "DM",
    "DO",
    "GD",
    "GP",
    "HT",
    "JM",
    "MQ",
    "MS",
    "PR",
    "SX",
    "BL",
    "KN",
    "LC",
    "MF",
    "VC",
    "TT",
    "TC",
    "VI",
    # Northern America
    "021",
    "BM",
    "CA",
    "GL",
    "PM",
    "US",
    # North America as a whole
    "003",
 } | LATIN_AMERICA
 def tuple_distance_cached(desired: TagTriple, supported: TagTriple) -> int:
    """
    Return the language-matching distance between two (language, script,
    territory) triples, such as those obtained by 'maximizing' a language tag.

    Identical triples yield 0 immediately. Any other pair is computed once
    by `_tuple_distance` and then served from `_DISTANCE_CACHE`. The result
    is described upstream as a number from 0 to 135.
    """
    if desired == supported:
        return 0

    cache_key = (desired, supported)
    distance = _DISTANCE_CACHE.get(cache_key)
    if distance is None:
        # First time this pair is seen: compute and remember it.
        distance = _tuple_distance(desired, supported)
        _DISTANCE_CACHE[cache_key] = distance
    return distance
 def _get2(dictionary: dict, key1: str, key2: str, default):
    return dictionary.get(key1, {}).get(key2, default)
 def _tuple_distance(desired: TagTriple, supported: TagTriple) -> int:
    """
    Compute the uncached distance between two (language, script, territory)
    triples.

    The result is the sum of a language, a script and a territory
    component; a component is only added when that part of the triples
    differs.
    """
    want_lang, want_script, want_region = desired
    have_lang, have_script, have_region = supported

    total = 0
    if want_lang != have_lang:
        total += _get2(
            LANGUAGE_DISTANCES, want_lang, have_lang, DEFAULT_LANGUAGE_DISTANCE
        )

    want_pair = f"{want_lang}_{want_script}"
    have_pair = f"{have_lang}_{have_script}"
    if want_script != have_script:
        # Scripts only match other scripts when paired with a language.
        # Someone who reads 'Latn' cannot be assumed to read 'Cyrl', but
        # someone who reads 'sr-Latn' can plausibly read 'sr-Cyrl', because
        # Serbian is written in both scripts.
        total += _get2(
            LANGUAGE_DISTANCES, want_pair, have_pair, DEFAULT_SCRIPT_DISTANCE
        )

    if want_region == have_region:
        return total

    def straddles(cluster):
        # True when exactly one of the two territories lies in `cluster`.
        return (want_region in cluster) != (have_region in cluster)

    # The general territory-matching rules are too irregular to implement
    # efficiently, so the rules of CLDR 36.1 are reimplemented in code here
    # rather than every rule the XML could define.
    penalty = DEFAULT_TERRITORY_DISTANCE
    if want_pair == have_pair:
        if want_lang == "ar":
            if straddles(MAGHREB):
                penalty = 5
        elif want_lang == "en":
            if (want_region == "GB" and have_region not in US) or (
                want_region not in US and have_region == "GB"
            ):
                penalty = 3
            elif straddles(US):
                penalty = 5
        # Not spelled out in CLDR, but implied by territory containment in
        # other standards: numeric territory values such as '003' are broad
        # regions that contain more specific territories.
        #
        # 419 is the numeric value seen most often in language codes,
        # notably 'es-419' for Latin American Spanish. A code differing only
        # by a more specific territory, like 'es-PY', should be closer to a
        # supported 'es-419' than anything with a territory difference.
        #
        # Handling just 419 avoids having to track territory containment in
        # the general case.
        elif want_region in LATIN_AMERICA and have_region == "419":
            penalty = 1
        elif want_lang in ("es", "pt"):
            if straddles(AMERICAS):
                penalty = 5
        elif want_pair == "zh_Hant":
            if straddles(CNSAR):
                penalty = 5

    return total + penalty
--- a/lib/langcodes/language_lists.py
+++ b/lib/langcodes/language_lists.py
@ -0,0 +1,517 @@
 # This is the list of language codes with the 'modern' level of support in CLDR
 # (compared to 'full', which contains many more languages). We use this as the
 # list of languages that we store specific name-to-code mappings for.
 CLDR_LANGUAGES = {
    'af',
    'am',
    'ar',
    'az',
    'be',
    'bg',
    'bn',
    'bs',
    'ca',
    'cs',
    'cy',
    'da',
    'de',
    'el',
    'en',
    'es',
    'et',
    'eu',
    'fa',
    'fi',
    'fil',
    'fo',
    'fr',
    'ga',
    'gl',
    'gu',
    'he',
    'hi',
    'hr',
    'hu',
    'hy',
    'id',
    'is',
    'it',
    'ja',
    'ka',
    'kk',
    'km',
    'kn',
    'ko',
    'ky',
    'lo',
    'lt',
    'lv',
    'mk',
    'ml',
    'mn',
    'mr',
    'ms',
    'my',
    'nb',
    'ne',
    'nl',
    'pa',
    'pl',
    'pt',
    'ro',
    'ru',
    'si',
    'sk',
    'sl',
    'sq',
    'sr',
    'sv',
    'sw',
    'ta',
    'te',
    'th',
    'ti',
    'to',
    'tr',
    'uk',
    'und',
    'ur',
    'uz',
    'vi',
    'yue',
    'zh',
    'zu',
 }
 # These are the names of the languages that have the most entries on the English and
 # German Wiktionaries. Wiktionary only consistently identifies languages by their
 # name, making it important to be able to recognize the names.
 #
 # These lists of names are used in `tests/test_wikt_languages.py`.
 WIKT_LANGUAGE_NAMES = {}
 # Language names written in English, registered under the 'en' key.
 # Lines left as comments are names that are not part of the list; a few of
 # them carry a short reason (e.g. "same as Norman").
 # NOTE(review): "WIKT" presumably refers to Wiktionary -- confirm against the
 # code that consumes WIKT_LANGUAGE_NAMES.
 WIKT_LANGUAGE_NAMES['en'] = [
    "Spanish",
    "French",
    "Latvian",
    "Latin",
    "English",
    "Mandarin",
    "Italian",
    "Portuguese",
    "Cantonese",
    "Japanese",
    "German",
    "Swedish",
    "Korean",
    "Serbo-Croatian",
    "Serbian",
    "Croatian",
    "Bosnian",
    "Finnish",
    "Vietnamese",
    "Dutch",
    "Galician",
    "Catalan",
    "Polish",
    "Danish",
    "Norwegian Nynorsk",
    "Turkish",
    "Romanian",
    "Lithuanian",
    "Ido",
    "Old French",
    "Czech",
    "Norwegian",
    # Jèrriais -- same as Norman
    "Esperanto",
    "Icelandic",
    # Old Armenian
    "Norwegian Bokmål",
    "Asturian",
    "Hungarian",
    "Proto-Germanic",
    "Russian",
    "Slovene",
    "Min Nan",
    "Scottish Gaelic",
    "Greek",
    "Irish",
    "Lojban",
    "Middle French",
    "Malay",
    "Luxembourgish",
    "Slovak",
    "Estonian",
    "Persian",
    "Venetian",
    "Old English",
    "Volapük",
    "Ladin",
    "Faroese",
    "Scots",
    "Interlingua",
    "Romansch",
    "Urdu",
    # Middle Chinese
    "Indonesian",
    "Swahili",
    "Middle English",
    "Occitan",
    "Welsh",
    "Old Norse",
    "Albanian",
    "Old Irish",
    "Old Saxon",
    "Lower Sorbian",
    "Afrikaans",
    "Ukrainian",
    "Proto-Slavic",
    "Ancient Greek",
    "Gothic",
    "Hawaiian",
    "Kurdish",
    "Tagalog",
    "Old High German",
    "Crimean Tatar",
    "Manx",
    "Sanskrit",
    "Hiligaynon",
    "West Frisian",
    "Hebrew",
    "Tok Pisin",
    "Proto-Indo-European",
    "Macedonian",
    "Novial",
    "Armenian",
    "Arabic",
    "Maltese",
    "Hakka",
    "Sicilian",
    "Ladino",
    "Basque",
    "Breton",
    # Guernésiais -- same as Norman
    "Vai",
    "Navajo",
    "Azeri",
    "Vilamovian",
    # Tarantino
    "Maori",
    "Friulian",
    "Hausa",
    "Haitian Creole",
    "Yiddish",
    "Tatar",
    "Proto-Malayo-Polynesian",
    "Aromanian",
    "Ottoman Turkish",
    "Old Provençal",
    "Northern Sami",
    "Dalmatian",
    "Bulgarian",
    "Neapolitan",
    "Cornish",
    "Middle Dutch",
    "Rapa Nui",
    # Old Portuguese
    "Egyptian Arabic",
    "Romani",
    "Tahitian",
    "Thai",
    "Limburgish",
    "Karelian",
    "Tajik",
    "Turkmen",
    "Kabardian",
    "Uzbek",
    "Samoan",
    "Mongolian",
    "Zulu",
    "Upper Sorbian",
    "Walloon",
    # Proto-Finnic
    "Frankish",
    "Mapudungun",
    "Pashto",
    "Low German",
    "Bashkir",
    "Kashubian",
    "Sranan Tongo",
    "Proto-Sino-Tibetan",
    "Norman",
    "Proto-Austronesian",
    "Marathi",
    "Rohingya",
    "Classical Nahuatl",
    # Proto-Malayic
    # German Low German
    "Fijian",
    "Zazaki",
    "Proto-Italic",
    "Old Dutch",
    "Egyptian",
    "Old Frisian",
    "Greenlandic",
    "Burmese",
    "Votic",
    "Ewe",
    "Cherokee",
    "Old Church Slavonic",
    "Quechua",
    "Mirandese",
    "Livonian",
    "Bengali",
    "Skolt Sami",
    # Proto-Balto-Slavic
    "Pitjantjatjara",
    "Georgian",
    "North Frisian",
    "Tetum",
    "Tongan",
    # Mauritian Creole
    "Torres Strait Creole",
    "Papiamentu",
    "Lao",
    "Malagasy",
    "Interlingue",
    "Aragonese",
    "Istriot",
    "Sumerian",
    "Proto-Celtic",
    "Võro",
    # Proto-Polynesian
    "Nepali",
    "Chickasaw",
    "Akkadian",
    "Middle Armenian",
    "Cimbrian",
    "Somali",
    "Sardinian",
    "Tocharian B",
    "Telugu",
    "Javanese",
    "Taos",
    "Proto-Semitic",
    # Old Prussian
    "Kyrgyz",
    "Corsican",
    "Veps",
    "Baluchi",
    "Middle Low German",
    "Middle High German",
    "Uyghur",
    # Dutch Low Saxon
    "Belarusian",
    "Guaraní",
    "Undetermined",
    "Inuktitut",
    "Tocharian A",
    "Nigerian Pidgin",
    # Gallo
    # Saterland Frisian
    "Punjabi",
    "Proto-Algonquian",
    # Istro-Romanian
    "Wiradhuri",
    "Sichuan Yi",
    "Wu",
    # White Hmong
    "Ugaritic",
    "Sundanese",
    # Old East Slavic
    # Fala
    # Elfdalian
    "Tamil",
    "Pijin",
    "Okinawan",
    "Kazakh",
    "Hindi",
    "Tuvan",
    "Polabian",
    "Aramaic",
    "Malayalam",
    "Kumyk",
    "Inari Sami",
    "Ilocano",
    "Tswana",
    "Libyan Arabic",
    "Latgalian",
    "Yakut",
    "Sindhi",
    "Khmer",
    "Gamilaraay",
    "Ojibwe",
    "Choctaw",
    "Chinese",
    "Chamorro",
    "Yucatec Maya",
    "Picard",
    "Ngarrindjeri",
    "Kott",
    "Ingrian",
    # Crimean Gothic
    "Chamicuro",
    "Rajasthani",
    # Old Tupi
    "Old Spanish",
    "Gagauz",
    "Extremaduran",
    "Chinook Jargon",
    "Cahuilla",
    "Kannada",
    "Iban",
    "American Sign Language",
    "Adyghe",
    "Warlpiri",
    "Tibetan",
    "Ossetian",
    "Meriam",
    "Marshallese",
    "Khakas",
    "Balinese",
    "Zhuang",
    "Tuvaluan",
    "Niuean",
    "Martuthunira",
    "Guugu Yimidhirr",
    "Chechen",
    "Campidanese Sardinian",
    "Tolai",
    # Old Javanese
    "Nahuatl",
    "Lombard",
    "West Coast Bajau",
    "Romagnol",
    "Middle Irish",
    "Yoruba",
    "Wangaaybuwan-Ngiyambaa",
    # Old Swedish
    "Lingala",
    "Fiji Hindi",
    "Shabo",
    "Sasak",
    "Judeo-Arabic",
    "Central Kurdish",
    "Bislama",
 ]
 # Language names written in German, registered under the 'de' key.
 # Quoted entries that are commented out (e.g. "Frühneuhochdeutsch") are not
 # part of the list.
 WIKT_LANGUAGE_NAMES['de'] = [
    "Deutsch",
    "Englisch",
    "Polnisch",
    "Italienisch",
    "Französisch",
    "Esperanto",
    "Schwedisch",
    "Lateinisch",
    "Tschechisch",
    "Katalanisch",
    "Spanisch",
    "Okzitanisch",
    "Ungarisch",
    "Litauisch",
    "Finnisch",
    "Russisch",
    "Altgriechisch",
    "Niederländisch",
    "Kurdisch",
    "Baskisch",
    "Armenisch",
    "Isländisch",
    "Bulgarisch",
    "Färöisch",
    "Dänisch",
    "Portugiesisch",
    "Slowakisch",
    "Türkisch",
    "Maori",
    "Albanisch",
    "Japanisch",
    "Norwegisch",
    "Irisch",
    "Koreanisch",
    "Chinesisch",
    "Venezianisch",
    "Friaulisch",
    "Serbisch",
    "Indonesisch",
    "Walisisch",
    "Arabisch",
    "Zentral-Nahuatl",
    "Neugriechisch",
    "Sumerisch",
    "Obersorbisch",
    "Sesotho",
    "Rumänisch",
    "Suaheli",
    "Persisch",
    "Krimtatarisch",
    "Plattdeutsch",
    "Prußisch",
    "Thai",
    "Bosnisch",
    "Sardisch",
    "Maltesisch",
    "Akkadisch",
    "Hawaiianisch",
    "Hebräisch",
    "Gotisch",
    "Afrikaans",
    "Rätoromanisch",
    "Tamil",
    "Bretonisch",
    "Ukrainisch",
    "Hindi",
    "Georgisch",
    "Panjabi",
    "Papiamentu",
    "Slowenisch",
    "Nauruisch",
    "Schottisch-Gälisch",
    "Balinesisch",
    "Estnisch",
    "Manx",
    "Korsisch",
    # "Frühneuhochdeutsch",
    "Lettisch",
    "isiZulu",
    "Tagalog",
    "Tok Pisin",
    # "Südpikenisch",
    "Kroatisch",
    "Niedersorbisch",
    "Kannada",
    "Guanche",
    "Weißrussisch",
    "Sanskrit",
    "Aserbaidschanisch",
    "Mittelhochdeutsch",
    "Laotisch",
    "Altnordisch",
    "Altenglisch",
    "Vietnamesisch",
    "Tadschikisch",
    "Samoanisch",
    "Mazedonisch",
    "Luxemburgisch",
    "Hethitisch",
    # "Yukatekisch",
    "Kaschubisch",
    "Wallonisch",
    # "Klassisches Nahuatl",
    "Telugu",
    "Rapanui",
    "Jiddisch",
    "Ido",
    # "Galicisch",
    "Volapük",
    "Bengalisch",
    "Mapudungun",
    "Lojban",
    "Tuvaluisch",
    "Gujarati",
    "Assamesisch",
 ]
--- a/lib/langcodes/registry_parser.py
+++ b/lib/langcodes/registry_parser.py
@ -0,0 +1,59 @@
 from langcodes.util import data_filename
 # Registry field names that may appear more than once in a single entry;
 # parse_item gathers the values of these keys into lists.
 LIST_KEYS = {'Description', 'Prefix'}
 def parse_file(file):
    """
    Read an open IANA subtag registry file and yield one dictionary per
    subtag entry, as produced by parse_item.
    `file` is any iterable of text lines. Entries are separated by lines
    consisting of exactly '%%'; a line starting with two spaces continues
    the previous line.
    """
    pending = []
    for raw in file:
        text = raw.rstrip('\n')
        if text == '%%':
            # End of an entry: hand over what has been gathered so far and
            # start collecting the next entry from scratch.
            yield from parse_item(pending)
            pending = []
            continue
        if text[:2] == '  ':
            # Wrapped line: glue it onto the previous one, keeping exactly
            # one of its leading spaces as the word separator.
            pending[-1] += text[1:]
        else:
            pending.append(text)
    # The last entry is not followed by a separator.
    yield from parse_item(pending)
 def parse_item(lines):
    """
    Turn the 'Key: value' lines of one registry entry (wrapped lines already
    joined) into a dictionary.
    Keys listed in LIST_KEYS map to a list of every value seen for them; any
    other key maps to its single value and must not be repeated.
    This is a generator: it yields the dictionary once when the entry has a
    'Subtag' or 'Tag' field, and yields nothing otherwise (for example, for
    the registry header).
    """
    record = {}
    for entry in lines:
        field, content = entry.split(': ', 1)
        if field not in LIST_KEYS:
            assert field not in record
            record[field] = content
            continue
        if field not in record:
            record[field] = []
        record[field].append(content)
    if not {'Subtag', 'Tag'}.isdisjoint(record):
        yield record
 def parse_registry():
    """
    Yield one dictionary per entry of the bundled IANA subtag registry file
    ('language-subtag-registry.txt', located via data_filename).
    """
    registry_path = data_filename('language-subtag-registry.txt')
    # Delegate with 'yield from' rather than returning the inner generator,
    # so the file stays open until the caller has consumed every entry.
    with open(registry_path, encoding='utf-8') as registry:
        yield from parse_file(registry)
--- a/lib/langcodes/tag_parser.py
+++ b/lib/langcodes/tag_parser.py
@ -0,0 +1,422 @@
 """
 This module implements a parser for language tags, according to the RFC 5646
 (BCP 47) standard.
 Here, we're only concerned with the syntax of the language tag. Looking up
 what they actually mean in a data file is a separate step.
 For a full description of the syntax of a language tag, see page 3 of
    http://tools.ietf.org/html/bcp47
 >>> parse_tag('en')
 [('language', 'en')]
 >>> parse_tag('en_US')
 [('language', 'en'), ('territory', 'US')]
 >>> parse_tag('en-Latn')
 [('language', 'en'), ('script', 'Latn')]
 >>> parse_tag('es-419')
 [('language', 'es'), ('territory', '419')]
 >>> parse_tag('zh-hant-tw')
 [('language', 'zh'), ('script', 'Hant'), ('territory', 'TW')]
 >>> parse_tag('zh-tw-hant')
 Traceback (most recent call last):
    ...
 langcodes.tag_parser.LanguageTagError: This script subtag, 'hant', is out of place. Expected variant, extension, or end of string.
 >>> parse_tag('de-DE-1901')
 [('language', 'de'), ('territory', 'DE'), ('variant', '1901')]
 >>> parse_tag('ja-latn-hepburn')
 [('language', 'ja'), ('script', 'Latn'), ('variant', 'hepburn')]
 >>> parse_tag('ja-hepburn-latn')
 Traceback (most recent call last):
    ...
 langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string.
 >>> parse_tag('zh-yue')
 [('language', 'zh'), ('extlang', 'yue')]
 >>> parse_tag('zh-yue-Hant')
 [('language', 'zh'), ('extlang', 'yue'), ('script', 'Hant')]
 >>> parse_tag('zh-min-nan')
 [('grandfathered', 'zh-min-nan')]
 >>> parse_tag('x-dothraki')
 [('language', 'x-dothraki')]
 >>> parse_tag('en-u-co-backward-x-pig-latin')
 [('language', 'en'), ('extension', 'u-co-backward'), ('private', 'x-pig-latin')]
 >>> parse_tag('en-x-pig-latin-u-co-backward')
 [('language', 'en'), ('private', 'x-pig-latin-u-co-backward')]
 >>> parse_tag('u-co-backward')
 Traceback (most recent call last):
    ...
 langcodes.tag_parser.LanguageTagError: Expected a language code, got 'u'
 >>> parse_tag('x-')
 Traceback (most recent call last):
    ...
 langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got ''
 >>> parse_tag('und-u-')
 Traceback (most recent call last):
    ...
 langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got ''
 >>> parse_tag('und-0-foo')
 [('language', 'und'), ('extension', '0-foo')]
 >>> parse_tag('und-?-foo')
 Traceback (most recent call last):
    ...
 langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '?'
 >>> parse_tag('und-x-123456789')
 Traceback (most recent call last):
    ...
 langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '123456789'
 >>> parse_tag('en-a-b-foo')
 Traceback (most recent call last):
    ...
 langcodes.tag_parser.LanguageTagError: Tag extensions may not contain two singletons in a row
 >>> parse_tag('ar-٠٠١')
 Traceback (most recent call last):
    ...
 langcodes.tag_parser.LanguageTagError: Language tags must be made of ASCII characters
 """
 # These tags should not be parsed by the usual parser; they're grandfathered
 # in from RFC 3066. The 'irregular' ones don't fit the syntax at all; the
 # 'regular' ones do, but would give meaningless results when parsed.
 #
 # These are all lowercased so they can be matched case-insensitively, as the
 # standard requires.
 #
 # parse_tag tests membership after normalize_characters has been applied, and
 # returns a single ('grandfathered', tag) pair for any tag found here.
 EXCEPTIONS = {
    # Irregular exceptions
    "en-gb-oed",
    "i-ami",
    "i-bnn",
    "i-default",
    "i-enochian",
    "i-hak",
    "i-klingon",
    "i-lux",
    "i-mingo",
    "i-navajo",
    "i-pwn",
    "i-tao",
    "i-tay",
    "i-tsu",
    "sgn-be-fr",
    "sgn-be-nl",
    "sgn-ch-de",
    # Regular exceptions
    "art-lojban",
    "cel-gaulish",
    "no-bok",
    "no-nyn",
    "zh-guoyu",
    "zh-hakka",
    "zh-min",
    "zh-min-nan",
    "zh-xiang",
 }
 # Define the order of subtags as integer constants, but also give them names
 # so we can describe them in error messages
 #
 # Each constant is the index of its own name in SUBTAG_TYPES. The list has
 # one extra trailing entry, 'end of string', which has no constant: it only
 # shows up when order_error lists what could legally have come next.
 EXTLANG, SCRIPT, TERRITORY, VARIANT, EXTENSION = range(5)
 SUBTAG_TYPES = [
    'extlang',
    'script',
    'territory',
    'variant',
    'extension',
    'end of string',
 ]
 def _is_ascii(s):
    """
    Determine whether a tag consists of ASCII characters.
    """
    # When Python 3.6 support is dropped, we can replace this with str.isascii().
    try:
        s.encode('ascii')
        return True
    except UnicodeEncodeError:
        return False
 def normalize_characters(tag):
    """
    Bring a tag into one canonical spelling so that tags can be compared
    exactly: BCP 47 ignores case, and CLDR treats underscores like hyphens,
    so the result is all lowercase with hyphens as the only separator.
    >>> normalize_characters('en_US')
    'en-us'
    >>> normalize_characters('zh-Hant_TW')
    'zh-hant-tw'
    """
    hyphenated = tag.replace('_', '-')
    return hyphenated.lower()
 def parse_tag(tag):
    """
    Split a language tag into (type, value) pairs based on syntax alone;
    nothing is looked up in the registry at this stage.
    Returns a list of tuples such as [('language', 'en'), ('territory', 'US')].
    Raises LanguageTagError when the tag contains non-ASCII characters, when
    a subtag is not 1-8 alphanumeric characters, or when the subtags do not
    form a valid sequence.
    """
    if not _is_ascii(tag):
        raise LanguageTagError("Language tags must be made of ASCII characters")
    tag = normalize_characters(tag)
    # Grandfathered tags are reported whole instead of being taken apart.
    if tag in EXCEPTIONS:
        return [('grandfathered', tag)]
    subtags = tag.split('-')
    # Every subtag must have the basic shape: 1-8 alphanumeric characters.
    for subtag in subtags:
        well_formed = 1 <= len(subtag) <= 8 and subtag.isalnum()
        if not well_formed:
            raise LanguageTagError(
                f"Expected 1-8 alphanumeric characters, got {subtag!r}"
            )
    # The first subtag is either the language code or 'x', which marks the
    # whole tag as private use. Later subtags are told apart by length and
    # format; the language code is recognised purely by coming first.
    primary = subtags[0]
    rest = subtags[1:]
    if primary == 'x':
        if not rest:
            raise LanguageTagError("'x' is not a language tag on its own")
        # Whatever a private-use tag means, it occupies the "language" slot.
        return [('language', tag)]
    if len(primary) not in (2, 3, 4):
        return subtag_error(primary, 'a language code')
    # Language codes should be 2 or 3 letters; 4-letter codes are let through
    # for legacy Unicode reasons.
    return [('language', primary)] + parse_subtags(rest)
 def parse_subtags(subtags, expect=EXTLANG):
    """
    Parse everything that follows the language code: extlangs, script,
    territory, variants and extensions.
    `subtags` is the list of remaining subtags. `expect` is the earliest
    subtag type (one of the integer constants indexing SUBTAG_TYPES) that is
    still allowed at this position; anything later is allowed too, because
    every part after the language is optional.
    Returns a list of (type name, value) tuples. Raises LanguageTagError,
    through subtag_error or order_error, when a subtag has no recognisable
    type or appears too late in the order.
    The function handles one subtag per call and recurses on the remainder.
    """
    # Base case of the recursion: nothing left to parse.
    if not subtags:
        return []
    subtag = subtags[0]
    size = len(subtag)
    # Two kinds of subtag are handed over to dedicated parsers right away.
    if size == 1:
        # A singleton opens an extension, which has sub-subtags of its own.
        # No order check is needed here: extensions necessarily come last.
        if not subtag.isalnum():
            return subtag_error(subtag)
        return parse_extension(subtags)
    if size == 3 and subtag.isalpha():
        # Three letters make an 'extended language'. Up to three of them may
        # appear in a row, which parse_extlang enforces -- but only if we
        # are still at the position where extlangs are allowed.
        if expect > EXTLANG:
            return order_error(subtag, EXTLANG, expect)
        return parse_extlang(subtags)
    # Otherwise work out the subtag type from its length and character
    # classes, as an integer so that it can be compared against 'expect'.
    if size == 2:
        # Two letters: a territory, the only two-character subtag allowed
        # after the language.
        tagtype = TERRITORY if subtag.isalpha() else None
    elif size == 3:
        # Three digits: a territory standing for a broad region, such as
        # Latin America (419).
        tagtype = TERRITORY if subtag.isdigit() else None
    elif size == 4:
        if subtag.isalpha():
            # Four letters: a script.
            tagtype = SCRIPT
        elif subtag[0].isdigit():
            # Four characters starting with a digit: a variant.
            tagtype = VARIANT
        else:
            tagtype = None
    else:
        # Lengths 5-8 are variants.
        tagtype = VARIANT
    if tagtype is None:
        # The subtag does not fit any type the standard defines.
        return subtag_error(subtag)
    if tagtype < expect:
        # This type was supposed to appear earlier in the tag.
        return order_error(subtag, tagtype, expect)
    # At most one script and one territory are allowed, so after one of
    # those the next subtag must be of a strictly later type. Other types
    # may repeat.
    next_expect = tagtype + 1 if tagtype in (SCRIPT, TERRITORY) else tagtype
    typename = SUBTAG_TYPES[tagtype]
    # Apply the conventional capitalisation: 'Latn' for scripts, 'US' for
    # territories.
    if tagtype == SCRIPT:
        subtag = subtag.title()
    elif tagtype == TERRITORY:
        subtag = subtag.upper()
    return [(typename, subtag)] + parse_subtags(subtags[1:], next_expect)
 def parse_extlang(subtags):
    """
    Parse an 'extended language' tag, which consists of 1 to 3 three-letter
    language codes.
    Extended languages are used for distinguishing dialects/sublanguages
    (depending on your view) of macrolanguages such as Arabic, Bahasa Malay,
    and Chinese.
    It's supposed to also be acceptable to just use the sublanguage as the
    primary language code, and your code should know what's a macrolanguage of
    what. For example, 'zh-yue' and 'yue' are the same language (Cantonese),
    and differ only in whether they explicitly spell out that Cantonese is a
    kind of Chinese.
    `subtags` is the list of remaining subtags, starting at the first
    extlang. Returns ('extlang', code) tuples for the leading extlangs,
    followed by whatever parse_subtags makes of the rest.
    """
    index = 0
    parsed = []
    # An extlang is exactly three *letters*. Checking the length alone would
    # also swallow a three-digit region code that follows an extlang (as in
    # 'zh-yue-419') and report it as a language instead of a territory.
    while (
        index < 3
        and index < len(subtags)
        and len(subtags[index]) == 3
        and subtags[index].isalpha()
    ):
        parsed.append(('extlang', subtags[index]))
        index += 1
    # Whatever follows the extlangs has to be a script or something later.
    return parsed + parse_subtags(subtags[index:], SCRIPT)
 def parse_extension(subtags):
    """
    Parse an extension: a 'singleton' (one-character subtag) followed by
    further subtags. Extensions are part of the BCP 47 syntax, but what they
    mean is outside the scope of the standard.
    When the singleton is 'x', the extension is private use and swallows the
    rest of the tag. Any other extension ends just before the next singleton.
    `subtags` is the list of remaining subtags, starting at the singleton.
    Returns a list of (type name, value) tuples. Raises LanguageTagError when
    nothing follows the singleton, or when another singleton follows it
    directly.
    """
    singleton = subtags[0]
    if len(subtags) == 1:
        raise LanguageTagError(f"The subtag {singleton!r} must be followed by something")
    if singleton == 'x':
        # Private use: the remaining subtags are arbitrary codes that cannot
        # be looked up, so keep them together as one value.
        return [('private', '-'.join(subtags))]
    # Position of the next singleton, or the end of the list if there is none.
    boundary = next(
        (pos for pos in range(1, len(subtags)) if len(subtags[pos]) == 1),
        len(subtags),
    )
    if boundary == 1:
        raise LanguageTagError(
            "Tag extensions may not contain two singletons in a row"
        )
    # One complete extension has been read. Go back to parse_subtags for the
    # rest, where only further extensions are acceptable.
    extension = '-'.join(subtags[:boundary])
    return [('extension', extension)] + parse_subtags(subtags[boundary:], EXTENSION)
 class LanguageTagError(ValueError):
    """Raised when a string does not follow the syntax of a language tag."""
 def order_error(subtag, got, expected):
    """
    Raise a LanguageTagError saying that `subtag` appeared out of order.
    `got` and `expected` are indices into SUBTAG_TYPES: the type the subtag
    turned out to have, and the earliest type that was still acceptable at
    its position. The message lists, in English, every type from `expected`
    onwards.
    """
    options = SUBTAG_TYPES[expected:]
    if len(options) == 1:
        expect_str = options[0]
    elif len(options) == 2:
        expect_str = ' or '.join(options)
    else:
        # Three or more alternatives: comma-separated, with ', or' before
        # the final one.
        expect_str = ', '.join(options[:-1]) + f', or {options[-1]}'
    got_str = SUBTAG_TYPES[got]
    message = (
        f"This {got_str} subtag, {subtag!r}, is out of place. Expected {expect_str}."
    )
    raise LanguageTagError(message)
 def subtag_error(subtag, expected='a valid subtag'):
    """
    Raise a LanguageTagError reporting that `subtag` is not what the parser
    needed at this point. `expected` is a short English description of what
    would have been acceptable, and is inserted into the message as-is.
    """
    message = f"Expected {expected}, got {subtag!r}"
    raise LanguageTagError(message)
--- a/lib/langcodes/util.py
+++ b/lib/langcodes/util.py
@ -0,0 +1,8 @@
 from pkg_resources import resource_filename
 DATA_ROOT = resource_filename('langcodes', 'data')
 import os
 def data_filename(filename):
    """Return the path of `filename` joined onto DATA_ROOT."""
    full_path = os.path.join(DATA_ROOT, filename)
    return full_path
--- a/lib/language_data/init.py
+++ b/lib/language_data/init.py
--- a/lib/language_data/data/extra_language_names.csv
+++ b/lib/language_data/data/extra_language_names.csv
@ -0,0 +1,68 @@
 en,av,Avar
 en,frr,North Frisian
 en,frs,East Frisian
 en,fy,West Frisian
 en,gn,Guaraní
 en,ilo,Ilocano
 en,jam,Jamaican Creole
 en,kky,Guugu Yimidhirr
 en,kky,Guugu Yimithirr
 en,ksd,Tolai
 en,liv,Livonian
 en,nay,Ngarrindjeri
 en,nmn,ǃXóõ
 en,nrf,Norman
 en,oj,Ojibwe
 en,pap,Papiamentu
 en,pms,Piedmontese
 en,rap,Rapa Nui
 en,rm,Romansch
 en,rom,Romani
 en,ryu,Okinawan
 en,sl,Slovene
 en,st,Sesotho
 en,tvl,Tuvaluan
 en,twf,Taos
 en,txb,Tocharian B
 en,tyv,Tuvan
 en,vma,Martuthunira
 en,wym,Vilamovian
 en,xto,Tocharian A
 en,zu,isiZulu
 de,el,Neugriechisch
 de,la,Lateinisch
 de,fur,Friaulisch
 de,gd,Schottisch-Gälisch
 de,haw,Hawaiianisch
 de,nds,Plattdeutsch
 de,nhn,Zentral-Nahuatl
 de,pa,Panjabi
 de,pap,Papiamentu
 de,prg,Prußisch
 de,vec,Venezianisch
 de,tvl,Tuvaluisch
 sh,sh,Srpskohrvatski
 la,la,Lingua latina
 ceb,ceb,Sinugbuanong Binisayâ
 ceb,ceb,Bisayâ
 ceb,ceb,Bisaya
 lah,lah,لہندا پنجابی
 bho,bho,भोजपुरी
 ang,ang,Ænglisc
 vo,vo,Volapük
 io,io,Ido
 jbo,jbo,lojban
 jbo,jbo,lojbau
 rup,rup,armãneashti
 nv,nv,Diné bizaad
 zh-Hant,nan,閩南語
 zh-Hans,nan,闽南语
 nan-Latn,nan,Bân-lâm-gú
 zh-Hant,hak,客家語
 zh-Hans,hak,客家语
 ilo,ilo,Ilokano
 hil,hil,Ilonggo
 nah,nah,Nāhuatl
 tpi,tpi,Tok Pisin
 ve,ve,tshiVenḓa
 kcm,kcm,Kristang
--- a/lib/language_data/data/language-subtag-registry.txt
+++ b/lib/language_data/data/language-subtag-registry.txt
--- a/lib/language_data/data/languageInfo.xml
+++ b/lib/language_data/data/languageInfo.xml
@ -0,0 +1,442 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
 <!--
 Copyright © 1991-2020 Unicode, Inc.
 CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
 For terms of use, see http://www.unicode.org/copyright.html
 -->
 <supplementalData>
 	<version number="$Revision$"/>
 	<languageMatching>
 		<languageMatches type="written_new">
 			<paradigmLocales locales="en en_GB es es_419 pt_BR pt_PT"/>
 			<matchVariable id="$enUS" value="AS+GU+MH+MP+PR+UM+US+VI"/>
 			<matchVariable id="$cnsar" value="HK+MO"/>
 			<matchVariable id="$americas" value="019"/>
 			<matchVariable id="$maghreb" value="MA+DZ+TN+LY+MR+EH"/>
 			<languageMatch desired="no"	supported="nb"	distance="1"/>	<!-- no ⇒ nb -->
 			<!-- languageMatch desired="ku"	supported="ckb"	distance="4" oneway="true"/ -->	<!-- ku ⇒ ckb -->
 			<!-- languageMatch desired="ckb" supported="ku" percent="8" oneway="true"/ --> <!-- ckb ⇒ ku -->
 			<languageMatch desired="hr"	supported="bs"	distance="4"/>	<!-- hr ⇒ bs -->
 			<languageMatch desired="sh"	supported="bs"	distance="4"/>	<!-- sh ⇒ bs -->
 			<!-- languageMatch desired="sr"	supported="bs"	distance="4"/-->	<!-- sr ⇒ bs -->
 			<languageMatch desired="sh"	supported="hr"	distance="4"/>	<!-- sh ⇒ hr -->
 			<!-- languageMatch desired="sr"	supported="hr"	distance="4"/-->	<!-- sr ⇒ hr -->
 			<languageMatch desired="sh"	supported="sr"	distance="4"/>	<!-- sh ⇒ sr -->
 			<languageMatch desired="ssy"	supported="aa"	distance="4"/>	<!-- ssy ⇒ aa -->
 			<languageMatch desired="gsw"	supported="de"	distance="4"	oneway="true"/>	<!-- gsw ⇒ de -->
 			<languageMatch desired="lb"	supported="de"	distance="4"	oneway="true"/>	<!-- lb ⇒ de -->
 			<languageMatch desired="da"	supported="no"	distance="8"/>	<!-- da ⇒ no -->
 			<languageMatch desired="da"	supported="nb"	distance="8"/>	<!-- da ⇒ nb -->
 			<!-- various fallbacks for more or less loosely related languages -->
 			<!-- CLDR-13528:
 			    Distance 20 for some linguistic relation (e.g., Creoles to French)
 			    or a local language in the area of another (e.g., Breton to French).
 			    Distance 30 for fallbacks to prevalent second languages,
 			    and in the absence of better information. -->
 			<languageMatch desired="ab"	supported="ru"	distance="30"	oneway="true"/>	<!-- Abkhazian: ab ⇒ ru -->
 			<languageMatch desired="ach"	supported="en"	distance="30"	oneway="true"/>	<!-- Acoli (Southern Luo dialect in Uganda): ach ⇒ en -->
 			<languageMatch desired="af"	supported="nl"	distance="20"	oneway="true"/>	<!-- Afrikaans: af ⇒ nl -->
 			<languageMatch desired="ak"	supported="en"	distance="30"	oneway="true"/>	<!-- Akan: ak ⇒ en -->
 			<languageMatch desired="ay"	supported="es"	distance="20"	oneway="true"/>	<!-- Aymara: ay ⇒ es -->
 			<languageMatch desired="az"	supported="ru"	distance="30"	oneway="true"/>	<!-- Azerbaijani: az ⇒ ru -->
 			<languageMatch desired="be"	supported="ru"	distance="20"	oneway="true"/>	<!-- Belarusian: be ⇒ ru -->
 			<languageMatch desired="bem"	supported="en"	distance="30"	oneway="true"/>	<!-- Bemba (Zambia): bem ⇒ en -->
 			<languageMatch desired="bh"	supported="hi"	distance="30"	oneway="true"/>	<!-- Bihari languages (gets canonicalized to bho): bh ⇒ hi -->
 			<languageMatch desired="bn"	supported="en"	distance="30"	oneway="true"/>	<!-- Bangla: bn ⇒ en -->
 			<languageMatch desired="br"	supported="fr"	distance="20"	oneway="true"/>	<!-- Breton: br ⇒ fr -->
 			<languageMatch desired="ceb"	supported="fil"	distance="30"	oneway="true"/>	<!-- Cebuano: ceb ⇒ fil -->
 			<languageMatch desired="chr"	supported="en"	distance="20"	oneway="true"/>	<!-- Cherokee: chr ⇒ en -->
 			<languageMatch desired="ckb"	supported="ar"	distance="30"	oneway="true"/>	<!-- Sorani Kurdish: ckb ⇒ ar -->
 			<languageMatch desired="co"	supported="fr"	distance="20"	oneway="true"/>	<!-- Corsican: co ⇒ fr -->
 			<languageMatch desired="crs"	supported="fr"	distance="20"	oneway="true"/>	<!-- Seselwa Creole French: crs ⇒ fr -->
 			<languageMatch desired="cy"	supported="en"	distance="20"	oneway="true"/>	<!-- Welsh: cy ⇒ en -->
 			<languageMatch desired="ee"	supported="en"	distance="30"	oneway="true"/>	<!-- Ewe: ee ⇒ en -->
 			<languageMatch desired="eo"	supported="en"	distance="30"	oneway="true"/>	<!-- Esperanto: eo ⇒ en -->
 			<!-- CLDR-13650: No fallback for Estonian -->
 			<!-- languageMatch desired="et"	supported="fi"	distance="30"	oneway="true"/-->	<!-- Estonian: et ⇒ fi -->
 			<languageMatch desired="eu"	supported="es"	distance="20"	oneway="true"/>	<!-- Basque: eu ⇒ es -->
 			<languageMatch desired="fo"	supported="da"	distance="20"	oneway="true"/>	<!-- Faroese: fo ⇒ da -->
 			<languageMatch desired="fy"	supported="nl"	distance="20"	oneway="true"/>	<!-- Western Frisian: fy ⇒ nl -->
 			<languageMatch desired="ga"	supported="en"	distance="20"	oneway="true"/>	<!-- Irish: ga ⇒ en -->
 			<languageMatch desired="gaa"	supported="en"	distance="30"	oneway="true"/>	<!-- Ga: gaa ⇒ en -->
 			<languageMatch desired="gd"	supported="en"	distance="20"	oneway="true"/>	<!-- Scottish Gaelic: gd ⇒ en -->
 			<languageMatch desired="gl"	supported="es"	distance="20"	oneway="true"/>	<!-- Galician: gl ⇒ es -->
 			<languageMatch desired="gn"	supported="es"	distance="20"	oneway="true"/>	<!-- Guarani: gn ⇒ es -->
 			<languageMatch desired="gu"	supported="hi"	distance="30"	oneway="true"/>	<!-- Gujarati: gu ⇒ hi -->
 			<languageMatch desired="ha"	supported="en"	distance="30"	oneway="true"/>	<!-- Hausa: ha ⇒ en -->
 			<languageMatch desired="haw"	supported="en"	distance="20"	oneway="true"/>	<!-- Hawaiian: haw ⇒ en -->
 			<languageMatch desired="ht"	supported="fr"	distance="20"	oneway="true"/>	<!-- Haitian Creole: ht ⇒ fr -->
 			<languageMatch desired="hy"	supported="ru"	distance="30"	oneway="true"/>	<!-- Armenian: hy ⇒ ru -->
 			<languageMatch desired="ia"	supported="en"	distance="30"	oneway="true"/>	<!-- Interlingua: ia ⇒ en -->
 			<languageMatch desired="ig"	supported="en"	distance="30"	oneway="true"/>	<!-- Igbo: ig ⇒ en -->
 			<languageMatch desired="is"	supported="en"	distance="20"	oneway="true"/>	<!-- Icelandic: is ⇒ en -->
 			<languageMatch desired="jv"	supported="id"	distance="20"	oneway="true"/>	<!-- Javanese: jv ⇒ id -->
 			<languageMatch desired="ka"	supported="en"	distance="30"	oneway="true"/>	<!-- Georgian: ka ⇒ en -->
 			<languageMatch desired="kg"	supported="fr"	distance="30"	oneway="true"/>	<!-- Kongo: kg ⇒ fr -->
 			<languageMatch desired="kk"	supported="ru"	distance="30"	oneway="true"/>	<!-- Kazakh: kk ⇒ ru -->
 			<languageMatch desired="km"	supported="en"	distance="30"	oneway="true"/>	<!-- Khmer: km ⇒ en -->
 			<languageMatch desired="kn"	supported="en"	distance="30"	oneway="true"/>	<!-- Kannada: kn ⇒ en -->
 			<languageMatch desired="kri"	supported="en"	distance="30"	oneway="true"/>	<!-- Krio: kri ⇒ en -->
 			<languageMatch desired="ku"	supported="tr"	distance="30"	oneway="true"/>	<!-- Kurdish: ku ⇒ tr -->
 			<languageMatch desired="ky"	supported="ru"	distance="30"	oneway="true"/>	<!-- Kirghiz: ky ⇒ ru -->
 			<languageMatch desired="la"	supported="it"	distance="20"	oneway="true"/>	<!-- Latin: la ⇒ it -->
 			<languageMatch desired="lg"	supported="en"	distance="30"	oneway="true"/>	<!-- Luganda: lg ⇒ en -->
 			<languageMatch desired="ln"	supported="fr"	distance="30"	oneway="true"/>	<!-- Lingala: ln ⇒ fr -->
 			<languageMatch desired="lo"	supported="en"	distance="30"	oneway="true"/>	<!-- Lao: lo ⇒ en -->
 			<languageMatch desired="loz"	supported="en"	distance="30"	oneway="true"/>	<!-- Lozi: loz ⇒ en -->
 			<languageMatch desired="lua"	supported="fr"	distance="30"	oneway="true"/>	<!-- Luba-Lulua: lua ⇒ fr -->
 			<languageMatch desired="mfe"	supported="en"	distance="30"	oneway="true"/>	<!-- Morisyen: mfe ⇒ en -->
 			<languageMatch desired="mg"	supported="fr"	distance="30"	oneway="true"/>	<!-- Malagasy: mg ⇒ fr -->
 			<languageMatch desired="mi"	supported="en"	distance="20"	oneway="true"/>	<!-- Maori: mi ⇒ en -->
 			<!-- CLDR-13625: Macedonian should not fall back to Bulgarian -->
 			<!-- languageMatch desired="mk"	supported="bg"	distance="30"	oneway="true"/-->	<!-- Macedonian: mk ⇒ bg -->
 			<languageMatch desired="ml"	supported="en"	distance="30"	oneway="true"/>	<!-- Malayalam: ml ⇒ en -->
 			<languageMatch desired="mn"	supported="ru"	distance="30"	oneway="true"/>	<!-- Mongolian: mn ⇒ ru -->
 			<languageMatch desired="mr"	supported="hi"	distance="30"	oneway="true"/>	<!-- Marathi: mr ⇒ hi -->
 			<languageMatch desired="ms"	supported="id"	distance="30"	oneway="true"/>	<!-- Malay: ms ⇒ id -->
 			<languageMatch desired="mt"	supported="en"	distance="30"	oneway="true"/>	<!-- Maltese: mt ⇒ en -->
 			<languageMatch desired="my"	supported="en"	distance="30"	oneway="true"/>	<!-- Myanmar: my ⇒ en -->
 			<languageMatch desired="ne"	supported="en"	distance="30"	oneway="true"/>	<!-- Nepali: ne ⇒ en -->
 			<languageMatch desired="nn"	supported="nb"	distance="20"/>	<!-- Nynorsk: nn ⟺ nb -->
 			<languageMatch desired="nn"	supported="no"	distance="20"/>	<!-- Nynorsk: nn ⟺ no; CLDR-13679  -->
 			<languageMatch desired="nso"	supported="en"	distance="30"	oneway="true"/>	<!-- Northern Sotho: nso ⇒ en -->
 			<languageMatch desired="ny"	supported="en"	distance="30"	oneway="true"/>	<!-- Nyanja: ny ⇒ en -->
 			<languageMatch desired="nyn"	supported="en"	distance="30"	oneway="true"/>	<!-- Nyankole: nyn ⇒ en -->
 			<languageMatch desired="oc"	supported="fr"	distance="20"	oneway="true"/>	<!-- Occitan: oc ⇒ fr -->
 			<languageMatch desired="om"	supported="en"	distance="30"	oneway="true"/>	<!-- Oromo: om ⇒ en -->
 			<languageMatch desired="or"	supported="en"	distance="30"	oneway="true"/>	<!-- Odia: or ⇒ en -->
 			<languageMatch desired="pa"	supported="en"	distance="30"	oneway="true"/>	<!-- Punjabi: pa ⇒ en -->
 			<languageMatch desired="pcm"	supported="en"	distance="20"	oneway="true"/>	<!-- Nigerian Pidgin: pcm ⇒ en -->
 			<languageMatch desired="ps"	supported="en"	distance="30"	oneway="true"/>	<!-- Pashto: ps ⇒ en -->
 			<languageMatch desired="qu"	supported="es"	distance="30"	oneway="true"/>	<!-- Quechua: qu ⇒ es -->
 			<languageMatch desired="rm"	supported="de"	distance="20"	oneway="true"/>	<!-- Romansh: rm ⇒ de -->
 			<languageMatch desired="rn"	supported="en"	distance="30"	oneway="true"/>	<!-- Rundi: rn ⇒ en -->
 			<languageMatch desired="rw"	supported="fr"	distance="30"	oneway="true"/>	<!-- Kinyarwanda: rw ⇒ fr -->
 			<languageMatch desired="sa"	supported="hi"	distance="30"	oneway="true"/>	<!-- Sanskrit: sa ⇒ hi -->
 			<languageMatch desired="sd"	supported="en"	distance="30"	oneway="true"/>	<!-- Sindhi: sd ⇒ en -->
 			<languageMatch desired="si"	supported="en"	distance="30"	oneway="true"/>	<!-- Sinhalese: si ⇒ en -->
 			<languageMatch desired="sn"	supported="en"	distance="30"	oneway="true"/>	<!-- Shona: sn ⇒ en -->
 			<languageMatch desired="so"	supported="en"	distance="30"	oneway="true"/>	<!-- Somali: so ⇒ en -->
 			<languageMatch desired="sq"	supported="en"	distance="30"	oneway="true"/>	<!-- Albanian: sq ⇒ en -->
 			<languageMatch desired="st"	supported="en"	distance="30"	oneway="true"/>	<!-- Southern Sotho: st ⇒ en -->
 			<languageMatch desired="su"	supported="id"	distance="20"	oneway="true"/>	<!-- Sundanese: su ⇒ id -->
 			<languageMatch desired="sw"	supported="en"	distance="30"	oneway="true"/>	<!-- Swahili: sw ⇒ en -->
 			<languageMatch desired="ta"	supported="en"	distance="30"	oneway="true"/>	<!-- Tamil: ta ⇒ en -->
 			<languageMatch desired="te"	supported="en"	distance="30"	oneway="true"/>	<!-- Telugu: te ⇒ en -->
 			<languageMatch desired="tg"	supported="ru"	distance="30"	oneway="true"/>	<!-- Tajik: tg ⇒ ru -->
 			<languageMatch desired="ti"	supported="en"	distance="30"	oneway="true"/>	<!-- Tigrinya: ti ⇒ en -->
 			<languageMatch desired="tk"	supported="ru"	distance="30"	oneway="true"/>	<!-- Turkmen: tk ⇒ ru -->
 			<languageMatch desired="tlh"	supported="en"	distance="30"	oneway="true"/>	<!-- Klingon: tlh ⇒ en -->
 			<languageMatch desired="tn"	supported="en"	distance="30"	oneway="true"/>	<!-- Tswana: tn ⇒ en -->
 			<languageMatch desired="to"	supported="en"	distance="30"	oneway="true"/>	<!-- Tonga: to ⇒ en -->
 			<languageMatch desired="tt"	supported="ru"	distance="30"	oneway="true"/>	<!-- Tatar: tt ⇒ ru -->
 			<languageMatch desired="tum"	supported="en"	distance="30"	oneway="true"/>	<!-- Tumbuka: tum ⇒ en -->
 			<languageMatch desired="ug"	supported="zh"	distance="20"	oneway="true"/>	<!-- Uighur: ug ⇒ zh -->
 			<languageMatch desired="ur"	supported="en"	distance="30"	oneway="true"/>	<!-- Urdu: ur ⇒ en -->
 			<languageMatch desired="uz"	supported="ru"	distance="30"	oneway="true"/>	<!-- Uzbek: uz ⇒ ru -->
 			<languageMatch desired="wo"	supported="fr"	distance="30"	oneway="true"/>	<!-- Wolof: wo ⇒ fr -->
 			<languageMatch desired="xh"	supported="en"	distance="30"	oneway="true"/>	<!-- Xhosa: xh ⇒ en -->
 			<languageMatch desired="yi"	supported="en"	distance="30"	oneway="true"/>	<!-- Yiddish: yi ⇒ en -->
 			<languageMatch desired="yo"	supported="en"	distance="30"	oneway="true"/>	<!-- Yoruba: yo ⇒ en -->
 			<languageMatch desired="zu"	supported="en"	distance="30"	oneway="true"/>	<!-- Zulu: zu ⇒ en -->
 			<!-- START generated by GenerateLanguageMatches.java: don't manually change -->
 			<!-- Encompassed by Arabic -->
 			<languageMatch desired="aao" supported="ar" distance="10" oneway="true"/>	<!-- Algerian Saharan Arabic -->
 			<languageMatch desired="abh" supported="ar" distance="10" oneway="true"/>	<!-- Tajiki Arabic -->
 			<languageMatch desired="abv" supported="ar" distance="10" oneway="true"/>	<!-- Baharna Arabic -->
 			<languageMatch desired="acm" supported="ar" distance="10" oneway="true"/>	<!-- Mesopotamian Arabic -->
 			<languageMatch desired="acq" supported="ar" distance="10" oneway="true"/>	<!-- Ta'izzi-Adeni Arabic -->
 			<languageMatch desired="acw" supported="ar" distance="10" oneway="true"/>	<!-- Hijazi Arabic -->
 			<languageMatch desired="acx" supported="ar" distance="10" oneway="true"/>	<!-- Omani Arabic -->
 			<languageMatch desired="acy" supported="ar" distance="10" oneway="true"/>	<!-- Cypriot Arabic -->
 			<languageMatch desired="adf" supported="ar" distance="10" oneway="true"/>	<!-- Dhofari Arabic -->
 			<languageMatch desired="aeb" supported="ar" distance="10" oneway="true"/>	<!-- Tunisian Arabic -->
 			<languageMatch desired="aec" supported="ar" distance="10" oneway="true"/>	<!-- Saidi Arabic -->
 			<languageMatch desired="afb" supported="ar" distance="10" oneway="true"/>	<!-- Gulf Arabic -->
 			<languageMatch desired="ajp" supported="ar" distance="10" oneway="true"/>	<!-- South Levantine Arabic -->
 			<languageMatch desired="apc" supported="ar" distance="10" oneway="true"/>	<!-- North Levantine Arabic -->
 			<languageMatch desired="apd" supported="ar" distance="10" oneway="true"/>	<!-- Sudanese Arabic -->
 			<languageMatch desired="arq" supported="ar" distance="10" oneway="true"/>	<!-- Algerian Arabic -->
 			<languageMatch desired="ars" supported="ar" distance="10" oneway="true"/>	<!-- Najdi Arabic -->
 			<languageMatch desired="ary" supported="ar" distance="10" oneway="true"/>	<!-- Moroccan Arabic -->
 			<languageMatch desired="arz" supported="ar" distance="10" oneway="true"/>	<!-- Egyptian Arabic -->
 			<languageMatch desired="auz" supported="ar" distance="10" oneway="true"/>	<!-- Uzbeki Arabic -->
 			<languageMatch desired="avl" supported="ar" distance="10" oneway="true"/>	<!-- Eastern Egyptian Bedawi Arabic -->
 			<languageMatch desired="ayh" supported="ar" distance="10" oneway="true"/>	<!-- Hadrami Arabic -->
 			<languageMatch desired="ayl" supported="ar" distance="10" oneway="true"/>	<!-- Libyan Arabic -->
 			<languageMatch desired="ayn" supported="ar" distance="10" oneway="true"/>	<!-- Sanaani Arabic -->
 			<languageMatch desired="ayp" supported="ar" distance="10" oneway="true"/>	<!-- North Mesopotamian Arabic -->
 			<languageMatch desired="bbz" supported="ar" distance="10" oneway="true"/>	<!-- Babalia Creole Arabic -->
 			<languageMatch desired="pga" supported="ar" distance="10" oneway="true"/>	<!-- Sudanese Creole Arabic -->
 			<languageMatch desired="shu" supported="ar" distance="10" oneway="true"/>	<!-- Chadian Arabic -->
 			<languageMatch desired="ssh" supported="ar" distance="10" oneway="true"/>	<!-- Shihhi Arabic -->
 			<!-- Encompassed by Azerbaijani -->
 			<languageMatch desired="azb" supported="az" distance="10" oneway="true"/>	<!-- South Azerbaijani -->
 			<!-- Encompassed by Estonian -->
 			<languageMatch desired="vro" supported="et" distance="10" oneway="true"/>	<!-- Võro -->
 			<!-- Encompassed by Fulah -->
 			<languageMatch desired="ffm" supported="ff" distance="10" oneway="true"/>	<!-- Maasina Fulfulde -->
 			<languageMatch desired="fub" supported="ff" distance="10" oneway="true"/>	<!-- Adamawa Fulfulde -->
 			<languageMatch desired="fue" supported="ff" distance="10" oneway="true"/>	<!-- Borgu Fulfulde -->
 			<languageMatch desired="fuf" supported="ff" distance="10" oneway="true"/>	<!-- Pular -->
 			<languageMatch desired="fuh" supported="ff" distance="10" oneway="true"/>	<!-- Western Niger Fulfulde -->
 			<languageMatch desired="fui" supported="ff" distance="10" oneway="true"/>	<!-- Bagirmi Fulfulde -->
 			<languageMatch desired="fuq" supported="ff" distance="10" oneway="true"/>	<!-- Central-Eastern Niger Fulfulde -->
 			<languageMatch desired="fuv" supported="ff" distance="10" oneway="true"/>	<!-- Nigerian Fulfulde -->
 			<!-- Encompassed by Guarani -->
 			<languageMatch desired="gnw" supported="gn" distance="10" oneway="true"/>	<!-- Western Bolivian Guaraní -->
 			<languageMatch desired="gui" supported="gn" distance="10" oneway="true"/>	<!-- Eastern Bolivian Guaraní -->
 			<languageMatch desired="gun" supported="gn" distance="10" oneway="true"/>	<!-- Mbyá Guaraní -->
 			<languageMatch desired="nhd" supported="gn" distance="10" oneway="true"/>	<!-- Chiripá -->
 			<!-- Encompassed by Inuktitut -->
 			<languageMatch desired="ikt" supported="iu" distance="10" oneway="true"/>	<!-- Inuinnaqtun -->
 			<!-- Encompassed by Kalenjin -->
 			<languageMatch desired="enb" supported="kln" distance="10" oneway="true"/>	<!-- Markweeta -->
 			<languageMatch desired="eyo" supported="kln" distance="10" oneway="true"/>	<!-- Keiyo -->
 			<languageMatch desired="niq" supported="kln" distance="10" oneway="true"/>	<!-- Nandi -->
 			<languageMatch desired="oki" supported="kln" distance="10" oneway="true"/>	<!-- Okiek -->
 			<languageMatch desired="pko" supported="kln" distance="10" oneway="true"/>	<!-- Pökoot -->
 			<languageMatch desired="sgc" supported="kln" distance="10" oneway="true"/>	<!-- Kipsigis -->
 			<languageMatch desired="tec" supported="kln" distance="10" oneway="true"/>	<!-- Terik -->
 			<languageMatch desired="tuy" supported="kln" distance="10" oneway="true"/>	<!-- Tugen -->
 			<!-- Encompassed by Konkani -->
 			<languageMatch desired="gom" supported="kok" distance="10" oneway="true"/>	<!-- Goan Konkani -->
 			<!-- Encompassed by Kpelle -->
 			<languageMatch desired="gkp" supported="kpe" distance="10" oneway="true"/>	<!-- Guinea Kpelle -->
 			<!-- Encompassed by Luyia -->
 			<languageMatch desired="ida" supported="luy" distance="10" oneway="true"/>	<!-- Idakho-Isukha-Tiriki -->
 			<languageMatch desired="lkb" supported="luy" distance="10" oneway="true"/>	<!-- Kabras -->
 			<languageMatch desired="lko" supported="luy" distance="10" oneway="true"/>	<!-- Khayo -->
 			<languageMatch desired="lks" supported="luy" distance="10" oneway="true"/>	<!-- Kisa -->
 			<languageMatch desired="lri" supported="luy" distance="10" oneway="true"/>	<!-- Marachi -->
 			<languageMatch desired="lrm" supported="luy" distance="10" oneway="true"/>	<!-- Marama -->
 			<languageMatch desired="lsm" supported="luy" distance="10" oneway="true"/>	<!-- Saamia -->
 			<languageMatch desired="lto" supported="luy" distance="10" oneway="true"/>	<!-- Tsotso -->
 			<languageMatch desired="lts" supported="luy" distance="10" oneway="true"/>	<!-- Tachoni -->
 			<languageMatch desired="lwg" supported="luy" distance="10" oneway="true"/>	<!-- Wanga -->
 			<languageMatch desired="nle" supported="luy" distance="10" oneway="true"/>	<!-- East Nyala -->
 			<languageMatch desired="nyd" supported="luy" distance="10" oneway="true"/>	<!-- Nyore -->
 			<languageMatch desired="rag" supported="luy" distance="10" oneway="true"/>	<!-- Logooli -->
 			<!-- Encompassed by Latvian -->
 			<languageMatch desired="ltg" supported="lv" distance="10" oneway="true"/>	<!-- Latgalian -->
 			<!-- Encompassed by Malagasy -->
 			<languageMatch desired="bhr" supported="mg" distance="10" oneway="true"/>	<!-- Bara Malagasy -->
 			<languageMatch desired="bjq" supported="mg" distance="10" oneway="true"/>	<!-- Southern Betsimisaraka Malagasy -->
 			<languageMatch desired="bmm" supported="mg" distance="10" oneway="true"/>	<!-- Northern Betsimisaraka Malagasy -->
 			<languageMatch desired="bzc" supported="mg" distance="10" oneway="true"/>	<!-- Southern Betsimisaraka Malagasy -->
 			<languageMatch desired="msh" supported="mg" distance="10" oneway="true"/>	<!-- Masikoro Malagasy -->
 			<languageMatch desired="skg" supported="mg" distance="10" oneway="true"/>	<!-- Sakalava Malagasy -->
 			<languageMatch desired="tdx" supported="mg" distance="10" oneway="true"/>	<!-- Tandroy-Mahafaly Malagasy -->
 			<languageMatch desired="tkg" supported="mg" distance="10" oneway="true"/>	<!-- Tesaka Malagasy -->
 			<languageMatch desired="txy" supported="mg" distance="10" oneway="true"/>	<!-- Tanosy Malagasy -->
 			<languageMatch desired="xmv" supported="mg" distance="10" oneway="true"/>	<!-- Antankarana Malagasy -->
 			<languageMatch desired="xmw" supported="mg" distance="10" oneway="true"/>	<!-- Tsimihety Malagasy -->
 			<!-- Encompassed by Mongolian -->
 			<languageMatch desired="mvf" supported="mn" distance="10" oneway="true"/>	<!-- Peripheral Mongolian -->
 			<!-- Encompassed by Malay -->
 			<languageMatch desired="bjn" supported="ms" distance="10" oneway="true"/>	<!-- Banjar -->
 			<languageMatch desired="btj" supported="ms" distance="10" oneway="true"/>	<!-- Bacanese Malay -->
 			<languageMatch desired="bve" supported="ms" distance="10" oneway="true"/>	<!-- Berau Malay -->
 			<languageMatch desired="bvu" supported="ms" distance="10" oneway="true"/>	<!-- Bukit Malay -->
 			<languageMatch desired="coa" supported="ms" distance="10" oneway="true"/>	<!-- Cocos Islands Malay -->
 			<languageMatch desired="dup" supported="ms" distance="10" oneway="true"/>	<!-- Duano -->
 			<languageMatch desired="hji" supported="ms" distance="10" oneway="true"/>	<!-- Haji -->
 			<languageMatch desired="id" supported="ms" distance="10" oneway="true"/>	<!-- Indonesian -->
 			<languageMatch desired="jak" supported="ms" distance="10" oneway="true"/>	<!-- Jakun -->
 			<languageMatch desired="jax" supported="ms" distance="10" oneway="true"/>	<!-- Jambi Malay -->
 			<languageMatch desired="kvb" supported="ms" distance="10" oneway="true"/>	<!-- Kubu -->
 			<languageMatch desired="kvr" supported="ms" distance="10" oneway="true"/>	<!-- Kerinci -->
 			<languageMatch desired="kxd" supported="ms" distance="10" oneway="true"/>	<!-- Brunei -->
 			<languageMatch desired="lce" supported="ms" distance="10" oneway="true"/>	<!-- Loncong -->
 			<languageMatch desired="lcf" supported="ms" distance="10" oneway="true"/>	<!-- Lubu -->
 			<languageMatch desired="liw" supported="ms" distance="10" oneway="true"/>	<!-- Col -->
 			<languageMatch desired="max" supported="ms" distance="10" oneway="true"/>	<!-- North Moluccan Malay -->
 			<languageMatch desired="meo" supported="ms" distance="10" oneway="true"/>	<!-- Kedah Malay -->
 			<languageMatch desired="mfa" supported="ms" distance="10" oneway="true"/>	<!-- Pattani Malay -->
 			<languageMatch desired="mfb" supported="ms" distance="10" oneway="true"/>	<!-- Bangka -->
 			<languageMatch desired="min" supported="ms" distance="10" oneway="true"/>	<!-- Minangkabau -->
 			<languageMatch desired="mqg" supported="ms" distance="10" oneway="true"/>	<!-- Kota Bangun Kutai Malay -->
 			<languageMatch desired="msi" supported="ms" distance="10" oneway="true"/>	<!-- Sabah Malay -->
 			<languageMatch desired="mui" supported="ms" distance="10" oneway="true"/>	<!-- Musi -->
 			<languageMatch desired="orn" supported="ms" distance="10" oneway="true"/>	<!-- Orang Kanaq -->
 			<languageMatch desired="ors" supported="ms" distance="10" oneway="true"/>	<!-- Orang Seletar -->
 			<languageMatch desired="pel" supported="ms" distance="10" oneway="true"/>	<!-- Pekal -->
 			<languageMatch desired="pse" supported="ms" distance="10" oneway="true"/>	<!-- Central Malay -->
 			<languageMatch desired="tmw" supported="ms" distance="10" oneway="true"/>	<!-- Temuan -->
 			<languageMatch desired="urk" supported="ms" distance="10" oneway="true"/>	<!-- Urak Lawoi' -->
 			<languageMatch desired="vkk" supported="ms" distance="10" oneway="true"/>	<!-- Kaur -->
 			<languageMatch desired="vkt" supported="ms" distance="10" oneway="true"/>	<!-- Tenggarong Kutai Malay -->
 			<languageMatch desired="xmm" supported="ms" distance="10" oneway="true"/>	<!-- Manado Malay -->
 			<languageMatch desired="zlm" supported="ms" distance="10" oneway="true"/>	<!-- Malay (individual language) -->
 			<languageMatch desired="zmi" supported="ms" distance="10" oneway="true"/>	<!-- Negeri Sembilan Malay -->
 			<!-- Encompassed by Nepali -->
 			<languageMatch desired="dty" supported="ne" distance="10" oneway="true"/>	<!-- Dotyali -->
 			<!-- Encompassed by Oromo -->
 			<languageMatch desired="gax" supported="om" distance="10" oneway="true"/>	<!-- Borana-Arsi-Guji Oromo -->
 			<languageMatch desired="hae" supported="om" distance="10" oneway="true"/>	<!-- Eastern Oromo -->
 			<languageMatch desired="orc" supported="om" distance="10" oneway="true"/>	<!-- Orma -->
 			<!-- Encompassed by Odia -->
 			<languageMatch desired="spv" supported="or" distance="10" oneway="true"/>	<!-- Sambalpuri -->
 			<!-- Encompassed by Pashto -->
 			<languageMatch desired="pbt" supported="ps" distance="10" oneway="true"/>	<!-- Southern Pashto -->
 			<languageMatch desired="pst" supported="ps" distance="10" oneway="true"/>	<!-- Central Pashto -->
 			<!-- Encompassed by Quechua -->
 			<languageMatch desired="qub" supported="qu" distance="10" oneway="true"/>	<!-- Huallaga Huánuco Quechua -->
 			<languageMatch desired="qud" supported="qu" distance="10" oneway="true"/>	<!-- Calderón Highland Quichua -->
 			<languageMatch desired="quf" supported="qu" distance="10" oneway="true"/>	<!-- Lambayeque Quechua -->
 			<languageMatch desired="qug" supported="qu" distance="10" oneway="true"/>	<!-- Chimborazo Highland Quichua -->
 			<languageMatch desired="quh" supported="qu" distance="10" oneway="true"/>	<!-- South Bolivian Quechua -->
 			<languageMatch desired="quk" supported="qu" distance="10" oneway="true"/>	<!-- Chachapoyas Quechua -->
 			<languageMatch desired="qul" supported="qu" distance="10" oneway="true"/>	<!-- North Bolivian Quechua -->
 			<languageMatch desired="qup" supported="qu" distance="10" oneway="true"/>	<!-- Southern Pastaza Quechua -->
 			<languageMatch desired="qur" supported="qu" distance="10" oneway="true"/>	<!-- Yanahuanca Pasco Quechua -->
 			<languageMatch desired="qus" supported="qu" distance="10" oneway="true"/>	<!-- Santiago del Estero Quichua -->
 			<languageMatch desired="quw" supported="qu" distance="10" oneway="true"/>	<!-- Tena Lowland Quichua -->
 			<languageMatch desired="qux" supported="qu" distance="10" oneway="true"/>	<!-- Yauyos Quechua -->
 			<languageMatch desired="quy" supported="qu" distance="10" oneway="true"/>	<!-- Ayacucho Quechua -->
 			<languageMatch desired="qva" supported="qu" distance="10" oneway="true"/>	<!-- Ambo-Pasco Quechua -->
 			<languageMatch desired="qvc" supported="qu" distance="10" oneway="true"/>	<!-- Cajamarca Quechua -->
 			<languageMatch desired="qve" supported="qu" distance="10" oneway="true"/>	<!-- Eastern Apurímac Quechua -->
 			<languageMatch desired="qvh" supported="qu" distance="10" oneway="true"/>	<!-- Huamalíes-Dos de Mayo Huánuco Quechua -->
 			<languageMatch desired="qvi" supported="qu" distance="10" oneway="true"/>	<!-- Imbabura Highland Quichua -->
 			<languageMatch desired="qvj" supported="qu" distance="10" oneway="true"/>	<!-- Loja Highland Quichua -->
 			<languageMatch desired="qvl" supported="qu" distance="10" oneway="true"/>	<!-- Cajatambo North Lima Quechua -->
 			<languageMatch desired="qvm" supported="qu" distance="10" oneway="true"/>	<!-- Margos-Yarowilca-Lauricocha Quechua -->
 			<languageMatch desired="qvn" supported="qu" distance="10" oneway="true"/>	<!-- North Junín Quechua -->
 			<languageMatch desired="qvo" supported="qu" distance="10" oneway="true"/>	<!-- Napo Lowland Quechua -->
 			<languageMatch desired="qvp" supported="qu" distance="10" oneway="true"/>	<!-- Pacaraos Quechua -->
 			<languageMatch desired="qvs" supported="qu" distance="10" oneway="true"/>	<!-- San Martín Quechua -->
 			<languageMatch desired="qvw" supported="qu" distance="10" oneway="true"/>	<!-- Huaylla Wanca Quechua -->
 			<languageMatch desired="qvz" supported="qu" distance="10" oneway="true"/>	<!-- Northern Pastaza Quichua -->
 			<languageMatch desired="qwa" supported="qu" distance="10" oneway="true"/>	<!-- Corongo Ancash Quechua -->
 			<languageMatch desired="qwc" supported="qu" distance="10" oneway="true"/>	<!-- Classical Quechua -->
 			<languageMatch desired="qwh" supported="qu" distance="10" oneway="true"/>	<!-- Huaylas Ancash Quechua -->
 			<languageMatch desired="qws" supported="qu" distance="10" oneway="true"/>	<!-- Sihuas Ancash Quechua -->
 			<languageMatch desired="qxa" supported="qu" distance="10" oneway="true"/>	<!-- Chiquián Ancash Quechua -->
 			<languageMatch desired="qxc" supported="qu" distance="10" oneway="true"/>	<!-- Chincha Quechua -->
 			<languageMatch desired="qxh" supported="qu" distance="10" oneway="true"/>	<!-- Panao Huánuco Quechua -->
 			<languageMatch desired="qxl" supported="qu" distance="10" oneway="true"/>	<!-- Salasaca Highland Quichua -->
 			<languageMatch desired="qxn" supported="qu" distance="10" oneway="true"/>	<!-- Northern Conchucos Ancash Quechua -->
 			<languageMatch desired="qxo" supported="qu" distance="10" oneway="true"/>	<!-- Southern Conchucos Ancash Quechua -->
 			<languageMatch desired="qxp" supported="qu" distance="10" oneway="true"/>	<!-- Puno Quechua -->
 			<languageMatch desired="qxr" supported="qu" distance="10" oneway="true"/>	<!-- Cañar Highland Quichua -->
 			<languageMatch desired="qxt" supported="qu" distance="10" oneway="true"/>	<!-- Santa Ana de Tusi Pasco Quechua -->
 			<languageMatch desired="qxu" supported="qu" distance="10" oneway="true"/>	<!-- Arequipa-La Unión Quechua -->
 			<languageMatch desired="qxw" supported="qu" distance="10" oneway="true"/>	<!-- Jauja Wanca Quechua -->
 			<!-- Encompassed by Sardinian -->
 			<languageMatch desired="sdc" supported="sc" distance="10" oneway="true"/>	<!-- Sassarese Sardinian -->
 			<languageMatch desired="sdn" supported="sc" distance="10" oneway="true"/>	<!-- Gallurese Sardinian -->
 			<languageMatch desired="sro" supported="sc" distance="10" oneway="true"/>	<!-- Campidanese Sardinian -->
 			<!-- Encompassed by Albanian -->
 			<languageMatch desired="aae" supported="sq" distance="10" oneway="true"/>	<!-- Arbëreshë Albanian -->
 			<languageMatch desired="aat" supported="sq" distance="10" oneway="true"/>	<!-- Arvanitika Albanian -->
 			<languageMatch desired="aln" supported="sq" distance="10" oneway="true"/>	<!-- Gheg Albanian -->
 			<!-- Encompassed by Syriac -->
 			<languageMatch desired="aii" supported="syr" distance="10" oneway="true"/>	<!-- Assyrian Neo-Aramaic -->
 			<!-- Encompassed by Uzbek -->
 			<languageMatch desired="uzs" supported="uz" distance="10" oneway="true"/>	<!-- Southern Uzbek -->
 			<!-- Encompassed by Yiddish -->
 			<languageMatch desired="yih" supported="yi" distance="10" oneway="true"/>	<!-- Western Yiddish -->
 			<!-- Encompassed by Chinese, Mandarin -->
 			<languageMatch desired="cdo" supported="zh" distance="10" oneway="true"/>	<!-- Min Dong Chinese -->
 			<languageMatch desired="cjy" supported="zh" distance="10" oneway="true"/>	<!-- Jinyu Chinese -->
 			<languageMatch desired="cpx" supported="zh" distance="10" oneway="true"/>	<!-- Pu-Xian Chinese -->
 			<languageMatch desired="czh" supported="zh" distance="10" oneway="true"/>	<!-- Huizhou Chinese -->
 			<languageMatch desired="czo" supported="zh" distance="10" oneway="true"/>	<!-- Min Zhong Chinese -->
 			<languageMatch desired="gan" supported="zh" distance="10" oneway="true"/>	<!-- Gan Chinese -->
 			<languageMatch desired="hak" supported="zh" distance="10" oneway="true"/>	<!-- Hakka Chinese -->
 			<languageMatch desired="hsn" supported="zh" distance="10" oneway="true"/>	<!-- Xiang Chinese -->
 			<languageMatch desired="lzh" supported="zh" distance="10" oneway="true"/>	<!-- Literary Chinese -->
 			<languageMatch desired="mnp" supported="zh" distance="10" oneway="true"/>	<!-- Min Bei Chinese -->
 			<languageMatch desired="nan" supported="zh" distance="10" oneway="true"/>	<!-- Min Nan Chinese -->
 			<languageMatch desired="wuu" supported="zh" distance="10" oneway="true"/>	<!-- Wu Chinese -->
 			<languageMatch desired="yue" supported="zh" distance="10" oneway="true"/>	<!-- Chinese, Cantonese -->
 			<!-- END generated by GenerateLanguageMatches.java -->
 			<languageMatch desired="*"	supported="*"	distance="80"/>	<!-- * ⇒ * -->
 			<languageMatch desired="az_Latn"	supported="ru_Cyrl"	distance="10"	oneway="true"/>	<!-- az; Latn ⇒ ru; Cyrl -->
 			<languageMatch desired="bn_Beng"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- bn; Beng ⇒ en; Latn -->
 			<languageMatch desired="hy_Armn"	supported="ru_Cyrl"	distance="10"	oneway="true"/>	<!-- hy; Armn ⇒ ru; Cyrl -->
 			<languageMatch desired="ka_Geor"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- ka; Geor ⇒ en; Latn -->
 			<languageMatch desired="km_Khmr"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- km; Khmr ⇒ en; Latn -->
 			<languageMatch desired="kn_Knda"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- kn; Knda ⇒ en; Latn -->
 			<languageMatch desired="lo_Laoo"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- lo; Laoo ⇒ en; Latn -->
 			<languageMatch desired="ml_Mlym"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- ml; Mlym ⇒ en; Latn -->
 			<languageMatch desired="my_Mymr"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- my; Mymr ⇒ en; Latn -->
 			<languageMatch desired="ne_Deva"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- ne; Deva ⇒ en; Latn -->
 			<languageMatch desired="or_Orya"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- or; Orya ⇒ en; Latn -->
 			<languageMatch desired="pa_Guru"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- pa; Guru ⇒ en; Latn -->
 			<languageMatch desired="ps_Arab"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- ps; Arab ⇒ en; Latn -->
 			<languageMatch desired="sd_Arab"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- sd; Arab ⇒ en; Latn -->
 			<languageMatch desired="si_Sinh"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- si; Sinh ⇒ en; Latn -->
 			<languageMatch desired="ta_Taml"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- ta; Taml ⇒ en; Latn -->
 			<languageMatch desired="te_Telu"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- te; Telu ⇒ en; Latn -->
 			<languageMatch desired="ti_Ethi"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- ti; Ethi ⇒ en; Latn -->
 			<languageMatch desired="tk_Latn"	supported="ru_Cyrl"	distance="10"	oneway="true"/>	<!-- tk; Latn ⇒ ru; Cyrl -->
 			<languageMatch desired="ur_Arab"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- ur; Arab ⇒ en; Latn -->
 			<languageMatch desired="uz_Latn"	supported="ru_Cyrl"	distance="10"	oneway="true"/>	<!-- uz; Latn ⇒ ru; Cyrl -->
 			<languageMatch desired="yi_Hebr"	supported="en_Latn"	distance="10"	oneway="true"/>	<!-- yi; Hebr ⇒ en; Latn -->
 			<languageMatch desired="sr_Latn"	supported="sr_Cyrl"	distance="5"/>	<!-- sr; Latn ⇒ sr; Cyrl -->
 			<languageMatch desired="zh_Hans"	supported="zh_Hant"	distance="15"	oneway="true"/>	<!-- zh; Hans ⇒ zh; Hant -->
 			<languageMatch desired="zh_Hant"	supported="zh_Hans"	distance="19"	oneway="true"/>	<!-- zh; Hant ⇒ zh; Hans -->
 			<!-- zh_Hani: Slightly bigger distance than zh_Hant->zh_Hans -->
 			<languageMatch desired="zh_Hani"	supported="zh_Hans"	distance="20"	oneway="true"/>
 			<languageMatch desired="zh_Hani"	supported="zh_Hant"	distance="20"	oneway="true"/>
 			<!-- Latin transliterations of some languages, initially from CLDR-13577 -->
 			<languageMatch desired="ar_Latn"	supported="ar_Arab"	distance="20"	oneway="true"/>
 			<languageMatch desired="bn_Latn"	supported="bn_Beng"	distance="20"	oneway="true"/>
 			<languageMatch desired="gu_Latn"	supported="gu_Gujr"	distance="20"	oneway="true"/>
 			<languageMatch desired="hi_Latn"	supported="hi_Deva"	distance="20"	oneway="true"/>
 			<languageMatch desired="kn_Latn"	supported="kn_Knda"	distance="20"	oneway="true"/>
 			<languageMatch desired="ml_Latn"	supported="ml_Mlym"	distance="20"	oneway="true"/>
 			<languageMatch desired="mr_Latn"	supported="mr_Deva"	distance="20"	oneway="true"/>
 			<languageMatch desired="ta_Latn"	supported="ta_Taml"	distance="20"	oneway="true"/>
 			<languageMatch desired="te_Latn"	supported="te_Telu"	distance="20"	oneway="true"/>
 			<languageMatch desired="zh_Latn"	supported="zh_Hans"	distance="20"	oneway="true"/> <!-- Pinyin -->
 			<!-- start fallbacks for group script codes, initially from CLDR-13526
                             Look for plus signs on https://www.unicode.org/iso15924/iso15924-codes.html -->
 			<languageMatch desired="ja_Latn"	supported="ja_Jpan"	distance="5"	oneway="true"/>
 			<languageMatch desired="ja_Hani"	supported="ja_Jpan"	distance="5"	oneway="true"/>
 			<languageMatch desired="ja_Hira"	supported="ja_Jpan"	distance="5"	oneway="true"/>
 			<languageMatch desired="ja_Kana"	supported="ja_Jpan"	distance="5"	oneway="true"/>
 			<languageMatch desired="ja_Hrkt"	supported="ja_Jpan"	distance="5"	oneway="true"/>
 			<languageMatch desired="ja_Hira"	supported="ja_Hrkt"	distance="5"	oneway="true"/>
 			<languageMatch desired="ja_Kana"	supported="ja_Hrkt"	distance="5"	oneway="true"/>
 			<languageMatch desired="ko_Hani"	supported="ko_Kore"	distance="5"	oneway="true"/>
 			<languageMatch desired="ko_Hang"	supported="ko_Kore"	distance="5"	oneway="true"/>
 			<languageMatch desired="ko_Jamo"	supported="ko_Kore"	distance="5"	oneway="true"/>
 			<languageMatch desired="ko_Jamo"	supported="ko_Hang"	distance="5"	oneway="true"/>
 			<!-- No special mappings for zh Bopo/Hanb
 			     because Bopomofo is used only in TW, and unsure how widely.
 			     No special mappings for styled scripts like Latf or Aran
 			     because those would apply to many languages;
 			     if desired, those would be better handled as matcher-specific script aliases. -->
 			<!-- end fallbacks for group script codes -->
 			<!-- default script mismatch distance -->
 			<languageMatch desired="*_*"	supported="*_*"	distance="50"/>	<!-- *; * ⇒ *; * -->
 			<languageMatch desired="ar_*_$maghreb"	supported="ar_*_$maghreb"	distance="4"/>	<!-- ar; *; $maghreb ⇒ ar; *; $maghreb -->
 			<languageMatch desired="ar_*_$!maghreb"	supported="ar_*_$!maghreb"	distance="4"/>	<!-- ar; *; $!maghreb ⇒ ar; *; $!maghreb -->
 			<languageMatch desired="ar_*_*"	supported="ar_*_*"	distance="5"/>	<!-- ar; *; * ⇒ ar; *; * -->
 			<languageMatch desired="en_*_$enUS"	supported="en_*_$enUS"	distance="4"/>	<!-- en; *; $enUS ⇒ en; *; $enUS -->
 			<languageMatch desired="en_*_$!enUS"	supported="en_*_GB"	distance="3"/>	<!--  Make en_GB preferred... -->
 			<languageMatch desired="en_*_$!enUS"	supported="en_*_$!enUS"	distance="4"/>	<!-- en; *; $!enUS ⇒ en; *; $!enUS -->
 			<languageMatch desired="en_*_*"	supported="en_*_*"	distance="5"/>	<!-- en; *; * ⇒ en; *; * -->
 			<languageMatch desired="es_*_$americas"	supported="es_*_$americas"	distance="4"/>	<!-- es; *; $americas ⇒ es; *; $americas -->
 			<languageMatch desired="es_*_$!americas"	supported="es_*_$!americas"	distance="4"/>	<!-- es; *; $!americas ⇒ es; *; $!americas -->
 			<languageMatch desired="es_*_*"	supported="es_*_*"	distance="5"/>	<!-- es; *; * ⇒ es; *; * -->
 			<languageMatch desired="pt_*_$americas"	supported="pt_*_$americas"	distance="4"/>	<!-- pt; *; $americas ⇒ pt; *; $americas -->
 			<languageMatch desired="pt_*_$!americas"	supported="pt_*_$!americas"	distance="4"/>	<!-- pt; *; $!americas ⇒ pt; *; $!americas -->
 			<languageMatch desired="pt_*_*"	supported="pt_*_*"	distance="5"/>	<!-- pt; *; * ⇒ pt; *; * -->
 			<languageMatch desired="zh_Hant_$cnsar"	supported="zh_Hant_$cnsar"	distance="4"/>	<!-- zh; Hant; $cnsar ⇒ zh; Hant; $cnsar -->
 			<languageMatch desired="zh_Hant_$!cnsar"	supported="zh_Hant_$!cnsar"	distance="4"/>	<!-- zh; Hant; $!cnsar ⇒ zh; Hant; $!cnsar -->
 			<languageMatch desired="zh_Hant_*"	supported="zh_Hant_*"	distance="5"/>	<!-- zh; Hant; * ⇒ zh; Hant; * -->
 			<languageMatch desired="*_*_*"	supported="*_*_*"	distance="4"/>	<!-- *; *; * ⇒ *; *; * -->
 		</languageMatches>
 	</languageMatching>
 </supplementalData>
--- a/lib/language_data/data/override_language_names.csv
+++ b/lib/language_data/data/override_language_names.csv
@ -0,0 +1,3 @@
 zsm,zsm,bahasa Malaysia
 id,id,bahasa Indonesia
 ms,ms,bahasa Malaysia
--- a/lib/language_data/data/supplementalData.xml
+++ b/lib/language_data/data/supplementalData.xml
--- a/lib/language_data/data/wiktionary/codes-en.csv
+++ b/lib/language_data/data/wiktionary/codes-en.csv
--- a/lib/language_data/language_lists.py
+++ b/lib/language_data/language_lists.py
@ -0,0 +1,89 @@
 # Language codes that CLDR supports at its 'modern' coverage level (the
 # 'full' level contains many more languages). This is the set of languages
 # for which specific name-to-code mappings are stored.
 CLDR_LANGUAGES = {
    "af", "am", "ar", "as", "az", "be", "bg", "bn", "bs", "ca",
    "cs", "cy", "da", "de", "el", "en", "es", "et", "eu", "fa",
    "fi", "fil", "fr", "ga", "gl", "gu", "he", "hi", "hr", "hu",
    "hy", "id", "is", "it", "ja", "jv", "ka", "kk", "km", "kn",
    "ko", "ky", "lo", "lt", "lv", "mk", "ml", "mn", "mr", "ms",
    "my", "nb", "ne", "nl", "or", "pa", "pl", "pt", "ro", "ru",
    "sd", "si", "sk", "sl", "so", "sq", "sr", "sv", "sw", "ta",
    "te", "th", "ti", "tk", "tr", "uk", "und", "ur", "uz", "vi",
    "yue", "zh", "zu",
 }
--- a/lib/language_data/name_data.py
+++ b/lib/language_data/name_data.py
--- a/lib/language_data/names.py
+++ b/lib/language_data/names.py
@ -0,0 +1,112 @@
 # import marisa_trie
 import warnings
 from language_data.util import data_filename
 TRIES = {}
 # Maps a language code to the separator string that language puts between
 # displayed items.
 # This is something we could hypothetically discover from XML files, but
 # we end up learning that most languages separate things with commas, with
 # a few exceptions. We'll just put those exceptions here.
 # NOTE(review): 'und' presumably acts as the fallback for any language that
 # is not listed - confirm against the code that reads this table.
 DISPLAY_SEPARATORS = {
    'am': '፣',
    'ar': '، ',
    'brx': ',',
    'fa': '، ',
    'ja': '、',
    'my': '၊ ',
    'ug': '، ',
    'und': ', ',
    'ur': '، ',
    'yue': '，',
    'zh': '，',
 }
 def normalize_name(name):
    """
    Reduce a name to the canonical form used when looking it up.

    Case and certain punctuation are not significant for a lookup, so
    "Chinese (Traditional)", "Chinese Traditional" and
    "chinese traditional" all normalize to the same string.
    """
    # Applied in one pass over the case-folded text: the typographic
    # apostrophe becomes a plain one, a hyphen becomes a space, and
    # parentheses and commas are dropped.
    substitutions = {
        ord("’"): "'",
        ord("-"): " ",
        ord("("): None,
        ord(")"): None,
        ord(","): None,
    }
    folded = name.casefold().translate(substitutions)
    return folded.strip()
 # def load_trie(filename):
 #     """
 #     Load a BytesTrie from the marisa_trie on-disk format.
 #     """
 #     trie = marisa_trie.BytesTrie()
 #     # marisa_trie raises warnings that make no sense. Ignore them.
 #     with warnings.catch_warnings():
 #         warnings.simplefilter("ignore")
 #         trie.load(filename)
 #     return trie
 def get_trie_value(trie, key):
    """
    Return, as Unicode text, the first value a BytesTrie holds for `key`.

    A KeyError is raised when the trie has no value for that key.
    """
    stored_values = trie[key]
    first_value = stored_values[0]
    return first_value.decode("utf-8")
 def _load_trie(filename):
    """
    Load a BytesTrie from the marisa_trie on-disk format.

    marisa_trie is imported here instead of at module level, so this module
    can be imported when that optional package is not installed. Only a name
    lookup that actually has to read a trie needs it.

    Raises ImportError when marisa_trie is not available.
    """
    try:
        import marisa_trie
    except ImportError as error:
        raise ImportError(
            "Looking up a code by name requires the optional 'marisa-trie' package"
        ) from error
    trie = marisa_trie.BytesTrie()
    # marisa_trie raises warnings that make no sense. Ignore them.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        trie.load(filename)
    return trie
 def name_to_code(category, name, language: str = "und"):
    """
    Get a language, script, or territory by its name in some language.

    The language here must be a string representing a language subtag only.
    The `Language.find` method can handle other representations of a language
    and normalize them to this form.

    The default language, "und", will allow matching names in any language,
    so you can get the code 'fr' by looking up "French", "Français", or
    "francés".

    A small amount of fuzzy matching is supported: if the name can be
    shortened or lengthened to match a single language name, you get that
    language. This allows, for example, "Hakka Chinese" to match "Hakka".

    Occasionally, names are ambiguous in a way that can be resolved by
    specifying what name the language is supposed to be in. For example,
    there is a language named 'Malayo' in English, but it's different from
    the language named 'Malayo' in Spanish (which is Malay). Specifying the
    language will look up the name in a trie that is only in that language.

    Returns the matching code, or None when nothing matches.

    Raises ImportError when the trie for `language` has not been loaded yet
    and the optional marisa_trie package is not installed.
    """
    assert "/" not in language, "Language codes cannot contain slashes"
    assert "-" not in language, "This code should be reduced to a language subtag only"
    trie_name = "{}/name_to_{}".format(language, category)
    if trie_name not in TRIES:
        # Each trie is read from disk once and then kept in TRIES.
        TRIES[trie_name] = _load_trie(data_filename("trie/{}.marisa".format(trie_name)))
    trie = TRIES[trie_name]
    lookup = normalize_name(name)
    if lookup in trie:
        return get_trie_value(trie, lookup)
    # Is this a language name plus extra verbiage? Maybe it has "...isch",
    # "... language", or "... Chinese" attached to it, for example. Look
    # for a matching prefix of the desired name with at least 4 characters.
    prefixes = trie.prefixes(lookup)
    if prefixes and len(prefixes[-1]) >= 4:
        return get_trie_value(trie, prefixes[-1])
    return None
 def code_to_names(code):
    """
    Look up the names of a language, script, or territory code.

    Returns a dictionary of the code's names in various languages, or an
    empty dictionary when there is no entry for the code.
    """
    # Imported on demand rather than at module level, to save memory when
    # this function is never called.
    from language_data import name_data
    names_by_code = name_data.CODE_TO_NAMES
    return names_by_code.get(code, {})
--- a/lib/language_data/population_data.py
+++ b/lib/language_data/population_data.py
--- a/lib/language_data/registry_parser.py
+++ b/lib/language_data/registry_parser.py
@ -0,0 +1,58 @@
 from language_data.util import data_filename
 LIST_KEYS = {'Description', 'Prefix'}
 def parse_file(file):
    """
    Read an open IANA subtag registry file and yield one dictionary of
    information for every subtag it describes.
    """
    pending = []
    for raw_line in file:
        text = raw_line.rstrip('\n')
        if text == '%%':
            # Separator between items: parse what has been collected so far,
            # yield the result, and start collecting the next item.
            yield from parse_item(pending)
            pending.clear()
            continue
        if text.startswith('  '):
            # Continuation line: join it onto the previous line, keeping one
            # of its leading spaces.
            pending[-1] += text[1:]
            continue
        pending.append(text)
    # The final item has no separator after it.
    yield from parse_item(pending)
 def parse_item(lines):
    """
    Parse the lines of a single subtag entry, with wrapped lines already
    joined back together.

    This is a generator: it yields the parsed dictionary once when the entry
    holds a 'Subtag' or 'Tag' field, and yields nothing otherwise (which is
    the case for the file header).
    """
    record = {}
    for entry in lines:
        field, content = entry.split(': ', 1)
        if field not in LIST_KEYS:
            # Any other field is expected to occur once per entry.
            assert field not in record
            record[field] = content
            continue
        # Fields named in LIST_KEYS may repeat, so their values are gathered
        # into a list.
        record.setdefault(field, []).append(content)
    if not ('Subtag' in record or 'Tag' in record):
        return
    yield record
 def parse_registry():
    """
    Yield a dictionary for each entry of the included IANA subtag registry
    file.
    """
    registry_path = data_filename('language-subtag-registry.txt')
    with open(registry_path, encoding='utf-8') as registry:
        # Delegating with 'yield from', rather than returning the generator,
        # keeps the file open until the entries have been consumed.
        yield from parse_file(registry)
--- a/lib/language_data/util.py
+++ b/lib/language_data/util.py
@ -0,0 +1,15 @@
 """
 Used for locating a file in the data directory.
 """
 import os

 # Absolute path of the `data` directory that sits next to this module.
 # It is derived from this file's own location instead of going through
 # `pkg_resources.resource_filename`: setuptools has deprecated
 # `pkg_resources`, and importing it makes this package depend on setuptools
 # being installed at runtime.
 DATA_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')


 def data_filename(filename):
    """
    Given a relative filename, get the full path to that file in the data
    directory.

    The path is built by joining `filename` onto DATA_ROOT; whether the file
    exists is not checked.
    """
    return os.path.join(DATA_ROOT, filename)
--- a/sickgear/webserve.py
+++ b/sickgear/webserve.py
@ -4009,7 +4009,42 @@ class AddShows(Home):
            if all_langs:
                result.extend([lang['sg_lang'] for lang in all_langs if lang['sg_lang'] not in result])
-        return json_dumps({'results': result})
+        try:
            # noinspection PyPep8Naming
            from langcodes import Language as lang_obj, LanguageTagError, standardize_tag
        except ImportError:
            lang_obj = None
        result_ext = []
        if None is not lang_obj:
            prio_abbr = ''
            prio_lang = []
            try:
                lang = lang_obj.get(sickgear.ADD_SHOWS_METALANG)
                prio_abbr = lang.to_alpha3()
                prio_lang = [dict(orig_abbr=sickgear.ADD_SHOWS_METALANG, std_abbr=sickgear.ADD_SHOWS_METALANG,
                                  abbr=prio_abbr, en=lang.display_name(), native=lang.autonym())]
            except (BaseException, Exception) as _:
                pass
            dedupe = []
            for cur_lang in result:
                try:
                    lang = lang_obj.get(cur_lang)
                    abbr = lang.to_alpha3()
                except (BaseException, Exception) as _:
                    continue
                try:
                    std_abbr = standardize_tag(cur_lang, macro=True)
                except (BaseException, Exception) as _:
                    std_abbr = None
                if abbr not in dedupe and abbr != prio_abbr:
                    dedupe += [abbr]
                    result_ext += [dict(orig_abbr=cur_lang, std_abbr=std_abbr, abbr=abbr, en=lang.display_name(), native=lang.autonym())]
            result_ext = prio_lang + sorted(result_ext, key=lambda x: x['en'])
        return json_dumps({'results': [] if result_ext else result, 'results_ext': result_ext})
    @staticmethod
    def generate_show_dir_name(show_name):