Add langcodes 3.3.0 and language-data 1.1 (vendor'd from pip install langcodes[data])

Add a Select2 drop-down to `add-shows` and `edit-show`.
Select2 enables displaying inline language flag images, a feature that has been deprecated for the native `select` drop-down element on some browsers.
Change run existing TVInfo source language lists through validation (removes ~4 bad items), de-dupe list, get the native names, English names, and three letter abbr.
Change remove marisa-trie requirement from language_data/names.py because nothing in SG calls a function that requires it.
Change update some flags.
This commit is contained in:
JackDandy 2023-01-26 03:30:07 +00:00
parent fce8878fa9
commit 9009cc7a7b
36 changed files with 134485 additions and 32 deletions

View file

@ -17,6 +17,7 @@ Libs with customisations...
/lib/hachoir_parser/guess.py /lib/hachoir_parser/guess.py
/lib/hachoir_parser/misc/torrent.py /lib/hachoir_parser/misc/torrent.py
/lib/imdbpie /lib/imdbpie
/lib/language_data/names.py
/lib/lockfile/mkdirlockfile.py /lib/lockfile/mkdirlockfile.py
/lib/rtorrent /lib/rtorrent
/lib/scandir/scandir.py /lib/scandir/scandir.py

View file

@ -1312,6 +1312,9 @@ div.formpaginate{
width:480px; width:480px;
margin-top:0 margin-top:0
} }
#addShowForm #nameToSearch.select2{
width:428px;
}
#addShowForm #nameToSearch.wide{ #addShowForm #nameToSearch.wide{
width:591px; width:591px;
} }
@ -3790,6 +3793,13 @@ option.flag{
background-position:10px 50% background-position:10px 50%
} }
#select2-infosrc-lang-select-container .flag,
#select2-infosrc-lang-select-results .flag{
padding-left:25px;
background-repeat:no-repeat;
background-position:0 50%
}
/* Anime section for editShow */ /* Anime section for editShow */
.anigrouplists-wrapper{ .anigrouplists-wrapper{
height:auto; height:auto;

Binary file not shown.

After

Width:  |  Height:  |  Size: 212 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 287 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 397 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 397 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 324 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 303 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 545 B

After

Width:  |  Height:  |  Size: 303 B

View file

@ -28,6 +28,24 @@
<script type="text/javascript" src="$sbRoot/js/qualityChooser.js?v=$sbPID"></script> <script type="text/javascript" src="$sbRoot/js/qualityChooser.js?v=$sbPID"></script>
<script type="text/javascript" src="$sbRoot/js/editShow.js?v=$sbPID"></script> <script type="text/javascript" src="$sbRoot/js/editShow.js?v=$sbPID"></script>
<script type="text/javascript" src="$sbRoot/js/livepanel.js?v=$sbPID"></script> <script type="text/javascript" src="$sbRoot/js/livepanel.js?v=$sbPID"></script>
<script src="$sbRoot/js/lib/select2.full.min.js"></script>
<link href="$sbRoot/css/lib/select2.css" rel="stylesheet">
<style>
.select2-container{height:32px; font-size:12px; margin-right:6px}
.select2-container .select2-selection--single{height:30px}
.select2-results__group{color: #eee; background-color: rgb(51,51,51)}
.select2-results__options .select2-results__option{color: #222; background-color: #ddd}
.select2-results__options .select2-results__option .ended{color: #888}
.select2-container--default .select2-results > .select2-results__options{max-height: 300px}
#select2-infosrc-lang-select-results .select2-results__option,
#select2-infosrc-lang-select-results .select2-results__group{padding-top: 2px !important; padding-bottom:2px !important}
#select2-infosrc-lang-select-results .select2-results__option--highlighted.select2-results__option--selectable .ended{color:white}
#select2-infosrc-lang-select-results .select2-results__option--selected,
#select2-infosrc-lang-select-results .select2-results__option--selected span{color:rgb(143, 21, 21) !important}
#select2-infosrc-lang-select-results span.flag{width:100%; height:100%; display:block}
</style>
#if $varExists('header') #if $varExists('header')
<h1 class="header"><span class="grey-text">Edit&nbsp;</span>$header</h1> <h1 class="header"><span class="grey-text">Edit&nbsp;</span>$header</h1>
#else #else
@ -244,10 +262,10 @@
</div> </div>
<div class="field-pair"> <div class="field-pair">
<label for="infosrc-lang-select-edit"> <label for="infosrc-lang-select">
<span class="component-title">Info language</span> <span class="component-title">Info language</span>
<span class="component-desc"> <span class="component-desc">
<select name="tvinfo_lang" id="infosrc-lang-select-edit" class="form-control form-control-inline input-sm"></select> <select name="tvinfo_lang" id="infosrc-lang-select" class="form-control form-control-inline input-sm"></select>
<span>fetch show information in this language</span> <span>fetch show information in this language</span>
</span> </span>
</label> </label>

View file

@ -35,6 +35,23 @@
<script type="text/javascript" src="$sbRoot/js/qualityChooser.js?v=$sbPID"></script> <script type="text/javascript" src="$sbRoot/js/qualityChooser.js?v=$sbPID"></script>
<script type="text/javascript" src="$sbRoot/js/newShow.js?v=$sbPID"></script> <script type="text/javascript" src="$sbRoot/js/newShow.js?v=$sbPID"></script>
<script type="text/javascript" src="$sbRoot/js/addShowOptions.js?v=$sbPID"></script> <script type="text/javascript" src="$sbRoot/js/addShowOptions.js?v=$sbPID"></script>
<script src="$sbRoot/js/lib/select2.full.min.js"></script>
<link href="$sbRoot/css/lib/select2.css" rel="stylesheet">
<style>
.select2-container{height:32px; font-size:12px}
.select2-container .select2-selection--single{height:30px}
.select2-results__group{color: #eee; background-color: rgb(51,51,51)}
.select2-results__options .select2-results__option{color: #222; background-color: #ddd}
.select2-results__options .select2-results__option .ended{color: #888}
.select2-container--default .select2-results > .select2-results__options{max-height: 300px}
#select2-infosrc-lang-select-results .select2-results__option,
#select2-infosrc-lang-select-results .select2-results__group{padding-top: 2px !important; padding-bottom:2px !important}
#select2-infosrc-lang-select-results .select2-results__option--highlighted.select2-results__option--selectable .ended{color:white}
#select2-infosrc-lang-select-results .select2-results__option--selected,
#select2-infosrc-lang-select-results .select2-results__option--selected span{color:rgb(143, 21, 21) !important}
#select2-infosrc-lang-select-results span.flag{width:100%; height:100%; display:block}
</style>
#if $varExists('header') #if $varExists('header')
<h1 class="header">$header</h1> <h1 class="header">$header</h1>

View file

@ -16,31 +16,73 @@ $(document).ready(function () {
return ' class="flag" style="background-image:url(' + $.SickGear.Root + '/images/flags/' + lang + '.png)"' return ' class="flag" style="background-image:url(' + $.SickGear.Root + '/images/flags/' + lang + '.png)"'
} }
$.getJSON($.SickGear.Root + '/add-shows/get-infosrc-languages', {}, function (data) { function uriFlag(lang) {
var result = '', currentLangAdded = '', selected = ' selected="selected"'; return $.SickGear.Root + '/images/flags/' + lang + '.png'
}
if (!data.results.length) { $.getJSON($.SickGear.Root + '/add-shows/get-infosrc-languages', {}, function (data) {
result = '<option value="' + config.showLang + '"' + selected + htmlFlag(config.showLang) + '>' var htmlText = '', currentLangAdded = '',
selected = ' selected="selected"', htmlSelected = '',
elInfosrcLang = $('#infosrc-lang-select'),
useSelect2 = 0 < data.results_ext.length, populateItem;
if (!data.results.length && !data.results_ext.length) {
htmlText = '<option value="' + config.showLang + '"' + selected + htmlFlag(config.showLang) + '>'
+ config.showLang + '</option>'; + config.showLang + '</option>';
} else { } else {
currentLangAdded = !1; currentLangAdded = !1;
$.each(data.results, function (index, strLang) { if (useSelect2){
// 3 letter abbr object
$.each(data.results_ext, function (index, obj) {
var htmlSelected = ''; htmlSelected = '';
if (strLang === config.showLang) { if (obj.std_abbr === config.showLang) {
currentLangAdded = !0; currentLangAdded = !0;
htmlSelected = selected; htmlSelected = selected;
} }
result += '<option value="' + strLang + '"' + htmlSelected + htmlFlag(strLang) + '>' htmlText += '<option style="padding-left:25px" value="' + obj.std_abbr + '"'
+ strLang + '</option>'; + ' data-abbr="' + obj.abbr + '"'
}); + ' data-img="' + uriFlag(obj.std_abbr) + '"'
+ ' data-title="' + obj.en + ' (' + obj.orig_abbr + '/' + obj.std_abbr + '/' + obj.abbr + ')' + '"'
+ (!!htmlSelected
? htmlSelected + '>&gt; '
: '>')
+ obj.native
+ '</option>';
});
} else {
// legacy 2 letter abbr list
$.each(data.results, function (index, strLang) {
htmlSelected = '';
if (strLang === config.showLang) {
currentLangAdded = !0;
htmlSelected = selected;
}
htmlText += '<option value="' + strLang + '"' + htmlSelected + htmlFlag(strLang) + '>'
+ strLang + '</option>';
});
}
if (!currentLangAdded) if (!currentLangAdded)
result += '<option value="' + config.showLang + '" ' + selected + '>' + config.showLang + '</option>'; htmlText += '<option value="' + config.showLang + '" ' + selected + '>' + config.showLang + '</option>';
} }
$('#infosrc-lang-select-edit').html(result); elInfosrcLang.html(htmlText);
if (useSelect2) {
populateItem = function (data) {
if (!!data.element)
return $('<span class="flag"'
+ ' style="background-image:url(' + $(data.element).data('img') + ')"'
+ ' title="' + $(data.element).data('title') + '">'
+ data.text
+ '</span>');
return data.text;
}
elInfosrcLang.select2({templateResult: populateItem, templateSelection: populateItem, width: 162});
}
}); });
function getExceptions() { function getExceptions() {

View file

@ -9,35 +9,70 @@ $(document).ready(function () {
return ' class="flag" style="background-image:url(' + $.SickGear.Root + '/images/flags/' + lang + '.png)"' return ' class="flag" style="background-image:url(' + $.SickGear.Root + '/images/flags/' + lang + '.png)"'
} }
function uriFlag(lang) {
return $.SickGear.Root + '/images/flags/' + lang + '.png'
}
function populateLangSelect() { function populateLangSelect() {
if (!$('#nameToSearch').length) if (!$('#nameToSearch').length)
return; return;
if (1 >= $('#infosrc-lang-select').find('option').length) { if (1 >= $('#infosrc-lang-select').find('option').length) {
$.getJSON(sbRoot + '/add-shows/get-infosrc-languages', {}, function (data) { $.getJSON(sbRoot + '/add-shows/get-infosrc-languages', {}, function (data) {
var resultStr = '', flag, var htmlText = '', flag,
selected = ' selected="selected"', selected = ' selected="selected"',
elInfosrcLang = $('#infosrc-lang-select'); elInfosrcLang = $('#infosrc-lang-select'),
useSelect2 = 0 < data.results_ext.length, populateItem;
if (0 === data.results.length) { if (0 === data.results.length && 0 === data.results_ext.length) {
resultStr = '<option value="en"' + selected + '>&gt; en</option>'; htmlText = '<option value="en"' + selected + '>&gt; en</option>';
} else { } else {
$.each(data.results, function (index, obj) { if (useSelect2) {
flag = htmlFlag(obj); $('#nameToSearch').addClass('select2');
resultStr += '<option value="' + obj + '"' // 3 letter abbr object
+ ('' === resultStr $.each(data.results_ext, function (index, obj) {
? flag.replace('"flag', '"flag selected-text') + selected + '>&gt; ' htmlText += '<option style="padding-left:25px" value="' + obj.std_abbr + '"'
: flag + '>') + ' data-abbr="' + obj.abbr + '"'
+ obj + '</option>'; + ' data-img="' + uriFlag(obj.std_abbr) + '"'
}); + ' data-title="' + obj.en + ' (' + obj.orig_abbr + '/' + obj.std_abbr + '/' + obj.abbr + ')' + '"'
+ ('' === htmlText
? selected + '>&gt; '
: '>')
+ obj.native
+ '</option>';
});
} else {
// legacy 2 letter abbr list
$.each(data.results, function (index, obj) {
flag = htmlFlag(obj);
htmlText += '<option value="' + obj + '"'
+ ('' === htmlText
? flag.replace('"flag', '"flag selected-text') + selected + '>&gt; '
: flag + '>')
+ obj + '</option>';
});
}
} }
elInfosrcLang.html(resultStr); elInfosrcLang.html(htmlText);
elInfosrcLang.change(function () { elInfosrcLang.change(function () {
searchIndexers(); searchIndexers();
}); });
if (useSelect2) {
populateItem = function(data) {
if (!!data.element)
return $('<span class="flag"'
+ ' style="background-image:url(' + $(data.element).data('img') + ')"'
+ ' title="' + $(data.element).data('title') + '">'
+ data.text
+ '</span>');
return data.text;
}
elInfosrcLang.select2({templateResult: populateItem, templateSelection: populateItem, width: 155});
}
}); });
} }
} }

1931
lib/langcodes/__init__.py Normal file

File diff suppressed because it is too large Load diff

242
lib/langcodes/build_data.py Normal file
View file

@ -0,0 +1,242 @@
import json
import xml.etree.ElementTree as ET
from langcodes.util import data_filename
from langcodes.registry_parser import parse_registry
def read_cldr_supplemental(dataname):
    """
    Load one CLDR supplemental JSON file and return its payload.

    `dataname` is the base name (without the `.json` extension) of a file in
    the `cldr-core/supplemental` data directory. For 'aliases' the payload is
    found under supplemental -> metadata -> alias; for every other name it is
    found under supplemental -> `dataname`.
    """
    cldr_supp_path = data_filename('cldr-json/cldr-json/cldr-core/supplemental')
    filename = data_filename(f'{cldr_supp_path}/{dataname}.json')
    # Use a context manager so the file handle is closed as soon as the JSON
    # has been parsed, instead of whenever the garbage collector gets to it.
    with open(filename, encoding='utf-8') as infile:
        fulldata = json.load(infile)
    if dataname == 'aliases':
        data = fulldata['supplemental']['metadata']['alias']
    else:
        data = fulldata['supplemental'][dataname]
    return data
def read_iana_registry_suppress_scripts():
    """
    Map each language subtag in the IANA registry that carries a
    'Suppress-Script' field to the value of that field.
    """
    return {
        entry['Subtag']: entry['Suppress-Script']
        for entry in parse_registry()
        if entry['Type'] == 'language' and 'Suppress-Script' in entry
    }
def read_iana_registry_scripts():
    """
    Collect the subtags of every IANA registry entry whose type is 'script'.
    """
    return {
        entry['Subtag']
        for entry in parse_registry()
        if entry['Type'] == 'script'
    }
def read_iana_registry_macrolanguages():
    """
    Map each language subtag in the IANA registry that carries a
    'Macrolanguage' field to the value of that field.
    """
    return {
        entry['Subtag']: entry['Macrolanguage']
        for entry in parse_registry()
        if entry['Type'] == 'language' and 'Macrolanguage' in entry
    }
def read_iana_registry_replacements():
    """
    Build a mapping from deprecated codes to their 'Preferred-Value' in the
    IANA registry.

    Language entries are keyed by their subtag as written; entries that
    describe an entire tag are keyed by the lowercased tag.
    """
    preferred_values = {}
    for entry in parse_registry():
        is_language = entry['Type'] == 'language'
        has_preferred = 'Preferred-Value' in entry
        if is_language and has_preferred:
            # Replacement for a single language code
            preferred_values[entry['Subtag']] = entry['Preferred-Value']
        elif has_preferred and 'Tag' in entry:
            # Replacement for an entire tag
            preferred_values[entry['Tag'].lower()] = entry['Preferred-Value']
    return preferred_values
def write_python_dict(outfile, name, d):
    """
    Write `d` to `outfile` as Python source assigning a dict literal to
    `name`, with one `key: value,` line per entry in sorted key order.
    """
    entries = [f" {key!r}: {d[key]!r}," for key in sorted(d)]
    print("\n".join([f"{name} = {{", *entries, "}"]), file=outfile)
def write_python_set(outfile, name, s):
    """
    Write the items of `s` to `outfile` as Python source assigning a set
    literal to `name`, de-duplicated and with one item per line in sorted
    order.
    """
    members = [f" {key!r}," for key in sorted(set(s))]
    print("\n".join([f"{name} = {{", *members, "}"]), file=outfile)
# Comment line that build_data() writes at the top of the generated data_dicts.py.
GENERATED_HEADER = "# This file is generated by build_data.py."
def read_validity_regex():
    """
    Build a regular expression source string matching every code listed in
    the CLDR validity files for languages, regions, scripts and variants.

    Items in those files are whitespace-separated. An item written as a range
    (for example `aa~c`, where the character before `~` is the start of the
    range and the character after it is the end) becomes a character class
    (`a[a-c]`); any other item is used literally.

    Returns the pattern as a string of the form `^(opt1|opt2|...)$`.
    """
    validity_options = []
    for codetype in ('language', 'region', 'script', 'variant'):
        validity_path = data_filename(f'cldr/common/validity/{codetype}.xml')
        # Read as bytes so the XML parser applies the encoding declared in the
        # document rather than the platform's locale default, and close the
        # handle as soon as the document has been read.
        with open(validity_path, 'rb') as infile:
            root = ET.fromstring(infile.read())
        matches = root.findall('./idValidity/id')
        for match in matches:
            for item in match.text.strip().split():
                if '~' in item:
                    # Ranges are only understood with '~' second from last.
                    assert item[-2] == '~'
                    prefix = item[:-3]
                    range_start = item[-3]
                    range_end = item[-1]
                    option = f"{prefix}[{range_start}-{range_end}]"
                    validity_options.append(option)
                else:
                    validity_options.append(item)
    options = '|'.join(validity_options)
    return f'^({options})$'
def read_language_distances():
    """
    Read the 'written_new' language-matching rules from CLDR's
    languageInfo.xml and return them as a nested dictionary
    `{desired: {supported: distance}}`.

    Only rules whose 'desired' tag has fewer than three '_'-separated parts
    are kept. Rules not marked `oneway="true"` are recorded in both
    directions.
    """
    language_info_path = data_filename('cldr/common/supplemental/languageInfo.xml')
    # Read as bytes so the XML parser applies the encoding declared in the
    # document rather than the platform's locale default, and close the
    # handle as soon as the document has been read.
    with open(language_info_path, 'rb') as infile:
        root = ET.fromstring(infile.read())
    matches = root.findall(
        './languageMatching/languageMatches[@type="written_new"]/languageMatch'
    )
    tag_distances = {}

    for match in matches:
        attribs = match.attrib
        n_parts = attribs['desired'].count('_') + 1
        if n_parts < 3:
            if attribs.get('oneway') == 'true':
                pairs = [(attribs['desired'], attribs['supported'])]
            else:
                pairs = [
                    (attribs['desired'], attribs['supported']),
                    (attribs['supported'], attribs['desired']),
                ]
            for (desired, supported) in pairs:
                desired_distance = tag_distances.setdefault(desired, {})
                desired_distance[supported] = int(attribs['distance'])

                # The 'languageInfo' data file contains distances for the unnormalized
                # tag 'sh', but we work mostly with normalized tags, and they don't
                # describe at all how to cope with this.
                #
                # 'sh' normalizes to 'sr-Latn', and when we're matching languages we
                # aren't matching scripts yet, so when 'sh' appears we'll add a
                # corresponding match for 'sr'.
                #
                # Then because we're kind of making this plan up, add 1 to the distance
                # so it's a worse match than ones that are actually clearly defined
                # in languageInfo.
                if desired == 'sh' or supported == 'sh':
                    if desired == 'sh':
                        desired = 'sr'
                    if supported == 'sh':
                        supported = 'sr'
                    if desired != supported:
                        # don't try to define a non-zero distance for sr <=> sr
                        desired_distance = tag_distances.setdefault(desired, {})
                        desired_distance[supported] = int(attribs['distance']) + 1

    return tag_distances
def build_data():
"""
Regenerate `data_dicts.py` from the IANA subtag registry and CLDR data.

Gathers the tables produced by the `read_*` helpers in this module, merges
the CLDR alias data into per-type replacement tables, and writes everything
as Python literals to `data_dicts.py` (a relative path, so the file lands in
the current working directory).
"""
lang_scripts = read_iana_registry_suppress_scripts()
all_scripts = read_iana_registry_scripts()
macrolanguages = read_iana_registry_macrolanguages()
iana_replacements = read_iana_registry_replacements()
language_distances = read_language_distances()
alias_data = read_cldr_supplemental('aliases')
likely_subtags = read_cldr_supplemental('likelySubtags')
replacements = {}
# Aliased codes can still have alpha3 codes, and there's no unified source
# about what they are. It depends on whether the alias predates or postdates
# ISO 639-2, which nobody should have to care about. So let's set all the
# alpha3 codes for aliased alpha2 codes here.
alpha3_mapping = {
'tl': 'tgl', # even though it normalizes to 'fil'
'in': 'ind',
'iw': 'heb',
'ji': 'yid',
'jw': 'jav',
'sh': 'hbs',
}
alpha3_biblio = {}
norm_macrolanguages = {}
for alias_type in ['languageAlias', 'scriptAlias', 'territoryAlias']:
aliases = alias_data[alias_type]
# Initially populate 'languageAlias' with the aliases from the IANA file
if alias_type == 'languageAlias':
replacements[alias_type] = iana_replacements
replacements[alias_type]['root'] = 'und'
else:
replacements[alias_type] = {}
for code, value in aliases.items():
# Make all keys lowercase so they can be looked up
# case-insensitively
code = code.lower()
# If there are multiple replacements, take the first one. For example,
# we just replace the Soviet Union (SU) with Russia (RU), instead of
# trying to do something context-sensitive and poorly standardized
# that selects one of the successor countries to the Soviet Union.
replacement = value['_replacement'].split()[0]
if value['_reason'] == 'macrolanguage':
norm_macrolanguages[code] = replacement
else:
# CLDR tries to oversimplify some codes as it assigns aliases.
# For example, 'nor' is the ISO alpha3 code for 'no', but CLDR
# would prefer you use 'nb' over 'no', so it makes 'nor' an
# alias of 'nb'. But 'nb' already has an alpha3 code, 'nob'.
#
# We undo this oversimplification so that we can get a
# canonical mapping between alpha2 and alpha3 codes.
if code == 'nor':
replacement = 'no'
elif code == 'mol':
replacement = 'mo'
elif code == 'twi':
replacement = 'tw'
elif code == 'bih':
replacement = 'bh'
replacements[alias_type][code] = replacement
if alias_type == 'languageAlias':
if value['_reason'] == 'overlong':
# Refuse to silently overwrite an alpha3 code that is already recorded.
if replacement in alpha3_mapping:
raise ValueError(
"{code!r} is an alpha3 for {replacement!r}, which"
" already has an alpha3: {orig!r}".format(
code=code,
replacement=replacement,
orig=alpha3_mapping[replacement],
)
)
alpha3_mapping[replacement] = code
elif value['_reason'] == 'bibliographic':
alpha3_biblio[replacement] = code
validity_regex = read_validity_regex()
# Write the contents of data_dicts.py.
with open('data_dicts.py', 'w', encoding='utf-8') as outfile:
print(GENERATED_HEADER, file=outfile)
print("import re\n", file=outfile)
write_python_dict(outfile, 'DEFAULT_SCRIPTS', lang_scripts)
write_python_dict(
outfile, 'LANGUAGE_REPLACEMENTS', replacements['languageAlias']
)
write_python_dict(outfile, 'LANGUAGE_ALPHA3', alpha3_mapping)
write_python_dict(outfile, 'LANGUAGE_ALPHA3_BIBLIOGRAPHIC', alpha3_biblio)
write_python_dict(outfile, 'SCRIPT_REPLACEMENTS', replacements['scriptAlias'])
write_python_set(outfile, 'ALL_SCRIPTS', all_scripts)
write_python_dict(
outfile, 'TERRITORY_REPLACEMENTS', replacements['territoryAlias']
)
write_python_dict(outfile, 'MACROLANGUAGES', macrolanguages)
write_python_dict(outfile, 'NORMALIZED_MACROLANGUAGES', norm_macrolanguages)
write_python_dict(outfile, 'LIKELY_SUBTAGS', likely_subtags)
write_python_dict(outfile, 'LANGUAGE_DISTANCES', language_distances)
# The validity pattern is emitted as a compiled regex, hence the "import re" above.
print(f"VALIDITY = re.compile({validity_regex!r})", file=outfile)
# Running this module as a script regenerates data_dicts.py.
if __name__ == '__main__':
build_data()

File diff suppressed because it is too large Load diff

4377
lib/langcodes/data_dicts.py Normal file

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,188 @@
from .data_dicts import LANGUAGE_DISTANCES
from typing import Dict, Tuple
# A (language, script, territory) triple, as taken by tuple_distance_cached.
TagTriple = Tuple[str, str, str]
# Memo of distances already computed, keyed by (desired, supported).
_DISTANCE_CACHE: Dict[Tuple[TagTriple, TagTriple], int] = {}
# Fallback distances used by _tuple_distance when no more specific rule
# applies; the language and script fallbacks are read from the wildcard
# entries of LANGUAGE_DISTANCES.
DEFAULT_LANGUAGE_DISTANCE = LANGUAGE_DISTANCES["*"]["*"]
DEFAULT_SCRIPT_DISTANCE = LANGUAGE_DISTANCES["*_*"]["*_*"]
DEFAULT_TERRITORY_DISTANCE = 4
# Groups of territory codes consulted when matching territories.

# The western Arab world (Maghreb)
MAGHREB = {"MA", "DZ", "TN", "LY", "MR", "EH"}

# The United States together with its territories
US = {"AS", "GU", "MH", "MP", "PR", "UM", "US", "VI"}

# China's Special Autonomous Regions
CNSAR = {"HK", "MO"}

LATIN_AMERICA = {
    "419",
    # Central America
    "013", "BZ", "CR", "SV", "GT", "HN", "MX", "NI", "PA",
    # South America
    "005", "AR", "BO", "BR", "CL", "CO", "EC", "FK", "GF", "GY", "PY", "PE",
    "SR", "UY", "VE",
}

# Both American continents: everything listed here plus LATIN_AMERICA
AMERICAS = {
    "019",
    # Caribbean
    "029", "AI", "AG", "AW", "BS", "BB", "VG", "BQ", "KY", "CU", "CW", "DM",
    "DO", "GD", "GP", "HT", "JM", "MQ", "MS", "PR", "SX", "BL", "KN", "LC",
    "MF", "VC", "TT", "TC", "VI",
    # Northern America
    "021", "BM", "CA", "GL", "PM", "US",
    # North America as a whole
    "003",
} | LATIN_AMERICA
def tuple_distance_cached(desired: TagTriple, supported: TagTriple) -> int:
    """
    Memoized language-matching distance between two tag triples.

    Both arguments are (language, script, territory) triples, which can be
    derived by 'maximizing' a language tag. The result is a number from 0 to
    135 indicating the 'distance' between them for the purposes of language
    matching.
    """
    # Identical triples need neither a lookup nor a computation.
    if supported == desired:
        return 0

    cache_key = (desired, supported)
    try:
        # Reuse the answer if this pair has been seen before.
        return _DISTANCE_CACHE[cache_key]
    except KeyError:
        pass

    computed = _tuple_distance(desired, supported)
    _DISTANCE_CACHE[cache_key] = computed
    return computed
def _get2(dictionary: dict, key1: str, key2: str, default):
return dictionary.get(key1, {}).get(key2, default)
def _tuple_distance(desired: TagTriple, supported: TagTriple) -> int:
"""
Compute the uncached matching distance between two
(language, script, territory) triples.

The result is the sum of up to three contributions, each added only when
the corresponding components differ: a language distance and a script
distance looked up in LANGUAGE_DISTANCES (with the module defaults as
fallback), and a territory distance chosen by the hand-written rules below.
"""
desired_language, desired_script, desired_territory = desired
supported_language, supported_script, supported_territory = supported
distance = 0
if desired_language != supported_language:
distance += _get2(
LANGUAGE_DISTANCES,
desired_language,
supported_language,
DEFAULT_LANGUAGE_DISTANCE,
)
# Script distances are keyed by "language_script" pairs, not by script alone.
desired_script_pair = f"{desired_language}_{desired_script}"
supported_script_pair = f"{supported_language}_{supported_script}"
if desired_script != supported_script:
# Scripts can match other scripts, but only when paired with a
# language. For example, there is no reason to assume someone who can
# read 'Latn' can read 'Cyrl', but there is plenty of reason to believe
# someone who can read 'sr-Latn' can read 'sr-Cyrl' because Serbian is
# a language written in two scripts.
distance += _get2(
LANGUAGE_DISTANCES,
desired_script_pair,
supported_script_pair,
DEFAULT_SCRIPT_DISTANCE,
)
if desired_territory != supported_territory:
# The rules for matching territories are too weird to implement the
# general case efficiently. Instead of implementing all the possible
# match rules the XML could define, instead we just reimplement the
# rules of CLDR 36.1 here in code.
tdist = DEFAULT_TERRITORY_DISTANCE
if desired_script_pair == supported_script_pair:
if desired_language == "ar":
if (desired_territory in MAGHREB) != (supported_territory in MAGHREB):
tdist = 5
elif desired_language == "en":
if (desired_territory == "GB") and (supported_territory not in US):
tdist = 3
elif (desired_territory not in US) and (supported_territory == "GB"):
tdist = 3
elif (desired_territory in US) != (supported_territory in US):
tdist = 5
# This is not a rule that's spelled out in CLDR, but is implied by things
# about territory containment mentioned in other standards. Numeric values
# for territories, like '003', represent broad regions that contain more
# specific territories.
#
# 419 is the numeric value most often seen in language codes, particularly
# 'es-419' for Latin American Spanish. If you have a language code that
# differs only in that its territory is more specific, like 'es-PY', it should
# be closer to a supported 'es-419' than anything with a territory difference.
#
# We can implement this for 419 without becoming responsible for keeping up
# with which countries/territories/regions contain others in the general case.
elif desired_territory in LATIN_AMERICA and supported_territory == "419":
tdist = 1
elif desired_language == "es" or desired_language == "pt":
if (desired_territory in AMERICAS) != (supported_territory in AMERICAS):
tdist = 5
elif desired_script_pair == "zh_Hant":
if (desired_territory in CNSAR) != (supported_territory in CNSAR):
tdist = 5
distance += tdist
return distance

View file

@ -0,0 +1,517 @@
# Language codes that CLDR supports at the 'modern' level (the 'full' level
# covers many more languages). These are the languages for which specific
# name-to-code mappings are stored.
CLDR_LANGUAGES = {
    'af', 'am', 'ar', 'az',
    'be', 'bg', 'bn', 'bs',
    'ca', 'cs', 'cy',
    'da', 'de',
    'el', 'en', 'es', 'et', 'eu',
    'fa', 'fi', 'fil', 'fo', 'fr',
    'ga', 'gl', 'gu',
    'he', 'hi', 'hr', 'hu', 'hy',
    'id', 'is', 'it',
    'ja',
    'ka', 'kk', 'km', 'kn', 'ko', 'ky',
    'lo', 'lt', 'lv',
    'mk', 'ml', 'mn', 'mr', 'ms', 'my',
    'nb', 'ne', 'nl',
    'pa', 'pl', 'pt',
    'ro', 'ru',
    'si', 'sk', 'sl', 'sq', 'sr', 'sv', 'sw',
    'ta', 'te', 'th', 'ti', 'to', 'tr',
    'uk', 'und', 'ur', 'uz',
    'vi',
    'yue',
    'zh', 'zu',
}
# These are the names of the languages that have the most entries on the
# English and German Wiktionaries. Wiktionary only consistently identifies
# languages by their name, making it important to be able to recognize the
# names.
#
# These lists of names are used in `tests/test_wikt_languages.py`.
#
# Keyed by the code of the Wiktionary edition ('en', 'de'); each value is a
# list of language names as written in that edition.
WIKT_LANGUAGE_NAMES = {}
WIKT_LANGUAGE_NAMES['en'] = [
"Spanish",
"French",
"Latvian",
"Latin",
"English",
"Mandarin",
"Italian",
"Portuguese",
"Cantonese",
"Japanese",
"German",
"Swedish",
"Korean",
"Serbo-Croatian",
"Serbian",
"Croatian",
"Bosnian",
"Finnish",
"Vietnamese",
"Dutch",
"Galician",
"Catalan",
"Polish",
"Danish",
"Norwegian Nynorsk",
"Turkish",
"Romanian",
"Lithuanian",
"Ido",
"Old French",
"Czech",
"Norwegian",
# Jèrriais -- same as Norman
"Esperanto",
"Icelandic",
# Old Armenian
"Norwegian Bokmål",
"Asturian",
"Hungarian",
"Proto-Germanic",
"Russian",
"Slovene",
"Min Nan",
"Scottish Gaelic",
"Greek",
"Irish",
"Lojban",
"Middle French",
"Malay",
"Luxembourgish",
"Slovak",
"Estonian",
"Persian",
"Venetian",
"Old English",
"Volapük",
"Ladin",
"Faroese",
"Scots",
"Interlingua",
"Romansch",
"Urdu",
# Middle Chinese
"Indonesian",
"Swahili",
"Middle English",
"Occitan",
"Welsh",
"Old Norse",
"Albanian",
"Old Irish",
"Old Saxon",
"Lower Sorbian",
"Afrikaans",
"Ukrainian",
"Proto-Slavic",
"Ancient Greek",
"Gothic",
"Hawaiian",
"Kurdish",
"Tagalog",
"Old High German",
"Crimean Tatar",
"Manx",
"Sanskrit",
"Hiligaynon",
"West Frisian",
"Hebrew",
"Tok Pisin",
"Proto-Indo-European",
"Macedonian",
"Novial",
"Armenian",
"Arabic",
"Maltese",
"Hakka",
"Sicilian",
"Ladino",
"Basque",
"Breton",
# Guernésiais -- same as Norman
"Vai",
"Navajo",
"Azeri",
"Vilamovian",
# Tarantino
"Maori",
"Friulian",
"Hausa",
"Haitian Creole",
"Yiddish",
"Tatar",
"Proto-Malayo-Polynesian",
"Aromanian",
"Ottoman Turkish",
"Old Provençal",
"Northern Sami",
"Dalmatian",
"Bulgarian",
"Neapolitan",
"Cornish",
"Middle Dutch",
"Rapa Nui",
# Old Portuguese
"Egyptian Arabic",
"Romani",
"Tahitian",
"Thai",
"Limburgish",
"Karelian",
"Tajik",
"Turkmen",
"Kabardian",
"Uzbek",
"Samoan",
"Mongolian",
"Zulu",
"Upper Sorbian",
"Walloon",
# Proto-Finnic
"Frankish",
"Mapudungun",
"Pashto",
"Low German",
"Bashkir",
"Kashubian",
"Sranan Tongo",
"Proto-Sino-Tibetan",
"Norman",
"Proto-Austronesian",
"Marathi",
"Rohingya",
"Classical Nahuatl",
# Proto-Malayic
# German Low German
"Fijian",
"Zazaki",
"Proto-Italic",
"Old Dutch",
"Egyptian",
"Old Frisian",
"Greenlandic",
"Burmese",
"Votic",
"Ewe",
"Cherokee",
"Old Church Slavonic",
"Quechua",
"Mirandese",
"Livonian",
"Bengali",
"Skolt Sami",
# Proto-Balto-Slavic
"Pitjantjatjara",
"Georgian",
"North Frisian",
"Tetum",
"Tongan",
# Mauritian Creole
"Torres Strait Creole",
"Papiamentu",
"Lao",
"Malagasy",
"Interlingue",
"Aragonese",
"Istriot",
"Sumerian",
"Proto-Celtic",
"Võro",
# Proto-Polynesian
"Nepali",
"Chickasaw",
"Akkadian",
"Middle Armenian",
"Cimbrian",
"Somali",
"Sardinian",
"Tocharian B",
"Telugu",
"Javanese",
"Taos",
"Proto-Semitic",
# Old Prussian
"Kyrgyz",
"Corsican",
"Veps",
"Baluchi",
"Middle Low German",
"Middle High German",
"Uyghur",
# Dutch Low Saxon
"Belarusian",
"Guaraní",
"Undetermined",
"Inuktitut",
"Tocharian A",
"Nigerian Pidgin",
# Gallo
# Saterland Frisian
"Punjabi",
"Proto-Algonquian",
# Istro-Romanian
"Wiradhuri",
"Sichuan Yi",
"Wu",
# White Hmong
"Ugaritic",
"Sundanese",
# Old East Slavic
# Fala
# Elfdalian
"Tamil",
"Pijin",
"Okinawan",
"Kazakh",
"Hindi",
"Tuvan",
"Polabian",
"Aramaic",
"Malayalam",
"Kumyk",
"Inari Sami",
"Ilocano",
"Tswana",
"Libyan Arabic",
"Latgalian",
"Yakut",
"Sindhi",
"Khmer",
"Gamilaraay",
"Ojibwe",
"Choctaw",
"Chinese",
"Chamorro",
"Yucatec Maya",
"Picard",
"Ngarrindjeri",
"Kott",
"Ingrian",
# Crimean Gothic
"Chamicuro",
"Rajasthani",
# Old Tupi
"Old Spanish",
"Gagauz",
"Extremaduran",
"Chinook Jargon",
"Cahuilla",
"Kannada",
"Iban",
"American Sign Language",
"Adyghe",
"Warlpiri",
"Tibetan",
"Ossetian",
"Meriam",
"Marshallese",
"Khakas",
"Balinese",
"Zhuang",
"Tuvaluan",
"Niuean",
"Martuthunira",
"Guugu Yimidhirr",
"Chechen",
"Campidanese Sardinian",
"Tolai",
# Old Javanese
"Nahuatl",
"Lombard",
"West Coast Bajau",
"Romagnol",
"Middle Irish",
"Yoruba",
"Wangaaybuwan-Ngiyambaa",
# Old Swedish
"Lingala",
"Fiji Hindi",
"Shabo",
"Sasak",
"Judeo-Arabic",
"Central Kurdish",
"Bislama",
]
WIKT_LANGUAGE_NAMES['de'] = [
"Deutsch",
"Englisch",
"Polnisch",
"Italienisch",
"Französisch",
"Esperanto",
"Schwedisch",
"Lateinisch",
"Tschechisch",
"Katalanisch",
"Spanisch",
"Okzitanisch",
"Ungarisch",
"Litauisch",
"Finnisch",
"Russisch",
"Altgriechisch",
"Niederländisch",
"Kurdisch",
"Baskisch",
"Armenisch",
"Isländisch",
"Bulgarisch",
"Färöisch",
"Dänisch",
"Portugiesisch",
"Slowakisch",
"Türkisch",
"Maori",
"Albanisch",
"Japanisch",
"Norwegisch",
"Irisch",
"Koreanisch",
"Chinesisch",
"Venezianisch",
"Friaulisch",
"Serbisch",
"Indonesisch",
"Walisisch",
"Arabisch",
"Zentral-Nahuatl",
"Neugriechisch",
"Sumerisch",
"Obersorbisch",
"Sesotho",
"Rumänisch",
"Suaheli",
"Persisch",
"Krimtatarisch",
"Plattdeutsch",
"Prußisch",
"Thai",
"Bosnisch",
"Sardisch",
"Maltesisch",
"Akkadisch",
"Hawaiianisch",
"Hebräisch",
"Gotisch",
"Afrikaans",
"Rätoromanisch",
"Tamil",
"Bretonisch",
"Ukrainisch",
"Hindi",
"Georgisch",
"Panjabi",
"Papiamentu",
"Slowenisch",
"Nauruisch",
"Schottisch-Gälisch",
"Balinesisch",
"Estnisch",
"Manx",
"Korsisch",
# "Frühneuhochdeutsch",
"Lettisch",
"isiZulu",
"Tagalog",
"Tok Pisin",
# "Südpikenisch",
"Kroatisch",
"Niedersorbisch",
"Kannada",
"Guanche",
"Weißrussisch",
"Sanskrit",
"Aserbaidschanisch",
"Mittelhochdeutsch",
"Laotisch",
"Altnordisch",
"Altenglisch",
"Vietnamesisch",
"Tadschikisch",
"Samoanisch",
"Mazedonisch",
"Luxemburgisch",
"Hethitisch",
# "Yukatekisch",
"Kaschubisch",
"Wallonisch",
# "Klassisches Nahuatl",
"Telugu",
"Rapanui",
"Jiddisch",
"Ido",
# "Galicisch",
"Volapük",
"Bengalisch",
"Mapudungun",
"Lojban",
"Tuvaluisch",
"Gujarati",
"Assamesisch",
]

View file

@ -0,0 +1,59 @@
from langcodes.util import data_filename
# Registry fields that may occur more than once within a single entry.
# parse_item() gathers the values of these fields into a list; every other
# field is stored as a single string and must not repeat.
LIST_KEYS = {'Description', 'Prefix'}
def parse_file(file):
    """
    Read an open IANA subtag registry file and yield one dictionary per
    subtag entry found in it.

    Entries are separated by lines consisting of '%%'. A line that begins
    with a space continues the previous line.
    """
    pending = []
    for raw in file:
        text = raw.rstrip('\n')
        if text.startswith(' '):
            # Wrapped line: glue it onto the line before it, dropping only
            # the first leading space so the words stay separated.
            pending[-1] += text[1:]
        elif text == '%%':
            # End of an entry: hand the collected lines to the item parser
            # and start collecting afresh.
            yield from parse_item(pending)
            pending = []
        else:
            pending.append(text)

    # The last entry is not followed by a separator.
    yield from parse_item(pending)
def parse_item(lines):
    """
    Turn the (already un-wrapped) lines of one registry entry into a
    dictionary.

    Each line has the form 'Key: value'. Keys listed in LIST_KEYS map to a
    list of every value seen; all other keys map to a single string and may
    appear only once.

    This is a generator: it yields the dictionary once when the entry
    describes a 'Subtag' or a 'Tag', and yields nothing otherwise (for
    example, for the file header).
    """
    record = {}
    for entry in lines:
        field, content = entry.split(': ', 1)
        if field not in LIST_KEYS:
            # Single-valued field: a repeat would silently overwrite data.
            assert field not in record
            record[field] = content
            continue
        if field not in record:
            record[field] = []
        record[field].append(content)

    if any(marker in record for marker in ('Subtag', 'Tag')):
        yield record
def parse_registry():
    """
    Yield one dictionary per entry of the bundled IANA subtag registry
    file ('language-subtag-registry.txt').
    """
    registry_path = data_filename('language-subtag-registry.txt')
    with open(registry_path, encoding='utf-8') as registry:
        # Delegate with 'yield from' rather than returning the generator,
        # so the file stays open until the caller has consumed everything.
        yield from parse_file(registry)

422
lib/langcodes/tag_parser.py Normal file
View file

@ -0,0 +1,422 @@
"""
This module implements a parser for language tags, according to the RFC 5646
(BCP 47) standard.
Here, we're only concerned with the syntax of the language tag. Looking up
what they actually mean in a data file is a separate step.
For a full description of the syntax of a language tag, see page 3 of
http://tools.ietf.org/html/bcp47
>>> parse_tag('en')
[('language', 'en')]
>>> parse_tag('en_US')
[('language', 'en'), ('territory', 'US')]
>>> parse_tag('en-Latn')
[('language', 'en'), ('script', 'Latn')]
>>> parse_tag('es-419')
[('language', 'es'), ('territory', '419')]
>>> parse_tag('zh-hant-tw')
[('language', 'zh'), ('script', 'Hant'), ('territory', 'TW')]
>>> parse_tag('zh-tw-hant')
Traceback (most recent call last):
...
langcodes.tag_parser.LanguageTagError: This script subtag, 'hant', is out of place. Expected variant, extension, or end of string.
>>> parse_tag('de-DE-1901')
[('language', 'de'), ('territory', 'DE'), ('variant', '1901')]
>>> parse_tag('ja-latn-hepburn')
[('language', 'ja'), ('script', 'Latn'), ('variant', 'hepburn')]
>>> parse_tag('ja-hepburn-latn')
Traceback (most recent call last):
...
langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string.
>>> parse_tag('zh-yue')
[('language', 'zh'), ('extlang', 'yue')]
>>> parse_tag('zh-yue-Hant')
[('language', 'zh'), ('extlang', 'yue'), ('script', 'Hant')]
>>> parse_tag('zh-min-nan')
[('grandfathered', 'zh-min-nan')]
>>> parse_tag('x-dothraki')
[('language', 'x-dothraki')]
>>> parse_tag('en-u-co-backward-x-pig-latin')
[('language', 'en'), ('extension', 'u-co-backward'), ('private', 'x-pig-latin')]
>>> parse_tag('en-x-pig-latin-u-co-backward')
[('language', 'en'), ('private', 'x-pig-latin-u-co-backward')]
>>> parse_tag('u-co-backward')
Traceback (most recent call last):
...
langcodes.tag_parser.LanguageTagError: Expected a language code, got 'u'
>>> parse_tag('x-')
Traceback (most recent call last):
...
langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got ''
>>> parse_tag('und-u-')
Traceback (most recent call last):
...
langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got ''
>>> parse_tag('und-0-foo')
[('language', 'und'), ('extension', '0-foo')]
>>> parse_tag('und-?-foo')
Traceback (most recent call last):
...
langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '?'
>>> parse_tag('und-x-123456789')
Traceback (most recent call last):
...
langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '123456789'
>>> parse_tag('en-a-b-foo')
Traceback (most recent call last):
...
langcodes.tag_parser.LanguageTagError: Tag extensions may not contain two singletons in a row
>>> parse_tag('ar-٠٠١')
Traceback (most recent call last):
...
langcodes.tag_parser.LanguageTagError: Language tags must be made of ASCII characters
"""
# These tags should not be parsed by the usual parser; they're grandfathered
# in from RFC 3066. The 'irregular' ones don't fit the syntax at all; the
# 'regular' ones do, but would give meaningless results when parsed.
#
# These are all lowercased so they can be matched case-insensitively, as the
# standard requires.
#
# parse_tag() checks membership in this set after normalizing the tag, and
# returns a matching tag whole as a single ('grandfathered', tag) item.
EXCEPTIONS = {
# Irregular exceptions
"en-gb-oed",
"i-ami",
"i-bnn",
"i-default",
"i-enochian",
"i-hak",
"i-klingon",
"i-lux",
"i-mingo",
"i-navajo",
"i-pwn",
"i-tao",
"i-tay",
"i-tsu",
"sgn-be-fr",
"sgn-be-nl",
"sgn-ch-de",
# Regular exceptions
"art-lojban",
"cel-gaulish",
"no-bok",
"no-nyn",
"zh-guoyu",
"zh-hakka",
"zh-min",
"zh-min-nan",
"zh-xiang",
}
# Define the order of subtags as integer constants, but also give them names
# so we can describe them in error messages
EXTLANG, SCRIPT, TERRITORY, VARIANT, EXTENSION = range(5)
# Names indexed by the constants above. The final entry, 'end of string', has
# no constant of its own; it shows up only when order_error() lists what
# could legally have come next.
SUBTAG_TYPES = [
'extlang',
'script',
'territory',
'variant',
'extension',
'end of string',
]
def _is_ascii(s):
"""
Determine whether a tag consists of ASCII characters.
"""
# When Python 3.6 support is dropped, we can replace this with str.isascii().
try:
s.encode('ascii')
return True
except UnicodeEncodeError:
return False
def normalize_characters(tag):
    """
    Put a language tag into a canonical spelling for exact comparison.

    BCP 47 tags are case-insensitive, and CLDR treats underscores as
    equivalent to hyphens, so the result is lowercase with hyphens only.

    >>> normalize_characters('en_US')
    'en-us'
    >>> normalize_characters('zh-Hant_TW')
    'zh-hant-tw'
    """
    hyphenated = tag.replace('_', '-')
    return hyphenated.lower()
def parse_tag(tag):
    """
    Split a language tag into a list of (type, value) tuples based purely on
    its syntax. Nothing is looked up in the registry at this stage; the
    result says what will need to be looked up later.

    Raises LanguageTagError when the tag contains non-ASCII characters, when
    any subtag is not 1-8 alphanumeric characters, when the tag is just 'x',
    or when the first subtag cannot be a language code.
    """
    if not _is_ascii(tag):
        raise LanguageTagError("Language tags must be made of ASCII characters")

    tag = normalize_characters(tag)
    if tag in EXCEPTIONS:
        # Grandfathered tags are returned whole, never taken apart.
        return [('grandfathered', tag)]

    subtags = tag.split('-')

    # Every subtag must have the basic shape: 1-8 alphanumeric characters.
    for piece in subtags:
        if not (1 <= len(piece) <= 8 and piece.isalnum()):
            raise LanguageTagError(
                f"Expected 1-8 alphanumeric characters, got {piece!r}"
            )

    # The first subtag is special: it is either the language code, or 'x'
    # marking the whole tag as private-use. Later subtags are told apart by
    # length and format; this one is identified by position alone.
    head = subtags[0]
    if head == 'x':
        if len(subtags) == 1:
            raise LanguageTagError("'x' is not a language tag on its own")
        # A fully private-use tag still occupies the "language" slot.
        return [('language', tag)]

    if 2 <= len(head) <= 4:
        # Language codes should be 2 or 3 letters; 4-letter codes are
        # tolerated for legacy Unicode reasons.
        return [('language', head)] + parse_subtags(subtags[1:])

    subtag_error(head, 'a language code')
def parse_subtags(subtags, expect=EXTLANG):
    """
    Parse the subtags that follow the language code: extended languages,
    script, territory, variants and extensions.

    `expect` is the earliest subtag type (one of the integer constants
    EXTLANG..EXTENSION) that may legally appear next; anything later is
    also acceptable because every one of these parts is optional.

    The function handles the first subtag and recurses on the remainder,
    returning a list of (type name, value) tuples. Errors are reported by
    subtag_error() and order_error(), which raise LanguageTagError.
    """
    # Base case of the recursion: nothing left to parse.
    if not subtags:
        return []

    head = subtags[0]
    size = len(head)

    if size == 1:
        # A singleton opens an extension, which has its own sub-structure.
        # No ordering check is needed since extensions always come last.
        if head.isalnum():
            return parse_extension(subtags)
        subtag_error(head)

    if size == 3 and head.isalpha():
        # Three letters: an extended language. Up to three may occur in a
        # row, which parse_extlang() takes care of -- provided we are still
        # at the point where extlangs are allowed.
        if expect <= EXTLANG:
            return parse_extlang(subtags)
        order_error(head, EXTLANG, expect)

    # Classify the remaining shapes by length and character class. `kind`
    # stays None when the subtag matches no known shape.
    if size == 2:
        # Two letters: a territory.
        kind = TERRITORY if head.isalpha() else None
    elif size == 3:
        # Three digits: a territory naming a broad region, such as 419.
        kind = TERRITORY if head.isdigit() else None
    elif size == 4:
        if head.isalpha():
            # Four letters: a script.
            kind = SCRIPT
        elif head[0].isdigit():
            # Four characters beginning with a digit: a variant.
            kind = VARIANT
        else:
            kind = None
    else:
        # Everything else (lengths 5-8 in well-formed tags) is a variant.
        kind = VARIANT

    if kind is None:
        # The subtag has no shape the standard recognizes.
        subtag_error(head)
    if kind < expect:
        # This type of subtag should have appeared earlier in the tag.
        order_error(head, kind, expect)

    # At most one script and one territory are allowed, so after either of
    # those the next subtag must be of a strictly later type. Other types
    # may repeat.
    next_expect = kind + 1 if kind in (SCRIPT, TERRITORY) else kind

    label = SUBTAG_TYPES[kind]

    # Apply the conventional capitalization: 'Latn' for scripts, 'US' for
    # territories.
    if kind == SCRIPT:
        head = head.title()
    elif kind == TERRITORY:
        head = head.upper()

    return [(label, head)] + parse_subtags(subtags[1:], next_expect)
def parse_extlang(subtags):
    """
    Parse an 'extended language' tag, which consists of 1 to 3 three-letter
    language codes.

    Extended languages are used for distinguishing dialects/sublanguages
    (depending on your view) of macrolanguages such as Arabic, Bahasa Malay,
    and Chinese.

    It's supposed to also be acceptable to just use the sublanguage as the
    primary language code, and your code should know what's a macrolanguage of
    what. For example, 'zh-yue' and 'yue' are the same language (Cantonese),
    and differ only in whether they explicitly spell out that Cantonese is a
    kind of Chinese.

    `subtags` is the list of remaining subtags, whose first element has
    already been recognized as an extlang by the caller. Returns the list of
    ('extlang', value) tuples followed by whatever parse_subtags() makes of
    the rest, starting from the script position.
    """
    index = 0
    parsed = []
    # BCP 47 defines an extlang as exactly three *letters*. Checking the
    # length alone would swallow a three-digit region code that follows an
    # extlang (as in 'zh-yue-419') and mislabel it as another extlang.
    while (
        index < len(subtags)
        and index < 3
        and len(subtags[index]) == 3
        and subtags[index].isalpha()
    ):
        parsed.append(('extlang', subtags[index]))
        index += 1
    return parsed + parse_subtags(subtags[index:], SCRIPT)
def parse_extension(subtags):
    """
    Parse an extension: a one-character 'singleton' subtag followed by
    further subtags. Extensions are part of the BCP 47 syntax, but what they
    mean is defined outside the standard (the u- extension, for instance,
    carries Unicode properties).

    The singleton 'x' starts a private-use section that swallows the rest of
    the tag. Any other singleton's extension ends just before the next
    singleton.

    Raises LanguageTagError when the singleton is the final subtag, or when
    it is immediately followed by another singleton.
    """
    singleton = subtags[0]
    if len(subtags) == 1:
        raise LanguageTagError(f"The subtag {singleton!r} must be followed by something")

    if singleton == 'x':
        # Private use: everything from here on is arbitrary and cannot be
        # looked up, so keep it as one opaque value.
        return [('private', '-'.join(subtags))]

    # Position of the next singleton, or the end of the list if none.
    boundary = next(
        (pos for pos in range(1, len(subtags)) if len(subtags[pos]) == 1),
        len(subtags),
    )
    if boundary == 1:
        raise LanguageTagError(
            "Tag extensions may not contain two singletons in a row"
        )

    # One complete extension has been consumed. Hand the remainder back to
    # parse_subtags, where only further extensions are acceptable now.
    extension = '-'.join(subtags[:boundary])
    return [('extension', extension)] + parse_subtags(subtags[boundary:], EXTENSION)
class LanguageTagError(ValueError):
    """Raised when a language tag does not follow the BCP 47 syntax."""
def order_error(subtag, got, expected):
    """
    Raise a LanguageTagError saying that `subtag` appeared out of order.

    `got` is the type of subtag that was found and `expected` the earliest
    type that was acceptable at that point, both as indexes into
    SUBTAG_TYPES. The message lists, in English, every type from `expected`
    onward.
    """
    remaining = SUBTAG_TYPES[expected:]
    count = len(remaining)
    if count == 1:
        expect_str = remaining[0]
    elif count == 2:
        expect_str = ' or '.join(remaining)
    else:
        # Three or more: comma-separated, with ", or" before the last one.
        expect_str = ', '.join(remaining[:-1]) + f', or {remaining[-1]}'

    got_str = SUBTAG_TYPES[got]
    message = (
        f"This {got_str} subtag, {subtag!r}, is out of place. "
        f"Expected {expect_str}."
    )
    raise LanguageTagError(message)
def subtag_error(subtag, expected='a valid subtag'):
    """
    Raise a LanguageTagError reporting that `subtag` was found where
    `expected` (a short English description) should have been.
    """
    message = f"Expected {expected}, got {subtag!r}"
    raise LanguageTagError(message)

8
lib/langcodes/util.py Normal file
View file

@ -0,0 +1,8 @@
from pkg_resources import resource_filename
# Path of the 'data' resource inside the 'langcodes' package, as resolved by
# pkg_resources. data_filename() joins file names onto this.
DATA_ROOT = resource_filename('langcodes', 'data')
import os
def data_filename(filename):
    """Return the path of `filename` joined onto DATA_ROOT."""
    path_parts = (DATA_ROOT, filename)
    return os.path.join(*path_parts)

View file

View file

@ -0,0 +1,68 @@
en,av,Avar
en,frr,North Frisian
en,frs,East Frisian
en,fy,West Frisian
en,gn,Guaraní
en,ilo,Ilocano
en,jam,Jamaican Creole
en,kky,Guugu Yimidhirr
en,kky,Guugu Yimithirr
en,ksd,Tolai
en,liv,Livonian
en,nay,Ngarrindjeri
en,nmn,ǃXóõ
en,nrf,Norman
en,oj,Ojibwe
en,pap,Papiamentu
en,pms,Piedmontese
en,rap,Rapa Nui
en,rm,Romansch
en,rom,Romani
en,ryu,Okinawan
en,sl,Slovene
en,st,Sesotho
en,tvl,Tuvaluan
en,twf,Taos
en,txb,Tocharian B
en,tyv,Tuvan
en,vma,Martuthunira
en,wym,Vilamovian
en,xto,Tocharian A
en,zu,isiZulu
de,el,Neugriechisch
de,la,Lateinisch
de,fur,Friaulisch
de,gd,Schottisch-Gälisch
de,haw,Hawaiianisch
de,nds,Plattdeutsch
de,nhn,Zentral-Nahuatl
de,pa,Panjabi
de,pap,Papiamentu
de,prg,Prußisch
de,vec,Venezianisch
de,tvl,Tuvaluisch
sh,sh,Srpskohrvatski
la,la,Lingua latina
ceb,ceb,Sinugbuanong Binisayâ
ceb,ceb,Bisayâ
ceb,ceb,Bisaya
lah,lah,لہندا پنجابی
bho,bho,भोजपुरी
ang,ang,Ænglisc
vo,vo,Volapük
io,io,Ido
jbo,jbo,lojban
jbo,jbo,lojbau
rup,rup,armãneashti
nv,nv,Diné bizaad
zh-Hant,nan,閩南語
zh-Hans,nan,闽南语
nan-Latn,nan,Bân-lâm-gú
zh-Hant,hak,客家語
zh-Hans,hak,客家语
ilo,ilo,Ilokano
hil,hil,Ilonggo
nah,nah,Nāhuatl
tpi,tpi,Tok Pisin
ve,ve,tshiVenḓa
kcm,kcm,Kristang
1 en av Avar
2 en frr North Frisian
3 en frs East Frisian
4 en fy West Frisian
5 en gn Guaraní
6 en ilo Ilocano
7 en jam Jamaican Creole
8 en kky Guugu Yimidhirr
9 en kky Guugu Yimithirr
10 en ksd Tolai
11 en liv Livonian
12 en nay Ngarrindjeri
13 en nmn ǃXóõ
14 en nrf Norman
15 en oj Ojibwe
16 en pap Papiamentu
17 en pms Piedmontese
18 en rap Rapa Nui
19 en rm Romansch
20 en rom Romani
21 en ryu Okinawan
22 en sl Slovene
23 en st Sesotho
24 en tvl Tuvaluan
25 en twf Taos
26 en txb Tocharian B
27 en tyv Tuvan
28 en vma Martuthunira
29 en wym Vilamovian
30 en xto Tocharian A
31 en zu isiZulu
32 de el Neugriechisch
33 de la Lateinisch
34 de fur Friaulisch
35 de gd Schottisch-Gälisch
36 de haw Hawaiianisch
37 de nds Plattdeutsch
38 de nhn Zentral-Nahuatl
39 de pa Panjabi
40 de pap Papiamentu
41 de prg Prußisch
42 de vec Venezianisch
43 de tvl Tuvaluisch
44 sh sh Srpskohrvatski
45 la la Lingua latina
46 ceb ceb Sinugbuanong Binisayâ
47 ceb ceb Bisayâ
48 ceb ceb Bisaya
49 lah lah لہندا پنجابی
50 bho bho भोजपुरी
51 ang ang Ænglisc
52 vo vo Volapük
53 io io Ido
54 jbo jbo lojban
55 jbo jbo lojbau
56 rup rup armãneashti
57 nv nv Diné bizaad
58 zh-Hant nan 閩南語
59 zh-Hans nan 闽南语
60 nan-Latn nan Bân-lâm-gú
61 zh-Hant hak 客家語
62 zh-Hans hak 客家语
63 ilo ilo Ilokano
64 hil hil Ilonggo
65 nah nah Nāhuatl
66 tpi tpi Tok Pisin
67 ve ve tshiVenḓa
68 kcm kcm Kristang

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,442 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
<!--
Copyright © 1991-2020 Unicode, Inc.
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
For terms of use, see http://www.unicode.org/copyright.html
-->
<supplementalData>
<version number="$Revision$"/>
<languageMatching>
<languageMatches type="written_new">
<paradigmLocales locales="en en_GB es es_419 pt_BR pt_PT"/>
<matchVariable id="$enUS" value="AS+GU+MH+MP+PR+UM+US+VI"/>
<matchVariable id="$cnsar" value="HK+MO"/>
<matchVariable id="$americas" value="019"/>
<matchVariable id="$maghreb" value="MA+DZ+TN+LY+MR+EH"/>
<languageMatch desired="no" supported="nb" distance="1"/> <!-- no ⇒ nb -->
<!-- languageMatch desired="ku" supported="ckb" distance="4" oneway="true"/ --> <!-- ku ⇒ ckb -->
<!-- languageMatch desired="ckb" supported="ku" percent="8" oneway="true"/ --> <!-- ckb ⇒ ku -->
<languageMatch desired="hr" supported="bs" distance="4"/> <!-- hr ⇒ bs -->
<languageMatch desired="sh" supported="bs" distance="4"/> <!-- sh ⇒ bs -->
<!-- languageMatch desired="sr" supported="bs" distance="4"/--> <!-- sr ⇒ bs -->
<languageMatch desired="sh" supported="hr" distance="4"/> <!-- sh ⇒ hr -->
<!-- languageMatch desired="sr" supported="hr" distance="4"/--> <!-- sr ⇒ hr -->
<languageMatch desired="sh" supported="sr" distance="4"/> <!-- sh ⇒ sr -->
<languageMatch desired="ssy" supported="aa" distance="4"/> <!-- ssy ⇒ aa -->
<languageMatch desired="gsw" supported="de" distance="4" oneway="true"/> <!-- gsw ⇒ de -->
<languageMatch desired="lb" supported="de" distance="4" oneway="true"/> <!-- lb ⇒ de -->
<languageMatch desired="da" supported="no" distance="8"/> <!-- da ⇒ no -->
<languageMatch desired="da" supported="nb" distance="8"/> <!-- da ⇒ nb -->
<!-- various fallbacks for more or less loosely related languages -->
<!-- CLDR-13528:
Distance 20 for some linguistic relation (e.g., Creoles to French)
or a local language in the area of another (e.g., Breton to French).
Distance 30 for fallbacks to prevalent second languages,
and in the absence of better information. -->
<languageMatch desired="ab" supported="ru" distance="30" oneway="true"/> <!-- Abkhazian: ab ⇒ ru -->
<languageMatch desired="ach" supported="en" distance="30" oneway="true"/> <!-- Acoli (Southern Luo dialect in Uganda): ach ⇒ en -->
<languageMatch desired="af" supported="nl" distance="20" oneway="true"/> <!-- Afrikaans: af ⇒ nl -->
<languageMatch desired="ak" supported="en" distance="30" oneway="true"/> <!-- Akan: ak ⇒ en -->
<languageMatch desired="ay" supported="es" distance="20" oneway="true"/> <!-- Aymara: ay ⇒ es -->
<languageMatch desired="az" supported="ru" distance="30" oneway="true"/> <!-- Azerbaijani: az ⇒ ru -->
<languageMatch desired="be" supported="ru" distance="20" oneway="true"/> <!-- Belarusian: be ⇒ ru -->
<languageMatch desired="bem" supported="en" distance="30" oneway="true"/> <!-- Bemba (Zambia): bem ⇒ en -->
<languageMatch desired="bh" supported="hi" distance="30" oneway="true"/> <!-- Bihari languages (gets canonicalized to bho): bh ⇒ hi -->
<languageMatch desired="bn" supported="en" distance="30" oneway="true"/> <!-- Bangla: bn ⇒ en -->
<languageMatch desired="br" supported="fr" distance="20" oneway="true"/> <!-- Breton: br ⇒ fr -->
<languageMatch desired="ceb" supported="fil" distance="30" oneway="true"/> <!-- Cebuano: ceb ⇒ fil -->
<languageMatch desired="chr" supported="en" distance="20" oneway="true"/> <!-- Cherokee: chr ⇒ en -->
<languageMatch desired="ckb" supported="ar" distance="30" oneway="true"/> <!-- Sorani Kurdish: ckb ⇒ ar -->
<languageMatch desired="co" supported="fr" distance="20" oneway="true"/> <!-- Corsican: co ⇒ fr -->
<languageMatch desired="crs" supported="fr" distance="20" oneway="true"/> <!-- Seselwa Creole French: crs ⇒ fr -->
<languageMatch desired="cy" supported="en" distance="20" oneway="true"/> <!-- Welsh: cy ⇒ en -->
<languageMatch desired="ee" supported="en" distance="30" oneway="true"/> <!-- Ewe: ee ⇒ en -->
<languageMatch desired="eo" supported="en" distance="30" oneway="true"/> <!-- Esperanto: eo ⇒ en -->
<!-- CLDR-13650: No fallback for Estonian -->
<!-- languageMatch desired="et" supported="fi" distance="30" oneway="true"/--> <!-- Estonian: et ⇒ fi -->
<languageMatch desired="eu" supported="es" distance="20" oneway="true"/> <!-- Basque: eu ⇒ es -->
<languageMatch desired="fo" supported="da" distance="20" oneway="true"/> <!-- Faroese: fo ⇒ da -->
<languageMatch desired="fy" supported="nl" distance="20" oneway="true"/> <!-- Western Frisian: fy ⇒ nl -->
<languageMatch desired="ga" supported="en" distance="20" oneway="true"/> <!-- Irish: ga ⇒ en -->
<languageMatch desired="gaa" supported="en" distance="30" oneway="true"/> <!-- Ga: gaa ⇒ en -->
<languageMatch desired="gd" supported="en" distance="20" oneway="true"/> <!-- Scottish Gaelic: gd ⇒ en -->
<languageMatch desired="gl" supported="es" distance="20" oneway="true"/> <!-- Galician: gl ⇒ es -->
<languageMatch desired="gn" supported="es" distance="20" oneway="true"/> <!-- Guarani: gn ⇒ es -->
<languageMatch desired="gu" supported="hi" distance="30" oneway="true"/> <!-- Gujarati: gu ⇒ hi -->
<languageMatch desired="ha" supported="en" distance="30" oneway="true"/> <!-- Hausa: ha ⇒ en -->
<languageMatch desired="haw" supported="en" distance="20" oneway="true"/> <!-- Hawaiian: haw ⇒ en -->
<languageMatch desired="ht" supported="fr" distance="20" oneway="true"/> <!-- Haitian Creole: ht ⇒ fr -->
<languageMatch desired="hy" supported="ru" distance="30" oneway="true"/> <!-- Armenian: hy ⇒ ru -->
<languageMatch desired="ia" supported="en" distance="30" oneway="true"/> <!-- Interlingua: ia ⇒ en -->
<languageMatch desired="ig" supported="en" distance="30" oneway="true"/> <!-- Igbo: ig ⇒ en -->
<languageMatch desired="is" supported="en" distance="20" oneway="true"/> <!-- Icelandic: is ⇒ en -->
<languageMatch desired="jv" supported="id" distance="20" oneway="true"/> <!-- Javanese: jv ⇒ id -->
<languageMatch desired="ka" supported="en" distance="30" oneway="true"/> <!-- Georgian: ka ⇒ en -->
<languageMatch desired="kg" supported="fr" distance="30" oneway="true"/> <!-- Kongo: kg ⇒ fr -->
<languageMatch desired="kk" supported="ru" distance="30" oneway="true"/> <!-- Kazakh: kk ⇒ ru -->
<languageMatch desired="km" supported="en" distance="30" oneway="true"/> <!-- Khmer: km ⇒ en -->
<languageMatch desired="kn" supported="en" distance="30" oneway="true"/> <!-- Kannada: kn ⇒ en -->
<languageMatch desired="kri" supported="en" distance="30" oneway="true"/> <!-- Krio: kri ⇒ en -->
<languageMatch desired="ku" supported="tr" distance="30" oneway="true"/> <!-- Kurdish: ku ⇒ tr -->
<languageMatch desired="ky" supported="ru" distance="30" oneway="true"/> <!-- Kirghiz: ky ⇒ ru -->
<languageMatch desired="la" supported="it" distance="20" oneway="true"/> <!-- Latin: la ⇒ it -->
<languageMatch desired="lg" supported="en" distance="30" oneway="true"/> <!-- Luganda: lg ⇒ en -->
<languageMatch desired="ln" supported="fr" distance="30" oneway="true"/> <!-- Lingala: ln ⇒ fr -->
<languageMatch desired="lo" supported="en" distance="30" oneway="true"/> <!-- Lao: lo ⇒ en -->
<languageMatch desired="loz" supported="en" distance="30" oneway="true"/> <!-- Lozi: loz ⇒ en -->
<languageMatch desired="lua" supported="fr" distance="30" oneway="true"/> <!-- Luba-Lulua: lua ⇒ fr -->
<languageMatch desired="mfe" supported="en" distance="30" oneway="true"/> <!-- Morisyen: mfe ⇒ en -->
<languageMatch desired="mg" supported="fr" distance="30" oneway="true"/> <!-- Malagasy: mg ⇒ fr -->
<languageMatch desired="mi" supported="en" distance="20" oneway="true"/> <!-- Maori: mi ⇒ en -->
<!-- CLDR-13625: Macedonian should not fall back to Bulgarian -->
<!-- languageMatch desired="mk" supported="bg" distance="30" oneway="true"/--> <!-- Macedonian: mk ⇒ bg -->
<languageMatch desired="ml" supported="en" distance="30" oneway="true"/> <!-- Malayalam: ml ⇒ en -->
<languageMatch desired="mn" supported="ru" distance="30" oneway="true"/> <!-- Mongolian: mn ⇒ ru -->
<languageMatch desired="mr" supported="hi" distance="30" oneway="true"/> <!-- Marathi: mr ⇒ hi -->
<languageMatch desired="ms" supported="id" distance="30" oneway="true"/> <!-- Malay: ms ⇒ id -->
<languageMatch desired="mt" supported="en" distance="30" oneway="true"/> <!-- Maltese: mt ⇒ en -->
<languageMatch desired="my" supported="en" distance="30" oneway="true"/> <!-- Myanmar: my ⇒ en -->
<languageMatch desired="ne" supported="en" distance="30" oneway="true"/> <!-- Nepali: ne ⇒ en -->
<languageMatch desired="nn" supported="nb" distance="20"/> <!-- Nynorsk: nn ⟺ nb -->
<languageMatch desired="nn" supported="no" distance="20"/> <!-- Nynorsk: nn ⟺ no; CLDR-13679 -->
<languageMatch desired="nso" supported="en" distance="30" oneway="true"/> <!-- Northern Sotho: nso ⇒ en -->
<languageMatch desired="ny" supported="en" distance="30" oneway="true"/> <!-- Nyanja: ny ⇒ en -->
<languageMatch desired="nyn" supported="en" distance="30" oneway="true"/> <!-- Nyankole: nyn ⇒ en -->
<languageMatch desired="oc" supported="fr" distance="20" oneway="true"/> <!-- Occitan: oc ⇒ fr -->
<languageMatch desired="om" supported="en" distance="30" oneway="true"/> <!-- Oromo: om ⇒ en -->
<languageMatch desired="or" supported="en" distance="30" oneway="true"/> <!-- Odia: or ⇒ en -->
<languageMatch desired="pa" supported="en" distance="30" oneway="true"/> <!-- Punjabi: pa ⇒ en -->
<languageMatch desired="pcm" supported="en" distance="20" oneway="true"/> <!-- Nigerian Pidgin: pcm ⇒ en -->
<languageMatch desired="ps" supported="en" distance="30" oneway="true"/> <!-- Pashto: ps ⇒ en -->
<languageMatch desired="qu" supported="es" distance="30" oneway="true"/> <!-- Quechua: qu ⇒ es -->
<languageMatch desired="rm" supported="de" distance="20" oneway="true"/> <!-- Romansh: rm ⇒ de -->
<languageMatch desired="rn" supported="en" distance="30" oneway="true"/> <!-- Rundi: rn ⇒ en -->
<languageMatch desired="rw" supported="fr" distance="30" oneway="true"/> <!-- Kinyarwanda: rw ⇒ fr -->
<languageMatch desired="sa" supported="hi" distance="30" oneway="true"/> <!-- Sanskrit: sa ⇒ hi -->
<languageMatch desired="sd" supported="en" distance="30" oneway="true"/> <!-- Sindhi: sd ⇒ en -->
<languageMatch desired="si" supported="en" distance="30" oneway="true"/> <!-- Sinhalese: si ⇒ en -->
<languageMatch desired="sn" supported="en" distance="30" oneway="true"/> <!-- Shona: sn ⇒ en -->
<languageMatch desired="so" supported="en" distance="30" oneway="true"/> <!-- Somali: so ⇒ en -->
<languageMatch desired="sq" supported="en" distance="30" oneway="true"/> <!-- Albanian: sq ⇒ en -->
<languageMatch desired="st" supported="en" distance="30" oneway="true"/> <!-- Southern Sotho: st ⇒ en -->
<languageMatch desired="su" supported="id" distance="20" oneway="true"/> <!-- Sundanese: su ⇒ id -->
<languageMatch desired="sw" supported="en" distance="30" oneway="true"/> <!-- Swahili: sw ⇒ en -->
<languageMatch desired="ta" supported="en" distance="30" oneway="true"/> <!-- Tamil: ta ⇒ en -->
<languageMatch desired="te" supported="en" distance="30" oneway="true"/> <!-- Telugu: te ⇒ en -->
<languageMatch desired="tg" supported="ru" distance="30" oneway="true"/> <!-- Tajik: tg ⇒ ru -->
<languageMatch desired="ti" supported="en" distance="30" oneway="true"/> <!-- Tigrinya: ti ⇒ en -->
<languageMatch desired="tk" supported="ru" distance="30" oneway="true"/> <!-- Turkmen: tk ⇒ ru -->
<languageMatch desired="tlh" supported="en" distance="30" oneway="true"/> <!-- Klingon: tlh ⇒ en -->
<languageMatch desired="tn" supported="en" distance="30" oneway="true"/> <!-- Tswana: tn ⇒ en -->
<languageMatch desired="to" supported="en" distance="30" oneway="true"/> <!-- Tonga: to ⇒ en -->
<languageMatch desired="tt" supported="ru" distance="30" oneway="true"/> <!-- Tatar: tt ⇒ ru -->
<languageMatch desired="tum" supported="en" distance="30" oneway="true"/> <!-- Tumbuka: tum ⇒ en -->
<languageMatch desired="ug" supported="zh" distance="20" oneway="true"/> <!-- Uighur: ug ⇒ zh -->
<languageMatch desired="ur" supported="en" distance="30" oneway="true"/> <!-- Urdu: ur ⇒ en -->
<languageMatch desired="uz" supported="ru" distance="30" oneway="true"/> <!-- Uzbek: uz ⇒ ru -->
<languageMatch desired="wo" supported="fr" distance="30" oneway="true"/> <!-- Wolof: wo ⇒ fr -->
<languageMatch desired="xh" supported="en" distance="30" oneway="true"/> <!-- Xhosa: xh ⇒ en -->
<languageMatch desired="yi" supported="en" distance="30" oneway="true"/> <!-- Yiddish: yi ⇒ en -->
<languageMatch desired="yo" supported="en" distance="30" oneway="true"/> <!-- Yoruba: yo ⇒ en -->
<languageMatch desired="zu" supported="en" distance="30" oneway="true"/> <!-- Zulu: zu ⇒ en -->
<!-- START generated by GenerateLanguageMatches.java: don't manually change -->
<!-- Encompassed by Arabic -->
<languageMatch desired="aao" supported="ar" distance="10" oneway="true"/> <!-- Algerian Saharan Arabic -->
<languageMatch desired="abh" supported="ar" distance="10" oneway="true"/> <!-- Tajiki Arabic -->
<languageMatch desired="abv" supported="ar" distance="10" oneway="true"/> <!-- Baharna Arabic -->
<languageMatch desired="acm" supported="ar" distance="10" oneway="true"/> <!-- Mesopotamian Arabic -->
<languageMatch desired="acq" supported="ar" distance="10" oneway="true"/> <!-- Ta'izzi-Adeni Arabic -->
<languageMatch desired="acw" supported="ar" distance="10" oneway="true"/> <!-- Hijazi Arabic -->
<languageMatch desired="acx" supported="ar" distance="10" oneway="true"/> <!-- Omani Arabic -->
<languageMatch desired="acy" supported="ar" distance="10" oneway="true"/> <!-- Cypriot Arabic -->
<languageMatch desired="adf" supported="ar" distance="10" oneway="true"/> <!-- Dhofari Arabic -->
<languageMatch desired="aeb" supported="ar" distance="10" oneway="true"/> <!-- Tunisian Arabic -->
<languageMatch desired="aec" supported="ar" distance="10" oneway="true"/> <!-- Saidi Arabic -->
<languageMatch desired="afb" supported="ar" distance="10" oneway="true"/> <!-- Gulf Arabic -->
<languageMatch desired="ajp" supported="ar" distance="10" oneway="true"/> <!-- South Levantine Arabic -->
<languageMatch desired="apc" supported="ar" distance="10" oneway="true"/> <!-- North Levantine Arabic -->
<languageMatch desired="apd" supported="ar" distance="10" oneway="true"/> <!-- Sudanese Arabic -->
<languageMatch desired="arq" supported="ar" distance="10" oneway="true"/> <!-- Algerian Arabic -->
<languageMatch desired="ars" supported="ar" distance="10" oneway="true"/> <!-- Najdi Arabic -->
<languageMatch desired="ary" supported="ar" distance="10" oneway="true"/> <!-- Moroccan Arabic -->
<languageMatch desired="arz" supported="ar" distance="10" oneway="true"/> <!-- Egyptian Arabic -->
<languageMatch desired="auz" supported="ar" distance="10" oneway="true"/> <!-- Uzbeki Arabic -->
<languageMatch desired="avl" supported="ar" distance="10" oneway="true"/> <!-- Eastern Egyptian Bedawi Arabic -->
<languageMatch desired="ayh" supported="ar" distance="10" oneway="true"/> <!-- Hadrami Arabic -->
<languageMatch desired="ayl" supported="ar" distance="10" oneway="true"/> <!-- Libyan Arabic -->
<languageMatch desired="ayn" supported="ar" distance="10" oneway="true"/> <!-- Sanaani Arabic -->
<languageMatch desired="ayp" supported="ar" distance="10" oneway="true"/> <!-- North Mesopotamian Arabic -->
<languageMatch desired="bbz" supported="ar" distance="10" oneway="true"/> <!-- Babalia Creole Arabic -->
<languageMatch desired="pga" supported="ar" distance="10" oneway="true"/> <!-- Sudanese Creole Arabic -->
<languageMatch desired="shu" supported="ar" distance="10" oneway="true"/> <!-- Chadian Arabic -->
<languageMatch desired="ssh" supported="ar" distance="10" oneway="true"/> <!-- Shihhi Arabic -->
<!-- Encompassed by Azerbaijani -->
<languageMatch desired="azb" supported="az" distance="10" oneway="true"/> <!-- South Azerbaijani -->
<!-- Encompassed by Estonian -->
<languageMatch desired="vro" supported="et" distance="10" oneway="true"/> <!-- Võro -->
<!-- Encompassed by Fulah -->
<languageMatch desired="ffm" supported="ff" distance="10" oneway="true"/> <!-- Maasina Fulfulde -->
<languageMatch desired="fub" supported="ff" distance="10" oneway="true"/> <!-- Adamawa Fulfulde -->
<languageMatch desired="fue" supported="ff" distance="10" oneway="true"/> <!-- Borgu Fulfulde -->
<languageMatch desired="fuf" supported="ff" distance="10" oneway="true"/> <!-- Pular -->
<languageMatch desired="fuh" supported="ff" distance="10" oneway="true"/> <!-- Western Niger Fulfulde -->
<languageMatch desired="fui" supported="ff" distance="10" oneway="true"/> <!-- Bagirmi Fulfulde -->
<languageMatch desired="fuq" supported="ff" distance="10" oneway="true"/> <!-- Central-Eastern Niger Fulfulde -->
<languageMatch desired="fuv" supported="ff" distance="10" oneway="true"/> <!-- Nigerian Fulfulde -->
<!-- Encompassed by Guarani -->
<languageMatch desired="gnw" supported="gn" distance="10" oneway="true"/> <!-- Western Bolivian Guaraní -->
<languageMatch desired="gui" supported="gn" distance="10" oneway="true"/> <!-- Eastern Bolivian Guaraní -->
<languageMatch desired="gun" supported="gn" distance="10" oneway="true"/> <!-- Mbyá Guaraní -->
<languageMatch desired="nhd" supported="gn" distance="10" oneway="true"/> <!-- Chiripá -->
<!-- Encompassed by Inuktitut -->
<languageMatch desired="ikt" supported="iu" distance="10" oneway="true"/> <!-- Inuinnaqtun -->
<!-- Encompassed by Kalenjin -->
<languageMatch desired="enb" supported="kln" distance="10" oneway="true"/> <!-- Markweeta -->
<languageMatch desired="eyo" supported="kln" distance="10" oneway="true"/> <!-- Keiyo -->
<languageMatch desired="niq" supported="kln" distance="10" oneway="true"/> <!-- Nandi -->
<languageMatch desired="oki" supported="kln" distance="10" oneway="true"/> <!-- Okiek -->
<languageMatch desired="pko" supported="kln" distance="10" oneway="true"/> <!-- Pökoot -->
<languageMatch desired="sgc" supported="kln" distance="10" oneway="true"/> <!-- Kipsigis -->
<languageMatch desired="tec" supported="kln" distance="10" oneway="true"/> <!-- Terik -->
<languageMatch desired="tuy" supported="kln" distance="10" oneway="true"/> <!-- Tugen -->
<!-- Encompassed by Konkani -->
<languageMatch desired="gom" supported="kok" distance="10" oneway="true"/> <!-- Goan Konkani -->
<!-- Encompassed by Kpelle -->
<languageMatch desired="gkp" supported="kpe" distance="10" oneway="true"/> <!-- Guinea Kpelle -->
<!-- Encompassed by Luyia -->
<languageMatch desired="ida" supported="luy" distance="10" oneway="true"/> <!-- Idakho-Isukha-Tiriki -->
<languageMatch desired="lkb" supported="luy" distance="10" oneway="true"/> <!-- Kabras -->
<languageMatch desired="lko" supported="luy" distance="10" oneway="true"/> <!-- Khayo -->
<languageMatch desired="lks" supported="luy" distance="10" oneway="true"/> <!-- Kisa -->
<languageMatch desired="lri" supported="luy" distance="10" oneway="true"/> <!-- Marachi -->
<languageMatch desired="lrm" supported="luy" distance="10" oneway="true"/> <!-- Marama -->
<languageMatch desired="lsm" supported="luy" distance="10" oneway="true"/> <!-- Saamia -->
<languageMatch desired="lto" supported="luy" distance="10" oneway="true"/> <!-- Tsotso -->
<languageMatch desired="lts" supported="luy" distance="10" oneway="true"/> <!-- Tachoni -->
<languageMatch desired="lwg" supported="luy" distance="10" oneway="true"/> <!-- Wanga -->
<languageMatch desired="nle" supported="luy" distance="10" oneway="true"/> <!-- East Nyala -->
<languageMatch desired="nyd" supported="luy" distance="10" oneway="true"/> <!-- Nyore -->
<languageMatch desired="rag" supported="luy" distance="10" oneway="true"/> <!-- Logooli -->
<!-- Encompassed by Latvian -->
<languageMatch desired="ltg" supported="lv" distance="10" oneway="true"/> <!-- Latgalian -->
<!-- Encompassed by Malagasy -->
<languageMatch desired="bhr" supported="mg" distance="10" oneway="true"/> <!-- Bara Malagasy -->
<languageMatch desired="bjq" supported="mg" distance="10" oneway="true"/> <!-- Southern Betsimisaraka Malagasy -->
<languageMatch desired="bmm" supported="mg" distance="10" oneway="true"/> <!-- Northern Betsimisaraka Malagasy -->
<languageMatch desired="bzc" supported="mg" distance="10" oneway="true"/> <!-- Southern Betsimisaraka Malagasy -->
<languageMatch desired="msh" supported="mg" distance="10" oneway="true"/> <!-- Masikoro Malagasy -->
<languageMatch desired="skg" supported="mg" distance="10" oneway="true"/> <!-- Sakalava Malagasy -->
<languageMatch desired="tdx" supported="mg" distance="10" oneway="true"/> <!-- Tandroy-Mahafaly Malagasy -->
<languageMatch desired="tkg" supported="mg" distance="10" oneway="true"/> <!-- Tesaka Malagasy -->
<languageMatch desired="txy" supported="mg" distance="10" oneway="true"/> <!-- Tanosy Malagasy -->
<languageMatch desired="xmv" supported="mg" distance="10" oneway="true"/> <!-- Antankarana Malagasy -->
<languageMatch desired="xmw" supported="mg" distance="10" oneway="true"/> <!-- Tsimihety Malagasy -->
<!-- Encompassed by Mongolian -->
<languageMatch desired="mvf" supported="mn" distance="10" oneway="true"/> <!-- Peripheral Mongolian -->
<!-- Encompassed by Malay -->
<languageMatch desired="bjn" supported="ms" distance="10" oneway="true"/> <!-- Banjar -->
<languageMatch desired="btj" supported="ms" distance="10" oneway="true"/> <!-- Bacanese Malay -->
<languageMatch desired="bve" supported="ms" distance="10" oneway="true"/> <!-- Berau Malay -->
<languageMatch desired="bvu" supported="ms" distance="10" oneway="true"/> <!-- Bukit Malay -->
<languageMatch desired="coa" supported="ms" distance="10" oneway="true"/> <!-- Cocos Islands Malay -->
<languageMatch desired="dup" supported="ms" distance="10" oneway="true"/> <!-- Duano -->
<languageMatch desired="hji" supported="ms" distance="10" oneway="true"/> <!-- Haji -->
<languageMatch desired="id" supported="ms" distance="10" oneway="true"/> <!-- Indonesian -->
<languageMatch desired="jak" supported="ms" distance="10" oneway="true"/> <!-- Jakun -->
<languageMatch desired="jax" supported="ms" distance="10" oneway="true"/> <!-- Jambi Malay -->
<languageMatch desired="kvb" supported="ms" distance="10" oneway="true"/> <!-- Kubu -->
<languageMatch desired="kvr" supported="ms" distance="10" oneway="true"/> <!-- Kerinci -->
<languageMatch desired="kxd" supported="ms" distance="10" oneway="true"/> <!-- Brunei -->
<languageMatch desired="lce" supported="ms" distance="10" oneway="true"/> <!-- Loncong -->
<languageMatch desired="lcf" supported="ms" distance="10" oneway="true"/> <!-- Lubu -->
<languageMatch desired="liw" supported="ms" distance="10" oneway="true"/> <!-- Col -->
<languageMatch desired="max" supported="ms" distance="10" oneway="true"/> <!-- North Moluccan Malay -->
<languageMatch desired="meo" supported="ms" distance="10" oneway="true"/> <!-- Kedah Malay -->
<languageMatch desired="mfa" supported="ms" distance="10" oneway="true"/> <!-- Pattani Malay -->
<languageMatch desired="mfb" supported="ms" distance="10" oneway="true"/> <!-- Bangka -->
<languageMatch desired="min" supported="ms" distance="10" oneway="true"/> <!-- Minangkabau -->
<languageMatch desired="mqg" supported="ms" distance="10" oneway="true"/> <!-- Kota Bangun Kutai Malay -->
<languageMatch desired="msi" supported="ms" distance="10" oneway="true"/> <!-- Sabah Malay -->
<languageMatch desired="mui" supported="ms" distance="10" oneway="true"/> <!-- Musi -->
<languageMatch desired="orn" supported="ms" distance="10" oneway="true"/> <!-- Orang Kanaq -->
<languageMatch desired="ors" supported="ms" distance="10" oneway="true"/> <!-- Orang Seletar -->
<languageMatch desired="pel" supported="ms" distance="10" oneway="true"/> <!-- Pekal -->
<languageMatch desired="pse" supported="ms" distance="10" oneway="true"/> <!-- Central Malay -->
<languageMatch desired="tmw" supported="ms" distance="10" oneway="true"/> <!-- Temuan -->
<languageMatch desired="urk" supported="ms" distance="10" oneway="true"/> <!-- Urak Lawoi' -->
<languageMatch desired="vkk" supported="ms" distance="10" oneway="true"/> <!-- Kaur -->
<languageMatch desired="vkt" supported="ms" distance="10" oneway="true"/> <!-- Tenggarong Kutai Malay -->
<languageMatch desired="xmm" supported="ms" distance="10" oneway="true"/> <!-- Manado Malay -->
<languageMatch desired="zlm" supported="ms" distance="10" oneway="true"/> <!-- Malay (individual language) -->
<languageMatch desired="zmi" supported="ms" distance="10" oneway="true"/> <!-- Negeri Sembilan Malay -->
<!-- Encompassed by Nepali -->
<languageMatch desired="dty" supported="ne" distance="10" oneway="true"/> <!-- Dotyali -->
<!-- Encompassed by Oromo -->
<languageMatch desired="gax" supported="om" distance="10" oneway="true"/> <!-- Borana-Arsi-Guji Oromo -->
<languageMatch desired="hae" supported="om" distance="10" oneway="true"/> <!-- Eastern Oromo -->
<languageMatch desired="orc" supported="om" distance="10" oneway="true"/> <!-- Orma -->
<!-- Encompassed by Odia -->
<languageMatch desired="spv" supported="or" distance="10" oneway="true"/> <!-- Sambalpuri -->
<!-- Encompassed by Pashto -->
<languageMatch desired="pbt" supported="ps" distance="10" oneway="true"/> <!-- Southern Pashto -->
<languageMatch desired="pst" supported="ps" distance="10" oneway="true"/> <!-- Central Pashto -->
<!-- Encompassed by Quechua -->
<languageMatch desired="qub" supported="qu" distance="10" oneway="true"/> <!-- Huallaga Huánuco Quechua -->
<languageMatch desired="qud" supported="qu" distance="10" oneway="true"/> <!-- Calderón Highland Quichua -->
<languageMatch desired="quf" supported="qu" distance="10" oneway="true"/> <!-- Lambayeque Quechua -->
<languageMatch desired="qug" supported="qu" distance="10" oneway="true"/> <!-- Chimborazo Highland Quichua -->
<languageMatch desired="quh" supported="qu" distance="10" oneway="true"/> <!-- South Bolivian Quechua -->
<languageMatch desired="quk" supported="qu" distance="10" oneway="true"/> <!-- Chachapoyas Quechua -->
<languageMatch desired="qul" supported="qu" distance="10" oneway="true"/> <!-- North Bolivian Quechua -->
<languageMatch desired="qup" supported="qu" distance="10" oneway="true"/> <!-- Southern Pastaza Quechua -->
<languageMatch desired="qur" supported="qu" distance="10" oneway="true"/> <!-- Yanahuanca Pasco Quechua -->
<languageMatch desired="qus" supported="qu" distance="10" oneway="true"/> <!-- Santiago del Estero Quichua -->
<languageMatch desired="quw" supported="qu" distance="10" oneway="true"/> <!-- Tena Lowland Quichua -->
<languageMatch desired="qux" supported="qu" distance="10" oneway="true"/> <!-- Yauyos Quechua -->
<languageMatch desired="quy" supported="qu" distance="10" oneway="true"/> <!-- Ayacucho Quechua -->
<languageMatch desired="qva" supported="qu" distance="10" oneway="true"/> <!-- Ambo-Pasco Quechua -->
<languageMatch desired="qvc" supported="qu" distance="10" oneway="true"/> <!-- Cajamarca Quechua -->
<languageMatch desired="qve" supported="qu" distance="10" oneway="true"/> <!-- Eastern Apurímac Quechua -->
<languageMatch desired="qvh" supported="qu" distance="10" oneway="true"/> <!-- Huamalíes-Dos de Mayo Huánuco Quechua -->
<languageMatch desired="qvi" supported="qu" distance="10" oneway="true"/> <!-- Imbabura Highland Quichua -->
<languageMatch desired="qvj" supported="qu" distance="10" oneway="true"/> <!-- Loja Highland Quichua -->
<languageMatch desired="qvl" supported="qu" distance="10" oneway="true"/> <!-- Cajatambo North Lima Quechua -->
<languageMatch desired="qvm" supported="qu" distance="10" oneway="true"/> <!-- Margos-Yarowilca-Lauricocha Quechua -->
<languageMatch desired="qvn" supported="qu" distance="10" oneway="true"/> <!-- North Junín Quechua -->
<languageMatch desired="qvo" supported="qu" distance="10" oneway="true"/> <!-- Napo Lowland Quechua -->
<languageMatch desired="qvp" supported="qu" distance="10" oneway="true"/> <!-- Pacaraos Quechua -->
<languageMatch desired="qvs" supported="qu" distance="10" oneway="true"/> <!-- San Martín Quechua -->
<languageMatch desired="qvw" supported="qu" distance="10" oneway="true"/> <!-- Huaylla Wanca Quechua -->
<languageMatch desired="qvz" supported="qu" distance="10" oneway="true"/> <!-- Northern Pastaza Quichua -->
<languageMatch desired="qwa" supported="qu" distance="10" oneway="true"/> <!-- Corongo Ancash Quechua -->
<languageMatch desired="qwc" supported="qu" distance="10" oneway="true"/> <!-- Classical Quechua -->
<languageMatch desired="qwh" supported="qu" distance="10" oneway="true"/> <!-- Huaylas Ancash Quechua -->
<languageMatch desired="qws" supported="qu" distance="10" oneway="true"/> <!-- Sihuas Ancash Quechua -->
<languageMatch desired="qxa" supported="qu" distance="10" oneway="true"/> <!-- Chiquián Ancash Quechua -->
<languageMatch desired="qxc" supported="qu" distance="10" oneway="true"/> <!-- Chincha Quechua -->
<languageMatch desired="qxh" supported="qu" distance="10" oneway="true"/> <!-- Panao Huánuco Quechua -->
<languageMatch desired="qxl" supported="qu" distance="10" oneway="true"/> <!-- Salasaca Highland Quichua -->
<languageMatch desired="qxn" supported="qu" distance="10" oneway="true"/> <!-- Northern Conchucos Ancash Quechua -->
<languageMatch desired="qxo" supported="qu" distance="10" oneway="true"/> <!-- Southern Conchucos Ancash Quechua -->
<languageMatch desired="qxp" supported="qu" distance="10" oneway="true"/> <!-- Puno Quechua -->
<languageMatch desired="qxr" supported="qu" distance="10" oneway="true"/> <!-- Cañar Highland Quichua -->
<languageMatch desired="qxt" supported="qu" distance="10" oneway="true"/> <!-- Santa Ana de Tusi Pasco Quechua -->
<languageMatch desired="qxu" supported="qu" distance="10" oneway="true"/> <!-- Arequipa-La Unión Quechua -->
<languageMatch desired="qxw" supported="qu" distance="10" oneway="true"/> <!-- Jauja Wanca Quechua -->
<!-- Encompassed by Sardinian -->
<languageMatch desired="sdc" supported="sc" distance="10" oneway="true"/> <!-- Sassarese Sardinian -->
<languageMatch desired="sdn" supported="sc" distance="10" oneway="true"/> <!-- Gallurese Sardinian -->
<languageMatch desired="sro" supported="sc" distance="10" oneway="true"/> <!-- Campidanese Sardinian -->
<!-- Encompassed by Albanian -->
<languageMatch desired="aae" supported="sq" distance="10" oneway="true"/> <!-- Arbëreshë Albanian -->
<languageMatch desired="aat" supported="sq" distance="10" oneway="true"/> <!-- Arvanitika Albanian -->
<languageMatch desired="aln" supported="sq" distance="10" oneway="true"/> <!-- Gheg Albanian -->
<!-- Encompassed by Syriac -->
<languageMatch desired="aii" supported="syr" distance="10" oneway="true"/> <!-- Assyrian Neo-Aramaic -->
<!-- Encompassed by Uzbek -->
<languageMatch desired="uzs" supported="uz" distance="10" oneway="true"/> <!-- Southern Uzbek -->
<!-- Encompassed by Yiddish -->
<languageMatch desired="yih" supported="yi" distance="10" oneway="true"/> <!-- Western Yiddish -->
<!-- Encompassed by Chinese, Mandarin -->
<languageMatch desired="cdo" supported="zh" distance="10" oneway="true"/> <!-- Min Dong Chinese -->
<languageMatch desired="cjy" supported="zh" distance="10" oneway="true"/> <!-- Jinyu Chinese -->
<languageMatch desired="cpx" supported="zh" distance="10" oneway="true"/> <!-- Pu-Xian Chinese -->
<languageMatch desired="czh" supported="zh" distance="10" oneway="true"/> <!-- Huizhou Chinese -->
<languageMatch desired="czo" supported="zh" distance="10" oneway="true"/> <!-- Min Zhong Chinese -->
<languageMatch desired="gan" supported="zh" distance="10" oneway="true"/> <!-- Gan Chinese -->
<languageMatch desired="hak" supported="zh" distance="10" oneway="true"/> <!-- Hakka Chinese -->
<languageMatch desired="hsn" supported="zh" distance="10" oneway="true"/> <!-- Xiang Chinese -->
<languageMatch desired="lzh" supported="zh" distance="10" oneway="true"/> <!-- Literary Chinese -->
<languageMatch desired="mnp" supported="zh" distance="10" oneway="true"/> <!-- Min Bei Chinese -->
<languageMatch desired="nan" supported="zh" distance="10" oneway="true"/> <!-- Min Nan Chinese -->
<languageMatch desired="wuu" supported="zh" distance="10" oneway="true"/> <!-- Wu Chinese -->
<languageMatch desired="yue" supported="zh" distance="10" oneway="true"/> <!-- Chinese, Cantonese -->
<!-- END generated by GenerateLanguageMatches.java -->
<languageMatch desired="*" supported="*" distance="80"/> <!-- * ⇒ * -->
<languageMatch desired="az_Latn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- az; Latn ⇒ ru; Cyrl -->
<languageMatch desired="bn_Beng" supported="en_Latn" distance="10" oneway="true"/> <!-- bn; Beng ⇒ en; Latn -->
<languageMatch desired="hy_Armn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- hy; Armn ⇒ ru; Cyrl -->
<languageMatch desired="ka_Geor" supported="en_Latn" distance="10" oneway="true"/> <!-- ka; Geor ⇒ en; Latn -->
<languageMatch desired="km_Khmr" supported="en_Latn" distance="10" oneway="true"/> <!-- km; Khmr ⇒ en; Latn -->
<languageMatch desired="kn_Knda" supported="en_Latn" distance="10" oneway="true"/> <!-- kn; Knda ⇒ en; Latn -->
<languageMatch desired="lo_Laoo" supported="en_Latn" distance="10" oneway="true"/> <!-- lo; Laoo ⇒ en; Latn -->
<languageMatch desired="ml_Mlym" supported="en_Latn" distance="10" oneway="true"/> <!-- ml; Mlym ⇒ en; Latn -->
<languageMatch desired="my_Mymr" supported="en_Latn" distance="10" oneway="true"/> <!-- my; Mymr ⇒ en; Latn -->
<languageMatch desired="ne_Deva" supported="en_Latn" distance="10" oneway="true"/> <!-- ne; Deva ⇒ en; Latn -->
<languageMatch desired="or_Orya" supported="en_Latn" distance="10" oneway="true"/> <!-- or; Orya ⇒ en; Latn -->
<languageMatch desired="pa_Guru" supported="en_Latn" distance="10" oneway="true"/> <!-- pa; Guru ⇒ en; Latn -->
<languageMatch desired="ps_Arab" supported="en_Latn" distance="10" oneway="true"/> <!-- ps; Arab ⇒ en; Latn -->
<languageMatch desired="sd_Arab" supported="en_Latn" distance="10" oneway="true"/> <!-- sd; Arab ⇒ en; Latn -->
<languageMatch desired="si_Sinh" supported="en_Latn" distance="10" oneway="true"/> <!-- si; Sinh ⇒ en; Latn -->
<languageMatch desired="ta_Taml" supported="en_Latn" distance="10" oneway="true"/> <!-- ta; Taml ⇒ en; Latn -->
<languageMatch desired="te_Telu" supported="en_Latn" distance="10" oneway="true"/> <!-- te; Telu ⇒ en; Latn -->
<languageMatch desired="ti_Ethi" supported="en_Latn" distance="10" oneway="true"/> <!-- ti; Ethi ⇒ en; Latn -->
<languageMatch desired="tk_Latn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- tk; Latn ⇒ ru; Cyrl -->
<languageMatch desired="ur_Arab" supported="en_Latn" distance="10" oneway="true"/> <!-- ur; Arab ⇒ en; Latn -->
<languageMatch desired="uz_Latn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- uz; Latn ⇒ ru; Cyrl -->
<languageMatch desired="yi_Hebr" supported="en_Latn" distance="10" oneway="true"/> <!-- yi; Hebr ⇒ en; Latn -->
<languageMatch desired="sr_Latn" supported="sr_Cyrl" distance="5"/> <!-- sr; Latn ⇒ sr; Cyrl -->
<languageMatch desired="zh_Hans" supported="zh_Hant" distance="15" oneway="true"/> <!-- zh; Hans ⇒ zh; Hant -->
<languageMatch desired="zh_Hant" supported="zh_Hans" distance="19" oneway="true"/> <!-- zh; Hant ⇒ zh; Hans -->
<!-- zh_Hani: Slightly bigger distance than zh_Hant->zh_Hans -->
<languageMatch desired="zh_Hani" supported="zh_Hans" distance="20" oneway="true"/>
<languageMatch desired="zh_Hani" supported="zh_Hant" distance="20" oneway="true"/>
<!-- Latin transliterations of some languages, initially from CLDR-13577 -->
<languageMatch desired="ar_Latn" supported="ar_Arab" distance="20" oneway="true"/>
<languageMatch desired="bn_Latn" supported="bn_Beng" distance="20" oneway="true"/>
<languageMatch desired="gu_Latn" supported="gu_Gujr" distance="20" oneway="true"/>
<languageMatch desired="hi_Latn" supported="hi_Deva" distance="20" oneway="true"/>
<languageMatch desired="kn_Latn" supported="kn_Knda" distance="20" oneway="true"/>
<languageMatch desired="ml_Latn" supported="ml_Mlym" distance="20" oneway="true"/>
<languageMatch desired="mr_Latn" supported="mr_Deva" distance="20" oneway="true"/>
<languageMatch desired="ta_Latn" supported="ta_Taml" distance="20" oneway="true"/>
<languageMatch desired="te_Latn" supported="te_Telu" distance="20" oneway="true"/>
<languageMatch desired="zh_Latn" supported="zh_Hans" distance="20" oneway="true"/> <!-- Pinyin -->
<!-- start fallbacks for group script codes, initially from CLDR-13526
Look for plus signs on https://www.unicode.org/iso15924/iso15924-codes.html -->
<languageMatch desired="ja_Latn" supported="ja_Jpan" distance="5" oneway="true"/>
<languageMatch desired="ja_Hani" supported="ja_Jpan" distance="5" oneway="true"/>
<languageMatch desired="ja_Hira" supported="ja_Jpan" distance="5" oneway="true"/>
<languageMatch desired="ja_Kana" supported="ja_Jpan" distance="5" oneway="true"/>
<languageMatch desired="ja_Hrkt" supported="ja_Jpan" distance="5" oneway="true"/>
<languageMatch desired="ja_Hira" supported="ja_Hrkt" distance="5" oneway="true"/>
<languageMatch desired="ja_Kana" supported="ja_Hrkt" distance="5" oneway="true"/>
<languageMatch desired="ko_Hani" supported="ko_Kore" distance="5" oneway="true"/>
<languageMatch desired="ko_Hang" supported="ko_Kore" distance="5" oneway="true"/>
<languageMatch desired="ko_Jamo" supported="ko_Kore" distance="5" oneway="true"/>
<languageMatch desired="ko_Jamo" supported="ko_Hang" distance="5" oneway="true"/>
<!-- No special mappings for zh Bopo/Hanb
because Bopomofo is used only in TW, and unsure how widely.
No special mappings for styled scripts like Latf or Aran
because those would apply to many languages;
if desired, those would be better handled as matcher-specific script aliases. -->
<!-- end fallbacks for group script codes -->
<!-- default script mismatch distance -->
<languageMatch desired="*_*" supported="*_*" distance="50"/> <!-- *; * ⇒ *; * -->
<languageMatch desired="ar_*_$maghreb" supported="ar_*_$maghreb" distance="4"/> <!-- ar; *; $maghreb ⇒ ar; *; $maghreb -->
<languageMatch desired="ar_*_$!maghreb" supported="ar_*_$!maghreb" distance="4"/> <!-- ar; *; $!maghreb ⇒ ar; *; $!maghreb -->
<languageMatch desired="ar_*_*" supported="ar_*_*" distance="5"/> <!-- ar; *; * ⇒ ar; *; * -->
<languageMatch desired="en_*_$enUS" supported="en_*_$enUS" distance="4"/> <!-- en; *; $enUS ⇒ en; *; $enUS -->
<languageMatch desired="en_*_$!enUS" supported="en_*_GB" distance="3"/> <!-- Make en_GB preferred... -->
<languageMatch desired="en_*_$!enUS" supported="en_*_$!enUS" distance="4"/> <!-- en; *; $!enUS ⇒ en; *; $!enUS -->
<languageMatch desired="en_*_*" supported="en_*_*" distance="5"/> <!-- en; *; * ⇒ en; *; * -->
<languageMatch desired="es_*_$americas" supported="es_*_$americas" distance="4"/> <!-- es; *; $americas ⇒ es; *; $americas -->
<languageMatch desired="es_*_$!americas" supported="es_*_$!americas" distance="4"/> <!-- es; *; $!americas ⇒ es; *; $!americas -->
<languageMatch desired="es_*_*" supported="es_*_*" distance="5"/> <!-- es; *; * ⇒ es; *; * -->
<languageMatch desired="pt_*_$americas" supported="pt_*_$americas" distance="4"/> <!-- pt; *; $americas ⇒ pt; *; $americas -->
<languageMatch desired="pt_*_$!americas" supported="pt_*_$!americas" distance="4"/> <!-- pt; *; $!americas ⇒ pt; *; $!americas -->
<languageMatch desired="pt_*_*" supported="pt_*_*" distance="5"/> <!-- pt; *; * ⇒ pt; *; * -->
<languageMatch desired="zh_Hant_$cnsar" supported="zh_Hant_$cnsar" distance="4"/> <!-- zh; Hant; $cnsar ⇒ zh; Hant; $cnsar -->
<languageMatch desired="zh_Hant_$!cnsar" supported="zh_Hant_$!cnsar" distance="4"/> <!-- zh; Hant; $!cnsar ⇒ zh; Hant; $!cnsar -->
<languageMatch desired="zh_Hant_*" supported="zh_Hant_*" distance="5"/> <!-- zh; Hant; * ⇒ zh; Hant; * -->
<languageMatch desired="*_*_*" supported="*_*_*" distance="4"/> <!-- *; *; * ⇒ *; *; * -->
</languageMatches>
</languageMatching>
</supplementalData>

View file

@ -0,0 +1,3 @@
zsm,zsm,bahasa Malaysia
id,id,bahasa Indonesia
ms,ms,bahasa Malaysia
1 zsm zsm bahasa Malaysia
2 id id bahasa Indonesia
3 ms ms bahasa Malaysia

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,89 @@
# Language codes that CLDR supports at its 'modern' level (a much shorter list
# than the 'full' level). Specific name-to-code mappings are stored only for
# the languages listed here. Entries are grouped by first letter.
CLDR_LANGUAGES = {
    "af", "am", "ar", "as", "az",
    "be", "bg", "bn", "bs",
    "ca", "cs", "cy",
    "da", "de",
    "el", "en", "es", "et", "eu",
    "fa", "fi", "fil", "fr",
    "ga", "gl", "gu",
    "he", "hi", "hr", "hu", "hy",
    "id", "is", "it",
    "ja", "jv",
    "ka", "kk", "km", "kn", "ko", "ky",
    "lo", "lt", "lv",
    "mk", "ml", "mn", "mr", "ms", "my",
    "nb", "ne", "nl",
    "or",
    "pa", "pl", "pt",
    "ro", "ru",
    "sd", "si", "sk", "sl", "so", "sq", "sr", "sv", "sw",
    "ta", "te", "th", "ti", "tk", "tr",
    "uk", "und", "ur", "uz",
    "vi",
    "yue",
    "zh", "zu",
}

File diff suppressed because one or more lines are too long

112
lib/language_data/names.py Normal file
View file

@ -0,0 +1,112 @@
# import marisa_trie
import warnings
from language_data.util import data_filename
TRIES = {}
# This is something we could hypothetically discover from XML files, but
# we end up learning that most languages separate things with commas, with
# a few exceptions. We'll just put those exceptions here.
#
# Maps a language code to the separator string used for that language.
#
# NOTE(review): the 'am', 'ja', 'my', 'yue' and 'zh' entries below are empty
# strings. That may be deliberate, but it may also mean their non-ASCII
# separator characters were lost somewhere along the way -- confirm against
# the upstream language_data release before relying on these values.
DISPLAY_SEPARATORS = {
'am': '',
'ar': '، ',
'brx': ',',
'fa': '، ',
'ja': '',
'my': '',
'ug': '، ',
'und': ', ',
'ur': '، ',
'yue': '',
'zh': '',
}
# Character-level rewrites applied by normalize_name() after case folding:
#   * typographic apostrophe (U+2019) -> plain ASCII apostrophe
#   * hyphen                          -> space
#   * parentheses and commas          -> removed
# U+2019 is written as an escape on purpose, so the character cannot be
# silently dropped by tooling that mangles non-ASCII text. An empty search
# string here would make str.replace() insert an apostrophe between every
# character of the name.
_NAME_NORMALIZATION = str.maketrans({
    "\u2019": "'",
    "-": " ",
    "(": None,
    ")": None,
    ",": None,
})


def normalize_name(name):
    """
    Normalize a language, script, or territory name for lookup.

    When looking up a language-code component by name, we would rather ignore
    distinctions of case and certain punctuation. "Chinese (Traditional)"
    should be matched by "Chinese Traditional" and "chinese traditional".

    The name is case-folded, typographic apostrophes become plain ones,
    hyphens become spaces, parentheses and commas are dropped, and leading
    and trailing whitespace is stripped.

    :param name: the name to normalize
    :return: the normalized name
    """
    # One translate() pass replaces the chain of single-character replace()
    # calls; none of the replacement outputs is itself a key of the table, so
    # the result is the same as applying the replacements one after another.
    return name.casefold().translate(_NAME_NORMALIZATION).strip()
def load_trie(filename):
    """
    Load a BytesTrie from the marisa_trie on-disk format.

    marisa_trie is imported here, at call time, rather than at module level,
    so this module can be imported without that package being installed.
    Only callers that actually need a trie (see name_to_code) pay for the
    dependency.

    :param filename: path of the ``.marisa`` file to load
    :return: the loaded ``marisa_trie.BytesTrie``
    :raises ImportError: if the optional marisa_trie package is not installed
    """
    try:
        import marisa_trie
    except ImportError as err:
        raise ImportError(
            "Looking up codes by name requires the optional 'marisa_trie' "
            "package, which is not installed"
        ) from err

    trie = marisa_trie.BytesTrie()
    # marisa_trie raises warnings that make no sense. Ignore them.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        trie.load(filename)
    return trie
def get_trie_value(trie, key):
    """
    Return the first value stored under `key` in a BytesTrie, as text.

    The stored bytes are decoded as UTF-8. A KeyError propagates when the
    trie holds nothing for `key`.
    """
    stored_values = trie[key]
    first_value = stored_values[0]
    return first_value.decode("utf-8")
def name_to_code(category, name, language: str = "und"):
    """
    Look up the code of a language, script, or territory from its name.

    `language` says which language the name is written in, and must be a
    bare language subtag. (`Language.find` accepts other representations and
    reduces them to this form.) The default, "und", matches names in any
    language, so "French", "Français", and "francés" all give 'fr'.

    Matching is slightly fuzzy: when the normalized name is not a key itself,
    the longest key that is a prefix of it is used, provided that prefix has
    at least 4 characters. This is how "Hakka Chinese" can match "Hakka".

    Passing a specific `language` resolves names that mean different things
    in different languages. For example, 'Malayo' is one language in English
    and another (Malay) in Spanish; the lookup then uses a trie containing
    only names in the requested language.

    Returns the matching code, or None when nothing matches.
    """
    assert "/" not in language, "Language codes cannot contain slashes"
    assert "-" not in language, "This code should be reduced to a language subtag only"

    # Tries are loaded from disk on first use and cached per language/category.
    cache_key = "{}/name_to_{}".format(language, category)
    if cache_key not in TRIES:
        trie_path = data_filename("trie/{}.marisa".format(cache_key))
        TRIES[cache_key] = load_trie(trie_path)
    trie = TRIES[cache_key]

    wanted = normalize_name(name)
    if wanted in trie:
        return get_trie_value(trie, wanted)

    # Is this a language name plus extra verbiage? Maybe it has "...isch",
    # "... language", or "... Chinese" attached to it, for example. Accept the
    # last prefix the trie reports, but only if it has at least 4 characters.
    candidates = trie.prefixes(wanted)
    if not candidates or len(candidates[-1]) < 4:
        return None
    return get_trie_value(trie, candidates[-1])
def code_to_names(code):
    """
    Return the names of a language, script, or territory code.

    The result is a dictionary of the code's names in various languages, or
    an empty dictionary when the code is not known.
    """
    # Deferred import: the name table is only loaded once somebody asks for it,
    # which saves memory when possible.
    from language_data.name_data import CODE_TO_NAMES

    return CODE_TO_NAMES.get(code, {})

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,58 @@
from language_data.util import data_filename
# Registry fields that may occur several times in one entry; their values are
# collected into a list instead of being stored as a single string.
LIST_KEYS = {'Description', 'Prefix'}


def parse_file(file):
    """
    Read an open IANA subtag registry file and yield one dictionary of
    information per subtag entry found in it.
    """
    pending = []
    for raw_line in file:
        text = raw_line.rstrip('\n')

        if text.startswith(' '):
            # Continuation of a wrapped line: glue it onto the previous line,
            # keeping one of its leading spaces.
            pending[-1] += text[1:]
            continue

        if text != '%%':
            pending.append(text)
            continue

        # '%%' separates entries: emit what has been collected so far and
        # start collecting the next entry.
        yield from parse_item(pending)
        pending = []

    # The last entry has no trailing separator.
    yield from parse_item(pending)


def parse_item(lines):
    """
    Parse the lines of one registry entry (wrapped lines already joined).

    This is a generator: it yields a single dictionary when the entry
    describes a subtag or tag, and yields nothing otherwise (for example
    for the file header).
    """
    record = {}
    for entry in lines:
        field, content = entry.split(': ', 1)
        if field not in LIST_KEYS:
            # Single-valued fields must not repeat within one entry.
            assert field not in record
            record[field] = content
            continue
        record.setdefault(field, []).append(content)

    if 'Subtag' in record or 'Tag' in record:
        yield record
def parse_registry():
    """
    Yield one dictionary per entry of the bundled IANA subtag registry file.
    """
    registry_path = data_filename('language-subtag-registry.txt')
    with open(registry_path, encoding='utf-8') as registry:
        # Delegate with 'yield from' rather than returning the generator, so
        # the file is closed only once the entries have been consumed.
        yield from parse_file(registry)

15
lib/language_data/util.py Normal file
View file

@ -0,0 +1,15 @@
"""
Used for locating a file in the data directory.
"""
from pkg_resources import resource_filename
DATA_ROOT = resource_filename('language_data', 'data')
import os
def data_filename(filename):
    """
    Return the full path of `filename`, a path relative to the data
    directory (DATA_ROOT).
    """
    full_path = os.path.join(DATA_ROOT, filename)
    return full_path

View file

@ -4009,7 +4009,42 @@ class AddShows(Home):
if all_langs: if all_langs:
result.extend([lang['sg_lang'] for lang in all_langs if lang['sg_lang'] not in result]) result.extend([lang['sg_lang'] for lang in all_langs if lang['sg_lang'] not in result])
return json_dumps({'results': result}) try:
# noinspection PyPep8Naming
from langcodes import Language as lang_obj, LanguageTagError, standardize_tag
except ImportError:
lang_obj = None
result_ext = []
if None is not lang_obj:
prio_abbr = ''
prio_lang = []
try:
lang = lang_obj.get(sickgear.ADD_SHOWS_METALANG)
prio_abbr = lang.to_alpha3()
prio_lang = [dict(orig_abbr=sickgear.ADD_SHOWS_METALANG, std_abbr=sickgear.ADD_SHOWS_METALANG,
abbr=prio_abbr, en=lang.display_name(), native=lang.autonym())]
except (BaseException, Exception) as _:
pass
dedupe = []
for cur_lang in result:
try:
lang = lang_obj.get(cur_lang)
abbr = lang.to_alpha3()
except (BaseException, Exception) as _:
continue
try:
std_abbr = standardize_tag(cur_lang, macro=True)
except (BaseException, Exception) as _:
std_abbr = None
if abbr not in dedupe and abbr != prio_abbr:
dedupe += [abbr]
result_ext += [dict(orig_abbr=cur_lang, std_abbr=std_abbr, abbr=abbr, en=lang.display_name(), native=lang.autonym())]
result_ext = prio_lang + sorted(result_ext, key=lambda x: x['en'])
return json_dumps({'results': [] if result_ext else result, 'results_ext': result_ext})
@staticmethod @staticmethod
def generate_show_dir_name(show_name): def generate_show_dir_name(show_name):