Add langcodes 3.3.0 and language-data 1.1 (vendored from `pip install langcodes[data]`).
Add a Select2 drop-down to `add-shows` and `edit-show`. Select2 enables displaying inline language flag images, a feature that the native `select` drop-down element has deprecated on some browsers.
Change: run the existing TVInfo source language lists through validation (removes ~4 bad items), de-duplicate the list, and get the native names, English names, and three-letter abbreviations.
Change: remove the marisa-trie requirement from language_data/names.py because nothing in SG calls a function that requires it.
Change: update some flags.
|
@ -17,6 +17,7 @@ Libs with customisations...
|
||||||
/lib/hachoir_parser/guess.py
|
/lib/hachoir_parser/guess.py
|
||||||
/lib/hachoir_parser/misc/torrent.py
|
/lib/hachoir_parser/misc/torrent.py
|
||||||
/lib/imdbpie
|
/lib/imdbpie
|
||||||
|
/lib/language_data/names.py
|
||||||
/lib/lockfile/mkdirlockfile.py
|
/lib/lockfile/mkdirlockfile.py
|
||||||
/lib/rtorrent
|
/lib/rtorrent
|
||||||
/lib/scandir/scandir.py
|
/lib/scandir/scandir.py
|
||||||
|
|
|
@ -1312,6 +1312,9 @@ div.formpaginate{
|
||||||
width:480px;
|
width:480px;
|
||||||
margin-top:0
|
margin-top:0
|
||||||
}
|
}
|
||||||
|
#addShowForm #nameToSearch.select2{
|
||||||
|
width:428px;
|
||||||
|
}
|
||||||
#addShowForm #nameToSearch.wide{
|
#addShowForm #nameToSearch.wide{
|
||||||
width:591px;
|
width:591px;
|
||||||
}
|
}
|
||||||
|
@ -3790,6 +3793,13 @@ option.flag{
|
||||||
background-position:10px 50%
|
background-position:10px 50%
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#select2-infosrc-lang-select-container .flag,
|
||||||
|
#select2-infosrc-lang-select-results .flag{
|
||||||
|
padding-left:25px;
|
||||||
|
background-repeat:no-repeat;
|
||||||
|
background-position:0 50%
|
||||||
|
}
|
||||||
|
|
||||||
/* Anime section for editShow */
|
/* Anime section for editShow */
|
||||||
.anigrouplists-wrapper{
|
.anigrouplists-wrapper{
|
||||||
height:auto;
|
height:auto;
|
||||||
|
|
BIN
gui/slick/images/flags/hy.png
Normal file
After Width: | Height: | Size: 212 B |
BIN
gui/slick/images/flags/ka.png
Normal file
After Width: | Height: | Size: 287 B |
BIN
gui/slick/images/flags/nb.png
Normal file
After Width: | Height: | Size: 397 B |
BIN
gui/slick/images/flags/nn.png
Normal file
After Width: | Height: | Size: 397 B |
BIN
gui/slick/images/flags/sq.png
Normal file
After Width: | Height: | Size: 324 B |
Before Width: | Height: | Size: 303 B |
Before Width: | Height: | Size: 545 B After Width: | Height: | Size: 303 B |
|
@ -28,6 +28,24 @@
|
||||||
<script type="text/javascript" src="$sbRoot/js/qualityChooser.js?v=$sbPID"></script>
|
<script type="text/javascript" src="$sbRoot/js/qualityChooser.js?v=$sbPID"></script>
|
||||||
<script type="text/javascript" src="$sbRoot/js/editShow.js?v=$sbPID"></script>
|
<script type="text/javascript" src="$sbRoot/js/editShow.js?v=$sbPID"></script>
|
||||||
<script type="text/javascript" src="$sbRoot/js/livepanel.js?v=$sbPID"></script>
|
<script type="text/javascript" src="$sbRoot/js/livepanel.js?v=$sbPID"></script>
|
||||||
|
<script src="$sbRoot/js/lib/select2.full.min.js"></script>
|
||||||
|
<link href="$sbRoot/css/lib/select2.css" rel="stylesheet">
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.select2-container{height:32px; font-size:12px; margin-right:6px}
|
||||||
|
.select2-container .select2-selection--single{height:30px}
|
||||||
|
.select2-results__group{color: #eee; background-color: rgb(51,51,51)}
|
||||||
|
.select2-results__options .select2-results__option{color: #222; background-color: #ddd}
|
||||||
|
.select2-results__options .select2-results__option .ended{color: #888}
|
||||||
|
.select2-container--default .select2-results > .select2-results__options{max-height: 300px}
|
||||||
|
#select2-infosrc-lang-select-results .select2-results__option,
|
||||||
|
#select2-infosrc-lang-select-results .select2-results__group{padding-top: 2px !important; padding-bottom:2px !important}
|
||||||
|
#select2-infosrc-lang-select-results .select2-results__option--highlighted.select2-results__option--selectable .ended{color:white}
|
||||||
|
#select2-infosrc-lang-select-results .select2-results__option--selected,
|
||||||
|
#select2-infosrc-lang-select-results .select2-results__option--selected span{color:rgb(143, 21, 21) !important}
|
||||||
|
#select2-infosrc-lang-select-results span.flag{width:100%; height:100%; display:block}
|
||||||
|
</style>
|
||||||
|
|
||||||
#if $varExists('header')
|
#if $varExists('header')
|
||||||
<h1 class="header"><span class="grey-text">Edit </span>$header</h1>
|
<h1 class="header"><span class="grey-text">Edit </span>$header</h1>
|
||||||
#else
|
#else
|
||||||
|
@ -244,10 +262,10 @@
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="field-pair">
|
<div class="field-pair">
|
||||||
<label for="infosrc-lang-select-edit">
|
<label for="infosrc-lang-select">
|
||||||
<span class="component-title">Info language</span>
|
<span class="component-title">Info language</span>
|
||||||
<span class="component-desc">
|
<span class="component-desc">
|
||||||
<select name="tvinfo_lang" id="infosrc-lang-select-edit" class="form-control form-control-inline input-sm"></select>
|
<select name="tvinfo_lang" id="infosrc-lang-select" class="form-control form-control-inline input-sm"></select>
|
||||||
<span>fetch show information in this language</span>
|
<span>fetch show information in this language</span>
|
||||||
</span>
|
</span>
|
||||||
</label>
|
</label>
|
||||||
|
|
|
@ -35,6 +35,23 @@
|
||||||
<script type="text/javascript" src="$sbRoot/js/qualityChooser.js?v=$sbPID"></script>
|
<script type="text/javascript" src="$sbRoot/js/qualityChooser.js?v=$sbPID"></script>
|
||||||
<script type="text/javascript" src="$sbRoot/js/newShow.js?v=$sbPID"></script>
|
<script type="text/javascript" src="$sbRoot/js/newShow.js?v=$sbPID"></script>
|
||||||
<script type="text/javascript" src="$sbRoot/js/addShowOptions.js?v=$sbPID"></script>
|
<script type="text/javascript" src="$sbRoot/js/addShowOptions.js?v=$sbPID"></script>
|
||||||
|
<script src="$sbRoot/js/lib/select2.full.min.js"></script>
|
||||||
|
<link href="$sbRoot/css/lib/select2.css" rel="stylesheet">
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.select2-container{height:32px; font-size:12px}
|
||||||
|
.select2-container .select2-selection--single{height:30px}
|
||||||
|
.select2-results__group{color: #eee; background-color: rgb(51,51,51)}
|
||||||
|
.select2-results__options .select2-results__option{color: #222; background-color: #ddd}
|
||||||
|
.select2-results__options .select2-results__option .ended{color: #888}
|
||||||
|
.select2-container--default .select2-results > .select2-results__options{max-height: 300px}
|
||||||
|
#select2-infosrc-lang-select-results .select2-results__option,
|
||||||
|
#select2-infosrc-lang-select-results .select2-results__group{padding-top: 2px !important; padding-bottom:2px !important}
|
||||||
|
#select2-infosrc-lang-select-results .select2-results__option--highlighted.select2-results__option--selectable .ended{color:white}
|
||||||
|
#select2-infosrc-lang-select-results .select2-results__option--selected,
|
||||||
|
#select2-infosrc-lang-select-results .select2-results__option--selected span{color:rgb(143, 21, 21) !important}
|
||||||
|
#select2-infosrc-lang-select-results span.flag{width:100%; height:100%; display:block}
|
||||||
|
</style>
|
||||||
|
|
||||||
#if $varExists('header')
|
#if $varExists('header')
|
||||||
<h1 class="header">$header</h1>
|
<h1 class="header">$header</h1>
|
||||||
|
|
|
@ -16,31 +16,73 @@ $(document).ready(function () {
|
||||||
return ' class="flag" style="background-image:url(' + $.SickGear.Root + '/images/flags/' + lang + '.png)"'
|
return ' class="flag" style="background-image:url(' + $.SickGear.Root + '/images/flags/' + lang + '.png)"'
|
||||||
}
|
}
|
||||||
|
|
||||||
$.getJSON($.SickGear.Root + '/add-shows/get-infosrc-languages', {}, function (data) {
|
function uriFlag(lang) {
|
||||||
var result = '', currentLangAdded = '', selected = ' selected="selected"';
|
return $.SickGear.Root + '/images/flags/' + lang + '.png'
|
||||||
|
}
|
||||||
|
|
||||||
if (!data.results.length) {
|
$.getJSON($.SickGear.Root + '/add-shows/get-infosrc-languages', {}, function (data) {
|
||||||
result = '<option value="' + config.showLang + '"' + selected + htmlFlag(config.showLang) + '>'
|
var htmlText = '', currentLangAdded = '',
|
||||||
|
selected = ' selected="selected"', htmlSelected = '',
|
||||||
|
elInfosrcLang = $('#infosrc-lang-select'),
|
||||||
|
useSelect2 = 0 < data.results_ext.length, populateItem;
|
||||||
|
|
||||||
|
if (!data.results.length && !data.results_ext.length) {
|
||||||
|
htmlText = '<option value="' + config.showLang + '"' + selected + htmlFlag(config.showLang) + '>'
|
||||||
+ config.showLang + '</option>';
|
+ config.showLang + '</option>';
|
||||||
} else {
|
} else {
|
||||||
currentLangAdded = !1;
|
currentLangAdded = !1;
|
||||||
|
if (useSelect2){
|
||||||
|
// 3 letter abbr object
|
||||||
|
$.each(data.results_ext, function (index, obj) {
|
||||||
|
|
||||||
|
htmlSelected = '';
|
||||||
|
if (obj.std_abbr === config.showLang) {
|
||||||
|
currentLangAdded = !0;
|
||||||
|
htmlSelected = selected;
|
||||||
|
}
|
||||||
|
|
||||||
|
htmlText += '<option style="padding-left:25px" value="' + obj.std_abbr + '"'
|
||||||
|
+ ' data-abbr="' + obj.abbr + '"'
|
||||||
|
+ ' data-img="' + uriFlag(obj.std_abbr) + '"'
|
||||||
|
+ ' data-title="' + obj.en + ' (' + obj.orig_abbr + '/' + obj.std_abbr + '/' + obj.abbr + ')' + '"'
|
||||||
|
+ (!!htmlSelected
|
||||||
|
? htmlSelected + '>> '
|
||||||
|
: '>')
|
||||||
|
+ obj.native
|
||||||
|
+ '</option>';
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
// legacy 2 letter abbr list
|
||||||
$.each(data.results, function (index, strLang) {
|
$.each(data.results, function (index, strLang) {
|
||||||
|
|
||||||
var htmlSelected = '';
|
htmlSelected = '';
|
||||||
if (strLang === config.showLang) {
|
if (strLang === config.showLang) {
|
||||||
currentLangAdded = !0;
|
currentLangAdded = !0;
|
||||||
htmlSelected = selected;
|
htmlSelected = selected;
|
||||||
}
|
}
|
||||||
|
|
||||||
result += '<option value="' + strLang + '"' + htmlSelected + htmlFlag(strLang) + '>'
|
htmlText += '<option value="' + strLang + '"' + htmlSelected + htmlFlag(strLang) + '>'
|
||||||
+ strLang + '</option>';
|
+ strLang + '</option>';
|
||||||
});
|
});
|
||||||
|
}
|
||||||
if (!currentLangAdded)
|
if (!currentLangAdded)
|
||||||
result += '<option value="' + config.showLang + '" ' + selected + '>' + config.showLang + '</option>';
|
htmlText += '<option value="' + config.showLang + '" ' + selected + '>' + config.showLang + '</option>';
|
||||||
}
|
}
|
||||||
|
|
||||||
$('#infosrc-lang-select-edit').html(result);
|
elInfosrcLang.html(htmlText);
|
||||||
|
|
||||||
|
if (useSelect2) {
|
||||||
|
populateItem = function (data) {
|
||||||
|
if (!!data.element)
|
||||||
|
return $('<span class="flag"'
|
||||||
|
+ ' style="background-image:url(' + $(data.element).data('img') + ')"'
|
||||||
|
+ ' title="' + $(data.element).data('title') + '">'
|
||||||
|
+ data.text
|
||||||
|
+ '</span>');
|
||||||
|
return data.text;
|
||||||
|
}
|
||||||
|
elInfosrcLang.select2({templateResult: populateItem, templateSelection: populateItem, width: 162});
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
function getExceptions() {
|
function getExceptions() {
|
||||||
|
|
|
@ -9,6 +9,10 @@ $(document).ready(function () {
|
||||||
return ' class="flag" style="background-image:url(' + $.SickGear.Root + '/images/flags/' + lang + '.png)"'
|
return ' class="flag" style="background-image:url(' + $.SickGear.Root + '/images/flags/' + lang + '.png)"'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function uriFlag(lang) {
|
||||||
|
return $.SickGear.Root + '/images/flags/' + lang + '.png'
|
||||||
|
}
|
||||||
|
|
||||||
function populateLangSelect() {
|
function populateLangSelect() {
|
||||||
if (!$('#nameToSearch').length)
|
if (!$('#nameToSearch').length)
|
||||||
return;
|
return;
|
||||||
|
@ -17,27 +21,58 @@ $(document).ready(function () {
|
||||||
|
|
||||||
$.getJSON(sbRoot + '/add-shows/get-infosrc-languages', {}, function (data) {
|
$.getJSON(sbRoot + '/add-shows/get-infosrc-languages', {}, function (data) {
|
||||||
|
|
||||||
var resultStr = '', flag,
|
var htmlText = '', flag,
|
||||||
selected = ' selected="selected"',
|
selected = ' selected="selected"',
|
||||||
elInfosrcLang = $('#infosrc-lang-select');
|
elInfosrcLang = $('#infosrc-lang-select'),
|
||||||
|
useSelect2 = 0 < data.results_ext.length, populateItem;
|
||||||
|
|
||||||
if (0 === data.results.length) {
|
if (0 === data.results.length && 0 === data.results_ext.length) {
|
||||||
resultStr = '<option value="en"' + selected + '>> en</option>';
|
htmlText = '<option value="en"' + selected + '>> en</option>';
|
||||||
} else {
|
} else {
|
||||||
|
if (useSelect2) {
|
||||||
|
$('#nameToSearch').addClass('select2');
|
||||||
|
// 3 letter abbr object
|
||||||
|
$.each(data.results_ext, function (index, obj) {
|
||||||
|
htmlText += '<option style="padding-left:25px" value="' + obj.std_abbr + '"'
|
||||||
|
+ ' data-abbr="' + obj.abbr + '"'
|
||||||
|
+ ' data-img="' + uriFlag(obj.std_abbr) + '"'
|
||||||
|
+ ' data-title="' + obj.en + ' (' + obj.orig_abbr + '/' + obj.std_abbr + '/' + obj.abbr + ')' + '"'
|
||||||
|
+ ('' === htmlText
|
||||||
|
? selected + '>> '
|
||||||
|
: '>')
|
||||||
|
+ obj.native
|
||||||
|
+ '</option>';
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
// legacy 2 letter abbr list
|
||||||
$.each(data.results, function (index, obj) {
|
$.each(data.results, function (index, obj) {
|
||||||
flag = htmlFlag(obj);
|
flag = htmlFlag(obj);
|
||||||
resultStr += '<option value="' + obj + '"'
|
htmlText += '<option value="' + obj + '"'
|
||||||
+ ('' === resultStr
|
+ ('' === htmlText
|
||||||
? flag.replace('"flag', '"flag selected-text') + selected + '>> '
|
? flag.replace('"flag', '"flag selected-text') + selected + '>> '
|
||||||
: flag + '>')
|
: flag + '>')
|
||||||
+ obj + '</option>';
|
+ obj + '</option>';
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
elInfosrcLang.html(resultStr);
|
elInfosrcLang.html(htmlText);
|
||||||
elInfosrcLang.change(function () {
|
elInfosrcLang.change(function () {
|
||||||
searchIndexers();
|
searchIndexers();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
if (useSelect2) {
|
||||||
|
populateItem = function(data) {
|
||||||
|
if (!!data.element)
|
||||||
|
return $('<span class="flag"'
|
||||||
|
+ ' style="background-image:url(' + $(data.element).data('img') + ')"'
|
||||||
|
+ ' title="' + $(data.element).data('title') + '">'
|
||||||
|
+ data.text
|
||||||
|
+ '</span>');
|
||||||
|
return data.text;
|
||||||
|
}
|
||||||
|
elInfosrcLang.select2({templateResult: populateItem, templateSelection: populateItem, width: 155});
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
1931
lib/langcodes/__init__.py
Normal file
242
lib/langcodes/build_data.py
Normal file
|
@ -0,0 +1,242 @@
|
||||||
|
import json
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from langcodes.util import data_filename
|
||||||
|
from langcodes.registry_parser import parse_registry
|
||||||
|
|
||||||
|
|
||||||
|
def read_cldr_supplemental(dataname):
    """
    Load one CLDR supplemental JSON file and return its payload.

    `dataname` is the base name (without '.json') of a file under the
    'cldr-core/supplemental' data directory. For 'aliases' the payload is
    found at supplemental -> metadata -> alias; for any other name it is
    found at supplemental -> <dataname>.
    """
    cldr_supp_path = data_filename('cldr-json/cldr-json/cldr-core/supplemental')
    filename = data_filename(f'{cldr_supp_path}/{dataname}.json')
    # Read inside a context manager so the file handle is closed as soon as
    # the JSON has been parsed, instead of whenever the object is collected.
    with open(filename, encoding='utf-8') as infile:
        fulldata = json.load(infile)
    if dataname == 'aliases':
        data = fulldata['supplemental']['metadata']['alias']
    else:
        data = fulldata['supplemental'][dataname]
    return data
|
||||||
|
|
||||||
|
|
||||||
|
def read_iana_registry_suppress_scripts():
    """
    Map each language subtag in the registry to its 'Suppress-Script' value.

    Only registry entries of type 'language' that carry a 'Suppress-Script'
    field are included.
    """
    return {
        record['Subtag']: record['Suppress-Script']
        for record in parse_registry()
        if record['Type'] == 'language' and 'Suppress-Script' in record
    }
|
||||||
|
|
||||||
|
|
||||||
|
def read_iana_registry_scripts():
    """Return the set of subtags of every registry entry whose type is 'script'."""
    return {
        record['Subtag']
        for record in parse_registry()
        if record['Type'] == 'script'
    }
|
||||||
|
|
||||||
|
|
||||||
|
def read_iana_registry_macrolanguages():
    """
    Map each language subtag in the registry to its 'Macrolanguage' value.

    Only registry entries of type 'language' that carry a 'Macrolanguage'
    field are included.
    """
    return {
        record['Subtag']: record['Macrolanguage']
        for record in parse_registry()
        if record['Type'] == 'language' and 'Macrolanguage' in record
    }
|
||||||
|
|
||||||
|
|
||||||
|
def read_iana_registry_replacements():
    """
    Collect the 'Preferred-Value' replacements listed in the registry.

    Language entries are keyed by their subtag as written; entries that have
    a 'Tag' field are keyed by the lowercased tag.
    """
    mapping = {}
    for record in parse_registry():
        # Read 'Type' first, exactly as the chained condition would.
        is_language = record['Type'] == 'language'
        if 'Preferred-Value' not in record:
            continue
        preferred = record['Preferred-Value']
        if is_language:
            # Replacements for language codes
            mapping[record['Subtag']] = preferred
        elif 'Tag' in record:
            # Replacements for entire tags
            mapping[record['Tag'].lower()] = preferred
    return mapping
|
||||||
|
|
||||||
|
|
||||||
|
def write_python_dict(outfile, name, d):
    """
    Write `d` to `outfile` as Python source that assigns a dict literal to
    `name`, one `key: value` entry per line, in sorted key order.
    """
    def emit(text):
        print(text, file=outfile)

    emit(f"{name} = {{")
    for key in sorted(d):
        emit(f" {key!r}: {d[key]!r},")
    emit("}")
|
||||||
|
|
||||||
|
|
||||||
|
def write_python_set(outfile, name, s):
    """
    Write `s` to `outfile` as Python source that assigns a set literal to
    `name`, one element per line, de-duplicated and in sorted order.
    """
    def emit(text):
        print(text, file=outfile)

    unique_sorted = sorted(set(s))
    emit(f"{name} = {{")
    for member in unique_sorted:
        emit(f" {member!r},")
    emit("}")
|
||||||
|
|
||||||
|
|
||||||
|
GENERATED_HEADER = "# This file is generated by build_data.py."
|
||||||
|
|
||||||
|
|
||||||
|
def read_validity_regex():
    """
    Build a regex source string matching every code listed in the CLDR
    validity files for languages, regions, scripts and variants.

    Each whitespace-separated item in an `idValidity/id` element becomes one
    alternative. An item containing '~' denotes a range over its final
    character (e.g. 'ab~d' means 'ab', 'ac', 'ad') and is turned into a
    character class ('a[b-d]').

    Returns the pattern as a string of the form '^(alt1|alt2|...)$'.
    Raises ValueError if a range item does not have '~' as its
    second-to-last character.
    """
    validity_options = []
    for codetype in ('language', 'region', 'script', 'variant'):
        validity_path = data_filename(f'cldr/common/validity/{codetype}.xml')
        # ET.parse reads the file as bytes (so the XML encoding declaration is
        # honoured rather than the locale default) and closes the file itself.
        root = ET.parse(validity_path).getroot()
        for match in root.findall('./idValidity/id'):
            # An empty <id/> element has text None; treat it as no items.
            for item in (match.text or '').strip().split():
                if '~' not in item:
                    validity_options.append(item)
                    continue
                # Explicit check instead of `assert`, which is stripped
                # under `python -O` and would let a bad pattern through.
                if len(item) < 3 or item[-2] != '~':
                    raise ValueError(f"unexpected range item {item!r} in {validity_path}")
                prefix = item[:-3]
                range_start = item[-3]
                range_end = item[-1]
                validity_options.append(f"{prefix}[{range_start}-{range_end}]")
    options = '|'.join(validity_options)
    return f'^({options})$'
|
||||||
|
|
||||||
|
|
||||||
|
def read_language_distances():
    """
    Read the 'written_new' language-match distances from CLDR's
    languageInfo.xml.

    Returns a nested dict such that result[desired][supported] is the integer
    distance. Rules whose 'desired' value has three or more '_'-separated
    parts are ignored. Rules not marked oneway="true" are stored in both
    directions.
    """
    language_info_path = data_filename('cldr/common/supplemental/languageInfo.xml')
    # ET.parse reads the file as bytes (so the XML encoding declaration is
    # honoured rather than the locale default) and closes the file itself.
    root = ET.parse(language_info_path).getroot()
    matches = root.findall(
        './languageMatching/languageMatches[@type="written_new"]/languageMatch'
    )
    tag_distances = {}
    for match in matches:
        attribs = match.attrib
        n_parts = attribs['desired'].count('_') + 1
        if n_parts < 3:
            if attribs.get('oneway') == 'true':
                pairs = [(attribs['desired'], attribs['supported'])]
            else:
                pairs = [
                    (attribs['desired'], attribs['supported']),
                    (attribs['supported'], attribs['desired']),
                ]
            for (desired, supported) in pairs:
                desired_distance = tag_distances.setdefault(desired, {})
                desired_distance[supported] = int(attribs['distance'])

                # The 'languageInfo' data file contains distances for the unnormalized
                # tag 'sh', but we work mostly with normalized tags, and they don't
                # describe at all how to cope with this.
                #
                # 'sh' normalizes to 'sr-Latn', and when we're matching languages we
                # aren't matching scripts yet, so when 'sh' appears we'll add a
                # corresponding match for 'sr'.
                #
                # Then because we're kind of making this plan up, add 1 to the distance
                # so it's a worse match than ones that are actually clearly defined
                # in languageInfo.
                if desired == 'sh' or supported == 'sh':
                    if desired == 'sh':
                        desired = 'sr'
                    if supported == 'sh':
                        supported = 'sr'
                    if desired != supported:
                        # don't try to define a non-zero distance for sr <=> sr
                        desired_distance = tag_distances.setdefault(desired, {})
                        desired_distance[supported] = int(attribs['distance']) + 1

    return tag_distances
|
||||||
|
|
||||||
|
|
||||||
|
def build_data():
    """
    Regenerate data_dicts.py from the IANA registry and the CLDR data files.

    Reads the registry-derived tables, the CLDR alias and likely-subtag data
    and the validity regex, then writes them as Python literals to
    'data_dicts.py' in the current working directory.
    """
    default_scripts = read_iana_registry_suppress_scripts()
    script_codes = read_iana_registry_scripts()
    macrolanguages = read_iana_registry_macrolanguages()
    iana_replacements = read_iana_registry_replacements()
    language_distances = read_language_distances()

    alias_data = read_cldr_supplemental('aliases')
    likely_subtags = read_cldr_supplemental('likelySubtags')
    replacements = {}

    # Aliased codes can still have alpha3 codes, and there's no unified source
    # about what they are. It depends on whether the alias predates or postdates
    # ISO 639-2, which nobody should have to care about. So let's set all the
    # alpha3 codes for aliased alpha2 codes here.
    alpha3_mapping = {
        'tl': 'tgl',  # even though it normalizes to 'fil'
        'in': 'ind',
        'iw': 'heb',
        'ji': 'yid',
        'jw': 'jav',
        'sh': 'hbs',
    }
    alpha3_biblio = {}
    norm_macrolanguages = {}

    # CLDR tries to oversimplify some codes as it assigns aliases.
    # For example, 'nor' is the ISO alpha3 code for 'no', but CLDR
    # would prefer you use 'nb' over 'no', so it makes 'nor' an
    # alias of 'nb'. But 'nb' already has an alpha3 code, 'nob'.
    #
    # We undo this oversimplification so that we can get a
    # canonical mapping between alpha2 and alpha3 codes.
    alpha2_overrides = {'nor': 'no', 'mol': 'mo', 'twi': 'tw', 'bih': 'bh'}

    for alias_type in ['languageAlias', 'scriptAlias', 'territoryAlias']:
        aliases = alias_data[alias_type]
        # Initially populate 'languageAlias' with the aliases from the IANA file
        if alias_type == 'languageAlias':
            table = iana_replacements
            table['root'] = 'und'
        else:
            table = {}
        replacements[alias_type] = table

        for raw_code, value in aliases.items():
            # Make all keys lowercase so they can be looked up
            # case-insensitively
            code = raw_code.lower()

            # If there are multiple replacements, take the first one. For example,
            # we just replace the Soviet Union (SU) with Russia (RU), instead of
            # trying to do something context-sensitive and poorly standardized
            # that selects one of the successor countries to the Soviet Union.
            replacement = value['_replacement'].split()[0]
            reason = value['_reason']
            if reason == 'macrolanguage':
                norm_macrolanguages[code] = replacement
                continue

            replacement = alpha2_overrides.get(code, replacement)
            table[code] = replacement
            if alias_type != 'languageAlias':
                continue

            if reason == 'overlong':
                if replacement in alpha3_mapping:
                    raise ValueError(
                        "{code!r} is an alpha3 for {replacement!r}, which"
                        " already has an alpha3: {orig!r}".format(
                            code=code,
                            replacement=replacement,
                            orig=alpha3_mapping[replacement],
                        )
                    )
                alpha3_mapping[replacement] = code
            elif reason == 'bibliographic':
                alpha3_biblio[replacement] = code

    validity_regex = read_validity_regex()

    # Write the contents of data_dicts.py.
    with open('data_dicts.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        print("import re\n", file=outfile)
        write_python_dict(outfile, 'DEFAULT_SCRIPTS', default_scripts)
        write_python_dict(
            outfile, 'LANGUAGE_REPLACEMENTS', replacements['languageAlias']
        )
        write_python_dict(outfile, 'LANGUAGE_ALPHA3', alpha3_mapping)
        write_python_dict(outfile, 'LANGUAGE_ALPHA3_BIBLIOGRAPHIC', alpha3_biblio)
        write_python_dict(outfile, 'SCRIPT_REPLACEMENTS', replacements['scriptAlias'])
        write_python_set(outfile, 'ALL_SCRIPTS', script_codes)
        write_python_dict(
            outfile, 'TERRITORY_REPLACEMENTS', replacements['territoryAlias']
        )
        write_python_dict(outfile, 'MACROLANGUAGES', macrolanguages)
        write_python_dict(outfile, 'NORMALIZED_MACROLANGUAGES', norm_macrolanguages)
        write_python_dict(outfile, 'LIKELY_SUBTAGS', likely_subtags)
        write_python_dict(outfile, 'LANGUAGE_DISTANCES', language_distances)
        print(f"VALIDITY = re.compile({validity_regex!r})", file=outfile)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
build_data()
|
48462
lib/langcodes/data/language-subtag-registry.txt
Normal file
4377
lib/langcodes/data_dicts.py
Normal file
188
lib/langcodes/language_distance.py
Normal file
|
@ -0,0 +1,188 @@
|
||||||
|
from .data_dicts import LANGUAGE_DISTANCES
|
||||||
|
from typing import Dict, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
TagTriple = Tuple[str, str, str]
|
||||||
|
_DISTANCE_CACHE: Dict[Tuple[TagTriple, TagTriple], int] = {}
|
||||||
|
DEFAULT_LANGUAGE_DISTANCE = LANGUAGE_DISTANCES["*"]["*"]
|
||||||
|
DEFAULT_SCRIPT_DISTANCE = LANGUAGE_DISTANCES["*_*"]["*_*"]
|
||||||
|
DEFAULT_TERRITORY_DISTANCE = 4
|
||||||
|
|
||||||
|
|
||||||
|
# Territory clusters used in territory matching:
|
||||||
|
# Maghreb (the western Arab world)
|
||||||
|
MAGHREB = {"MA", "DZ", "TN", "LY", "MR", "EH"}
|
||||||
|
|
||||||
|
# United States and its territories
|
||||||
|
US = {"AS", "GU", "MH", "MP", "PR", "UM", "US", "VI"}
|
||||||
|
|
||||||
|
# Special Autonomous Regions of China
|
||||||
|
CNSAR = {"HK", "MO"}
|
||||||
|
|
||||||
|
LATIN_AMERICA = {
|
||||||
|
"419",
|
||||||
|
# Central America
|
||||||
|
"013",
|
||||||
|
"BZ",
|
||||||
|
"CR",
|
||||||
|
"SV",
|
||||||
|
"GT",
|
||||||
|
"HN",
|
||||||
|
"MX",
|
||||||
|
"NI",
|
||||||
|
"PA",
|
||||||
|
# South America
|
||||||
|
"005",
|
||||||
|
"AR",
|
||||||
|
"BO",
|
||||||
|
"BR",
|
||||||
|
"CL",
|
||||||
|
"CO",
|
||||||
|
"EC",
|
||||||
|
"FK",
|
||||||
|
"GF",
|
||||||
|
"GY",
|
||||||
|
"PY",
|
||||||
|
"PE",
|
||||||
|
"SR",
|
||||||
|
"UY",
|
||||||
|
"VE",
|
||||||
|
}
|
||||||
|
|
||||||
|
# North and South America
|
||||||
|
AMERICAS = {
|
||||||
|
"019",
|
||||||
|
# Caribbean
|
||||||
|
"029",
|
||||||
|
"AI",
|
||||||
|
"AG",
|
||||||
|
"AW",
|
||||||
|
"BS",
|
||||||
|
"BB",
|
||||||
|
"VG",
|
||||||
|
"BQ",
|
||||||
|
"KY",
|
||||||
|
"CU",
|
||||||
|
"CW",
|
||||||
|
"DM",
|
||||||
|
"DO",
|
||||||
|
"GD",
|
||||||
|
"GP",
|
||||||
|
"HT",
|
||||||
|
"JM",
|
||||||
|
"MQ",
|
||||||
|
"MS",
|
||||||
|
"PR",
|
||||||
|
"SX",
|
||||||
|
"BL",
|
||||||
|
"KN",
|
||||||
|
"LC",
|
||||||
|
"MF",
|
||||||
|
"VC",
|
||||||
|
"TT",
|
||||||
|
"TC",
|
||||||
|
"VI",
|
||||||
|
# Northern America
|
||||||
|
"021",
|
||||||
|
"BM",
|
||||||
|
"CA",
|
||||||
|
"GL",
|
||||||
|
"PM",
|
||||||
|
"US",
|
||||||
|
# North America as a whole
|
||||||
|
"003",
|
||||||
|
} | LATIN_AMERICA
|
||||||
|
|
||||||
|
|
||||||
|
def tuple_distance_cached(desired: TagTriple, supported: TagTriple) -> int:
    """
    Takes in triples of (language, script, territory), which can be derived by
    'maximizing' a language tag. Returns a number from 0 to 135 indicating the
    'distance' between these for the purposes of language matching.

    Results are memoized in the module-level `_DISTANCE_CACHE`, keyed by the
    (desired, supported) pair.
    """
    # Identical triples need neither a lookup nor a cache entry.
    if supported == desired:
        return 0

    cache_key = (desired, supported)
    if cache_key not in _DISTANCE_CACHE:
        # First time this pair is seen: compute once and remember it.
        _DISTANCE_CACHE[cache_key] = _tuple_distance(desired, supported)
    return _DISTANCE_CACHE[cache_key]
|
||||||
|
|
||||||
|
|
||||||
|
def _get2(dictionary: dict, key1: str, key2: str, default):
|
||||||
|
return dictionary.get(key1, {}).get(key2, default)
|
||||||
|
|
||||||
|
|
||||||
|
def _tuple_distance(desired: TagTriple, supported: TagTriple) -> int:
    """
    Compute the uncached distance between two (language, script, territory)
    triples by summing a language term, a script term and a territory term.
    Each term is added only when the corresponding components differ.
    """
    want_lang, want_script, want_region = desired
    have_lang, have_script, have_region = supported
    total = 0

    if want_lang != have_lang:
        total += _get2(
            LANGUAGE_DISTANCES, want_lang, have_lang, DEFAULT_LANGUAGE_DISTANCE
        )

    want_pair = f"{want_lang}_{want_script}"
    have_pair = f"{have_lang}_{have_script}"

    if want_script != have_script:
        # Scripts can match other scripts, but only when paired with a
        # language. For example, there is no reason to assume someone who can
        # read 'Latn' can read 'Cyrl', but there is plenty of reason to believe
        # someone who can read 'sr-Latn' can read 'sr-Cyrl' because Serbian is
        # a language written in two scripts.
        total += _get2(
            LANGUAGE_DISTANCES, want_pair, have_pair, DEFAULT_SCRIPT_DISTANCE
        )

    if want_region == have_region:
        return total

    # The rules for matching territories are too weird to implement the
    # general case efficiently. Instead of implementing all the possible
    # match rules the XML could define, instead we just reimplement the
    # rules of CLDR 36.1 here in code.
    region_cost = DEFAULT_TERRITORY_DISTANCE
    if want_pair == have_pair:
        if want_lang == "ar":
            if (want_region in MAGHREB) != (have_region in MAGHREB):
                region_cost = 5
        elif want_lang == "en":
            if want_region == "GB" and have_region not in US:
                region_cost = 3
            elif want_region not in US and have_region == "GB":
                region_cost = 3
            elif (want_region in US) != (have_region in US):
                region_cost = 5
        # This is not a rule that's spelled out in CLDR, but is implied by things
        # about territory containment mentioned in other standards. Numeric values
        # for territories, like '003', represent broad regions that contain more
        # specific territories.
        #
        # 419 is the numeric value most often seen in language codes, particularly
        # 'es-419' for Latin American Spanish. If you have a language code that
        # differs only in that its territory is more specific, like 'es-PY', it should
        # be closer to a supported 'es-419' than anything with a territory difference.
        #
        # We can implement this for 419 without becoming responsible for keeping up
        # with which countries/territories/regions contain others in the general case.
        elif want_region in LATIN_AMERICA and have_region == "419":
            region_cost = 1
        elif want_lang in ("es", "pt"):
            if (want_region in AMERICAS) != (have_region in AMERICAS):
                region_cost = 5
        elif want_pair == "zh_Hant":
            if (want_region in CNSAR) != (have_region in CNSAR):
                region_cost = 5

    return total + region_cost
|
517
lib/langcodes/language_lists.py
Normal file
|
@ -0,0 +1,517 @@
|
||||||
|
# This is the list of language codes with the 'modern' level of support in CLDR
|
||||||
|
# (compared to 'full', which contains many more languages). We use this as the
|
||||||
|
# list of languages that we store specific name-to-code mappings for.
|
||||||
|
|
||||||
|
CLDR_LANGUAGES = {
|
||||||
|
'af',
|
||||||
|
'am',
|
||||||
|
'ar',
|
||||||
|
'az',
|
||||||
|
'be',
|
||||||
|
'bg',
|
||||||
|
'bn',
|
||||||
|
'bs',
|
||||||
|
'ca',
|
||||||
|
'cs',
|
||||||
|
'cy',
|
||||||
|
'da',
|
||||||
|
'de',
|
||||||
|
'el',
|
||||||
|
'en',
|
||||||
|
'es',
|
||||||
|
'et',
|
||||||
|
'eu',
|
||||||
|
'fa',
|
||||||
|
'fi',
|
||||||
|
'fil',
|
||||||
|
'fo',
|
||||||
|
'fr',
|
||||||
|
'ga',
|
||||||
|
'gl',
|
||||||
|
'gu',
|
||||||
|
'he',
|
||||||
|
'hi',
|
||||||
|
'hr',
|
||||||
|
'hu',
|
||||||
|
'hy',
|
||||||
|
'id',
|
||||||
|
'is',
|
||||||
|
'it',
|
||||||
|
'ja',
|
||||||
|
'ka',
|
||||||
|
'kk',
|
||||||
|
'km',
|
||||||
|
'kn',
|
||||||
|
'ko',
|
||||||
|
'ky',
|
||||||
|
'lo',
|
||||||
|
'lt',
|
||||||
|
'lv',
|
||||||
|
'mk',
|
||||||
|
'ml',
|
||||||
|
'mn',
|
||||||
|
'mr',
|
||||||
|
'ms',
|
||||||
|
'my',
|
||||||
|
'nb',
|
||||||
|
'ne',
|
||||||
|
'nl',
|
||||||
|
'pa',
|
||||||
|
'pl',
|
||||||
|
'pt',
|
||||||
|
'ro',
|
||||||
|
'ru',
|
||||||
|
'si',
|
||||||
|
'sk',
|
||||||
|
'sl',
|
||||||
|
'sq',
|
||||||
|
'sr',
|
||||||
|
'sv',
|
||||||
|
'sw',
|
||||||
|
'ta',
|
||||||
|
'te',
|
||||||
|
'th',
|
||||||
|
'ti',
|
||||||
|
'to',
|
||||||
|
'tr',
|
||||||
|
'uk',
|
||||||
|
'und',
|
||||||
|
'ur',
|
||||||
|
'uz',
|
||||||
|
'vi',
|
||||||
|
'yue',
|
||||||
|
'zh',
|
||||||
|
'zu',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# These are the names of the languages that have the most entries on the English and
|
||||||
|
# German Wiktionaries. Wiktionary only consistently identifies languages by their
|
||||||
|
# name, making it important to be able to recognize the names.
|
||||||
|
#
|
||||||
|
# These lists of names are used in `tests/test_wikt_languages.py`.
|
||||||
|
WIKT_LANGUAGE_NAMES = {}
|
||||||
|
|
||||||
|
WIKT_LANGUAGE_NAMES['en'] = [
|
||||||
|
"Spanish",
|
||||||
|
"French",
|
||||||
|
"Latvian",
|
||||||
|
"Latin",
|
||||||
|
"English",
|
||||||
|
"Mandarin",
|
||||||
|
"Italian",
|
||||||
|
"Portuguese",
|
||||||
|
"Cantonese",
|
||||||
|
"Japanese",
|
||||||
|
"German",
|
||||||
|
"Swedish",
|
||||||
|
"Korean",
|
||||||
|
"Serbo-Croatian",
|
||||||
|
"Serbian",
|
||||||
|
"Croatian",
|
||||||
|
"Bosnian",
|
||||||
|
"Finnish",
|
||||||
|
"Vietnamese",
|
||||||
|
"Dutch",
|
||||||
|
"Galician",
|
||||||
|
"Catalan",
|
||||||
|
"Polish",
|
||||||
|
"Danish",
|
||||||
|
"Norwegian Nynorsk",
|
||||||
|
"Turkish",
|
||||||
|
"Romanian",
|
||||||
|
"Lithuanian",
|
||||||
|
"Ido",
|
||||||
|
"Old French",
|
||||||
|
"Czech",
|
||||||
|
"Norwegian",
|
||||||
|
# Jèrriais -- same as Norman
|
||||||
|
"Esperanto",
|
||||||
|
"Icelandic",
|
||||||
|
# Old Armenian
|
||||||
|
"Norwegian Bokmål",
|
||||||
|
"Asturian",
|
||||||
|
"Hungarian",
|
||||||
|
"Proto-Germanic",
|
||||||
|
"Russian",
|
||||||
|
"Slovene",
|
||||||
|
"Min Nan",
|
||||||
|
"Scottish Gaelic",
|
||||||
|
"Greek",
|
||||||
|
"Irish",
|
||||||
|
"Lojban",
|
||||||
|
"Middle French",
|
||||||
|
"Malay",
|
||||||
|
"Luxembourgish",
|
||||||
|
"Slovak",
|
||||||
|
"Estonian",
|
||||||
|
"Persian",
|
||||||
|
"Venetian",
|
||||||
|
"Old English",
|
||||||
|
"Volapük",
|
||||||
|
"Ladin",
|
||||||
|
"Faroese",
|
||||||
|
"Scots",
|
||||||
|
"Interlingua",
|
||||||
|
"Romansch",
|
||||||
|
"Urdu",
|
||||||
|
# Middle Chinese
|
||||||
|
"Indonesian",
|
||||||
|
"Swahili",
|
||||||
|
"Middle English",
|
||||||
|
"Occitan",
|
||||||
|
"Welsh",
|
||||||
|
"Old Norse",
|
||||||
|
"Albanian",
|
||||||
|
"Old Irish",
|
||||||
|
"Old Saxon",
|
||||||
|
"Lower Sorbian",
|
||||||
|
"Afrikaans",
|
||||||
|
"Ukrainian",
|
||||||
|
"Proto-Slavic",
|
||||||
|
"Ancient Greek",
|
||||||
|
"Gothic",
|
||||||
|
"Hawaiian",
|
||||||
|
"Kurdish",
|
||||||
|
"Tagalog",
|
||||||
|
"Old High German",
|
||||||
|
"Crimean Tatar",
|
||||||
|
"Manx",
|
||||||
|
"Sanskrit",
|
||||||
|
"Hiligaynon",
|
||||||
|
"West Frisian",
|
||||||
|
"Hebrew",
|
||||||
|
"Tok Pisin",
|
||||||
|
"Proto-Indo-European",
|
||||||
|
"Macedonian",
|
||||||
|
"Novial",
|
||||||
|
"Armenian",
|
||||||
|
"Arabic",
|
||||||
|
"Maltese",
|
||||||
|
"Hakka",
|
||||||
|
"Sicilian",
|
||||||
|
"Ladino",
|
||||||
|
"Basque",
|
||||||
|
"Breton",
|
||||||
|
# Guernésiais -- same as Norman
|
||||||
|
"Vai",
|
||||||
|
"Navajo",
|
||||||
|
"Azeri",
|
||||||
|
"Vilamovian",
|
||||||
|
# Tarantino
|
||||||
|
"Maori",
|
||||||
|
"Friulian",
|
||||||
|
"Hausa",
|
||||||
|
"Haitian Creole",
|
||||||
|
"Yiddish",
|
||||||
|
"Tatar",
|
||||||
|
"Proto-Malayo-Polynesian",
|
||||||
|
"Aromanian",
|
||||||
|
"Ottoman Turkish",
|
||||||
|
"Old Provençal",
|
||||||
|
"Northern Sami",
|
||||||
|
"Dalmatian",
|
||||||
|
"Bulgarian",
|
||||||
|
"Neapolitan",
|
||||||
|
"Cornish",
|
||||||
|
"Middle Dutch",
|
||||||
|
"Rapa Nui",
|
||||||
|
# Old Portuguese
|
||||||
|
"Egyptian Arabic",
|
||||||
|
"Romani",
|
||||||
|
"Tahitian",
|
||||||
|
"Thai",
|
||||||
|
"Limburgish",
|
||||||
|
"Karelian",
|
||||||
|
"Tajik",
|
||||||
|
"Turkmen",
|
||||||
|
"Kabardian",
|
||||||
|
"Uzbek",
|
||||||
|
"Samoan",
|
||||||
|
"Mongolian",
|
||||||
|
"Zulu",
|
||||||
|
"Upper Sorbian",
|
||||||
|
"Walloon",
|
||||||
|
# Proto-Finnic
|
||||||
|
"Frankish",
|
||||||
|
"Mapudungun",
|
||||||
|
"Pashto",
|
||||||
|
"Low German",
|
||||||
|
"Bashkir",
|
||||||
|
"Kashubian",
|
||||||
|
"Sranan Tongo",
|
||||||
|
"Proto-Sino-Tibetan",
|
||||||
|
"Norman",
|
||||||
|
"Proto-Austronesian",
|
||||||
|
"Marathi",
|
||||||
|
"Rohingya",
|
||||||
|
"Classical Nahuatl",
|
||||||
|
# Proto-Malayic
|
||||||
|
# German Low German
|
||||||
|
"Fijian",
|
||||||
|
"Zazaki",
|
||||||
|
"Proto-Italic",
|
||||||
|
"Old Dutch",
|
||||||
|
"Egyptian",
|
||||||
|
"Old Frisian",
|
||||||
|
"Greenlandic",
|
||||||
|
"Burmese",
|
||||||
|
"Votic",
|
||||||
|
"Ewe",
|
||||||
|
"Cherokee",
|
||||||
|
"Old Church Slavonic",
|
||||||
|
"Quechua",
|
||||||
|
"Mirandese",
|
||||||
|
"Livonian",
|
||||||
|
"Bengali",
|
||||||
|
"Skolt Sami",
|
||||||
|
# Proto-Balto-Slavic
|
||||||
|
"Pitjantjatjara",
|
||||||
|
"Georgian",
|
||||||
|
"North Frisian",
|
||||||
|
"Tetum",
|
||||||
|
"Tongan",
|
||||||
|
# Mauritian Creole
|
||||||
|
"Torres Strait Creole",
|
||||||
|
"Papiamentu",
|
||||||
|
"Lao",
|
||||||
|
"Malagasy",
|
||||||
|
"Interlingue",
|
||||||
|
"Aragonese",
|
||||||
|
"Istriot",
|
||||||
|
"Sumerian",
|
||||||
|
"Proto-Celtic",
|
||||||
|
"Võro",
|
||||||
|
# Proto-Polynesian
|
||||||
|
"Nepali",
|
||||||
|
"Chickasaw",
|
||||||
|
"Akkadian",
|
||||||
|
"Middle Armenian",
|
||||||
|
"Cimbrian",
|
||||||
|
"Somali",
|
||||||
|
"Sardinian",
|
||||||
|
"Tocharian B",
|
||||||
|
"Telugu",
|
||||||
|
"Javanese",
|
||||||
|
"Taos",
|
||||||
|
"Proto-Semitic",
|
||||||
|
# Old Prussian
|
||||||
|
"Kyrgyz",
|
||||||
|
"Corsican",
|
||||||
|
"Veps",
|
||||||
|
"Baluchi",
|
||||||
|
"Middle Low German",
|
||||||
|
"Middle High German",
|
||||||
|
"Uyghur",
|
||||||
|
# Dutch Low Saxon
|
||||||
|
"Belarusian",
|
||||||
|
"Guaraní",
|
||||||
|
"Undetermined",
|
||||||
|
"Inuktitut",
|
||||||
|
"Tocharian A",
|
||||||
|
"Nigerian Pidgin",
|
||||||
|
# Gallo
|
||||||
|
# Saterland Frisian
|
||||||
|
"Punjabi",
|
||||||
|
"Proto-Algonquian",
|
||||||
|
# Istro-Romanian
|
||||||
|
"Wiradhuri",
|
||||||
|
"Sichuan Yi",
|
||||||
|
"Wu",
|
||||||
|
# White Hmong
|
||||||
|
"Ugaritic",
|
||||||
|
"Sundanese",
|
||||||
|
# Old East Slavic
|
||||||
|
# Fala
|
||||||
|
# Elfdalian
|
||||||
|
"Tamil",
|
||||||
|
"Pijin",
|
||||||
|
"Okinawan",
|
||||||
|
"Kazakh",
|
||||||
|
"Hindi",
|
||||||
|
"Tuvan",
|
||||||
|
"Polabian",
|
||||||
|
"Aramaic",
|
||||||
|
"Malayalam",
|
||||||
|
"Kumyk",
|
||||||
|
"Inari Sami",
|
||||||
|
"Ilocano",
|
||||||
|
"Tswana",
|
||||||
|
"Libyan Arabic",
|
||||||
|
"Latgalian",
|
||||||
|
"Yakut",
|
||||||
|
"Sindhi",
|
||||||
|
"Khmer",
|
||||||
|
"Gamilaraay",
|
||||||
|
"Ojibwe",
|
||||||
|
"Choctaw",
|
||||||
|
"Chinese",
|
||||||
|
"Chamorro",
|
||||||
|
"Yucatec Maya",
|
||||||
|
"Picard",
|
||||||
|
"Ngarrindjeri",
|
||||||
|
"Kott",
|
||||||
|
"Ingrian",
|
||||||
|
# Crimean Gothic
|
||||||
|
"Chamicuro",
|
||||||
|
"Rajasthani",
|
||||||
|
# Old Tupi
|
||||||
|
"Old Spanish",
|
||||||
|
"Gagauz",
|
||||||
|
"Extremaduran",
|
||||||
|
"Chinook Jargon",
|
||||||
|
"Cahuilla",
|
||||||
|
"Kannada",
|
||||||
|
"Iban",
|
||||||
|
"American Sign Language",
|
||||||
|
"Adyghe",
|
||||||
|
"Warlpiri",
|
||||||
|
"Tibetan",
|
||||||
|
"Ossetian",
|
||||||
|
"Meriam",
|
||||||
|
"Marshallese",
|
||||||
|
"Khakas",
|
||||||
|
"Balinese",
|
||||||
|
"Zhuang",
|
||||||
|
"Tuvaluan",
|
||||||
|
"Niuean",
|
||||||
|
"Martuthunira",
|
||||||
|
"Guugu Yimidhirr",
|
||||||
|
"Chechen",
|
||||||
|
"Campidanese Sardinian",
|
||||||
|
"Tolai",
|
||||||
|
# Old Javanese
|
||||||
|
"Nahuatl",
|
||||||
|
"Lombard",
|
||||||
|
"West Coast Bajau",
|
||||||
|
"Romagnol",
|
||||||
|
"Middle Irish",
|
||||||
|
"Yoruba",
|
||||||
|
"Wangaaybuwan-Ngiyambaa",
|
||||||
|
# Old Swedish
|
||||||
|
"Lingala",
|
||||||
|
"Fiji Hindi",
|
||||||
|
"Shabo",
|
||||||
|
"Sasak",
|
||||||
|
"Judeo-Arabic",
|
||||||
|
"Central Kurdish",
|
||||||
|
"Bislama",
|
||||||
|
]
|
||||||
|
|
||||||
|
WIKT_LANGUAGE_NAMES['de'] = [
|
||||||
|
"Deutsch",
|
||||||
|
"Englisch",
|
||||||
|
"Polnisch",
|
||||||
|
"Italienisch",
|
||||||
|
"Französisch",
|
||||||
|
"Esperanto",
|
||||||
|
"Schwedisch",
|
||||||
|
"Lateinisch",
|
||||||
|
"Tschechisch",
|
||||||
|
"Katalanisch",
|
||||||
|
"Spanisch",
|
||||||
|
"Okzitanisch",
|
||||||
|
"Ungarisch",
|
||||||
|
"Litauisch",
|
||||||
|
"Finnisch",
|
||||||
|
"Russisch",
|
||||||
|
"Altgriechisch",
|
||||||
|
"Niederländisch",
|
||||||
|
"Kurdisch",
|
||||||
|
"Baskisch",
|
||||||
|
"Armenisch",
|
||||||
|
"Isländisch",
|
||||||
|
"Bulgarisch",
|
||||||
|
"Färöisch",
|
||||||
|
"Dänisch",
|
||||||
|
"Portugiesisch",
|
||||||
|
"Slowakisch",
|
||||||
|
"Türkisch",
|
||||||
|
"Maori",
|
||||||
|
"Albanisch",
|
||||||
|
"Japanisch",
|
||||||
|
"Norwegisch",
|
||||||
|
"Irisch",
|
||||||
|
"Koreanisch",
|
||||||
|
"Chinesisch",
|
||||||
|
"Venezianisch",
|
||||||
|
"Friaulisch",
|
||||||
|
"Serbisch",
|
||||||
|
"Indonesisch",
|
||||||
|
"Walisisch",
|
||||||
|
"Arabisch",
|
||||||
|
"Zentral-Nahuatl",
|
||||||
|
"Neugriechisch",
|
||||||
|
"Sumerisch",
|
||||||
|
"Obersorbisch",
|
||||||
|
"Sesotho",
|
||||||
|
"Rumänisch",
|
||||||
|
"Suaheli",
|
||||||
|
"Persisch",
|
||||||
|
"Krimtatarisch",
|
||||||
|
"Plattdeutsch",
|
||||||
|
"Prußisch",
|
||||||
|
"Thai",
|
||||||
|
"Bosnisch",
|
||||||
|
"Sardisch",
|
||||||
|
"Maltesisch",
|
||||||
|
"Akkadisch",
|
||||||
|
"Hawaiianisch",
|
||||||
|
"Hebräisch",
|
||||||
|
"Gotisch",
|
||||||
|
"Afrikaans",
|
||||||
|
"Rätoromanisch",
|
||||||
|
"Tamil",
|
||||||
|
"Bretonisch",
|
||||||
|
"Ukrainisch",
|
||||||
|
"Hindi",
|
||||||
|
"Georgisch",
|
||||||
|
"Panjabi",
|
||||||
|
"Papiamentu",
|
||||||
|
"Slowenisch",
|
||||||
|
"Nauruisch",
|
||||||
|
"Schottisch-Gälisch",
|
||||||
|
"Balinesisch",
|
||||||
|
"Estnisch",
|
||||||
|
"Manx",
|
||||||
|
"Korsisch",
|
||||||
|
# "Frühneuhochdeutsch",
|
||||||
|
"Lettisch",
|
||||||
|
"isiZulu",
|
||||||
|
"Tagalog",
|
||||||
|
"Tok Pisin",
|
||||||
|
# "Südpikenisch",
|
||||||
|
"Kroatisch",
|
||||||
|
"Niedersorbisch",
|
||||||
|
"Kannada",
|
||||||
|
"Guanche",
|
||||||
|
"Weißrussisch",
|
||||||
|
"Sanskrit",
|
||||||
|
"Aserbaidschanisch",
|
||||||
|
"Mittelhochdeutsch",
|
||||||
|
"Laotisch",
|
||||||
|
"Altnordisch",
|
||||||
|
"Altenglisch",
|
||||||
|
"Vietnamesisch",
|
||||||
|
"Tadschikisch",
|
||||||
|
"Samoanisch",
|
||||||
|
"Mazedonisch",
|
||||||
|
"Luxemburgisch",
|
||||||
|
"Hethitisch",
|
||||||
|
# "Yukatekisch",
|
||||||
|
"Kaschubisch",
|
||||||
|
"Wallonisch",
|
||||||
|
# "Klassisches Nahuatl",
|
||||||
|
"Telugu",
|
||||||
|
"Rapanui",
|
||||||
|
"Jiddisch",
|
||||||
|
"Ido",
|
||||||
|
# "Galicisch",
|
||||||
|
"Volapük",
|
||||||
|
"Bengalisch",
|
||||||
|
"Mapudungun",
|
||||||
|
"Lojban",
|
||||||
|
"Tuvaluisch",
|
||||||
|
"Gujarati",
|
||||||
|
"Assamesisch",
|
||||||
|
]
|
59
lib/langcodes/registry_parser.py
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
from langcodes.util import data_filename
|
||||||
|
|
||||||
|
LIST_KEYS = {'Description', 'Prefix'}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_file(file):
    """
    Take an open file containing the IANA subtag registry, and yield a
    dictionary of information for each subtag it describes.

    Records are separated by lines consisting of exactly '%%'. A line that
    starts with a space is a continuation of the line before it.

    :param file: an iterable of text lines, such as an open text file.
    :return: a generator of whatever `parse_item` yields for each record.
    """
    record = []
    for raw_line in file:
        line = raw_line.rstrip('\n')
        if line == '%%':
            # Separator between items: parse what has been collected so far,
            # then start collecting the next record.
            yield from parse_item(record)
            record.clear()
        elif line.startswith(' '):
            # Continuation line: drop only its first character (a space) and
            # glue the remainder onto the previous line.
            record[-1] += line[1:]
        else:
            record.append(line)
    # Flush whatever follows the last separator.
    yield from parse_item(record)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_item(lines):
    """
    Given the lines that form a subtag entry (after joining wrapped lines
    back together), parse the data they contain.

    Each line is split at its first ': ' into a key and a value. Keys listed
    in LIST_KEYS may repeat and are collected into lists; every other key
    must appear at most once and maps to a single string.

    :param lines: a list of 'Key: value' strings.
    :return: a generator that yields the resulting dictionary once if it has
        a 'Subtag' or 'Tag' key, and yields nothing otherwise (for example
        when the lines were just the registry header).
    :raises ValueError: if a line does not contain ': '.
    """
    info = {}
    for line in lines:
        key, value = line.split(': ', 1)
        if key in LIST_KEYS:
            info.setdefault(key, []).append(value)
        else:
            # A single-valued key showing up twice means the entry is
            # malformed; say which key it was instead of a bare failure.
            assert key not in info, f"Repeated key {key!r} in registry entry"
            info[key] = value

    if 'Subtag' in info or 'Tag' in info:
        yield info
|
||||||
|
|
||||||
|
|
||||||
|
def parse_registry():
    """
    Yield a sequence of dictionaries, containing the info in the included
    IANA subtag registry file.

    The file 'language-subtag-registry.txt' is located via `data_filename`
    and read as UTF-8.

    :return: a generator of the dictionaries produced by `parse_file`.
    """
    registry_path = data_filename('language-subtag-registry.txt')
    with open(registry_path, encoding='utf-8') as data_file:
        # 'yield from' instead of returning the generator, so that the file
        # is only closed once the caller has finished iterating.
        yield from parse_file(data_file)
|
422
lib/langcodes/tag_parser.py
Normal file
|
@ -0,0 +1,422 @@
|
||||||
|
"""
|
||||||
|
This module implements a parser for language tags, according to the RFC 5646
|
||||||
|
(BCP 47) standard.
|
||||||
|
|
||||||
|
Here, we're only concerned with the syntax of the language tag. Looking up
|
||||||
|
what they actually mean in a data file is a separate step.
|
||||||
|
|
||||||
|
For a full description of the syntax of a language tag, see page 3 of
|
||||||
|
http://tools.ietf.org/html/bcp47
|
||||||
|
|
||||||
|
>>> parse_tag('en')
|
||||||
|
[('language', 'en')]
|
||||||
|
|
||||||
|
>>> parse_tag('en_US')
|
||||||
|
[('language', 'en'), ('territory', 'US')]
|
||||||
|
|
||||||
|
>>> parse_tag('en-Latn')
|
||||||
|
[('language', 'en'), ('script', 'Latn')]
|
||||||
|
|
||||||
|
>>> parse_tag('es-419')
|
||||||
|
[('language', 'es'), ('territory', '419')]
|
||||||
|
|
||||||
|
>>> parse_tag('zh-hant-tw')
|
||||||
|
[('language', 'zh'), ('script', 'Hant'), ('territory', 'TW')]
|
||||||
|
|
||||||
|
>>> parse_tag('zh-tw-hant')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
...
|
||||||
|
langcodes.tag_parser.LanguageTagError: This script subtag, 'hant', is out of place. Expected variant, extension, or end of string.
|
||||||
|
|
||||||
|
>>> parse_tag('de-DE-1901')
|
||||||
|
[('language', 'de'), ('territory', 'DE'), ('variant', '1901')]
|
||||||
|
|
||||||
|
>>> parse_tag('ja-latn-hepburn')
|
||||||
|
[('language', 'ja'), ('script', 'Latn'), ('variant', 'hepburn')]
|
||||||
|
|
||||||
|
>>> parse_tag('ja-hepburn-latn')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
...
|
||||||
|
langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string.
|
||||||
|
|
||||||
|
>>> parse_tag('zh-yue')
|
||||||
|
[('language', 'zh'), ('extlang', 'yue')]
|
||||||
|
|
||||||
|
>>> parse_tag('zh-yue-Hant')
|
||||||
|
[('language', 'zh'), ('extlang', 'yue'), ('script', 'Hant')]
|
||||||
|
|
||||||
|
>>> parse_tag('zh-min-nan')
|
||||||
|
[('grandfathered', 'zh-min-nan')]
|
||||||
|
|
||||||
|
>>> parse_tag('x-dothraki')
|
||||||
|
[('language', 'x-dothraki')]
|
||||||
|
|
||||||
|
>>> parse_tag('en-u-co-backward-x-pig-latin')
|
||||||
|
[('language', 'en'), ('extension', 'u-co-backward'), ('private', 'x-pig-latin')]
|
||||||
|
|
||||||
|
>>> parse_tag('en-x-pig-latin-u-co-backward')
|
||||||
|
[('language', 'en'), ('private', 'x-pig-latin-u-co-backward')]
|
||||||
|
|
||||||
|
>>> parse_tag('u-co-backward')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
...
|
||||||
|
langcodes.tag_parser.LanguageTagError: Expected a language code, got 'u'
|
||||||
|
|
||||||
|
>>> parse_tag('x-')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
...
|
||||||
|
langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got ''
|
||||||
|
|
||||||
|
>>> parse_tag('und-u-')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
...
|
||||||
|
langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got ''
|
||||||
|
|
||||||
|
>>> parse_tag('und-0-foo')
|
||||||
|
[('language', 'und'), ('extension', '0-foo')]
|
||||||
|
|
||||||
|
>>> parse_tag('und-?-foo')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
...
|
||||||
|
langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '?'
|
||||||
|
|
||||||
|
>>> parse_tag('und-x-123456789')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
...
|
||||||
|
langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '123456789'
|
||||||
|
|
||||||
|
>>> parse_tag('en-a-b-foo')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
...
|
||||||
|
langcodes.tag_parser.LanguageTagError: Tag extensions may not contain two singletons in a row
|
||||||
|
|
||||||
|
>>> parse_tag('ar-٠٠١')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
...
|
||||||
|
langcodes.tag_parser.LanguageTagError: Language tags must be made of ASCII characters
|
||||||
|
"""
|
||||||
|
|
||||||
|
# These tags should not be parsed by the usual parser; they're grandfathered
|
||||||
|
# in from RFC 3066. The 'irregular' ones don't fit the syntax at all; the
|
||||||
|
# 'regular' ones do, but would give meaningless results when parsed.
|
||||||
|
#
|
||||||
|
# These are all lowercased so they can be matched case-insensitively, as the
|
||||||
|
# standard requires.
|
||||||
|
EXCEPTIONS = {
    # Irregular exceptions
    "en-gb-oed",
    "i-ami",
    "i-bnn",
    "i-default",
    "i-enochian",
    "i-hak",
    "i-klingon",
    "i-lux",
    "i-mingo",
    "i-navajo",
    "i-pwn",
    "i-tao",
    "i-tay",
    "i-tsu",
    "sgn-be-fr",
    "sgn-be-nl",
    "sgn-ch-de",
    # Regular exceptions
    "art-lojban",
    "cel-gaulish",
    "no-bok",
    "no-nyn",
    "zh-guoyu",
    "zh-hakka",
    "zh-min",
    "zh-min-nan",
    "zh-xiang",
}
|
||||||
|
|
||||||
|
# Define the order of subtags as integer constants, but also give them names
|
||||||
|
# so we can describe them in error messages
|
||||||
|
# Each constant is the index of its own name in SUBTAG_TYPES, so the integers
# can be compared for ordering and the list used for error messages.
EXTLANG, SCRIPT, TERRITORY, VARIANT, EXTENSION = range(5)
SUBTAG_TYPES = [
    'extlang',
    'script',
    'territory',
    'variant',
    'extension',
    # No constant points here; it only appears as the last option when
    # listing what could have come next.
    'end of string',
]
|
||||||
|
|
||||||
|
|
||||||
|
def _is_ascii(s):
    """
    Determine whether a tag consists of ASCII characters.

    :param s: the string to check.
    :return: True if `s` can be encoded as ASCII, False otherwise.
    """
    # When Python 3.6 support is dropped, we can replace this with str.isascii().
    try:
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_characters(tag):
    """
    BCP 47 is case-insensitive, and CLDR's use of it considers underscores
    equivalent to hyphens. So here we smash tags into lowercase with hyphens,
    so we can make exact comparisons.

    :param tag: the language tag string to normalize.
    :return: `tag` lowercased, with every '_' replaced by '-'.

    >>> normalize_characters('en_US')
    'en-us'
    >>> normalize_characters('zh-Hant_TW')
    'zh-hant-tw'
    """
    lowered = tag.lower()
    return lowered.replace('_', '-')
|
||||||
|
|
||||||
|
|
||||||
|
def parse_tag(tag):
    """
    Parse the syntax of a language tag, without looking up anything in the
    registry, yet. Returns a list of (type, value) tuples indicating what
    information will need to be looked up.

    :param tag: the language tag as a string; case and the choice of '_' or
        '-' as separator do not matter.
    :return: a list of (type, value) tuples. A tag listed in EXCEPTIONS is
        returned whole as a single 'grandfathered' item, and a tag whose
        first subtag is 'x' is returned whole as a single 'language' item.
    :raises LanguageTagError: if the tag contains non-ASCII characters, has a
        subtag that is not 1-8 alphanumeric characters, does not start with
        a usable language code, or has subtags that are invalid or out of
        order.
    """
    if not _is_ascii(tag):
        raise LanguageTagError("Language tags must be made of ASCII characters")

    tag = normalize_characters(tag)
    if tag in EXCEPTIONS:
        return [('grandfathered', tag)]

    # The first subtag is always either the language code, or 'x' to mark
    # the entire tag as private-use. Other subtags are distinguished
    # by their length and format, but the language code is distinguished
    # by the fact that it is required to come first.
    subtags = tag.split('-')

    # Check all subtags for their shape: 1-8 alphanumeric characters.
    for subtag in subtags:
        if not (1 <= len(subtag) <= 8 and subtag.isalnum()):
            raise LanguageTagError(
                f"Expected 1-8 alphanumeric characters, got {subtag!r}"
            )

    first = subtags[0]
    if first == 'x':
        if len(subtags) == 1:
            raise LanguageTagError("'x' is not a language tag on its own")
        # The entire language tag is private use, but we know that,
        # whatever it is, it fills the "language" slot.
        return [('language', tag)]

    if 2 <= len(first) <= 4:
        # Language codes should be 2 or 3 letters, but 4-letter codes
        # are allowed to parse for legacy Unicode reasons.
        return [('language', first)] + parse_subtags(subtags[1:])

    # Anything else cannot be a language code; this call always raises.
    subtag_error(first, 'a language code')
|
||||||
|
|
||||||
|
|
||||||
|
def parse_subtags(subtags, expect=EXTLANG):
    """
    Parse everything that comes after the language tag: scripts, territories,
    variants, and assorted extensions.

    :param subtags: the list of remaining subtag strings.
    :param expect: the earliest subtag type (one of the integer constants
        EXTLANG..EXTENSION) that is still allowed at this position; anything
        of an earlier type is reported as out of place.
    :return: a list of (type name, value) tuples for the remaining subtags.
    :raises LanguageTagError: if a subtag has no recognizable type or appears
        out of order.
    """
    # We parse the parts of a language code recursively: each step handles
    # one component, recurses on the rest, and prepends what it found to the
    # result for the rest. This could just as well have been iterative, but
    # the loops would have been convoluted.
    #
    # Base case: nothing left to parse.
    if not subtags:
        return []

    # The primary thing that distinguishes different types of subtags is
    # length, but the subtags also come in a specified order. 'expect' keeps
    # track of where we are in that order. expect=TERRITORY, for example,
    # means we're expecting a territory code, or anything later (because
    # everything but the language is optional).
    subtag = subtags[0]
    tag_length = len(subtag)

    # The goal is to recognize what kind of subtag this is and record it in
    # 'tagtype' as an integer, so it can be compared against 'expect'.
    tagtype = None

    if tag_length == 1:
        # A one-character subtag introduces an extension, which can itself
        # have sub-subtags, so we dispatch to a different function. No order
        # check is needed, because extensions necessarily come last.
        if subtag.isalnum():
            return parse_extension(subtags)
        subtag_error(subtag)

    elif tag_length == 2:
        if subtag.isalpha():
            # Two-letter alphabetic subtags are territories. These are the
            # only two-character subtags after the language.
            tagtype = TERRITORY

    elif tag_length == 3:
        if subtag.isalpha():
            # Three-letter alphabetic subtags are 'extended languages'. Up to
            # three may appear in a row, which parse_extlang enforces; first
            # make sure an extlang is still allowed at this position.
            if expect <= EXTLANG:
                return parse_extlang(subtags)
            order_error(subtag, EXTLANG, expect)
        elif subtag.isdigit():
            # Three-digit subtags are territories representing broad regions,
            # such as Latin America (419).
            tagtype = TERRITORY

    elif tag_length == 4:
        if subtag.isalpha():
            # Four-letter alphabetic subtags are scripts.
            tagtype = SCRIPT
        elif subtag[0].isdigit():
            # Four-character subtags that start with a digit are variants.
            tagtype = VARIANT

    else:
        # Tags of length 5-8 are variants.
        tagtype = VARIANT

    if tagtype is None:
        # We haven't recognized a type of tag. This subtag just doesn't fit
        # the standard.
        subtag_error(subtag)

    elif tagtype < expect:
        # We got a tag type that was supposed to appear earlier in the order.
        order_error(subtag, tagtype, expect)

    else:
        # There can be at most one script and one territory, so after one of
        # those the next subtag must be a strictly later type. Other types
        # may repeat, so the next subtag may be this type or later.
        if tagtype in (SCRIPT, TERRITORY):
            expect = tagtype + 1
        else:
            expect = tagtype

        # Get the name of this subtag type instead of its integer value.
        typename = SUBTAG_TYPES[tagtype]

        # Some subtags are conventionally written with capitalization. Apply
        # those conventions.
        if tagtype == SCRIPT:
            subtag = subtag.title()
        elif tagtype == TERRITORY:
            subtag = subtag.upper()

        # Recurse on the remaining subtags.
        return [(typename, subtag)] + parse_subtags(subtags[1:], expect)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_extlang(subtags):
    """
    Parse an 'extended language' tag, which consists of 1 to 3 three-letter
    language codes.

    Extended languages are used for distinguishing dialects/sublanguages
    (depending on your view) of macrolanguages such as Arabic, Bahasa Malay,
    and Chinese.

    It's supposed to also be acceptable to just use the sublanguage as the
    primary language code, and your code should know what's a macrolanguage of
    what. For example, 'zh-yue' and 'yue' are the same language (Cantonese),
    and differ only in whether they explicitly spell out that Cantonese is a
    kind of Chinese.

    :param subtags: the remaining subtag strings, starting at the first
        extlang candidate.
    :return: up to three ('extlang', value) tuples, followed by the result of
        parsing the rest of the subtags from the script position onwards.
    """
    parsed = []
    for subtag in subtags[:3]:
        # An extlang is exactly three *letters*. Checking the length alone
        # would swallow a three-digit territory that follows an extlang
        # (as in 'zh-yue-419') and mislabel it as another extlang.
        if len(subtag) != 3 or not subtag.isalpha():
            break
        parsed.append(('extlang', subtag))
    return parsed + parse_subtags(subtags[len(parsed):], SCRIPT)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_extension(subtags):
    """
    An extension tag consists of a 'singleton' -- a one-character subtag --
    followed by other subtags. Extension tags are in the BCP 47 syntax, but
    their meaning is outside the scope of the standard.

    If the singleton is 'x', it's a private use extension, and consumes the
    rest of the tag. Otherwise, it stops at the next singleton.

    :param subtags: the remaining subtag strings, starting with the singleton.
    :return: a single ('private', value) tuple for an 'x' extension;
        otherwise an ('extension', value) tuple followed by the result of
        parsing whatever comes after it, where only further extensions are
        accepted.
    :raises LanguageTagError: if nothing follows the singleton, or if it is
        immediately followed by another singleton.
    """
    singleton = subtags[0]
    if len(subtags) == 1:
        raise LanguageTagError(f"The subtag {singleton!r} must be followed by something")

    if singleton == 'x':
        # Private use. Everything after this is arbitrary codes that we
        # can't look up.
        return [('private', '-'.join(subtags))]

    # Look for the next singleton, if there is one.
    boundary = 1
    while boundary < len(subtags) and len(subtags[boundary]) != 1:
        boundary += 1

    if boundary == 1:
        raise LanguageTagError(
            "Tag extensions may not contain two singletons in a row"
        )

    # We've parsed a complete extension subtag. Return to the main
    # parse_subtags function, but expect to find nothing but more
    # extensions at this point.
    extension = '-'.join(subtags[:boundary])
    return [('extension', extension)] + parse_subtags(subtags[boundary:], EXTENSION)
|
||||||
|
|
||||||
|
|
||||||
|
class LanguageTagError(ValueError):
    """
    Raised when a string cannot be parsed as a language tag.
    """
|
||||||
|
|
||||||
|
|
||||||
|
def order_error(subtag, got, expected):
    """
    Raise an error indicating that tags were out of order.

    Most of this code is about how to list, in English, the kinds of subtag
    that would have been acceptable at this position.

    :param subtag: the offending subtag string.
    :param got: index into SUBTAG_TYPES of the type that was found.
    :param expected: index into SUBTAG_TYPES of the earliest type that was
        acceptable; that type and every later one are listed in the message.
    :raises LanguageTagError: always.
    """
    options = SUBTAG_TYPES[expected:]
    if len(options) == 1:
        expect_str = options[0]
    elif len(options) == 2:
        expect_str = f'{options[0]} or {options[1]}'
    else:
        # Three or more options: comma-separated, with 'or' before the last.
        *leading, last = options
        joined = ', '.join(leading)
        expect_str = f'{joined}, or {last}'

    got_str = SUBTAG_TYPES[got]
    raise LanguageTagError(
        f"This {got_str} subtag, {subtag!r}, is out of place. Expected {expect_str}."
    )
|
||||||
|
|
||||||
|
|
||||||
|
def subtag_error(subtag, expected='a valid subtag'):
    """
    Raise an error saying that `subtag` is not what the parser needed at
    this point.

    :param subtag: the offending subtag string.
    :param expected: an English description of what was expected instead.
    :raises LanguageTagError: always.
    """
    raise LanguageTagError(f"Expected {expected}, got {subtag!r}")
|
8
lib/langcodes/util.py
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
from pkg_resources import resource_filename
|
||||||
|
|
||||||
|
DATA_ROOT = resource_filename('langcodes', 'data')
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def data_filename(filename):
    """
    Return `filename` joined onto DATA_ROOT with os.path.join.
    """
    full_path = os.path.join(DATA_ROOT, filename)
    return full_path
|
0
lib/language_data/__init__.py
Normal file
68
lib/language_data/data/extra_language_names.csv
Normal file
|
@ -0,0 +1,68 @@
|
||||||
|
en,av,Avar
|
||||||
|
en,frr,North Frisian
|
||||||
|
en,frs,East Frisian
|
||||||
|
en,fy,West Frisian
|
||||||
|
en,gn,Guaraní
|
||||||
|
en,ilo,Ilocano
|
||||||
|
en,jam,Jamaican Creole
|
||||||
|
en,kky,Guugu Yimidhirr
|
||||||
|
en,kky,Guugu Yimithirr
|
||||||
|
en,ksd,Tolai
|
||||||
|
en,liv,Livonian
|
||||||
|
en,nay,Ngarrindjeri
|
||||||
|
en,nmn,ǃXóõ
|
||||||
|
en,nrf,Norman
|
||||||
|
en,oj,Ojibwe
|
||||||
|
en,pap,Papiamentu
|
||||||
|
en,pms,Piedmontese
|
||||||
|
en,rap,Rapa Nui
|
||||||
|
en,rm,Romansch
|
||||||
|
en,rom,Romani
|
||||||
|
en,ryu,Okinawan
|
||||||
|
en,sl,Slovene
|
||||||
|
en,st,Sesotho
|
||||||
|
en,tvl,Tuvaluan
|
||||||
|
en,twf,Taos
|
||||||
|
en,txb,Tocharian B
|
||||||
|
en,tyv,Tuvan
|
||||||
|
en,vma,Martuthunira
|
||||||
|
en,wym,Vilamovian
|
||||||
|
en,xto,Tocharian A
|
||||||
|
en,zu,isiZulu
|
||||||
|
de,el,Neugriechisch
|
||||||
|
de,la,Lateinisch
|
||||||
|
de,fur,Friaulisch
|
||||||
|
de,gd,Schottisch-Gälisch
|
||||||
|
de,haw,Hawaiianisch
|
||||||
|
de,nds,Plattdeutsch
|
||||||
|
de,nhn,Zentral-Nahuatl
|
||||||
|
de,pa,Panjabi
|
||||||
|
de,pap,Papiamentu
|
||||||
|
de,prg,Prußisch
|
||||||
|
de,vec,Venezianisch
|
||||||
|
de,tvl,Tuvaluisch
|
||||||
|
sh,sh,Srpskohrvatski
|
||||||
|
la,la,Lingua latina
|
||||||
|
ceb,ceb,Sinugbuanong Binisayâ
|
||||||
|
ceb,ceb,Bisayâ
|
||||||
|
ceb,ceb,Bisaya
|
||||||
|
lah,lah,لہندا پنجابی
|
||||||
|
bho,bho,भोजपुरी
|
||||||
|
ang,ang,Ænglisc
|
||||||
|
vo,vo,Volapük
|
||||||
|
io,io,Ido
|
||||||
|
jbo,jbo,lojban
|
||||||
|
jbo,jbo,lojbau
|
||||||
|
rup,rup,armãneashti
|
||||||
|
nv,nv,Diné bizaad
|
||||||
|
zh-Hant,nan,閩南語
|
||||||
|
zh-Hans,nan,闽南语
|
||||||
|
nan-Latn,nan,Bân-lâm-gú
|
||||||
|
zh-Hant,hak,客家語
|
||||||
|
zh-Hans,hak,客家语
|
||||||
|
ilo,ilo,Ilokano
|
||||||
|
hil,hil,Ilonggo
|
||||||
|
nah,nah,Nāhuatl
|
||||||
|
tpi,tpi,Tok Pisin
|
||||||
|
ve,ve,tshiVenḓa
|
||||||
|
kcm,kcm,Kristang
|
|
48462
lib/language_data/data/language-subtag-registry.txt
Normal file
442
lib/language_data/data/languageInfo.xml
Normal file
|
@ -0,0 +1,442 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8" ?>
|
||||||
|
<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
|
||||||
|
<!--
|
||||||
|
Copyright © 1991-2020 Unicode, Inc.
|
||||||
|
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
|
||||||
|
For terms of use, see http://www.unicode.org/copyright.html
|
||||||
|
-->
|
||||||
|
<supplementalData>
|
||||||
|
<version number="$Revision$"/>
|
||||||
|
<languageMatching>
|
||||||
|
<languageMatches type="written_new">
|
||||||
|
<paradigmLocales locales="en en_GB es es_419 pt_BR pt_PT"/>
|
||||||
|
<matchVariable id="$enUS" value="AS+GU+MH+MP+PR+UM+US+VI"/>
|
||||||
|
<matchVariable id="$cnsar" value="HK+MO"/>
|
||||||
|
<matchVariable id="$americas" value="019"/>
|
||||||
|
<matchVariable id="$maghreb" value="MA+DZ+TN+LY+MR+EH"/>
|
||||||
|
<languageMatch desired="no" supported="nb" distance="1"/> <!-- no ⇒ nb -->
|
||||||
|
<!-- languageMatch desired="ku" supported="ckb" distance="4" oneway="true"/ --> <!-- ku ⇒ ckb -->
|
||||||
|
<!-- languageMatch desired="ckb" supported="ku" percent="8" oneway="true"/ --> <!-- ckb ⇒ ku -->
|
||||||
|
<languageMatch desired="hr" supported="bs" distance="4"/> <!-- hr ⇒ bs -->
|
||||||
|
<languageMatch desired="sh" supported="bs" distance="4"/> <!-- sh ⇒ bs -->
|
||||||
|
<!-- languageMatch desired="sr" supported="bs" distance="4"/--> <!-- sr ⇒ bs -->
|
||||||
|
<languageMatch desired="sh" supported="hr" distance="4"/> <!-- sh ⇒ hr -->
|
||||||
|
<!-- languageMatch desired="sr" supported="hr" distance="4"/--> <!-- sr ⇒ hr -->
|
||||||
|
<languageMatch desired="sh" supported="sr" distance="4"/> <!-- sh ⇒ sr -->
|
||||||
|
<languageMatch desired="ssy" supported="aa" distance="4"/> <!-- ssy ⇒ aa -->
|
||||||
|
<languageMatch desired="gsw" supported="de" distance="4" oneway="true"/> <!-- gsw ⇒ de -->
|
||||||
|
<languageMatch desired="lb" supported="de" distance="4" oneway="true"/> <!-- lb ⇒ de -->
|
||||||
|
<languageMatch desired="da" supported="no" distance="8"/> <!-- da ⇒ no -->
|
||||||
|
<languageMatch desired="da" supported="nb" distance="8"/> <!-- da ⇒ nb -->
|
||||||
|
<!-- various fallbacks for more or less loosely related languages -->
|
||||||
|
<!-- CLDR-13528:
|
||||||
|
Distance 20 for some linguistic relation (e.g., Creoles to French)
|
||||||
|
or a local language in the area of another (e.g., Breton to French).
|
||||||
|
Distance 30 for fallbacks to prevalent second languages,
|
||||||
|
and in the absence of better information. -->
|
||||||
|
<languageMatch desired="ab" supported="ru" distance="30" oneway="true"/> <!-- Abkhazian: ab ⇒ ru -->
|
||||||
|
<languageMatch desired="ach" supported="en" distance="30" oneway="true"/> <!-- Acoli (Southern Luo dialect in Uganda): ach ⇒ en -->
|
||||||
|
<languageMatch desired="af" supported="nl" distance="20" oneway="true"/> <!-- Afrikaans: af ⇒ nl -->
|
||||||
|
<languageMatch desired="ak" supported="en" distance="30" oneway="true"/> <!-- Akan: ak ⇒ en -->
|
||||||
|
<languageMatch desired="ay" supported="es" distance="20" oneway="true"/> <!-- Aymara: ay ⇒ es -->
|
||||||
|
<languageMatch desired="az" supported="ru" distance="30" oneway="true"/> <!-- Azerbaijani: az ⇒ ru -->
|
||||||
|
<languageMatch desired="be" supported="ru" distance="20" oneway="true"/> <!-- Belarusian: be ⇒ ru -->
|
||||||
|
<languageMatch desired="bem" supported="en" distance="30" oneway="true"/> <!-- Bemba (Zambia): bem ⇒ en -->
|
||||||
|
<languageMatch desired="bh" supported="hi" distance="30" oneway="true"/> <!-- Bihari languages (gets canonicalized to bho): bh ⇒ hi -->
|
||||||
|
<languageMatch desired="bn" supported="en" distance="30" oneway="true"/> <!-- Bangla: bn ⇒ en -->
|
||||||
|
<languageMatch desired="br" supported="fr" distance="20" oneway="true"/> <!-- Breton: br ⇒ fr -->
|
||||||
|
<languageMatch desired="ceb" supported="fil" distance="30" oneway="true"/> <!-- Cebuano: ceb ⇒ fil -->
|
||||||
|
<languageMatch desired="chr" supported="en" distance="20" oneway="true"/> <!-- Cherokee: chr ⇒ en -->
|
||||||
|
<languageMatch desired="ckb" supported="ar" distance="30" oneway="true"/> <!-- Sorani Kurdish: ckb ⇒ ar -->
|
||||||
|
<languageMatch desired="co" supported="fr" distance="20" oneway="true"/> <!-- Corsican: co ⇒ fr -->
|
||||||
|
<languageMatch desired="crs" supported="fr" distance="20" oneway="true"/> <!-- Seselwa Creole French: crs ⇒ fr -->
|
||||||
|
<languageMatch desired="cy" supported="en" distance="20" oneway="true"/> <!-- Welsh: cy ⇒ en -->
|
||||||
|
<languageMatch desired="ee" supported="en" distance="30" oneway="true"/> <!-- Ewe: ee ⇒ en -->
|
||||||
|
<languageMatch desired="eo" supported="en" distance="30" oneway="true"/> <!-- Esperanto: eo ⇒ en -->
|
||||||
|
|
||||||
|
<!-- CLDR-13650: No fallback for Estonian -->
|
||||||
|
<!-- languageMatch desired="et" supported="fi" distance="30" oneway="true"/--> <!-- Estonian: et ⇒ fi -->
|
||||||
|
|
||||||
|
<languageMatch desired="eu" supported="es" distance="20" oneway="true"/> <!-- Basque: eu ⇒ es -->
|
||||||
|
<languageMatch desired="fo" supported="da" distance="20" oneway="true"/> <!-- Faroese: fo ⇒ da -->
|
||||||
|
<languageMatch desired="fy" supported="nl" distance="20" oneway="true"/> <!-- Western Frisian: fy ⇒ nl -->
|
||||||
|
<languageMatch desired="ga" supported="en" distance="20" oneway="true"/> <!-- Irish: ga ⇒ en -->
|
||||||
|
<languageMatch desired="gaa" supported="en" distance="30" oneway="true"/> <!-- Ga: gaa ⇒ en -->
|
||||||
|
<languageMatch desired="gd" supported="en" distance="20" oneway="true"/> <!-- Scottish Gaelic: gd ⇒ en -->
|
||||||
|
<languageMatch desired="gl" supported="es" distance="20" oneway="true"/> <!-- Galician: gl ⇒ es -->
|
||||||
|
<languageMatch desired="gn" supported="es" distance="20" oneway="true"/> <!-- Guarani: gn ⇒ es -->
|
||||||
|
<languageMatch desired="gu" supported="hi" distance="30" oneway="true"/> <!-- Gujarati: gu ⇒ hi -->
|
||||||
|
<languageMatch desired="ha" supported="en" distance="30" oneway="true"/> <!-- Hausa: ha ⇒ en -->
|
||||||
|
<languageMatch desired="haw" supported="en" distance="20" oneway="true"/> <!-- Hawaiian: haw ⇒ en -->
|
||||||
|
<languageMatch desired="ht" supported="fr" distance="20" oneway="true"/> <!-- Haitian Creole: ht ⇒ fr -->
|
||||||
|
<languageMatch desired="hy" supported="ru" distance="30" oneway="true"/> <!-- Armenian: hy ⇒ ru -->
|
||||||
|
<languageMatch desired="ia" supported="en" distance="30" oneway="true"/> <!-- Interlingua: ia ⇒ en -->
|
||||||
|
<languageMatch desired="ig" supported="en" distance="30" oneway="true"/> <!-- Igbo: ig ⇒ en -->
|
||||||
|
<languageMatch desired="is" supported="en" distance="20" oneway="true"/> <!-- Icelandic: is ⇒ en -->
|
||||||
|
<languageMatch desired="jv" supported="id" distance="20" oneway="true"/> <!-- Javanese: jv ⇒ id -->
|
||||||
|
<languageMatch desired="ka" supported="en" distance="30" oneway="true"/> <!-- Georgian: ka ⇒ en -->
|
||||||
|
<languageMatch desired="kg" supported="fr" distance="30" oneway="true"/> <!-- Kongo: kg ⇒ fr -->
|
||||||
|
<languageMatch desired="kk" supported="ru" distance="30" oneway="true"/> <!-- Kazakh: kk ⇒ ru -->
|
||||||
|
<languageMatch desired="km" supported="en" distance="30" oneway="true"/> <!-- Khmer: km ⇒ en -->
|
||||||
|
<languageMatch desired="kn" supported="en" distance="30" oneway="true"/> <!-- Kannada: kn ⇒ en -->
|
||||||
|
<languageMatch desired="kri" supported="en" distance="30" oneway="true"/> <!-- Krio: kri ⇒ en -->
|
||||||
|
<languageMatch desired="ku" supported="tr" distance="30" oneway="true"/> <!-- Kurdish: ku ⇒ tr -->
|
||||||
|
<languageMatch desired="ky" supported="ru" distance="30" oneway="true"/> <!-- Kirghiz: ky ⇒ ru -->
|
||||||
|
<languageMatch desired="la" supported="it" distance="20" oneway="true"/> <!-- Latin: la ⇒ it -->
|
||||||
|
<languageMatch desired="lg" supported="en" distance="30" oneway="true"/> <!-- Luganda: lg ⇒ en -->
|
||||||
|
<languageMatch desired="ln" supported="fr" distance="30" oneway="true"/> <!-- Lingala: ln ⇒ fr -->
|
||||||
|
<languageMatch desired="lo" supported="en" distance="30" oneway="true"/> <!-- Lao: lo ⇒ en -->
|
||||||
|
<languageMatch desired="loz" supported="en" distance="30" oneway="true"/> <!-- Lozi: loz ⇒ en -->
|
||||||
|
<languageMatch desired="lua" supported="fr" distance="30" oneway="true"/> <!-- Luba-Lulua: lua ⇒ fr -->
|
||||||
|
<languageMatch desired="mfe" supported="en" distance="30" oneway="true"/> <!-- Morisyen: mfe ⇒ en -->
|
||||||
|
<languageMatch desired="mg" supported="fr" distance="30" oneway="true"/> <!-- Malagasy: mg ⇒ fr -->
|
||||||
|
<languageMatch desired="mi" supported="en" distance="20" oneway="true"/> <!-- Maori: mi ⇒ en -->
|
||||||
|
|
||||||
|
<!-- CLDR-13625: Macedonian should not fall back to Bulgarian -->
|
||||||
|
<!-- languageMatch desired="mk" supported="bg" distance="30" oneway="true"/--> <!-- Macedonian: mk ⇒ bg -->
|
||||||
|
|
||||||
|
<languageMatch desired="ml" supported="en" distance="30" oneway="true"/> <!-- Malayalam: ml ⇒ en -->
|
||||||
|
<languageMatch desired="mn" supported="ru" distance="30" oneway="true"/> <!-- Mongolian: mn ⇒ ru -->
|
||||||
|
<languageMatch desired="mr" supported="hi" distance="30" oneway="true"/> <!-- Marathi: mr ⇒ hi -->
|
||||||
|
<languageMatch desired="ms" supported="id" distance="30" oneway="true"/> <!-- Malay: ms ⇒ id -->
|
||||||
|
<languageMatch desired="mt" supported="en" distance="30" oneway="true"/> <!-- Maltese: mt ⇒ en -->
|
||||||
|
<languageMatch desired="my" supported="en" distance="30" oneway="true"/> <!-- Myanmar: my ⇒ en -->
|
||||||
|
<languageMatch desired="ne" supported="en" distance="30" oneway="true"/> <!-- Nepali: ne ⇒ en -->
|
||||||
|
<languageMatch desired="nn" supported="nb" distance="20"/> <!-- Nynorsk: nn ⟺ nb -->
|
||||||
|
<languageMatch desired="nn" supported="no" distance="20"/> <!-- Nynorsk: nn ⟺ no; CLDR-13679 -->
|
||||||
|
<languageMatch desired="nso" supported="en" distance="30" oneway="true"/> <!-- Northern Sotho: nso ⇒ en -->
|
||||||
|
<languageMatch desired="ny" supported="en" distance="30" oneway="true"/> <!-- Nyanja: ny ⇒ en -->
|
||||||
|
<languageMatch desired="nyn" supported="en" distance="30" oneway="true"/> <!-- Nyankole: nyn ⇒ en -->
|
||||||
|
<languageMatch desired="oc" supported="fr" distance="20" oneway="true"/> <!-- Occitan: oc ⇒ fr -->
|
||||||
|
<languageMatch desired="om" supported="en" distance="30" oneway="true"/> <!-- Oromo: om ⇒ en -->
|
||||||
|
<languageMatch desired="or" supported="en" distance="30" oneway="true"/> <!-- Odia: or ⇒ en -->
|
||||||
|
<languageMatch desired="pa" supported="en" distance="30" oneway="true"/> <!-- Punjabi: pa ⇒ en -->
|
||||||
|
<languageMatch desired="pcm" supported="en" distance="20" oneway="true"/> <!-- Nigerian Pidgin: pcm ⇒ en -->
|
||||||
|
<languageMatch desired="ps" supported="en" distance="30" oneway="true"/> <!-- Pashto: ps ⇒ en -->
|
||||||
|
<languageMatch desired="qu" supported="es" distance="30" oneway="true"/> <!-- Quechua: qu ⇒ es -->
|
||||||
|
<languageMatch desired="rm" supported="de" distance="20" oneway="true"/> <!-- Romansh: rm ⇒ de -->
|
||||||
|
<languageMatch desired="rn" supported="en" distance="30" oneway="true"/> <!-- Rundi: rn ⇒ en -->
|
||||||
|
<languageMatch desired="rw" supported="fr" distance="30" oneway="true"/> <!-- Kinyarwanda: rw ⇒ fr -->
|
||||||
|
<languageMatch desired="sa" supported="hi" distance="30" oneway="true"/> <!-- Sanskrit: sa ⇒ hi -->
|
||||||
|
<languageMatch desired="sd" supported="en" distance="30" oneway="true"/> <!-- Sindhi: sd ⇒ en -->
|
||||||
|
<languageMatch desired="si" supported="en" distance="30" oneway="true"/> <!-- Sinhalese: si ⇒ en -->
|
||||||
|
<languageMatch desired="sn" supported="en" distance="30" oneway="true"/> <!-- Shona: sn ⇒ en -->
|
||||||
|
<languageMatch desired="so" supported="en" distance="30" oneway="true"/> <!-- Somali: so ⇒ en -->
|
||||||
|
<languageMatch desired="sq" supported="en" distance="30" oneway="true"/> <!-- Albanian: sq ⇒ en -->
|
||||||
|
<languageMatch desired="st" supported="en" distance="30" oneway="true"/> <!-- Southern Sotho: st ⇒ en -->
|
||||||
|
<languageMatch desired="su" supported="id" distance="20" oneway="true"/> <!-- Sundanese: su ⇒ id -->
|
||||||
|
<languageMatch desired="sw" supported="en" distance="30" oneway="true"/> <!-- Swahili: sw ⇒ en -->
|
||||||
|
<languageMatch desired="ta" supported="en" distance="30" oneway="true"/> <!-- Tamil: ta ⇒ en -->
|
||||||
|
<languageMatch desired="te" supported="en" distance="30" oneway="true"/> <!-- Telugu: te ⇒ en -->
|
||||||
|
<languageMatch desired="tg" supported="ru" distance="30" oneway="true"/> <!-- Tajik: tg ⇒ ru -->
|
||||||
|
<languageMatch desired="ti" supported="en" distance="30" oneway="true"/> <!-- Tigrinya: ti ⇒ en -->
|
||||||
|
<languageMatch desired="tk" supported="ru" distance="30" oneway="true"/> <!-- Turkmen: tk ⇒ ru -->
|
||||||
|
<languageMatch desired="tlh" supported="en" distance="30" oneway="true"/> <!-- Klingon: tlh ⇒ en -->
|
||||||
|
<languageMatch desired="tn" supported="en" distance="30" oneway="true"/> <!-- Tswana: tn ⇒ en -->
|
||||||
|
<languageMatch desired="to" supported="en" distance="30" oneway="true"/> <!-- Tonga: to ⇒ en -->
|
||||||
|
<languageMatch desired="tt" supported="ru" distance="30" oneway="true"/> <!-- Tatar: tt ⇒ ru -->
|
||||||
|
<languageMatch desired="tum" supported="en" distance="30" oneway="true"/> <!-- Tumbuka: tum ⇒ en -->
|
||||||
|
<languageMatch desired="ug" supported="zh" distance="20" oneway="true"/> <!-- Uighur: ug ⇒ zh -->
|
||||||
|
<languageMatch desired="ur" supported="en" distance="30" oneway="true"/> <!-- Urdu: ur ⇒ en -->
|
||||||
|
<languageMatch desired="uz" supported="ru" distance="30" oneway="true"/> <!-- Uzbek: uz ⇒ ru -->
|
||||||
|
<languageMatch desired="wo" supported="fr" distance="30" oneway="true"/> <!-- Wolof: wo ⇒ fr -->
|
||||||
|
<languageMatch desired="xh" supported="en" distance="30" oneway="true"/> <!-- Xhosa: xh ⇒ en -->
|
||||||
|
<languageMatch desired="yi" supported="en" distance="30" oneway="true"/> <!-- Yiddish: yi ⇒ en -->
|
||||||
|
<languageMatch desired="yo" supported="en" distance="30" oneway="true"/> <!-- Yoruba: yo ⇒ en -->
|
||||||
|
<languageMatch desired="zu" supported="en" distance="30" oneway="true"/> <!-- Zulu: zu ⇒ en -->
|
||||||
|
|
||||||
|
<!-- START generated by GenerateLanguageMatches.java: don't manually change -->
|
||||||
|
<!-- Encompassed by Arabic -->
|
||||||
|
<languageMatch desired="aao" supported="ar" distance="10" oneway="true"/> <!-- Algerian Saharan Arabic -->
|
||||||
|
<languageMatch desired="abh" supported="ar" distance="10" oneway="true"/> <!-- Tajiki Arabic -->
|
||||||
|
<languageMatch desired="abv" supported="ar" distance="10" oneway="true"/> <!-- Baharna Arabic -->
|
||||||
|
<languageMatch desired="acm" supported="ar" distance="10" oneway="true"/> <!-- Mesopotamian Arabic -->
|
||||||
|
<languageMatch desired="acq" supported="ar" distance="10" oneway="true"/> <!-- Ta'izzi-Adeni Arabic -->
|
||||||
|
<languageMatch desired="acw" supported="ar" distance="10" oneway="true"/> <!-- Hijazi Arabic -->
|
||||||
|
<languageMatch desired="acx" supported="ar" distance="10" oneway="true"/> <!-- Omani Arabic -->
|
||||||
|
<languageMatch desired="acy" supported="ar" distance="10" oneway="true"/> <!-- Cypriot Arabic -->
|
||||||
|
<languageMatch desired="adf" supported="ar" distance="10" oneway="true"/> <!-- Dhofari Arabic -->
|
||||||
|
<languageMatch desired="aeb" supported="ar" distance="10" oneway="true"/> <!-- Tunisian Arabic -->
|
||||||
|
<languageMatch desired="aec" supported="ar" distance="10" oneway="true"/> <!-- Saidi Arabic -->
|
||||||
|
<languageMatch desired="afb" supported="ar" distance="10" oneway="true"/> <!-- Gulf Arabic -->
|
||||||
|
<languageMatch desired="ajp" supported="ar" distance="10" oneway="true"/> <!-- South Levantine Arabic -->
|
||||||
|
<languageMatch desired="apc" supported="ar" distance="10" oneway="true"/> <!-- North Levantine Arabic -->
|
||||||
|
<languageMatch desired="apd" supported="ar" distance="10" oneway="true"/> <!-- Sudanese Arabic -->
|
||||||
|
<languageMatch desired="arq" supported="ar" distance="10" oneway="true"/> <!-- Algerian Arabic -->
|
||||||
|
<languageMatch desired="ars" supported="ar" distance="10" oneway="true"/> <!-- Najdi Arabic -->
|
||||||
|
<languageMatch desired="ary" supported="ar" distance="10" oneway="true"/> <!-- Moroccan Arabic -->
|
||||||
|
<languageMatch desired="arz" supported="ar" distance="10" oneway="true"/> <!-- Egyptian Arabic -->
|
||||||
|
<languageMatch desired="auz" supported="ar" distance="10" oneway="true"/> <!-- Uzbeki Arabic -->
|
||||||
|
<languageMatch desired="avl" supported="ar" distance="10" oneway="true"/> <!-- Eastern Egyptian Bedawi Arabic -->
|
||||||
|
<languageMatch desired="ayh" supported="ar" distance="10" oneway="true"/> <!-- Hadrami Arabic -->
|
||||||
|
<languageMatch desired="ayl" supported="ar" distance="10" oneway="true"/> <!-- Libyan Arabic -->
|
||||||
|
<languageMatch desired="ayn" supported="ar" distance="10" oneway="true"/> <!-- Sanaani Arabic -->
|
||||||
|
<languageMatch desired="ayp" supported="ar" distance="10" oneway="true"/> <!-- North Mesopotamian Arabic -->
|
||||||
|
<languageMatch desired="bbz" supported="ar" distance="10" oneway="true"/> <!-- Babalia Creole Arabic -->
|
||||||
|
<languageMatch desired="pga" supported="ar" distance="10" oneway="true"/> <!-- Sudanese Creole Arabic -->
|
||||||
|
<languageMatch desired="shu" supported="ar" distance="10" oneway="true"/> <!-- Chadian Arabic -->
|
||||||
|
<languageMatch desired="ssh" supported="ar" distance="10" oneway="true"/> <!-- Shihhi Arabic -->
|
||||||
|
<!-- Encompassed by Azerbaijani -->
|
||||||
|
<languageMatch desired="azb" supported="az" distance="10" oneway="true"/> <!-- South Azerbaijani -->
|
||||||
|
<!-- Encompassed by Estonian -->
|
||||||
|
<languageMatch desired="vro" supported="et" distance="10" oneway="true"/> <!-- Võro -->
|
||||||
|
<!-- Encompassed by Fulah -->
|
||||||
|
<languageMatch desired="ffm" supported="ff" distance="10" oneway="true"/> <!-- Maasina Fulfulde -->
|
||||||
|
<languageMatch desired="fub" supported="ff" distance="10" oneway="true"/> <!-- Adamawa Fulfulde -->
|
||||||
|
<languageMatch desired="fue" supported="ff" distance="10" oneway="true"/> <!-- Borgu Fulfulde -->
|
||||||
|
<languageMatch desired="fuf" supported="ff" distance="10" oneway="true"/> <!-- Pular -->
|
||||||
|
<languageMatch desired="fuh" supported="ff" distance="10" oneway="true"/> <!-- Western Niger Fulfulde -->
|
||||||
|
<languageMatch desired="fui" supported="ff" distance="10" oneway="true"/> <!-- Bagirmi Fulfulde -->
|
||||||
|
<languageMatch desired="fuq" supported="ff" distance="10" oneway="true"/> <!-- Central-Eastern Niger Fulfulde -->
|
||||||
|
<languageMatch desired="fuv" supported="ff" distance="10" oneway="true"/> <!-- Nigerian Fulfulde -->
|
||||||
|
<!-- Encompassed by Guarani -->
|
||||||
|
<languageMatch desired="gnw" supported="gn" distance="10" oneway="true"/> <!-- Western Bolivian Guaraní -->
|
||||||
|
<languageMatch desired="gui" supported="gn" distance="10" oneway="true"/> <!-- Eastern Bolivian Guaraní -->
|
||||||
|
<languageMatch desired="gun" supported="gn" distance="10" oneway="true"/> <!-- Mbyá Guaraní -->
|
||||||
|
<languageMatch desired="nhd" supported="gn" distance="10" oneway="true"/> <!-- Chiripá -->
|
||||||
|
<!-- Encompassed by Inuktitut -->
|
||||||
|
<languageMatch desired="ikt" supported="iu" distance="10" oneway="true"/> <!-- Inuinnaqtun -->
|
||||||
|
<!-- Encompassed by Kalenjin -->
|
||||||
|
<languageMatch desired="enb" supported="kln" distance="10" oneway="true"/> <!-- Markweeta -->
|
||||||
|
<languageMatch desired="eyo" supported="kln" distance="10" oneway="true"/> <!-- Keiyo -->
|
||||||
|
<languageMatch desired="niq" supported="kln" distance="10" oneway="true"/> <!-- Nandi -->
|
||||||
|
<languageMatch desired="oki" supported="kln" distance="10" oneway="true"/> <!-- Okiek -->
|
||||||
|
<languageMatch desired="pko" supported="kln" distance="10" oneway="true"/> <!-- Pökoot -->
|
||||||
|
<languageMatch desired="sgc" supported="kln" distance="10" oneway="true"/> <!-- Kipsigis -->
|
||||||
|
<languageMatch desired="tec" supported="kln" distance="10" oneway="true"/> <!-- Terik -->
|
||||||
|
<languageMatch desired="tuy" supported="kln" distance="10" oneway="true"/> <!-- Tugen -->
|
||||||
|
<!-- Encompassed by Konkani -->
|
||||||
|
<languageMatch desired="gom" supported="kok" distance="10" oneway="true"/> <!-- Goan Konkani -->
|
||||||
|
<!-- Encompassed by Kpelle -->
|
||||||
|
<languageMatch desired="gkp" supported="kpe" distance="10" oneway="true"/> <!-- Guinea Kpelle -->
|
||||||
|
<!-- Encompassed by Luyia -->
|
||||||
|
<languageMatch desired="ida" supported="luy" distance="10" oneway="true"/> <!-- Idakho-Isukha-Tiriki -->
|
||||||
|
<languageMatch desired="lkb" supported="luy" distance="10" oneway="true"/> <!-- Kabras -->
|
||||||
|
<languageMatch desired="lko" supported="luy" distance="10" oneway="true"/> <!-- Khayo -->
|
||||||
|
<languageMatch desired="lks" supported="luy" distance="10" oneway="true"/> <!-- Kisa -->
|
||||||
|
<languageMatch desired="lri" supported="luy" distance="10" oneway="true"/> <!-- Marachi -->
|
||||||
|
<languageMatch desired="lrm" supported="luy" distance="10" oneway="true"/> <!-- Marama -->
|
||||||
|
<languageMatch desired="lsm" supported="luy" distance="10" oneway="true"/> <!-- Saamia -->
|
||||||
|
<languageMatch desired="lto" supported="luy" distance="10" oneway="true"/> <!-- Tsotso -->
|
||||||
|
<languageMatch desired="lts" supported="luy" distance="10" oneway="true"/> <!-- Tachoni -->
|
||||||
|
<languageMatch desired="lwg" supported="luy" distance="10" oneway="true"/> <!-- Wanga -->
|
||||||
|
<languageMatch desired="nle" supported="luy" distance="10" oneway="true"/> <!-- East Nyala -->
|
||||||
|
<languageMatch desired="nyd" supported="luy" distance="10" oneway="true"/> <!-- Nyore -->
|
||||||
|
<languageMatch desired="rag" supported="luy" distance="10" oneway="true"/> <!-- Logooli -->
|
||||||
|
<!-- Encompassed by Latvian -->
|
||||||
|
<languageMatch desired="ltg" supported="lv" distance="10" oneway="true"/> <!-- Latgalian -->
|
||||||
|
<!-- Encompassed by Malagasy -->
|
||||||
|
<languageMatch desired="bhr" supported="mg" distance="10" oneway="true"/> <!-- Bara Malagasy -->
|
||||||
|
<languageMatch desired="bjq" supported="mg" distance="10" oneway="true"/> <!-- Southern Betsimisaraka Malagasy -->
|
||||||
|
<languageMatch desired="bmm" supported="mg" distance="10" oneway="true"/> <!-- Northern Betsimisaraka Malagasy -->
|
||||||
|
<languageMatch desired="bzc" supported="mg" distance="10" oneway="true"/> <!-- Southern Betsimisaraka Malagasy -->
|
||||||
|
<languageMatch desired="msh" supported="mg" distance="10" oneway="true"/> <!-- Masikoro Malagasy -->
|
||||||
|
<languageMatch desired="skg" supported="mg" distance="10" oneway="true"/> <!-- Sakalava Malagasy -->
|
||||||
|
<languageMatch desired="tdx" supported="mg" distance="10" oneway="true"/> <!-- Tandroy-Mahafaly Malagasy -->
|
||||||
|
<languageMatch desired="tkg" supported="mg" distance="10" oneway="true"/> <!-- Tesaka Malagasy -->
|
||||||
|
<languageMatch desired="txy" supported="mg" distance="10" oneway="true"/> <!-- Tanosy Malagasy -->
|
||||||
|
<languageMatch desired="xmv" supported="mg" distance="10" oneway="true"/> <!-- Antankarana Malagasy -->
|
||||||
|
<languageMatch desired="xmw" supported="mg" distance="10" oneway="true"/> <!-- Tsimihety Malagasy -->
|
||||||
|
<!-- Encompassed by Mongolian -->
|
||||||
|
<languageMatch desired="mvf" supported="mn" distance="10" oneway="true"/> <!-- Peripheral Mongolian -->
|
||||||
|
<!-- Encompassed by Malay -->
|
||||||
|
<languageMatch desired="bjn" supported="ms" distance="10" oneway="true"/> <!-- Banjar -->
|
||||||
|
<languageMatch desired="btj" supported="ms" distance="10" oneway="true"/> <!-- Bacanese Malay -->
|
||||||
|
<languageMatch desired="bve" supported="ms" distance="10" oneway="true"/> <!-- Berau Malay -->
|
||||||
|
<languageMatch desired="bvu" supported="ms" distance="10" oneway="true"/> <!-- Bukit Malay -->
|
||||||
|
<languageMatch desired="coa" supported="ms" distance="10" oneway="true"/> <!-- Cocos Islands Malay -->
|
||||||
|
<languageMatch desired="dup" supported="ms" distance="10" oneway="true"/> <!-- Duano -->
|
||||||
|
<languageMatch desired="hji" supported="ms" distance="10" oneway="true"/> <!-- Haji -->
|
||||||
|
<languageMatch desired="id" supported="ms" distance="10" oneway="true"/> <!-- Indonesian -->
|
||||||
|
<languageMatch desired="jak" supported="ms" distance="10" oneway="true"/> <!-- Jakun -->
|
||||||
|
<languageMatch desired="jax" supported="ms" distance="10" oneway="true"/> <!-- Jambi Malay -->
|
||||||
|
<languageMatch desired="kvb" supported="ms" distance="10" oneway="true"/> <!-- Kubu -->
|
||||||
|
<languageMatch desired="kvr" supported="ms" distance="10" oneway="true"/> <!-- Kerinci -->
|
||||||
|
<languageMatch desired="kxd" supported="ms" distance="10" oneway="true"/> <!-- Brunei -->
|
||||||
|
<languageMatch desired="lce" supported="ms" distance="10" oneway="true"/> <!-- Loncong -->
|
||||||
|
<languageMatch desired="lcf" supported="ms" distance="10" oneway="true"/> <!-- Lubu -->
|
||||||
|
<languageMatch desired="liw" supported="ms" distance="10" oneway="true"/> <!-- Col -->
|
||||||
|
<languageMatch desired="max" supported="ms" distance="10" oneway="true"/> <!-- North Moluccan Malay -->
|
||||||
|
<languageMatch desired="meo" supported="ms" distance="10" oneway="true"/> <!-- Kedah Malay -->
|
||||||
|
<languageMatch desired="mfa" supported="ms" distance="10" oneway="true"/> <!-- Pattani Malay -->
|
||||||
|
<languageMatch desired="mfb" supported="ms" distance="10" oneway="true"/> <!-- Bangka -->
|
||||||
|
<languageMatch desired="min" supported="ms" distance="10" oneway="true"/> <!-- Minangkabau -->
|
||||||
|
<languageMatch desired="mqg" supported="ms" distance="10" oneway="true"/> <!-- Kota Bangun Kutai Malay -->
|
||||||
|
<languageMatch desired="msi" supported="ms" distance="10" oneway="true"/> <!-- Sabah Malay -->
|
||||||
|
<languageMatch desired="mui" supported="ms" distance="10" oneway="true"/> <!-- Musi -->
|
||||||
|
<languageMatch desired="orn" supported="ms" distance="10" oneway="true"/> <!-- Orang Kanaq -->
|
||||||
|
<languageMatch desired="ors" supported="ms" distance="10" oneway="true"/> <!-- Orang Seletar -->
|
||||||
|
<languageMatch desired="pel" supported="ms" distance="10" oneway="true"/> <!-- Pekal -->
|
||||||
|
<languageMatch desired="pse" supported="ms" distance="10" oneway="true"/> <!-- Central Malay -->
|
||||||
|
<languageMatch desired="tmw" supported="ms" distance="10" oneway="true"/> <!-- Temuan -->
|
||||||
|
<languageMatch desired="urk" supported="ms" distance="10" oneway="true"/> <!-- Urak Lawoi' -->
|
||||||
|
<languageMatch desired="vkk" supported="ms" distance="10" oneway="true"/> <!-- Kaur -->
|
||||||
|
<languageMatch desired="vkt" supported="ms" distance="10" oneway="true"/> <!-- Tenggarong Kutai Malay -->
|
||||||
|
<languageMatch desired="xmm" supported="ms" distance="10" oneway="true"/> <!-- Manado Malay -->
|
||||||
|
<languageMatch desired="zlm" supported="ms" distance="10" oneway="true"/> <!-- Malay (individual language) -->
|
||||||
|
<languageMatch desired="zmi" supported="ms" distance="10" oneway="true"/> <!-- Negeri Sembilan Malay -->
|
||||||
|
<!-- Encompassed by Nepali -->
|
||||||
|
<languageMatch desired="dty" supported="ne" distance="10" oneway="true"/> <!-- Dotyali -->
|
||||||
|
<!-- Encompassed by Oromo -->
|
||||||
|
<languageMatch desired="gax" supported="om" distance="10" oneway="true"/> <!-- Borana-Arsi-Guji Oromo -->
|
||||||
|
<languageMatch desired="hae" supported="om" distance="10" oneway="true"/> <!-- Eastern Oromo -->
|
||||||
|
<languageMatch desired="orc" supported="om" distance="10" oneway="true"/> <!-- Orma -->
|
||||||
|
<!-- Encompassed by Odia -->
|
||||||
|
<languageMatch desired="spv" supported="or" distance="10" oneway="true"/> <!-- Sambalpuri -->
|
||||||
|
<!-- Encompassed by Pashto -->
|
||||||
|
<languageMatch desired="pbt" supported="ps" distance="10" oneway="true"/> <!-- Southern Pashto -->
|
||||||
|
<languageMatch desired="pst" supported="ps" distance="10" oneway="true"/> <!-- Central Pashto -->
|
||||||
|
<!-- Encompassed by Quechua -->
|
||||||
|
<languageMatch desired="qub" supported="qu" distance="10" oneway="true"/> <!-- Huallaga Huánuco Quechua -->
|
||||||
|
<languageMatch desired="qud" supported="qu" distance="10" oneway="true"/> <!-- Calderón Highland Quichua -->
|
||||||
|
<languageMatch desired="quf" supported="qu" distance="10" oneway="true"/> <!-- Lambayeque Quechua -->
|
||||||
|
<languageMatch desired="qug" supported="qu" distance="10" oneway="true"/> <!-- Chimborazo Highland Quichua -->
|
||||||
|
<languageMatch desired="quh" supported="qu" distance="10" oneway="true"/> <!-- South Bolivian Quechua -->
|
||||||
|
<languageMatch desired="quk" supported="qu" distance="10" oneway="true"/> <!-- Chachapoyas Quechua -->
|
||||||
|
<languageMatch desired="qul" supported="qu" distance="10" oneway="true"/> <!-- North Bolivian Quechua -->
|
||||||
|
<languageMatch desired="qup" supported="qu" distance="10" oneway="true"/> <!-- Southern Pastaza Quechua -->
|
||||||
|
<languageMatch desired="qur" supported="qu" distance="10" oneway="true"/> <!-- Yanahuanca Pasco Quechua -->
|
||||||
|
<languageMatch desired="qus" supported="qu" distance="10" oneway="true"/> <!-- Santiago del Estero Quichua -->
|
||||||
|
<languageMatch desired="quw" supported="qu" distance="10" oneway="true"/> <!-- Tena Lowland Quichua -->
|
||||||
|
<languageMatch desired="qux" supported="qu" distance="10" oneway="true"/> <!-- Yauyos Quechua -->
|
||||||
|
<languageMatch desired="quy" supported="qu" distance="10" oneway="true"/> <!-- Ayacucho Quechua -->
|
||||||
|
<languageMatch desired="qva" supported="qu" distance="10" oneway="true"/> <!-- Ambo-Pasco Quechua -->
|
||||||
|
<languageMatch desired="qvc" supported="qu" distance="10" oneway="true"/> <!-- Cajamarca Quechua -->
|
||||||
|
<languageMatch desired="qve" supported="qu" distance="10" oneway="true"/> <!-- Eastern Apurímac Quechua -->
|
||||||
|
<languageMatch desired="qvh" supported="qu" distance="10" oneway="true"/> <!-- Huamalíes-Dos de Mayo Huánuco Quechua -->
|
||||||
|
<languageMatch desired="qvi" supported="qu" distance="10" oneway="true"/> <!-- Imbabura Highland Quichua -->
|
||||||
|
<languageMatch desired="qvj" supported="qu" distance="10" oneway="true"/> <!-- Loja Highland Quichua -->
|
||||||
|
<languageMatch desired="qvl" supported="qu" distance="10" oneway="true"/> <!-- Cajatambo North Lima Quechua -->
|
||||||
|
<languageMatch desired="qvm" supported="qu" distance="10" oneway="true"/> <!-- Margos-Yarowilca-Lauricocha Quechua -->
|
||||||
|
<languageMatch desired="qvn" supported="qu" distance="10" oneway="true"/> <!-- North Junín Quechua -->
|
||||||
|
<languageMatch desired="qvo" supported="qu" distance="10" oneway="true"/> <!-- Napo Lowland Quechua -->
|
||||||
|
<languageMatch desired="qvp" supported="qu" distance="10" oneway="true"/> <!-- Pacaraos Quechua -->
|
||||||
|
<languageMatch desired="qvs" supported="qu" distance="10" oneway="true"/> <!-- San Martín Quechua -->
|
||||||
|
<languageMatch desired="qvw" supported="qu" distance="10" oneway="true"/> <!-- Huaylla Wanca Quechua -->
|
||||||
|
<languageMatch desired="qvz" supported="qu" distance="10" oneway="true"/> <!-- Northern Pastaza Quichua -->
|
||||||
|
<languageMatch desired="qwa" supported="qu" distance="10" oneway="true"/> <!-- Corongo Ancash Quechua -->
|
||||||
|
<languageMatch desired="qwc" supported="qu" distance="10" oneway="true"/> <!-- Classical Quechua -->
|
||||||
|
<languageMatch desired="qwh" supported="qu" distance="10" oneway="true"/> <!-- Huaylas Ancash Quechua -->
|
||||||
|
<languageMatch desired="qws" supported="qu" distance="10" oneway="true"/> <!-- Sihuas Ancash Quechua -->
|
||||||
|
<languageMatch desired="qxa" supported="qu" distance="10" oneway="true"/> <!-- Chiquián Ancash Quechua -->
|
||||||
|
<languageMatch desired="qxc" supported="qu" distance="10" oneway="true"/> <!-- Chincha Quechua -->
|
||||||
|
<languageMatch desired="qxh" supported="qu" distance="10" oneway="true"/> <!-- Panao Huánuco Quechua -->
|
||||||
|
<languageMatch desired="qxl" supported="qu" distance="10" oneway="true"/> <!-- Salasaca Highland Quichua -->
|
||||||
|
<languageMatch desired="qxn" supported="qu" distance="10" oneway="true"/> <!-- Northern Conchucos Ancash Quechua -->
|
||||||
|
<languageMatch desired="qxo" supported="qu" distance="10" oneway="true"/> <!-- Southern Conchucos Ancash Quechua -->
|
||||||
|
<languageMatch desired="qxp" supported="qu" distance="10" oneway="true"/> <!-- Puno Quechua -->
|
||||||
|
<languageMatch desired="qxr" supported="qu" distance="10" oneway="true"/> <!-- Cañar Highland Quichua -->
|
||||||
|
<languageMatch desired="qxt" supported="qu" distance="10" oneway="true"/> <!-- Santa Ana de Tusi Pasco Quechua -->
|
||||||
|
<languageMatch desired="qxu" supported="qu" distance="10" oneway="true"/> <!-- Arequipa-La Unión Quechua -->
|
||||||
|
<languageMatch desired="qxw" supported="qu" distance="10" oneway="true"/> <!-- Jauja Wanca Quechua -->
|
||||||
|
<!-- Encompassed by Sardinian -->
|
||||||
|
<languageMatch desired="sdc" supported="sc" distance="10" oneway="true"/> <!-- Sassarese Sardinian -->
|
||||||
|
<languageMatch desired="sdn" supported="sc" distance="10" oneway="true"/> <!-- Gallurese Sardinian -->
|
||||||
|
<languageMatch desired="sro" supported="sc" distance="10" oneway="true"/> <!-- Campidanese Sardinian -->
|
||||||
|
<!-- Encompassed by Albanian -->
|
||||||
|
<languageMatch desired="aae" supported="sq" distance="10" oneway="true"/> <!-- Arbëreshë Albanian -->
|
||||||
|
<languageMatch desired="aat" supported="sq" distance="10" oneway="true"/> <!-- Arvanitika Albanian -->
|
||||||
|
<languageMatch desired="aln" supported="sq" distance="10" oneway="true"/> <!-- Gheg Albanian -->
|
||||||
|
<!-- Encompassed by Syriac -->
|
||||||
|
<languageMatch desired="aii" supported="syr" distance="10" oneway="true"/> <!-- Assyrian Neo-Aramaic -->
|
||||||
|
<!-- Encompassed by Uzbek -->
|
||||||
|
<languageMatch desired="uzs" supported="uz" distance="10" oneway="true"/> <!-- Southern Uzbek -->
|
||||||
|
<!-- Encompassed by Yiddish -->
|
||||||
|
<languageMatch desired="yih" supported="yi" distance="10" oneway="true"/> <!-- Western Yiddish -->
|
||||||
|
<!-- Encompassed by Chinese, Mandarin -->
|
||||||
|
<languageMatch desired="cdo" supported="zh" distance="10" oneway="true"/> <!-- Min Dong Chinese -->
|
||||||
|
<languageMatch desired="cjy" supported="zh" distance="10" oneway="true"/> <!-- Jinyu Chinese -->
|
||||||
|
<languageMatch desired="cpx" supported="zh" distance="10" oneway="true"/> <!-- Pu-Xian Chinese -->
|
||||||
|
<languageMatch desired="czh" supported="zh" distance="10" oneway="true"/> <!-- Huizhou Chinese -->
|
||||||
|
<languageMatch desired="czo" supported="zh" distance="10" oneway="true"/> <!-- Min Zhong Chinese -->
|
||||||
|
<languageMatch desired="gan" supported="zh" distance="10" oneway="true"/> <!-- Gan Chinese -->
|
||||||
|
<languageMatch desired="hak" supported="zh" distance="10" oneway="true"/> <!-- Hakka Chinese -->
|
||||||
|
<languageMatch desired="hsn" supported="zh" distance="10" oneway="true"/> <!-- Xiang Chinese -->
|
||||||
|
<languageMatch desired="lzh" supported="zh" distance="10" oneway="true"/> <!-- Literary Chinese -->
|
||||||
|
<languageMatch desired="mnp" supported="zh" distance="10" oneway="true"/> <!-- Min Bei Chinese -->
|
||||||
|
<languageMatch desired="nan" supported="zh" distance="10" oneway="true"/> <!-- Min Nan Chinese -->
|
||||||
|
<languageMatch desired="wuu" supported="zh" distance="10" oneway="true"/> <!-- Wu Chinese -->
|
||||||
|
<languageMatch desired="yue" supported="zh" distance="10" oneway="true"/> <!-- Chinese, Cantonese -->
|
||||||
|
<!-- END generated by GenerateLanguageMatches.java -->
|
||||||
|
<languageMatch desired="*" supported="*" distance="80"/> <!-- * ⇒ * -->
|
||||||
|
<languageMatch desired="az_Latn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- az; Latn ⇒ ru; Cyrl -->
|
||||||
|
<languageMatch desired="bn_Beng" supported="en_Latn" distance="10" oneway="true"/> <!-- bn; Beng ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="hy_Armn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- hy; Armn ⇒ ru; Cyrl -->
|
||||||
|
<languageMatch desired="ka_Geor" supported="en_Latn" distance="10" oneway="true"/> <!-- ka; Geor ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="km_Khmr" supported="en_Latn" distance="10" oneway="true"/> <!-- km; Khmr ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="kn_Knda" supported="en_Latn" distance="10" oneway="true"/> <!-- kn; Knda ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="lo_Laoo" supported="en_Latn" distance="10" oneway="true"/> <!-- lo; Laoo ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="ml_Mlym" supported="en_Latn" distance="10" oneway="true"/> <!-- ml; Mlym ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="my_Mymr" supported="en_Latn" distance="10" oneway="true"/> <!-- my; Mymr ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="ne_Deva" supported="en_Latn" distance="10" oneway="true"/> <!-- ne; Deva ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="or_Orya" supported="en_Latn" distance="10" oneway="true"/> <!-- or; Orya ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="pa_Guru" supported="en_Latn" distance="10" oneway="true"/> <!-- pa; Guru ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="ps_Arab" supported="en_Latn" distance="10" oneway="true"/> <!-- ps; Arab ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="sd_Arab" supported="en_Latn" distance="10" oneway="true"/> <!-- sd; Arab ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="si_Sinh" supported="en_Latn" distance="10" oneway="true"/> <!-- si; Sinh ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="ta_Taml" supported="en_Latn" distance="10" oneway="true"/> <!-- ta; Taml ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="te_Telu" supported="en_Latn" distance="10" oneway="true"/> <!-- te; Telu ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="ti_Ethi" supported="en_Latn" distance="10" oneway="true"/> <!-- ti; Ethi ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="tk_Latn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- tk; Latn ⇒ ru; Cyrl -->
|
||||||
|
<languageMatch desired="ur_Arab" supported="en_Latn" distance="10" oneway="true"/> <!-- ur; Arab ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="uz_Latn" supported="ru_Cyrl" distance="10" oneway="true"/> <!-- uz; Latn ⇒ ru; Cyrl -->
|
||||||
|
<languageMatch desired="yi_Hebr" supported="en_Latn" distance="10" oneway="true"/> <!-- yi; Hebr ⇒ en; Latn -->
|
||||||
|
<languageMatch desired="sr_Latn" supported="sr_Cyrl" distance="5"/> <!-- sr; Latn ⇒ sr; Cyrl -->
|
||||||
|
<languageMatch desired="zh_Hans" supported="zh_Hant" distance="15" oneway="true"/> <!-- zh; Hans ⇒ zh; Hant -->
|
||||||
|
<languageMatch desired="zh_Hant" supported="zh_Hans" distance="19" oneway="true"/> <!-- zh; Hant ⇒ zh; Hans -->
|
||||||
|
<!-- zh_Hani: Slightly bigger distance than zh_Hant->zh_Hans -->
|
||||||
|
<languageMatch desired="zh_Hani" supported="zh_Hans" distance="20" oneway="true"/>
|
||||||
|
<languageMatch desired="zh_Hani" supported="zh_Hant" distance="20" oneway="true"/>
|
||||||
|
<!-- Latin transliterations of some languages, initially from CLDR-13577 -->
|
||||||
|
<languageMatch desired="ar_Latn" supported="ar_Arab" distance="20" oneway="true"/>
|
||||||
|
<languageMatch desired="bn_Latn" supported="bn_Beng" distance="20" oneway="true"/>
|
||||||
|
<languageMatch desired="gu_Latn" supported="gu_Gujr" distance="20" oneway="true"/>
|
||||||
|
<languageMatch desired="hi_Latn" supported="hi_Deva" distance="20" oneway="true"/>
|
||||||
|
<languageMatch desired="kn_Latn" supported="kn_Knda" distance="20" oneway="true"/>
|
||||||
|
<languageMatch desired="ml_Latn" supported="ml_Mlym" distance="20" oneway="true"/>
|
||||||
|
<languageMatch desired="mr_Latn" supported="mr_Deva" distance="20" oneway="true"/>
|
||||||
|
<languageMatch desired="ta_Latn" supported="ta_Taml" distance="20" oneway="true"/>
|
||||||
|
<languageMatch desired="te_Latn" supported="te_Telu" distance="20" oneway="true"/>
|
||||||
|
<languageMatch desired="zh_Latn" supported="zh_Hans" distance="20" oneway="true"/> <!-- Pinyin -->
|
||||||
|
<!-- start fallbacks for group script codes, initially from CLDR-13526
|
||||||
|
Look for plus signs on https://www.unicode.org/iso15924/iso15924-codes.html -->
|
||||||
|
<languageMatch desired="ja_Latn" supported="ja_Jpan" distance="5" oneway="true"/>
|
||||||
|
<languageMatch desired="ja_Hani" supported="ja_Jpan" distance="5" oneway="true"/>
|
||||||
|
<languageMatch desired="ja_Hira" supported="ja_Jpan" distance="5" oneway="true"/>
|
||||||
|
<languageMatch desired="ja_Kana" supported="ja_Jpan" distance="5" oneway="true"/>
|
||||||
|
<languageMatch desired="ja_Hrkt" supported="ja_Jpan" distance="5" oneway="true"/>
|
||||||
|
<languageMatch desired="ja_Hira" supported="ja_Hrkt" distance="5" oneway="true"/>
|
||||||
|
<languageMatch desired="ja_Kana" supported="ja_Hrkt" distance="5" oneway="true"/>
|
||||||
|
<languageMatch desired="ko_Hani" supported="ko_Kore" distance="5" oneway="true"/>
|
||||||
|
<languageMatch desired="ko_Hang" supported="ko_Kore" distance="5" oneway="true"/>
|
||||||
|
<languageMatch desired="ko_Jamo" supported="ko_Kore" distance="5" oneway="true"/>
|
||||||
|
<languageMatch desired="ko_Jamo" supported="ko_Hang" distance="5" oneway="true"/>
|
||||||
|
<!-- No special mappings for zh Bopo/Hanb
|
||||||
|
because Bopomofo is used only in TW, and unsure how widely.
|
||||||
|
No special mappings for styled scripts like Latf or Aran
|
||||||
|
because those would apply to many languages;
|
||||||
|
if desired, those would be better handled as matcher-specific script aliases. -->
|
||||||
|
<!-- end fallbacks for group script codes -->
|
||||||
|
<!-- default script mismatch distance -->
|
||||||
|
<languageMatch desired="*_*" supported="*_*" distance="50"/> <!-- *; * ⇒ *; * -->
|
||||||
|
|
||||||
|
<languageMatch desired="ar_*_$maghreb" supported="ar_*_$maghreb" distance="4"/> <!-- ar; *; $maghreb ⇒ ar; *; $maghreb -->
|
||||||
|
<languageMatch desired="ar_*_$!maghreb" supported="ar_*_$!maghreb" distance="4"/> <!-- ar; *; $!maghreb ⇒ ar; *; $!maghreb -->
|
||||||
|
<languageMatch desired="ar_*_*" supported="ar_*_*" distance="5"/> <!-- ar; *; * ⇒ ar; *; * -->
|
||||||
|
<languageMatch desired="en_*_$enUS" supported="en_*_$enUS" distance="4"/> <!-- en; *; $enUS ⇒ en; *; $enUS -->
|
||||||
|
<languageMatch desired="en_*_$!enUS" supported="en_*_GB" distance="3"/> <!-- Make en_GB preferred... -->
|
||||||
|
<languageMatch desired="en_*_$!enUS" supported="en_*_$!enUS" distance="4"/> <!-- en; *; $!enUS ⇒ en; *; $!enUS -->
|
||||||
|
<languageMatch desired="en_*_*" supported="en_*_*" distance="5"/> <!-- en; *; * ⇒ en; *; * -->
|
||||||
|
<languageMatch desired="es_*_$americas" supported="es_*_$americas" distance="4"/> <!-- es; *; $americas ⇒ es; *; $americas -->
|
||||||
|
<languageMatch desired="es_*_$!americas" supported="es_*_$!americas" distance="4"/> <!-- es; *; $!americas ⇒ es; *; $!americas -->
|
||||||
|
<languageMatch desired="es_*_*" supported="es_*_*" distance="5"/> <!-- es; *; * ⇒ es; *; * -->
|
||||||
|
<languageMatch desired="pt_*_$americas" supported="pt_*_$americas" distance="4"/> <!-- pt; *; $americas ⇒ pt; *; $americas -->
|
||||||
|
<languageMatch desired="pt_*_$!americas" supported="pt_*_$!americas" distance="4"/> <!-- pt; *; $!americas ⇒ pt; *; $!americas -->
|
||||||
|
<languageMatch desired="pt_*_*" supported="pt_*_*" distance="5"/> <!-- pt; *; * ⇒ pt; *; * -->
|
||||||
|
<languageMatch desired="zh_Hant_$cnsar" supported="zh_Hant_$cnsar" distance="4"/> <!-- zh; Hant; $cnsar ⇒ zh; Hant; $cnsar -->
|
||||||
|
<languageMatch desired="zh_Hant_$!cnsar" supported="zh_Hant_$!cnsar" distance="4"/> <!-- zh; Hant; $!cnsar ⇒ zh; Hant; $!cnsar -->
|
||||||
|
<languageMatch desired="zh_Hant_*" supported="zh_Hant_*" distance="5"/> <!-- zh; Hant; * ⇒ zh; Hant; * -->
|
||||||
|
<languageMatch desired="*_*_*" supported="*_*_*" distance="4"/> <!-- *; *; * ⇒ *; *; * -->
|
||||||
|
</languageMatches>
|
||||||
|
</languageMatching>
|
||||||
|
</supplementalData>
|
3
lib/language_data/data/override_language_names.csv
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
zsm,zsm,bahasa Malaysia
|
||||||
|
id,id,bahasa Indonesia
|
||||||
|
ms,ms,bahasa Malaysia
|
|
5645
lib/language_data/data/supplementalData.xml
Normal file
7845
lib/language_data/data/wiktionary/codes-en.csv
Normal file
89
lib/language_data/language_lists.py
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
# This is the list of language codes with the 'modern' level of support in CLDR
# (compared to 'full', which contains many more languages). We use this as the
# list of languages that we store specific name-to-code mappings for.
#
# Entries are language subtags only (no script or territory). Most are
# two-letter codes; "fil", "und" and "yue" are three-letter codes.
# A set is used, so order is irrelevant and membership tests are cheap.
CLDR_LANGUAGES = {
    "af",
    "am",
    "ar",
    "as",
    "az",
    "be",
    "bg",
    "bn",
    "bs",
    "ca",
    "cs",
    "cy",
    "da",
    "de",
    "el",
    "en",
    "es",
    "et",
    "eu",
    "fa",
    "fi",
    "fil",
    "fr",
    "ga",
    "gl",
    "gu",
    "he",
    "hi",
    "hr",
    "hu",
    "hy",
    "id",
    "is",
    "it",
    "ja",
    "jv",
    "ka",
    "kk",
    "km",
    "kn",
    "ko",
    "ky",
    "lo",
    "lt",
    "lv",
    "mk",
    "ml",
    "mn",
    "mr",
    "ms",
    "my",
    "nb",
    "ne",
    "nl",
    "or",
    "pa",
    "pl",
    "pt",
    "ro",
    "ru",
    "sd",
    "si",
    "sk",
    "sl",
    "so",
    "sq",
    "sr",
    "sv",
    "sw",
    "ta",
    "te",
    "th",
    "ti",
    "tk",
    "tr",
    "uk",
    "und",
    "ur",
    "uz",
    "vi",
    "yue",
    "zh",
    "zu",
}
|
9210
lib/language_data/name_data.py
Normal file
112
lib/language_data/names.py
Normal file
|
@ -0,0 +1,112 @@
|
||||||
|
# import marisa_trie
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
from language_data.util import data_filename
|
||||||
|
|
||||||
|
|
||||||
|
# Cache of loaded name-lookup tries. name_to_code() fills it lazily, using
# keys of the form "<language>/name_to_<category>".
TRIES = {}

# This is something we could hypothetically discover from XML files, but
# we end up learning that most languages separate things with commas, with
# a few exceptions. We'll just put those exceptions here.
#
# Keys are language codes; values are the separator string for that language.
# Some values deliberately include a trailing space and some do not, so the
# literals must be kept exactly as written.
DISPLAY_SEPARATORS = {
    'am': '፣',
    'ar': '، ',
    'brx': ',',
    'fa': '، ',
    'ja': '、',
    'my': '၊ ',
    'ug': '، ',
    'und': ', ',
    'ur': '، ',
    'yue': ',',
    'zh': ',',
}
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_name(name):
    """
    Reduce a language, script, or territory name to a canonical lookup form.

    The name is case-folded, typographic apostrophes become plain ones,
    hyphens become spaces, and parentheses and commas are dropped; finally
    surrounding whitespace is stripped. As a result "Chinese (Traditional)",
    "Chinese Traditional" and "chinese traditional" all normalize to the
    same string.
    """
    # (old, new) pairs, applied in this order after case-folding.
    substitutions = (
        ("’", "'"),
        ("-", " "),
        ("(", ""),
        (")", ""),
        (",", ""),
    )
    folded = name.casefold()
    for old, new in substitutions:
        folded = folded.replace(old, new)
    return folded.strip()
|
||||||
|
|
||||||
|
|
||||||
|
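# NOTE: load_trie is intentionally disabled in this vendored copy so that the
# marisa_trie package is not required. name_to_code() still refers to it and
# will raise NameError if it has to load a trie that is not already cached.
# def load_trie(filename):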
|
||||||
|
# """
|
||||||
|
# Load a BytesTrie from the marisa_trie on-disk format.
|
||||||
|
# """
|
||||||
|
# trie = marisa_trie.BytesTrie()
|
||||||
|
# # marisa_trie raises warnings that make no sense. Ignore them.
|
||||||
|
# with warnings.catch_warnings():
|
||||||
|
# warnings.simplefilter("ignore")
|
||||||
|
# trie.load(filename)
|
||||||
|
# return trie
|
||||||
|
|
||||||
|
|
||||||
|
def get_trie_value(trie, key):
    """
    Return the first value stored under `key` in a BytesTrie-like mapping,
    decoded from UTF-8 to `str`.

    A missing key propagates the KeyError raised by the lookup itself.
    """
    stored_values = trie[key]
    first_value = stored_values[0]
    return first_value.decode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def name_to_code(category, name, language: str = "und"):
    """
    Look up the code of a language, script, or territory from its name.

    `category` selects which kind of code is wanted and `language` is the
    language the name is written in. `language` must be a bare language
    subtag (no "-" or "/"); `Language.find` can normalize other forms first.

    With the default language, "und", names in any language are matched, so
    "French", "Français" and "francés" all give 'fr'.

    Matching is slightly fuzzy: when the normalized name is not found
    exactly, the longest stored name that is a prefix of it is accepted,
    provided that prefix has at least 4 characters. This lets, for example,
    "Hakka Chinese" match "Hakka".

    Passing an explicit `language` resolves names that are ambiguous across
    languages: 'Malayo' in English is a different language from 'Malayo' in
    Spanish (which is Malay), and the lookup then uses a trie holding only
    names in that language.

    Returns the code as a string, or None when nothing matches.

    NOTE: in this vendored copy `load_trie` is commented out, so a lookup
    that needs a trie not already present in TRIES raises NameError.
    """
    assert "/" not in language, "Language codes cannot contain slashes"
    assert "-" not in language, "This code should be reduced to a language subtag only"

    cache_key = "{}/name_to_{}".format(language, category)
    if cache_key not in TRIES:
        # Load lazily and keep the trie around for later calls.
        TRIES[cache_key] = load_trie(
            data_filename("trie/{}.marisa".format(cache_key))
        )
    trie = TRIES[cache_key]

    wanted = normalize_name(name)
    if wanted in trie:
        return get_trie_value(trie, wanted)

    # Is this a language name plus extra verbiage? Maybe it has "...isch",
    # "... language", or "... Chinese" attached to it, for example. Accept
    # the last (longest) matching prefix if it has at least 4 characters.
    candidates = trie.prefixes(wanted)
    if not candidates or len(candidates[-1]) < 4:
        return None
    return get_trie_value(trie, candidates[-1])
|
||||||
|
|
||||||
|
|
||||||
|
def code_to_names(code):
    """
    Return a dictionary of the names, in various languages, of the language,
    script, or territory identified by `code`.

    An unknown code gives an empty dictionary.
    """
    # Imported here rather than at module level so the name table is only
    # loaded (and only costs memory) when this function is actually used.
    from language_data.name_data import CODE_TO_NAMES

    return CODE_TO_NAMES.get(code, {})
|
6140
lib/language_data/population_data.py
Normal file
58
lib/language_data/registry_parser.py
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
from language_data.util import data_filename
|
||||||
|
|
||||||
|
# Registry fields whose values parse_item() collects into a list; any other
# field is stored as a single string and must appear once per record.
LIST_KEYS = {'Description', 'Prefix'}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_file(file):
    """
    Take an open file containing the IANA subtag registry, and yield a
    dictionary of information for each subtag it describes.

    The file is read line by line. Lines are buffered until a '%%' separator
    (or the end of the file) is reached, and each buffered group is then
    handed to `parse_item`, whose results are yielded as-is.
    """
    lines = []
    for line in file:
        line = line.rstrip('\n')
        if line == '%%':
            # This is a separator between items. Parse the data we've
            # collected and yield the result.
            yield from parse_item(lines)
            # `yield from` has run parse_item to completion, so the same
            # list can be emptied and reused for the next record.
            lines.clear()
        elif line.startswith(' '):
            # This is a continuation line. Concatenate it to the previous
            # line, including one of the spaces.
            # NOTE(review): the wording above suggests the prefix tested for
            # is two spaces; confirm the literal has not lost whitespace.
            lines[-1] += line[1:]
        else:
            lines.append(line)
    # Flush whatever was collected after the last separator.
    yield from parse_item(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_item(lines):
    """
    Parse the lines of one registry record (wrapped lines already joined)
    into a dictionary.

    Each line is split once on ': ' into a field name and a value. Fields
    named in LIST_KEYS accumulate their values in a list; every other field
    keeps a single string value and may not be repeated.

    This is a generator: it yields the dictionary once when the record has a
    'Subtag' or 'Tag' field, and yields nothing otherwise (for example for
    the registry header).
    """
    record = {}
    for entry in lines:
        field, text = entry.split(': ', 1)
        if field not in LIST_KEYS:
            assert field not in record
            record[field] = text
            continue
        record.setdefault(field, []).append(text)

    if any(tag_field in record for tag_field in ('Subtag', 'Tag')):
        yield record
|
||||||
|
|
||||||
|
|
||||||
|
def parse_registry():
    """
    Yield one dictionary per entry of the bundled IANA subtag registry file
    ('language-subtag-registry.txt' in the data directory).
    """
    registry_path = data_filename('language-subtag-registry.txt')
    with open(registry_path, encoding='utf-8') as registry:
        # Delegate with 'yield from' rather than returning the generator, so
        # the file stays open until the caller has consumed every entry.
        yield from parse_file(registry)
|
15
lib/language_data/util.py
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
"""
|
||||||
|
Used for locating a file in the data directory.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os

# The data directory lives next to this module. Resolve it from __file__
# using only the standard library: pkg_resources is part of setuptools,
# which is deprecated for this use and is not guaranteed to be installed,
# so importing it here could make the whole package fail to import.
DATA_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')


def data_filename(filename):
    """
    Given a relative filename, get the full path to that file in the data
    directory.

    The path is only built, not checked: no error is raised here if the
    file does not exist.
    """
    return os.path.join(DATA_ROOT, filename)
|
|
@ -4009,7 +4009,42 @@ class AddShows(Home):
|
||||||
if all_langs:
|
if all_langs:
|
||||||
result.extend([lang['sg_lang'] for lang in all_langs if lang['sg_lang'] not in result])
|
result.extend([lang['sg_lang'] for lang in all_langs if lang['sg_lang'] not in result])
|
||||||
|
|
||||||
return json_dumps({'results': result})
|
try:
|
||||||
|
# noinspection PyPep8Naming
|
||||||
|
from langcodes import Language as lang_obj, LanguageTagError, standardize_tag
|
||||||
|
except ImportError:
|
||||||
|
lang_obj = None
|
||||||
|
result_ext = []
|
||||||
|
if None is not lang_obj:
|
||||||
|
prio_abbr = ''
|
||||||
|
prio_lang = []
|
||||||
|
try:
|
||||||
|
lang = lang_obj.get(sickgear.ADD_SHOWS_METALANG)
|
||||||
|
prio_abbr = lang.to_alpha3()
|
||||||
|
prio_lang = [dict(orig_abbr=sickgear.ADD_SHOWS_METALANG, std_abbr=sickgear.ADD_SHOWS_METALANG,
|
||||||
|
abbr=prio_abbr, en=lang.display_name(), native=lang.autonym())]
|
||||||
|
except (BaseException, Exception) as _:
|
||||||
|
pass
|
||||||
|
dedupe = []
|
||||||
|
for cur_lang in result:
|
||||||
|
try:
|
||||||
|
lang = lang_obj.get(cur_lang)
|
||||||
|
abbr = lang.to_alpha3()
|
||||||
|
except (BaseException, Exception) as _:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
std_abbr = standardize_tag(cur_lang, macro=True)
|
||||||
|
except (BaseException, Exception) as _:
|
||||||
|
std_abbr = None
|
||||||
|
|
||||||
|
if abbr not in dedupe and abbr != prio_abbr:
|
||||||
|
dedupe += [abbr]
|
||||||
|
result_ext += [dict(orig_abbr=cur_lang, std_abbr=std_abbr, abbr=abbr, en=lang.display_name(), native=lang.autonym())]
|
||||||
|
|
||||||
|
result_ext = prio_lang + sorted(result_ext, key=lambda x: x['en'])
|
||||||
|
|
||||||
|
return json_dumps({'results': [] if result_ext else result, 'results_ext': result_ext})
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def generate_show_dir_name(show_name):
|
def generate_show_dir_name(show_name):
|
||||||
|
|