mirror of
https://github.com/SickGear/SickGear.git
synced 2024-12-30 14:43:39 +00:00
1932 lines
68 KiB
Python
1932 lines
68 KiB
Python
|
"""
|
|||
|
langcodes knows what languages are. It knows the standardized codes that
|
|||
|
refer to them, such as `en` for English, `es` for Spanish and `hi` for Hindi.
|
|||
|
Often, it knows what these languages are called *in* a language, and that
|
|||
|
language doesn't have to be English.
|
|||
|
|
|||
|
See README.md for the main documentation, or read it on GitHub at
|
|||
|
https://github.com/LuminosoInsight/langcodes/ . For more specific documentation
|
|||
|
on the functions in langcodes, scroll down and read the docstrings.
|
|||
|
|
|||
|
Some of these functions, particularly those that work with the names of
|
|||
|
languages, require the `language_data` module to be installed.
|
|||
|
"""
|
|||
|
from operator import itemgetter
|
|||
|
from typing import Any, List, Tuple, Dict, Sequence, Iterable, Optional, Mapping, Union
|
|||
|
import warnings
|
|||
|
import sys
|
|||
|
|
|||
|
from langcodes.tag_parser import LanguageTagError, parse_tag, normalize_characters
|
|||
|
from langcodes.language_distance import tuple_distance_cached
|
|||
|
from langcodes.data_dicts import (
|
|||
|
ALL_SCRIPTS,
|
|||
|
DEFAULT_SCRIPTS,
|
|||
|
LANGUAGE_REPLACEMENTS,
|
|||
|
LANGUAGE_ALPHA3,
|
|||
|
LANGUAGE_ALPHA3_BIBLIOGRAPHIC,
|
|||
|
TERRITORY_REPLACEMENTS,
|
|||
|
NORMALIZED_MACROLANGUAGES,
|
|||
|
LIKELY_SUBTAGS,
|
|||
|
VALIDITY,
|
|||
|
)
|
|||
|
|
|||
|
# When we're getting natural language information *about* languages, it's in
# English if you don't specify the language.
DEFAULT_LANGUAGE = 'en'


# Install hint for the optional `language_data` dependency. In this file it is
# printed by `Language.has_name_data` and `Language._get_name` right before
# they re-raise the ImportError that triggered it.
LANGUAGE_NAME_IMPORT_MESSAGE = """
Looking up language names now requires the `language_data` package.

Install it with:
pip install language_data
Or as an optional feature of langcodes:
pip install langcodes[data]
"""
|
|||
|
|
|||
|
|
|||
|
class Language:
|
|||
|
"""
|
|||
|
The Language class defines the results of parsing a language tag.
|
|||
|
Language objects have the following attributes, any of which may be
|
|||
|
unspecified (in which case their value is None):
|
|||
|
|
|||
|
- *language*: the code for the language itself.
|
|||
|
- *script*: the 4-letter code for the writing system being used.
|
|||
|
- *territory*: the 2-letter or 3-digit code for the country or similar territory
|
|||
|
of the world whose usage of the language appears in this text.
|
|||
|
- *extlangs*: a list of more specific language codes that follow the language
|
|||
|
code. (This is allowed by the language code syntax, but deprecated.)
|
|||
|
- *variants*: codes for specific variations of language usage that aren't
|
|||
|
covered by the *script* or *territory* codes.
|
|||
|
- *extensions*: information that's attached to the language code for use in
|
|||
|
some specific system, such as Unicode collation orders.
|
|||
|
- *private*: a code starting with `x-` that has no defined meaning.
|
|||
|
|
|||
|
The `Language.get` method converts a string to a Language instance.
|
|||
|
It's also available at the top level of this module as the `get` function.
|
|||
|
"""
|
|||
|
|
|||
|
# The fields a Language can carry, in the order `to_tag` writes them out.
ATTRIBUTES = [
    'language',
    'extlangs',
    'script',
    'territory',
    'variants',
    'extensions',
    'private',
]

# When looking up "likely subtags" data, we try looking up the data for
# increasingly less specific versions of the language code.
BROADER_KEYSETS = [
    {'language', 'script', 'territory'},
    {'language', 'territory'},
    {'language', 'script'},
    {'language'},
    {'script'},
    # An empty *set*: the literal `{}` would be an empty dict, which is the
    # wrong type for an entry in a list of key sets.
    set(),
]

MATCHABLE_KEYSETS = [
    {'language', 'script', 'territory'},
    {'language', 'script'},
    {'language'},
]

# Values cached at the class level
# _INSTANCES maps the tuple of attribute values built by `make` to the single
# shared Language object for those values.
_INSTANCES: Dict[tuple, 'Language'] = {}
# _PARSE_CACHE maps (tag string, normalize flag) to the result of `get`.
_PARSE_CACHE: Dict[Tuple[str, bool], 'Language'] = {}
|
|||
|
|
|||
|
def __init__(
    self,
    language: Optional[str] = None,
    extlangs: Optional[Sequence[str]] = None,
    script: Optional[str] = None,
    territory: Optional[str] = None,
    variants: Optional[Sequence[str]] = None,
    extensions: Optional[Sequence[str]] = None,
    private: Optional[str] = None,
):
    """
    The constructor for Language objects.

    It's inefficient to call this directly, because it can't return
    an existing instance. Instead, call Language.make(), which
    has the same signature.

    Every argument is stored as-is on the attribute of the same name; an
    attribute that was not given stays None.
    """
    self.language = language
    self.extlangs = extlangs
    self.script = script
    self.territory = territory
    self.variants = variants
    self.extensions = extensions
    self.private = private

    # Cached values. Each starts as None and is filled in lazily by the
    # method named alongside it.
    self._simplified: Optional['Language'] = None  # simplify_script()
    # NOTE(review): _searchable is not written by any method visible in this
    # part of the file -- confirm where it is used.
    self._searchable: Optional['Language'] = None
    self._broader: Optional[List[str]] = None  # broader_tags()
    self._assumed: Optional['Language'] = None  # assume_script()
    self._filled: Optional['Language'] = None  # maximize()
    self._macrolanguage: Optional['Language'] = None  # prefer_macrolanguage()
    self._str_tag: Optional[str] = None  # to_tag()
    # NOTE(review): _dict, _disp_separator and _disp_pattern are presumably
    # filled by to_dict() and the display-name helpers further down the
    # class -- confirm, they are outside this part of the file.
    self._dict: Optional[dict] = None
    self._disp_separator: Optional[str] = None
    self._disp_pattern: Optional[str] = None

    # Make sure the str_tag value is cached
    self.to_tag()
|
|||
|
|
|||
|
@classmethod
def make(
    cls,
    language: Optional[str] = None,
    extlangs: Optional[Sequence[str]] = None,
    script: Optional[str] = None,
    territory: Optional[str] = None,
    variants: Optional[Sequence[str]] = None,
    extensions: Optional[Sequence[str]] = None,
    private: Optional[str] = None,
) -> 'Language':
    """
    Return the shared Language object for the given attribute values.

    Any subset of the attributes may be supplied. The first call with a
    particular combination constructs the object and records it in
    `cls._INSTANCES`; later calls with the same values get that same
    object back.
    """
    # Sequences are frozen into tuples so the key is hashable; a missing
    # sequence and an empty one share the same key.
    cache_key = (
        language,
        tuple(extlangs or ()),
        script,
        territory,
        tuple(variants or ()),
        tuple(extensions or ()),
        private,
    )
    try:
        return cls._INSTANCES[cache_key]
    except KeyError:
        pass

    created = cls(
        language=language,
        extlangs=extlangs,
        script=script,
        territory=territory,
        variants=variants,
        extensions=extensions,
        private=private,
    )
    cls._INSTANCES[cache_key] = created
    return created
|
|||
|
|
|||
|
@staticmethod
def get(tag: Union[str, 'Language'], normalize: bool = True) -> 'Language':
    """
    Create a Language object from a language tag string.

    If normalize=True, non-standard or overlong tags will be replaced as
    they're interpreted. This is recommended.

    Results are memoized in `Language._PARSE_CACHE`, keyed by the tag
    string and the `normalize` flag.

    Here are several examples of language codes, which are also test cases.
    Most language codes are straightforward, but these examples will get
    pretty obscure toward the end.

    >>> Language.get('en-US')
    Language.make(language='en', territory='US')

    >>> Language.get('zh-Hant')
    Language.make(language='zh', script='Hant')

    >>> Language.get('und')
    Language.make()

    This function is idempotent, in case you already have a Language object:

    >>> Language.get(Language.get('en-us'))
    Language.make(language='en', territory='US')

    The non-code 'root' is sometimes used to represent the lack of any
    language information, similar to 'und'.

    >>> Language.get('root')
    Language.make()

    By default, getting a Language object will automatically convert
    deprecated tags:

    >>> Language.get('iw')
    Language.make(language='he')

    >>> Language.get('in')
    Language.make(language='id')

    One type of deprecated tag that should be replaced is for sign
    languages, which used to all be coded as regional variants of a
    fictitious global sign language called 'sgn'. Of course, there is no
    global sign language, so sign languages now have their own language
    codes.

    >>> Language.get('sgn-US')
    Language.make(language='ase')

    >>> Language.get('sgn-US', normalize=False)
    Language.make(language='sgn', territory='US')

    'en-gb-oed' is a tag that's grandfathered into the standard because it
    has been used to mean "spell-check this with Oxford English Dictionary
    spelling", but that tag has the wrong shape. We interpret this as the
    new standardized tag 'en-gb-oxendict', unless asked not to normalize.

    >>> Language.get('en-gb-oed')
    Language.make(language='en', territory='GB', variants=['oxendict'])

    >>> Language.get('en-gb-oed', normalize=False)
    Language.make(language='en-gb-oed')

    'zh-min-nan' is another oddly-formed tag, used to represent the
    Southern Min language, which includes Taiwanese as a regional form. It
    now has its own language code.

    >>> Language.get('zh-min-nan')
    Language.make(language='nan')

    The vague tag 'zh-min' is now also interpreted as 'nan', with a private
    extension indicating that it had a different form:

    >>> Language.get('zh-min')
    Language.make(language='nan', private='x-zh-min')

    Occasionally Wiktionary will use 'extlang' tags in strange ways, such
    as using the tag 'und-ibe' for some unspecified Iberian language.

    >>> Language.get('und-ibe')
    Language.make(extlangs=['ibe'])

    Here's an example of replacing multiple deprecated tags.

    The language tag 'sh' (Serbo-Croatian) ended up being politically
    problematic, and different standards took different steps to address
    this. The IANA made it into a macrolanguage that contains 'sr', 'hr',
    and 'bs'. Unicode further decided that it's a legacy tag that should
    be interpreted as 'sr-Latn', which the language matching rules say
    is mutually intelligible with all those languages.

    We complicate the example by adding on the territory tag 'QU', an old
    provisional tag for the European Union, which is now standardized as
    'EU'.

    >>> Language.get('sh-QU')
    Language.make(language='sr', script='Latn', territory='EU')
    """
    if isinstance(tag, Language):
        if not normalize:
            # shortcut: we have the tag already
            return tag

        # We might need to normalize this tag. Convert it back into a
        # string tag, to cover all the edge cases of normalization in a
        # way that we've already solved.
        tag = tag.to_tag()

    # Remember the key for the tag exactly as it was requested. `tag` may be
    # swapped for a whole-tag replacement below, and caching only under the
    # replacement would make every repeat lookup of the requested tag miss.
    requested_key = (tag, normalize)
    if requested_key in Language._PARSE_CACHE:
        return Language._PARSE_CACHE[requested_key]

    data: Dict[str, Any] = {}

    # If the complete tag appears as something to normalize, do the
    # normalization right away. Smash case and convert underscores to
    # hyphens when checking, because the case normalization that comes from
    # parse_tag() hasn't been applied yet.
    tag_lower = normalize_characters(tag)
    if normalize and tag_lower in LANGUAGE_REPLACEMENTS:
        tag = LANGUAGE_REPLACEMENTS[tag_lower]

    components = parse_tag(tag)

    for typ, value in components:
        if typ == 'extlang' and normalize and 'language' in data:
            # smash extlangs when possible
            minitag = f"{data['language']}-{value}"
            norm = LANGUAGE_REPLACEMENTS.get(normalize_characters(minitag))
            if norm is not None:
                data.update(Language.get(norm, normalize).to_dict())
            else:
                data.setdefault('extlangs', []).append(value)
        elif typ in {'extlang', 'variant', 'extension'}:
            data.setdefault(typ + 's', []).append(value)
        elif typ == 'language':
            if value == 'und':
                pass
            elif normalize:
                replacement = LANGUAGE_REPLACEMENTS.get(value.lower())
                if replacement is not None:
                    # parse the replacement if necessary -- this helps with
                    # Serbian and Moldovan
                    data.update(Language.get(replacement, normalize).to_dict())
                else:
                    data['language'] = value
            else:
                data['language'] = value
        elif typ == 'territory':
            if normalize:
                data['territory'] = TERRITORY_REPLACEMENTS.get(value.lower(), value)
            else:
                data['territory'] = value
        elif typ == 'grandfathered':
            # If we got here, we got a grandfathered tag but we were asked
            # not to normalize it, or the CLDR data doesn't know how to
            # normalize it. The best we can do is set the entire tag as the
            # language.
            data['language'] = value
        else:
            data[typ] = value

    result = Language.make(**data)
    # Store the result under the (possibly replaced) tag, as before, and
    # also under the tag as requested so that the next identical call hits.
    Language._PARSE_CACHE[tag, normalize] = result
    Language._PARSE_CACHE[requested_key] = result
    return result
|
|||
|
|
|||
|
def to_tag(self) -> str:
    """
    Render this Language as a standard language tag string.

    The same string is what str() gives for a Language object. The result
    is computed once and then served from `self._str_tag`.

    >>> Language.make(language='en', territory='GB').to_tag()
    'en-GB'

    >>> Language.make(language='yue', script='Hant', territory='HK').to_tag()
    'yue-Hant-HK'

    >>> Language.make(script='Arab').to_tag()
    'und-Arab'

    >>> str(Language.make(territory='IN'))
    'und-IN'
    """
    cached = self._str_tag
    if cached is not None:
        return cached

    # A missing language is written as the placeholder 'und'.
    parts = [self.language or 'und']
    if self.extlangs:
        parts.extend(sorted(self.extlangs))
    if self.script:
        parts.append(self.script)
    if self.territory:
        parts.append(self.territory)
    if self.variants:
        parts.extend(sorted(self.variants))
    if self.extensions:
        # Unlike extlangs and variants, extensions keep their given order.
        parts.extend(self.extensions)
    if self.private:
        parts.append(self.private)

    self._str_tag = '-'.join(parts)
    return self._str_tag
|
|||
|
|
|||
|
def simplify_script(self) -> 'Language':
    """
    Drop the script when it is just the default script of the language.

    The answer is cached on the instance, so repeated calls are cheap.

    >>> Language.make(language='en', script='Latn').simplify_script()
    Language.make(language='en')

    >>> Language.make(language='yi', script='Latn').simplify_script()
    Language.make(language='yi', script='Latn')

    >>> Language.make(language='yi', script='Hebr').simplify_script()
    Language.make(language='yi')
    """
    cached = self._simplified
    if cached is not None:
        return cached

    # The table is only consulted when both a language and a script are set.
    script_is_redundant = (
        self.language
        and self.script
        and DEFAULT_SCRIPTS.get(self.language) == self.script
    )
    if script_is_redundant:
        self._simplified = self.update_dict({'script': None})
    else:
        self._simplified = self
    return self._simplified
|
|||
|
|
|||
|
def assume_script(self) -> 'Language':
    """
    Add the script when it is absent and the language implies one. This
    is the inverse of `simplify_script`.

    >>> Language.make(language='en').assume_script()
    Language.make(language='en', script='Latn')

    >>> Language.make(language='yi').assume_script()
    Language.make(language='yi', script='Hebr')

    >>> Language.make(language='yi', script='Latn').assume_script()
    Language.make(language='yi', script='Latn')

    Nothing is filled in when no single script can be assumed -- such as
    when the language has multiple scripts, or it has no standard
    orthography:

    >>> Language.make(language='sr').assume_script()
    Language.make(language='sr')

    >>> Language.make(language='eee').assume_script()
    Language.make(language='eee')

    It also doesn't fill anything in when the language is unspecified.

    >>> Language.make(territory='US').assume_script()
    Language.make(territory='US')
    """
    if self._assumed is None:
        outcome = self
        if self.language and not self.script:
            try:
                outcome = self.update_dict(
                    {'script': DEFAULT_SCRIPTS[self.language]}
                )
            except KeyError:
                # No default script is recorded for this language.
                outcome = self
        self._assumed = outcome
    return self._assumed
|
|||
|
|
|||
|
def prefer_macrolanguage(self) -> 'Language':
    """
    Swap the language for its macrolanguage when it is the dominant
    language within that macrolanguage.

    BCP 47 doesn't specify what to do with macrolanguages and the languages
    they contain. The Unicode CLDR, on the other hand, says that when a
    macrolanguage has a dominant standardized language, the macrolanguage
    code should be used for that language. For example, Mandarin Chinese
    is 'zh', not 'cmn', according to Unicode, and Malay is 'ms', not 'zsm'.

    This isn't a rule you'd want to follow in all cases -- for example, you
    may want to be able to specifically say that 'ms' (the Malay
    macrolanguage) contains both 'zsm' (Standard Malay) and 'id'
    (Indonesian). But applying this rule helps when interoperating with the
    Unicode CLDR.

    Languages that are not listed in NORMALIZED_MACROLANGUAGES, including
    non-dominant members of a macrolanguage, are returned unchanged. The
    answer is cached on the instance.

    >>> Language.get('arb').prefer_macrolanguage()
    Language.make(language='ar')

    >>> Language.get('cmn-Hant').prefer_macrolanguage()
    Language.make(language='zh', script='Hant')

    >>> Language.get('yue-Hant').prefer_macrolanguage()
    Language.make(language='yue', script='Hant')
    """
    cached = self._macrolanguage
    if cached is not None:
        return cached

    # An unspecified language is looked up under the placeholder 'und'.
    code = self.language or 'und'
    self._macrolanguage = (
        self.update_dict({'language': NORMALIZED_MACROLANGUAGES[code]})
        if code in NORMALIZED_MACROLANGUAGES
        else self
    )
    return self._macrolanguage
|
|||
|
|
|||
|
def to_alpha3(self, variant: str = 'T') -> str:
    """
    Get the three-letter language code for this language, even if it's
    canonically written with a two-letter code.

    These codes are the 'alpha3' codes defined by ISO 639-2.

    When this function returns, it always returns a 3-letter string. If
    there is no known alpha3 code for the language, it raises a LookupError.

    In cases where the distinction matters, we default to the 'terminology'
    code. You can pass `variant='B'` to get the 'bibliographic' code instead.
    For example, the terminology code for German is 'deu', while the
    bibliographic code is 'ger'.

    `variant` is case-insensitive. Anything other than 'B' or 'T' raises a
    ValueError.

    (The confusion between these two sets of codes is a good reason to avoid
    using alpha3 codes. Every language that has two different alpha3 codes
    also has an alpha2 code that's preferred, such as 'de' for German.)

    >>> Language.get('fr').to_alpha3()
    'fra'
    >>> Language.get('fr-CA').to_alpha3()
    'fra'
    >>> Language.get('fr').to_alpha3(variant='B')
    'fre'
    >>> Language.get('de').to_alpha3(variant='T')
    'deu'
    >>> Language.get('ja').to_alpha3()
    'jpn'
    >>> Language.get('un').to_alpha3()
    Traceback (most recent call last):
        ...
    LookupError: 'un' is not a known language code, and has no alpha3 code.


    All valid two-letter language codes have corresponding alpha3 codes,
    even the un-normalized ones. If they were assigned an alpha3 code by ISO
    before they were assigned a normalized code by CLDR, these codes may be
    different:

    >>> Language.get('tl', normalize=False).to_alpha3()
    'tgl'
    >>> Language.get('tl').to_alpha3()
    'fil'
    >>> Language.get('sh', normalize=False).to_alpha3()
    'hbs'


    Three-letter codes are preserved, even if they're unknown:

    >>> Language.get('qqq').to_alpha3()
    'qqq'
    >>> Language.get('und').to_alpha3()
    'und'
    """
    variant = variant.upper()
    # Compare against the two allowed values explicitly. A substring test
    # such as `variant not in 'BT'` would also let '' and 'BT' through.
    if variant not in ('B', 'T'):
        raise ValueError("Variant must be 'B' or 'T'")

    language = self.language
    if language is None:
        return 'und'
    if len(language) == 3:
        # Already a three-letter code; returned without consulting the tables.
        return language

    # The bibliographic table is consulted only for variant 'B'; anything
    # it doesn't list falls back to the terminology code.
    if variant == 'B' and language in LANGUAGE_ALPHA3_BIBLIOGRAPHIC:
        return LANGUAGE_ALPHA3_BIBLIOGRAPHIC[language]
    if language in LANGUAGE_ALPHA3:
        return LANGUAGE_ALPHA3[language]
    raise LookupError(
        f"{language!r} is not a known language code, "
        "and has no alpha3 code."
    )
|
|||
|
|
|||
|
def broader_tags(self) -> List[str]:
    """
    List this tag followed by increasingly general versions of it.

    This isn't actually that useful for matching two arbitrary language tags
    against each other, but it is useful for matching them against a known
    standardized form, such as in the CLDR data.

    The list of broader versions to try appears in UTR 35, section 4.3,
    "Likely Subtags". For each key set in BROADER_KEYSETS, the tag is
    reduced to those keys, first as-is and then with its macrolanguage
    preferred; duplicates are dropped. The list is cached on the instance.

    >>> Language.get('nn-Latn-NO-x-thingy').broader_tags()
    ['nn-Latn-NO-x-thingy', 'nn-Latn-NO', 'nn-NO', 'nn-Latn', 'nn', 'und-Latn', 'und']

    >>> Language.get('arb-Arab').broader_tags()
    ['arb-Arab', 'ar-Arab', 'arb', 'ar', 'und-Arab', 'und']
    """
    if self._broader is not None:
        return self._broader

    own_tag = self.to_tag()
    self._broader = [own_tag]
    emitted = {own_tag}
    for keyset in self.BROADER_KEYSETS:
        for origin in (self, self.prefer_macrolanguage()):
            candidate = origin._filter_attributes(keyset).to_tag()
            if candidate in emitted:
                continue
            self._broader.append(candidate)
            emitted.add(candidate)
    return self._broader
|
|||
|
|
|||
|
def broaden(self) -> 'List[Language]':
    """
    Like `broader_tags`, but returns Language objects instead of strings.

    Each tag is turned back into a Language with `Language.get`.
    """
    tags = self.broader_tags()
    return [Language.get(code) for code in tags]
|
|||
|
|
|||
|
def maximize(self) -> 'Language':
    """
    Fill in missing fields with the values that CLDR considers likely.

    The Unicode CLDR contains a "likelySubtags" data file, which can guess
    reasonable values for fields that are missing from a language tag.

    This is particularly useful for comparing, for example, "zh-Hant" and
    "zh-TW", two common language tags that say approximately the same thing
    via rather different information. (Using traditional Han characters is
    not the same as being in Taiwan, but each implies that the other is
    likely.)

    These implications are provided in the CLDR supplemental data, and are
    based on the likelihood of people using the language to transmit text
    on the Internet. (This is why the overall default is English, not
    Chinese.)

    It's important to recognize that these tags amplify majorities, and
    that not all language support fits into a "likely" language tag.

    >>> str(Language.get('zh-Hant').maximize())
    'zh-Hant-TW'
    >>> str(Language.get('zh-TW').maximize())
    'zh-Hant-TW'
    >>> str(Language.get('ja').maximize())
    'ja-Jpan-JP'
    >>> str(Language.get('pt').maximize())
    'pt-Latn-BR'
    >>> str(Language.get('und-Arab').maximize())
    'ar-Arab-EG'
    >>> str(Language.get('und-CH').maximize())
    'de-Latn-CH'

    As many standards are, this is US-centric:

    >>> str(Language.make().maximize())
    'en-Latn-US'

    "Extlangs" have no likely-subtags information, so they will give
    maximized results that make no sense:

    >>> str(Language.get('und-ibe').maximize())
    'en-ibe-Latn-US'
    """
    if self._filled is not None:
        return self._filled

    # Use the first of the broader tags that has likely-subtags data, then
    # layer this object's own values on top of it.
    for candidate in self.broader_tags():
        if candidate not in LIKELY_SUBTAGS:
            continue
        template = Language.get(LIKELY_SUBTAGS[candidate], normalize=False)
        self._filled = template.update(self)
        return self._filled

    raise RuntimeError(
        "Couldn't fill in likely values. This represents a problem with "
        "the LIKELY_SUBTAGS data."
    )
|
|||
|
|
|||
|
# Support an old, wordier name for the method
|
|||
|
fill_likely_values = maximize
|
|||
|
|
|||
|
def match_score(self, supported: 'Language') -> int:
    """
    DEPRECATED: use .distance() instead, which uses newer data and is _lower_
    for better matching languages.

    Returns `100 - distance`, with the distance capped at 100, so the
    result is between 0 and 100 and _higher_ means a better match. Emits a
    DeprecationWarning on every call.
    """
    warnings.warn(
        "`match_score` is deprecated because it's based on deprecated CLDR info. "
        "Use `distance` instead, which is _lower_ for better matching languages. ",
        DeprecationWarning,
        # Attribute the warning to the caller's line. Without this it is
        # attributed to this module, and the default warning filters only
        # show DeprecationWarning when it is triggered from `__main__`.
        stacklevel=2,
    )
    return 100 - min(self.distance(supported), 100)
|
|||
|
|
|||
|
def distance(self, supported: 'Language') -> int:
    """
    Measure how far a supported language is from the desired one.

    Suppose that `self` is the language that the user desires, and
    `supported` is a language that is actually supported.

    This method returns a number from 0 to 134 measuring the 'distance'
    between the languages (lower numbers are better). This is not a
    symmetric relation.

    The language distance is not really about the linguistic similarity or
    history of the languages; instead, it's based largely on sociopolitical
    factors, indicating which language speakers are likely to know which
    other languages in the present world. Much of the heuristic is about
    finding a widespread 'world language' like English, Chinese, French, or
    Russian that speakers of a more localized language will accept.

    A version that works on language tags, as strings, is in the function
    `tag_distance`. See that function for copious examples.
    """
    if supported == self:
        return 0

    def _as_triple(lang):
        # CLDR has realized that these matching rules are undermined when
        # the unspecified language 'und' gets maximized to 'en-Latn-US', so
        # that case is specifically not maximized.
        if lang.language is None and lang.script is None and lang.territory is None:
            return ('und', 'Zzzz', 'ZZ')
        filled = lang.prefer_macrolanguage().maximize()
        return (filled.language, filled.script, filled.territory)

    desired_triple = _as_triple(self)
    supported_triple = _as_triple(supported)
    return tuple_distance_cached(desired_triple, supported_triple)
|
|||
|
|
|||
|
def is_valid(self) -> bool:
    """
    Checks whether the language, script, territory, and variants
    (if present) are all tags that have meanings assigned by IANA.
    For example, 'ja' (Japanese) is a valid tag, and 'jp' is not.

    The data is current as of CLDR 40.

    >>> Language.get('ja').is_valid()
    True
    >>> Language.get('jp').is_valid()
    False
    >>> Language.get('en-001').is_valid()
    True
    >>> Language.get('en-000').is_valid()
    False
    >>> Language.get('en-Latn').is_valid()
    True
    >>> Language.get('en-Latnx').is_valid()
    False
    >>> Language.get('und').is_valid()
    True
    >>> Language.get('en-GB-oxendict').is_valid()
    True
    >>> Language.get('en-GB-oxenfree').is_valid()
    False
    >>> Language.get('x-heptapod').is_valid()
    True

    Some scripts are, confusingly, not included in CLDR's 'validity' pattern.
    If a script appears in the IANA registry, we consider it valid.

    >>> Language.get('ur-Aran').is_valid()
    True
    >>> Language.get('cu-Cyrs').is_valid()
    True

    A language tag with multiple extlangs will parse, but is not valid.
    The only allowed example is 'zh-min-nan', which normalizes to the
    language 'nan'.

    >>> Language.get('zh-min-nan').is_valid()
    True
    >>> Language.get('sgn-ase-bfi').is_valid()
    False

    These examples check that duplicate tags are not valid:

    >>> Language.get('de-1901').is_valid()
    True
    >>> Language.get('de-1901-1901').is_valid()
    False
    >>> Language.get('en-a-bbb-c-ddd').is_valid()
    True
    >>> Language.get('en-a-bbb-a-ddd').is_valid()
    False

    Of course, you should be prepared to catch a failure to parse the
    language code at all:

    >>> Language.get('C').is_valid()
    Traceback (most recent call last):
        ...
    langcodes.tag_parser.LanguageTagError: Expected a language code, got 'c'
    """
    # An erratum to BCP 47 says that tags with more than one extlang are
    # invalid.
    if self.extlangs is not None and len(self.extlangs) > 1:
        return False

    candidates = [self.language, self.script, self.territory]
    if self.variants is not None:
        candidates.extend(self.variants)
    present = [part for part in candidates if part is not None]

    for part in present:
        # A subtag passes if it starts with 'x-', matches the VALIDITY
        # pattern, or is listed in ALL_SCRIPTS.
        if part.startswith('x-') or VALIDITY.match(part) or part in ALL_SCRIPTS:
            continue
        return False

    # We check extensions for validity by ensuring that there aren't
    # two extensions introduced by the same letter. For example, you can't
    # have two 'u-' extensions.
    keys = list(present)
    if self.extensions:
        keys += [extension[:2] for extension in self.extensions]

    # Any repeated subtag or extension prefix makes the tag invalid.
    return len(set(keys)) == len(keys)
|
|||
|
|
|||
|
def has_name_data(self) -> bool:
    """
    Return True when we can name languages in this language. Requires
    `language_data` to be installed.

    This is true when the language, or one of its 'broader' versions, is in
    the list of CLDR target languages.

    If `language_data` cannot be imported, an install hint is written to
    standard error and the ImportError is re-raised.

    >>> Language.get('fr').has_name_data()
    True
    >>> Language.get('so').has_name_data()
    True
    >>> Language.get('enc').has_name_data()
    False
    >>> Language.get('und').has_name_data()
    False
    """
    try:
        from language_data.name_data import LANGUAGES_WITH_NAME_DATA
    except ImportError:
        # The hint is a diagnostic, so it goes to stderr. Printing it to
        # stdout would mix it into the program's real output.
        print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stderr)
        raise

    matches = set(self.broader_tags()) & LANGUAGES_WITH_NAME_DATA
    return bool(matches)
|
|||
|
|
|||
|
# These methods help to show what the language tag means in natural
|
|||
|
# language. They actually apply the language-matching algorithm to find
|
|||
|
# the right language to name things in.
|
|||
|
|
|||
|
def _get_name(
    self, attribute: str, language: Union[str, 'Language'], max_distance: int
) -> Optional[str]:
    """
    Look up the natural-language name of one subtag of this tag.

    `attribute` is the subtag to name and must be one of `self.ATTRIBUTES`.
    `language` is the language to name it in, as a tag string or a
    Language object. `max_distance` is forwarded to `_best_name` to limit
    how far the naming language may be from the requested one.

    Returns the name that was found. When no name is found, returns a
    placeholder such as "Unknown language [zzz]". Returns None when the
    requested subtag is unset, except for 'language', which is treated
    as 'und'.

    Requires that `language_data` is installed; otherwise the install hint
    is printed and the ImportError is re-raised.
    """
    try:
        from language_data.names import code_to_names
    except ImportError:
        print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout)
        raise

    assert attribute in self.ATTRIBUTES
    if isinstance(language, str):
        language = Language.get(language)

    attr_value = getattr(self, attribute)
    if attr_value is None:
        if attribute != 'language':
            # An absent script/territory has no name at all.
            return None
        attr_value = 'und'

    result = self._best_name(code_to_names(attr_value), language, max_distance)
    if result is not None:
        return result

    # Construct a string like "Unknown language [zzz]", using the name of
    # the "unknown" code of the same subtag type where one exists.
    placeholder = {
        'language': 'und',
        'script': 'Zzzz',
        'territory': 'ZZ',
    }.get(attribute)

    unknown_name = None
    if placeholder is not None:
        unknown_name = self._best_name(
            code_to_names(placeholder), language, max_distance
        )
    if unknown_name is None:
        unknown_name = 'Unknown language subtag'
    return f'{unknown_name} [{attr_value}]'
|
|||
|
|
|||
|
def _best_name(
    self, names: Mapping[str, str], language: 'Language', max_distance: int
):
    """
    Choose the most suitable entry from `names`, a mapping from language
    codes to names, for a reader of `language`.

    Only the keys that appear among `language.broader_tags()` are offered
    to `closest_match`. If the winner is not a key of `names`, the entry
    for DEFAULT_LANGUAGE is returned instead (None if that is missing too).
    """
    broader = set(language.broader_tags())
    candidates = sorted(code for code in names if code in broader)

    best, _score = closest_match(language, candidates, max_distance)
    return names[best] if best in names else names.get(DEFAULT_LANGUAGE)
|
|||
|
|
|||
|
def language_name(
    self,
    language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
    max_distance: int = 25,
) -> str:
    """
    Name the language part of this tag (only the language, not the whole
    tag) in a natural language. The naming language may be passed as a tag
    string or as another Language object.

    Names are in English unless you say otherwise:

    >>> Language.get('fr').language_name()
    'French'
    >>> Language.get('el').language_name()
    'Greek'

    Many other naming languages are available:

    >>> Language.get('fr').language_name('fr')
    'français'
    >>> Language.get('el').language_name('fr')
    'grec'

    Why does everyone get Slovak and Slovenian confused? Let's ask them.

    >>> Language.get('sl').language_name('sl')
    'slovenščina'
    >>> Language.get('sk').language_name('sk')
    'slovenčina'
    >>> Language.get('sl').language_name('sk')
    'slovinčina'
    >>> Language.get('sk').language_name('sl')
    'slovaščina'
    """
    subtag_kind = 'language'
    return self._get_name(subtag_kind, language, max_distance)
|
|||
|
|
|||
|
def display_name(
    self,
    language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
    max_distance: int = 25,
) -> str:
    """
    Describe this Language object *in a language*, so that a user (or you)
    can read it instead of deciphering short codes.

    `.display_name(language, max_distance)` names the language and, when
    the simplified tag still has a script or territory, appends those in
    parentheses. The names come from the IANA language tag registry, which
    is only in English, plus CLDR, which names languages in many
    commonly-used languages.

    The default language for naming things is English:

    >>> Language.make(language='fr').display_name()
    'French'

    >>> Language.make().display_name()
    'Unknown language'

    >>> Language.get('zh-Hans').display_name()
    'Chinese (Simplified)'

    >>> Language.get('en-US').display_name()
    'English (United States)'

    But you can ask for language names in numerous other languages:

    >>> Language.get('fr').display_name('fr')
    'français'

    >>> Language.get('fr').display_name('es')
    'francés'

    >>> Language.make().display_name('es')
    'lengua desconocida'

    >>> Language.get('zh-Hans').display_name('de')
    'Chinesisch (Vereinfacht)'

    >>> Language.get('en-US').display_name('zh-Hans')
    '英语(美国)'
    """
    reduced = self.simplify_script()
    language = Language.get(language)
    base_name = reduced.language_name(language, max_distance)

    # Collect the clarifying details, script first and then territory.
    details = []
    for subtag, namer in (
        (reduced.script, reduced.script_name),
        (reduced.territory, reduced.territory_name),
    ):
        if subtag is not None:
            details.append(namer(language, max_distance))

    if not details:
        return base_name

    # Separator and pattern both come from the naming language.
    clarification = language._display_separator().join(details)
    pattern = language._display_pattern()
    return pattern.format(base_name, clarification)
|
|||
|
|
|||
|
def _display_pattern(self) -> str:
|
|||
|
"""
|
|||
|
Get the pattern, according to CLDR, that should be used for clarifying
|
|||
|
details of a language code.
|
|||
|
"""
|
|||
|
# Technically we are supposed to look up this pattern in each language.
|
|||
|
# Practically, it's the same in every language except Chinese, where the
|
|||
|
# parentheses are full-width.
|
|||
|
if self._disp_pattern is not None:
|
|||
|
return self._disp_pattern
|
|||
|
if self.distance(Language.get('zh')) <= 25:
|
|||
|
self._disp_pattern = "{0}({1})"
|
|||
|
else:
|
|||
|
self._disp_pattern = "{0} ({1})"
|
|||
|
return self._disp_pattern
|
|||
|
|
|||
|
def _display_separator(self) -> str:
    """
    Return the symbol that separates multiple clarifying details, such as
    a comma in English or an ideographic comma in Japanese. The result is
    cached on the instance.

    Requires that `language_data` is installed.
    """
    try:
        from language_data.names import DISPLAY_SEPARATORS
    except ImportError:
        print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout)
        raise

    if self._disp_separator is None:
        # Use the separator of whichever listed language is closest to us.
        nearest, _distance = closest_match(self, DISPLAY_SEPARATORS.keys())
        self._disp_separator = DISPLAY_SEPARATORS[nearest]
    return self._disp_separator
|
|||
|
|
|||
|
def autonym(self, max_distance: int = 9) -> str:
    """
    Return the display name of this language, written *in* this language.
    Requires that `language_data` is installed.

    >>> Language.get('fr').autonym()
    'français'
    >>> Language.get('es').autonym()
    'español'
    >>> Language.get('ja').autonym()
    '日本語'

    The work is done by `display_name()`, so the name of a script or
    territory is included when appropriate.

    >>> Language.get('en-AU').autonym()
    'English (Australia)'
    >>> Language.get('sr-Latn').autonym()
    'srpski (latinica)'
    >>> Language.get('sr-Cyrl').autonym()
    'српски (ћирилица)'
    >>> Language.get('pa').autonym()
    'ਪੰਜਾਬੀ'
    >>> Language.get('pa-Arab').autonym()
    'پنجابی (عربی)'

    This only works for language codes that CLDR has locale data for. You
    can't ask for the autonym of 'ja-Latn' and get 'nihongo (rōmaji)'.
    """
    # The macrolanguage form is both the thing named and the naming language.
    own_language = self.prefer_macrolanguage()
    return own_language.display_name(
        language=own_language, max_distance=max_distance
    )
|
|||
|
|
|||
|
def script_name(
    self,
    language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
    max_distance: int = 25,
) -> str:
    """
    Name the script subtag of this tag in a natural language.
    Requires that `language_data` is installed.

    This delegates to `_get_name`, which returns None when the tag has no
    script subtag.
    """
    subtag_kind = 'script'
    return self._get_name(subtag_kind, language, max_distance)
|
|||
|
|
|||
|
def territory_name(
    self,
    language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
    max_distance: int = 25,
) -> str:
    """
    Name the territory subtag of this tag in a natural language.
    Requires that `language_data` is installed.

    This delegates to `_get_name`, which returns None when the tag has no
    territory subtag.
    """
    subtag_kind = 'territory'
    return self._get_name(subtag_kind, language, max_distance)
|
|||
|
|
|||
|
def region_name(
    self,
    language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
    max_distance: int = 25,
) -> str:
    """
    Deprecated alias for `territory_name`.

    Emits a DeprecationWarning and then returns whatever
    `territory_name(language, max_distance)` returns.
    """
    warnings.warn(
        "`region_name` has been renamed to `territory_name` for consistency",
        DeprecationWarning,
        # Attribute the warning to the code that called region_name().
        stacklevel=2,
    )
    return self.territory_name(language, max_distance)
|
|||
|
|
|||
|
@property
|
|||
|
def region(self):
|
|||
|
warnings.warn(
|
|||
|
"The `region` property has been renamed to `territory` for consistency",
|
|||
|
DeprecationWarning,
|
|||
|
)
|
|||
|
return self.territory
|
|||
|
|
|||
|
def variant_names(
|
|||
|
self,
|
|||
|
language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
|
|||
|
max_distance: int = 25,
|
|||
|
) -> Sequence[str]:
|
|||
|
"""
|
|||
|
Deprecated in version 3.0.
|
|||
|
|
|||
|
We don't store names for variants anymore, so this just returns the list
|
|||
|
of variant codes, such as ['oxendict'] for en-GB-oxendict.
|
|||
|
"""
|
|||
|
warnings.warn(
|
|||
|
"variant_names is deprecated and just returns the variant codes",
|
|||
|
DeprecationWarning,
|
|||
|
)
|
|||
|
return self.variants or []
|
|||
|
|
|||
|
def describe(
|
|||
|
self,
|
|||
|
language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
|
|||
|
max_distance: int = 25,
|
|||
|
) -> dict:
|
|||
|
"""
|
|||
|
Return a dictionary that describes a given language tag in a specified
|
|||
|
natural language. Requires that `language_data` is installed.
|
|||
|
|
|||
|
See `language_name` and related methods for more specific versions of this.
|
|||
|
|
|||
|
The desired `language` will in fact be matched against the available
|
|||
|
options using the matching technique that this module provides. We can
|
|||
|
illustrate many aspects of this by asking for a description of Shavian
|
|||
|
script (a phonetic script for English devised by author George Bernard
|
|||
|
Shaw), and where you might find it, in various languages.
|
|||
|
|
|||
|
>>> shaw = Language.make(script='Shaw').maximize()
|
|||
|
>>> shaw.describe('en')
|
|||
|
{'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'}
|
|||
|
|
|||
|
>>> shaw.describe('fr')
|
|||
|
{'language': 'anglais', 'script': 'shavien', 'territory': 'Royaume-Uni'}
|
|||
|
|
|||
|
>>> shaw.describe('es')
|
|||
|
{'language': 'inglés', 'script': 'shaviano', 'territory': 'Reino Unido'}
|
|||
|
|
|||
|
>>> shaw.describe('pt')
|
|||
|
{'language': 'inglês', 'script': 'shaviano', 'territory': 'Reino Unido'}
|
|||
|
|
|||
|
>>> shaw.describe('uk')
|
|||
|
{'language': 'англійська', 'script': 'шоу', 'territory': 'Велика Британія'}
|
|||
|
|
|||
|
>>> shaw.describe('arb')
|
|||
|
{'language': 'الإنجليزية', 'script': 'الشواني', 'territory': 'المملكة المتحدة'}
|
|||
|
|
|||
|
>>> shaw.describe('th')
|
|||
|
{'language': 'อังกฤษ', 'script': 'ซอเวียน', 'territory': 'สหราชอาณาจักร'}
|
|||
|
|
|||
|
>>> shaw.describe('zh-Hans')
|
|||
|
{'language': '英语', 'script': '萧伯纳式文', 'territory': '英国'}
|
|||
|
|
|||
|
>>> shaw.describe('zh-Hant')
|
|||
|
{'language': '英文', 'script': '簫柏納字符', 'territory': '英國'}
|
|||
|
|
|||
|
>>> shaw.describe('ja')
|
|||
|
{'language': '英語', 'script': 'ショー文字', 'territory': 'イギリス'}
|
|||
|
|
|||
|
When we don't have a localization for the language, we fall back on English,
|
|||
|
because the IANA provides names for all known codes in English.
|
|||
|
|
|||
|
>>> shaw.describe('lol')
|
|||
|
{'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'}
|
|||
|
|
|||
|
When the language tag itself is a valid tag but with no known meaning, we
|
|||
|
say so in the appropriate language.
|
|||
|
|
|||
|
>>> Language.get('xyz-ZY').display_name()
|
|||
|
'Unknown language [xyz] (Unknown Region [ZY])'
|
|||
|
|
|||
|
>>> Language.get('xyz-ZY').display_name('es')
|
|||
|
'lengua desconocida [xyz] (Región desconocida [ZY])'
|
|||
|
"""
|
|||
|
names = {}
|
|||
|
if self.language:
|
|||
|
names['language'] = self.language_name(language, max_distance)
|
|||
|
if self.script:
|
|||
|
names['script'] = self.script_name(language, max_distance)
|
|||
|
if self.territory:
|
|||
|
names['territory'] = self.territory_name(language, max_distance)
|
|||
|
return names
|
|||
|
|
|||
|
def speaking_population(self) -> int:
    """
    Estimate how many people in the world speak this language, using
    figures derived from CLDR data. Requires that `language_data` is
    installed.

    Only the language and territory codes are considered. If a territory
    code is included, the population counts only the speakers of the
    language in that territory.

    Script subtags are disregarded, because it doesn't make sense to ask
    how many people speak in a particular writing script.

    >>> Language.get('es').speaking_population()
    487664083
    >>> Language.get('pt').speaking_population()
    237135429
    >>> Language.get('es-BR').speaking_population()
    76218
    >>> Language.get('pt-BR').speaking_population()
    192661560
    >>> Language.get('vo').speaking_population()
    0
    """
    try:
        from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
    except ImportError:
        print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout)
        raise

    # The table is keyed by tag string; unlisted tags count as 0.
    lookup_tag = str(self._filter_attributes(['language', 'territory']))
    return LANGUAGE_SPEAKING_POPULATION.get(lookup_tag, 0)
|
|||
|
|
|||
|
def writing_population(self) -> int:
    """
    Estimate how many people in the world read and write this language,
    using figures derived from CLDR data. Requires that `language_data`
    is installed.

    For many languages that aren't typically written, this is an
    overestimate, according to CLDR -- the data often includes people who
    speak that language but write in a different language.

    Only the language, script, and territory codes are considered.
    If a territory code is included, the population counts only the
    speakers of the language in that territory.

    >>> all = Language.get('zh').writing_population()
    >>> all
    1240326057

    >>> traditional = Language.get('zh-Hant').writing_population()
    >>> traditional
    37019589

    >>> simplified = Language.get('zh-Hans').writing_population()
    >>> all == traditional + simplified
    True

    >>> Language.get('zh-Hant-HK').writing_population()
    6439733
    >>> Language.get('zh-Hans-HK').writing_population()
    338933

    Note that if you want to get the total Chinese writing population
    of Hong Kong, you need to avoid normalization that would interpret
    'zh-HK' as 'zh-Hant-HK'.

    >>> Language.get('zh-HK', normalize=False).writing_population()
    6778666

    Unknown or unspecified language codes get a population of 0.

    >>> Language.get('xyz').writing_population()
    0

    >>> Language.get('und').writing_population()
    0
    """
    try:
        from language_data.population_data import LANGUAGE_WRITING_POPULATION
    except ImportError:
        print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout)
        raise

    core = self._filter_attributes(['language', 'script', 'territory'])
    exact_tag = str(core)
    if exact_tag in LANGUAGE_WRITING_POPULATION:
        return LANGUAGE_WRITING_POPULATION[exact_tag]

    # No exact entry: retry with the script simplified, defaulting to 0.
    simplified_tag = str(core.simplify_script())
    return LANGUAGE_WRITING_POPULATION.get(simplified_tag, 0)
|
|||
|
|
|||
|
@staticmethod
def find_name(
    tagtype: str, name: str, language: Optional[Union[str, 'Language']] = None
) -> 'Language':
    """
    Find the subtag of a particular `tagtype` that has the given `name`.
    Requires that `language_data` is installed.

    The default language, "und", allows matching names in any language,
    so you can get the code 'fr' by looking up "French", "Français", or
    "francés".

    Occasionally, names are ambiguous in a way that can be resolved by
    specifying what language the name is supposed to be in. For example,
    there is a language named 'Malayo' in English, but it's different from
    the language named 'Malayo' in Spanish (which is Malay). Specifying the
    language will look up the name in a trie that is only in that language.

    In a previous version, we thought we were going to deprecate the
    `language` parameter, as there weren't significant cases of conflicts
    in names of things between languages. Well, we got more data, and
    conflicts in names are everywhere.

    Specifying the language that the name should be in is still not
    required, but it will help to make sure that names can be
    round-tripped.

    >>> Language.find_name('language', 'francés')
    Language.make(language='fr')

    >>> Language.find_name('territory', 'United Kingdom')
    Language.make(territory='GB')

    >>> Language.find_name('script', 'Arabic')
    Language.make(script='Arab')

    >>> Language.find_name('language', 'norsk bokmål')
    Language.make(language='nb')

    >>> Language.find_name('language', 'norsk')
    Language.make(language='no')

    >>> Language.find_name('language', 'norsk', 'en')
    Traceback (most recent call last):
        ...
    LookupError: Can't find any language named 'norsk'

    >>> Language.find_name('language', 'norsk', 'no')
    Language.make(language='no')

    >>> Language.find_name('language', 'malayo', 'en')
    Language.make(language='mbp')

    >>> Language.find_name('language', 'malayo', 'es')
    Language.make(language='ms')

    Some language names resolve to more than a language. For example,
    the name 'Brazilian Portuguese' resolves to a language and a territory,
    and 'Simplified Chinese' resolves to a language and a script. In these
    cases, a Language object with multiple subtags will be returned.

    >>> Language.find_name('language', 'Brazilian Portuguese', 'en')
    Language.make(language='pt', territory='BR')

    >>> Language.find_name('language', 'Simplified Chinese', 'en')
    Language.make(language='zh', script='Hans')

    A small amount of fuzzy matching is supported: if the name can be
    shortened to match a single language name, you get that language.
    This allows, for example, "Hakka dialect" to match "Hakka".

    >>> Language.find_name('language', 'Hakka dialect')
    Language.make(language='hak')
    """
    try:
        from language_data.names import name_to_code
    except ImportError:
        print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout)
        raise

    # Whatever form the naming language arrived in, reduce it to a single
    # language subtag: parse strings first, then take the language part.
    if isinstance(language, str):
        language = get(language)
    if isinstance(language, Language):
        language = language.language
    if language is None:
        language = 'und'

    code = name_to_code(tagtype, name, language)
    if code is None:
        raise LookupError(f"Can't find any {tagtype} named {name!r}")

    if '-' not in code:
        # A bare code fills just the subtag type that was asked for.
        return Language.make(**{tagtype: code})
    return Language.get(code)
|
|||
|
|
|||
|
@staticmethod
def find(
    name: str, language: Optional[Union[str, 'Language']] = None
) -> 'Language':
    """
    A shorthand for `find_name` with the tag type fixed to 'language':
    get a language tag from its name in a natural language. The language
    can be omitted in the large majority of cases, where the language name
    is not ambiguous.

    >>> Language.find('Türkçe')
    Language.make(language='tr')
    >>> Language.find('brazilian portuguese')
    Language.make(language='pt', territory='BR')
    >>> Language.find('simplified chinese')
    Language.make(language='zh', script='Hans')

    Some language names are ambiguous: for example, there is a language
    named 'Fala' in English (with code 'fax'), but 'Fala' is also the
    Kwasio word for French. In this case, specifying the language that
    the name is in is necessary for disambiguation.

    >>> Language.find('fala')
    Language.make(language='fr')
    >>> Language.find('fala', 'nmg')
    Language.make(language='fr')
    >>> Language.find('fala', 'en')
    Language.make(language='fax')
    """
    tagtype = 'language'
    return Language.find_name(tagtype, name, language)
|
|||
|
|
|||
|
def to_dict(self) -> dict:
|
|||
|
"""
|
|||
|
Get a dictionary of the attributes of this Language object, which
|
|||
|
can be useful for constructing a similar object.
|
|||
|
"""
|
|||
|
if self._dict is not None:
|
|||
|
return self._dict
|
|||
|
|
|||
|
result = {}
|
|||
|
for key in self.ATTRIBUTES:
|
|||
|
value = getattr(self, key)
|
|||
|
if value:
|
|||
|
result[key] = value
|
|||
|
self._dict = result
|
|||
|
return result
|
|||
|
|
|||
|
def update(self, other: 'Language') -> 'Language':
    """
    Return a Language that combines this one with `other`.

    For each field, the value from `other` is used when it is set;
    otherwise the value from this object is kept.
    """
    fields = (
        'language',
        'extlangs',
        'script',
        'territory',
        'variants',
        'extensions',
        'private',
    )
    merged = {
        field: getattr(other, field) or getattr(self, field) for field in fields
    }
    return Language.make(**merged)
|
|||
|
|
|||
|
def update_dict(self, newdata: dict) -> 'Language':
    """
    Return a Language whose attributes are taken from `newdata` where it
    has the key, and from this object otherwise.
    """
    fields = (
        'language',
        'extlangs',
        'script',
        'territory',
        'variants',
        'extensions',
        'private',
    )
    merged = {field: newdata.get(field, getattr(self, field)) for field in fields}
    return Language.make(**merged)
|
|||
|
|
|||
|
@staticmethod
|
|||
|
def _filter_keys(d: dict, keys: Iterable[str]) -> dict:
|
|||
|
"""
|
|||
|
Select a subset of keys from a dictionary.
|
|||
|
"""
|
|||
|
return {key: d[key] for key in keys if key in d}
|
|||
|
|
|||
|
def _filter_attributes(self, keyset: Iterable[str]) -> 'Language':
    """
    Return a Language built from only those attributes of this object
    that are named in `keyset`.
    """
    kept = self._filter_keys(self.to_dict(), keyset)
    return Language.make(**kept)
|
|||
|
|
|||
|
def _searchable_form(self) -> 'Language':
|
|||
|
"""
|
|||
|
Convert a parsed language tag so that the information it contains is in
|
|||
|
the best form for looking up information in the CLDR.
|
|||
|
"""
|
|||
|
if self._searchable is not None:
|
|||
|
return self._searchable
|
|||
|
|
|||
|
self._searchable = (
|
|||
|
self._filter_attributes({'language', 'script', 'territory'})
|
|||
|
.simplify_script()
|
|||
|
.prefer_macrolanguage()
|
|||
|
)
|
|||
|
return self._searchable
|
|||
|
|
|||
|
def __eq__(self, other):
|
|||
|
if self is other:
|
|||
|
return True
|
|||
|
if not isinstance(other, Language):
|
|||
|
return False
|
|||
|
return self._str_tag == other._str_tag
|
|||
|
|
|||
|
def __hash__(self) -> int:
|
|||
|
return hash(id(self))
|
|||
|
|
|||
|
def __getitem__(self, key: str) -> Optional[Union[str, List[str]]]:
|
|||
|
if key in self.ATTRIBUTES:
|
|||
|
return getattr(self, key)
|
|||
|
else:
|
|||
|
raise KeyError(key)
|
|||
|
|
|||
|
def __contains__(self, key: str) -> bool:
|
|||
|
return key in self.ATTRIBUTES and getattr(self, key)
|
|||
|
|
|||
|
def __repr__(self) -> str:
|
|||
|
items = []
|
|||
|
for attr in self.ATTRIBUTES:
|
|||
|
if getattr(self, attr):
|
|||
|
value = getattr(self, attr)
|
|||
|
items.append(f'{attr}={value!r}')
|
|||
|
joined = ', '.join(items)
|
|||
|
return f"Language.make({joined})"
|
|||
|
|
|||
|
def __str__(self) -> str:
|
|||
|
return self.to_tag()
|
|||
|
|
|||
|
|
|||
|
# Module-level aliases for the Language.get(), Language.find(), and
# Language.find_name() functions, so they can be called directly at the top
# level of this module.
get = Language.get
find = Language.find
find_name = Language.find_name

# Backward compatibility: keep the Language class available under its old
# name, LanguageData.
LanguageData = Language
|
|||
|
|
|||
|
|
|||
|
def standardize_tag(tag: Union[str, Language], macro: bool = False) -> str:
    """
    Convert a language tag to its standard form:

    - Deprecated values are replaced with their updated versions (if those
      exist)
    - Script tags that are redundant with the language are removed
    - If *macro* is True, a macrolanguage is used to represent the most
      common standardized language within that macrolanguage. For example,
      'cmn' (Mandarin) becomes 'zh' (Chinese), and 'arb' (Modern Standard
      Arabic) becomes 'ar' (Arabic).
    - The result is formatted according to the conventions of BCP 47

    Macrolanguage replacement is not required by BCP 47, but it is required
    by the Unicode CLDR.

    >>> standardize_tag('en_US')
    'en-US'

    >>> standardize_tag('en-Latn')
    'en'

    >>> standardize_tag('en-uk')
    'en-GB'

    >>> standardize_tag('eng')
    'en'

    >>> standardize_tag('arb-Arab', macro=True)
    'ar'

    >>> standardize_tag('sh-QU')
    'sr-Latn-EU'

    >>> standardize_tag('sgn-US')
    'ase'

    >>> standardize_tag('zh-cmn-hans-cn')
    'zh-Hans-CN'

    >>> standardize_tag('zsm', macro=True)
    'ms'

    >>> standardize_tag('ja-latn-hepburn')
    'ja-Latn-hepburn'

    >>> standardize_tag('spa-latn-mx')
    'es-MX'

    If the tag can't be parsed according to BCP 47, this will raise a
    LanguageTagError (a subclass of ValueError):

    >>> standardize_tag('spa-mx-latn')
    Traceback (most recent call last):
        ...
    langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string.
    """
    parsed = Language.get(tag, normalize=True)
    chosen = parsed.prefer_macrolanguage() if macro else parsed
    return chosen.simplify_script().to_tag()
|
|||
|
|
|||
|
|
|||
|
def tag_is_valid(tag: Union[str, Language]) -> bool:
    """
    Decide whether a string is a valid language tag. This is similar to
    Language.get(tag).is_valid(), except that a tag which fails to parse
    gives False instead of raising an error.

    >>> tag_is_valid('ja')
    True
    >>> tag_is_valid('jp')
    False
    >>> tag_is_valid('spa-Latn-MX')
    True
    >>> tag_is_valid('spa-MX-Latn')
    False
    >>> tag_is_valid('')
    False
    >>> tag_is_valid('C.UTF-8')
    False
    """
    try:
        return Language.get(tag).is_valid()
    except LanguageTagError:
        # An unparseable tag is simply not valid.
        return False
|
|||
|
|
|||
|
|
|||
|
def tag_match_score(
    desired: Union[str, Language], supported: Union[str, Language]
) -> int:
    """
    DEPRECATED: use .distance() instead, which uses newer data and is _lower_
    for better matching languages.

    Return a number from 0 to 100 indicating the strength of match between the
    language the user desires, D, and a supported language, S. Higher numbers
    are better. A reasonable cutoff for not messing with your users is to
    only accept scores of 75 or more.

    A score of 100 means the languages are the same, possibly after normalizing
    and filling in likely values.

    Both arguments may be tag strings or Language objects. A
    DeprecationWarning is emitted on every call.
    """
    warnings.warn(
        "tag_match_score is deprecated because it's based on deprecated CLDR info. "
        "Use tag_distance instead, which is _lower_ for better matching languages. ",
        DeprecationWarning,
        # Attribute the warning to the code that called tag_match_score().
        stacklevel=2,
    )
    desired_ld = Language.get(desired)
    supported_ld = Language.get(supported)
    return desired_ld.match_score(supported_ld)
|
|||
|
|
|||
|
|
|||
|
def tag_distance(
    desired: Union[str, Language], supported: Union[str, Language]
) -> int:
    """
    Return the distance from the `desired` language to the `supported` one.

    Both arguments are passed through `Language.get`, and the result is the
    `.distance()` of the desired language to the supported language. Lower
    numbers mean a better match.

    Tags that expand to the same thing when likely values are filled in get a
    distance of 0.

    >>> tag_distance('en', 'en')
    0
    >>> tag_distance('en', 'en-US')
    0
    >>> tag_distance('zh-Hant', 'zh-TW')
    0
    >>> tag_distance('ru-Cyrl', 'ru')
    0

    As a specific example, Serbo-Croatian is a politically contentious idea,
    but in CLDR, it's considered equivalent to Serbian in Latin characters.

    >>> tag_distance('sh', 'sr-Latn')
    0

    ... which is very similar to Croatian but sociopolitically not the same.

    >>> tag_distance('sh', 'hr')
    9

    Unicode reorganized its distinction between 'no' (Norwegian) and 'nb'
    (Norwegian Bokmål) in 2021. 'no' is preferred in most contexts, and the more
    specific 'nb' is a distance of 1 from it:

    >>> tag_distance('nb', 'no')
    1

    These distances can be asymmetrical: this data includes the fact that speakers
    of Swiss German (gsw) know High German (de), but not at all the other way around.

    The difference seems a little bit extreme, but the asymmetry is certainly
    there. And if your text is tagged as 'gsw', it must be that way for a
    reason.

    >>> tag_distance('gsw', 'de')
    8
    >>> tag_distance('de', 'gsw')
    84

    Unconnected languages get a distance of 80 to 134.

    >>> tag_distance('en', 'zh')
    134
    >>> tag_distance('es', 'fr')
    84
    >>> tag_distance('fr-CH', 'de-CH')
    80

    Different local variants of the same language get a small distance (from
    1 to 5 in the examples below).

    >>> tag_distance('zh-HK', 'zh-MO')   # Chinese is similar in Hong Kong and Macao
    4
    >>> tag_distance('en-AU', 'en-GB')   # Australian English is similar to British English
    3
    >>> tag_distance('en-IN', 'en-GB')   # Indian English is also similar to British English
    3
    >>> tag_distance('es-PE', 'es-419')  # Peruvian Spanish is Latin American Spanish
    1
    >>> tag_distance('es-419', 'es-PE')  # but Latin American Spanish is not necessarily Peruvian
    4
    >>> tag_distance('es-ES', 'es-419')  # Spanish in Spain is further from Latin American Spanish
    5
    >>> tag_distance('en-US', 'en-GB')   # American and British English are somewhat different
    5
    >>> tag_distance('es-MX', 'es-ES')   # Mexican Spanish is different from Spanish Spanish
    5
    >>> # European Portuguese is different from the most common form (Brazilian Portuguese)
    >>> tag_distance('pt', 'pt-PT')
    5

    >>> # Serbian has two scripts, and people might prefer one but understand both
    >>> tag_distance('sr-Latn', 'sr-Cyrl')
    5

    A distance of 10 is used for matching a specific language to its
    more-commonly-used macrolanguage tag.

    >>> tag_distance('arz', 'ar')  # Egyptian Arabic to Modern Standard Arabic
    10
    >>> tag_distance('wuu', 'zh')  # Wu Chinese to (Mandarin) Chinese
    10

    Higher distances can arrive due to particularly contentious differences in
    the script for writing the language, where people who understand one script
    can learn the other but may not be happy with it. This specifically applies
    to Chinese.

    >>> tag_distance('zh-TW', 'zh-CN')
    54
    >>> tag_distance('zh-Hans', 'zh-Hant')
    54
    >>> tag_distance('zh-CN', 'zh-HK')
    54
    >>> tag_distance('zh-CN', 'zh-TW')
    54
    >>> tag_distance('zh-Hant', 'zh-Hans')
    54

    This distance range also applies to the differences between Norwegian
    Bokmål, Nynorsk, and Danish.

    >>> tag_distance('no', 'da')
    12
    >>> tag_distance('no', 'nn')
    20

    Differences of 20 to 50 can represent substantially different languages,
    in cases where speakers of the first may understand the second for demographic
    reasons.

    >>> tag_distance('eu', 'es')  # Basque to Spanish
    20
    >>> tag_distance('af', 'nl')  # Afrikaans to Dutch
    24
    >>> tag_distance('mr', 'hi')  # Marathi to Hindi
    30
    >>> tag_distance('ms', 'id')  # Malay to Indonesian
    34
    >>> tag_distance('mg', 'fr')  # Malagasy to French
    34
    >>> tag_distance('ta', 'en')  # Tamil to English
    44

    A complex example is the tag 'yue' for Cantonese. Written Chinese is usually
    presumed to be Mandarin Chinese, but colloquial Cantonese can be written as
    well. (Some things could not be written any other way, such as Cantonese
    song lyrics.)

    The difference between Cantonese and Mandarin also implies script and
    territory differences by default, adding to the distance.

    >>> tag_distance('yue', 'zh')
    64

    When the supported script is a different one than desired, this is usually
    a major difference with a distance of 50 or more.

    >>> tag_distance('ja', 'ja-Latn-US-hepburn')
    54

    >>> # You can read the Shavian script, right?
    >>> tag_distance('en', 'en-Shaw')
    54
    """
    # The desired tag is resolved first, then the supported tag, and the
    # distance is measured *from* desired *to* supported (it is not symmetric).
    return Language.get(desired).distance(Language.get(supported))
|
|||
|
|
|||
|
|
|||
|
def best_match(
    desired_language: Union[str, Language],
    supported_languages: Sequence[str],
    min_score: int = 75,
) -> Tuple[str, int]:
    """
    DEPRECATED: use .closest_match() instead. This function emulates the old
    matching behavior by subtracting the language distance from 100.

    Given the `supported_languages` your software can handle and the
    `desired_language` you would like to use, pick the supported language
    that fits best, even when there is no exact match.

    Returns:

    - The best-matching language code, which will be one of the
      `supported_languages` or 'und'
    - The score of the match, from 0 to 100; higher is better.

    `min_score` sets the minimum match score. If all languages match with a lower
    score than that, the result will be 'und' with a score of 0.
    """
    # A score is 100 minus a distance, so the minimum acceptable score
    # becomes the maximum acceptable distance.
    best_tag, best_distance = closest_match(
        desired_language, supported_languages, 100 - min_score
    )
    # Clamp at 0 so that a distance above 100 never yields a negative score.
    return best_tag, max(0, 100 - best_distance)
|
|||
|
|
|||
|
|
|||
|
def closest_match(
    desired_language: Union[str, Language],
    supported_languages: Sequence[str],
    max_distance: int = 25,
) -> Tuple[str, int]:
    """
    Choose, from `supported_languages`, the language closest to
    `desired_language`, even if there isn't an exact match.

    Returns:

    - The best-matching language code, which will be one of the
      `supported_languages` or 'und' for no match
    - The distance of the match, which is 0 for a perfect match and increases
      from there (see `tag_distance`)

    `max_distance` sets the maximum match distance. If all matches are farther
    than that, the result will be 'und' with a distance of 1000. The default
    value is 25, and raising it can cause data to be processed in significantly
    the wrong language. The documentation for `tag_distance` describes the
    distance values in more detail.

    When there is a tie for the best matching language, the first one in the
    tie will be used.

    >>> closest_match('fr', ['de', 'en', 'fr'])
    ('fr', 0)

    >>> closest_match('pt', ['pt-BR', 'pt-PT'])
    ('pt-BR', 0)

    >>> closest_match('en-AU', ['en-GB', 'en-US'])
    ('en-GB', 3)

    >>> closest_match('af', ['en', 'nl', 'zu'])
    ('nl', 24)

    >>> closest_match('ja', ['ja-Latn-hepburn', 'en'])
    ('und', 1000)
    """
    requested = str(desired_language)

    # Fast path: the tag is supported exactly as it was written.
    if requested in supported_languages:
        return requested, 0

    # Second chance: the standardized form of the tag may be supported.
    requested = standardize_tag(requested)
    if requested in supported_languages:
        return requested, 0

    # Keep every supported language that is within range, in input order.
    candidates = []
    for candidate in supported_languages:
        candidate_distance = tag_distance(requested, candidate)
        if candidate_distance <= max_distance:
            candidates.append((candidate, candidate_distance))

    # The no-match sentinel goes last so that it only wins when nothing else
    # qualified (or when it is strictly closer than everything that did).
    candidates.append(('und', 1000))

    # `min` returns the first of several equally small items, which gives the
    # documented "first one in the tie" behavior.
    return min(candidates, key=itemgetter(1))
|
|||
|
|
|||
|
|
|||
|
def closest_supported_match(
    desired_language: Union[str, Language],
    supported_languages: Sequence[str],
    max_distance: int = 25,
) -> Optional[str]:
    """
    A thin wrapper around `closest_match` that returns only a language tag:
    the closest supported match if one exists, or None otherwise.

    >>> closest_supported_match('fr', ['de', 'en', 'fr'])
    'fr'

    >>> closest_supported_match('pt', ['pt-BR', 'pt-PT'])
    'pt-BR'

    >>> closest_supported_match('en-AU', ['en-GB', 'en-US'])
    'en-GB'

    >>> closest_supported_match('und', ['en', 'und'])
    'und'

    >>> closest_supported_match('af', ['en', 'nl', 'zu'])
    'nl'

    >>> print(closest_supported_match('af', ['en', 'nl', 'zu'], max_distance=10))
    None
    """
    matched_tag, matched_distance = closest_match(
        desired_language, supported_languages, max_distance
    )
    # The test is on the distance, not on the tag: a distance of 1000 marks
    # "no match", whereas 'und' can also be a genuine supported language.
    return None if matched_distance == 1000 else matched_tag
|