mirror of
https://github.com/SickGear/SickGear.git
synced 2024-11-15 09:25:04 +00:00
9009cc7a7b
Add a Select2 drop-down to `add-shows` and `edit-show`. The Select2 enables displaying inline language flag images, this feature deprecated by the native `select` drop-down element on some browsers. Change run existing TVInfo source language lists through validation (removes ~4 bad items), de-dupe list, get the native names, English names, and three letter abbr. Change remove marisa-trie requirement from language_data/names.py because nothing in SG calls a function that requires it. Change update some flags.
1931 lines
68 KiB
Python
1931 lines
68 KiB
Python
"""
|
||
langcodes knows what languages are. It knows the standardized codes that
|
||
refer to them, such as `en` for English, `es` for Spanish and `hi` for Hindi.
|
||
Often, it knows what these languages are called *in* a language, and that
|
||
language doesn't have to be English.
|
||
|
||
See README.md for the main documentation, or read it on GitHub at
|
||
https://github.com/LuminosoInsight/langcodes/ . For more specific documentation
|
||
on the functions in langcodes, scroll down and read the docstrings.
|
||
|
||
Some of these functions, particularly those that work with the names of
|
||
languages, require the `language_data` module to be installed.
|
||
"""
|
||
from operator import itemgetter
|
||
from typing import Any, List, Tuple, Dict, Sequence, Iterable, Optional, Mapping, Union
|
||
import warnings
|
||
import sys
|
||
|
||
from langcodes.tag_parser import LanguageTagError, parse_tag, normalize_characters
|
||
from langcodes.language_distance import tuple_distance_cached
|
||
from langcodes.data_dicts import (
|
||
ALL_SCRIPTS,
|
||
DEFAULT_SCRIPTS,
|
||
LANGUAGE_REPLACEMENTS,
|
||
LANGUAGE_ALPHA3,
|
||
LANGUAGE_ALPHA3_BIBLIOGRAPHIC,
|
||
TERRITORY_REPLACEMENTS,
|
||
NORMALIZED_MACROLANGUAGES,
|
||
LIKELY_SUBTAGS,
|
||
VALIDITY,
|
||
)
|
||
|
||
# When we're getting natural language information *about* languages, it's in
# English if you don't specify the language. This is the default value of the
# `language` parameter of the naming methods, and the last-resort key used
# when no better-matching name is found.
DEFAULT_LANGUAGE = 'en'


# Help text printed by the naming helpers (`has_name_data`, `_get_name`) when
# the optional `language_data` package cannot be imported, just before the
# ImportError is re-raised.
LANGUAGE_NAME_IMPORT_MESSAGE = """
Looking up language names now requires the `language_data` package.

Install it with:
pip install language_data
Or as an optional feature of langcodes:
pip install langcodes[data]
"""
|
||
|
||
|
||
class Language:
|
||
"""
|
||
The Language class defines the results of parsing a language tag.
|
||
Language objects have the following attributes, any of which may be
|
||
unspecified (in which case their value is None):
|
||
|
||
- *language*: the code for the language itself.
|
||
- *script*: the 4-letter code for the writing system being used.
|
||
- *territory*: the 2-letter or 3-digit code for the country or similar territory
|
||
of the world whose usage of the language appears in this text.
|
||
- *extlangs*: a list of more specific language codes that follow the language
|
||
code. (This is allowed by the language code syntax, but deprecated.)
|
||
- *variants*: codes for specific variations of language usage that aren't
|
||
covered by the *script* or *territory* codes.
|
||
- *extensions*: information that's attached to the language code for use in
|
||
some specific system, such as Unicode collation orders.
|
||
- *private*: a code starting with `x-` that has no defined meaning.
|
||
|
||
The `Language.get` method converts a string to a Language instance.
|
||
It's also available at the top level of this module as the `get` function.
|
||
"""
|
||
|
||
ATTRIBUTES = [
|
||
'language',
|
||
'extlangs',
|
||
'script',
|
||
'territory',
|
||
'variants',
|
||
'extensions',
|
||
'private',
|
||
]
|
||
|
||
# When looking up "likely subtags" data, we try looking up the data for
|
||
# increasingly less specific versions of the language code.
|
||
BROADER_KEYSETS = [
|
||
{'language', 'script', 'territory'},
|
||
{'language', 'territory'},
|
||
{'language', 'script'},
|
||
{'language'},
|
||
{'script'},
|
||
{},
|
||
]
|
||
|
||
MATCHABLE_KEYSETS = [
|
||
{'language', 'script', 'territory'},
|
||
{'language', 'script'},
|
||
{'language'},
|
||
]
|
||
|
||
# Values cached at the class level
|
||
_INSTANCES: Dict[tuple, 'Language'] = {}
|
||
_PARSE_CACHE: Dict[Tuple[str, bool], 'Language'] = {}
|
||
|
||
def __init__(
    self,
    language: Optional[str] = None,
    extlangs: Optional[Sequence[str]] = None,
    script: Optional[str] = None,
    territory: Optional[str] = None,
    variants: Optional[Sequence[str]] = None,
    extensions: Optional[Sequence[str]] = None,
    private: Optional[str] = None,
):
    """
    The constructor for Language objects.

    It's inefficient to call this directly, because it can't return
    an existing instance. Instead, call Language.make(), which
    has the same signature.

    Each argument is stored unchanged on the instance under the attribute
    of the same name; an argument left as None means that part of the tag
    is unspecified.
    """
    self.language = language
    self.extlangs = extlangs
    self.script = script
    self.territory = territory
    self.variants = variants
    self.extensions = extensions
    self.private = private

    # Cached values. Every slot starts out as None; methods such as
    # simplify_script(), assume_script(), prefer_macrolanguage(),
    # broader_tags(), maximize() and to_tag() fill in their own slot the
    # first time they are called and return the stored value afterwards.
    self._simplified: Optional['Language'] = None
    self._searchable: Optional['Language'] = None
    self._broader: Optional[List[str]] = None
    self._assumed: Optional['Language'] = None
    self._filled: Optional['Language'] = None
    self._macrolanguage: Optional['Language'] = None
    self._str_tag: Optional[str] = None
    self._dict: Optional[dict] = None
    self._disp_separator: Optional[str] = None
    self._disp_pattern: Optional[str] = None

    # Make sure the str_tag value is cached
    self.to_tag()
|
||
|
||
@classmethod
def make(
    cls,
    language: Optional[str] = None,
    extlangs: Optional[Sequence[str]] = None,
    script: Optional[str] = None,
    territory: Optional[str] = None,
    variants: Optional[Sequence[str]] = None,
    extensions: Optional[Sequence[str]] = None,
    private: Optional[str] = None,
) -> 'Language':
    """
    Build a Language from any subset of its attributes, reusing instances.

    The attributes are folded into a hashable key (the sequence-valued
    ones become tuples, with None treated as empty). If an instance was
    already created for that key, that same object is returned; otherwise
    a new one is constructed, remembered in `_INSTANCES`, and returned.
    """
    cache_key = (
        language,
        tuple(extlangs or ()),
        script,
        territory,
        tuple(variants or ()),
        tuple(extensions or ()),
        private,
    )
    try:
        return cls._INSTANCES[cache_key]
    except KeyError:
        # Not built yet: fall through and create it below.
        pass

    created = cls(
        language=language,
        extlangs=extlangs,
        script=script,
        territory=territory,
        variants=variants,
        extensions=extensions,
        private=private,
    )
    cls._INSTANCES[cache_key] = created
    return created
|
||
|
||
@staticmethod
def get(tag: Union[str, 'Language'], normalize=True) -> 'Language':
    """
    Create a Language object from a language tag string.

    If normalize=True, non-standard or overlong tags will be replaced as
    they're interpreted. This is recommended.

    Results are memoized in `Language._PARSE_CACHE`, keyed by the tag
    string that was asked for together with the `normalize` flag.

    Here are several examples of language codes, which are also test cases.
    Most language codes are straightforward, but these examples will get
    pretty obscure toward the end.

    >>> Language.get('en-US')
    Language.make(language='en', territory='US')

    >>> Language.get('zh-Hant')
    Language.make(language='zh', script='Hant')

    >>> Language.get('und')
    Language.make()

    This function is idempotent, in case you already have a Language object:

    >>> Language.get(Language.get('en-us'))
    Language.make(language='en', territory='US')

    The non-code 'root' is sometimes used to represent the lack of any
    language information, similar to 'und'.

    >>> Language.get('root')
    Language.make()

    By default, getting a Language object will automatically convert
    deprecated tags:

    >>> Language.get('iw')
    Language.make(language='he')

    >>> Language.get('in')
    Language.make(language='id')

    One type of deprecated tag that should be replaced is for sign
    languages, which used to all be coded as regional variants of a
    fictitious global sign language called 'sgn'. Of course, there is no
    global sign language, so sign languages now have their own language
    codes.

    >>> Language.get('sgn-US')
    Language.make(language='ase')

    >>> Language.get('sgn-US', normalize=False)
    Language.make(language='sgn', territory='US')

    'en-gb-oed' is a tag that's grandfathered into the standard because it
    has been used to mean "spell-check this with Oxford English Dictionary
    spelling", but that tag has the wrong shape. We interpret this as the
    new standardized tag 'en-gb-oxendict', unless asked not to normalize.

    >>> Language.get('en-gb-oed')
    Language.make(language='en', territory='GB', variants=['oxendict'])

    >>> Language.get('en-gb-oed', normalize=False)
    Language.make(language='en-gb-oed')

    'zh-min-nan' is another oddly-formed tag, used to represent the
    Southern Min language, which includes Taiwanese as a regional form. It
    now has its own language code.

    >>> Language.get('zh-min-nan')
    Language.make(language='nan')

    The vague tag 'zh-min' is now also interpreted as 'nan', with a private
    extension indicating that it had a different form:

    >>> Language.get('zh-min')
    Language.make(language='nan', private='x-zh-min')

    Occasionally Wiktionary will use 'extlang' tags in strange ways, such
    as using the tag 'und-ibe' for some unspecified Iberian language.

    >>> Language.get('und-ibe')
    Language.make(extlangs=['ibe'])

    Here's an example of replacing multiple deprecated tags.

    The language tag 'sh' (Serbo-Croatian) ended up being politically
    problematic, and different standards took different steps to address
    this. The IANA made it into a macrolanguage that contains 'sr', 'hr',
    and 'bs'. Unicode further decided that it's a legacy tag that should
    be interpreted as 'sr-Latn', which the language matching rules say
    is mutually intelligible with all those languages.

    We complicate the example by adding on the territory tag 'QU', an old
    provisional tag for the European Union, which is now standardized as
    'EU'.

    >>> Language.get('sh-QU')
    Language.make(language='sr', script='Latn', territory='EU')
    """
    if isinstance(tag, Language):
        if not normalize:
            # shortcut: we have the tag already
            return tag

        # We might need to normalize this tag. Convert it back into a
        # string tag, to cover all the edge cases of normalization in a
        # way that we've already solved.
        tag = tag.to_tag()

    # Remember the key that was actually requested. `tag` may be swapped
    # for its whole-tag replacement below; storing the result under the
    # replaced string would mean the requested spelling never hits the
    # cache and gets re-parsed on every call.
    cache_key = (tag, normalize)
    if cache_key in Language._PARSE_CACHE:
        return Language._PARSE_CACHE[cache_key]

    data: Dict[str, Any] = {}

    # If the complete tag appears as something to normalize, do the
    # normalization right away. Smash case and convert underscores to
    # hyphens when checking, because the case normalization that comes from
    # parse_tag() hasn't been applied yet.

    tag_lower = normalize_characters(tag)
    if normalize and tag_lower in LANGUAGE_REPLACEMENTS:
        tag = LANGUAGE_REPLACEMENTS[tag_lower]

    components = parse_tag(tag)

    for typ, value in components:
        if typ == 'extlang' and normalize and 'language' in data:
            # smash extlangs when possible
            minitag = f"{data['language']}-{value}"
            norm = LANGUAGE_REPLACEMENTS.get(normalize_characters(minitag))
            if norm is not None:
                data.update(Language.get(norm, normalize).to_dict())
            else:
                data.setdefault('extlangs', []).append(value)
        elif typ in {'extlang', 'variant', 'extension'}:
            # These subtag types can repeat, so they accumulate in lists
            # under the pluralized attribute name.
            data.setdefault(typ + 's', []).append(value)
        elif typ == 'language':
            if value == 'und':
                # 'und' means "no language"; leave the attribute unset.
                pass
            elif normalize:
                replacement = LANGUAGE_REPLACEMENTS.get(value.lower())
                if replacement is not None:
                    # parse the replacement if necessary -- this helps with
                    # Serbian and Moldovan
                    data.update(Language.get(replacement, normalize).to_dict())
                else:
                    data['language'] = value
            else:
                data['language'] = value
        elif typ == 'territory':
            if normalize:
                data['territory'] = TERRITORY_REPLACEMENTS.get(value.lower(), value)
            else:
                data['territory'] = value
        elif typ == 'grandfathered':
            # If we got here, we got a grandfathered tag but we were asked
            # not to normalize it, or the CLDR data doesn't know how to
            # normalize it. The best we can do is set the entire tag as the
            # language.
            data['language'] = value
        else:
            data[typ] = value

    result = Language.make(**data)
    Language._PARSE_CACHE[cache_key] = result
    return result
|
||
|
||
def to_tag(self) -> str:
    """
    Render this Language as a standard language tag string.

    The same string is what str() gives for a Language object. The result
    is computed once and then remembered on the instance.

    >>> Language.make(language='en', territory='GB').to_tag()
    'en-GB'

    >>> Language.make(language='yue', script='Hant', territory='HK').to_tag()
    'yue-Hant-HK'

    >>> Language.make(script='Arab').to_tag()
    'und-Arab'

    >>> str(Language.make(territory='IN'))
    'und-IN'
    """
    cached = self._str_tag
    if cached is not None:
        return cached

    # A missing language is spelled 'und' so the tag always has a head.
    parts = [self.language or 'und']
    parts.extend(sorted(self.extlangs or ()))
    for single in (self.script, self.territory):
        if single:
            parts.append(single)
    parts.extend(sorted(self.variants or ()))
    # Extensions keep the order they were given in; they are not sorted.
    parts.extend(self.extensions or ())
    if self.private:
        parts.append(self.private)

    self._str_tag = '-'.join(parts)
    return self._str_tag
|
||
|
||
def simplify_script(self) -> 'Language':
    """
    Drop the script subtag when it is the default script of the language.

    If the script differs from the language's default (or either part is
    missing), the object itself is the result. The answer is remembered on
    the instance after the first call.

    >>> Language.make(language='en', script='Latn').simplify_script()
    Language.make(language='en')

    >>> Language.make(language='yi', script='Latn').simplify_script()
    Language.make(language='yi', script='Latn')

    >>> Language.make(language='yi', script='Hebr').simplify_script()
    Language.make(language='yi')
    """
    if self._simplified is None:
        script_is_default = (
            self.language
            and self.script
            and DEFAULT_SCRIPTS.get(self.language) == self.script
        )
        if script_is_default:
            self._simplified = self.update_dict({'script': None})
        else:
            self._simplified = self
    return self._simplified
|
||
|
||
def assume_script(self) -> 'Language':
    """
    Add the script subtag when it is absent and the language implies one.
    This is the inverse of `simplify_script`.

    >>> Language.make(language='en').assume_script()
    Language.make(language='en', script='Latn')

    >>> Language.make(language='yi').assume_script()
    Language.make(language='yi', script='Hebr')

    >>> Language.make(language='yi', script='Latn').assume_script()
    Language.make(language='yi', script='Latn')

    Nothing is filled in when no single script can be assumed -- for
    example when the language is written in several scripts, or has no
    standard orthography:

    >>> Language.make(language='sr').assume_script()
    Language.make(language='sr')

    >>> Language.make(language='eee').assume_script()
    Language.make(language='eee')

    It also doesn't fill anything in when the language is unspecified.

    >>> Language.make(territory='US').assume_script()
    Language.make(territory='US')
    """
    if self._assumed is not None:
        return self._assumed

    # Start from "no change" and only replace it when a default script
    # is known for a language that has no script yet.
    guess = self
    if self.language and not self.script:
        try:
            guess = self.update_dict({'script': DEFAULT_SCRIPTS[self.language]})
        except KeyError:
            guess = self

    self._assumed = guess
    return self._assumed
|
||
|
||
def prefer_macrolanguage(self) -> 'Language':
    """
    Swap the language for its macrolanguage when Unicode CLDR prefers that.

    BCP 47 is silent on how macrolanguages relate to the languages they
    contain. CLDR says that when one standardized language dominates a
    macrolanguage, the macrolanguage code should stand for it: Mandarin
    Chinese is 'zh' rather than 'cmn', and Malay is 'ms' rather than 'zsm'.

    You would not always want this -- you may need to say that 'ms' (the
    Malay macrolanguage) contains both 'zsm' (Standard Malay) and 'id'
    (Indonesian) -- but following the rule helps when interoperating with
    CLDR data.

    The result replaces the language with its macrolanguage only when the
    language is the dominant one; languages that merely belong to a
    macrolanguage are left alone. The answer is remembered on the instance.

    >>> Language.get('arb').prefer_macrolanguage()
    Language.make(language='ar')

    >>> Language.get('cmn-Hant').prefer_macrolanguage()
    Language.make(language='zh', script='Hant')

    >>> Language.get('yue-Hant').prefer_macrolanguage()
    Language.make(language='yue', script='Hant')
    """
    if self._macrolanguage is None:
        code = self.language or 'und'
        try:
            preferred = NORMALIZED_MACROLANGUAGES[code]
        except KeyError:
            # No dominant-language mapping: keep the object as it is.
            self._macrolanguage = self
        else:
            self._macrolanguage = self.update_dict({'language': preferred})
    return self._macrolanguage
|
||
|
||
def to_alpha3(self, variant: str = 'T') -> str:
    """
    Get the three-letter language code for this language, even if it's
    canonically written with a two-letter code.

    These codes are the 'alpha3' codes defined by ISO 639-2.

    When this function returns, it always returns a 3-letter string. If
    there is no known alpha3 code for the language, it raises a LookupError.

    In cases where the distinction matters, we default to the 'terminology'
    code. You can pass `variant='B'` to get the 'bibliographic' code instead.
    For example, the terminology code for German is 'deu', while the
    bibliographic code is 'ger'.

    (The confusion between these two sets of codes is a good reason to avoid
    using alpha3 codes. Every language that has two different alpha3 codes
    also has an alpha2 code that's preferred, such as 'de' for German.)

    `variant` is case-insensitive. Any value other than 'B' or 'T'
    (including the empty string) raises a ValueError.

    >>> Language.get('fr').to_alpha3()
    'fra'
    >>> Language.get('fr-CA').to_alpha3()
    'fra'
    >>> Language.get('fr').to_alpha3(variant='B')
    'fre'
    >>> Language.get('de').to_alpha3(variant='T')
    'deu'
    >>> Language.get('ja').to_alpha3()
    'jpn'
    >>> Language.get('un').to_alpha3()
    Traceback (most recent call last):
        ...
    LookupError: 'un' is not a known language code, and has no alpha3 code.


    All valid two-letter language codes have corresponding alpha3 codes,
    even the un-normalized ones. If they were assigned an alpha3 code by ISO
    before they were assigned a normalized code by CLDR, these codes may be
    different:

    >>> Language.get('tl', normalize=False).to_alpha3()
    'tgl'
    >>> Language.get('tl').to_alpha3()
    'fil'
    >>> Language.get('sh', normalize=False).to_alpha3()
    'hbs'


    Three-letter codes are preserved, even if they're unknown:

    >>> Language.get('qqq').to_alpha3()
    'qqq'
    >>> Language.get('und').to_alpha3()
    'und'
    """
    variant = variant.upper()
    # Compare against the two allowed values explicitly. A substring test
    # against the string 'BT' would also let '' and 'BT' through.
    if variant not in ('B', 'T'):
        raise ValueError("Variant must be 'B' or 'T'")

    language = self.language
    if language is None:
        return 'und'
    if len(language) == 3:
        # Already a three-letter code; returned as-is, known or not.
        return language

    # The bibliographic table is consulted only for variant 'B'; any
    # language missing from it falls back to the terminology table.
    if variant == 'B' and language in LANGUAGE_ALPHA3_BIBLIOGRAPHIC:
        return LANGUAGE_ALPHA3_BIBLIOGRAPHIC[language]
    if language in LANGUAGE_ALPHA3:
        return LANGUAGE_ALPHA3[language]
    raise LookupError(
        f"{language!r} is not a known language code, "
        "and has no alpha3 code."
    )
|
||
|
||
def broader_tags(self) -> List[str]:
    """
    List this language's tag followed by increasingly general tags.

    This is of limited use for matching two arbitrary tags against each
    other, but it is useful for matching a tag against a known standardized
    form, such as the keys of the CLDR data.

    The order of broader versions to try comes from UTR 35, section 4.3,
    "Likely Subtags". The list is built once and remembered on the instance.

    >>> Language.get('nn-Latn-NO-x-thingy').broader_tags()
    ['nn-Latn-NO-x-thingy', 'nn-Latn-NO', 'nn-NO', 'nn-Latn', 'nn', 'und-Latn', 'und']

    >>> Language.get('arb-Arab').broader_tags()
    ['arb-Arab', 'ar-Arab', 'arb', 'ar', 'und-Arab', 'und']
    """
    if self._broader is not None:
        return self._broader

    own_tag = self.to_tag()
    collected = self._broader = [own_tag]
    emitted = {own_tag}
    for keyset in self.BROADER_KEYSETS:
        # For each keyset, try the language as given first and then its
        # macrolanguage-preferred form.
        for candidate in (self, self.prefer_macrolanguage()):
            candidate_tag = candidate._filter_attributes(keyset).to_tag()
            if candidate_tag in emitted:
                continue
            collected.append(candidate_tag)
            emitted.add(candidate_tag)
    return self._broader
|
||
|
||
def broaden(self) -> 'List[Language]':
    """
    Like `broader_tags`, but returns Language objects instead of strings.

    Each tag string is turned into a Language with `Language.get`.
    """
    return list(map(Language.get, self.broader_tags()))
|
||
|
||
def maximize(self) -> 'Language':
    """
    Fill in the missing parts of this tag with their most likely values.

    The Unicode CLDR ships a "likelySubtags" data file that guesses
    reasonable values for fields a language tag leaves out.

    That is particularly handy for comparing, say, "zh-Hant" and "zh-TW",
    two common tags that say roughly the same thing through different
    information. (Writing in traditional Han characters is not the same as
    being in Taiwan, but each makes the other likely.)

    The implications come from the CLDR supplemental data and reflect how
    likely people are to use a language to transmit text on the Internet.
    (That is why the overall default is English, not Chinese.)

    Keep in mind that these tags amplify majorities, and that not all
    language support fits into a "likely" language tag.

    The first of this language's `broader_tags()` that has likely-subtags
    data supplies the defaults; the values already present on this object
    are then applied over them. The result is remembered on the instance.

    >>> str(Language.get('zh-Hant').maximize())
    'zh-Hant-TW'
    >>> str(Language.get('zh-TW').maximize())
    'zh-Hant-TW'
    >>> str(Language.get('ja').maximize())
    'ja-Jpan-JP'
    >>> str(Language.get('pt').maximize())
    'pt-Latn-BR'
    >>> str(Language.get('und-Arab').maximize())
    'ar-Arab-EG'
    >>> str(Language.get('und-CH').maximize())
    'de-Latn-CH'

    As many standards are, this is US-centric:

    >>> str(Language.make().maximize())
    'en-Latn-US'

    "Extlangs" have no likely-subtags information, so they will give
    maximized results that make no sense:

    >>> str(Language.get('und-ibe').maximize())
    'en-ibe-Latn-US'
    """
    if self._filled is not None:
        return self._filled

    likely_key = next(
        (tag for tag in self.broader_tags() if tag in LIKELY_SUBTAGS), None
    )
    if likely_key is None:
        raise RuntimeError(
            "Couldn't fill in likely values. This represents a problem with "
            "the LIKELY_SUBTAGS data."
        )

    defaults = Language.get(LIKELY_SUBTAGS[likely_key], normalize=False)
    self._filled = defaults.update(self)
    return self._filled
|
||
|
||
# Support an old, wordier name for the method
|
||
fill_likely_values = maximize
|
||
|
||
def match_score(self, supported: 'Language') -> int:
    """
    DEPRECATED: use .distance() instead, which uses newer data and is _lower_
    for better matching languages.

    Returns 100 minus the distance to `supported`, where distances above
    100 are treated as 100, so the result lies between 0 and 100 and is
    _higher_ for better matches. Emits a DeprecationWarning on every call.
    """
    warnings.warn(
        "`match_score` is deprecated because it's based on deprecated CLDR info. "
        "Use `distance` instead, which is _lower_ for better matching languages. ",
        DeprecationWarning,
        # Attribute the warning to the code that called match_score rather
        # than to this line, so it points at what needs changing and is
        # not hidden by filters that key on the triggering module.
        stacklevel=2,
    )
    return 100 - min(self.distance(supported), 100)
|
||
|
||
def distance(self, supported: 'Language') -> int:
    """
    Measure how far `supported` is from the language the user wants.

    Here `self` is the desired language and `supported` is a language that
    is actually available. The result is a number from 0 to 134, where
    lower is better. The relation is not symmetric.

    The distance is not really about linguistic similarity or the history
    of the languages; it is based largely on sociopolitical factors,
    indicating which other languages speakers are likely to know in the
    present world. Much of the heuristic is about finding a widespread
    'world language' like English, Chinese, French, or Russian that
    speakers of a more localized language will accept.

    A version that works on language tags, as strings, is in the function
    `tag_distance`. See that function for copious examples.
    """
    if supported == self:
        return 0

    def as_triple(lang):
        # CLDR has realized that these matching rules are undermined when
        # the unspecified language 'und' gets maximized to 'en-Latn-US', so
        # a completely unspecified language is deliberately not maximized.
        nothing_given = (
            lang.language is None
            and lang.script is None
            and lang.territory is None
        )
        if nothing_given:
            return ('und', 'Zzzz', 'ZZ')
        complete = lang.prefer_macrolanguage().maximize()
        return (complete.language, complete.script, complete.territory)

    desired_triple = as_triple(self)
    supported_triple = as_triple(supported)
    return tuple_distance_cached(desired_triple, supported_triple)
|
||
|
||
def is_valid(self) -> bool:
    """
    Report whether the language, script, territory, and variants (where
    present) are all subtags that have meanings assigned by IANA.
    For example, 'ja' (Japanese) is a valid tag, and 'jp' is not.

    The data is current as of CLDR 40.

    >>> Language.get('ja').is_valid()
    True
    >>> Language.get('jp').is_valid()
    False
    >>> Language.get('en-001').is_valid()
    True
    >>> Language.get('en-000').is_valid()
    False
    >>> Language.get('en-Latn').is_valid()
    True
    >>> Language.get('en-Latnx').is_valid()
    False
    >>> Language.get('und').is_valid()
    True
    >>> Language.get('en-GB-oxendict').is_valid()
    True
    >>> Language.get('en-GB-oxenfree').is_valid()
    False
    >>> Language.get('x-heptapod').is_valid()
    True

    Some scripts are, confusingly, not included in CLDR's 'validity' pattern.
    If a script appears in the IANA registry, we consider it valid.

    >>> Language.get('ur-Aran').is_valid()
    True
    >>> Language.get('cu-Cyrs').is_valid()
    True

    A language tag with multiple extlangs will parse, but is not valid.
    The only allowed example is 'zh-min-nan', which normalizes to the
    language 'nan'.

    >>> Language.get('zh-min-nan').is_valid()
    True
    >>> Language.get('sgn-ase-bfi').is_valid()
    False

    These examples check that duplicate tags are not valid:

    >>> Language.get('de-1901').is_valid()
    True
    >>> Language.get('de-1901-1901').is_valid()
    False
    >>> Language.get('en-a-bbb-c-ddd').is_valid()
    True
    >>> Language.get('en-a-bbb-a-ddd').is_valid()
    False

    Of course, you should be prepared to catch a failure to parse the
    language code at all:

    >>> Language.get('C').is_valid()
    Traceback (most recent call last):
        ...
    langcodes.tag_parser.LanguageTagError: Expected a language code, got 'c'
    """
    # An erratum to BCP 47 says that tags with more than one extlang are
    # invalid.
    if self.extlangs is not None and len(self.extlangs) > 1:
        return False

    candidates = [self.language, self.script, self.territory]
    candidates.extend(self.variants or ())
    present = [piece for piece in candidates if piece is not None]

    for piece in present:
        # A subtag is acceptable when it is private-use, matches the CLDR
        # validity pattern, or is a script known to the IANA registry.
        if piece.startswith('x-') or VALIDITY.match(piece) or piece in ALL_SCRIPTS:
            continue
        return False

    # We check extensions for validity by ensuring that there aren't
    # two extensions introduced by the same letter. For example, you can't
    # have two 'u-' extensions. The same uniqueness test also rejects a
    # repeated subtag such as a variant given twice.
    unique_once = present + [ext[:2] for ext in (self.extensions or ())]
    return len(set(unique_once)) == len(unique_once)
|
||
|
||
def has_name_data(self) -> bool:
    """
    Tell whether languages can be named *in* this language. Needs the
    `language_data` package; if it cannot be imported, the install hint is
    printed and the ImportError is re-raised.

    The answer is True when this language, or one of its 'broader'
    versions, is in the list of CLDR target languages.

    >>> Language.get('fr').has_name_data()
    True
    >>> Language.get('so').has_name_data()
    True
    >>> Language.get('enc').has_name_data()
    False
    >>> Language.get('und').has_name_data()
    False
    """
    try:
        from language_data.name_data import LANGUAGES_WITH_NAME_DATA
    except ImportError:
        print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout)
        raise

    return any(tag in LANGUAGES_WITH_NAME_DATA for tag in self.broader_tags())
|
||
|
||
# These methods help to show what the language tag means in natural
|
||
# language. They actually apply the language-matching algorithm to find
|
||
# the right language to name things in.
|
||
|
||
def _get_name(
    self, attribute: str, language: Union[str, 'Language'], max_distance: int
) -> str:
    """
    Name one attribute of this Language in the requested language.

    `attribute` must be one of `ATTRIBUTES`. `language` may be a tag string
    or a Language. An unset attribute yields None, except that an unset
    language is looked up as 'und'. When no name is found, the result is a
    placeholder such as "Unknown language [zzz]".

    Needs the `language_data` package; if it cannot be imported, the
    install hint is printed and the ImportError is re-raised.
    """
    try:
        from language_data.names import code_to_names
    except ImportError:
        print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout)
        raise

    assert attribute in self.ATTRIBUTES
    if isinstance(language, str):
        language = Language.get(language)

    code = getattr(self, attribute)
    if code is None:
        if attribute != 'language':
            return None
        code = 'und'

    found = self._best_name(code_to_names(code), language, max_distance)
    if found is not None:
        return found

    # Construct a string like "Unknown language [zzz]". Only these three
    # attributes have an "unknown" code whose name can be looked up.
    unknown_codes = {'language': 'und', 'script': 'Zzzz', 'territory': 'ZZ'}
    fallback_code = unknown_codes.get(attribute)

    label = None
    if fallback_code is not None:
        label = self._best_name(
            code_to_names(fallback_code), language, max_distance
        )
    if label is None:
        label = 'Unknown language subtag'
    return f'{label} [{code}]'
|
||
|
||
def _best_name(
    self, names: Mapping[str, str], language: 'Language', max_distance: int
):
    """
    Pick the entry of `names` whose key best matches `language`.

    `names` maps language tags to a name written in that language. Only
    keys that are among `language.broader_tags()` are offered to
    `closest_match`. If the tag it picks is not a key of `names`, the
    `DEFAULT_LANGUAGE` entry is returned instead, or None when that entry
    is missing as well.
    """
    acceptable = set(language.broader_tags())
    candidates = sorted(code for code in names if code in acceptable)

    best_code, _ = closest_match(language, candidates, max_distance)
    return names[best_code] if best_code in names else names.get(DEFAULT_LANGUAGE)
|
||
|
||
def language_name(
    self,
    language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
    max_distance: int = 25,
) -> str:
    """
    Name the language part of this tag (just the language, not the whole
    tag) in a natural language. The language to answer in may be a tag
    string or another Language object.

    Without arguments, the answer is in English:

    >>> Language.get('fr').language_name()
    'French'
    >>> Language.get('el').language_name()
    'Greek'

    Many other languages can be requested instead:

    >>> Language.get('fr').language_name('fr')
    'français'
    >>> Language.get('el').language_name('fr')
    'grec'

    Why does everyone get Slovak and Slovenian confused? Let's ask them.

    >>> Language.get('sl').language_name('sl')
    'slovenščina'
    >>> Language.get('sk').language_name('sk')
    'slovenčina'
    >>> Language.get('sl').language_name('sk')
    'slovinčina'
    >>> Language.get('sk').language_name('sl')
    'slovaščina'
    """
    return self._get_name(
        attribute='language', language=language, max_distance=max_distance
    )
|
||
|
||
def display_name(
    self,
    language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
    max_distance: int = 25,
) -> str:
    """
    Describe this whole Language object *in a language*, so that a user (or
    you) can read it instead of deciphering short codes.

    `.display_name(language, max_distance)` names the language, and adds
    the script and territory in parentheses when they remain after
    `simplify_script()`. The names come from the IANA language tag registry,
    which is only in English, plus CLDR, which names languages in many
    commonly-used languages.

    The default language for naming things is English:

    >>> Language.make(language='fr').display_name()
    'French'

    >>> Language.make().display_name()
    'Unknown language'

    >>> Language.get('zh-Hans').display_name()
    'Chinese (Simplified)'

    >>> Language.get('en-US').display_name()
    'English (United States)'

    But you can ask for language names in numerous other languages:

    >>> Language.get('fr').display_name('fr')
    'français'

    >>> Language.get('fr').display_name('es')
    'francés'

    >>> Language.make().display_name('es')
    'lengua desconocida'

    >>> Language.get('zh-Hans').display_name('de')
    'Chinesisch (Vereinfacht)'

    >>> Language.get('en-US').display_name('zh-Hans')
    '英语（美国）'
    """
    simplified = self.simplify_script()
    target = Language.get(language)
    base_name = simplified.language_name(target, max_distance)

    # Collect the clarifying details, script before territory, skipping
    # whichever subtags are unset.
    details = []
    for subtag, describe_in in (
        (simplified.script, simplified.script_name),
        (simplified.territory, simplified.territory_name),
    ):
        if subtag is not None:
            details.append(describe_in(target, max_distance))

    if not details:
        return base_name

    # The separator and the parenthesis pattern belong to the language we
    # are answering in, not to the language being described.
    joined = target._display_separator().join(details)
    return target._display_pattern().format(base_name, joined)
|
||
|
||
def _display_pattern(self) -> str:
    """
    Return the format pattern, following CLDR, that attaches clarifying
    details to a language name. The result is cached on the instance.
    """
    # Strictly speaking this pattern should be looked up per language. In
    # practice it is the same everywhere except Chinese, which uses
    # full-width parentheses.
    if self._disp_pattern is None:
        near_chinese = self.distance(Language.get('zh')) <= 25
        self._disp_pattern = "{0}（{1}）" if near_chinese else "{0} ({1})"
    return self._disp_pattern
|
||
|
||
def _display_separator(self) -> str:
    """
    Get the symbol that should be used to separate multiple clarifying
    details -- such as a comma in English, or an ideographic comma in
    Japanese.

    The answer is cached on the instance. Computing it for the first time
    requires that `language_data` is installed; if it can't be imported, a
    message is printed and the ImportError is re-raised.
    """
    # Consult the cache before anything else, so that repeated calls don't
    # pay for the import statement again.
    if self._disp_separator is not None:
        return self._disp_separator

    try:
        from language_data.names import DISPLAY_SEPARATORS
    except ImportError:
        print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout)
        raise

    # Pick the separator of whichever listed language is closest to this one.
    matched, _dist = closest_match(self, DISPLAY_SEPARATORS.keys())
    self._disp_separator = DISPLAY_SEPARATORS[matched]
    return self._disp_separator
|
||
|
||
def autonym(self, max_distance: int = 9) -> str:
    """
    Name this language *in* this language, using its display name.
    Requires that `language_data` is installed.

    >>> Language.get('fr').autonym()
    'français'
    >>> Language.get('es').autonym()
    'español'
    >>> Language.get('ja').autonym()
    '日本語'

    Because this goes through `display_name()`, the result can mention a
    script or territory when appropriate.

    >>> Language.get('en-AU').autonym()
    'English (Australia)'
    >>> Language.get('sr-Latn').autonym()
    'srpski (latinica)'
    >>> Language.get('sr-Cyrl').autonym()
    'српски (ћирилица)'
    >>> Language.get('pa').autonym()
    'ਪੰਜਾਬੀ'
    >>> Language.get('pa-Arab').autonym()
    'پنجابی (عربی)'

    This only works for language codes that CLDR has locale data for. You
    can't ask for the autonym of 'ja-Latn' and get 'nihongo (rōmaji)'.
    """
    # The macrolanguage form is both the thing being named and the language
    # it is named in.
    own_language = self.prefer_macrolanguage()
    return own_language.display_name(
        language=own_language, max_distance=max_distance
    )
|
||
|
||
def script_name(
    self,
    language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
    max_distance: int = 25,
) -> str:
    """
    Name the script subtag of this tag in a natural language, which may be
    given as a tag string or a Language object.
    Requires that `language_data` is installed.
    """
    return self._get_name(
        attribute='script', language=language, max_distance=max_distance
    )
|
||
|
||
def territory_name(
    self,
    language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
    max_distance: int = 25,
) -> str:
    """
    Name the territory subtag of this tag in a natural language, which may
    be given as a tag string or a Language object.
    Requires that `language_data` is installed.
    """
    return self._get_name(
        attribute='territory', language=language, max_distance=max_distance
    )
|
||
|
||
def region_name(
    self,
    language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
    max_distance: int = 25,
) -> str:
    """
    Deprecated alias of `territory_name`.

    Emits a DeprecationWarning, then returns whatever
    `territory_name(language, max_distance)` returns.
    """
    warnings.warn(
        "`region_name` has been renamed to `territory_name` for consistency",
        DeprecationWarning,
        # Attribute the warning to the code that called us, so it points at
        # the line that needs changing rather than at this module.
        stacklevel=2,
    )
    return self.territory_name(language, max_distance)
|
||
|
||
@property
def region(self):
    """
    Deprecated alias of the `territory` attribute.

    Emits a DeprecationWarning, then returns `self.territory`.
    """
    warnings.warn(
        "The `region` property has been renamed to `territory` for consistency",
        DeprecationWarning,
        # Attribute the warning to the code that read the property, so it
        # points at the line that needs changing rather than at this module.
        stacklevel=2,
    )
    return self.territory
|
||
|
||
def variant_names(
    self,
    language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
    max_distance: int = 25,
) -> Sequence[str]:
    """
    Deprecated in version 3.0.

    We don't store names for variants anymore, so this just returns the list
    of variant codes, such as ['oxendict'] for en-GB-oxendict, or an empty
    list when there are none. The `language` and `max_distance` arguments
    are accepted but not used.

    Emits a DeprecationWarning.
    """
    warnings.warn(
        "variant_names is deprecated and just returns the variant codes",
        DeprecationWarning,
        # Attribute the warning to the code that called us, so it points at
        # the line that needs changing rather than at this module.
        stacklevel=2,
    )
    return self.variants or []
|
||
|
||
def describe(
    self,
    language: Union[str, 'Language'] = DEFAULT_LANGUAGE,
    max_distance: int = 25,
) -> dict:
    """
    Build a dictionary that describes this language tag in a chosen natural
    language. Requires that `language_data` is installed.

    The dictionary has a 'language', 'script', and 'territory' entry for
    each of those subtags that is set. See `language_name` and related
    methods for more specific versions of this.

    The desired `language` will in fact be matched against the available
    options using the matching technique that this module provides. We can
    illustrate many aspects of this by asking for a description of Shavian
    script (a phonetic script for English devised by author George Bernard
    Shaw), and where you might find it, in various languages.

    >>> shaw = Language.make(script='Shaw').maximize()
    >>> shaw.describe('en')
    {'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'}

    >>> shaw.describe('fr')
    {'language': 'anglais', 'script': 'shavien', 'territory': 'Royaume-Uni'}

    >>> shaw.describe('es')
    {'language': 'inglés', 'script': 'shaviano', 'territory': 'Reino Unido'}

    >>> shaw.describe('pt')
    {'language': 'inglês', 'script': 'shaviano', 'territory': 'Reino Unido'}

    >>> shaw.describe('uk')
    {'language': 'англійська', 'script': 'шоу', 'territory': 'Велика Британія'}

    >>> shaw.describe('arb')
    {'language': 'الإنجليزية', 'script': 'الشواني', 'territory': 'المملكة المتحدة'}

    >>> shaw.describe('th')
    {'language': 'อังกฤษ', 'script': 'ซอเวียน', 'territory': 'สหราชอาณาจักร'}

    >>> shaw.describe('zh-Hans')
    {'language': '英语', 'script': '萧伯纳式文', 'territory': '英国'}

    >>> shaw.describe('zh-Hant')
    {'language': '英文', 'script': '簫柏納字符', 'territory': '英國'}

    >>> shaw.describe('ja')
    {'language': '英語', 'script': 'ショー文字', 'territory': 'イギリス'}

    When we don't have a localization for the language, we fall back on English,
    because the IANA provides names for all known codes in English.

    >>> shaw.describe('lol')
    {'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'}

    When the language tag itself is a valid tag but with no known meaning, we
    say so in the appropriate language.

    >>> Language.get('xyz-ZY').display_name()
    'Unknown language [xyz] (Unknown Region [ZY])'

    >>> Language.get('xyz-ZY').display_name('es')
    'lengua desconocida [xyz] (Región desconocida [ZY])'
    """
    described = {}
    # Each subtag `x` that is set is named by the matching `x_name` method.
    for part in ('language', 'script', 'territory'):
        if getattr(self, part):
            name_part = getattr(self, f'{part}_name')
            described[part] = name_part(language, max_distance)
    return described
|
||
|
||
def speaking_population(self) -> int:
    """
    Estimate how many people in the world speak this language, using data
    derived from CLDR. Requires that `language_data` is installed.

    Only the language and territory codes are considered. When a territory
    code is present, only speakers of the language in that territory are
    counted.

    Script subtags are disregarded, because it doesn't make sense to ask
    how many people speak in a particular writing script.

    >>> Language.get('es').speaking_population()
    487664083
    >>> Language.get('pt').speaking_population()
    237135429
    >>> Language.get('es-BR').speaking_population()
    76218
    >>> Language.get('pt-BR').speaking_population()
    192661560
    >>> Language.get('vo').speaking_population()
    0
    """
    try:
        from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
    except ImportError:
        print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout)
        raise

    # The table is keyed by tag strings; tags it doesn't list count as 0.
    key = str(self._filter_attributes(['language', 'territory']))
    return LANGUAGE_SPEAKING_POPULATION.get(key, 0)
|
||
|
||
def writing_population(self) -> int:
    """
    Estimate how many people in the world read and write this language,
    using data derived from CLDR. Requires that `language_data` is
    installed.

    For many languages that aren't typically written, this is an
    overestimate, according to CLDR -- the data often includes people who
    speak that language but write in a different language.

    Only the language, script, and territory codes are considered. When a
    territory code is present, only speakers of the language in that
    territory are counted.

    >>> all = Language.get('zh').writing_population()
    >>> all
    1240326057

    >>> traditional = Language.get('zh-Hant').writing_population()
    >>> traditional
    37019589

    >>> simplified = Language.get('zh-Hans').writing_population()
    >>> all == traditional + simplified
    True

    >>> Language.get('zh-Hant-HK').writing_population()
    6439733
    >>> Language.get('zh-Hans-HK').writing_population()
    338933

    Note that if you want to get the total Chinese writing population
    of Hong Kong, you need to avoid normalization that would interpret
    'zh-HK' as 'zh-Hant-HK'.

    >>> Language.get('zh-HK', normalize=False).writing_population()
    6778666

    Unknown or unspecified language codes get a population of 0.

    >>> Language.get('xyz').writing_population()
    0

    >>> Language.get('und').writing_population()
    0
    """
    try:
        from language_data.population_data import LANGUAGE_WRITING_POPULATION
    except ImportError:
        print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout)
        raise

    reduced = self._filter_attributes(['language', 'script', 'territory'])
    tag = str(reduced)
    if tag not in LANGUAGE_WRITING_POPULATION:
        # Try again with the script simplified; tags that are still not
        # listed count as 0.
        tag = str(reduced.simplify_script())
        return LANGUAGE_WRITING_POPULATION.get(tag, 0)
    return LANGUAGE_WRITING_POPULATION[tag]
|
||
|
||
@staticmethod
def find_name(
    tagtype: str, name: str, language: Optional[Union[str, 'Language']] = None
) -> 'Language':
    """
    Find the subtag of a particular `tagtype` that has the given `name`.
    Requires that `language_data` is installed.

    When `language` is omitted, or has no language subtag, the lookup uses
    "und", which allows matching names in any language, so you can get the
    code 'fr' by looking up "French", "Français", or "francés".

    Occasionally, names are ambiguous in a way that can be resolved by
    specifying what name the language is supposed to be in. For example,
    there is a language named 'Malayo' in English, but it's different from
    the language named 'Malayo' in Spanish (which is Malay). Specifying the
    language will look up the name in a trie that is only in that language.

    In a previous version, we thought we were going to deprecate the
    `language` parameter, as there weren't significant cases of conflicts
    in names of things between languages. Well, we got more data, and
    conflicts in names are everywhere.

    Specifying the language that the name should be in is still not
    required, but it will help to make sure that names can be
    round-tripped.

    >>> Language.find_name('language', 'francés')
    Language.make(language='fr')

    >>> Language.find_name('territory', 'United Kingdom')
    Language.make(territory='GB')

    >>> Language.find_name('script', 'Arabic')
    Language.make(script='Arab')

    >>> Language.find_name('language', 'norsk bokmål')
    Language.make(language='nb')

    >>> Language.find_name('language', 'norsk')
    Language.make(language='no')

    >>> Language.find_name('language', 'norsk', 'en')
    Traceback (most recent call last):
        ...
    LookupError: Can't find any language named 'norsk'

    >>> Language.find_name('language', 'norsk', 'no')
    Language.make(language='no')

    >>> Language.find_name('language', 'malayo', 'en')
    Language.make(language='mbp')

    >>> Language.find_name('language', 'malayo', 'es')
    Language.make(language='ms')

    Some language names resolve to more than a language. For example,
    the name 'Brazilian Portuguese' resolves to a language and a territory,
    and 'Simplified Chinese' resolves to a language and a script. In these
    cases, a Language object with multiple subtags will be returned.

    >>> Language.find_name('language', 'Brazilian Portuguese', 'en')
    Language.make(language='pt', territory='BR')

    >>> Language.find_name('language', 'Simplified Chinese', 'en')
    Language.make(language='zh', script='Hans')

    A small amount of fuzzy matching is supported: if the name can be
    shortened to match a single language name, you get that language.
    This allows, for example, "Hakka dialect" to match "Hakka".

    >>> Language.find_name('language', 'Hakka dialect')
    Language.make(language='hak')
    """
    try:
        from language_data.names import name_to_code
    except ImportError:
        print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout)
        raise

    # Whatever form `language` arrived in, boil it down to one language
    # subtag, with 'und' standing in when there is none.
    if isinstance(language, Language):
        language = language.language
    elif isinstance(language, str):
        language = get(language).language
    if language is None:
        language = 'und'

    code = name_to_code(tagtype, name, language)
    if code is None:
        raise LookupError(f"Can't find any {tagtype} named {name!r}")

    # A code without a hyphen is a single subtag of the requested type;
    # anything else is a whole tag that needs parsing.
    if '-' not in code:
        return Language.make(**{tagtype: code})
    return Language.get(code)
|
||
|
||
@staticmethod
def find(
    name: str, language: Optional[Union[str, 'Language']] = None
) -> 'Language':
    """
    Get a language tag from its name in a natural language. This is a
    concise form of `find_name` with the tag type fixed to 'language'. The
    `language` argument can be left out in the large majority of cases,
    where the language name is not ambiguous.

    >>> Language.find('Türkçe')
    Language.make(language='tr')
    >>> Language.find('brazilian portuguese')
    Language.make(language='pt', territory='BR')
    >>> Language.find('simplified chinese')
    Language.make(language='zh', script='Hans')

    Some language names are ambiguous: for example, there is a language
    named 'Fala' in English (with code 'fax'), but 'Fala' is also the
    Kwasio word for French. In this case, specifying the language that
    the name is in is necessary for disambiguation.

    >>> Language.find('fala')
    Language.make(language='fr')
    >>> Language.find('fala', 'nmg')
    Language.make(language='fr')
    >>> Language.find('fala', 'en')
    Language.make(language='fax')
    """
    return Language.find_name(tagtype='language', name=name, language=language)
|
||
|
||
def to_dict(self) -> dict:
    """
    Return the attributes of this Language object that have a truthy value,
    as a dictionary keyed by attribute name, which can be useful for
    constructing a similar object.

    The dictionary is built once and cached; later calls return the same
    dictionary object.
    """
    if self._dict is None:
        pairs = ((name, getattr(self, name)) for name in self.ATTRIBUTES)
        self._dict = {name: value for name, value in pairs if value}
    return self._dict
|
||
|
||
def update(self, other: 'Language') -> 'Language':
    """
    Combine this Language with another one into a new Language.

    For each field, the value from `other` is used when it is truthy, and
    this object's value is used otherwise.
    """
    fields = (
        'language',
        'extlangs',
        'script',
        'territory',
        'variants',
        'extensions',
        'private',
    )
    merged = {
        field: getattr(other, field) or getattr(self, field) for field in fields
    }
    return Language.make(**merged)
|
||
|
||
def update_dict(self, newdata: dict) -> 'Language':
    """
    Make a new Language from this one, with fields replaced from a
    dictionary.

    A field keeps this object's value only when its key is absent from
    `newdata`; a key that is present wins even if its value is None.
    """
    fields = (
        'language',
        'extlangs',
        'script',
        'territory',
        'variants',
        'extensions',
        'private',
    )
    replaced = {
        field: newdata.get(field, getattr(self, field)) for field in fields
    }
    return Language.make(**replaced)
|
||
|
||
@staticmethod
def _filter_keys(d: dict, keys: Iterable[str]) -> dict:
    """
    Return a new dictionary holding the entries of `d` whose keys appear in
    `keys`. Keys that `d` lacks are skipped, and the result follows the
    order of `keys`.
    """
    subset = {}
    for wanted in keys:
        if wanted in d:
            subset[wanted] = d[wanted]
    return subset
|
||
|
||
def _filter_attributes(self, keyset: Iterable[str]) -> 'Language':
    """
    Make a Language that keeps only those attributes of this one that are
    named in `keyset`.
    """
    kept = self._filter_keys(self.to_dict(), keyset)
    return Language.make(**kept)
|
||
|
||
def _searchable_form(self) -> 'Language':
    """
    Return a form of this parsed language tag whose information is in the
    best shape for looking things up in the CLDR.

    Only the language, script, and territory are kept; the script is then
    simplified and the macrolanguage is preferred. The result is cached on
    the instance.
    """
    if self._searchable is None:
        core = self._filter_attributes({'language', 'script', 'territory'})
        self._searchable = core.simplify_script().prefer_macrolanguage()
    return self._searchable
|
||
|
||
def __eq__(self, other):
    """
    Two Language objects are equal when their cached tag strings are equal.
    An object always equals itself, and never equals something that is not
    a Language.
    """
    if self is not other:
        return isinstance(other, Language) and self._str_tag == other._str_tag
    return True
|
||
|
||
def __hash__(self) -> int:
    """
    Hash this object by its cached tag string.

    `__eq__` compares `_str_tag`, so the hash has to be derived from the
    same value: objects that compare equal must hash equal, or they behave
    inconsistently as dictionary keys and set members.
    """
    return hash(self._str_tag)
|
||
|
||
def __getitem__(self, key: str) -> Optional[Union[str, List[str]]]:
    """
    Read an attribute by name with subscript syntax. Only the names listed
    in `self.ATTRIBUTES` are accepted; any other key raises KeyError.
    """
    if key not in self.ATTRIBUTES:
        raise KeyError(key)
    return getattr(self, key)
|
||
|
||
def __contains__(self, key: str) -> bool:
    """
    Report whether `key` names one of `self.ATTRIBUTES` and that attribute
    currently has a truthy value.
    """
    # Coerce to a real bool so that a direct call honours the annotation
    # instead of leaking the attribute's value (or None).
    return key in self.ATTRIBUTES and bool(getattr(self, key))
|
||
|
||
def __repr__(self) -> str:
    """
    Show this object as a `Language.make(...)` call that lists every
    attribute in `self.ATTRIBUTES` with a truthy value.
    """
    shown = [
        f'{attr}={getattr(self, attr)!r}'
        for attr in self.ATTRIBUTES
        if getattr(self, attr)
    ]
    return "Language.make(" + ', '.join(shown) + ")"
|
||
|
||
def __str__(self) -> str:
    """
    The string form of a Language is whatever `to_tag()` returns.
    """
    return self.to_tag()
|
||
|
||
|
||
# Make the get(), find(), and find_name() functions available at the top level
# of the module. `Language.find_name` also calls the bare `get` defined here.
get = Language.get
find = Language.find
find_name = Language.find_name

# Make the Language object available under the old name LanguageData, so that
# code written against that name keeps working.
LanguageData = Language
|
||
|
||
|
||
def standardize_tag(tag: Union[str, Language], macro: bool = False) -> str:
    """
    Put a language tag into standard form:

    - Replace deprecated values with their updated versions (if those exist)
    - Remove script tags that are redundant with the language
    - If *macro* is True, use a macrolanguage to represent the most common
      standardized language within that macrolanguage. For example, 'cmn'
      (Mandarin) becomes 'zh' (Chinese), and 'arb' (Modern Standard Arabic)
      becomes 'ar' (Arabic).
    - Format the result according to the conventions of BCP 47

    Macrolanguage replacement is not required by BCP 47, but it is required
    by the Unicode CLDR.

    >>> standardize_tag('en_US')
    'en-US'

    >>> standardize_tag('en-Latn')
    'en'

    >>> standardize_tag('en-uk')
    'en-GB'

    >>> standardize_tag('eng')
    'en'

    >>> standardize_tag('arb-Arab', macro=True)
    'ar'

    >>> standardize_tag('sh-QU')
    'sr-Latn-EU'

    >>> standardize_tag('sgn-US')
    'ase'

    >>> standardize_tag('zh-cmn-hans-cn')
    'zh-Hans-CN'

    >>> standardize_tag('zsm', macro=True)
    'ms'

    >>> standardize_tag('ja-latn-hepburn')
    'ja-Latn-hepburn'

    >>> standardize_tag('spa-latn-mx')
    'es-MX'

    If the tag can't be parsed according to BCP 47, this will raise a
    LanguageTagError (a subclass of ValueError):

    >>> standardize_tag('spa-mx-latn')
    Traceback (most recent call last):
        ...
    langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string.
    """
    parsed = Language.get(tag, normalize=True)
    chosen = parsed.prefer_macrolanguage() if macro else parsed
    return chosen.simplify_script().to_tag()
|
||
|
||
|
||
def tag_is_valid(tag: Union[str, Language]) -> bool:
    """
    Decide whether a string is a valid language tag. This resembles
    Language.get(tag).is_valid(), except that a tag which doesn't parse
    gives False instead of raising LanguageTagError.

    >>> tag_is_valid('ja')
    True
    >>> tag_is_valid('jp')
    False
    >>> tag_is_valid('spa-Latn-MX')
    True
    >>> tag_is_valid('spa-MX-Latn')
    False
    >>> tag_is_valid('')
    False
    >>> tag_is_valid('C.UTF-8')
    False
    """
    try:
        return Language.get(tag).is_valid()
    except LanguageTagError:
        return False
|
||
|
||
|
||
def tag_match_score(
    desired: Union[str, Language], supported: Union[str, Language]
) -> int:
    """
    DEPRECATED: use .distance() instead, which uses newer data and is _lower_
    for better matching languages.

    Return a number from 0 to 100 indicating the strength of match between the
    language the user desires, D, and a supported language, S. Higher numbers
    are better. A reasonable cutoff for not messing with your users is to
    only accept scores of 75 or more.

    A score of 100 means the languages are the same, possibly after normalizing
    and filling in likely values.

    Emits a DeprecationWarning. Both arguments may be tag strings or
    Language objects.
    """
    warnings.warn(
        "tag_match_score is deprecated because it's based on deprecated CLDR info. "
        "Use tag_distance instead, which is _lower_ for better matching languages. ",
        DeprecationWarning,
        # Attribute the warning to the code that called us, so it points at
        # the line that needs changing rather than at this module.
        stacklevel=2,
    )
    desired_ld = Language.get(desired)
    supported_ld = Language.get(supported)
    return desired_ld.match_score(supported_ld)
|
||
|
||
|
||
def tag_distance(desired: Union[str, Language], supported: Union[str, Language]) -> int:
    """
    Measure how far a supported language is from the desired one. Both
    arguments may be tag strings or Language objects, and lower numbers
    mean better matches.

    Tags that expand to the same thing when likely values are filled in get a
    distance of 0.

    >>> tag_distance('en', 'en')
    0
    >>> tag_distance('en', 'en-US')
    0
    >>> tag_distance('zh-Hant', 'zh-TW')
    0
    >>> tag_distance('ru-Cyrl', 'ru')
    0

    As a specific example, Serbo-Croatian is a politically contentious idea,
    but in CLDR, it's considered equivalent to Serbian in Latin characters.

    >>> tag_distance('sh', 'sr-Latn')
    0

    ... which is very similar to Croatian but sociopolitically not the same.

    >>> tag_distance('sh', 'hr')
    9

    Unicode reorganized its distinction between 'no' (Norwegian) and 'nb'
    (Norwegian Bokmål) in 2021. 'no' is preferred in most contexts, and the more
    specific 'nb' is a distance of 1 from it:

    >>> tag_distance('nb', 'no')
    1

    These distances can be asymmetrical: this data includes the fact that speakers
    of Swiss German (gsw) know High German (de), but not at all the other way around.

    The difference seems a little bit extreme, but the asymmetry is certainly
    there. And if your text is tagged as 'gsw', it must be that way for a
    reason.

    >>> tag_distance('gsw', 'de')
    8
    >>> tag_distance('de', 'gsw')
    84

    Unconnected languages get a distance of 80 to 134.

    >>> tag_distance('en', 'zh')
    134
    >>> tag_distance('es', 'fr')
    84
    >>> tag_distance('fr-CH', 'de-CH')
    80

    Different local variants of the same language get a small distance, 5 or
    less in these examples.

    >>> tag_distance('zh-HK', 'zh-MO')   # Chinese is similar in Hong Kong and Macao
    4
    >>> tag_distance('en-AU', 'en-GB')   # Australian English is similar to British English
    3
    >>> tag_distance('en-IN', 'en-GB')   # Indian English is also similar to British English
    3
    >>> tag_distance('es-PE', 'es-419')  # Peruvian Spanish is Latin American Spanish
    1
    >>> tag_distance('es-419', 'es-PE')  # but Latin American Spanish is not necessarily Peruvian
    4
    >>> tag_distance('es-ES', 'es-419')  # Spanish in Spain is further from Latin American Spanish
    5
    >>> tag_distance('en-US', 'en-GB')   # American and British English are somewhat different
    5
    >>> tag_distance('es-MX', 'es-ES')   # Mexican Spanish is different from Spanish Spanish
    5
    >>> # European Portuguese is different from the most common form (Brazilian Portuguese)
    >>> tag_distance('pt', 'pt-PT')
    5

    >>> # Serbian has two scripts, and people might prefer one but understand both
    >>> tag_distance('sr-Latn', 'sr-Cyrl')
    5

    A distance of 10 is used for matching a specific language to its
    more-commonly-used macrolanguage tag.

    >>> tag_distance('arz', 'ar')  # Egyptian Arabic to Modern Standard Arabic
    10
    >>> tag_distance('wuu', 'zh')  # Wu Chinese to (Mandarin) Chinese
    10

    Higher distances can arrive due to particularly contentious differences in
    the script for writing the language, where people who understand one script
    can learn the other but may not be happy with it. This specifically applies
    to Chinese.

    >>> tag_distance('zh-TW', 'zh-CN')
    54
    >>> tag_distance('zh-Hans', 'zh-Hant')
    54
    >>> tag_distance('zh-CN', 'zh-HK')
    54
    >>> tag_distance('zh-CN', 'zh-TW')
    54
    >>> tag_distance('zh-Hant', 'zh-Hans')
    54

    This distance range also applies to the differences between Norwegian
    Bokmål, Nynorsk, and Danish.

    >>> tag_distance('no', 'da')
    12
    >>> tag_distance('no', 'nn')
    20

    Differences of 20 to 50 can represent substantially different languages,
    in cases where speakers of the first may understand the second for demographic
    reasons.

    >>> tag_distance('eu', 'es')  # Basque to Spanish
    20
    >>> tag_distance('af', 'nl')  # Afrikaans to Dutch
    24
    >>> tag_distance('mr', 'hi')  # Marathi to Hindi
    30
    >>> tag_distance('ms', 'id')  # Malay to Indonesian
    34
    >>> tag_distance('mg', 'fr')  # Malagasy to French
    34
    >>> tag_distance('ta', 'en')  # Tamil to English
    44

    A complex example is the tag 'yue' for Cantonese. Written Chinese is usually
    presumed to be Mandarin Chinese, but colloquial Cantonese can be written as
    well. (Some things could not be written any other way, such as Cantonese
    song lyrics.)

    The difference between Cantonese and Mandarin also implies script and
    territory differences by default, adding to the distance.

    >>> tag_distance('yue', 'zh')
    64

    When the supported script is a different one than desired, this is usually
    a major difference with score of 50 or more.

    >>> tag_distance('ja', 'ja-Latn-US-hepburn')
    54

    >>> # You can read the Shavian script, right?
    >>> tag_distance('en', 'en-Shaw')
    54
    """
    return Language.get(desired).distance(Language.get(supported))
|
||
|
||
|
||
def best_match(
    desired_language: Union[str, Language],
    supported_languages: Sequence[str],
    min_score: int = 75,
) -> Tuple[str, int]:
    """
    DEPRECATED: use .closest_match() instead. This function emulates the old
    matching behavior by subtracting the language distance from 100.

    Your software supports any of the `supported_languages`, and you want
    to use `desired_language`. This function picks the right language to
    use, even if there isn't an exact match.

    Returns:

    - The best-matching language code, which will be one of the
      `supported_languages` or 'und'
    - The score of the match, from 0 to 100; higher is better.

    `min_score` sets the minimum match score. If all languages match with a lower
    score than that, the result will be 'und' with a score of 0.
    """
    # Scores and distances are complements of each other on a 100 scale.
    chosen, distance = closest_match(
        desired_language, supported_languages, 100 - min_score
    )
    return chosen, max(0, 100 - distance)
|
||
|
||
|
||
def closest_match(
    desired_language: Union[str, Language],
    supported_languages: Sequence[str],
    max_distance: int = 25,
) -> Tuple[str, int]:
    """
    Your software supports any of the `supported_languages`, and you want
    to use `desired_language`. This function picks the right language to
    use, even if there isn't an exact match.

    Returns:

    - The best-matching language code, which will be one of the
      `supported_languages` or 'und' for no match
    - The distance of the match, which is 0 for a perfect match and increases
      from there (see `tag_distance`)

    `max_distance` sets the maximum match distance. If all matches are farther
    than that, the result will be 'und' with a distance of 1000. The default
    value is 25, and raising it can cause data to be processed in significantly
    the wrong language. The documentation for `tag_distance` describes the
    distance values in more detail.

    When there is a tie for the best matching language, the first one in the
    tie will be used.

    >>> closest_match('fr', ['de', 'en', 'fr'])
    ('fr', 0)

    >>> closest_match('pt', ['pt-BR', 'pt-PT'])
    ('pt-BR', 0)

    >>> closest_match('en-AU', ['en-GB', 'en-US'])
    ('en-GB', 3)

    >>> closest_match('af', ['en', 'nl', 'zu'])
    ('nl', 24)

    >>> closest_match('ja', ['ja-Latn-hepburn', 'en'])
    ('und', 1000)
    """
    wanted = str(desired_language)

    # Shortcut: the desired tag is supported exactly as written.
    if wanted in supported_languages:
        return wanted, 0

    # Shortcut: the desired tag is supported once put in standard form.
    wanted = standardize_tag(wanted)
    if wanted in supported_languages:
        return wanted, 0

    # Keep the supported tags that are close enough, in their given order,
    # with the "no match" result as the last resort.
    candidates = []
    for supported in supported_languages:
        distance = tag_distance(wanted, supported)
        if distance <= max_distance:
            candidates.append((supported, distance))
    candidates.append(('und', 1000))

    # min() keeps the first of several equally small distances.
    return min(candidates, key=itemgetter(1))
|
||
|
||
|
||
def closest_supported_match(
    desired_language: Union[str, Language],
    supported_languages: Sequence[str],
    max_distance: int = 25,
) -> Optional[str]:
    """
    A convenience wrapper around `closest_match` that returns only the
    language tag. The result is the tag of the closest supported language,
    or None when `closest_match` finds nothing within `max_distance`.

    >>> closest_supported_match('fr', ['de', 'en', 'fr'])
    'fr'

    >>> closest_supported_match('pt', ['pt-BR', 'pt-PT'])
    'pt-BR'

    >>> closest_supported_match('en-AU', ['en-GB', 'en-US'])
    'en-GB'

    >>> closest_supported_match('und', ['en', 'und'])
    'und'

    >>> closest_supported_match('af', ['en', 'nl', 'zu'])
    'nl'

    >>> print(closest_supported_match('af', ['en', 'nl', 'zu'], max_distance=10))
    None
    """
    match, distance = closest_match(
        desired_language, supported_languages, max_distance
    )
    # closest_match reports "no match" as distance 1000. Checking the distance
    # rather than the tag keeps a genuinely supported 'und' distinguishable.
    if distance == 1000:
        return None
    return match
|