""" langcodes knows what languages are. It knows the standardized codes that refer to them, such as `en` for English, `es` for Spanish and `hi` for Hindi. Often, it knows what these languages are called *in* a language, and that language doesn't have to be English. See README.md for the main documentation, or read it on GitHub at https://github.com/LuminosoInsight/langcodes/ . For more specific documentation on the functions in langcodes, scroll down and read the docstrings. Some of these functions, particularly those that work with the names of languages, require the `language_data` module to be installed. """ from operator import itemgetter from typing import Any, List, Tuple, Dict, Sequence, Iterable, Optional, Mapping, Union import warnings import sys from langcodes.tag_parser import LanguageTagError, parse_tag, normalize_characters from langcodes.language_distance import tuple_distance_cached from langcodes.data_dicts import ( ALL_SCRIPTS, DEFAULT_SCRIPTS, LANGUAGE_REPLACEMENTS, LANGUAGE_ALPHA3, LANGUAGE_ALPHA3_BIBLIOGRAPHIC, TERRITORY_REPLACEMENTS, NORMALIZED_MACROLANGUAGES, LIKELY_SUBTAGS, VALIDITY, ) # When we're getting natural language information *about* languages, it's in # English if you don't specify the language. DEFAULT_LANGUAGE = 'en' LANGUAGE_NAME_IMPORT_MESSAGE = """ Looking up language names now requires the `language_data` package. Install it with: pip install language_data Or as an optional feature of langcodes: pip install langcodes[data] """ class Language: """ The Language class defines the results of parsing a language tag. Language objects have the following attributes, any of which may be unspecified (in which case their value is None): - *language*: the code for the language itself. - *script*: the 4-letter code for the writing system being used. - *territory*: the 2-letter or 3-digit code for the country or similar territory of the world whose usage of the language appears in this text. - *extlangs*: a list of more specific language codes that follow the language code. (This is allowed by the language code syntax, but deprecated.) - *variants*: codes for specific variations of language usage that aren't covered by the *script* or *territory* codes. - *extensions*: information that's attached to the language code for use in some specific system, such as Unicode collation orders. - *private*: a code starting with `x-` that has no defined meaning. The `Language.get` method converts a string to a Language instance. It's also available at the top level of this module as the `get` function. """ ATTRIBUTES = [ 'language', 'extlangs', 'script', 'territory', 'variants', 'extensions', 'private', ] # When looking up "likely subtags" data, we try looking up the data for # increasingly less specific versions of the language code. BROADER_KEYSETS = [ {'language', 'script', 'territory'}, {'language', 'territory'}, {'language', 'script'}, {'language'}, {'script'}, {}, ] MATCHABLE_KEYSETS = [ {'language', 'script', 'territory'}, {'language', 'script'}, {'language'}, ] # Values cached at the class level _INSTANCES: Dict[tuple, 'Language'] = {} _PARSE_CACHE: Dict[Tuple[str, bool], 'Language'] = {} def __init__( self, language: Optional[str] = None, extlangs: Optional[Sequence[str]] = None, script: Optional[str] = None, territory: Optional[str] = None, variants: Optional[Sequence[str]] = None, extensions: Optional[Sequence[str]] = None, private: Optional[str] = None, ): """ The constructor for Language objects. It's inefficient to call this directly, because it can't return an existing instance. Instead, call Language.make(), which has the same signature. """ self.language = language self.extlangs = extlangs self.script = script self.territory = territory self.variants = variants self.extensions = extensions self.private = private # Cached values self._simplified: 'Language' = None self._searchable: 'Language' = None self._broader: List[str] = None self._assumed: 'Language' = None self._filled: 'Language' = None self._macrolanguage: Optional['Language'] = None self._str_tag: str = None self._dict: dict = None self._disp_separator: str = None self._disp_pattern: str = None # Make sure the str_tag value is cached self.to_tag() @classmethod def make( cls, language: Optional[str] = None, extlangs: Optional[Sequence[str]] = None, script: Optional[str] = None, territory: Optional[str] = None, variants: Optional[Sequence[str]] = None, extensions: Optional[Sequence[str]] = None, private: Optional[str] = None, ) -> 'Language': """ Create a Language object by giving any subset of its attributes. If this value has been created before, return the existing value. """ values = ( language, tuple(extlangs or ()), script, territory, tuple(variants or ()), tuple(extensions or ()), private, ) if values in cls._INSTANCES: return cls._INSTANCES[values] instance = cls( language=language, extlangs=extlangs, script=script, territory=territory, variants=variants, extensions=extensions, private=private, ) cls._INSTANCES[values] = instance return instance @staticmethod def get(tag: Union[str, 'Language'], normalize=True) -> 'Language': """ Create a Language object from a language tag string. If normalize=True, non-standard or overlong tags will be replaced as they're interpreted. This is recommended. Here are several examples of language codes, which are also test cases. Most language codes are straightforward, but these examples will get pretty obscure toward the end. >>> Language.get('en-US') Language.make(language='en', territory='US') >>> Language.get('zh-Hant') Language.make(language='zh', script='Hant') >>> Language.get('und') Language.make() This function is idempotent, in case you already have a Language object: >>> Language.get(Language.get('en-us')) Language.make(language='en', territory='US') The non-code 'root' is sometimes used to represent the lack of any language information, similar to 'und'. >>> Language.get('root') Language.make() By default, getting a Language object will automatically convert deprecated tags: >>> Language.get('iw') Language.make(language='he') >>> Language.get('in') Language.make(language='id') One type of deprecated tag that should be replaced is for sign languages, which used to all be coded as regional variants of a fictitious global sign language called 'sgn'. Of course, there is no global sign language, so sign languages now have their own language codes. >>> Language.get('sgn-US') Language.make(language='ase') >>> Language.get('sgn-US', normalize=False) Language.make(language='sgn', territory='US') 'en-gb-oed' is a tag that's grandfathered into the standard because it has been used to mean "spell-check this with Oxford English Dictionary spelling", but that tag has the wrong shape. We interpret this as the new standardized tag 'en-gb-oxendict', unless asked not to normalize. >>> Language.get('en-gb-oed') Language.make(language='en', territory='GB', variants=['oxendict']) >>> Language.get('en-gb-oed', normalize=False) Language.make(language='en-gb-oed') 'zh-min-nan' is another oddly-formed tag, used to represent the Southern Min language, which includes Taiwanese as a regional form. It now has its own language code. >>> Language.get('zh-min-nan') Language.make(language='nan') The vague tag 'zh-min' is now also interpreted as 'nan', with a private extension indicating that it had a different form: >>> Language.get('zh-min') Language.make(language='nan', private='x-zh-min') Occasionally Wiktionary will use 'extlang' tags in strange ways, such as using the tag 'und-ibe' for some unspecified Iberian language. >>> Language.get('und-ibe') Language.make(extlangs=['ibe']) Here's an example of replacing multiple deprecated tags. The language tag 'sh' (Serbo-Croatian) ended up being politically problematic, and different standards took different steps to address this. The IANA made it into a macrolanguage that contains 'sr', 'hr', and 'bs'. Unicode further decided that it's a legacy tag that should be interpreted as 'sr-Latn', which the language matching rules say is mutually intelligible with all those languages. We complicate the example by adding on the territory tag 'QU', an old provisional tag for the European Union, which is now standardized as 'EU'. >>> Language.get('sh-QU') Language.make(language='sr', script='Latn', territory='EU') """ if isinstance(tag, Language): if not normalize: # shortcut: we have the tag already return tag # We might need to normalize this tag. Convert it back into a # string tag, to cover all the edge cases of normalization in a # way that we've already solved. tag = tag.to_tag() if (tag, normalize) in Language._PARSE_CACHE: return Language._PARSE_CACHE[tag, normalize] data: Dict[str, Any] = {} # If the complete tag appears as something to normalize, do the # normalization right away. Smash case and convert underscores to # hyphens when checking, because the case normalization that comes from # parse_tag() hasn't been applied yet. tag_lower = normalize_characters(tag) if normalize and tag_lower in LANGUAGE_REPLACEMENTS: tag = LANGUAGE_REPLACEMENTS[tag_lower] components = parse_tag(tag) for typ, value in components: if typ == 'extlang' and normalize and 'language' in data: # smash extlangs when possible minitag = f"{data['language']}-{value}" norm = LANGUAGE_REPLACEMENTS.get(normalize_characters(minitag)) if norm is not None: data.update(Language.get(norm, normalize).to_dict()) else: data.setdefault('extlangs', []).append(value) elif typ in {'extlang', 'variant', 'extension'}: data.setdefault(typ + 's', []).append(value) elif typ == 'language': if value == 'und': pass elif normalize: replacement = LANGUAGE_REPLACEMENTS.get(value.lower()) if replacement is not None: # parse the replacement if necessary -- this helps with # Serbian and Moldovan data.update(Language.get(replacement, normalize).to_dict()) else: data['language'] = value else: data['language'] = value elif typ == 'territory': if normalize: data['territory'] = TERRITORY_REPLACEMENTS.get(value.lower(), value) else: data['territory'] = value elif typ == 'grandfathered': # If we got here, we got a grandfathered tag but we were asked # not to normalize it, or the CLDR data doesn't know how to # normalize it. The best we can do is set the entire tag as the # language. data['language'] = value else: data[typ] = value result = Language.make(**data) Language._PARSE_CACHE[tag, normalize] = result return result def to_tag(self) -> str: """ Convert a Language back to a standard language tag, as a string. This is also the str() representation of a Language object. >>> Language.make(language='en', territory='GB').to_tag() 'en-GB' >>> Language.make(language='yue', script='Hant', territory='HK').to_tag() 'yue-Hant-HK' >>> Language.make(script='Arab').to_tag() 'und-Arab' >>> str(Language.make(territory='IN')) 'und-IN' """ if self._str_tag is not None: return self._str_tag subtags = ['und'] if self.language: subtags[0] = self.language if self.extlangs: for extlang in sorted(self.extlangs): subtags.append(extlang) if self.script: subtags.append(self.script) if self.territory: subtags.append(self.territory) if self.variants: for variant in sorted(self.variants): subtags.append(variant) if self.extensions: for ext in self.extensions: subtags.append(ext) if self.private: subtags.append(self.private) self._str_tag = '-'.join(subtags) return self._str_tag def simplify_script(self) -> 'Language': """ Remove the script from some parsed language data, if the script is redundant with the language. >>> Language.make(language='en', script='Latn').simplify_script() Language.make(language='en') >>> Language.make(language='yi', script='Latn').simplify_script() Language.make(language='yi', script='Latn') >>> Language.make(language='yi', script='Hebr').simplify_script() Language.make(language='yi') """ if self._simplified is not None: return self._simplified if self.language and self.script: if DEFAULT_SCRIPTS.get(self.language) == self.script: result = self.update_dict({'script': None}) self._simplified = result return self._simplified self._simplified = self return self._simplified def assume_script(self) -> 'Language': """ Fill in the script if it's missing, and if it can be assumed from the language subtag. This is the opposite of `simplify_script`. >>> Language.make(language='en').assume_script() Language.make(language='en', script='Latn') >>> Language.make(language='yi').assume_script() Language.make(language='yi', script='Hebr') >>> Language.make(language='yi', script='Latn').assume_script() Language.make(language='yi', script='Latn') This fills in nothing when the script cannot be assumed -- such as when the language has multiple scripts, or it has no standard orthography: >>> Language.make(language='sr').assume_script() Language.make(language='sr') >>> Language.make(language='eee').assume_script() Language.make(language='eee') It also dosn't fill anything in when the language is unspecified. >>> Language.make(territory='US').assume_script() Language.make(territory='US') """ if self._assumed is not None: return self._assumed if self.language and not self.script: try: self._assumed = self.update_dict( {'script': DEFAULT_SCRIPTS[self.language]} ) except KeyError: self._assumed = self else: self._assumed = self return self._assumed def prefer_macrolanguage(self) -> 'Language': """ BCP 47 doesn't specify what to do with macrolanguages and the languages they contain. The Unicode CLDR, on the other hand, says that when a macrolanguage has a dominant standardized language, the macrolanguage code should be used for that language. For example, Mandarin Chinese is 'zh', not 'cmn', according to Unicode, and Malay is 'ms', not 'zsm'. This isn't a rule you'd want to follow in all cases -- for example, you may want to be able to specifically say that 'ms' (the Malay macrolanguage) contains both 'zsm' (Standard Malay) and 'id' (Indonesian). But applying this rule helps when interoperating with the Unicode CLDR. So, applying `prefer_macrolanguage` to a Language object will return a new object, replacing the language with the macrolanguage if it is the dominant language within that macrolanguage. It will leave non-dominant languages that have macrolanguages alone. >>> Language.get('arb').prefer_macrolanguage() Language.make(language='ar') >>> Language.get('cmn-Hant').prefer_macrolanguage() Language.make(language='zh', script='Hant') >>> Language.get('yue-Hant').prefer_macrolanguage() Language.make(language='yue', script='Hant') """ if self._macrolanguage is not None: return self._macrolanguage language = self.language or 'und' if language in NORMALIZED_MACROLANGUAGES: self._macrolanguage = self.update_dict( {'language': NORMALIZED_MACROLANGUAGES[language]} ) else: self._macrolanguage = self return self._macrolanguage def to_alpha3(self, variant: str = 'T') -> str: """ Get the three-letter language code for this language, even if it's canonically written with a two-letter code. These codes are the 'alpha3' codes defined by ISO 639-2. When this function returns, it always returns a 3-letter string. If there is no known alpha3 code for the language, it raises a LookupError. In cases where the distinction matters, we default to the 'terminology' code. You can pass `variant='B'` to get the 'bibliographic' code instead. For example, the terminology code for German is 'deu', while the bibliographic code is 'ger'. (The confusion between these two sets of codes is a good reason to avoid using alpha3 codes. Every language that has two different alpha3 codes also has an alpha2 code that's preferred, such as 'de' for German.) >>> Language.get('fr').to_alpha3() 'fra' >>> Language.get('fr-CA').to_alpha3() 'fra' >>> Language.get('fr').to_alpha3(variant='B') 'fre' >>> Language.get('de').to_alpha3(variant='T') 'deu' >>> Language.get('ja').to_alpha3() 'jpn' >>> Language.get('un').to_alpha3() Traceback (most recent call last): ... LookupError: 'un' is not a known language code, and has no alpha3 code. All valid two-letter language codes have corresponding alpha3 codes, even the un-normalized ones. If they were assigned an alpha3 code by ISO before they were assigned a normalized code by CLDR, these codes may be different: >>> Language.get('tl', normalize=False).to_alpha3() 'tgl' >>> Language.get('tl').to_alpha3() 'fil' >>> Language.get('sh', normalize=False).to_alpha3() 'hbs' Three-letter codes are preserved, even if they're unknown: >>> Language.get('qqq').to_alpha3() 'qqq' >>> Language.get('und').to_alpha3() 'und' """ variant = variant.upper() if variant not in 'BT': raise ValueError("Variant must be 'B' or 'T'") language = self.language if language is None: return 'und' elif len(language) == 3: return language else: if variant == 'B' and language in LANGUAGE_ALPHA3_BIBLIOGRAPHIC: return LANGUAGE_ALPHA3_BIBLIOGRAPHIC[language] elif language in LANGUAGE_ALPHA3: return LANGUAGE_ALPHA3[language] else: raise LookupError( f"{language!r} is not a known language code, " "and has no alpha3 code." ) def broader_tags(self) -> List[str]: """ Iterate through increasingly general tags for this language. This isn't actually that useful for matching two arbitrary language tags against each other, but it is useful for matching them against a known standardized form, such as in the CLDR data. The list of broader versions to try appears in UTR 35, section 4.3, "Likely Subtags". >>> Language.get('nn-Latn-NO-x-thingy').broader_tags() ['nn-Latn-NO-x-thingy', 'nn-Latn-NO', 'nn-NO', 'nn-Latn', 'nn', 'und-Latn', 'und'] >>> Language.get('arb-Arab').broader_tags() ['arb-Arab', 'ar-Arab', 'arb', 'ar', 'und-Arab', 'und'] """ if self._broader is not None: return self._broader self._broader = [self.to_tag()] seen = set([self.to_tag()]) for keyset in self.BROADER_KEYSETS: for start_language in (self, self.prefer_macrolanguage()): filtered = start_language._filter_attributes(keyset) tag = filtered.to_tag() if tag not in seen: self._broader.append(tag) seen.add(tag) return self._broader def broaden(self) -> 'List[Language]': """ Like `broader_tags`, but returrns Language objects instead of strings. """ return [Language.get(tag) for tag in self.broader_tags()] def maximize(self) -> 'Language': """ The Unicode CLDR contains a "likelySubtags" data file, which can guess reasonable values for fields that are missing from a language tag. This is particularly useful for comparing, for example, "zh-Hant" and "zh-TW", two common language tags that say approximately the same thing via rather different information. (Using traditional Han characters is not the same as being in Taiwan, but each implies that the other is likely.) These implications are provided in the CLDR supplemental data, and are based on the likelihood of people using the language to transmit text on the Internet. (This is why the overall default is English, not Chinese.) It's important to recognize that these tags amplify majorities, and that not all language support fits into a "likely" language tag. >>> str(Language.get('zh-Hant').maximize()) 'zh-Hant-TW' >>> str(Language.get('zh-TW').maximize()) 'zh-Hant-TW' >>> str(Language.get('ja').maximize()) 'ja-Jpan-JP' >>> str(Language.get('pt').maximize()) 'pt-Latn-BR' >>> str(Language.get('und-Arab').maximize()) 'ar-Arab-EG' >>> str(Language.get('und-CH').maximize()) 'de-Latn-CH' As many standards are, this is US-centric: >>> str(Language.make().maximize()) 'en-Latn-US' "Extlangs" have no likely-subtags information, so they will give maximized results that make no sense: >>> str(Language.get('und-ibe').maximize()) 'en-ibe-Latn-US' """ if self._filled is not None: return self._filled for tag in self.broader_tags(): if tag in LIKELY_SUBTAGS: result = Language.get(LIKELY_SUBTAGS[tag], normalize=False) result = result.update(self) self._filled = result return result raise RuntimeError( "Couldn't fill in likely values. This represents a problem with " "the LIKELY_SUBTAGS data." ) # Support an old, wordier name for the method fill_likely_values = maximize def match_score(self, supported: 'Language') -> int: """ DEPRECATED: use .distance() instead, which uses newer data and is _lower_ for better matching languages. """ warnings.warn( "`match_score` is deprecated because it's based on deprecated CLDR info. " "Use `distance` instead, which is _lower_ for better matching languages. ", DeprecationWarning, ) return 100 - min(self.distance(supported), 100) def distance(self, supported: 'Language') -> int: """ Suppose that `self` is the language that the user desires, and `supported` is a language that is actually supported. This method returns a number from 0 to 134 measuring the 'distance' between the languages (lower numbers are better). This is not a symmetric relation. The language distance is not really about the linguistic similarity or history of the languages; instead, it's based largely on sociopolitical factors, indicating which language speakers are likely to know which other languages in the present world. Much of the heuristic is about finding a widespread 'world language' like English, Chinese, French, or Russian that speakers of a more localized language will accept. A version that works on language tags, as strings, is in the function `tag_distance`. See that function for copious examples. """ if supported == self: return 0 # CLDR has realized that these matching rules are undermined when the # unspecified language 'und' gets maximized to 'en-Latn-US', so this case # is specifically not maximized: if self.language is None and self.script is None and self.territory is None: desired_triple = ('und', 'Zzzz', 'ZZ') else: desired_complete = self.prefer_macrolanguage().maximize() desired_triple = ( desired_complete.language, desired_complete.script, desired_complete.territory, ) if ( supported.language is None and supported.script is None and supported.territory is None ): supported_triple = ('und', 'Zzzz', 'ZZ') else: supported_complete = supported.prefer_macrolanguage().maximize() supported_triple = ( supported_complete.language, supported_complete.script, supported_complete.territory, ) return tuple_distance_cached(desired_triple, supported_triple) def is_valid(self) -> bool: """ Checks whether the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA. For example, 'ja' (Japanese) is a valid tag, and 'jp' is not. The data is current as of CLDR 40. >>> Language.get('ja').is_valid() True >>> Language.get('jp').is_valid() False >>> Language.get('en-001').is_valid() True >>> Language.get('en-000').is_valid() False >>> Language.get('en-Latn').is_valid() True >>> Language.get('en-Latnx').is_valid() False >>> Language.get('und').is_valid() True >>> Language.get('en-GB-oxendict').is_valid() True >>> Language.get('en-GB-oxenfree').is_valid() False >>> Language.get('x-heptapod').is_valid() True Some scripts are, confusingly, not included in CLDR's 'validity' pattern. If a script appears in the IANA registry, we consider it valid. >>> Language.get('ur-Aran').is_valid() True >>> Language.get('cu-Cyrs').is_valid() True A language tag with multiple extlangs will parse, but is not valid. The only allowed example is 'zh-min-nan', which normalizes to the language 'nan'. >>> Language.get('zh-min-nan').is_valid() True >>> Language.get('sgn-ase-bfi').is_valid() False These examples check that duplicate tags are not valid: >>> Language.get('de-1901').is_valid() True >>> Language.get('de-1901-1901').is_valid() False >>> Language.get('en-a-bbb-c-ddd').is_valid() True >>> Language.get('en-a-bbb-a-ddd').is_valid() False Of course, you should be prepared to catch a failure to parse the language code at all: >>> Language.get('C').is_valid() Traceback (most recent call last): ... langcodes.tag_parser.LanguageTagError: Expected a language code, got 'c' """ if self.extlangs is not None: # An erratum to BCP 47 says that tags with more than one extlang are # invalid. if len(self.extlangs) > 1: return False subtags = [self.language, self.script, self.territory] checked_subtags = [] if self.variants is not None: subtags.extend(self.variants) for subtag in subtags: if subtag is not None: checked_subtags.append(subtag) if not subtag.startswith('x-') and not VALIDITY.match(subtag): if subtag not in ALL_SCRIPTS: return False # We check extensions for validity by ensuring that there aren't # two extensions introduced by the same letter. For example, you can't # have two 'u-' extensions. if self.extensions: checked_subtags.extend([extension[:2] for extension in self.extensions]) if len(set(checked_subtags)) != len(checked_subtags): return False return True def has_name_data(self) -> bool: """ Return True when we can name languages in this language. Requires `language_data` to be installed. This is true when the language, or one of its 'broader' versions, is in the list of CLDR target languages. >>> Language.get('fr').has_name_data() True >>> Language.get('so').has_name_data() True >>> Language.get('enc').has_name_data() False >>> Language.get('und').has_name_data() False """ try: from language_data.name_data import LANGUAGES_WITH_NAME_DATA except ImportError: print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) raise matches = set(self.broader_tags()) & LANGUAGES_WITH_NAME_DATA return bool(matches) # These methods help to show what the language tag means in natural # language. They actually apply the language-matching algorithm to find # the right language to name things in. def _get_name( self, attribute: str, language: Union[str, 'Language'], max_distance: int ) -> str: try: from language_data.names import code_to_names except ImportError: print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) raise assert attribute in self.ATTRIBUTES if isinstance(language, str): language = Language.get(language) attr_value = getattr(self, attribute) if attr_value is None: if attribute == 'language': attr_value = 'und' else: return None names = code_to_names(attr_value) result = self._best_name(names, language, max_distance) if result is not None: return result else: # Construct a string like "Unknown language [zzz]" placeholder = None if attribute == 'language': placeholder = 'und' elif attribute == 'script': placeholder = 'Zzzz' elif attribute == 'territory': placeholder = 'ZZ' unknown_name = None if placeholder is not None: names = code_to_names(placeholder) unknown_name = self._best_name(names, language, max_distance) if unknown_name is None: unknown_name = 'Unknown language subtag' return f'{unknown_name} [{attr_value}]' def _best_name( self, names: Mapping[str, str], language: 'Language', max_distance: int ): matchable_languages = set(language.broader_tags()) possible_languages = [ key for key in sorted(names.keys()) if key in matchable_languages ] target_language, score = closest_match( language, possible_languages, max_distance ) if target_language in names: return names[target_language] else: return names.get(DEFAULT_LANGUAGE) def language_name( self, language: Union[str, 'Language'] = DEFAULT_LANGUAGE, max_distance: int = 25, ) -> str: """ Give the name of the language (not the entire tag, just the language part) in a natural language. The target language can be given as a string or another Language object. By default, things are named in English: >>> Language.get('fr').language_name() 'French' >>> Language.get('el').language_name() 'Greek' But you can ask for language names in numerous other languages: >>> Language.get('fr').language_name('fr') 'français' >>> Language.get('el').language_name('fr') 'grec' Why does everyone get Slovak and Slovenian confused? Let's ask them. >>> Language.get('sl').language_name('sl') 'slovenščina' >>> Language.get('sk').language_name('sk') 'slovenčina' >>> Language.get('sl').language_name('sk') 'slovinčina' >>> Language.get('sk').language_name('sl') 'slovaščina' """ return self._get_name('language', language, max_distance) def display_name( self, language: Union[str, 'Language'] = DEFAULT_LANGUAGE, max_distance: int = 25, ) -> str: """ It's often helpful to be able to describe a language code in a way that a user (or you) can understand, instead of in inscrutable short codes. The `display_name` method lets you describe a Language object *in a language*. The `.display_name(language, min_score)` method will look up the name of the language. The names come from the IANA language tag registry, which is only in English, plus CLDR, which names languages in many commonly-used languages. The default language for naming things is English: >>> Language.make(language='fr').display_name() 'French' >>> Language.make().display_name() 'Unknown language' >>> Language.get('zh-Hans').display_name() 'Chinese (Simplified)' >>> Language.get('en-US').display_name() 'English (United States)' But you can ask for language names in numerous other languages: >>> Language.get('fr').display_name('fr') 'français' >>> Language.get('fr').display_name('es') 'francés' >>> Language.make().display_name('es') 'lengua desconocida' >>> Language.get('zh-Hans').display_name('de') 'Chinesisch (Vereinfacht)' >>> Language.get('en-US').display_name('zh-Hans') '英语(美国)' """ reduced = self.simplify_script() language = Language.get(language) language_name = reduced.language_name(language, max_distance) extra_parts = [] if reduced.script is not None: extra_parts.append(reduced.script_name(language, max_distance)) if reduced.territory is not None: extra_parts.append(reduced.territory_name(language, max_distance)) if extra_parts: clarification = language._display_separator().join(extra_parts) pattern = language._display_pattern() return pattern.format(language_name, clarification) else: return language_name def _display_pattern(self) -> str: """ Get the pattern, according to CLDR, that should be used for clarifying details of a language code. """ # Technically we are supposed to look up this pattern in each language. # Practically, it's the same in every language except Chinese, where the # parentheses are full-width. if self._disp_pattern is not None: return self._disp_pattern if self.distance(Language.get('zh')) <= 25: self._disp_pattern = "{0}({1})" else: self._disp_pattern = "{0} ({1})" return self._disp_pattern def _display_separator(self) -> str: """ Get the symbol that should be used to separate multiple clarifying details -- such as a comma in English, or an ideographic comma in Japanese. Requires that `language_data` is installed. """ try: from language_data.names import DISPLAY_SEPARATORS except ImportError: print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) raise if self._disp_separator is not None: return self._disp_separator matched, _dist = closest_match(self, DISPLAY_SEPARATORS.keys()) self._disp_separator = DISPLAY_SEPARATORS[matched] return self._disp_separator def autonym(self, max_distance: int = 9) -> str: """ Give the display name of this language *in* this language. Requires that `language_data` is installed. >>> Language.get('fr').autonym() 'français' >>> Language.get('es').autonym() 'español' >>> Language.get('ja').autonym() '日本語' This uses the `display_name()` method, so it can include the name of a script or territory when appropriate. >>> Language.get('en-AU').autonym() 'English (Australia)' >>> Language.get('sr-Latn').autonym() 'srpski (latinica)' >>> Language.get('sr-Cyrl').autonym() 'српски (ћирилица)' >>> Language.get('pa').autonym() 'ਪੰਜਾਬੀ' >>> Language.get('pa-Arab').autonym() 'پنجابی (عربی)' This only works for language codes that CLDR has locale data for. You can't ask for the autonym of 'ja-Latn' and get 'nihongo (rōmaji)'. """ lang = self.prefer_macrolanguage() return lang.display_name(language=lang, max_distance=max_distance) def script_name( self, language: Union[str, 'Language'] = DEFAULT_LANGUAGE, max_distance: int = 25, ) -> str: """ Describe the script part of the language tag in a natural language. Requires that `language_data` is installed. """ return self._get_name('script', language, max_distance) def territory_name( self, language: Union[str, 'Language'] = DEFAULT_LANGUAGE, max_distance: int = 25, ) -> str: """ Describe the territory part of the language tag in a natural language. Requires that `language_data` is installed. """ return self._get_name('territory', language, max_distance) def region_name( self, language: Union[str, 'Language'] = DEFAULT_LANGUAGE, max_distance: int = 25, ) -> str: warnings.warn( "`region_name` has been renamed to `territory_name` for consistency", DeprecationWarning, ) return self.territory_name(language, max_distance) @property def region(self): warnings.warn( "The `region` property has been renamed to `territory` for consistency", DeprecationWarning, ) return self.territory def variant_names( self, language: Union[str, 'Language'] = DEFAULT_LANGUAGE, max_distance: int = 25, ) -> Sequence[str]: """ Deprecated in version 3.0. We don't store names for variants anymore, so this just returns the list of variant codes, such as ['oxendict'] for en-GB-oxendict. """ warnings.warn( "variant_names is deprecated and just returns the variant codes", DeprecationWarning, ) return self.variants or [] def describe( self, language: Union[str, 'Language'] = DEFAULT_LANGUAGE, max_distance: int = 25, ) -> dict: """ Return a dictionary that describes a given language tag in a specified natural language. Requires that `language_data` is installed. See `language_name` and related methods for more specific versions of this. The desired `language` will in fact be matched against the available options using the matching technique that this module provides. We can illustrate many aspects of this by asking for a description of Shavian script (a phonetic script for English devised by author George Bernard Shaw), and where you might find it, in various languages. >>> shaw = Language.make(script='Shaw').maximize() >>> shaw.describe('en') {'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'} >>> shaw.describe('fr') {'language': 'anglais', 'script': 'shavien', 'territory': 'Royaume-Uni'} >>> shaw.describe('es') {'language': 'inglés', 'script': 'shaviano', 'territory': 'Reino Unido'} >>> shaw.describe('pt') {'language': 'inglês', 'script': 'shaviano', 'territory': 'Reino Unido'} >>> shaw.describe('uk') {'language': 'англійська', 'script': 'шоу', 'territory': 'Велика Британія'} >>> shaw.describe('arb') {'language': 'الإنجليزية', 'script': 'الشواني', 'territory': 'المملكة المتحدة'} >>> shaw.describe('th') {'language': 'อังกฤษ', 'script': 'ซอเวียน', 'territory': 'สหราชอาณาจักร'} >>> shaw.describe('zh-Hans') {'language': '英语', 'script': '萧伯纳式文', 'territory': '英国'} >>> shaw.describe('zh-Hant') {'language': '英文', 'script': '簫柏納字符', 'territory': '英國'} >>> shaw.describe('ja') {'language': '英語', 'script': 'ショー文字', 'territory': 'イギリス'} When we don't have a localization for the language, we fall back on English, because the IANA provides names for all known codes in English. >>> shaw.describe('lol') {'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'} When the language tag itself is a valid tag but with no known meaning, we say so in the appropriate language. >>> Language.get('xyz-ZY').display_name() 'Unknown language [xyz] (Unknown Region [ZY])' >>> Language.get('xyz-ZY').display_name('es') 'lengua desconocida [xyz] (Región desconocida [ZY])' """ names = {} if self.language: names['language'] = self.language_name(language, max_distance) if self.script: names['script'] = self.script_name(language, max_distance) if self.territory: names['territory'] = self.territory_name(language, max_distance) return names def speaking_population(self) -> int: """ Get an estimate of how many people in the world speak this language, derived from CLDR data. Requires that `language_data` is installed. Only the language and territory codes will be considered. If a territory code is included, the population will count only the speakers of the language in that territory. Script subtags are disregarded, because it doesn't make sense to ask how many people speak in a particular writing script. >>> Language.get('es').speaking_population() 487664083 >>> Language.get('pt').speaking_population() 237135429 >>> Language.get('es-BR').speaking_population() 76218 >>> Language.get('pt-BR').speaking_population() 192661560 >>> Language.get('vo').speaking_population() 0 """ try: from language_data.population_data import LANGUAGE_SPEAKING_POPULATION except ImportError: print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) raise lang = self._filter_attributes(['language', 'territory']) return LANGUAGE_SPEAKING_POPULATION.get(str(lang), 0) def writing_population(self) -> int: """ Get an estimate of how many people in the world read and write this language, derived from CLDR data. Requires that `language_data` is installed. For many languages that aren't typically written, this is an overestimate, according to CLDR -- the data often includes people who speak that language but write in a different language. Only the language, script, and territory codes will be considered. If a territory code is included, the population will count only the speakers of the language in that territory. >>> all = Language.get('zh').writing_population() >>> all 1240326057 >>> traditional = Language.get('zh-Hant').writing_population() >>> traditional 37019589 >>> simplified = Language.get('zh-Hans').writing_population() >>> all == traditional + simplified True >>> Language.get('zh-Hant-HK').writing_population() 6439733 >>> Language.get('zh-Hans-HK').writing_population() 338933 Note that if you want to get the total Chinese writing population of Hong Kong, you need to avoid normalization that would interpret 'zh-HK' as 'zh-Hant-HK'. >>> Language.get('zh-HK', normalize=False).writing_population() 6778666 Unknown or unspecified language codes get a population of 0. >>> Language.get('xyz').writing_population() 0 >>> Language.get('und').writing_population() 0 """ try: from language_data.population_data import LANGUAGE_WRITING_POPULATION except ImportError: print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) raise lang = self._filter_attributes(['language', 'script', 'territory']) if str(lang) in LANGUAGE_WRITING_POPULATION: return LANGUAGE_WRITING_POPULATION[str(lang)] else: lang = lang.simplify_script() return LANGUAGE_WRITING_POPULATION.get(str(lang), 0) @staticmethod def find_name( tagtype: str, name: str, language: Optional[Union[str, 'Language']] = None ) -> 'Language': """ Find the subtag of a particular `tagtype` that has the given `name`. Requires that `language_data` is installed. The default language, "und", will allow matching names in any language, so you can get the code 'fr' by looking up "French", "Français", or "francés". Occasionally, names are ambiguous in a way that can be resolved by specifying what name the language is supposed to be in. For example, there is a language named 'Malayo' in English, but it's different from the language named 'Malayo' in Spanish (which is Malay). Specifying the language will look up the name in a trie that is only in that language. In a previous version, we thought we were going to deprecate the `language` parameter, as there weren't significant cases of conflicts in names of things between languages. Well, we got more data, and conflicts in names are everywhere. Specifying the language that the name should be in is still not required, but it will help to make sure that names can be round-tripped. >>> Language.find_name('language', 'francés') Language.make(language='fr') >>> Language.find_name('territory', 'United Kingdom') Language.make(territory='GB') >>> Language.find_name('script', 'Arabic') Language.make(script='Arab') >>> Language.find_name('language', 'norsk bokmål') Language.make(language='nb') >>> Language.find_name('language', 'norsk') Language.make(language='no') >>> Language.find_name('language', 'norsk', 'en') Traceback (most recent call last): ... LookupError: Can't find any language named 'norsk' >>> Language.find_name('language', 'norsk', 'no') Language.make(language='no') >>> Language.find_name('language', 'malayo', 'en') Language.make(language='mbp') >>> Language.find_name('language', 'malayo', 'es') Language.make(language='ms') Some langauge names resolve to more than a language. For example, the name 'Brazilian Portuguese' resolves to a language and a territory, and 'Simplified Chinese' resolves to a language and a script. In these cases, a Language object with multiple subtags will be returned. >>> Language.find_name('language', 'Brazilian Portuguese', 'en') Language.make(language='pt', territory='BR') >>> Language.find_name('language', 'Simplified Chinese', 'en') Language.make(language='zh', script='Hans') A small amount of fuzzy matching is supported: if the name can be shortened to match a single language name, you get that language. This allows, for example, "Hakka dialect" to match "Hakka". >>> Language.find_name('language', 'Hakka dialect') Language.make(language='hak') """ try: from language_data.names import name_to_code except ImportError: print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) raise # No matter what form of language we got, normalize it to a single # language subtag if isinstance(language, Language): language = language.language elif isinstance(language, str): language = get(language).language if language is None: language = 'und' code = name_to_code(tagtype, name, language) if code is None: raise LookupError(f"Can't find any {tagtype} named {name!r}") if '-' in code: return Language.get(code) else: data = {tagtype: code} return Language.make(**data) @staticmethod def find( name: str, language: Optional[Union[str, 'Language']] = None ) -> 'Language': """ A concise version of `find_name`, used to get a language tag by its name in a natural language. The language can be omitted in the large majority of cases, where the language name is not ambiguous. >>> Language.find('Türkçe') Language.make(language='tr') >>> Language.find('brazilian portuguese') Language.make(language='pt', territory='BR') >>> Language.find('simplified chinese') Language.make(language='zh', script='Hans') Some language names are ambiguous: for example, there is a language named 'Fala' in English (with code 'fax'), but 'Fala' is also the Kwasio word for French. In this case, specifying the language that the name is in is necessary for disambiguation. >>> Language.find('fala') Language.make(language='fr') >>> Language.find('fala', 'nmg') Language.make(language='fr') >>> Language.find('fala', 'en') Language.make(language='fax') """ return Language.find_name('language', name, language) def to_dict(self) -> dict: """ Get a dictionary of the attributes of this Language object, which can be useful for constructing a similar object. """ if self._dict is not None: return self._dict result = {} for key in self.ATTRIBUTES: value = getattr(self, key) if value: result[key] = value self._dict = result return result def update(self, other: 'Language') -> 'Language': """ Update this Language with the fields of another Language. """ return Language.make( language=other.language or self.language, extlangs=other.extlangs or self.extlangs, script=other.script or self.script, territory=other.territory or self.territory, variants=other.variants or self.variants, extensions=other.extensions or self.extensions, private=other.private or self.private, ) def update_dict(self, newdata: dict) -> 'Language': """ Update the attributes of this Language from a dictionary. """ return Language.make( language=newdata.get('language', self.language), extlangs=newdata.get('extlangs', self.extlangs), script=newdata.get('script', self.script), territory=newdata.get('territory', self.territory), variants=newdata.get('variants', self.variants), extensions=newdata.get('extensions', self.extensions), private=newdata.get('private', self.private), ) @staticmethod def _filter_keys(d: dict, keys: Iterable[str]) -> dict: """ Select a subset of keys from a dictionary. """ return {key: d[key] for key in keys if key in d} def _filter_attributes(self, keyset: Iterable[str]) -> 'Language': """ Return a copy of this object with a subset of its attributes set. """ filtered = self._filter_keys(self.to_dict(), keyset) return Language.make(**filtered) def _searchable_form(self) -> 'Language': """ Convert a parsed language tag so that the information it contains is in the best form for looking up information in the CLDR. """ if self._searchable is not None: return self._searchable self._searchable = ( self._filter_attributes({'language', 'script', 'territory'}) .simplify_script() .prefer_macrolanguage() ) return self._searchable def __eq__(self, other): if self is other: return True if not isinstance(other, Language): return False return self._str_tag == other._str_tag def __hash__(self) -> int: return hash(id(self)) def __getitem__(self, key: str) -> Optional[Union[str, List[str]]]: if key in self.ATTRIBUTES: return getattr(self, key) else: raise KeyError(key) def __contains__(self, key: str) -> bool: return key in self.ATTRIBUTES and getattr(self, key) def __repr__(self) -> str: items = [] for attr in self.ATTRIBUTES: if getattr(self, attr): value = getattr(self, attr) items.append(f'{attr}={value!r}') joined = ', '.join(items) return f"Language.make({joined})" def __str__(self) -> str: return self.to_tag() # Make the get(), find(), and find_name() functions available at the top level get = Language.get find = Language.find find_name = Language.find_name # Make the Language object available under the old name LanguageData LanguageData = Language def standardize_tag(tag: Union[str, Language], macro: bool = False) -> str: """ Standardize a language tag: - Replace deprecated values with their updated versions (if those exist) - Remove script tags that are redundant with the language - If *macro* is True, use a macrolanguage to represent the most common standardized language within that macrolanguage. For example, 'cmn' (Mandarin) becomes 'zh' (Chinese), and 'arb' (Modern Standard Arabic) becomes 'ar' (Arabic). - Format the result according to the conventions of BCP 47 Macrolanguage replacement is not required by BCP 47, but it is required by the Unicode CLDR. >>> standardize_tag('en_US') 'en-US' >>> standardize_tag('en-Latn') 'en' >>> standardize_tag('en-uk') 'en-GB' >>> standardize_tag('eng') 'en' >>> standardize_tag('arb-Arab', macro=True) 'ar' >>> standardize_tag('sh-QU') 'sr-Latn-EU' >>> standardize_tag('sgn-US') 'ase' >>> standardize_tag('zh-cmn-hans-cn') 'zh-Hans-CN' >>> standardize_tag('zsm', macro=True) 'ms' >>> standardize_tag('ja-latn-hepburn') 'ja-Latn-hepburn' >>> standardize_tag('spa-latn-mx') 'es-MX' If the tag can't be parsed according to BCP 47, this will raise a LanguageTagError (a subclass of ValueError): >>> standardize_tag('spa-mx-latn') Traceback (most recent call last): ... langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string. """ langdata = Language.get(tag, normalize=True) if macro: langdata = langdata.prefer_macrolanguage() return langdata.simplify_script().to_tag() def tag_is_valid(tag: Union[str, Language]) -> bool: """ Determines whether a string is a valid language tag. This is similar to Language.get(tag).is_valid(), but can return False in the case where the tag doesn't parse. >>> tag_is_valid('ja') True >>> tag_is_valid('jp') False >>> tag_is_valid('spa-Latn-MX') True >>> tag_is_valid('spa-MX-Latn') False >>> tag_is_valid('') False >>> tag_is_valid('C.UTF-8') False """ try: langdata = Language.get(tag) return langdata.is_valid() except LanguageTagError: return False def tag_match_score( desired: Union[str, Language], supported: Union[str, Language] ) -> int: """ DEPRECATED: use .distance() instead, which uses newer data and is _lower_ for better matching languages. Return a number from 0 to 100 indicating the strength of match between the language the user desires, D, and a supported language, S. Higher numbers are better. A reasonable cutoff for not messing with your users is to only accept scores of 75 or more. A score of 100 means the languages are the same, possibly after normalizing and filling in likely values. """ warnings.warn( "tag_match_score is deprecated because it's based on deprecated CLDR info. " "Use tag_distance instead, which is _lower_ for better matching languages. ", DeprecationWarning, ) desired_ld = Language.get(desired) supported_ld = Language.get(supported) return desired_ld.match_score(supported_ld) def tag_distance(desired: Union[str, Language], supported: Union[str, Language]) -> int: """ Tags that expand to the same thing when likely values are filled in get a distance of 0. >>> tag_distance('en', 'en') 0 >>> tag_distance('en', 'en-US') 0 >>> tag_distance('zh-Hant', 'zh-TW') 0 >>> tag_distance('ru-Cyrl', 'ru') 0 As a specific example, Serbo-Croatian is a politically contentious idea, but in CLDR, it's considered equivalent to Serbian in Latin characters. >>> tag_distance('sh', 'sr-Latn') 0 ... which is very similar to Croatian but sociopolitically not the same. >>> tag_distance('sh', 'hr') 9 Unicode reorganized its distinction between 'no' (Norwegian) and 'nb' (Norwegian Bokmål) in 2021. 'no' is preferred in most contexts, and the more specific 'nb' is a distance of 1 from it: >>> tag_distance('nb', 'no') 1 These distances can be asymmetrical: this data includes the fact that speakers of Swiss German (gsw) know High German (de), but not at all the other way around. The difference seems a little bit extreme, but the asymmetry is certainly there. And if your text is tagged as 'gsw', it must be that way for a reason. >>> tag_distance('gsw', 'de') 8 >>> tag_distance('de', 'gsw') 84 Unconnected languages get a distance of 80 to 134. >>> tag_distance('en', 'zh') 134 >>> tag_distance('es', 'fr') 84 >>> tag_distance('fr-CH', 'de-CH') 80 Different local variants of the same language get a distance from 3 to 5. >>> tag_distance('zh-HK', 'zh-MO') # Chinese is similar in Hong Kong and Macao 4 >>> tag_distance('en-AU', 'en-GB') # Australian English is similar to British English 3 >>> tag_distance('en-IN', 'en-GB') # Indian English is also similar to British English 3 >>> tag_distance('es-PE', 'es-419') # Peruvian Spanish is Latin American Spanish 1 >>> tag_distance('es-419', 'es-PE') # but Latin American Spanish is not necessarily Peruvian 4 >>> tag_distance('es-ES', 'es-419') # Spanish in Spain is further from Latin American Spanish 5 >>> tag_distance('en-US', 'en-GB') # American and British English are somewhat different 5 >>> tag_distance('es-MX', 'es-ES') # Mexican Spanish is different from Spanish Spanish 5 >>> # European Portuguese is different from the most common form (Brazilian Portuguese) >>> tag_distance('pt', 'pt-PT') 5 >>> # Serbian has two scripts, and people might prefer one but understand both >>> tag_distance('sr-Latn', 'sr-Cyrl') 5 A distance of 10 is used for matching a specific language to its more-commonly-used macrolanguage tag. >>> tag_distance('arz', 'ar') # Egyptian Arabic to Modern Standard Arabic 10 >>> tag_distance('wuu', 'zh') # Wu Chinese to (Mandarin) Chinese 10 Higher distances can arrive due to particularly contentious differences in the script for writing the language, where people who understand one script can learn the other but may not be happy with it. This specifically applies to Chinese. >>> tag_distance('zh-TW', 'zh-CN') 54 >>> tag_distance('zh-Hans', 'zh-Hant') 54 >>> tag_distance('zh-CN', 'zh-HK') 54 >>> tag_distance('zh-CN', 'zh-TW') 54 >>> tag_distance('zh-Hant', 'zh-Hans') 54 This distance range also applies to the differences between Norwegian Bokmål, Nynorsk, and Danish. >>> tag_distance('no', 'da') 12 >>> tag_distance('no', 'nn') 20 Differences of 20 to 50 can represent substantially different languages, in cases where speakers of the first may understand the second for demographic reasons. >>> tag_distance('eu', 'es') # Basque to Spanish 20 >>> tag_distance('af', 'nl') # Afrikaans to Dutch 24 >>> tag_distance('mr', 'hi') # Marathi to Hindi 30 >>> tag_distance('ms', 'id') # Malay to Indonesian 34 >>> tag_distance('mg', 'fr') # Malagasy to French 34 >>> tag_distance('ta', 'en') # Tamil to English 44 A complex example is the tag 'yue' for Cantonese. Written Chinese is usually presumed to be Mandarin Chinese, but colloquial Cantonese can be written as well. (Some things could not be written any other way, such as Cantonese song lyrics.) The difference between Cantonese and Mandarin also implies script and territory differences by default, adding to the distance. >>> tag_distance('yue', 'zh') 64 When the supported script is a different one than desired, this is usually a major difference with score of 50 or more. >>> tag_distance('ja', 'ja-Latn-US-hepburn') 54 >>> # You can read the Shavian script, right? >>> tag_distance('en', 'en-Shaw') 54 """ desired_obj = Language.get(desired) supported_obj = Language.get(supported) return desired_obj.distance(supported_obj) def best_match( desired_language: Union[str, Language], supported_languages: Sequence[str], min_score: int = 75, ) -> Tuple[str, int]: """ DEPRECATED: use .closest_match() instead. This function emulates the old matching behavior by subtracting the language distance from 100. You have software that supports any of the `supported_languages`. You want to use `desired_language`. This function lets you choose the right language, even if there isn't an exact match. Returns: - The best-matching language code, which will be one of the `supported_languages` or 'und' - The score of the match, from 0 to 100; higher is better. `min_score` sets the minimum match score. If all languages match with a lower score than that, the result will be 'und' with a score of 0. """ max_distance = 100 - min_score supported, distance = closest_match( desired_language, supported_languages, max_distance ) score = max(0, 100 - distance) return supported, score def closest_match( desired_language: Union[str, Language], supported_languages: Sequence[str], max_distance: int = 25, ) -> Tuple[str, int]: """ You have software that supports any of the `supported_languages`. You want to use `desired_language`. This function lets you choose the right language, even if there isn't an exact match. Returns: - The best-matching language code, which will be one of the `supported_languages` or 'und' for no match - The distance of the match, which is 0 for a perfect match and increases from there (see `tag_distance`) `max_distance` sets the maximum match distance. If all matches are farther than that, the result will be 'und' with a distance of 1000. The default value is 25, and raising it can cause data to be processed in significantly the wrong language. The documentation for `tag_distance` describes the distance values in more detail. When there is a tie for the best matching language, the first one in the tie will be used. >>> closest_match('fr', ['de', 'en', 'fr']) ('fr', 0) >>> closest_match('pt', ['pt-BR', 'pt-PT']) ('pt-BR', 0) >>> closest_match('en-AU', ['en-GB', 'en-US']) ('en-GB', 3) >>> closest_match('af', ['en', 'nl', 'zu']) ('nl', 24) >>> closest_match('ja', ['ja-Latn-hepburn', 'en']) ('und', 1000) """ desired_language = str(desired_language) # Quickly return if the desired language is directly supported if desired_language in supported_languages: return desired_language, 0 # Reduce the desired language to a standard form that could also match desired_language = standardize_tag(desired_language) if desired_language in supported_languages: return desired_language, 0 match_distances = [ (supported, tag_distance(desired_language, supported)) for supported in supported_languages ] match_distances = [ (supported, distance) for (supported, distance) in match_distances if distance <= max_distance ] + [('und', 1000)] match_distances.sort(key=itemgetter(1)) return match_distances[0] def closest_supported_match( desired_language: Union[str, Language], supported_languages: Sequence[str], max_distance: int = 25, ) -> Optional[str]: """ Wraps `closest_match` with a simpler return type. Returns the language tag of the closest match if there is one, or None if there is not. >>> closest_supported_match('fr', ['de', 'en', 'fr']) 'fr' >>> closest_supported_match('pt', ['pt-BR', 'pt-PT']) 'pt-BR' >>> closest_supported_match('en-AU', ['en-GB', 'en-US']) 'en-GB' >>> closest_supported_match('und', ['en', 'und']) 'und' >>> closest_supported_match('af', ['en', 'nl', 'zu']) 'nl' >>> print(closest_supported_match('af', ['en', 'nl', 'zu'], max_distance=10)) None """ code, distance = closest_match(desired_language, supported_languages, max_distance) if distance == 1000: return None else: return code