SickGear/lib/langcodes/language_distance.py

from .data_dicts import LANGUAGE_DISTANCES
from typing import Dict, Tuple


# A language tag broken into its (language, script, territory) components.
TagTriple = Tuple[str, str, str]
# Memo of distances that have already been computed, keyed by the
# (desired, supported) pair of triples; filled in by tuple_distance_cached().
_DISTANCE_CACHE: Dict[Tuple[TagTriple, TagTriple], int] = {}
# Fallback distances, read from the wildcard entries of LANGUAGE_DISTANCES.
# _tuple_distance() uses them when no entry exists for a specific pair of
# languages or of language_script combinations.
DEFAULT_LANGUAGE_DISTANCE = LANGUAGE_DISTANCES["*"]["*"]
DEFAULT_SCRIPT_DISTANCE = LANGUAGE_DISTANCES["*_*"]["*_*"]
# Territory distance applied when two territories differ and none of the
# special-case territory rules in _tuple_distance() override it.
DEFAULT_TERRITORY_DISTANCE = 4


# Groups of territory codes that the territory-matching rules treat as a unit.

# The Maghreb, i.e. the western part of the Arab world
MAGHREB = {"DZ", "EH", "LY", "MA", "MR", "TN"}

# The United States together with its territories
US = {"US", "AS", "GU", "MH", "MP", "PR", "UM", "VI"}

# The Special Autonomous Regions of China
CNSAR = {"MO", "HK"}

# Latin America: the numeric region codes plus the individual territories of
# Central and South America.
# NOTE(review): Caribbean territories such as 'CU' or 'DO' are listed only in
# AMERICAS, so they do not get the '419' shortcut in _tuple_distance() --
# confirm that this is intended.
LATIN_AMERICA = {
    # Numeric region codes: Latin America, Central America, South America
    "419", "013", "005",
    # Territories of Central America
    "BZ", "CR", "GT", "HN", "MX", "NI", "PA", "SV",
    # Territories of South America
    "AR", "BO", "BR", "CL", "CO", "EC", "FK",
    "GF", "GY", "PE", "PY", "SR", "UY", "VE",
}

# North and South America
AMERICAS = LATIN_AMERICA | {
    # Numeric region codes: the Americas, North America as a whole,
    # Northern America, and the Caribbean
    "019", "003", "021", "029",
    # Territories of the Caribbean
    "AG", "AI", "AW", "BB", "BL", "BQ", "BS",
    "CU", "CW", "DM", "DO", "GD", "GP", "HT",
    "JM", "KN", "KY", "LC", "MF", "MQ", "MS",
    "PR", "SX", "TC", "TT", "VC", "VG", "VI",
    # Territories of Northern America
    "BM", "CA", "GL", "PM", "US",
}


def tuple_distance_cached(desired: TagTriple, supported: TagTriple) -> int:
    """
    Return the matching 'distance' between two (language, script, territory)
    triples, such as those obtained by 'maximizing' a language tag.

    The distance is a number from 0 to 135, where 0 means the triples are
    identical. Each result is remembered in _DISTANCE_CACHE, so any given
    (desired, supported) pair is only computed once.
    """
    # Identical triples need neither a computation nor a cache entry.
    if supported == desired:
        return 0

    cache_key = (desired, supported)
    try:
        return _DISTANCE_CACHE[cache_key]
    except KeyError:
        # Not seen before: fall through and compute it below.
        pass

    computed = _tuple_distance(desired, supported)
    _DISTANCE_CACHE[cache_key] = computed
    return computed


def _get2(dictionary: dict, key1: str, key2: str, default):
    return dictionary.get(key1, {}).get(key2, default)


def _tuple_distance(desired: TagTriple, supported: TagTriple) -> int:
    """
    Compute, without caching, the matching distance between two
    (language, script, territory) triples.

    The result is the sum of one penalty for each component that differs
    between `desired` and `supported`. Language and script penalties are
    looked up in LANGUAGE_DISTANCES, falling back to the wildcard defaults.
    The territory penalty comes from the hand-written rules below.
    """
    want_lang, want_script, want_region = desired
    have_lang, have_script, have_region = supported

    # Scripts are only comparable in the context of a language: being able to
    # read 'Latn' says nothing about reading 'Cyrl', but a reader of 'sr-Latn'
    # can plausibly read 'sr-Cyrl', since Serbian is written in both scripts.
    # Script distances are therefore keyed on "language_script" pairs.
    want_pair = f"{want_lang}_{want_script}"
    have_pair = f"{have_lang}_{have_script}"

    total = 0

    if want_lang != have_lang:
        total += _get2(
            LANGUAGE_DISTANCES, want_lang, have_lang, DEFAULT_LANGUAGE_DISTANCE
        )

    if want_script != have_script:
        total += _get2(
            LANGUAGE_DISTANCES, want_pair, have_pair, DEFAULT_SCRIPT_DISTANCE
        )

    # Matching territories add nothing further.
    if want_region == have_region:
        return total

    # The general territory-matching rules that the CLDR XML could express
    # are too irregular to implement efficiently, so only the concrete rules
    # of CLDR 36.1 are re-implemented here. They apply only when language and
    # script already agree; otherwise the default penalty is used.
    penalty = DEFAULT_TERRITORY_DISTANCE
    if want_pair == have_pair:
        if want_lang == "ar":
            # Arabic: crossing the Maghreb boundary costs more.
            if (want_region in MAGHREB) != (have_region in MAGHREB):
                penalty = 5
        elif want_lang == "en":
            want_in_us = want_region in US
            have_in_us = have_region in US
            # English: 'GB' is close to any territory outside the US
            # cluster, in either direction; crossing the US boundary
            # otherwise costs more.
            if (want_region == "GB" and not have_in_us) or (
                have_region == "GB" and not want_in_us
            ):
                penalty = 3
            elif want_in_us != have_in_us:
                penalty = 5
        # The next rule is not spelled out in CLDR. It follows from territory
        # containment as described in other standards: numeric territory
        # values such as '003' stand for broad regions that contain more
        # specific territories.
        #
        # '419' is the numeric value seen most often in language codes,
        # notably 'es-419' for Latin American Spanish. A desired code that
        # differs only by naming a more specific territory, such as 'es-PY',
        # should be closer to a supported 'es-419' than to anything else
        # with a territory difference.
        #
        # Handling just 419 avoids having to track which territories and
        # regions contain which others in the general case.
        elif have_region == "419" and want_region in LATIN_AMERICA:
            penalty = 1
        elif want_lang in ("es", "pt"):
            # Spanish and Portuguese: crossing the Americas boundary costs more.
            if (want_region in AMERICAS) != (have_region in AMERICAS):
                penalty = 5
        elif want_pair == "zh_Hant":
            # Traditional Chinese: Hong Kong and Macau form their own cluster.
            if (want_region in CNSAR) != (have_region in CNSAR):
                penalty = 5

    return total + penalty
Add langcodes 3.3.0 and language-data 1.1 (vendor'd from `pip install langcodes[data]`) Add a Select2 drop-down to `add-shows` and `edit-show`. The Select2 enables displaying inline language flag images, this feature deprecated by the native `select` drop-down element on some browsers. Change run existing TVInfo source language lists through validation (removes ~4 bad items), de-dupe list, get the native names, English names, and three letter abbr. Change remove marisa-trie requirement from language_data/names.py because nothing in SG calls a function that requires it. Change update some flags. 2023-01-26 03:30:07 +00:00			`from .data_dicts import LANGUAGE_DISTANCES`
			`from typing import Dict, Tuple`


			`TagTriple = Tuple[str, str, str]`
			`_DISTANCE_CACHE: Dict[Tuple[TagTriple, TagTriple], int] = {}`
			`DEFAULT_LANGUAGE_DISTANCE = LANGUAGE_DISTANCES["*"]["*"]`
			`DEFAULT_SCRIPT_DISTANCE = LANGUAGE_DISTANCES["*_*"]["*_*"]`
			`DEFAULT_TERRITORY_DISTANCE = 4`


			`# Territory clusters used in territory matching:`
			`# Maghreb (the western Arab world)`
			`MAGHREB = {"MA", "DZ", "TN", "LY", "MR", "EH"}`

			`# United States and its territories`
			`US = {"AS", "GU", "MH", "MP", "PR", "UM", "US", "VI"}`

			`# Special Autonomous Regions of China`
			`CNSAR = {"HK", "MO"}`

			`LATIN_AMERICA = {`
			`"419",`
			`# Central America`
			`"013",`
			`"BZ",`
			`"CR",`
			`"SV",`
			`"GT",`
			`"HN",`
			`"MX",`
			`"NI",`
			`"PA",`
			`# South America`
			`"005",`
			`"AR",`
			`"BO",`
			`"BR",`
			`"CL",`
			`"CO",`
			`"EC",`
			`"FK",`
			`"GF",`
			`"GY",`
			`"PY",`
			`"PE",`
			`"SR",`
			`"UY",`
			`"VE",`
			`}`

			`# North and South America`
			`AMERICAS = {`
			`"019",`
			`# Caribbean`
			`"029",`
			`"AI",`
			`"AG",`
			`"AW",`
			`"BS",`
			`"BB",`
			`"VG",`
			`"BQ",`
			`"KY",`
			`"CU",`
			`"CW",`
			`"DM",`
			`"DO",`
			`"GD",`
			`"GP",`
			`"HT",`
			`"JM",`
			`"MQ",`
			`"MS",`
			`"PR",`
			`"SX",`
			`"BL",`
			`"KN",`
			`"LC",`
			`"MF",`
			`"VC",`
			`"TT",`
			`"TC",`
			`"VI",`
			`# Northern America`
			`"021",`
			`"BM",`
			`"CA",`
			`"GL",`
			`"PM",`
			`"US",`
			`# North America as a whole`
			`"003",`
			`} \| LATIN_AMERICA`


			`def tuple_distance_cached(desired: TagTriple, supported: TagTriple) -> int:`
			`"""`
			`Takes in triples of (language, script, territory), which can be derived by`
			`'maximizing' a language tag. Returns a number from 0 to 135 indicating the`
			`'distance' between these for the purposes of language matching.`
			`"""`
			`# First of all, if these are identical, return quickly:`
			`if supported == desired:`
			`return 0`

			`# If we've already figured it out, return the cached distance.`
			`if (desired, supported) in _DISTANCE_CACHE:`
			`return _DISTANCE_CACHE[desired, supported]`
			`else:`
			`result = _tuple_distance(desired, supported)`
			`_DISTANCE_CACHE[desired, supported] = result`
			`return result`


			`def _get2(dictionary: dict, key1: str, key2: str, default):`
			`return dictionary.get(key1, {}).get(key2, default)`


			`def _tuple_distance(desired: TagTriple, supported: TagTriple) -> int:`
			`desired_language, desired_script, desired_territory = desired`
			`supported_language, supported_script, supported_territory = supported`
			`distance = 0`

			`if desired_language != supported_language:`
			`distance += _get2(`
			`LANGUAGE_DISTANCES,`
			`desired_language,`
			`supported_language,`
			`DEFAULT_LANGUAGE_DISTANCE,`
			`)`

			`desired_script_pair = f"{desired_language}_{desired_script}"`
			`supported_script_pair = f"{supported_language}_{supported_script}"`

			`if desired_script != supported_script:`
			`# Scripts can match other scripts, but only when paired with a`
			`# language. For example, there is no reason to assume someone who can`
			`# read 'Latn' can read 'Cyrl', but there is plenty of reason to believe`
			`# someone who can read 'sr-Latn' can read 'sr-Cyrl' because Serbian is`
			`# a language written in two scripts.`
			`distance += _get2(`
			`LANGUAGE_DISTANCES,`
			`desired_script_pair,`
			`supported_script_pair,`
			`DEFAULT_SCRIPT_DISTANCE,`
			`)`

			`if desired_territory != supported_territory:`
			`# The rules for matching territories are too weird to implement the`
			`# general case efficiently. Instead of implementing all the possible`
			`# match rules the XML could define, instead we just reimplement the`
			`# rules of CLDR 36.1 here in code.`

			`tdist = DEFAULT_TERRITORY_DISTANCE`
			`if desired_script_pair == supported_script_pair:`
			`if desired_language == "ar":`
			`if (desired_territory in MAGHREB) != (supported_territory in MAGHREB):`
			`tdist = 5`
			`elif desired_language == "en":`
			`if (desired_territory == "GB") and (supported_territory not in US):`
			`tdist = 3`
			`elif (desired_territory not in US) and (supported_territory == "GB"):`
			`tdist = 3`
			`elif (desired_territory in US) != (supported_territory in US):`
			`tdist = 5`
			`# This is not a rule that's spelled out in CLDR, but is implied by things`
			`# about territory containment mentioned in other standards. Numeric values`
			`# for territories, like '003', represent broad regions that contain more`
			`# specific territories.`
			`#`
			`# 419 is the numeric value most often seen in language codes, particularly`
			`# 'es-419' for Latin American Spanish. If you have a language code that`
			`# differs only in that its territory is more specific, like 'es-PY', it should`
			`# be closer to a supported 'es-419' than anything with a territory difference.`
			`#`
			`# We can implement this for 419 without becoming responsible for keeping up`
			`# with which countries/territories/regions contain others in the general case.`
			`elif desired_territory in LATIN_AMERICA and supported_territory == "419":`
			`tdist = 1`
			`elif desired_language == "es" or desired_language == "pt":`
			`if (desired_territory in AMERICAS) != (supported_territory in AMERICAS):`
			`tdist = 5`
			`elif desired_script_pair == "zh_Hant":`
			`if (desired_territory in CNSAR) != (supported_territory in CNSAR):`
			`tdist = 5`
			`distance += tdist`
			`return distance`