# Mirror of https://github.com/SickGear/SickGear.git
# Synced 2024-12-22 18:53:38 +00:00 (189 lines, 5.6 KiB, Python)
|
from .data_dicts import LANGUAGE_DISTANCES
|
||
|
from typing import Dict, Tuple
|
||
|
|
||
|
|
||
|
# A (language, script, territory) triple, such as the one obtained by
# 'maximizing' a language tag.
TagTriple = Tuple[str, str, str]

# Memo of distances that have already been computed, keyed by the ordered
# pair (desired, supported).
_DISTANCE_CACHE: Dict[Tuple[TagTriple, TagTriple], int] = {}

# Fallback distances, used when no more specific rule applies. The language
# and script fallbacks are read from the wildcard entries of the distance
# table; the territory fallback is a fixed value.
DEFAULT_LANGUAGE_DISTANCE = LANGUAGE_DISTANCES["*"]["*"]
DEFAULT_SCRIPT_DISTANCE = LANGUAGE_DISTANCES["*_*"]["*_*"]
DEFAULT_TERRITORY_DISTANCE = 4
|
||
|
|
||
|
|
||
|
# Territory clusters used in territory matching:
# Maghreb (the western Arab world)
MAGHREB = {"MA", "DZ", "TN", "LY", "MR", "EH"}

# United States and its territories
US = {"AS", "GU", "MH", "MP", "PR", "UM", "US", "VI"}

# Special Autonomous Regions of China
CNSAR = {"HK", "MO"}

# Latin America: the numeric region code 419 plus the Central and South
# American regions and the territories listed under them.
LATIN_AMERICA = {
    "419",
    # Central America
    "013",
    "BZ",
    "CR",
    "SV",
    "GT",
    "HN",
    "MX",
    "NI",
    "PA",
    # South America
    "005",
    "AR",
    "BO",
    "BR",
    "CL",
    "CO",
    "EC",
    "FK",
    "GF",
    "GY",
    "PY",
    "PE",
    "SR",
    "UY",
    "VE",
}

# North and South America
AMERICAS = {
    "019",
    # Caribbean
    "029",
    "AI",
    "AG",
    "AW",
    "BS",
    "BB",
    "VG",
    "BQ",
    "KY",
    "CU",
    "CW",
    "DM",
    "DO",
    "GD",
    "GP",
    "HT",
    "JM",
    "MQ",
    "MS",
    "PR",
    "SX",
    "BL",
    "KN",
    "LC",
    "MF",
    "VC",
    "TT",
    "TC",
    "VI",
    # Northern America
    "021",
    "BM",
    "CA",
    "GL",
    "PM",
    "US",
    # North America as a whole
    "003",
} | LATIN_AMERICA
|
||
|
|
||
|
|
||
|
def tuple_distance_cached(desired: TagTriple, supported: TagTriple) -> int:
    """
    Takes in triples of (language, script, territory), which can be derived by
    'maximizing' a language tag. Returns a number from 0 to 135 indicating the
    'distance' between these for the purposes of language matching.

    Computed distances are remembered in the module-level `_DISTANCE_CACHE`,
    keyed by the ordered pair `(desired, supported)`, so each distinct pair is
    only computed once.
    """
    # First of all, if these are identical, return quickly:
    if supported == desired:
        return 0

    # If we've already figured it out, return the cached distance. Cached
    # values are always ints, so None reliably signals a miss and the cache
    # only has to be probed once.
    key = (desired, supported)
    result = _DISTANCE_CACHE.get(key)
    if result is None:
        result = _tuple_distance(desired, supported)
        _DISTANCE_CACHE[key] = result
    return result
|
||
|
|
||
|
|
||
|
def _get2(dictionary: dict, key1: str, key2: str, default):
    """
    Look up `dictionary[key1][key2]` in a two-level dictionary.

    Returns `default` if either `key1` is missing from the outer dictionary
    or `key2` is missing from the inner one. The dictionary is not modified.
    """
    return dictionary.get(key1, {}).get(key2, default)
|
||
|
|
||
|
|
||
|
def _tuple_distance(desired: TagTriple, supported: TagTriple) -> int:
    """
    Compute the (uncached) distance between two (language, script, territory)
    triples as the sum of a language, a script, and a territory component.

    A component only contributes when the corresponding parts differ. The
    language and script components are looked up in LANGUAGE_DISTANCES,
    falling back to DEFAULT_LANGUAGE_DISTANCE and DEFAULT_SCRIPT_DISTANCE.
    The territory component starts at DEFAULT_TERRITORY_DISTANCE and is
    adjusted by the hard-coded rules below, which only apply when language
    and script are the same on both sides.
    """
    desired_language, desired_script, desired_territory = desired
    supported_language, supported_script, supported_territory = supported
    distance = 0

    if desired_language != supported_language:
        distance += _get2(
            LANGUAGE_DISTANCES,
            desired_language,
            supported_language,
            DEFAULT_LANGUAGE_DISTANCE,
        )

    desired_script_pair = f"{desired_language}_{desired_script}"
    supported_script_pair = f"{supported_language}_{supported_script}"

    if desired_script != supported_script:
        # Scripts can match other scripts, but only when paired with a
        # language. For example, there is no reason to assume someone who can
        # read 'Latn' can read 'Cyrl', but there is plenty of reason to believe
        # someone who can read 'sr-Latn' can read 'sr-Cyrl' because Serbian is
        # a language written in two scripts.
        distance += _get2(
            LANGUAGE_DISTANCES,
            desired_script_pair,
            supported_script_pair,
            DEFAULT_SCRIPT_DISTANCE,
        )

    if desired_territory != supported_territory:
        # The rules for matching territories are too weird to implement the
        # general case efficiently. Instead of implementing all the possible
        # match rules the XML could define, we just reimplement the rules of
        # CLDR 36.1 here in code.

        tdist = DEFAULT_TERRITORY_DISTANCE
        if desired_script_pair == supported_script_pair:
            if desired_language == "ar":
                # Crossing the Maghreb / non-Maghreb boundary is farther.
                if (desired_territory in MAGHREB) != (supported_territory in MAGHREB):
                    tdist = 5
            elif desired_language == "en":
                # 'GB' paired with any territory outside the US cluster is
                # closer than the default; crossing the US / non-US boundary
                # is farther.
                if (desired_territory == "GB") and (supported_territory not in US):
                    tdist = 3
                elif (desired_territory not in US) and (supported_territory == "GB"):
                    tdist = 3
                elif (desired_territory in US) != (supported_territory in US):
                    tdist = 5
            # This is not a rule that's spelled out in CLDR, but is implied by things
            # about territory containment mentioned in other standards. Numeric values
            # for territories, like '003', represent broad regions that contain more
            # specific territories.
            #
            # 419 is the numeric value most often seen in language codes, particularly
            # 'es-419' for Latin American Spanish. If you have a language code that
            # differs only in that its territory is more specific, like 'es-PY', it should
            # be closer to a supported 'es-419' than anything with a territory difference.
            #
            # We can implement this for 419 without becoming responsible for keeping up
            # with which countries/territories/regions contain others in the general case.
            elif desired_territory in LATIN_AMERICA and supported_territory == "419":
                tdist = 1
            elif desired_language in ("es", "pt"):
                # Crossing the Americas / non-Americas boundary is farther.
                if (desired_territory in AMERICAS) != (supported_territory in AMERICAS):
                    tdist = 5
            elif desired_script_pair == "zh_Hant":
                # Crossing the Hong Kong-Macau / elsewhere boundary is farther.
                if (desired_territory in CNSAR) != (supported_territory in CNSAR):
                    tdist = 5
        distance += tdist
    return distance
|