mirror of
https://github.com/SickGear/SickGear.git
synced 2024-11-15 09:25:04 +00:00
9009cc7a7b
Add a Select2 drop-down to `add-shows` and `edit-show`. The Select2 enables displaying inline language flag images, this feature deprecated by the native `select` drop-down element on some browsers. Change run existing TVInfo source language lists through validation (removes ~4 bad items), de-dupe list, get the native names, English names, and three letter abbr. Change remove marisa-trie requirement from language_data/names.py because nothing in SG calls a function that requires it. Change update some flags.
112 lines
3.8 KiB
Python
112 lines
3.8 KiB
Python
# import marisa_trie
|
||
import warnings
|
||
|
||
from language_data.util import data_filename
|
||
|
||
|
||
# Cache of tries that have already been loaded, keyed by trie name in the
# form "<language>/name_to_<category>". Filled in on demand by name_to_code().
TRIES = {}
|
||
|
||
# This is something we could hypothetically discover from XML files, but
# we end up learning that most languages separate things with commas, with
# a few exceptions. We'll just put those exceptions here.
#
# Keys are language codes; each value is the separator string for that
# language. Note that some separators include a trailing space and others
# do not.
DISPLAY_SEPARATORS = {
    'am': '፣',
    'ar': '، ',
    'brx': ',',
    'fa': '، ',
    'ja': '、',
    'my': '၊ ',
    'ug': '، ',
    'und': ', ',
    'ur': '، ',
    'yue': ',',
    'zh': ',',
}
|
||
|
||
|
||
# Single-character rewrites applied by normalize_name(): the typographic
# apostrophe becomes a plain one, a hyphen becomes a space, and parentheses
# and commas are deleted.
_NAME_CHAR_MAP = str.maketrans({
    "’": "'",
    "-": " ",
    "(": None,
    ")": None,
    ",": None,
})


def normalize_name(name):
    """
    Reduce a name to the canonical form used when looking it up.

    Differences of case and of certain punctuation are not significant when
    matching a language-code component by name, so "Chinese (Traditional)",
    "Chinese Traditional" and "chinese traditional" all normalize to the
    same string.
    """
    folded = name.casefold()
    # One translate() pass performs every character rewrite; surrounding
    # whitespace is trimmed last.
    return folded.translate(_NAME_CHAR_MAP).strip()
|
||
|
||
|
||
# def load_trie(filename):
|
||
# """
|
||
# Load a BytesTrie from the marisa_trie on-disk format.
|
||
# """
|
||
# trie = marisa_trie.BytesTrie()
|
||
# # marisa_trie raises warnings that make no sense. Ignore them.
|
||
# with warnings.catch_warnings():
|
||
# warnings.simplefilter("ignore")
|
||
# trie.load(filename)
|
||
# return trie
|
||
|
||
|
||
def get_trie_value(trie, key):
    """
    Look up `key` in a BytesTrie and return its first stored value as text.

    The stored bytes are decoded as UTF-8. A KeyError is raised when the
    trie holds no value for `key`.
    """
    stored_values = trie[key]
    first_value = stored_values[0]
    return first_value.decode("utf-8")
|
||
|
||
|
||
def name_to_code(category, name, language: str = "und"):
    """
    Get a language, script, or territory by its name in some language.

    The language here must be a string representing a language subtag only.
    The `Language.find` method can handle other representations of a language
    and normalize them to this form.

    The default language, "und", will allow matching names in any language,
    so you can get the code 'fr' by looking up "French", "Français", or
    "francés".

    A small amount of fuzzy matching is supported: if the name can be
    shortened or lengthened to match a single language name, you get that
    language. This allows, for example, "Hakka Chinese" to match "Hakka".

    Occasionally, names are ambiguous in a way that can be resolved by
    specifying what name the language is supposed to be in. For example,
    there is a language named 'Malayo' in English, but it's different from
    the language named 'Malayo' in Spanish (which is Malay). Specifying the
    language will look up the name in a trie that is only in that language.

    Returns the matching code as a string, or None when neither the
    normalized name nor a prefix of it that is at least 4 characters long is
    found in the trie.

    Raises NameError, with an explanatory message, when the required trie is
    not already cached in TRIES and `load_trie` is not defined in this
    module (it is currently commented out together with the marisa_trie
    import).
    """
    assert "/" not in language, "Language codes cannot contain slashes"
    assert "-" not in language, "This code should be reduced to a language subtag only"

    trie_name = "{}/name_to_{}".format(language, category)
    if trie_name not in TRIES:
        # `load_trie` is commented out of this module, so resolve it
        # explicitly and fail with a message that says why, instead of the
        # bare "name 'load_trie' is not defined". If the loader is ever
        # restored, this path works unchanged.
        try:
            loader = load_trie
        except NameError:
            raise NameError(
                "cannot load trie {!r}: load_trie() is disabled in this module "
                "because the marisa_trie dependency was removed".format(trie_name)
            ) from None
        TRIES[trie_name] = loader(data_filename("trie/{}.marisa".format(trie_name)))

    trie = TRIES[trie_name]
    lookup = normalize_name(name)
    if lookup in trie:
        return get_trie_value(trie, lookup)

    # Is this a language name plus extra verbiage? Maybe it has "...isch",
    # "... language", or "... Chinese" attached to it, for example. Look
    # for a matching prefix of the desired name with at least 4 characters.
    # The last entry returned by trie.prefixes() is used (presumably the
    # longest one -- verify against the trie implementation).
    prefixes = trie.prefixes(lookup)
    if prefixes and len(prefixes[-1]) >= 4:
        return get_trie_value(trie, prefixes[-1])
    return None
|
||
|
||
|
||
def code_to_names(code):
    """
    Return the names of a language, script, or territory code.

    The result is a dictionary of the code's names in various languages, or
    an empty dictionary when the code has no entry.
    """
    # Imported here rather than at module level to save memory when this
    # function is never called.
    from language_data.name_data import CODE_TO_NAMES

    return CODE_TO_NAMES.get(code, {})
|