mirror of
https://github.com/SickGear/SickGear.git
synced 2024-11-27 23:23:38 +00:00
113 lines
3.8 KiB
Python
113 lines
3.8 KiB
Python
|
# import marisa_trie
|
|||
|
import warnings
|
|||
|
|
|||
|
from language_data.util import data_filename
|
|||
|
|
|||
|
|
|||
|
TRIES = {}
|
|||
|
|
|||
|
# This is something we could hypothetically discover from XML files, but
|
|||
|
# we end up learning that most languages separate things with commas, with
|
|||
|
# a few exceptions. We'll just put those exceptions here.
|
|||
|
DISPLAY_SEPARATORS = {
|
|||
|
'am': '፣',
|
|||
|
'ar': '، ',
|
|||
|
'brx': ',',
|
|||
|
'fa': '، ',
|
|||
|
'ja': '、',
|
|||
|
'my': '၊ ',
|
|||
|
'ug': '، ',
|
|||
|
'und': ', ',
|
|||
|
'ur': '، ',
|
|||
|
'yue': ',',
|
|||
|
'zh': ',',
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
# Character table for normalize_name(): the typographic apostrophe becomes an
# ASCII apostrophe, a hyphen becomes a space, and parentheses and commas are
# removed outright.
_NAME_CHAR_TABLE = str.maketrans("’-", "' ", "(),")


def normalize_name(name):
    """
    Reduce a name to the canonical form used for lookups.

    Differences of case and of certain punctuation are ignored when a
    language-code component is looked up by name, so that
    "Chinese (Traditional)", "Chinese Traditional" and "chinese traditional"
    all normalize to the same string.

    The name is case-folded, passed through the character table above, and
    finally stripped of leading and trailing whitespace.
    """
    folded = name.casefold()
    return folded.translate(_NAME_CHAR_TABLE).strip()
|
|||
|
|
|||
|
|
|||
|
# NOTE(review): this loader is commented out, as is the marisa_trie import at
# the top of the file, but name_to_code() needs a `load_trie` function to
# populate TRIES on a cache miss. Restore both before relying on name lookups.
#
# def load_trie(filename):
#     """
#     Load a BytesTrie from the marisa_trie on-disk format.
#     """
#     trie = marisa_trie.BytesTrie()
#     # marisa_trie raises warnings that make no sense. Ignore them.
#     with warnings.catch_warnings():
#         warnings.simplefilter("ignore")
#         trie.load(filename)
#     return trie
|
|||
|
|
|||
|
|
|||
|
def get_trie_value(trie, key):
    """
    Return the first value stored under `key` in `trie`, as a Unicode string.

    The entry found for `key` is indexed at position 0 and that item is
    decoded from UTF-8. A key that is absent is not handled here: the error
    raised by the trie's own lookup (a KeyError for a BytesTrie) propagates
    to the caller.
    """
    stored_values = trie[key]
    first_value = stored_values[0]
    return first_value.decode("utf-8")
|
|||
|
|
|||
|
|
|||
|
def name_to_code(category, name, language: str = "und"):
    """
    Get a language, script, or territory by its name in some language.

    The language here must be a string representing a language subtag only.
    The `Language.find` method can handle other representations of a language
    and normalize them to this form.

    The default language, "und", will allow matching names in any language,
    so you can get the code 'fr' by looking up "French", "Français", or
    "francés".

    A small amount of fuzzy matching is supported: if the name can be
    shortened or lengthened to match a single language name, you get that
    language. This allows, for example, "Hakka Chinese" to match "Hakka".

    Occasionally, names are ambiguous in a way that can be resolved by
    specifying what name the language is supposed to be in. For example,
    there is a language named 'Malayo' in English, but it's different from
    the language named 'Malayo' in Spanish (which is Malay). Specifying the
    language will look up the name in a trie that is only in that language.

    Parameters:
        category: the kind of code wanted; it is interpolated into the trie
            name as "<language>/name_to_<category>".
        name: the name to look up. It is passed through `normalize_name`
            before being matched.
        language: the language subtag selecting which trie to search.

    Returns:
        The code stored for the name, decoded as a string, or None when
        neither the normalized name nor a prefix of it that is at least four
        characters long is present in the trie.

    Raises:
        AssertionError: if `language` contains "/" or "-".
        NameError: if the trie is not cached in `TRIES` yet and this module
            provides no `load_trie` function to load it.
    """
    # Raise explicitly instead of using `assert` statements: asserts are
    # removed under `python -O`, and `language` ends up inside a file path
    # below. The exception type and messages are unchanged.
    if "/" in language:
        raise AssertionError("Language codes cannot contain slashes")
    if "-" in language:
        raise AssertionError("This code should be reduced to a language subtag only")

    trie_name = "{}/name_to_{}".format(language, category)
    if trie_name not in TRIES:
        # The loader is looked up at call time so that a missing `load_trie`
        # is reported with an explanation rather than as a bare NameError.
        loader = globals().get("load_trie")
        if loader is None:
            raise NameError(
                "name 'load_trie' is not defined: no trie loader is available "
                "in this module, so {!r} cannot be loaded".format(trie_name)
            )
        TRIES[trie_name] = loader(data_filename("trie/{}.marisa".format(trie_name)))

    trie = TRIES[trie_name]
    lookup = normalize_name(name)
    if lookup in trie:
        return get_trie_value(trie, lookup)

    # Is this a language name plus extra verbiage? Maybe it has "...isch",
    # "... language", or "... Chinese" attached to it, for example. Look
    # for a matching prefix of the desired name with at least 4 characters.
    # The last prefix returned is the candidate used (presumably the longest
    # one -- confirm against the trie implementation).
    prefixes = trie.prefixes(lookup)
    if prefixes and len(prefixes[-1]) >= 4:
        return get_trie_value(trie, prefixes[-1])
    return None
|
|||
|
|
|||
|
|
|||
|
def code_to_names(code):
    """
    Look up the names of a language, script, or territory code.

    Returns the dictionary of names, in various languages, stored for `code`
    in `language_data.name_data.CODE_TO_NAMES`, or an empty dictionary when
    the code has no entry there.
    """
    # Imported here rather than at module level so the name data is only
    # loaded when it is actually needed, which saves memory when possible.
    from language_data import name_data

    names_by_code = name_data.CODE_TO_NAMES
    return names_by_code.get(code, {})
|