# SickGear/lib/language_data/names.py

# import marisa_trie
import warnings

from language_data.util import data_filename


# Module-level cache of loaded tries. name_to_code() fills it lazily, keyed by
# "<language>/name_to_<category>", so each trie is loaded at most once.
TRIES = {}

# Separator strings keyed by language code. Note that some values carry a
# trailing space and some do not.
#
# This is something we could hypothetically discover from XML files, but
# we end up learning that most languages separate things with commas, with
# a few exceptions. We'll just put those exceptions here.
#
# NOTE(review): nothing in this file reads this table; presumably it is used
# elsewhere when joining name parts for display -- confirm against callers.
DISPLAY_SEPARATORS = {
    'am': '፣',
    'ar': '، ',
    'brx': ',',
    'fa': '، ',
    'ja': '、',
    'my': '၊ ',
    'ug': '، ',
    'und': ', ',
    'ur': '، ',
    'yue': '，',
    'zh': '，',
}


def normalize_name(name):
    """
    Reduce a name to the canonical form used as a lookup key.

    The name is case-folded, a typographic apostrophe becomes a plain one,
    hyphens become spaces, and parentheses and commas are dropped. Leading
    and trailing whitespace is removed last. As a result,
    "Chinese (Traditional)", "Chinese Traditional" and "chinese traditional"
    all normalize to the same string.
    """
    # Every rule is a single-character substitution (None deletes the
    # character), and no rule's output is another rule's input, so they can
    # all be applied in one pass.
    substitutions = str.maketrans({
        "’": "'",
        "-": " ",
        "(": None,
        ")": None,
        ",": None,
    })
    folded = name.casefold()
    return folded.translate(substitutions).strip()


# def load_trie(filename):
#     """
#     Load a BytesTrie from the marisa_trie on-disk format.
#     """
#     trie = marisa_trie.BytesTrie()
#     # marisa_trie raises warnings that make no sense. Ignore them.
#     with warnings.catch_warnings():
#         warnings.simplefilter("ignore")
#         trie.load(filename)
#     return trie


def get_trie_value(trie, key):
    """
    Return the first value stored under `key` in `trie`, decoded from UTF-8.

    `trie[key]` is expected to be an indexable collection of byte strings;
    only its first element is used. A missing key propagates whatever
    `trie[key]` raises (a KeyError for mapping-like tries).
    """
    stored_values = trie[key]
    first_value = stored_values[0]
    return first_value.decode("utf-8")


def name_to_code(category, name, language: str = "und"):
    """
    Look up the code of a language, script, or territory from one of its names.

    `category` selects which "name_to_<category>" trie is consulted, and
    `language` selects the language the name is written in. `language` must
    be a bare language subtag: it may contain neither "/" nor "-". The
    `Language.find` method can handle other representations of a language
    and normalize them to this form.

    With the default language, "und", names in any language are matched, so
    the code 'fr' can be found from "French", "Français", or "francés".
    Passing a specific language restricts the lookup to names in that
    language, which resolves names that mean different things in different
    languages (for example 'Malayo' in English versus Spanish).

    The name is normalized with `normalize_name` and looked up exactly. If
    that fails, a limited fuzzy match is tried: when some entry of the trie
    is a prefix of the normalized name and is at least 4 characters long,
    that entry's code is returned. This lets "Hakka Chinese" match "Hakka".

    Returns the code as a string, or None when nothing matches.
    """
    assert "/" not in language, "Language codes cannot contain slashes"
    assert "-" not in language, "This code should be reduced to a language subtag only"

    # Each (language, category) trie is loaded on first use and then cached.
    cache_key = "{}/name_to_{}".format(language, category)
    if cache_key not in TRIES:
        # NOTE(review): load_trie is commented out in this file, so unless it
        # is provided from elsewhere this line raises NameError whenever the
        # trie is not already present in TRIES.
        TRIES[cache_key] = load_trie(data_filename("trie/{}.marisa".format(cache_key)))
    name_trie = TRIES[cache_key]

    wanted = normalize_name(name)
    if wanted in name_trie:
        return get_trie_value(name_trie, wanted)

    # No exact hit. The name may be a known name plus extra verbiage such as
    # "...isch", "... language", or "... Chinese", so fall back to the last
    # prefix the trie reports (presumably the longest one -- depends on the
    # trie's `prefixes` ordering), provided it has at least 4 characters.
    candidates = name_trie.prefixes(wanted)
    if not candidates or len(candidates[-1]) < 4:
        return None
    return get_trie_value(name_trie, candidates[-1])


def code_to_names(code):
    """
    Return the names of a language, script, or territory code as a dictionary
    covering various languages.

    The result is the entry for `code` in `language_data.name_data.CODE_TO_NAMES`;
    an empty dictionary is returned when the code has no entry.
    """
    # Imported here rather than at module level so the name data is only
    # loaded (and only costs memory) once this function is actually used.
    import language_data.name_data

    names_by_code = language_data.name_data.CODE_TO_NAMES
    return names_by_code.get(code, {})