SickGear/lib/unidecode/__init__.py

# -*- coding: utf-8 -*-
# vi:tabstop=4:expandtab:sw=4
"""Transliterate Unicode text into plain 7-bit ASCII.

Example usage:

>>> from unidecode import unidecode
>>> unidecode("\u5317\u4EB0")
"Bei Jing "

The transliteration uses a straightforward map, and doesn't have alternatives
for the same character based on language, position, or anything else.

A standard string object will be returned. If you need bytes, use:

>>> unidecode("Κνωσός").encode("ascii")
b'Knosos'
"""
import warnings
from typing import Dict, Optional, Sequence

# Lazily filled lookup cache keyed by section number (codepoint >> 8).
# Each value is the ``data`` sequence taken from the matching
# ``unidecode.xNNN`` module, or None when that module could not be imported.
Cache: Dict[int, Optional[Sequence[Optional[str]]]] = {}

class UnidecodeError(ValueError):
    """Raised for Unidecode-related errors.

    The ``index`` attribute holds the index of the character that caused
    the error, or None when no index was supplied.
    """

    def __init__(self, message: str, index: Optional[int] = None) -> None:
        """Store the error message and the optional offending index."""
        super().__init__(message)
        self.index = index


def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate a Unicode string into an ASCII string.

    >>> unidecode("\u5317\u4EB0")
    'Bei Jing '

    This function first tries to encode the string with the ASCII codec.
    If that succeeds the input is returned unchanged. If it fails (because
    of non-ASCII characters), it falls back to transliteration using the
    character tables.

    This is approx. five times faster if the string only contains ASCII
    characters, but slightly slower than unidecode_expect_nonascii if
    non-ASCII characters are present.

    errors specifies what to do with characters that have not been
    found in replacement tables. The default is 'ignore' which ignores
    the character. 'strict' raises an UnidecodeError. 'replace'
    substitutes the character with replace_str (default is '?').
    'preserve' keeps the original character.

    Note that if 'preserve' is used the returned string might not be
    ASCII!
    """

    try:
        # Only success or failure of the encode matters; the encoded bytes
        # themselves are not needed.
        string.encode('ASCII')
    except UnicodeEncodeError:
        # At least one non-ASCII character: fall through to the table lookup.
        pass
    else:
        # Pure ASCII input is already its own transliteration.
        return string

    return _unidecode(string, errors, replace_str)

def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate a Unicode string into an ASCII string.

    >>> unidecode("\u5317\u4EB0")
    "Bei Jing "

    Unlike unidecode_expect_ascii, no ASCII-only shortcut is attempted:
    the string is always handed straight to the table-based
    transliteration. The errors and replace_str arguments are passed
    through unchanged; see unidecode_expect_ascii for their meaning.
    """

    transliterated = _unidecode(string, errors, replace_str)
    return transliterated

# Default public entry point: an alias for the variant that first tries the
# ASCII codec before falling back to the character tables.
unidecode = unidecode_expect_ascii

def _get_repl_str(char: str) -> Optional[str]:
    codepoint = ord(char)

    if codepoint < 0x80:
        # Already ASCII
        return str(char)

    if codepoint > 0xeffff:
        # No data on characters in Private Use Area and above.
        return None

    if 0xd800 <= codepoint <= 0xdfff:
        warnings.warn(  "Surrogate character %r will be ignored. "
                        "You might be using a narrow Python build." % (char,),
                        RuntimeWarning, 2)

    section = codepoint >> 8   # Chop off the last two hex digits
    position = codepoint % 256 # Last two hex digits

    try:
        table = Cache[section]
    except KeyError:
        try:
            mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data'])
        except ImportError:
            # No data on this character
            Cache[section] = None
            return None

        Cache[section] = table = mod.data

    if table and len(table) > position:
        return table[position]
    else:
        return None

def _unidecode(string: str, errors: str, replace_str:str) -> str:
    retval = []

    for index, char in enumerate(string):
        repl = _get_repl_str(char)

        if repl is None:
            if errors == 'ignore':
                repl = ''
            elif errors == 'strict':
                raise UnidecodeError('no replacement found for character %r '
                        'in position %d' % (char, index), index)
            elif errors == 'replace':
                repl = replace_str
            elif errors == 'preserve':
                repl = char
            else:
                raise UnidecodeError('invalid value for errors parameter %r' % (errors,))

        retval.append(repl)

    return ''.join(retval)
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`# -- coding: utf-8 --`
			`# vi:tabstop=4:expandtab:sw=4`
			`"""Transliterate Unicode text into plain 7-bit ASCII.`

			`Example usage:`
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`>>> from unidecode import unidecode`
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`>>> unidecode("\u5317\u4EB0")`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`"Bei Jing "`

			`The transliteration uses a straightforward map, and doesn't have alternatives`
			`for the same character based on language, position, or anything else.`

Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`A standard string object will be returned. If you need bytes, use:`

Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`>>> unidecode("Κνωσός").encode("ascii")`
			`b'Knosos'`
			`"""`
			`import warnings`
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`from typing import Dict, Optional, Sequence`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
Update unidecode module 1.3.6 (4141992) → 1.3.8 (dfe397d). 2024-06-07 16:42:01 +00:00			`Cache: Dict[int, Optional[Sequence[Optional[str]]]] = {}`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`class UnidecodeError(ValueError):`
			`def __init__(self, message: str, index: Optional[int] = None) -> None:`
			`"""Raised for Unidecode-related errors.`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`The index attribute contains the index of the character that caused`
			`the error.`
			`"""`
			`super(UnidecodeError, self).__init__(message)`
			`self.index = index`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00

Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`"""Transliterate an Unicode object into an ASCII string`

Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`>>> unidecode("\u5317\u4EB0")`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`"Bei Jing "`

			`This function first tries to convert the string using ASCII codec.`
			`If it fails (because of non-ASCII characters), it falls back to`
			`transliteration using the character tables.`

			`This is approx. five times faster if the string only contains ASCII`
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`characters, but slightly slower than unicode_expect_nonascii if`
			`non-ASCII characters are present.`

			`errors specifies what to do with characters that have not been`
			`found in replacement tables. The default is 'ignore' which ignores`
			`the character. 'strict' raises an UnidecodeError. 'replace'`
			`substitutes the character with replace_str (default is '?').`
			`'preserve' keeps the original character.`

			`Note that if 'preserve' is used the returned string might not be`
			`ASCII!`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`"""`

			`try:`
			`bytestring = string.encode('ASCII')`
			`except UnicodeEncodeError:`
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`pass`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`else:`
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`return string`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`return _unidecode(string, errors, replace_str)`

			`def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`"""Transliterate an Unicode object into an ASCII string`

Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`>>> unidecode("\u5317\u4EB0")`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`"Bei Jing "`
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00
			`See unidecode_expect_ascii.`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`"""`

Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`return _unidecode(string, errors, replace_str)`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
			`unidecode = unidecode_expect_ascii`

Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`def _get_repl_str(char: str) -> Optional[str]:`
			`codepoint = ord(char)`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`if codepoint < 0x80:`
			`# Already ASCII`
			`return str(char)`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`if codepoint > 0xeffff:`
			`# No data on characters in Private Use Area and above.`
			`return None`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`if 0xd800 <= codepoint <= 0xdfff:`
			`warnings.warn( "Surrogate character %r will be ignored. "`
			`"You might be using a narrow Python build." % (char,),`
			`RuntimeWarning, 2)`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`section = codepoint >> 8 # Chop off the last two hex digits`
			`position = codepoint % 256 # Last two hex digits`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`try:`
			`table = Cache[section]`
			`except KeyError:`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`try:`
Update unidecode module 1.1.1 (632af82) → 1.3.6 (4141992). 2023-01-12 20:12:58 +00:00			`mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data'])`
			`except ImportError:`
			`# No data on this character`
			`Cache[section] = None`
			`return None`

			`Cache[section] = table = mod.data`

			`if table and len(table) > position:`
			`return table[position]`
			`else:`
			`return None`

			`def _unidecode(string: str, errors: str, replace_str:str) -> str:`
			`retval = []`

			`for index, char in enumerate(string):`
			`repl = _get_repl_str(char)`

			`if repl is None:`
			`if errors == 'ignore':`
			`repl = ''`
			`elif errors == 'strict':`
			`raise UnidecodeError('no replacement found for character %r '`
			`'in position %d' % (char, index), index)`
			`elif errors == 'replace':`
			`repl = replace_str`
			`elif errors == 'preserve':`
			`repl = char`
			`else:`
			`raise UnidecodeError('invalid value for errors parameter %r' % (errors,))`

			`retval.append(repl)`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
			`return ''.join(retval)`