SickGear/lib/hachoir/core/i18n.py

139 lines
3.6 KiB
Python
Raw Permalink Normal View History

"""
Functions to manage internationalisation (i18n):
- initLocale(): set up locales and install Unicode compatible stdout and
stderr ;
- getTerminalCharset(): guess terminal charset ;
- guessBytesCharset(): guess the charset of a byte string ;
WARNING: Loading this module indirectly calls initLocale() which sets
locale LC_ALL to ''. This is needed to get user preferred locale
settings.
"""
import locale
import sys
from codecs import BOM_UTF8, BOM_UTF16_LE, BOM_UTF16_BE
def _getTerminalCharset():
    """
    Work out the terminal charset; uncached helper for getTerminalCharset().

    The sources are consulted in order and the first non-empty answer wins:
    locale.getpreferredencoding(), then locale.nl_langinfo(CODESET), then
    sys.stdout.encoding. When none of them yields a value, "ASCII" is
    returned.

    @see getTerminalCharset()
    """
    # Locale based probes. Each is wrapped in a callable so that a missing
    # attribute (AttributeError) is raised inside the try block below and
    # simply moves on to the next probe.
    locale_probes = (
        lambda: locale.getpreferredencoding(),
        lambda: locale.nl_langinfo(locale.CODESET),
    )
    for probe in locale_probes:
        try:
            found = probe()
        except (locale.Error, AttributeError):
            continue
        if found:
            return found

    # Fall back on the encoding advertised by stdout, when it has one
    stream_charset = getattr(sys.stdout, "encoding", None)
    if stream_charset:
        return stream_charset

    # Nothing usable was found
    return "ASCII"
def getTerminalCharset():
    """
    Guess the terminal charset using different tests:
    1. Try locale.getpreferredencoding()
    2. Try locale.nl_langinfo(CODESET)
    3. Try sys.stdout.encoding
    4. Otherwise, returns "ASCII"

    The guess is computed on the first call only and stored on the function
    object (attribute "value"); later calls return the stored value.

    WARNING: Call initLocale() before calling this function.
    """
    if not hasattr(getTerminalCharset, "value"):
        # First call: run the detection and remember its result
        getTerminalCharset.value = _getTerminalCharset()
    return getTerminalCharset.value
def initLocale():
    """
    Set locale LC_ALL to '' (the user preferred settings), at most once.

    The first call marks the initialisation as done and then tries
    locale.setlocale(); a locale.Error or IOError raised by that call is
    ignored. Every later call returns immediately. Always returns None.
    """
    if not initLocale.is_done:
        # Flag first, so the setup is never attempted a second time, even
        # when setlocale() fails
        initLocale.is_done = True
        try:
            locale.setlocale(locale.LC_ALL, "")
        except (locale.Error, IOError):
            pass


# State flag read and updated by initLocale()
initLocale.is_done = False
# (byte order mark, charset name) pairs, in the order they are tested
UTF_BOMS = tuple(zip(
    (BOM_UTF8, BOM_UTF16_LE, BOM_UTF16_BE),
    ("UTF-8", "UTF-16-LE", "UTF-16-BE"),
))
# Set of valid (non-ASCII) byte values for specific charsets, stored as
# (set of byte values, charset name) pairs. Each set is obtained by encoding
# a short sample of characters with the charset itself.
# Note: "\xE0" is U+00E0, LATIN SMALL LETTER A WITH GRAVE
CHARSET_CHARACTERS = tuple(
    (set(sample.encode(charset)), charset)
    for sample, charset in (
        ("©®éêè\xE0ç", "ISO-8859-1"),
        ("©®éêè\xE0ç€", "ISO-8859-15"),
        ("©®", "MacRoman"),
        ("εδηιθκμοΡσςυΈί", "ISO-8859-7"),
    )
)
def guessBytesCharset(data, default=None):
    r"""
    Guess the charset of the byte string data.

    Tests, in order: a leading BOM listed in UTF_BOMS, strict ASCII
    decoding, strict UTF-8 decoding, and finally the first entry of
    CHARSET_CHARACTERS whose byte set contains every byte >= 128 found in
    data. Returns the charset name, or default when no test matches.

    >>> guessBytesCharset(b"abc")
    'ASCII'
    >>> guessBytesCharset(b"\xEF\xBB\xBFabc")
    'UTF-8'
    >>> guessBytesCharset(b"abc\xC3\xA9")
    'UTF-8'
    >>> guessBytesCharset(b"File written by Adobe Photoshop\xA8 4.0\0")
    'MacRoman'
    >>> guessBytesCharset(b"\xE9l\xE9phant")
    'ISO-8859-1'
    >>> guessBytesCharset(b"100 \xA4")
    'ISO-8859-15'
    >>> guessBytesCharset(b'Word \xb8\xea\xe4\xef\xf3\xe7'
    ...                   b' - Microsoft Outlook 97'
    ...                   b' - \xd1\xf5\xe8\xec\xdf\xf3\xe5\xe9\xf2 e-mail')
    'ISO-8859-7'
    """
    # A leading byte order mark settles the question immediately
    for bom, bom_charset in UTF_BOMS:
        if data.startswith(bom):
            return bom_charset

    # Strict decoding attempts: pure ASCII first, then valid UTF-8
    for codec, codec_charset in (("ascii", "ASCII"), ("utf-8", "UTF-8")):
        try:
            data.decode(codec, "strict")
        except UnicodeDecodeError:
            continue
        return codec_charset

    # Compare the non-ASCII byte values against each known charset set
    high_bytes = {octet for octet in data if octet >= 128}
    return next(
        (name for allowed, name in CHARSET_CHARACTERS
         if allowed.issuperset(high_bytes)),
        default,
    )