# -*- coding: UTF-8 -*-
"""
Functions to manage internationalisation (i18n):
- initLocale(): setup locales and install Unicode compatible stdout and
  stderr ;
- getTerminalCharset(): guess terminal charset ;
- gettext(text) translate a string to current language. The function always
  returns Unicode string. You can also use the alias: _() ;
- ngettext(singular, plural, count): translate a sentence with singular and
  plural form. The function always returns Unicode string.

WARNING: Loading this module indirectly calls initLocale() which sets
         locale LC_ALL to ''. This is needed to get user preferred locale
         settings.
"""

import hachoir.core.config as config
import hachoir.core
import locale
from os import path
import sys
from codecs import BOM_UTF8, BOM_UTF16_LE, BOM_UTF16_BE


def _getTerminalCharset():
    """
    Function used by getTerminalCharset() to get terminal charset.

    Returns the first non-empty charset name found, or "ASCII" when every
    probe fails.

    @see getTerminalCharset()
    """
    # (1) Try locale.getpreferredencoding()
    try:
        charset = locale.getpreferredencoding()
        if charset:
            return charset
    except (locale.Error, AttributeError):
        pass

    # (2) Try locale.nl_langinfo(CODESET)
    try:
        charset = locale.nl_langinfo(locale.CODESET)
        if charset:
            return charset
    except (locale.Error, AttributeError):
        pass

    # (3) Try sys.stdout.encoding
    if hasattr(sys.stdout, "encoding") and sys.stdout.encoding:
        return sys.stdout.encoding

    # (4) Otherwise, returns "ASCII"
    return "ASCII"


def getTerminalCharset():
    """
    Guess terminal charset using different tests:
    1. Try locale.getpreferredencoding()
    2. Try locale.nl_langinfo(CODESET)
    3. Try sys.stdout.encoding
    4. Otherwise, returns "ASCII"

    The result is computed once and cached on the function object
    (getTerminalCharset.value); later calls return the cached value.

    WARNING: Call initLocale() before calling this function.
    """
    try:
        return getTerminalCharset.value
    except AttributeError:
        getTerminalCharset.value = _getTerminalCharset()
        return getTerminalCharset.value


class UnicodeStdout(object):
    """
    File-like wrapper which encodes Unicode strings to the given charset
    (using the 'replace' error handler) before writing them to the wrapped
    device. Byte strings are written unchanged.
    """

    def __init__(self, old_device, charset):
        # old_device: wrapped file-like object (needs write() and flush())
        # charset: name of the encoding applied to Unicode strings
        self.device = old_device
        self.charset = charset

    def __getattr__(self, name):
        """
        Delegate attributes not defined by the wrapper (isatty, fileno,
        encoding, close, ...) to the wrapped device, so that code using
        sys.stdout / sys.stderr as a regular file keeps working after
        initLocale() has installed the wrapper.
        """
        # Only called when normal lookup fails: if "device" itself is
        # missing (instance created without __init__), don't recurse.
        if name in ("device", "charset"):
            raise AttributeError(name)
        return getattr(self.device, name)

    def flush(self):
        """Flush the wrapped device."""
        self.device.flush()

    def write(self, text):
        """Write text, encoding it first if it is a Unicode string."""
        if isinstance(text, unicode):
            text = text.encode(self.charset, 'replace')
        self.device.write(text)

    def writelines(self, lines):
        """Write each item of lines using write()."""
        for text in lines:
            self.write(text)


def initLocale():
    """
    Setup the locale (LC_ALL set to user preferences) and, when
    config.unicode_stdout is set and the readline module is not loaded,
    replace sys.stdout and sys.stderr by UnicodeStdout objects.

    The setup is done only once: later calls only return the charset.

    Returns the terminal charset (see getTerminalCharset()).
    """
    # Only initialize locale once
    if initLocale.is_done:
        return getTerminalCharset()
    initLocale.is_done = True

    # Setup locales
    try:
        locale.setlocale(locale.LC_ALL, "")
    except (locale.Error, IOError):
        # Best effort: keep the current locale if the user one is invalid
        pass

    # Get the terminal charset
    charset = getTerminalCharset()

    # UnicodeStdout conflicts with the readline module
    if config.unicode_stdout and ('readline' not in sys.modules):
        # Replace stdout and stderr by unicode object supporting unicode string
        sys.stdout = UnicodeStdout(sys.stdout, charset)
        sys.stderr = UnicodeStdout(sys.stderr, charset)
    return charset
initLocale.is_done = False


def _dummy_gettext(text):
    """Fallback gettext(): return text as Unicode, without translation."""
    return unicode(text)


def _dummy_ngettext(singular, plural, count):
    """
    Fallback ngettext(): return singular or plural as Unicode, without
    translation. The plural form is used when abs(count) > 1 and also
    when count is zero.
    """
    if 1 < abs(count) or not count:
        return unicode(plural)
    else:
        return unicode(singular)


def _initGettext():
    """
    Initialize the locale and create the translation functions.

    Returns a tuple (gettext, ngettext) of functions which always return
    Unicode strings. Dummy (untranslated) functions are returned when
    config.use_i18n is false or when the gettext module can't be imported.
    """
    charset = initLocale()

    # Try to load gettext module
    if config.use_i18n:
        try:
            import gettext
            ok = True
        except ImportError:
            ok = False
    else:
        ok = False

    # gettext is not available or not needed: use dummy gettext functions
    if not ok:
        return (_dummy_gettext, _dummy_ngettext)

    # Gettext variables
    package = 'hachoir'
    locale_dir = path.join(path.dirname(__file__), "..", "locale")

    # Initialize gettext module
    gettext.bindtextdomain(package, locale_dir)
    gettext.textdomain(package)
    translate = gettext.gettext
    ngettext = gettext.ngettext

    # TODO: find a native function returning Unicode strings directly
    def _toUnicode(text):
        # An untranslated message is returned unchanged by gettext: if the
        # caller gave a Unicode string it must not be decoded again
        # (unicode(u"...", charset) raises TypeError).
        if isinstance(text, unicode):
            return text
        return unicode(text, charset)

    def unicode_gettext(text):
        return _toUnicode(translate(text))

    def unicode_ngettext(singular, plural, count):
        return _toUnicode(ngettext(singular, plural, count))

    return (unicode_gettext, unicode_ngettext)


# Byte order marks and the charset each of them announces
UTF_BOMS = (
    (BOM_UTF8, "UTF-8"),
    (BOM_UTF16_LE, "UTF-16-LE"),
    (BOM_UTF16_BE, "UTF-16-BE"),
)

# Set of valid characters for specific charset
CHARSET_CHARACTERS = (
    # U+00E0: LATIN SMALL LETTER A WITH GRAVE
    (set(u"©®éêè\xE0ç".encode("ISO-8859-1")), "ISO-8859-1"),
    (set(u"©®éêè\xE0ç€".encode("ISO-8859-15")), "ISO-8859-15"),
    (set(u"©®".encode("MacRoman")), "MacRoman"),
    (set(u"εδηιθκμοΡσςυΈί".encode("ISO-8859-7")), "ISO-8859-7"),
)


def guessBytesCharset(bytes, default=None):
    r"""
    Guess the charset of a byte string. Tests are done in this order:
    UTF BOM (see UTF_BOMS), pure ASCII, valid UTF-8, then the first entry
    of CHARSET_CHARACTERS whose character set contains all non-ASCII bytes
    of the string. Returns default if no test matches.

    >>> guessBytesCharset("abc")
    'ASCII'
    >>> guessBytesCharset("\xEF\xBB\xBFabc")
    'UTF-8'
    >>> guessBytesCharset("abc\xC3\xA9")
    'UTF-8'
    >>> guessBytesCharset("File written by Adobe Photoshop\xA8 4.0\0")
    'MacRoman'
    >>> guessBytesCharset("\xE9l\xE9phant")
    'ISO-8859-1'
    >>> guessBytesCharset("100 \xA4")
    'ISO-8859-15'
    >>> guessBytesCharset('Word \xb8\xea\xe4\xef\xf3\xe7 - Microsoft Outlook 97 - \xd1\xf5\xe8\xec\xdf\xf3\xe5\xe9\xf2 e-mail')
    'ISO-8859-7'
    """
    # Check for UTF BOM
    for bom_bytes, charset in UTF_BOMS:
        if bytes.startswith(bom_bytes):
            return charset

    # Pure ASCII?
    try:
        unicode(bytes, 'ASCII', 'strict')
    except UnicodeDecodeError:
        pass
    else:
        return 'ASCII'

    # Valid UTF-8?
    try:
        unicode(bytes, 'UTF-8', 'strict')
    except UnicodeDecodeError:
        pass
    else:
        return 'UTF-8'

    # Create a set of non-ASCII characters
    non_ascii_set = set(byte for byte in bytes if ord(byte) >= 128)
    for characters, charset in CHARSET_CHARACTERS:
        if characters.issuperset(non_ascii_set):
            return charset
    return default


# Initialize _(), gettext() and ngettext() functions
gettext, ngettext = _initGettext()
_ = gettext