SickGear/lib/hachoir/field/string_field.py

"""
String field classes:
- String: Fixed length string (no prefix/no suffix) ;
- CString: String which ends with nul byte ("\0") ;
- UnixLine: Unix line of text, string which ends with "\n" ;
- PascalString8, PascalString16, PascalString32: String prefixed with
  length written in a 8, 16, 32-bit integer (use parent endian).

Constructor has optional arguments:
- strip: value can be a string or True ;
- charset: if set, convert string to unicode using this charset (in "replace"
  mode which replace all buggy characters with ".").

Note: For PascalStringXX, prefixed value is the number of bytes and not
      of characters!
"""

from hachoir.field import FieldError, Bytes
from hachoir.core.endian import LITTLE_ENDIAN, BIG_ENDIAN
from hachoir.core.tools import alignValue, makePrintable
from hachoir.core.i18n import guessBytesCharset
from hachoir.core import config
from codecs import BOM_UTF16_LE, BOM_UTF16_BE, BOM_UTF32_LE, BOM_UTF32_BE

# Default charset used to convert byte string to Unicode
# This charset is used if no charset is specified or on conversion error
FALLBACK_CHARSET = "ISO-8859-1"


class GenericString(Bytes):
    """
    Generic string class.

    charset have to be in CHARSET_8BIT or in UTF_CHARSET.
    """

    VALID_FORMATS = ("C", "UnixLine",
                     "fixed", "Pascal8", "Pascal16", "Pascal32")

    # 8-bit charsets
    CHARSET_8BIT = set((
        "ASCII",          # ANSI X3.4-1968
        "MacRoman",
        "CP037",          # EBCDIC 037
        "CP874",          # Thai
        "WINDOWS-1250",   # Central Europe
        "WINDOWS-1251",   # Cyrillic
        "WINDOWS-1252",   # Latin I
        "WINDOWS-1253",   # Greek
        "WINDOWS-1254",   # Turkish
        "WINDOWS-1255",   # Hebrew
        "WINDOWS-1256",   # Arabic
        "WINDOWS-1257",   # Baltic
        "WINDOWS-1258",   # Vietnam
        "ISO-8859-1",     # Latin-1
        "ISO-8859-2",     # Latin-2
        "ISO-8859-3",     # Latin-3
        "ISO-8859-4",     # Latin-4
        "ISO-8859-5",
        "ISO-8859-6",
        "ISO-8859-7",
        "ISO-8859-8",
        "ISO-8859-9",     # Latin-5
        "ISO-8859-10",    # Latin-6
        "ISO-8859-11",    # Thai
        "ISO-8859-13",    # Latin-7
        "ISO-8859-14",    # Latin-8
        "ISO-8859-15",    # Latin-9 or ("Latin-0")
        "ISO-8859-16",    # Latin-10
    ))

    # UTF-xx charset familly
    UTF_CHARSET = {
        "UTF-8": (8, None),
        "UTF-16-LE": (16, LITTLE_ENDIAN),
        "UTF-32LE": (32, LITTLE_ENDIAN),
        "UTF-16-BE": (16, BIG_ENDIAN),
        "UTF-32BE": (32, BIG_ENDIAN),
        "UTF-16": (16, "BOM"),
        "UTF-32": (32, "BOM"),
    }

    # UTF-xx BOM => charset with endian
    UTF_BOM = {
        16: {BOM_UTF16_LE: "UTF-16-LE", BOM_UTF16_BE: "UTF-16-BE"},
        32: {BOM_UTF32_LE: "UTF-32LE", BOM_UTF32_BE: "UTF-32BE"},
    }

    # Suffix format: value is suffix (string)
    SUFFIX_FORMAT = {
        "C": {
            8: {LITTLE_ENDIAN: b"\0", BIG_ENDIAN: b"\0"},
            16: {LITTLE_ENDIAN: b"\0\0", BIG_ENDIAN: b"\0\0"},
            32: {LITTLE_ENDIAN: b"\0\0\0\0", BIG_ENDIAN: b"\0\0\0\0"},
        },
        "UnixLine": {
            8: {LITTLE_ENDIAN: b"\n", BIG_ENDIAN: b"\n"},
            16: {LITTLE_ENDIAN: b"\n\0", BIG_ENDIAN: b"\0\n"},
            32: {LITTLE_ENDIAN: b"\n\0\0\0", BIG_ENDIAN: b"\0\0\0\n"},
        },

    }

    # Pascal format: value is the size of the prefix in bits
    PASCAL_FORMATS = {
        "Pascal8": 1,
        "Pascal16": 2,
        "Pascal32": 4
    }

    # Raw value: with prefix and suffix, not stripped,
    # and not converted to Unicode
    _raw_value = None

    def __init__(self, parent, name, format, description=None,
                 strip=None, charset=None, nbytes=None, truncate=None):
        Bytes.__init__(self, parent, name, 1, description)

        # Is format valid?
        assert format in self.VALID_FORMATS

        # Store options
        self._format = format
        self._strip = strip
        self._truncate = truncate

        # Check charset and compute character size in bytes
        # (or None when it's not possible to guess character size)
        if not charset or charset in self.CHARSET_8BIT:
            self._character_size = 1   # one byte per character
        elif charset in self.UTF_CHARSET:
            self._character_size = None
        else:
            raise FieldError("Invalid charset for %s: \"%s\"" %
                             (self.path, charset))
        self._charset = charset

        # It is a fixed string?
        if nbytes is not None:
            assert self._format == "fixed"
            # Arbitrary limits, just to catch some bugs...
            if not (1 <= nbytes <= 0xffff):
                raise FieldError("Invalid string size for %s: %s" %
                                 (self.path, nbytes))
            self._content_size = nbytes   # content length in bytes
            self._size = nbytes * 8
            self._content_offset = 0
        else:
            # Format with a suffix: Find the end of the string
            if self._format in self.SUFFIX_FORMAT:
                self._content_offset = 0

                # Choose the suffix
                suffix = self.suffix_str

                # Find the suffix
                length = self._parent.stream.searchBytesLength(
                    suffix, False, self.absolute_address)
                if length is None:
                    raise FieldError("Unable to find end of string %s (format %s)!"
                                     % (self.path, self._format))
                if 1 < len(suffix):
                    # Fix length for little endian bug with UTF-xx charset:
                    #   u"abc" -> "a\0b\0c\0\0\0" (UTF-16-LE)
                    #   search returns length=5, whereas real lenght is 6
                    length = alignValue(length, len(suffix))

                # Compute sizes
                self._content_size = length  # in bytes
                self._size = (length + len(suffix)) * 8

            # Format with a prefix: Read prefixed length in bytes
            else:
                assert self._format in self.PASCAL_FORMATS

                # Get the prefix size
                prefix_size = self.PASCAL_FORMATS[self._format]
                self._content_offset = prefix_size

                # Read the prefix and compute sizes
                value = self._parent.stream.readBits(
                    self.absolute_address, prefix_size * 8, self._parent.endian)
                self._content_size = value   # in bytes
                self._size = (prefix_size + value) * 8

        # For UTF-16 and UTF-32, choose the right charset using BOM
        if self._charset in self.UTF_CHARSET:
            # Charset requires a BOM?
            bomsize, endian = self.UTF_CHARSET[self._charset]
            if endian == "BOM":
                # Read the BOM value
                nbytes = bomsize // 8
                bom = self._parent.stream.readBytes(
                    self.absolute_address, nbytes)

                # Choose right charset using the BOM
                bom_endian = self.UTF_BOM[bomsize]
                if bom not in bom_endian:
                    raise FieldError("String %s has invalid BOM (%s)!"
                                     % (self.path, repr(bom)))
                self._charset = bom_endian[bom]
                self._content_size -= nbytes
                self._content_offset += nbytes

        # Compute length in character if possible
        if self._character_size:
            self._length = self._content_size // self._character_size
        else:
            self._length = None

    @staticmethod
    def staticSuffixStr(format, charset, endian):
        if format not in GenericString.SUFFIX_FORMAT:
            return ''
        suffix = GenericString.SUFFIX_FORMAT[format]
        if charset in GenericString.UTF_CHARSET:
            suffix_size = GenericString.UTF_CHARSET[charset][0]
            suffix = suffix[suffix_size]
        else:
            suffix = suffix[8]
        return suffix[endian]

    def _getSuffixStr(self):
        return self.staticSuffixStr(
            self._format, self._charset, self._parent.endian)
    suffix_str = property(_getSuffixStr)

    def _convertText(self, text):
        if not self._charset:
            # charset is still unknown: guess the charset
            self._charset = guessBytesCharset(text, default=FALLBACK_CHARSET)

        # Try to convert to Unicode
        try:
            return str(text, self._charset, "strict")
        except UnicodeDecodeError as exc:
            err = exc

        # --- Conversion error ---

        # Fix truncated UTF-16 string like 'B\0e' (3 bytes)
        # => Add missing nul byte: 'B\0e\0' (4 bytes)
        if err.reason == "truncated data" \
                and err.end == len(text) \
                and self._charset == "UTF-16-LE":
            try:
                text = str(text + b"\0", self._charset, "strict")
                self.warning(
                    "Fix truncated %s string: add missing nul byte" % self._charset)
                return text
            except UnicodeDecodeError:
                pass

        # On error, use FALLBACK_CHARSET
        self.warning("Unable to convert string to Unicode: %s" % err)
        return str(text, FALLBACK_CHARSET, "strict")

    def _guessCharset(self):
        addr = self.absolute_address + self._content_offset * 8
        bytes = self._parent.stream.readBytes(addr, self._content_size)
        return guessBytesCharset(bytes, default=FALLBACK_CHARSET)

    def createValue(self, human=True):
        # Compress data address (in bits) and size (in bytes)
        if human:
            addr = self.absolute_address + self._content_offset * 8
            size = self._content_size
        else:
            addr = self.absolute_address
            size = self._size // 8
        if size == 0:
            # Empty string
            return ""

        # Read bytes in data stream
        text = self._parent.stream.readBytes(addr, size)

        # Don't transform data?
        if not human:
            return text

        # Convert text to Unicode
        text = self._convertText(text)

        # Truncate
        if self._truncate:
            pos = text.find(self._truncate)
            if 0 <= pos:
                text = text[:pos]

        # Strip string if needed
        if self._strip:
            if isinstance(self._strip, str):
                text = text.strip(self._strip)
            else:
                text = text.strip()
        assert isinstance(text, str)
        return text

    def createDisplay(self, human=True):
        if not human:
            if self._raw_value is None:
                self._raw_value = GenericString.createValue(self, False)
            value = makePrintable(self._raw_value, "ASCII")
        elif self._charset:
            value = makePrintable(self.value, "ISO-8859-1")
        else:
            value = self.value
        if config.max_string_length < len(value):
            # Truncate string if needed
            value = "%s(...)" % value[:config.max_string_length]
        if not self._charset or not human:
            return makePrintable(value, "ASCII", quote='"')
        else:
            if value:
                return '"%s"' % value.replace('"', '\\"')
            else:
                return "(empty)"

    def createRawDisplay(self):
        return GenericString.createDisplay(self, human=False)

    def _getLength(self):
        if self._length is None:
            self._length = len(self.value)
        return self._length
    length = property(_getLength, doc="String length in characters")

    def _getFormat(self):
        return self._format
    format = property(_getFormat, doc="String format (eg. 'C')")

    def _getCharset(self):
        if not self._charset:
            self._charset = self._guessCharset()
        return self._charset
    charset = property(_getCharset, doc="String charset (eg. 'ISO-8859-1')")

    def _getContentSize(self):
        return self._content_size
    content_size = property(_getContentSize, doc="Content size in bytes")

    def _getContentOffset(self):
        return self._content_offset
    content_offset = property(_getContentOffset, doc="Content offset in bytes")

    def getFieldType(self):
        info = self.charset
        if self._strip:
            if isinstance(self._strip, str):
                info += ",strip=%s" % makePrintable(
                    self._strip, "ASCII", quote="'")
            else:
                info += ",strip=True"
        return "%s<%s>" % (Bytes.getFieldType(self), info)


def stringFactory(name, format, doc):
    class NewString(GenericString):
        __doc__ = doc

        def __init__(self, parent, name, description=None,
                     strip=None, charset=None, truncate=None):
            GenericString.__init__(self, parent, name, format, description,
                                   strip=strip, charset=charset, truncate=truncate)
    cls = NewString
    cls.__name__ = name
    return cls


# String which ends with nul byte ("\0")
CString = stringFactory("CString", "C",
                        r"""C string: string ending with nul byte.
See GenericString to get more information.""")

# Unix line of text: string which ends with "\n" (ASCII 0x0A)
UnixLine = stringFactory("UnixLine", "UnixLine",
                         r"""Unix line: string ending with "\n" (ASCII code 10).
See GenericString to get more information.""")

# String prefixed with length written in a 8-bit integer
PascalString8 = stringFactory("PascalString8", "Pascal8",
                              r"""Pascal string: string prefixed with 8-bit integer containing its length (endian depends on parent endian).
See GenericString to get more information.""")

# String prefixed with length written in a 16-bit integer (use parent endian)
PascalString16 = stringFactory("PascalString16", "Pascal16",
                               r"""Pascal string: string prefixed with 16-bit integer containing its length (endian depends on parent endian).
See GenericString to get more information.""")

# String prefixed with length written in a 32-bit integer (use parent endian)
PascalString32 = stringFactory("PascalString32", "Pascal32",
                               r"""Pascal string: string prefixed with 32-bit integer containing its length (endian depends on parent endian).
See GenericString to get more information.""")


class String(GenericString):
    """
    String with fixed size (size in bytes).
    See GenericString to get more information.
    """
    static_size = staticmethod(lambda *args, **kw: args[1] * 8)

    def __init__(self, parent, name, nbytes, description=None,
                 strip=None, charset=None, truncate=None):
        GenericString.__init__(self, parent, name, "fixed", description,
                               strip=strip, charset=charset, nbytes=nbytes, truncate=truncate)


String.__name__ = "FixedString"