SickGear/lib/hachoir/field/string_field.py

411 lines
15 KiB
Python
Raw Permalink Normal View History

"""
String field classes:
- String: Fixed length string (no prefix/no suffix) ;
- CString: String which ends with nul byte ("\0") ;
- UnixLine: Unix line of text, string which ends with "\n" ;
- PascalString8, PascalString16, PascalString32: String prefixed with
its length written in an 8, 16 or 32-bit integer (use parent endian).
Constructor has optional arguments:
- strip: value can be a string or True ;
- charset: if set, convert string to unicode using this charset (in "replace"
mode, which replaces all buggy characters with ".").
Note: For PascalStringXX, prefixed value is the number of bytes and not
of characters!
"""
from hachoir.field import FieldError, Bytes
from hachoir.core.endian import LITTLE_ENDIAN, BIG_ENDIAN
from hachoir.core.tools import alignValue, makePrintable
from hachoir.core.i18n import guessBytesCharset
from hachoir.core import config
from codecs import BOM_UTF16_LE, BOM_UTF16_BE, BOM_UTF32_LE, BOM_UTF32_BE
# Default charset used to convert a byte string to Unicode.
# It is passed as the default when the charset has to be guessed (no charset
# specified) and it is the last-resort decoder after a conversion error.
FALLBACK_CHARSET = "ISO-8859-1"
class GenericString(Bytes):
    """
    Generic string class.

    Supported formats (see VALID_FORMATS):
    - "fixed": fixed size, given in bytes by the nbytes argument ;
    - "C" and "UnixLine": content ends with a suffix (see SUFFIX_FORMAT) ;
    - "Pascal8", "Pascal16", "Pascal32": content is prefixed with its size
      in bytes (see PASCAL_FORMATS).

    charset has to be in CHARSET_8BIT or in UTF_CHARSET. If it is not set,
    it is guessed from the content the first time it is needed.
    """

    VALID_FORMATS = ("C", "UnixLine",
                     "fixed", "Pascal8", "Pascal16", "Pascal32")

    # 8-bit charsets
    CHARSET_8BIT = {
        "ASCII",         # ANSI X3.4-1968
        "MacRoman",
        "CP037",         # EBCDIC 037
        "CP874",         # Thai
        "WINDOWS-1250",  # Central Europe
        "WINDOWS-1251",  # Cyrillic
        "WINDOWS-1252",  # Latin I
        "WINDOWS-1253",  # Greek
        "WINDOWS-1254",  # Turkish
        "WINDOWS-1255",  # Hebrew
        "WINDOWS-1256",  # Arabic
        "WINDOWS-1257",  # Baltic
        "WINDOWS-1258",  # Vietnam
        "ISO-8859-1",    # Latin-1
        "ISO-8859-2",    # Latin-2
        "ISO-8859-3",    # Latin-3
        "ISO-8859-4",    # Latin-4
        "ISO-8859-5",
        "ISO-8859-6",
        "ISO-8859-7",
        "ISO-8859-8",
        "ISO-8859-9",    # Latin-5
        "ISO-8859-10",   # Latin-6
        "ISO-8859-11",   # Thai
        "ISO-8859-13",   # Latin-7
        "ISO-8859-14",   # Latin-8
        "ISO-8859-15",   # Latin-9 or ("Latin-0")
        "ISO-8859-16",   # Latin-10
    }

    # UTF-xx charset family: value is (code unit size in bits, endian)
    # where endian is None, an endian constant, or "BOM" when the endian
    # has to be read from the byte order mark stored in the content
    UTF_CHARSET = {
        "UTF-8": (8, None),
        "UTF-16-LE": (16, LITTLE_ENDIAN),
        "UTF-32LE": (32, LITTLE_ENDIAN),
        "UTF-16-BE": (16, BIG_ENDIAN),
        "UTF-32BE": (32, BIG_ENDIAN),
        "UTF-16": (16, "BOM"),
        "UTF-32": (32, "BOM"),
    }

    # UTF-xx BOM => charset with endian
    UTF_BOM = {
        16: {BOM_UTF16_LE: "UTF-16-LE", BOM_UTF16_BE: "UTF-16-BE"},
        32: {BOM_UTF32_LE: "UTF-32LE", BOM_UTF32_BE: "UTF-32BE"},
    }

    # Suffix format: value is the suffix (bytes), indexed by
    # code unit size in bits and then by endian
    SUFFIX_FORMAT = {
        "C": {
            8: {LITTLE_ENDIAN: b"\0", BIG_ENDIAN: b"\0"},
            16: {LITTLE_ENDIAN: b"\0\0", BIG_ENDIAN: b"\0\0"},
            32: {LITTLE_ENDIAN: b"\0\0\0\0", BIG_ENDIAN: b"\0\0\0\0"},
        },
        "UnixLine": {
            8: {LITTLE_ENDIAN: b"\n", BIG_ENDIAN: b"\n"},
            16: {LITTLE_ENDIAN: b"\n\0", BIG_ENDIAN: b"\0\n"},
            32: {LITTLE_ENDIAN: b"\n\0\0\0", BIG_ENDIAN: b"\0\0\0\n"},
        },
    }

    # Pascal format: value is the size of the prefix in bytes
    PASCAL_FORMATS = {
        "Pascal8": 1,
        "Pascal16": 2,
        "Pascal32": 4
    }

    # Raw value: with prefix and suffix, not stripped,
    # and not converted to Unicode
    _raw_value = None

    def __init__(self, parent, name, format, description=None,
                 strip=None, charset=None, nbytes=None, truncate=None):
        """
        Compute the size, content offset and content size of the string.

        - parent, name, description: forwarded to Bytes.__init__() ;
        - format: one of VALID_FORMATS ;
        - strip: if a string, characters stripped from both ends of the
          value; any other true value strips whitespace ;
        - charset: None, or a key of CHARSET_8BIT or UTF_CHARSET ;
        - nbytes: size in bytes (1..0xffff), only for the "fixed" format ;
        - truncate: if set, the value is cut at its first occurrence.

        Raise FieldError on an invalid charset, an invalid size, a missing
        suffix or a missing/invalid BOM.
        """
        # The size given here (1) is overwritten below by self._size
        Bytes.__init__(self, parent, name, 1, description)

        # Is format valid?
        assert format in self.VALID_FORMATS

        # Store options
        self._format = format
        self._strip = strip
        self._truncate = truncate

        # Check charset and compute character size in bytes
        # (or None when it's not possible to guess character size)
        if not charset or charset in self.CHARSET_8BIT:
            self._character_size = 1   # one byte per character
        elif charset in self.UTF_CHARSET:
            self._character_size = None
        else:
            raise FieldError("Invalid charset for %s: \"%s\"" %
                             (self.path, charset))
        self._charset = charset

        # It is a fixed string?
        if nbytes is not None:
            assert self._format == "fixed"
            # Arbitrary limits, just to catch some bugs...
            if not (1 <= nbytes <= 0xffff):
                raise FieldError("Invalid string size for %s: %s" %
                                 (self.path, nbytes))
            self._content_size = nbytes   # content length in bytes
            self._size = nbytes * 8
            self._content_offset = 0
        else:
            # Format with a suffix: Find the end of the string
            if self._format in self.SUFFIX_FORMAT:
                self._content_offset = 0

                # Choose the suffix
                suffix = self.suffix_str

                # Find the suffix
                length = self._parent.stream.searchBytesLength(
                    suffix, False, self.absolute_address)
                if length is None:
                    raise FieldError("Unable to find end of string %s (format %s)!"
                                     % (self.path, self._format))
                if 1 < len(suffix):
                    # Fix length for little endian bug with UTF-xx charset:
                    #   u"abc" -> "a\0b\0c\0\0\0" (UTF-16-LE)
                    #   search returns length=5, whereas real length is 6
                    length = alignValue(length, len(suffix))

                # Compute sizes
                self._content_size = length   # in bytes
                self._size = (length + len(suffix)) * 8

            # Format with a prefix: Read prefixed length in bytes
            else:
                assert self._format in self.PASCAL_FORMATS

                # Get the prefix size
                prefix_size = self.PASCAL_FORMATS[self._format]
                self._content_offset = prefix_size

                # Read the prefix and compute sizes
                value = self._parent.stream.readBits(
                    self.absolute_address, prefix_size * 8, self._parent.endian)
                self._content_size = value   # in bytes
                self._size = (prefix_size + value) * 8

        # For UTF-16 and UTF-32, choose the right charset using BOM
        if self._charset in self.UTF_CHARSET:
            # Charset requires a BOM?
            bomsize, endian = self.UTF_CHARSET[self._charset]
            if endian == "BOM":
                nbytes = bomsize // 8

                # The BOM is taken out of the content: refuse a content
                # which is too short instead of getting a negative size
                if self._content_size < nbytes:
                    raise FieldError("String %s is too short to contain a BOM!"
                                     % self.path)

                # Read the BOM value at the beginning of the content,
                # which is after the length prefix for Pascal strings
                bom = self._parent.stream.readBytes(
                    self.absolute_address + self._content_offset * 8, nbytes)

                # Choose right charset using the BOM
                bom_endian = self.UTF_BOM[bomsize]
                if bom not in bom_endian:
                    raise FieldError("String %s has invalid BOM (%s)!"
                                     % (self.path, repr(bom)))
                self._charset = bom_endian[bom]
                self._content_size -= nbytes
                self._content_offset += nbytes

        # Compute length in character if possible
        if self._character_size:
            self._length = self._content_size // self._character_size
        else:
            self._length = None

    @staticmethod
    def staticSuffixStr(format, charset, endian):
        """
        Get the suffix (bytes) of a string format.

        Return an empty byte string if the format has no suffix. For a
        charset of UTF_CHARSET, the suffix width is the code unit size of
        the charset, otherwise the 8-bit suffix is used.
        """
        if format not in GenericString.SUFFIX_FORMAT:
            # Same type (bytes) as the suffixes of SUFFIX_FORMAT
            return b''
        suffix = GenericString.SUFFIX_FORMAT[format]
        if charset in GenericString.UTF_CHARSET:
            suffix_size = GenericString.UTF_CHARSET[charset][0]
            suffix = suffix[suffix_size]
        else:
            suffix = suffix[8]
        return suffix[endian]

    def _getSuffixStr(self):
        """Suffix of this string, using the endian of the parent."""
        return self.staticSuffixStr(
            self._format, self._charset, self._parent.endian)
    suffix_str = property(_getSuffixStr)

    def _convertText(self, text):
        """
        Convert the content (bytes) to a Unicode string.

        Guess the charset if it is still unknown. On a decoding error, try
        to repair a truncated UTF-16-LE string, and finally decode using
        FALLBACK_CHARSET after emitting a warning.
        """
        if not self._charset:
            # charset is still unknown: guess the charset
            self._charset = guessBytesCharset(text, default=FALLBACK_CHARSET)

        # Try to convert to Unicode
        try:
            return str(text, self._charset, "strict")
        except UnicodeDecodeError as exc:
            # Keep a reference: "exc" is unbound at the end of the except block
            err = exc

        # --- Conversion error ---

        # Fix truncated UTF-16 string like 'B\0e' (3 bytes)
        # => Add missing nul byte: 'B\0e\0' (4 bytes)
        if err.reason == "truncated data" \
                and err.end == len(text) \
                and self._charset == "UTF-16-LE":
            try:
                text = str(text + b"\0", self._charset, "strict")
                self.warning(
                    "Fix truncated %s string: add missing nul byte" % self._charset)
                return text
            except UnicodeDecodeError:
                # The string is still invalid: use the fallback below
                pass

        # On error, use FALLBACK_CHARSET
        self.warning("Unable to convert string to Unicode: %s" % err)
        return str(text, FALLBACK_CHARSET, "strict")

    def _guessCharset(self):
        """Guess the charset from the content bytes (without prefix/BOM)."""
        addr = self.absolute_address + self._content_offset * 8
        data = self._parent.stream.readBytes(addr, self._content_size)
        return guessBytesCharset(data, default=FALLBACK_CHARSET)

    def createValue(self, human=True):
        """
        Read the value of the string from the stream.

        - human=True: content only, converted to Unicode, then truncated
          and stripped according to the constructor options (str) ;
        - human=False: the whole field as read from the stream, including
          prefix and suffix, without any conversion.

        Return "" when there is nothing to read.
        """
        # Compute data address (in bits) and size (in bytes)
        if human:
            addr = self.absolute_address + self._content_offset * 8
            size = self._content_size
        else:
            addr = self.absolute_address
            size = self._size // 8
        if size == 0:
            # Empty string
            return ""

        # Read bytes in data stream
        text = self._parent.stream.readBytes(addr, size)

        # Don't transform data?
        if not human:
            return text

        # Convert text to Unicode
        text = self._convertText(text)

        # Truncate
        if self._truncate:
            pos = text.find(self._truncate)
            if 0 <= pos:
                text = text[:pos]

        # Strip string if needed
        if self._strip:
            if isinstance(self._strip, str):
                text = text.strip(self._strip)
            else:
                text = text.strip()
        assert isinstance(text, str)
        return text

    def createDisplay(self, human=True):
        """
        Create the text displayed for the string.

        The text is cut to config.max_string_length characters (followed
        by "(...)") and written between double quotes. With human=True and
        a known charset, an empty value is displayed as "(empty)".
        """
        if not human:
            # The raw value is read once and then cached
            if self._raw_value is None:
                self._raw_value = GenericString.createValue(self, False)
            value = makePrintable(self._raw_value, "ASCII")
        elif self._charset:
            value = makePrintable(self.value, "ISO-8859-1")
        else:
            value = self.value
        if config.max_string_length < len(value):
            # Truncate string if needed
            value = "%s(...)" % value[:config.max_string_length]
        if not self._charset or not human:
            return makePrintable(value, "ASCII", quote='"')
        else:
            if value:
                return '"%s"' % value.replace('"', '\\"')
            else:
                return "(empty)"

    def createRawDisplay(self):
        """Create the text displayed for the raw value (human=False)."""
        return GenericString.createDisplay(self, human=False)

    def _getLength(self):
        """Length in characters, computed from the value if still unknown."""
        if self._length is None:
            self._length = len(self.value)
        return self._length
    length = property(_getLength, doc="String length in characters")

    def _getFormat(self):
        return self._format
    format = property(_getFormat, doc="String format (eg. 'C')")

    def _getCharset(self):
        """Charset of the string, guessed from the content if not set."""
        if not self._charset:
            self._charset = self._guessCharset()
        return self._charset
    charset = property(_getCharset, doc="String charset (eg. 'ISO-8859-1')")

    def _getContentSize(self):
        return self._content_size
    content_size = property(_getContentSize, doc="Content size in bytes")

    def _getContentOffset(self):
        return self._content_offset
    content_offset = property(_getContentOffset, doc="Content offset in bytes")

    def getFieldType(self):
        """
        Get the field type followed by the charset and the strip option,
        eg. "<type><ISO-8859-1,strip=True>" where <type> is the result of
        Bytes.getFieldType().
        """
        info = self.charset
        if self._strip:
            if isinstance(self._strip, str):
                info += ",strip=%s" % makePrintable(
                    self._strip, "ASCII", quote="'")
            else:
                info += ",strip=True"
        return "%s<%s>" % (Bytes.getFieldType(self), info)
def stringFactory(name, format, doc):
    """
    Create a GenericString subclass bound to one string format.

    - name: name given to the new class (__name__ and __qualname__) ;
    - format: format forwarded to GenericString.__init__() ;
    - doc: docstring of the new class.

    The constructor of the new class takes (parent, name, description=None,
    strip=None, charset=None, truncate=None).
    """
    class NewString(GenericString):
        __doc__ = doc

        def __init__(self, parent, name, description=None,
                     strip=None, charset=None, truncate=None):
            GenericString.__init__(self, parent, name, format, description,
                                   strip=strip, charset=charset, truncate=truncate)

    cls = NewString
    cls.__name__ = name
    # Rename the qualified name too: repr() of the class is built from it
    # and would still show "stringFactory.<locals>.NewString" otherwise
    cls.__qualname__ = name
    return cls
# Nul-terminated string (suffix "\0")
CString = stringFactory(
    name="CString",
    format="C",
    doc=r"""C string: string ending with nul byte.
See GenericString to get more information.""",
)

# Line of text terminated by "\n" (ASCII 0x0A)
UnixLine = stringFactory(
    name="UnixLine",
    format="UnixLine",
    doc=r"""Unix line: string ending with "\n" (ASCII code 10).
See GenericString to get more information.""",
)

# Length-prefixed string, length stored in an 8-bit integer
PascalString8 = stringFactory(
    name="PascalString8",
    format="Pascal8",
    doc=r"""Pascal string: string prefixed with 8-bit integer containing its length (endian depends on parent endian).
See GenericString to get more information.""",
)

# Length-prefixed string, length stored in a 16-bit integer (parent endian)
PascalString16 = stringFactory(
    name="PascalString16",
    format="Pascal16",
    doc=r"""Pascal string: string prefixed with 16-bit integer containing its length (endian depends on parent endian).
See GenericString to get more information.""",
)

# Length-prefixed string, length stored in a 32-bit integer (parent endian)
PascalString32 = stringFactory(
    name="PascalString32",
    format="Pascal32",
    doc=r"""Pascal string: string prefixed with 32-bit integer containing its length (endian depends on parent endian).
See GenericString to get more information.""",
)
class String(GenericString):
    """
    Fixed-size string: the size is given in bytes by nbytes.

    See GenericString to get more information.
    """

    @staticmethod
    def static_size(*args, **kw):
        # Size in bits, from the second positional argument
        # (presumably nbytes, i.e. constructor arguments without parent
        # -- verify against the callers of static_size)
        return args[1] * 8

    def __init__(self, parent, name, nbytes, description=None,
                 strip=None, charset=None, truncate=None):
        GenericString.__init__(
            self, parent, name, "fixed", description,
            strip=strip,
            charset=charset,
            nbytes=nbytes,
            truncate=truncate)


# The class is exposed under the name "FixedString"
String.__name__ = "FixedString"