SickGear/lib/hachoir_core/field/string_field.py
echel0n 0d9fbc1ad7 Welcome to our SickBeard-TVRage Edition ...
This version of SickBeard uses both TVDB and TVRage to search and gather it's series data from allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer.

Also this edition is based off the code we used in our XEM editon so it does come with scene numbering support as well as all the other features our XEM edition has to offer.

Please before using this with your existing database (sickbeard.db) please make a backup copy of it and delete any other database files such as cache.db and failed.db if present, we HIGHLY recommend starting out with no database files at all to make this a fresh start but the choice is at your own risk!

Enjoy!
2014-03-09 22:39:12 -07:00

402 lines
14 KiB
Python

"""
String field classes:
- String: Fixed length string (no prefix/no suffix) ;
- CString: String which ends with nul byte ("\0") ;
- UnixLine: Unix line of text, string which ends with "\n" ;
- PascalString8, PascalString16, PascalString32: String prefixed with
length written in a 8, 16, 32-bit integer (use parent endian).
Constructor has optional arguments:
- strip: value can be a string or True ;
- charset: if set, convert string to unicode using this charset (in "replace"
mode which replace all buggy characters with ".").
Note: For PascalStringXX, prefixed value is the number of bytes and not
of characters!
"""
from lib.hachoir_core.field import FieldError, Bytes
from lib.hachoir_core.endian import LITTLE_ENDIAN, BIG_ENDIAN
from lib.hachoir_core.tools import alignValue, makePrintable
from lib.hachoir_core.i18n import guessBytesCharset, _
from lib.hachoir_core import config
from codecs import BOM_UTF16_LE, BOM_UTF16_BE, BOM_UTF32_LE, BOM_UTF32_BE
# Default charset used to convert byte string to Unicode
# This charset is used if no charset is specified or on conversion error
FALLBACK_CHARSET = "ISO-8859-1"
class GenericString(Bytes):
"""
Generic string class.
charset have to be in CHARSET_8BIT or in UTF_CHARSET.
"""
VALID_FORMATS = ("C", "UnixLine",
"fixed", "Pascal8", "Pascal16", "Pascal32")
# 8-bit charsets
CHARSET_8BIT = set((
"ASCII", # ANSI X3.4-1968
"MacRoman",
"CP037", # EBCDIC 037
"CP874", # Thai
"WINDOWS-1250", # Central Europe
"WINDOWS-1251", # Cyrillic
"WINDOWS-1252", # Latin I
"WINDOWS-1253", # Greek
"WINDOWS-1254", # Turkish
"WINDOWS-1255", # Hebrew
"WINDOWS-1256", # Arabic
"WINDOWS-1257", # Baltic
"WINDOWS-1258", # Vietnam
"ISO-8859-1", # Latin-1
"ISO-8859-2", # Latin-2
"ISO-8859-3", # Latin-3
"ISO-8859-4", # Latin-4
"ISO-8859-5",
"ISO-8859-6",
"ISO-8859-7",
"ISO-8859-8",
"ISO-8859-9", # Latin-5
"ISO-8859-10", # Latin-6
"ISO-8859-11", # Thai
"ISO-8859-13", # Latin-7
"ISO-8859-14", # Latin-8
"ISO-8859-15", # Latin-9 or ("Latin-0")
"ISO-8859-16", # Latin-10
))
# UTF-xx charset familly
UTF_CHARSET = {
"UTF-8": (8, None),
"UTF-16-LE": (16, LITTLE_ENDIAN),
"UTF-32LE": (32, LITTLE_ENDIAN),
"UTF-16-BE": (16, BIG_ENDIAN),
"UTF-32BE": (32, BIG_ENDIAN),
"UTF-16": (16, "BOM"),
"UTF-32": (32, "BOM"),
}
# UTF-xx BOM => charset with endian
UTF_BOM = {
16: {BOM_UTF16_LE: "UTF-16-LE", BOM_UTF16_BE: "UTF-16-BE"},
32: {BOM_UTF32_LE: "UTF-32LE", BOM_UTF32_BE: "UTF-32BE"},
}
# Suffix format: value is suffix (string)
SUFFIX_FORMAT = {
"C": {
8: {LITTLE_ENDIAN: "\0", BIG_ENDIAN: "\0"},
16: {LITTLE_ENDIAN: "\0\0", BIG_ENDIAN: "\0\0"},
32: {LITTLE_ENDIAN: "\0\0\0\0", BIG_ENDIAN: "\0\0\0\0"},
},
"UnixLine": {
8: {LITTLE_ENDIAN: "\n", BIG_ENDIAN: "\n"},
16: {LITTLE_ENDIAN: "\n\0", BIG_ENDIAN: "\0\n"},
32: {LITTLE_ENDIAN: "\n\0\0\0", BIG_ENDIAN: "\0\0\0\n"},
},
}
# Pascal format: value is the size of the prefix in bits
PASCAL_FORMATS = {
"Pascal8": 1,
"Pascal16": 2,
"Pascal32": 4
}
# Raw value: with prefix and suffix, not stripped,
# and not converted to Unicode
_raw_value = None
def __init__(self, parent, name, format, description=None,
strip=None, charset=None, nbytes=None, truncate=None):
Bytes.__init__(self, parent, name, 1, description)
# Is format valid?
assert format in self.VALID_FORMATS
# Store options
self._format = format
self._strip = strip
self._truncate = truncate
# Check charset and compute character size in bytes
# (or None when it's not possible to guess character size)
if not charset or charset in self.CHARSET_8BIT:
self._character_size = 1 # one byte per character
elif charset in self.UTF_CHARSET:
self._character_size = None
else:
raise FieldError("Invalid charset for %s: \"%s\"" %
(self.path, charset))
self._charset = charset
# It is a fixed string?
if nbytes is not None:
assert self._format == "fixed"
# Arbitrary limits, just to catch some bugs...
if not (1 <= nbytes <= 0xffff):
raise FieldError("Invalid string size for %s: %s" %
(self.path, nbytes))
self._content_size = nbytes # content length in bytes
self._size = nbytes * 8
self._content_offset = 0
else:
# Format with a suffix: Find the end of the string
if self._format in self.SUFFIX_FORMAT:
self._content_offset = 0
# Choose the suffix
suffix = self.suffix_str
# Find the suffix
length = self._parent.stream.searchBytesLength(
suffix, False, self.absolute_address)
if length is None:
raise FieldError("Unable to find end of string %s (format %s)!"
% (self.path, self._format))
if 1 < len(suffix):
# Fix length for little endian bug with UTF-xx charset:
# u"abc" -> "a\0b\0c\0\0\0" (UTF-16-LE)
# search returns length=5, whereas real lenght is 6
length = alignValue(length, len(suffix))
# Compute sizes
self._content_size = length # in bytes
self._size = (length + len(suffix)) * 8
# Format with a prefix: Read prefixed length in bytes
else:
assert self._format in self.PASCAL_FORMATS
# Get the prefix size
prefix_size = self.PASCAL_FORMATS[self._format]
self._content_offset = prefix_size
# Read the prefix and compute sizes
value = self._parent.stream.readBits(
self.absolute_address, prefix_size*8, self._parent.endian)
self._content_size = value # in bytes
self._size = (prefix_size + value) * 8
# For UTF-16 and UTF-32, choose the right charset using BOM
if self._charset in self.UTF_CHARSET:
# Charset requires a BOM?
bomsize, endian = self.UTF_CHARSET[self._charset]
if endian == "BOM":
# Read the BOM value
nbytes = bomsize // 8
bom = self._parent.stream.readBytes(self.absolute_address, nbytes)
# Choose right charset using the BOM
bom_endian = self.UTF_BOM[bomsize]
if bom not in bom_endian:
raise FieldError("String %s has invalid BOM (%s)!"
% (self.path, repr(bom)))
self._charset = bom_endian[bom]
self._content_size -= nbytes
self._content_offset += nbytes
# Compute length in character if possible
if self._character_size:
self._length = self._content_size // self._character_size
else:
self._length = None
@staticmethod
def staticSuffixStr(format, charset, endian):
if format not in GenericString.SUFFIX_FORMAT:
return ''
suffix = GenericString.SUFFIX_FORMAT[format]
if charset in GenericString.UTF_CHARSET:
suffix_size = GenericString.UTF_CHARSET[charset][0]
suffix = suffix[suffix_size]
else:
suffix = suffix[8]
return suffix[endian]
def _getSuffixStr(self):
return self.staticSuffixStr(
self._format, self._charset, self._parent.endian)
suffix_str = property(_getSuffixStr)
def _convertText(self, text):
if not self._charset:
# charset is still unknown: guess the charset
self._charset = guessBytesCharset(text, default=FALLBACK_CHARSET)
# Try to convert to Unicode
try:
return unicode(text, self._charset, "strict")
except UnicodeDecodeError, err:
pass
#--- Conversion error ---
# Fix truncated UTF-16 string like 'B\0e' (3 bytes)
# => Add missing nul byte: 'B\0e\0' (4 bytes)
if err.reason == "truncated data" \
and err.end == len(text) \
and self._charset == "UTF-16-LE":
try:
text = unicode(text+"\0", self._charset, "strict")
self.warning("Fix truncated %s string: add missing nul byte" % self._charset)
return text
except UnicodeDecodeError, err:
pass
# On error, use FALLBACK_CHARSET
self.warning(u"Unable to convert string to Unicode: %s" % err)
return unicode(text, FALLBACK_CHARSET, "strict")
def _guessCharset(self):
addr = self.absolute_address + self._content_offset * 8
bytes = self._parent.stream.readBytes(addr, self._content_size)
return guessBytesCharset(bytes, default=FALLBACK_CHARSET)
def createValue(self, human=True):
# Compress data address (in bits) and size (in bytes)
if human:
addr = self.absolute_address + self._content_offset * 8
size = self._content_size
else:
addr = self.absolute_address
size = self._size // 8
if size == 0:
# Empty string
return u""
# Read bytes in data stream
text = self._parent.stream.readBytes(addr, size)
# Don't transform data?
if not human:
return text
# Convert text to Unicode
text = self._convertText(text)
# Truncate
if self._truncate:
pos = text.find(self._truncate)
if 0 <= pos:
text = text[:pos]
# Strip string if needed
if self._strip:
if isinstance(self._strip, (str, unicode)):
text = text.strip(self._strip)
else:
text = text.strip()
assert isinstance(text, unicode)
return text
def createDisplay(self, human=True):
if not human:
if self._raw_value is None:
self._raw_value = GenericString.createValue(self, False)
value = makePrintable(self._raw_value, "ASCII", to_unicode=True)
elif self._charset:
value = makePrintable(self.value, "ISO-8859-1", to_unicode=True)
else:
value = self.value
if config.max_string_length < len(value):
# Truncate string if needed
value = "%s(...)" % value[:config.max_string_length]
if not self._charset or not human:
return makePrintable(value, "ASCII", quote='"', to_unicode=True)
else:
if value:
return '"%s"' % value.replace('"', '\\"')
else:
return _("(empty)")
def createRawDisplay(self):
return GenericString.createDisplay(self, human=False)
def _getLength(self):
if self._length is None:
self._length = len(self.value)
return self._length
length = property(_getLength, doc="String length in characters")
def _getFormat(self):
return self._format
format = property(_getFormat, doc="String format (eg. 'C')")
def _getCharset(self):
if not self._charset:
self._charset = self._guessCharset()
return self._charset
charset = property(_getCharset, doc="String charset (eg. 'ISO-8859-1')")
def _getContentSize(self):
return self._content_size
content_size = property(_getContentSize, doc="Content size in bytes")
def _getContentOffset(self):
return self._content_offset
content_offset = property(_getContentOffset, doc="Content offset in bytes")
def getFieldType(self):
info = self.charset
if self._strip:
if isinstance(self._strip, (str, unicode)):
info += ",strip=%s" % makePrintable(self._strip, "ASCII", quote="'")
else:
info += ",strip=True"
return "%s<%s>" % (Bytes.getFieldType(self), info)
def stringFactory(name, format, doc):
class NewString(GenericString):
__doc__ = doc
def __init__(self, parent, name, description=None,
strip=None, charset=None, truncate=None):
GenericString.__init__(self, parent, name, format, description,
strip=strip, charset=charset, truncate=truncate)
cls = NewString
cls.__name__ = name
return cls
# String which ends with nul byte ("\0")
CString = stringFactory("CString", "C",
r"""C string: string ending with nul byte.
See GenericString to get more information.""")
# Unix line of text: string which ends with "\n" (ASCII 0x0A)
UnixLine = stringFactory("UnixLine", "UnixLine",
r"""Unix line: string ending with "\n" (ASCII code 10).
See GenericString to get more information.""")
# String prefixed with length written in a 8-bit integer
PascalString8 = stringFactory("PascalString8", "Pascal8",
r"""Pascal string: string prefixed with 8-bit integer containing its length (endian depends on parent endian).
See GenericString to get more information.""")
# String prefixed with length written in a 16-bit integer (use parent endian)
PascalString16 = stringFactory("PascalString16", "Pascal16",
r"""Pascal string: string prefixed with 16-bit integer containing its length (endian depends on parent endian).
See GenericString to get more information.""")
# String prefixed with length written in a 32-bit integer (use parent endian)
PascalString32 = stringFactory("PascalString32", "Pascal32",
r"""Pascal string: string prefixed with 32-bit integer containing its length (endian depends on parent endian).
See GenericString to get more information.""")
class String(GenericString):
"""
String with fixed size (size in bytes).
See GenericString to get more information.
"""
static_size = staticmethod(lambda *args, **kw: args[1]*8)
def __init__(self, parent, name, nbytes, description=None,
strip=None, charset=None, truncate=None):
GenericString.__init__(self, parent, name, "fixed", description,
strip=strip, charset=charset, nbytes=nbytes, truncate=truncate)
String.__name__ = "FixedString"