mirror of
https://github.com/SickGear/SickGear.git
synced 2024-12-05 02:43:37 +00:00
0d9fbc1ad7
This version of SickBeard uses both TVDB and TVRage to search for and gather its series data, giving you access to shows that you could not download before because you were locked into only what TheTVDB had to offer. This edition is also based on the code we used in our XEM edition, so it comes with scene numbering support as well as all the other features our XEM edition has to offer. Before using this with your existing database (sickbeard.db), please make a backup copy of it and delete any other database files, such as cache.db and failed.db, if present. We HIGHLY recommend starting out with no database files at all to make this a fresh start, but the choice is at your own risk! Enjoy!
402 lines
14 KiB
Python
402 lines
14 KiB
Python
"""
|
|
String field classes:
|
|
- String: Fixed length string (no prefix/no suffix) ;
|
|
- CString: String which ends with nul byte ("\0") ;
|
|
- UnixLine: Unix line of text, string which ends with "\n" ;
|
|
- PascalString8, PascalString16, PascalString32: String prefixed with
|
|
length written in a 8, 16, 32-bit integer (use parent endian).
|
|
|
|
Constructor has optional arguments:
|
|
- strip: value can be a string or True ;
|
|
- charset: if set, convert string to unicode using this charset (in "replace"
|
|
mode, which replaces all invalid characters with ".").
|
|
|
|
Note: For PascalStringXX, prefixed value is the number of bytes and not
|
|
of characters!
|
|
"""
|
|
|
|
from lib.hachoir_core.field import FieldError, Bytes
|
|
from lib.hachoir_core.endian import LITTLE_ENDIAN, BIG_ENDIAN
|
|
from lib.hachoir_core.tools import alignValue, makePrintable
|
|
from lib.hachoir_core.i18n import guessBytesCharset, _
|
|
from lib.hachoir_core import config
|
|
from codecs import BOM_UTF16_LE, BOM_UTF16_BE, BOM_UTF32_LE, BOM_UTF32_BE
|
|
|
|
# Default charset used to convert byte string to Unicode
|
|
# This charset is used if no charset is specified or on conversion error
|
|
FALLBACK_CHARSET = "ISO-8859-1"
|
|
|
|
class GenericString(Bytes):
    """
    Generic string field: base class of all string fields of this module.

    Three layouts are supported, selected with the format argument:
     - "fixed": content of exactly nbytes bytes, no prefix and no suffix;
     - "C" and "UnixLine": content followed by a terminator (suffix);
     - "Pascal8", "Pascal16", "Pascal32": content preceded by its length
       in bytes, stored in a 1, 2 or 4 bytes integer (parent endian).

    charset has to be in CHARSET_8BIT or in UTF_CHARSET. When no charset is
    given, it is guessed from the content the first time it is needed.
    """

    VALID_FORMATS = ("C", "UnixLine",
                     "fixed", "Pascal8", "Pascal16", "Pascal32")

    # 8-bit charsets
    CHARSET_8BIT = set((
        "ASCII",          # ANSI X3.4-1968
        "MacRoman",
        "CP037",          # EBCDIC 037
        "CP874",          # Thai
        "WINDOWS-1250",   # Central Europe
        "WINDOWS-1251",   # Cyrillic
        "WINDOWS-1252",   # Latin I
        "WINDOWS-1253",   # Greek
        "WINDOWS-1254",   # Turkish
        "WINDOWS-1255",   # Hebrew
        "WINDOWS-1256",   # Arabic
        "WINDOWS-1257",   # Baltic
        "WINDOWS-1258",   # Vietnam
        "ISO-8859-1",     # Latin-1
        "ISO-8859-2",     # Latin-2
        "ISO-8859-3",     # Latin-3
        "ISO-8859-4",     # Latin-4
        "ISO-8859-5",
        "ISO-8859-6",
        "ISO-8859-7",
        "ISO-8859-8",
        "ISO-8859-9",     # Latin-5
        "ISO-8859-10",    # Latin-6
        "ISO-8859-11",    # Thai
        "ISO-8859-13",    # Latin-7
        "ISO-8859-14",    # Latin-8
        "ISO-8859-15",    # Latin-9 or ("Latin-0")
        "ISO-8859-16",    # Latin-10
    ))

    # UTF-xx charset family: name => (code unit size in bits, byte order)
    # Byte order is None when irrelevant (UTF-8) and "BOM" when it has
    # to be read from the byte order mark stored before the text
    UTF_CHARSET = {
        "UTF-8": (8, None),
        "UTF-16-LE": (16, LITTLE_ENDIAN),
        "UTF-32LE": (32, LITTLE_ENDIAN),
        "UTF-16-BE": (16, BIG_ENDIAN),
        "UTF-32BE": (32, BIG_ENDIAN),
        "UTF-16": (16, "BOM"),
        "UTF-32": (32, "BOM"),
    }

    # UTF-xx BOM => charset with endian
    UTF_BOM = {
        16: {BOM_UTF16_LE: "UTF-16-LE", BOM_UTF16_BE: "UTF-16-BE"},
        32: {BOM_UTF32_LE: "UTF-32LE", BOM_UTF32_BE: "UTF-32BE"},
    }

    # Suffix format: value is suffix (string),
    # indexed by code unit size in bits and then by endian
    SUFFIX_FORMAT = {
        "C": {
            8: {LITTLE_ENDIAN: "\0", BIG_ENDIAN: "\0"},
            16: {LITTLE_ENDIAN: "\0\0", BIG_ENDIAN: "\0\0"},
            32: {LITTLE_ENDIAN: "\0\0\0\0", BIG_ENDIAN: "\0\0\0\0"},
        },
        "UnixLine": {
            8: {LITTLE_ENDIAN: "\n", BIG_ENDIAN: "\n"},
            16: {LITTLE_ENDIAN: "\n\0", BIG_ENDIAN: "\0\n"},
            32: {LITTLE_ENDIAN: "\n\0\0\0", BIG_ENDIAN: "\0\0\0\n"},
        },
    }

    # Pascal format: value is the size of the prefix in bytes
    PASCAL_FORMATS = {
        "Pascal8": 1,
        "Pascal16": 2,
        "Pascal32": 4
    }

    # Raw value: with prefix and suffix, not stripped,
    # and not converted to Unicode
    _raw_value = None

    def __init__(self, parent, name, format, description=None,
                 strip=None, charset=None, nbytes=None, truncate=None):
        """
        Create the field and compute its size from the input stream.

        parent -- parent field set (gives the stream and the endian)
        name -- field name
        format -- one of VALID_FORMATS
        description -- optional field description
        strip -- True to strip white spaces from the value, or a string
            listing the characters to strip
        charset -- charset name (member of CHARSET_8BIT or UTF_CHARSET),
            or None to guess it later from the content
        nbytes -- content size in bytes, only for the "fixed" format
        truncate -- if set, the value is cut at the first occurrence
            of this string

        Raise FieldError if the charset is unknown, if nbytes is out of
        range, if the terminator is not found, or if a BOM is required
        but is missing or invalid.
        """
        # The size given here (1) is a placeholder: self._size is set below
        Bytes.__init__(self, parent, name, 1, description)

        # Is format valid?
        assert format in self.VALID_FORMATS

        # Store options
        self._format = format
        self._strip = strip
        self._truncate = truncate

        # Check charset and compute character size in bytes
        # (or None when it's not possible to guess character size)
        if not charset or charset in self.CHARSET_8BIT:
            self._character_size = 1   # one byte per character
        elif charset in self.UTF_CHARSET:
            self._character_size = None
        else:
            raise FieldError("Invalid charset for %s: \"%s\"" %
                             (self.path, charset))
        self._charset = charset

        # It is a fixed string?
        if nbytes is not None:
            assert self._format == "fixed"
            # Arbitrary limits, just to catch some bugs...
            if not (1 <= nbytes <= 0xffff):
                raise FieldError("Invalid string size for %s: %s" %
                                 (self.path, nbytes))
            self._content_size = nbytes   # content length in bytes
            self._size = nbytes * 8
            self._content_offset = 0
        elif self._format in self.SUFFIX_FORMAT:
            # Format with a suffix: Find the end of the string
            self._content_offset = 0

            # Choose the suffix
            suffix = self.suffix_str

            # Find the suffix
            length = self._parent.stream.searchBytesLength(
                suffix, False, self.absolute_address)
            if length is None:
                raise FieldError("Unable to find end of string %s (format %s)!"
                                 % (self.path, self._format))
            if 1 < len(suffix):
                # Fix length for little endian bug with UTF-xx charset:
                #   u"abc" -> "a\0b\0c\0\0\0" (UTF-16-LE)
                #   search returns length=5, whereas real length is 6
                length = alignValue(length, len(suffix))

            # Compute sizes
            self._content_size = length   # in bytes
            self._size = (length + len(suffix)) * 8
        else:
            # Format with a prefix: Read prefixed length in bytes
            assert self._format in self.PASCAL_FORMATS

            # Get the prefix size (in bytes)
            prefix_size = self.PASCAL_FORMATS[self._format]
            self._content_offset = prefix_size

            # Read the prefix and compute sizes
            value = self._parent.stream.readBits(
                self.absolute_address, prefix_size * 8, self._parent.endian)
            self._content_size = value   # in bytes
            self._size = (prefix_size + value) * 8

        # For UTF-16 and UTF-32, choose the right charset using BOM
        if self._charset in self.UTF_CHARSET:
            # Charset requires a BOM?
            bomsize, endian = self.UTF_CHARSET[self._charset]
            if endian == "BOM":
                bom_bytes = bomsize // 8

                # The BOM is part of the content: without this check the
                # content size computed below would become negative
                if self._content_size < bom_bytes:
                    raise FieldError("String %s is too short to contain a BOM!"
                                     % self.path)

                # Read the BOM at the beginning of the content, which is
                # after the length prefix for Pascal strings
                bom = self._parent.stream.readBytes(
                    self.absolute_address + self._content_offset * 8,
                    bom_bytes)

                # Choose right charset using the BOM
                bom_endian = self.UTF_BOM[bomsize]
                if bom not in bom_endian:
                    raise FieldError("String %s has invalid BOM (%s)!"
                                     % (self.path, repr(bom)))
                self._charset = bom_endian[bom]

                # Exclude the BOM from the content
                self._content_size -= bom_bytes
                self._content_offset += bom_bytes

        # Compute length in character if possible
        if self._character_size:
            self._length = self._content_size // self._character_size
        else:
            self._length = None

    @staticmethod
    def staticSuffixStr(format, charset, endian):
        """
        Get the terminator (byte string) of a string format.

        format -- string format (see VALID_FORMATS)
        charset -- charset name or None, selects the terminator width
        endian -- byte order used when the charset does not fix one itself

        Return '' for a format without terminator.
        """
        if format not in GenericString.SUFFIX_FORMAT:
            return ''
        by_width = GenericString.SUFFIX_FORMAT[format]
        if charset in GenericString.UTF_CHARSET:
            width, charset_endian = GenericString.UTF_CHARSET[charset]
            by_endian = by_width[width]
            # A charset with an explicit byte order (eg. "UTF-16-BE") fixes
            # the layout of the terminator, whatever the caller endian is
            if charset_endian in by_endian:
                endian = charset_endian
        else:
            by_endian = by_width[8]
        return by_endian[endian]

    def _getSuffixStr(self):
        """Terminator of this string ('' if the format has none)."""
        return self.staticSuffixStr(
            self._format, self._charset, self._parent.endian)
    suffix_str = property(_getSuffixStr)

    def _convertText(self, text):
        """
        Convert a byte string to Unicode.

        The charset is guessed first if it is still unknown. On decoding
        error, a warning is emitted and FALLBACK_CHARSET is used instead.
        """
        if not self._charset:
            # charset is still unknown: guess the charset
            self._charset = guessBytesCharset(text, default=FALLBACK_CHARSET)

        # Try to convert to Unicode
        try:
            return unicode(text, self._charset, "strict")
        except UnicodeDecodeError as exc:
            # Keep the error in a local: the name bound by "as" is not
            # available after the handler on Python 3
            err = exc

        #--- Conversion error ---

        # Fix truncated UTF-16 string like 'B\0e' (3 bytes)
        # => Add missing nul byte: 'B\0e\0' (4 bytes)
        if err.reason == "truncated data" \
                and err.end == len(text) \
                and self._charset == "UTF-16-LE":
            try:
                text = unicode(text + "\0", self._charset, "strict")
                self.warning("Fix truncated %s string: add missing nul byte" % self._charset)
                return text
            except UnicodeDecodeError as exc:
                err = exc

        # On error, use FALLBACK_CHARSET
        self.warning(u"Unable to convert string to Unicode: %s" % err)
        return unicode(text, FALLBACK_CHARSET, "strict")

    def _guessCharset(self):
        """Guess the charset from the content bytes."""
        addr = self.absolute_address + self._content_offset * 8
        data = self._parent.stream.readBytes(addr, self._content_size)
        return guessBytesCharset(data, default=FALLBACK_CHARSET)

    def createValue(self, human=True):
        """
        Read the string value from the stream.

        human=True: return the content converted to Unicode, then truncated
        and stripped according to the constructor options.
        human=False: return the bytes of the whole field, untouched.
        An empty string is returned when there is nothing to read.
        """
        # Compute data address (in bits) and size (in bytes)
        if human:
            addr = self.absolute_address + self._content_offset * 8
            size = self._content_size
        else:
            addr = self.absolute_address
            size = self._size // 8
        if size == 0:
            # Empty string
            return u""

        # Read bytes in data stream
        text = self._parent.stream.readBytes(addr, size)

        # Don't transform data?
        if not human:
            return text

        # Convert text to Unicode
        text = self._convertText(text)

        # Truncate
        if self._truncate:
            pos = text.find(self._truncate)
            if 0 <= pos:
                text = text[:pos]

        # Strip string if needed
        if self._strip:
            if isinstance(self._strip, (str, unicode)):
                text = text.strip(self._strip)
            else:
                text = text.strip()
        assert isinstance(text, unicode)
        return text

    def createDisplay(self, human=True):
        """
        Create the text displayed for this field.

        The text is cut after config.max_string_length characters and
        written between double quotes. human=False displays the raw value.
        """
        if not human:
            # The raw value is read only once and then cached
            if self._raw_value is None:
                self._raw_value = GenericString.createValue(self, False)
            value = makePrintable(self._raw_value, "ASCII", to_unicode=True)
        elif self._charset:
            value = makePrintable(self.value, "ISO-8859-1", to_unicode=True)
        else:
            value = self.value
        if config.max_string_length < len(value):
            # Truncate string if needed
            value = "%s(...)" % value[:config.max_string_length]
        if not self._charset or not human:
            return makePrintable(value, "ASCII", quote='"', to_unicode=True)
        if value:
            return '"%s"' % value.replace('"', '\\"')
        return _("(empty)")

    def createRawDisplay(self):
        """Create the display of the raw (not converted) value."""
        return GenericString.createDisplay(self, human=False)

    def _getLength(self):
        # The length is unknown until the value is decoded when the
        # character size is not fixed (UTF-xx charsets)
        if self._length is None:
            self._length = len(self.value)
        return self._length
    length = property(_getLength, doc="String length in characters")

    def _getFormat(self):
        return self._format
    format = property(_getFormat, doc="String format (eg. 'C')")

    def _getCharset(self):
        # Guess the charset on first access if none was given
        if not self._charset:
            self._charset = self._guessCharset()
        return self._charset
    charset = property(_getCharset, doc="String charset (eg. 'ISO-8859-1')")

    def _getContentSize(self):
        return self._content_size
    content_size = property(_getContentSize, doc="Content size in bytes")

    def _getContentOffset(self):
        return self._content_offset
    content_offset = property(_getContentOffset, doc="Content offset in bytes")

    def getFieldType(self):
        """Get the field type: base type followed by charset and strip option."""
        info = self.charset
        if self._strip:
            if isinstance(self._strip, (str, unicode)):
                info += ",strip=%s" % makePrintable(self._strip, "ASCII", quote="'")
            else:
                info += ",strip=True"
        return "%s<%s>" % (Bytes.getFieldType(self), info)
|
|
|
|
def stringFactory(name, format, doc):
    """
    Build a GenericString subclass bound to one string format.

    name -- value stored in the __name__ of the new class
    format -- string format given to GenericString by the new class
    doc -- docstring of the new class

    The constructor of the new class takes the GenericString arguments,
    without format and nbytes.
    """
    class NewString(GenericString):
        __doc__ = doc

        def __init__(self, parent, name, description=None,
                     strip=None, charset=None, truncate=None):
            # "format" comes from the factory call, "name" is the field name
            options = dict(strip=strip, charset=charset, truncate=truncate)
            GenericString.__init__(
                self, parent, name, format, description, **options)

    NewString.__name__ = name
    return NewString
|
|
|
|
# Nul terminated string: the content ends at the first "\0"
CString = stringFactory(
    name="CString",
    format="C",
    doc=r"""C string: string ending with nul byte.
See GenericString to get more information.""")

# Line of text in Unix style: the content ends at the first "\n" (ASCII 0x0A)
UnixLine = stringFactory(
    name="UnixLine",
    format="UnixLine",
    doc=r"""Unix line: string ending with "\n" (ASCII code 10).
See GenericString to get more information.""")

# Length-prefixed string, the length is written in a 8-bit integer
PascalString8 = stringFactory(
    name="PascalString8",
    format="Pascal8",
    doc=r"""Pascal string: string prefixed with 8-bit integer containing its length (endian depends on parent endian).
See GenericString to get more information.""")

# Length-prefixed string, the length is written in a 16-bit integer
# (parent endian)
PascalString16 = stringFactory(
    name="PascalString16",
    format="Pascal16",
    doc=r"""Pascal string: string prefixed with 16-bit integer containing its length (endian depends on parent endian).
See GenericString to get more information.""")

# Length-prefixed string, the length is written in a 32-bit integer
# (parent endian)
PascalString32 = stringFactory(
    name="PascalString32",
    format="Pascal32",
    doc=r"""Pascal string: string prefixed with 32-bit integer containing its length (endian depends on parent endian).
See GenericString to get more information.""")
|
|
|
|
|
|
class String(GenericString):
    """
    Fixed-size string: the size is given in bytes (nbytes argument).
    See GenericString to get more information.
    """

    @staticmethod
    def static_size(*args, **kw):
        # Size in bits, computed from the second positional argument
        return args[1] * 8

    def __init__(self, parent, name, nbytes, description=None,
                 strip=None, charset=None, truncate=None):
        # Same options as GenericString, with the format forced to "fixed"
        GenericString.__init__(
            self, parent, name, "fixed", description,
            strip=strip, charset=charset, nbytes=nbytes, truncate=truncate)


# The class is exposed under the name "FixedString"
String.__name__ = "FixedString"
|
|
|