SickGear/lib/hachoir/parser/misc/msoffice_summary.py
2023-02-09 13:41:15 +00:00

427 lines
14 KiB
Python

"""
Microsoft Document summaries structures.
Documents
---------
- Apache POI (HPSF Internals):
http://poi.apache.org/hpsf/internals.html
"""
from hachoir.core.endian import BIG_ENDIAN
from hachoir.field import (FieldSet, ParserError,
SeekableFieldSet,
Bit, Bits, NullBits,
UInt8, UInt16, UInt32, TimestampWin64, TimedeltaWin64, Enum,
Bytes, RawBytes, NullBytes, PaddingBits, String,
Int8, Int32, Float32, Float64, PascalString32)
from hachoir.core.text_handler import textHandler, hexadecimal, filesizeHandler
from hachoir.core.tools import createDict, paddingSize
from hachoir.parser.common.win32 import GUID, PascalStringWin32, CODEPAGE_CHARSET
from hachoir.parser.image.bmp import BmpHeader, parseImageData
from hachoir.parser.misc.ole2_util import OLE2FragmentParser
# Largest "section_count" a summary stream may declare before parsing is
# refused with a ParserError.
MAX_SECTION_COUNT = 100

# OS identifier that selects the Macintosh layout / encodings.
OS_MAC = 1

# Display names for the 16-bit OS identifier stored in the stream header.
OS_NAME = {
    0: "Windows 16-bit",
    OS_MAC: "Macintosh",
    2: "Windows 32-bit",
}
class OSConfig:
    """Text encodings to use while decoding the strings of one stream.

    ``charset`` is the encoding for byte strings and ``utf16`` the encoding
    for wide strings.  ``charset`` is None for little endian streams until
    something else (the CodePage property, see SummarySection) fills it in.
    """

    def __init__(self, big_endian):
        # FIXME: Don't guess the charset, use ISO-8859-1 or UTF-8
        # (for now little endian streams simply start without a charset)
        self.charset = "MacRoman" if big_endian else None
        self.utf16 = "UTF-16-BE" if big_endian else "UTF-16-LE"
class PropertyIndex(FieldSet):
    """
    Index entry of a summary section: a 32-bit property identifier followed
    by a 32-bit offset.

    The identifier is displayed through DOCUMENT_PROPERTY when the root
    parser sets IS_DOC_SUMMARY, and through COMPONENT_PROPERTY otherwise.
    """
    # Identifier of the property holding the code page of the section
    TAG_CODEPAGE = 1

    # Identifiers shared by both kinds of summary
    COMMON_PROPERTY = {
        0: "Dictionary",
        1: "CodePage",
        0x80000000: "LOCALE_SYSTEM_DEFAULT",
        0x80000003: "CASE_SENSITIVE",
    }

    # Identifiers used when the root parser is a document summary
    DOCUMENT_PROPERTY = {
        2: "Category",
        3: "PresentationFormat",
        4: "NumBytes",
        5: "NumLines",
        6: "NumParagraphs",
        7: "NumSlides",
        8: "NumNotes",
        9: "NumHiddenSlides",
        10: "NumMMClips",
        11: "Scale",
        12: "HeadingPairs",
        13: "DocumentParts",
        14: "Manager",
        15: "Company",
        16: "LinksDirty",
        17: "DocSumInfo_17",
        18: "DocSumInfo_18",
        19: "DocSumInfo_19",
        20: "DocSumInfo_20",
        21: "DocSumInfo_21",
        22: "DocSumInfo_22",
        23: "DocSumInfo_23",
        **COMMON_PROPERTY,
    }

    # Identifiers used for every other summary
    COMPONENT_PROPERTY = {
        2: "Title",
        3: "Subject",
        4: "Author",
        5: "Keywords",
        6: "Comments",
        7: "Template",
        8: "LastSavedBy",
        9: "RevisionNumber",
        10: "TotalEditingTime",
        11: "LastPrinted",
        12: "CreateTime",
        13: "LastSavedTime",
        14: "NumPages",
        15: "NumWords",
        16: "NumCharacters",
        17: "Thumbnail",
        18: "AppName",
        19: "Security",
        **COMMON_PROPERTY,
    }

    def createFields(self):
        # The root parser tells which name table applies to this stream
        names = (self.DOCUMENT_PROPERTY if self.root.IS_DOC_SUMMARY
                 else self.COMPONENT_PROPERTY)
        yield Enum(UInt32(self, "id"), names)
        yield UInt32(self, "offset")

    def createDescription(self):
        shown_id = self["id"].display
        return "Property: %s" % shown_id
class Bool(Int8):
    """Signed 8-bit field exposed as a boolean: True only for the raw value -1."""

    def createValue(self):
        # Any raw value other than -1 (including other non-zero ones) is False
        return Int8.createValue(self) == -1
class Thumbnail(FieldSet):
    """
    Thumbnail property: a size, a format tag and the picture data.

    The "size" field gives the byte size of the whole field set.  When the
    format is the Windows clipboard and the DIB format is a BMP, the content
    is parsed as a BMP header followed by pixels; in every other case the
    remaining bytes are exposed as raw "data".

    Documents:
    - See Jakarta POI
      http://jakarta.apache.org/poi/hpsf/thumbnails.html
      http://www.penguin-soft.com/penguin/developer/poi/
          org/apache/poi/hpsf/Thumbnail.html#CF_BITMAP
    - How To Extract Thumbnail Images
      http://sparks.discreet.com/knowledgebase/public/
          solutions/ExtractThumbnailImg.htm
    """
    FORMAT_CLIPBOARD = -1
    FORMAT_NAME = {
        -1: "Windows clipboard",
        -2: "Macintosh clipboard",
        -3: "GUID that contains format identifier",
        0: "No data",
        2: "Bitmap",
        3: "Windows metafile format",
        8: "Device Independent Bitmap (DIB)",
        14: "Enhanced Windows metafile",
    }

    DIB_BMP = 8
    DIB_FORMAT = {
        2: "Bitmap Obsolete (old BMP)",
        3: "Windows metafile format (WMF)",
        8: "Device Independent Bitmap (BMP)",
        14: "Enhanced Windows metafile (EMF)",
    }

    def __init__(self, *args):
        FieldSet.__init__(self, *args)
        # "size" counts bytes, field set sizes are kept in bits
        self._size = self["size"].value * 8

    def createFields(self):
        yield filesizeHandler(UInt32(self, "size"))
        yield Enum(Int32(self, "format"), self.FORMAT_NAME)

        # "dib_format" only exists for the Windows clipboard format
        holds_bmp = False
        if self["format"].value == self.FORMAT_CLIPBOARD:
            yield Enum(UInt32(self, "dib_format"), self.DIB_FORMAT)
            holds_bmp = (self["dib_format"].value == self.DIB_BMP)

        if holds_bmp:
            yield BmpHeader(self, "bmp_header")
            remaining = (self.size - self.current_size) // 8
            yield parseImageData(self, "pixels", remaining, self["bmp_header"])
        else:
            # Unknown content: expose whatever is left as raw bytes
            remaining = (self.size - self.current_size) // 8
            if remaining:
                yield RawBytes(self, "data", remaining)
class PropertyContent(FieldSet):
    """
    Value of one property.

    The content starts with 32 bits made of a 12-bit type identifier, a
    one-bit vector flag and 19 null bits.  The payload is then parsed by the
    handler registered for the type in TYPE_INFO: once as "value", or, when
    the vector flag is set, as a 32-bit count followed by "item[]" fields.
    Types without handler only produce a warning.
    """

    class NullHandler(FieldSet):
        """Payload of the NULL type: a 32-bit integer then a Pascal string."""

        def createFields(self):
            yield UInt32(self, "unknown[]")
            yield PascalString32(self, "data")

        def createValue(self):
            return self["data"].value

    class BlobHandler(FieldSet):
        """Payload of the BLOB type: size, item count, then the items."""

        def createFields(self):
            # Nested PropertyContent items look up osconfig on their parent
            self.osconfig = self.parent.osconfig
            yield UInt32(self, "size")
            yield UInt32(self, "count")
            for _ in range(self["count"].value):
                yield PropertyContent(self, "item[]")
                # NOTE(review): padding is emitted after every item (the
                # field name "padding[]" is an array name); paddingSize()
                # presumably returns the bits missing up to the next 32-bit
                # boundary -- confirm against hachoir.core.tools
                pad_bits = paddingSize(self.current_size, 32)
                if pad_bits:
                    yield PaddingBits(self, "padding[]", pad_bits)

    class WidePascalString32(FieldSet):
        ''' uses number of characters instead of number of bytes '''

        def __init__(self, parent, name, charset='ASCII'):
            FieldSet.__init__(self, parent, name)
            self.charset = charset

        def createFields(self):
            yield UInt32(self, "length", "Length of this string")
            # Two bytes are read for each character announced by "length"
            byte_count = self["length"].value * 2
            yield String(self, "data", byte_count, charset=self.charset)

        def createValue(self):
            return self["data"].value

        def createDisplay(self):
            return 'u' + self["data"].display

    TYPE_LPSTR = 30

    # type identifier => (type name, field class used to parse the payload);
    # a None class means that the type is not supported
    TYPE_INFO = {
        0: ("EMPTY", None),
        1: ("NULL", NullHandler),
        2: ("UInt16", UInt16),
        3: ("UInt32", UInt32),
        4: ("Float32", Float32),
        5: ("Float64", Float64),
        6: ("CY", None),
        7: ("DATE", None),
        8: ("BSTR", None),
        9: ("DISPATCH", None),
        10: ("ERROR", None),
        11: ("BOOL", Bool),
        12: ("VARIANT", None),
        13: ("UNKNOWN", None),
        14: ("DECIMAL", None),
        16: ("I1", None),
        17: ("UI1", None),
        18: ("UI2", None),
        19: ("UI4", None),
        20: ("I8", None),
        21: ("UI8", None),
        22: ("INT", None),
        23: ("UINT", None),
        24: ("VOID", None),
        25: ("HRESULT", None),
        26: ("PTR", None),
        27: ("SAFEARRAY", None),
        28: ("CARRAY", None),
        29: ("USERDEFINED", None),
        30: ("LPSTR", PascalString32),
        31: ("LPWSTR", WidePascalString32),
        64: ("FILETIME", TimestampWin64),
        65: ("BLOB", BlobHandler),
        66: ("STREAM", None),
        67: ("STORAGE", None),
        68: ("STREAMED_OBJECT", None),
        69: ("STORED_OBJECT", None),
        70: ("BLOB_OBJECT", None),
        71: ("THUMBNAIL", Thumbnail),
        72: ("CLSID", None),
        0x1000: ("Vector", None),
    }
    TYPE_NAME = createDict(TYPE_INFO, 0)

    def createFields(self):
        self.osconfig = self.parent.osconfig

        # The type is read as 12 bits + vector flag + null bits (32 bits
        # in total), not as one 32-bit integer
        yield Enum(Bits(self, "type", 12), self.TYPE_NAME)
        yield Bit(self, "is_vector")
        yield NullBits(self, "padding", 32 - 12 - 1)

        type_id = self["type"].value
        extra = {}
        try:
            handler = self.TYPE_INFO[type_id][1]
            if handler in (self.WidePascalString32, PascalString32):
                # Strings need a charset: take it from the nearest field
                # set (starting with this one) carrying an osconfig
                owner = self
                while not hasattr(owner, 'osconfig'):
                    owner = owner.parent
                    if owner is None:
                        raise LookupError('Cannot find osconfig')
                config = owner.osconfig
                extra["charset"] = (config.charset
                                    if type_id == self.TYPE_LPSTR
                                    else config.utf16)
            elif handler == TimestampWin64 \
                    and self.description == "TotalEditingTime":
                # This property is a duration, not a date
                handler = TimedeltaWin64
        except LookupError:
            # Unknown type identifier, or no osconfig found
            handler = None

        if not handler:
            self.warning("OLE2: Unable to parse property of type %s"
                         % self["type"].display)
            return

        if self["is_vector"].value:
            yield UInt32(self, "count")
            for _ in range(self["count"].value):
                yield handler(self, "item[]", **extra)
            return

        yield handler(self, "value", **extra)
        # A scalar property takes the value of its single payload field
        self.createValue = lambda: self["value"].value


# VARIANT payloads are parsed recursively; the class can only reference
# itself once its definition is complete
PropertyContent.TYPE_INFO[12] = ("VARIANT", PropertyContent)
class SummarySection(SeekableFieldSet):
    """
    One section of a summary: size, property count, the property indexes
    and then the content of each property, reached by seeking to the offset
    stored in its index entry.

    While parsing, a CodePage property is used to set the charset of the
    shared osconfig when no charset is known yet.
    """

    def __init__(self, *args):
        SeekableFieldSet.__init__(self, *args)
        # "size" counts bytes, field set sizes are kept in bits
        self._size = self["size"].value * 8

    def createFields(self):
        self.osconfig = self.parent.osconfig
        yield UInt32(self, "size")
        yield UInt32(self, "property_count")
        total = self["property_count"].value

        for _ in range(total):
            yield PropertyIndex(self, "property_index[]")

        for position in range(total):
            entry = self["property_index[%u]" % position]
            self.seekByte(entry["offset"].value)
            content = PropertyContent(self, "property[]", entry["id"].display)
            yield content

            # Only a CodePage property can provide a missing charset
            if self.osconfig.charset \
                    or entry['id'].value != PropertyIndex.TAG_CODEPAGE:
                continue
            codepage = content['value'].value
            if codepage in CODEPAGE_CHARSET:
                self.osconfig.charset = CODEPAGE_CHARSET[codepage]
            else:
                self.warning("Unknown codepage: %r" % codepage)
class SummaryIndex(FieldSet):
    """Index entry of a summary: a 16-byte name and the 32-bit section offset."""
    # 16 bytes of name + 4 bytes of offset, in bits
    static_size = (16 + 4) * 8

    def createFields(self):
        yield String(self, "name", 16)
        yield UInt32(self, "offset")
class Summary(OLE2FragmentParser):
    """
    Parser of a summary stream: header, section indexes, then one section
    for each index entry, reached by seeking to the offset of the entry.
    """
    IS_DOC_SUMMARY = False
    ENDIAN_CHECK = True

    def __init__(self, stream, **args):
        OLE2FragmentParser.__init__(self, stream, **args)
        # Encodings depend on the byte order of the parser, not on the
        # "os_type" field
        self.osconfig = OSConfig(self.endian == BIG_ENDIAN)

    def createFields(self):
        # Header
        yield Bytes(self, "endian", 2, "Endian (\\xfe\\xff for little endian)")
        yield UInt16(self, "format", "Format (0)")
        yield UInt8(self, "os_version")
        yield UInt8(self, "os_revision")
        yield Enum(UInt16(self, "os_type"), OS_NAME)
        yield GUID(self, "format_id")
        yield UInt32(self, "section_count")

        # Refuse streams declaring an unreasonable number of sections
        count = self["section_count"].value
        if count > MAX_SECTION_COUNT:
            raise ParserError("OLE2: Too much sections (%s)" % count)

        # Section indexes, kept to locate the sections afterwards
        entries = []
        for _ in range(count):
            entry = SummaryIndex(self, "section_index[]")
            yield entry
            entries.append(entry)

        # Sections
        for entry in entries:
            self.seekByte(entry["offset"].value)
            yield SummarySection(self, "section[]")

        # Whatever remains after the last section
        leftover = (self.size - self.current_size) // 8
        if leftover > 0:
            yield NullBytes(self, "end_padding", leftover)
class DocSummary(Summary):
    """
    Summary parser flagged as a document summary.

    PropertyIndex reads IS_DOC_SUMMARY on the root parser and, when it is
    true, names property identifiers with its DOCUMENT_PROPERTY table.
    """
    IS_DOC_SUMMARY = True
class CompObj(OLE2FragmentParser):
    """
    Parser of a CompObj stream: header, user type and clipboard format,
    optionally followed by the OLE 2.01 part (program identifier and, except
    on Macintosh, a magic number and the Unicode versions of the strings).
    """
    ENDIAN_CHECK = True

    def __init__(self, stream, **args):
        OLE2FragmentParser.__init__(self, stream, **args)
        # Encodings are chosen from the "os" field of the header
        self.osconfig = OSConfig(self["os"].value == OS_MAC)

    def createFields(self):
        # Header
        yield UInt16(self, "version", "Version (=1)")
        yield Bytes(self, "endian", 2, "Endian (\\xfe\\xff for little endian)")
        yield UInt8(self, "os_version")
        yield UInt8(self, "os_revision")
        yield Enum(UInt16(self, "os"), OS_NAME)
        yield Int32(self, "unused", "(=-1)")
        yield GUID(self, "clsid")

        # User type
        yield PascalString32(self, "user_type", strip="\0")

        # Clipboard format: fixed 4-byte string on Macintosh, Pascal
        # string otherwise
        on_mac = (self["os"].value == OS_MAC)
        if on_mac:
            yield Int32(self, "unused[]", "(=-2)")
            yield String(self, "clipboard_format", 4)
        else:
            yield PascalString32(self, "clipboard_format", strip="\0")

        # Stop here when all the data has been consumed
        if self._current_size // 8 == self.datasize:
            return

        # -- OLE 2.01 ---

        # Program ID
        yield PascalString32(self, "prog_id", strip="\0")

        if not on_mac:
            # Magic number
            yield textHandler(UInt32(self, "magic", "Magic number (0x71B239F4)"), hexadecimal)

            # Unicode version
            yield PascalStringWin32(self, "user_type_unicode", strip="\0")
            yield PascalStringWin32(self, "clipboard_format_unicode", strip="\0")
            yield PascalStringWin32(self, "prog_id_unicode", strip="\0")

        # _current_size because current_size returns _current_max_size
        unread = self.datasize - (self._current_size // 8)
        if unread:
            yield NullBytes(self, "end_padding", unread)

        # Bytes of the stream located after the declared data size
        stream_bytes = self.size // 8
        if self.datasize < stream_bytes:
            yield RawBytes(self, "slack_space", stream_bytes - self.datasize)