SickGear/lib/hachoir/parser/misc/chm.py
JackDandy 980e05cc99 Change Hachoir can't support PY2 so backport their PY3 to prevent a need for system dependant external binaries like mediainfo.
Backported 400 revisions from rev 1de4961-8897c5b (2018-2014).
Move core/benchmark, core/cmd_line, core/memory, core/profiler and core/timeout to core/optional/*
Remove metadata/qt*

PORT: Version 2.0a3 (inline with 3.0a3 @ f80c7d5).
Basic Support for XMP Packets.
tga: improvements to adhere more closely to the spec.
pdf: slightly improved parsing.
rar: fix TypeError on unknown block types.
Add MacRoman win32 codepage.
tiff/exif: support SubIFDs and tiled images.
Add method to export metadata in dictionary.
mpeg_video: don't attempt to parse Stream past length.
mpeg_video: parse ESCR correctly, add SCR value.
Change centralise CustomFragments.
field: don't set parser class if class is None, to enable autodetect.
field: add value/display for CustomFragment.
parser: inline warning to enable tracebacks in debug mode.
Fix empty bytestrings in makePrintable.
Fix contentSize in jpeg.py to account for image_data blocks.
Fix the ELF parser.
Enhance the AR archive parser.
elf parser: fix wrong wrong fields order in parsing little endian section flags.
elf parser: add s390 as a machine type.
Flesh out mp4 parser.

PORT: Version 2.0a1 (inline with 3.0a1).
Major refactoring and PEP8.
Fix ResourceWarning warnings on files. Add a close() method and support for the context manager protocol ("with obj: ...") to parsers, input and output streams.
metadata: get comment from ZIP.
Support for InputIOStream.read(0).
Fix sizeGe when size is None.
Remove unused new_seekable_field_set file.
Remove parser Mapsforge .map.
Remove parser Parallel Realities Starfighter .pak files.
sevenzip: fix for newer archives.
java: update access flags and modifiers for Java 1.7 and update description text for most recent Java.
Support ustar prefix field in tar archives.
Remove file_system* parsers.
Remove misc parsers 3d0, 3ds, gnome_keyring, msoffice*, mstask, ole*, word*.
Remove program parsers macho, nds, prc.
Support non-8bit Character subclasses.
Python parser supports Python 3.7.
Enhance mpeg_ts parser to support MTS/M2TS.
Support for creation date in tiff.
Change don't hardcode errno constant.

PORT: 1.9.1
Internal Only: The following are legacy reference to upstream commit messages.
Relevant changes up to b0a115f8.
Use integer division.
Replace HACHOIR_ERRORS with Exception.
Fix metadata.Data: make it sortable.
Import fixes from e7de492.
PORT: Version 2.0a1 (inline with 3.0a1 @ e9f8fad).
Replace hachoir.core.field with hachoir.field
Replace hachoir.core.stream with hachoir.stream
Remove the compatibility module for PY1.5 to PY2.5.
metadata: support TIFF picture.
metadata: fix string normalization.
metadata: fix datetime regex Fix hachoir bug #57.
FileFromInputStream: fix comparison between None and an int.
InputIOStream: open the file in binary mode.
2018-03-28 00:43:11 +01:00

349 lines
14 KiB
Python

"""
InfoTech Storage Format (ITSF) parser, used by Microsoft's HTML Help (.chm)
Document:
- Microsoft's HTML Help (.chm) format
http://www.wotsit.org (search "chm")
- chmlib library
http://www.jedrea.com/chmlib/
- Unofficial CHM Spec
http://savannah.nongnu.org/projects/chmspec
- Microsoft's HTML Help (.chm) format
http://www.speakeasy.org/~russotto/chm/chmformat.html
Author: Victor Stinner
Creation date: 2007-03-04
"""
from hachoir.field import (Field, FieldSet, ParserError, RootSeekableFieldSet,
Int32, UInt16, UInt32, UInt64,
RawBytes, PaddingBytes,
Enum, String)
from hachoir.core.endian import LITTLE_ENDIAN
from hachoir.parser import HachoirParser
from hachoir.parser.common.win32 import GUID
from hachoir.parser.common.win32_lang_id import LANGUAGE_ID
from hachoir.core.text_handler import textHandler, hexadecimal, filesizeHandler
class CWord(Field):
"""
Compressed double-word
"""
def __init__(self, parent, name, description=None):
Field.__init__(self, parent, name, 8, description)
endian = self._parent.endian
stream = self._parent.stream
addr = self.absolute_address
value = 0
byte = stream.readBits(addr, 8, endian)
while byte & 0x80:
value <<= 7
value += (byte & 0x7f)
self._size += 8
if 64 < self._size:
raise ParserError("CHM: CWord is limited to 64 bits")
addr += 8
byte = stream.readBits(addr, 8, endian)
value <<= 7
value += byte
self.createValue = lambda: value
class Filesize_Header(FieldSet):
def createFields(self):
yield textHandler(UInt32(self, "unknown[]", "0x01FE"), hexadecimal)
yield textHandler(UInt32(self, "unknown[]", "0x0"), hexadecimal)
yield filesizeHandler(UInt64(self, "file_size"))
yield textHandler(UInt32(self, "unknown[]", "0x0"), hexadecimal)
yield textHandler(UInt32(self, "unknown[]", "0x0"), hexadecimal)
class ITSP(FieldSet):
def __init__(self, *args):
FieldSet.__init__(self, *args)
self._size = self["size"].value * 8
def createFields(self):
yield String(self, "magic", 4, "ITSP", charset="ASCII")
yield UInt32(self, "version", "Version (=1)")
yield filesizeHandler(UInt32(self, "size", "Length (in bytes) of the directory header (84)"))
yield UInt32(self, "unknown[]", "(=10)")
yield filesizeHandler(UInt32(self, "block_size", "Directory block size"))
yield UInt32(self, "density", "Density of quickref section, usually 2")
yield UInt32(self, "index_depth", "Depth of the index tree")
yield Int32(self, "nb_dir", "Chunk number of root index chunk")
yield UInt32(self, "first_pmgl", "Chunk number of first PMGL (listing) chunk")
yield UInt32(self, "last_pmgl", "Chunk number of last PMGL (listing) chunk")
yield Int32(self, "unknown[]", "-1")
yield UInt32(self, "nb_dir_chunk", "Number of directory chunks (total)")
yield Enum(UInt32(self, "lang_id", "Windows language ID"), LANGUAGE_ID)
yield GUID(self, "system_uuid", "{5D02926A-212E-11D0-9DF9-00A0C922E6EC}")
yield filesizeHandler(UInt32(self, "size2", "Same value than size"))
yield Int32(self, "unknown[]", "-1")
yield Int32(self, "unknown[]", "-1")
yield Int32(self, "unknown[]", "-1")
class ITSF(FieldSet):
def createFields(self):
yield String(self, "magic", 4, "ITSF", charset="ASCII")
yield UInt32(self, "version")
yield UInt32(self, "header_size", "Total header length (in bytes)")
yield UInt32(self, "one")
yield UInt32(self, "last_modified", "Lower 32 bits of the time expressed in units of 0.1 us")
yield Enum(UInt32(self, "lang_id", "Windows Language ID"), LANGUAGE_ID)
yield GUID(self, "dir_uuid", "{7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC}")
yield GUID(self, "stream_uuid", "{7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC}")
yield UInt64(self, "filesize_offset")
yield filesizeHandler(UInt64(self, "filesize_len"))
yield UInt64(self, "dir_offset")
yield filesizeHandler(UInt64(self, "dir_len"))
if 3 <= self["version"].value:
yield UInt64(self, "data_offset")
class PMGL_Entry(FieldSet):
def createFields(self):
yield CWord(self, "name_len")
yield String(self, "name", self["name_len"].value, charset="UTF-8")
yield CWord(self, "section", "Section number that the entry data is in.")
yield CWord(self, "start", "Start offset of the data")
yield filesizeHandler(CWord(self, "length", "Length of the data"))
def createDescription(self):
return "%s (%s)" % (self["name"].value, self["length"].display)
class PMGL(FieldSet):
def createFields(self):
# Header
yield String(self, "magic", 4, "PMGL", charset="ASCII")
yield filesizeHandler(Int32(self, "free_space",
"Length of free space and/or quickref area at end of directory chunk"))
yield Int32(self, "unknown")
yield Int32(self, "previous", "Chunk number of previous listing chunk")
yield Int32(self, "next", "Chunk number of previous listing chunk")
# Entries
stop = self.size - self["free_space"].value * 8
entry_count = 0
while self.current_size < stop:
yield PMGL_Entry(self, "entry[]")
entry_count += 1
# Padding
quickref_frequency = 1 + (1 << self["/dir/itsp/density"].value)
num_quickref = (entry_count // quickref_frequency)
if entry_count % quickref_frequency == 0:
num_quickref -= 1
print self.current_size // 8, quickref_frequency, num_quickref
padding = (self["free_space"].value - (num_quickref * 2 + 2))
if padding:
yield PaddingBytes(self, "padding", padding)
for i in range(num_quickref * quickref_frequency, 0, -quickref_frequency):
yield UInt16(self, "quickref[%i]" % i)
yield UInt16(self, "entry_count")
class PMGI_Entry(FieldSet):
def createFields(self):
yield CWord(self, "name_len")
yield String(self, "name", self["name_len"].value, charset="UTF-8")
yield CWord(self, "page")
def createDescription(self):
return "%s (page #%u)" % (self["name"].value, self["page"].value)
class PMGI(FieldSet):
def createFields(self):
yield String(self, "magic", 4, "PMGI", charset="ASCII")
yield filesizeHandler(UInt32(self, "free_space",
"Length of free space and/or quickref area at end of directory chunk"))
stop = self.size - self["free_space"].value * 8
while self.current_size < stop:
yield PMGI_Entry(self, "entry[]")
padding = (self.size - self.current_size) // 8
if padding:
yield PaddingBytes(self, "padding", padding)
class Directory(FieldSet):
def createFields(self):
yield ITSP(self, "itsp")
block_size = self["itsp/block_size"].value * 8
nb_dir = self["itsp/nb_dir"].value
if nb_dir < 0:
nb_dir = 1
for index in xrange(nb_dir):
yield PMGL(self, "pmgl[]", size=block_size)
if self.current_size < self.size:
yield PMGI(self, "pmgi", size=block_size)
class NameList(FieldSet):
def createFields(self):
yield UInt16(self, "length", "Length of name list in 2-byte blocks")
yield UInt16(self, "count", "Number of entries in name list")
for index in range(self["count"].value):
length = UInt16(self, "name_len[]", "Length of name in 2-byte blocks, excluding terminating null")
yield length
yield String(self, "name[]", length.value * 2 + 2, charset="UTF-16-LE")
class ControlData(FieldSet):
def createFields(self):
yield UInt32(self, "count", "Number of DWORDS in this struct")
yield String(self, "type", 4, "Type of compression")
if self["type"].value != 'LZXC': return
yield UInt32(self, "version", "Compression version")
version = self["version"].value
if version == 1:
block = 'bytes'
else:
block = '32KB blocks'
yield UInt32(self, "reset_interval", "LZX: Reset interval in %s" % block)
yield UInt32(self, "window_size", "LZX: Window size in %s" % block)
yield UInt32(self, "cache_size", "LZX: Cache size in %s" % block)
yield UInt32(self, "unknown[]")
class ResetTable(FieldSet):
def createFields(self):
yield UInt32(self, "unknown[]", "Version number?")
yield UInt32(self, "count", "Number of entries")
yield UInt32(self, "entry_size", "Size of each entry")
yield UInt32(self, "header_size", "Size of this header")
yield UInt64(self, "uncompressed_size")
yield UInt64(self, "compressed_size")
yield UInt64(self, "block_size", "Block size in bytes")
for i in xrange(self["count"].value):
yield UInt64(self, "block_location[]",
"location in compressed data of 1st block boundary in uncompressed data")
class SystemEntry(FieldSet):
ENTRY_TYPE = {0: "HHP: [OPTIONS]: Contents File",
1: "HHP: [OPTIONS]: Index File",
2: "HHP: [OPTIONS]: Default Topic",
3: "HHP: [OPTIONS]: Title",
4: "File Metadata",
5: "HHP: [OPTIONS]: Default Window",
6: "HHP: [OPTIONS]: Compiled file",
# 7 present only in files with Binary Index; unknown function
# 8 unknown function
9: "Version",
10: "Timestamp",
# 11 only in Binary TOC files
12: "Number of Info Types",
13: "#IDXHDR file",
# 14 unknown function
# 15 checksum??
16: "HHP: [OPTIONS]: Default Font",
}
def createFields(self):
yield Enum(UInt16(self, "type", "Type of entry"), self.ENTRY_TYPE)
yield UInt16(self, "length", "Length of entry")
yield RawBytes(self, "data", self["length"].value)
def createDescription(self):
return '#SYSTEM Entry, Type %s' % self["type"].display
class SystemFile(FieldSet):
def createFields(self):
yield UInt32(self, "version", "Either 2 or 3")
while self.current_size < self.size:
yield SystemEntry(self, "entry[]")
class ChmFile(HachoirParser, RootSeekableFieldSet):
MAGIC = "ITSF\3\0\0\0"
PARSER_TAGS = {
"id": "chm",
"category": "misc",
"file_ext": ("chm",),
"min_size": 4 * 8,
"magic": ((MAGIC, 0),),
"description": "Microsoft's HTML Help (.chm)",
}
endian = LITTLE_ENDIAN
def __init__(self, stream, **args):
RootSeekableFieldSet.__init__(self, None, "root", stream, None, stream.askSize(self))
HachoirParser.__init__(self, stream, **args)
def validate(self):
if self.stream.readBytes(0, len(self.MAGIC)) != self.MAGIC:
return "Invalid magic"
return True
def createFields(self):
yield ITSF(self, "itsf")
yield Filesize_Header(self, "file_size", size=self["itsf/filesize_len"].value * 8)
self.seekByte(self["itsf/dir_offset"].value)
directory = Directory(self, "dir", size=self["itsf/dir_len"].value * 8)
yield directory
otherentries = {}
for pmgl in directory.array("pmgl"):
for entry in pmgl.array("entry"):
if entry["section"].value != 0:
otherentries.setdefault(entry["section"].value, []).append(entry)
continue
if entry["length"].value == 0:
continue
self.seekByte(self["itsf/data_offset"].value + entry["start"].value)
name = entry["name"].value
if name == "::DataSpace/NameList":
yield NameList(self, "name_list")
elif name.startswith('::DataSpace/Storage/'):
sectname = str(name.split('/')[2])
if name.endswith('/SpanInfo'):
yield UInt64(self, "%s_spaninfo" % sectname,
"Size of uncompressed data in the %s section" % sectname)
elif name.endswith('/ControlData'):
yield ControlData(self, "%s_controldata" % sectname, "Data about the compression scheme",
size=entry["length"].value * 8)
elif name.endswith('/Transform/List'):
yield String(self, "%s_transform_list" % sectname, 38, description="Transform/List element",
charset="UTF-16-LE")
elif name.endswith('/Transform/{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable'):
yield ResetTable(self, "%s_reset_table" % sectname, "LZX Reset Table",
size=entry["length"].value * 8)
elif name.endswith('/Content'):
# eventually, a LZX wrapper will appear here, we hope!
yield RawBytes(self, "%s_content" % sectname, entry["length"].value,
"Content for the %s section" % sectname)
else:
yield RawBytes(self, "entry_data[]", entry["length"].value, name)
elif name == "/#SYSTEM":
yield SystemFile(self, "system_file", size=entry["length"].value * 8)
else:
yield RawBytes(self, "entry_data[]", entry["length"].value, name)
def getFile(self, filename):
page = 0
if 'pmgi' in self['/dir']:
for entry in self['/dir/pmgi'].array('entry'):
if entry['name'].value <= filename:
page = entry['page'].value
pmgl = self['/dir/pmgl[%i]' % page]
for entry in pmgl.array('entry'):
if entry['name'].value == filename:
return entry
raise ParserError("File '%s' not found!" % filename)
def createContentSize(self):
return self["file_size/file_size"].value * 8