SickGear/lib/hachoir/parser/misc/chm.py

"""
InfoTech Storage Format (ITSF) parser, used by Microsoft's HTML Help (.chm)

Document:
- Microsoft's HTML Help (.chm) format
  http://www.wotsit.org (search "chm")
- chmlib library
  http://www.jedrea.com/chmlib/
- Unofficial CHM Spec
  http://savannah.nongnu.org/projects/chmspec
- Microsoft's HTML Help (.chm) format
  http://www.speakeasy.org/~russotto/chm/chmformat.html

Author: Victor Stinner
Creation date: 2007-03-04
"""

from hachoir.field import (Field, FieldSet, ParserError, RootSeekableFieldSet,
                           Int32, UInt16, UInt32, UInt64,
                           RawBytes, PaddingBytes,
                           Enum, String)
from hachoir.core.endian import LITTLE_ENDIAN
from hachoir.parser import HachoirParser
from hachoir.parser.common.win32 import GUID
from hachoir.parser.common.win32_lang_id import LANGUAGE_ID
from hachoir.core.text_handler import textHandler, hexadecimal, filesizeHandler


class CWord(Field):
    """
    Compressed double-word
    """

    def __init__(self, parent, name, description=None):
        Field.__init__(self, parent, name, 8, description)

        endian = self._parent.endian
        stream = self._parent.stream
        addr = self.absolute_address

        value = 0
        byte = stream.readBits(addr, 8, endian)
        while byte & 0x80:
            value <<= 7
            value += (byte & 0x7f)
            self._size += 8
            if 64 < self._size:
                raise ParserError("CHM: CWord is limited to 64 bits")
            addr += 8
            byte = stream.readBits(addr, 8, endian)
        value <<= 7
        value += byte
        self.createValue = lambda: value


class Filesize_Header(FieldSet):
    def createFields(self):
        yield textHandler(UInt32(self, "unknown[]", "0x01FE"), hexadecimal)
        yield textHandler(UInt32(self, "unknown[]", "0x0"), hexadecimal)
        yield filesizeHandler(UInt64(self, "file_size"))
        yield textHandler(UInt32(self, "unknown[]", "0x0"), hexadecimal)
        yield textHandler(UInt32(self, "unknown[]", "0x0"), hexadecimal)


class ITSP(FieldSet):
    def __init__(self, *args):
        FieldSet.__init__(self, *args)
        self._size = self["size"].value * 8

    def createFields(self):
        yield String(self, "magic", 4, "ITSP", charset="ASCII")
        yield UInt32(self, "version", "Version (=1)")
        yield filesizeHandler(UInt32(self, "size", "Length (in bytes) of the directory header (84)"))
        yield UInt32(self, "unknown[]", "(=10)")
        yield filesizeHandler(UInt32(self, "block_size", "Directory block size"))
        yield UInt32(self, "density", "Density of quickref section, usually 2")
        yield UInt32(self, "index_depth", "Depth of the index tree")
        yield Int32(self, "nb_dir", "Chunk number of root index chunk")
        yield UInt32(self, "first_pmgl", "Chunk number of first PMGL (listing) chunk")
        yield UInt32(self, "last_pmgl", "Chunk number of last PMGL (listing) chunk")
        yield Int32(self, "unknown[]", "-1")
        yield UInt32(self, "nb_dir_chunk", "Number of directory chunks (total)")
        yield Enum(UInt32(self, "lang_id", "Windows language ID"), LANGUAGE_ID)
        yield GUID(self, "system_uuid", "{5D02926A-212E-11D0-9DF9-00A0C922E6EC}")
        yield filesizeHandler(UInt32(self, "size2", "Same value than size"))
        yield Int32(self, "unknown[]", "-1")
        yield Int32(self, "unknown[]", "-1")
        yield Int32(self, "unknown[]", "-1")


class ITSF(FieldSet):
    def createFields(self):
        yield String(self, "magic", 4, "ITSF", charset="ASCII")
        yield UInt32(self, "version")
        yield UInt32(self, "header_size", "Total header length (in bytes)")
        yield UInt32(self, "one")
        yield UInt32(self, "last_modified", "Lower 32 bits of the time expressed in units of 0.1 us")
        yield Enum(UInt32(self, "lang_id", "Windows Language ID"), LANGUAGE_ID)
        yield GUID(self, "dir_uuid", "{7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC}")
        yield GUID(self, "stream_uuid", "{7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC}")
        yield UInt64(self, "filesize_offset")
        yield filesizeHandler(UInt64(self, "filesize_len"))
        yield UInt64(self, "dir_offset")
        yield filesizeHandler(UInt64(self, "dir_len"))
        if 3 <= self["version"].value:
            yield UInt64(self, "data_offset")


class PMGL_Entry(FieldSet):
    def createFields(self):
        yield CWord(self, "name_len")
        yield String(self, "name", self["name_len"].value, charset="UTF-8")
        yield CWord(self, "section", "Section number that the entry data is in.")
        yield CWord(self, "start", "Start offset of the data")
        yield filesizeHandler(CWord(self, "length", "Length of the data"))

    def createDescription(self):
        return "%s (%s)" % (self["name"].value, self["length"].display)


class PMGL(FieldSet):
    def createFields(self):
        # Header
        yield String(self, "magic", 4, "PMGL", charset="ASCII")
        yield filesizeHandler(Int32(self, "free_space",
                                    "Length of free space and/or quickref area at end of directory chunk"))
        yield Int32(self, "unknown")
        yield Int32(self, "previous", "Chunk number of previous listing chunk")
        yield Int32(self, "next", "Chunk number of previous listing chunk")

        # Entries
        stop = self.size - self["free_space"].value * 8
        entry_count = 0
        while self.current_size < stop:
            yield PMGL_Entry(self, "entry[]")
            entry_count += 1

        # Padding
        quickref_frequency = 1 + (1 << self["/dir/itsp/density"].value)
        num_quickref = (entry_count // quickref_frequency)
        if entry_count % quickref_frequency == 0:
            num_quickref -= 1
        print self.current_size // 8, quickref_frequency, num_quickref
        padding = (self["free_space"].value - (num_quickref * 2 + 2))
        if padding:
            yield PaddingBytes(self, "padding", padding)
        for i in range(num_quickref * quickref_frequency, 0, -quickref_frequency):
            yield UInt16(self, "quickref[%i]" % i)
        yield UInt16(self, "entry_count")


class PMGI_Entry(FieldSet):
    def createFields(self):
        yield CWord(self, "name_len")
        yield String(self, "name", self["name_len"].value, charset="UTF-8")
        yield CWord(self, "page")

    def createDescription(self):
        return "%s (page #%u)" % (self["name"].value, self["page"].value)


class PMGI(FieldSet):
    def createFields(self):
        yield String(self, "magic", 4, "PMGI", charset="ASCII")
        yield filesizeHandler(UInt32(self, "free_space",
                                     "Length of free space and/or quickref area at end of directory chunk"))

        stop = self.size - self["free_space"].value * 8
        while self.current_size < stop:
            yield PMGI_Entry(self, "entry[]")

        padding = (self.size - self.current_size) // 8
        if padding:
            yield PaddingBytes(self, "padding", padding)


class Directory(FieldSet):
    def createFields(self):
        yield ITSP(self, "itsp")
        block_size = self["itsp/block_size"].value * 8

        nb_dir = self["itsp/nb_dir"].value

        if nb_dir < 0:
            nb_dir = 1
        for index in xrange(nb_dir):
            yield PMGL(self, "pmgl[]", size=block_size)

        if self.current_size < self.size:
            yield PMGI(self, "pmgi", size=block_size)


class NameList(FieldSet):
    def createFields(self):
        yield UInt16(self, "length", "Length of name list in 2-byte blocks")
        yield UInt16(self, "count", "Number of entries in name list")
        for index in range(self["count"].value):
            length = UInt16(self, "name_len[]", "Length of name in 2-byte blocks, excluding terminating null")
            yield length
            yield String(self, "name[]", length.value * 2 + 2, charset="UTF-16-LE")


class ControlData(FieldSet):
    def createFields(self):
        yield UInt32(self, "count", "Number of DWORDS in this struct")
        yield String(self, "type", 4, "Type of compression")
        if self["type"].value != 'LZXC': return
        yield UInt32(self, "version", "Compression version")
        version = self["version"].value
        if version == 1:
            block = 'bytes'
        else:
            block = '32KB blocks'
        yield UInt32(self, "reset_interval", "LZX: Reset interval in %s" % block)
        yield UInt32(self, "window_size", "LZX: Window size in %s" % block)
        yield UInt32(self, "cache_size", "LZX: Cache size in %s" % block)
        yield UInt32(self, "unknown[]")


class ResetTable(FieldSet):
    def createFields(self):
        yield UInt32(self, "unknown[]", "Version number?")
        yield UInt32(self, "count", "Number of entries")
        yield UInt32(self, "entry_size", "Size of each entry")
        yield UInt32(self, "header_size", "Size of this header")
        yield UInt64(self, "uncompressed_size")
        yield UInt64(self, "compressed_size")
        yield UInt64(self, "block_size", "Block size in bytes")
        for i in xrange(self["count"].value):
            yield UInt64(self, "block_location[]",
                         "location in compressed data of 1st block boundary in uncompressed data")


class SystemEntry(FieldSet):
    ENTRY_TYPE = {0: "HHP: [OPTIONS]: Contents File",
                  1: "HHP: [OPTIONS]: Index File",
                  2: "HHP: [OPTIONS]: Default Topic",
                  3: "HHP: [OPTIONS]: Title",
                  4: "File Metadata",
                  5: "HHP: [OPTIONS]: Default Window",
                  6: "HHP: [OPTIONS]: Compiled file",
                  # 7 present only in files with Binary Index; unknown function
                  # 8 unknown function
                  9: "Version",
                  10: "Timestamp",
                  # 11 only in Binary TOC files
                  12: "Number of Info Types",
                  13: "#IDXHDR file",
                  # 14 unknown function
                  # 15 checksum??
                  16: "HHP: [OPTIONS]: Default Font",
                  }

    def createFields(self):
        yield Enum(UInt16(self, "type", "Type of entry"), self.ENTRY_TYPE)
        yield UInt16(self, "length", "Length of entry")
        yield RawBytes(self, "data", self["length"].value)

    def createDescription(self):
        return '#SYSTEM Entry, Type %s' % self["type"].display


class SystemFile(FieldSet):
    def createFields(self):
        yield UInt32(self, "version", "Either 2 or 3")
        while self.current_size < self.size:
            yield SystemEntry(self, "entry[]")


class ChmFile(HachoirParser, RootSeekableFieldSet):
    MAGIC = "ITSF\3\0\0\0"
    PARSER_TAGS = {
        "id": "chm",
        "category": "misc",
        "file_ext": ("chm",),
        "min_size": 4 * 8,
        "magic": ((MAGIC, 0),),
        "description": "Microsoft's HTML Help (.chm)",
    }
    endian = LITTLE_ENDIAN

    def __init__(self, stream, **args):
        RootSeekableFieldSet.__init__(self, None, "root", stream, None, stream.askSize(self))
        HachoirParser.__init__(self, stream, **args)

    def validate(self):
        if self.stream.readBytes(0, len(self.MAGIC)) != self.MAGIC:
            return "Invalid magic"
        return True

    def createFields(self):
        yield ITSF(self, "itsf")
        yield Filesize_Header(self, "file_size", size=self["itsf/filesize_len"].value * 8)

        self.seekByte(self["itsf/dir_offset"].value)
        directory = Directory(self, "dir", size=self["itsf/dir_len"].value * 8)
        yield directory

        otherentries = {}
        for pmgl in directory.array("pmgl"):
            for entry in pmgl.array("entry"):
                if entry["section"].value != 0:
                    otherentries.setdefault(entry["section"].value, []).append(entry)
                    continue
                if entry["length"].value == 0:
                    continue
                self.seekByte(self["itsf/data_offset"].value + entry["start"].value)
                name = entry["name"].value
                if name == "::DataSpace/NameList":
                    yield NameList(self, "name_list")
                elif name.startswith('::DataSpace/Storage/'):
                    sectname = str(name.split('/')[2])
                    if name.endswith('/SpanInfo'):
                        yield UInt64(self, "%s_spaninfo" % sectname,
                                     "Size of uncompressed data in the %s section" % sectname)
                    elif name.endswith('/ControlData'):
                        yield ControlData(self, "%s_controldata" % sectname, "Data about the compression scheme",
                                          size=entry["length"].value * 8)
                    elif name.endswith('/Transform/List'):
                        yield String(self, "%s_transform_list" % sectname, 38, description="Transform/List element",
                                     charset="UTF-16-LE")
                    elif name.endswith('/Transform/{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable'):
                        yield ResetTable(self, "%s_reset_table" % sectname, "LZX Reset Table",
                                         size=entry["length"].value * 8)
                    elif name.endswith('/Content'):
                        # eventually, a LZX wrapper will appear here, we hope!
                        yield RawBytes(self, "%s_content" % sectname, entry["length"].value,
                                       "Content for the %s section" % sectname)
                    else:
                        yield RawBytes(self, "entry_data[]", entry["length"].value, name)
                elif name == "/#SYSTEM":
                    yield SystemFile(self, "system_file", size=entry["length"].value * 8)
                else:
                    yield RawBytes(self, "entry_data[]", entry["length"].value, name)

    def getFile(self, filename):
        page = 0
        if 'pmgi' in self['/dir']:
            for entry in self['/dir/pmgi'].array('entry'):
                if entry['name'].value <= filename:
                    page = entry['page'].value
        pmgl = self['/dir/pmgl[%i]' % page]
        for entry in pmgl.array('entry'):
            if entry['name'].value == filename:
                return entry
        raise ParserError("File '%s' not found!" % filename)

    def createContentSize(self):
        return self["file_size/file_size"].value * 8