""" BZIP2 archive file Author: Victor Stinner, Robert Xiao """ from hachoir.parser import Parser from hachoir.core.tools import paddingSize from hachoir.field import (Field, FieldSet, GenericVector, ParserError, String, PaddingBits, Bit, Bits, Character, UInt32, Enum, CompressedField) from hachoir.core.endian import BIG_ENDIAN from hachoir.core.text_handler import textHandler, hexadecimal from hachoir.parser.archive.zlib import build_tree, HuffmanCode try: from bz2 import BZ2Decompressor class Bunzip2: def __init__(self, stream): self.bzip2 = BZ2Decompressor() def __call__(self, size, data=''): try: return self.bzip2.decompress(data) except EOFError: return '' has_deflate = True except ImportError: has_deflate = False class ZeroTerminatedNumber(Field): """Zero (bit) terminated number: e.g. 11110 is 4.""" def __init__(self, parent, name, description=None): Field.__init__(self, parent, name, 0, description) endian = self.parent.endian stream = self.parent.stream addr = self.absolute_address value = 0 while True: bit = stream.readBits(addr, 1, endian) addr += 1 self._size += 1 if not bit: break value += 1 self._value = value def createValue(self): return self._value def move_to_front(seq, index): seq[:] = seq[index:index + 1] + seq[0:index] + seq[index + 1:] class Bzip2Bitmap(FieldSet): def __init__(self, parent, name, nb_items, start_index, *args, **kwargs): FieldSet.__init__(self, parent, name, *args, **kwargs) self.nb_items = nb_items self.start_index = start_index def createFields(self): for i in range(self.start_index, self.start_index + self.nb_items): yield Bit(self, "symbol_used[%i]" % i, "Is the symbol %i (%r) used?" % (i, chr(i))) class Bzip2Lengths(FieldSet): def __init__(self, parent, name, symbols, *args, **kwargs): FieldSet.__init__(self, parent, name, *args, **kwargs) self.symbols = symbols def createFields(self): yield Bits(self, "start_length", 5) length = self["start_length"].value lengths = [] for i in range(self.symbols): while True: bit = Bit( self, "change_length[%i][]" % i, "Should the length be changed for symbol %i?" % i) yield bit if not bit.value: break else: bit = Enum(Bit(self, "length_decrement[%i][]" % i, "Decrement the value?"), { True: "Decrement", False: "Increment"}) yield bit if bit.value: length -= 1 else: length += 1 lengths.append(length) self.final_length = length self.tree = build_tree(lengths) class Bzip2Selectors(FieldSet): def __init__(self, parent, name, ngroups, *args, **kwargs): FieldSet.__init__(self, parent, name, *args, **kwargs) self.groups = list(range(ngroups)) def createFields(self): for i in range(self["../selectors_used"].value): field = ZeroTerminatedNumber(self, "selector_list[]") move_to_front(self.groups, field.value) field.realvalue = self.groups[0] field._description = "MTF'ed selector index: raw value %i, real value %i" % ( field.value, field.realvalue) yield field class Bzip2Block(FieldSet): def createFields(self): yield textHandler(Bits(self, "blockheader", 48, "Block header"), hexadecimal) if self["blockheader"].value != 0x314159265359: # pi raise ParserError("Invalid block header!") yield textHandler(UInt32(self, "crc32", "CRC32 for this block"), hexadecimal) yield Bit(self, "randomized", "Is this block randomized?") yield Bits(self, "orig_bwt_pointer", 24, "Starting pointer into BWT after untransform") yield GenericVector(self, "huffman_used_map", 16, Bit, 'block_used', "Bitmap showing which blocks (representing 16 literals each) are in use") symbols_used = [] for index, block_used in enumerate(self["huffman_used_map"].array('block_used')): if block_used.value: start_index = index * 16 field = Bzip2Bitmap(self, "huffman_used_bitmap[%i]" % index, 16, start_index, "Bitmap for block %i (literals %i to %i) showing which symbols are in use" % ( index, start_index, start_index + 15)) yield field for i, used in enumerate(field): if used.value: symbols_used.append(start_index + i) yield Bits(self, "huffman_groups", 3, "Number of different Huffman tables in use") yield Bits(self, "selectors_used", 15, "Number of times the Huffman tables are switched") yield Bzip2Selectors(self, "selectors_list", self["huffman_groups"].value) trees = [] for group in range(self["huffman_groups"].value): field = Bzip2Lengths(self, "huffman_lengths[]", len(symbols_used) + 2) yield field trees.append(field.tree) counter = 0 rle_run = 0 selector_tree = None while True: if counter % 50 == 0: select_id = self["selectors_list"].array( "selector_list")[counter // 50].realvalue selector_tree = trees[select_id] field = HuffmanCode(self, "huffman_code[]", selector_tree) if field.realvalue in [0, 1]: # RLE codes if rle_run == 0: rle_power = 1 rle_run += (field.realvalue + 1) * rle_power rle_power <<= 1 field._description = "RLE Run Code %i (for %r); Total accumulated run %i (Huffman Code %i)" % ( field.realvalue, chr(symbols_used[0]), rle_run, field.value) elif field.realvalue == len(symbols_used) + 1: field._description = "Block Terminator (%i) (Huffman Code %i)" % ( field.realvalue, field.value) yield field break else: rle_run = 0 move_to_front(symbols_used, field.realvalue - 1) field._description = "Literal %r (value %i) (Huffman Code %i)" % ( chr(symbols_used[0]), field.realvalue, field.value) yield field if field.realvalue == len(symbols_used) + 1: break counter += 1 class Bzip2Stream(FieldSet): START_BLOCK = 0x314159265359 # pi END_STREAM = 0x177245385090 # sqrt(pi) def createFields(self): end = False while not end: marker = self.stream.readBits( self.absolute_address + self.current_size, 48, self.endian) if marker == self.START_BLOCK: yield Bzip2Block(self, "block[]") elif marker == self.END_STREAM: yield textHandler(Bits(self, "stream_end", 48, "End-of-stream marker"), hexadecimal) yield textHandler(UInt32(self, "crc32", "CRC32 for entire stream"), hexadecimal) padding = paddingSize(self.current_size, 8) if padding: yield PaddingBits(self, "padding[]", padding) end = True else: raise ParserError("Invalid marker 0x%02X!" % marker) class Bzip2Parser(Parser): PARSER_TAGS = { "id": "bzip2", "category": "archive", "file_ext": ("bz2",), "mime": ("application/x-bzip2",), "min_size": 10 * 8, "magic": ((b'BZh', 0),), "description": "bzip2 archive" } endian = BIG_ENDIAN def validate(self): if self.stream.readBytes(0, 3) != b'BZh': return "Wrong file signature" if not("1" <= self["blocksize"].value <= "9"): return "Wrong blocksize" return True def createFields(self): yield String(self, "id", 3, "Identifier (BZh)", charset="ASCII") yield Character(self, "blocksize", "Block size (KB of memory needed to uncompress)") if self._size is None: # TODO: is it possible to handle piped input? raise NotImplementedError size = (self._size - self.current_size) // 8 if size: for tag, filename in self.stream.tags: if tag == "filename" and filename.endswith(".bz2"): filename = filename[:-4] break else: filename = None data = Bzip2Stream(self, "file", size=size * 8) if has_deflate: CompressedField(self, Bunzip2) def createInputStream(**args): if filename: args.setdefault("tags", []).append( ("filename", filename)) return self._createInputStream(**args) data._createInputStream = createInputStream yield data