"""Collect metadata of every ``*.doc`` file below a directory into one file.

Usage: ``<script> directory fields`` where *fields* is a comma-separated
list of metadata keys (for example ``title,creation_date``).  One line per
successfully processed document is written to ``OUTPUT_FILENAME``; the
columns (file name first, then the requested fields) are joined with
``"; "``.
"""
import codecs
from fnmatch import fnmatch
from os import walk
from os.path import join as path_join
from sys import argv, stderr, exit

from hachoir.core.i18n import initLocale
from hachoir.core.tools import makePrintable
from hachoir.metadata import extractMetadata
from hachoir.parser import createParser

OUTPUT_FILENAME = "metadata.csv"


class Extractor:
    """Walk a directory tree and dump selected metadata of ``*.doc`` files."""

    def __init__(self, directory, fields):
        """Remember the search root and the metadata keys to export.

        directory -- root directory that is walked recursively.
        fields    -- list of metadata keys; one output column per key.
        """
        self.directory = directory
        self.fields = fields
        # Charset used both for the output file and for makePrintable().
        self.charset = "UTF-8"
        # Counters reported by summary().
        self.total = 0
        self.invalid = 0

    def main(self):
        """Process every matching file, write the result file, print a summary."""
        # The context manager guarantees the output file is flushed and
        # closed even if the directory walk or a file's processing raises.
        with codecs.open(OUTPUT_FILENAME, "w", self.charset) as output:
            for filename in self.findFiles(self.directory, '*.doc'):
                # Incremented before processing: processFile() prints the
                # counter as the 1-based index of the current file.
                self.total += 1
                line = self.processFile(filename)
                if line:
                    print(line, file=output)
                else:
                    self.invalid += 1
        self.summary()

    def summary(self):
        """Print the valid / invalid / total file counters to stderr."""
        print(file=stderr)
        print("Valid files: %s" % (self.total - self.invalid), file=stderr)
        print("Invalid files: %s" % self.invalid, file=stderr)
        print("Total files: %s" % self.total, file=stderr)
        print(file=stderr)
        print("Result written into %s" % OUTPUT_FILENAME, file=stderr)

    def findFiles(self, directory, pattern):
        """Yield the path of each file below *directory* matching *pattern*.

        The file name is lower-cased before matching, so a lower-case
        *pattern* matches names regardless of their case.
        """
        for dirpath, dirnames, filenames in walk(directory):
            for filename in filenames:
                if not fnmatch(filename.lower(), pattern):
                    continue
                yield path_join(dirpath, filename)

    def processFile(self, filename):
        """Return the output line for *filename*, or None on failure.

        A failure (no parser, extraction error, no metadata) is reported on
        stderr and never raised, so one bad document does not stop the run.
        """
        print("[%s] Process file %s..." % (self.total, filename))
        parser = createParser(filename)
        if not parser:
            print("Unable to parse file", file=stderr)
            return None

        # Close the parser once extraction is done; otherwise every
        # processed document would keep its input file open until exit.
        with parser:
            try:
                metadata = extractMetadata(parser)
            except Exception as err:
                # Deliberately broad: any extraction failure only
                # invalidates this one file.
                print("Metadata extraction error: %s" % str(err), file=stderr)
                return None

        if not metadata:
            print("Unable to extract metadata", file=stderr)
            return None

        filename = makePrintable(filename, self.charset)
        line = [filename]
        for field in self.fields:
            # '' is passed as the second argument -- presumably the value
            # returned when the field is missing (confirm against hachoir).
            value = metadata.getText(field, '')
            value = makePrintable(value, self.charset)
            line.append(value)
        return '; '.join(line)


def main():
    """Command-line entry point: validate arguments and run the Extractor."""
    initLocale()
    if len(argv) != 3:
        print("usage: %s directory fields" % argv[0], file=stderr)
        print(file=stderr)
        print("eg. %s . title,creation_date" % argv[0], file=stderr)
        exit(1)

    directory = argv[1]
    # Tolerate spaces around the commas in the field list.
    fields = [field.strip() for field in argv[2].split(",")]
    Extractor(directory, fields).main()