# -*- coding: utf-8 -*-

"""
Various utilities.
"""

from hachoir_core.i18n import _, ngettext
import re
import stat
from datetime import datetime, timedelta, MAXYEAR
from warnings import warn


def _deprecate(func, comment):
    """
    Build the wrapper used by deprecated(): emit a DeprecationWarning
    (mentioning 'comment' when it is not empty), then call 'func'.
    """
    def newFunc(*args, **kwargs):
        message = "Call to deprecated function %s" % func.__name__
        if comment:
            message += ": " + comment
        # stacklevel=2: report the warning at the caller of the
        # deprecated function, not inside this wrapper
        warn(message, category=DeprecationWarning, stacklevel=2)
        return func(*args, **kwargs)
    newFunc.__name__ = func.__name__
    newFunc.__doc__ = func.__doc__
    newFunc.__dict__.update(func.__dict__)
    return newFunc


def deprecated(comment=None):
    """
    This is a decorator which can be used to mark functions
    as deprecated. It will result in a warning being emitted
    when the function is used.

    It can be used bare or with an explanation message.

    Examples: ::

       @deprecated
       def oldfunc(): ...

       @deprecated("use newfunc()!")
       def oldfunc2(): ...

    Code from: http://code.activestate.com/recipes/391367/
    """
    if callable(comment):
        # Bare "@deprecated" usage: the decorated function itself is
        # received in place of the comment.
        return _deprecate(comment, None)

    def _deprecated(func):
        return _deprecate(func, comment)
    return _deprecated


def paddingSize(value, align):
    """
    Compute size of a padding field.

    >>> paddingSize(31, 4)
    1
    >>> paddingSize(32, 4)
    0
    >>> paddingSize(33, 4)
    3

    Note: (value + paddingSize(value, align)) == alignValue(value, align)
    """
    if value % align != 0:
        return align - (value % align)
    else:
        return 0


def alignValue(value, align):
    """
    Align a value to next 'align' multiple.

    >>> alignValue(31, 4)
    32
    >>> alignValue(32, 4)
    32
    >>> alignValue(33, 4)
    36

    Note: alignValue(value, align) == (value + paddingSize(value, align))
    """
    if value % align != 0:
        return value + align - (value % align)
    else:
        return value


def timedelta2seconds(delta):
    """
    Convert a datetime.timedelta() object to a number of seconds
    (floating point number).

    >>> timedelta2seconds(timedelta(seconds=2, microseconds=40000))
    2.04
    >>> timedelta2seconds(timedelta(minutes=1, milliseconds=250))
    60.25
    """
    return delta.microseconds / 1000000.0 \
        + delta.seconds + delta.days * 60*60*24


def humanDurationNanosec(nsec):
    """
    Convert a duration in nanosecond to human natural representation.
    Returns an unicode string.

    >>> humanDurationNanosec(60417893)
    u'60.42 ms'
    """

    # Nano second
    if nsec < 1000:
        return u"%u nsec" % nsec

    # Micro seconds
    usec, nsec = divmod(nsec, 1000)
    if usec < 1000:
        return u"%.2f usec" % (usec+float(nsec)/1000)

    # Milli seconds
    msec, usec = divmod(usec, 1000)
    if msec < 1000:
        return u"%.2f ms" % (msec + float(usec)/1000)
    return humanDuration(msec)


def humanDuration(delta):
    """
    Convert a duration in millisecond to human natural representation.
    'delta' may also be a datetime.timedelta object.
    At most the three most significant units are displayed.
    Returns an unicode string.

    >>> humanDuration(0)
    u'0 ms'
    >>> humanDuration(213)
    u'213 ms'
    >>> humanDuration(4213)
    u'4 sec 213 ms'
    >>> humanDuration(6402309)
    u'1 hour 46 min 42 sec'
    """
    if not isinstance(delta, timedelta):
        delta = timedelta(microseconds=delta*1000)

    # Milliseconds
    text = []
    if 1000 <= delta.microseconds:
        text.append(u"%u ms" % (delta.microseconds//1000))

    # Seconds
    minutes, seconds = divmod(delta.seconds, 60)
    hours, minutes = divmod(minutes, 60)
    if seconds:
        text.append(u"%u sec" % seconds)
    if minutes:
        text.append(u"%u min" % minutes)
    if hours:
        text.append(ngettext("%u hour", "%u hours", hours) % hours)

    # Days
    years, days = divmod(delta.days, 365)
    if days:
        text.append(ngettext("%u day", "%u days", days) % days)
    if years:
        text.append(ngettext("%u year", "%u years", years) % years)
    # 'text' is ordered from the smallest to the biggest unit:
    # keep the three biggest units
    if 3 < len(text):
        text = text[-3:]
    elif not text:
        return u"0 ms"
    return u" ".join(reversed(text))


def humanFilesize(size):
    """
    Convert a file size in byte to human natural representation.
    It uses the values: 1 KB is 1024 bytes, 1 MB is 1024 KB, etc.
    Sizes smaller than 10000 bytes are displayed in bytes.
    The result is an unicode string.

    >>> humanFilesize(1)
    u'1 byte'
    >>> humanFilesize(790)
    u'790 bytes'
    >>> humanFilesize(256960)
    u'250.9 KB'
    """
    if size < 10000:
        return ngettext("%u byte", "%u bytes", size) % size
    units = [_("KB"), _("MB"), _("GB"), _("TB")]
    size = float(size)
    divisor = 1024
    for unit in units:
        size = size / divisor
        if size < divisor:
            return "%.1f %s" % (size, unit)
    # Bigger than the last unit: display it using the last unit
    return "%u %s" % (size, unit)


def humanBitSize(size):
    """
    Convert a size in bit to human classic representation.
    It uses the values: 1 Kbit is 1000 bits, 1 Mbit is 1000 Kbit, etc.
    The result is an unicode string.

    >>> humanBitSize(1)
    u'1 bit'
    >>> humanBitSize(790)
    u'790 bits'
    >>> humanBitSize(256960)
    u'257.0 Kbit'
    """
    divisor = 1000
    if size < divisor:
        return ngettext("%u bit", "%u bits", size) % size
    units = [u"Kbit", u"Mbit", u"Gbit", u"Tbit"]
    size = float(size)
    for unit in units:
        size = size / divisor
        if size < divisor:
            return "%.1f %s" % (size, unit)
    # Bigger than the last unit: display it using the last unit
    return u"%u %s" % (size, unit)


def humanBitRate(size):
    """
    Convert a bit rate to human classic representation. It uses
    humanBitSize() to convert size into human representation.
    The result is an unicode string.

    >>> humanBitRate(790)
    u'790 bits/sec'
    >>> humanBitRate(256960)
    u'257.0 Kbit/sec'
    """
    return "".join((humanBitSize(size), "/sec"))


def humanFrequency(hertz):
    """
    Convert a frequency in hertz to human classic representation.
    It uses the values: 1 kHz is 1000 Hz, 1 MHz is 1000 kHz, etc.
    The result is an unicode string.

    >>> humanFrequency(790)
    u'790 Hz'
    >>> humanFrequency(629469)
    u'629.5 kHz'
    """
    divisor = 1000
    if hertz < divisor:
        return u"%u Hz" % hertz
    units = [u"kHz", u"MHz", u"GHz", u"THz"]
    hertz = float(hertz)
    for unit in units:
        hertz = hertz / divisor
        if hertz < divisor:
            return u"%.1f %s" % (hertz, unit)
    # Bigger than the last unit: display it using the last unit
    return u"%s %s" % (hertz, unit)


# Match one ASCII control character (codes 0..31 and 127)
regex_control_code = re.compile(r"([\x00-\x1f\x7f])")

# Match a "\x0N" escape (N in 0..7) which is not followed by an octal digit:
# it can be written "\N" without ambiguity
_regex_short_escape = re.compile(r"\\x0([0-7])(?=[^0-7]|$)")

# Control characters which have a well known escape sequence
_named_control_escapes = {
    # Don't use "\0", because "\0"+"0"+"1" = "\001" = "\1" (1 character)
    # Same reason to not use octal syntax ("\1")
    ord("\n"): r"\n",
    ord("\r"): r"\r",
    ord("\t"): r"\t",
    ord("\a"): r"\a",
    ord("\b"): r"\b",
}

# Escape sequence of each ASCII code (0..127), indexed by the code
controlchars = tuple(
    _named_control_escapes.get(code, '\\x%02x' % code)
    for code in xrange(128)
)


def makePrintable(data, charset, quote=None, to_unicode=False, smart=True):
    r"""
    Prepare a string to make it printable in the specified charset.
    It escapes control characters. Characters with code bigger than 127
    are escaped if data type is 'str' or if charset is "ASCII".

    If 'quote' is set, the text is surrounded by the quote character
    (an empty text is replaced by "(empty)"). If 'smart' is true, short
    escapes like \x00 are written \0. If 'to_unicode' is true, the result
    is converted back to unicode.

    Examples with Unicode:
    >>> aged = unicode("âgé", "UTF-8")
    >>> repr(aged)  # text type is 'unicode'
    "u'\\xe2g\\xe9'"
    >>> makePrintable("abc\0", "UTF-8")
    'abc\\0'
    >>> makePrintable(aged, "latin1")
    '\xe2g\xe9'
    >>> makePrintable(aged, "latin1", quote='"')
    '"\xe2g\xe9"'

    Examples with string encoded in latin1:
    >>> aged_latin = unicode("âgé", "UTF-8").encode("latin1")
    >>> repr(aged_latin)  # text type is 'str'
    "'\\xe2g\\xe9'"
    >>> makePrintable(aged_latin, "latin1")
    '\\xe2g\\xe9'
    >>> makePrintable("", "latin1")
    ''
    >>> makePrintable("a", "latin1", quote='"')
    '"a"'
    >>> makePrintable("", "latin1", quote='"')
    '(empty)'
    >>> makePrintable("abc", "latin1", quote="'")
    "'abc'"

    Control codes:
    >>> makePrintable("\0\x03\x0a\x10 \x7f", "latin1")
    '\\0\\3\\n\\x10 \\x7f'

    Quote character may also be escaped (only ' and "):
    >>> print makePrintable("a\"b", "latin-1", quote='"')
    "a\"b"
    >>> print makePrintable("a\"b", "latin-1", quote="'")
    'a"b'
    >>> print makePrintable("a'b", "latin-1", quote="'")
    'a\'b'
    """

    if data:
        if not isinstance(data, unicode):
            # Byte string: force ASCII output, so all bytes bigger
            # than 127 will be escaped by "backslashreplace"
            data = unicode(data, "ISO-8859-1")
            charset = "ASCII"
        data = regex_control_code.sub(
            lambda regs: controlchars[ord(regs.group(1))], data)
        if quote:
            if quote in "\"'":
                data = data.replace(quote, '\\' + quote)
            data = ''.join((quote, data, quote))
    elif quote:
        data = "(empty)"
    data = data.encode(charset, "backslashreplace")
    if smart:
        # Replace \x00\x01 by \0\1
        data = _regex_short_escape.sub(r"\\\1", data)
    if to_unicode:
        data = unicode(data, charset)
    return data


def makeUnicode(text):
    r"""
    Convert text to printable Unicode string. For byte string (type 'str'),
    use charset ISO-8859-1 for the conversion to Unicode.
    Other objects are converted using unicode(), str() or repr()
    (first which works).

    >>> makeUnicode(u'abc\0d')
    u'abc\\0d'
    >>> makeUnicode('a\xe9')
    u'a\xe9'
    """
    if isinstance(text, str):
        text = unicode(text, "ISO-8859-1")
    elif not isinstance(text, unicode):
        try:
            text = unicode(text)
        except UnicodeError:
            try:
                text = str(text)
            except Exception:
                text = repr(text)
            # 'text' is now a byte string: decode and escape it
            return makeUnicode(text)
    text = regex_control_code.sub(
        lambda regs: controlchars[ord(regs.group(1))], text)
    # Replace \x00\x01 by \0\1
    text = _regex_short_escape.sub(r"\\\1", text)
    return text


def binarySearch(seq, cmp_func):
    """
    Search a value in a sequence using binary search. Returns index of the
    value, or None if the value doesn't exist.

    'seq' have to be sorted in ascending order according to the
    comparison function ;

    'cmp_func', prototype func(x), is the compare function:
    - Return strictly positive value if we have to search forward ;
    - Return strictly negative value if we have to search backward ;
    - Otherwise (zero) we got the value.

    >>> # Search number 5 (search forward)
    ... binarySearch([0, 4, 5, 10], lambda x: 5-x)
    2
    >>> # Backward search
    ... binarySearch([10, 5, 4, 0], lambda x: x-5)
    1
    """
    lower = 0
    upper = len(seq)
    while lower < upper:
        index = (lower + upper) >> 1
        diff = cmp_func(seq[index])
        if diff < 0:
            upper = index
        elif diff > 0:
            lower = index + 1
        else:
            return index
    return None


def lowerBound(seq, cmp_func):
    """
    Binary search of the first index of 'seq' for which cmp_func(item)
    is false. Returns len(seq) if cmp_func() is true for all items.

    NOTE(review): presumably 'seq' has to be partitioned so that all items
    for which cmp_func() is true come first -- confirm with callers.
    """
    f = 0
    l = len(seq)
    while l > 0:
        h = l >> 1
        m = f + h
        if cmp_func(seq[m]):
            # Result is after index 'm'
            f = m
            f += 1
            l -= h + 1
        else:
            l = h
    return f


def humanUnixAttributes(mode):
    """
    Convert a Unix file attributes (or "file mode") to an unicode string.

    Original source code:
    http://cvs.savannah.gnu.org/viewcvs/coreutils/lib/filemode.c?root=coreutils

    >>> humanUnixAttributes(0644)
    u'-rw-r--r-- (644)'
    >>> humanUnixAttributes(02755)
    u'-rwxr-sr-x (2755)'
    """

    def ftypelet(mode):
        # Letter of the file type; a mode without any file type bits
        # is displayed as a regular file
        if stat.S_ISREG(mode) or not stat.S_IFMT(mode):
            return '-'
        if stat.S_ISBLK(mode):
            return 'b'
        if stat.S_ISCHR(mode):
            return 'c'
        if stat.S_ISDIR(mode):
            return 'd'
        if stat.S_ISFIFO(mode):
            return 'p'
        if stat.S_ISLNK(mode):
            return 'l'
        if stat.S_ISSOCK(mode):
            return 's'
        return '?'

    chars = [ftypelet(mode), 'r', 'w', 'x', 'r', 'w', 'x', 'r', 'w', 'x']
    # Replace the letter of each permission bit which is not set by '-'
    for i in xrange(1, 10):
        if not mode & 1 << 9 - i:
            chars[i] = '-'
    # Special bits: lower case if the execute bit is also set,
    # upper case otherwise
    if mode & stat.S_ISUID:
        if chars[3] != 'x':
            chars[3] = 'S'
        else:
            chars[3] = 's'
    if mode & stat.S_ISGID:
        if chars[6] != 'x':
            chars[6] = 'S'
        else:
            chars[6] = 's'
    if mode & stat.S_ISVTX:
        if chars[9] != 'x':
            chars[9] = 'T'
        else:
            chars[9] = 't'
    return u"%s (%o)" % (''.join(chars), mode)


def createDict(data, index):
    """
    Create a new dictionary from dictionary key=>values:
    just keep value number 'index' from all values.

    >>> data={10: ("dix", 100, "a"), 20: ("vingt", 200, "b")}
    >>> createDict(data, 0)
    {10: 'dix', 20: 'vingt'}
    >>> createDict(data, 2)
    {10: 'a', 20: 'b'}
    """
    return dict(
        (key, values[index]) for key, values in data.iteritems())


# Start of UNIX timestamp (Epoch): 1st January 1970 at 00:00
UNIX_TIMESTAMP_T0 = datetime(1970, 1, 1)


def timestampUNIX(value):
    """
    Convert an UNIX (32-bit) timestamp to datetime object. Timestamp value
    is the number of seconds since the 1st January 1970 at 00:00. Maximum
    value is 2147483647: 19 january 2038 at 03:14:07.

    May raise TypeError if value is not a number, and ValueError for
    invalid value: value have to be in 0..2147483647.

    >>> timestampUNIX(0)
    datetime.datetime(1970, 1, 1, 0, 0)
    >>> timestampUNIX(1154175644)
    datetime.datetime(2006, 7, 29, 12, 20, 44)
    >>> timestampUNIX(1154175644.37)
    datetime.datetime(2006, 7, 29, 12, 20, 44, 370000)
    >>> timestampUNIX(2147483647)
    datetime.datetime(2038, 1, 19, 3, 14, 7)
    """
    if not isinstance(value, (float, int, long)):
        raise TypeError("timestampUNIX(): an integer or float is required")
    if not(0 <= value <= 2147483647):
        raise ValueError("timestampUNIX(): value have to be in 0..2147483647")
    return UNIX_TIMESTAMP_T0 + timedelta(seconds=value)


# Start of Macintosh timestamp: 1st January 1904 at 00:00
MAC_TIMESTAMP_T0 = datetime(1904, 1, 1)


def timestampMac32(value):
    """
    Convert a Mac (32-bit) timestamp to datetime object.
    The format is the number of seconds since the 1st January 1904
    (to 2040).

    Returns a datetime object. If value is not in 0..4294967295, an
    error message (string) is returned instead of raising an exception.
    May raise TypeError if value is not a number.

    >>> timestampMac32(0)
    datetime.datetime(1904, 1, 1, 0, 0)
    >>> timestampMac32(2843043290)
    datetime.datetime(1994, 2, 2, 14, 14, 50)
    """
    if not isinstance(value, (float, int, long)):
        raise TypeError("an integer or float is required")
    if not(0 <= value <= 4294967295):
        return _("invalid Mac timestamp (%s)") % value
    return MAC_TIMESTAMP_T0 + timedelta(seconds=value)


def durationWin64(value):
    """
    Convert Windows 64-bit duration to timedelta object. The duration
    format is a 64-bit number: number of 100ns. See also timestampWin64().

    May raise TypeError if value is not a number, and ValueError if value
    is negative.

    >>> str(durationWin64(1072580000))
    '0:01:47.258000'
    >>> str(durationWin64(2146280000))
    '0:03:34.628000'
    """
    if not isinstance(value, (float, int, long)):
        raise TypeError("an integer or float is required")
    if value < 0:
        raise ValueError("value have to be a positive or nul integer")
    # 100ns units => microseconds
    return timedelta(microseconds=value/10)


# Start of 64-bit Windows timestamp: 1st January 1601 at 00:00
WIN64_TIMESTAMP_T0 = datetime(1601, 1, 1, 0, 0, 0)


def timestampWin64(value):
    """
    Convert Windows 64-bit timestamp to datetime object. The timestamp
    format is a 64-bit number which represents number of 100ns since the
    1st January 1601 at 00:00. Result is a datetime object.
    See also durationWin64(). Maximum date is 28 may 60056.

    May raise ValueError if the date can not be stored in a datetime
    object (date newer than year MAXYEAR).

    >>> timestampWin64(0)
    datetime.datetime(1601, 1, 1, 0, 0)
    >>> timestampWin64(127840491566710000)
    datetime.datetime(2006, 2, 10, 12, 45, 56, 671000)
    """
    try:
        return WIN64_TIMESTAMP_T0 + durationWin64(value)
    except OverflowError:
        raise ValueError(_("date newer than year %s (value=%s)")
                         % (MAXYEAR, value))


# Start of 60-bit UUID timestamp: 15 October 1582 at 00:00
UUID60_TIMESTAMP_T0 = datetime(1582, 10, 15, 0, 0, 0)


def timestampUUID60(value):
    """
    Convert UUID 60-bit timestamp to datetime object. The timestamp format
    is a 60-bit number which represents number of 100ns since the
    15 October 1582 at 00:00. Result is a datetime object.

    May raise TypeError if value is not a number, and ValueError if value
    is negative or if the date can not be stored in a datetime object.

    >>> timestampUUID60(0)
    datetime.datetime(1582, 10, 15, 0, 0)
    >>> timestampUUID60(130435676263032368)
    datetime.datetime(1996, 2, 14, 5, 13, 46, 303236)
    """
    if not isinstance(value, (float, int, long)):
        raise TypeError("an integer or float is required")
    if value < 0:
        raise ValueError("value have to be a positive or nul integer")
    try:
        # 100ns units => microseconds
        return UUID60_TIMESTAMP_T0 + timedelta(microseconds=value/10)
    except OverflowError:
        raise ValueError(_("timestampUUID60() overflow (value=%s)") % value)


def humanDatetime(value, strip_microsecond=True):
    """
    Convert a timestamp to Unicode string: use ISO format with space
    separator. Microseconds are removed, except if strip_microsecond
    is false.

    >>> humanDatetime( datetime(2006, 7, 29, 12, 20, 44) )
    u'2006-07-29 12:20:44'
    >>> humanDatetime( datetime(2003, 6, 30, 16, 0, 5, 370000) )
    u'2003-06-30 16:00:05'
    >>> humanDatetime( datetime(2003, 6, 30, 16, 0, 5, 370000), False )
    u'2003-06-30 16:00:05.370000'
    """
    text = unicode(value.isoformat())
    text = text.replace('T', ' ')
    if strip_microsecond and "." in text:
        text = text.split(".")[0]
    return text


# Match a sequence of one or more Unix newlines
NEWLINES_REGEX = re.compile("\n+")


def normalizeNewline(text):
    r"""
    Replace Windows and Mac newlines with Unix newlines.
    Replace multiple consecutive newlines with one newline.

    >>> normalizeNewline('a\r\nb')
    'a\nb'
    >>> normalizeNewline('a\r\rb')
    'a\nb'
    >>> normalizeNewline('a\n\nb')
    'a\nb'
    """
    text = text.replace("\r\n", "\n")
    text = text.replace("\r", "\n")
    return NEWLINES_REGEX.sub("\n", text)