# -*- coding: utf-8 -*-
"""
This module intends to generate a binary representation of a python object
where it is guaranteed that the same objects will result in the same binary
representation.

By far not all python objects are supported. Here is the list of supported
types

    - special built-in constants: True, False, None
    - integer (32 bit signed, or arbitrary size)
    - float (64bit)
    - complex (128bit)
    - str
    - bytes (python3 only, as bytes and str are equivalent in python2)
    - np.ndarray
    - list
    - tuple
    - dictionary
    - namedtuple (new since version 0x80, before it also needed the 'classes'
      lookup when loaded)
    - scipy csc sparse matrix (only if scipy can be imported)

For any nested combination of these objects it is also guaranteed that the
original objects can be restored without any extra information.

Additionally

    - 'getstate' (objects that implement __getstate__ and return a state that
      can be dumped as well)

can be dumped. To restore these objects the load function needs a lookup given
by the argument 'classes' which maps the object's class name
(obj.__class__.__name__) to the actual class definition (the class object).
Of course for these objects the __setstate__ method needs to be implemented.

Objects that implement __bfkey__ are dumped using the value returned by
__bfkey__(); loading such a footprint yields the tuple (class name, bfkey)
and not the original object.

NOTE: the tests pass python2.7 and python 3.4 so far, but it has not yet been
tested if the binary representation is the same among different python
versions (they should be though!)

NOTE: np.ndarray data is stored as a pickle and restored with pickle.loads.
Unpickling can execute arbitrary code, so only load footprints that come from
a trusted source.
"""
from __future__ import division, print_function

from collections import namedtuple
from math import ceil
import pickle
import struct
from sys import version_info

import numpy as np

try:
    import scipy
    from scipy.sparse import csc_matrix
except ImportError:
    scipy = None

_spec_types = (bool, type(None))

# one byte type identifiers, every dumped object starts with one of them
_SPEC = 0x00        # True, False, None
_INT_32 = 0x01
_FLOAT = 0x02
_COMPLEX = 0x03
_STR = 0x04
_BYTES = 0x05       # only for python3, as bytes and str are equivalent in python2
_INT = 0x06
_TUPLE = 0x07
_NAMEDTUPLE = 0x08
_NPARRAY = 0x09
_LIST = 0x0a
_GETSTATE = 0x0b    # only used when __bfkey__ is not present
_DICT = 0x0c
_INT_NEG = 0x0d
_BFKEY = 0x0e       # a special BF-Key member __bfkey__ is used if implemented, uses __getstate__ as fallback
_SP_CSC_MAT = 0x0f  # scipy csc sparse matrix

# version tag, written as very first byte by dump
_VERS = 0x80


def getVersion():
    """Return the version tag that dump() uses by default."""
    return _VERS


__max_int32 = +2147483647
__min_int32 = -2147483648


def __int_to_bytes(i):
    """Fallback for int.to_bytes: big endian byte string of the non-negative integer i."""
    m = 0xff
    assert i >= 0
    ba = str()
    while i > 0:
        b = i & m
        ba += str(bytearray([b]))
        i = i >> 8
    return ba[::-1]


def __bytes_to_int(ba):
    """Fallback for int.from_bytes: interpret the byte string ba as big endian integer."""
    i = 0
    for b in ba:
        i = i << 8
        i += ord(b)
    return i


def char_eq_byte(ch, b):
    """Compare a length-1 string ch with the integer byte value b."""
    return ord(ch) == b


def byte_eq_byte(b1, b2):
    """Compare two integer byte values."""
    return b1 == b2


# python 2 / python 3 compatibility layer
if version_info.major > 2:
    BIN_TYPE = bytes
    LONG_TYPE = int
    comp_id = byte_eq_byte

    def str_to_bytes(s):
        return bytes(s, 'utf8')

    def bytes_to_str(b):
        return str(b, 'utf8')

    def np_load(ba):
        # pickle.loads is the documented replacement of the removed np.loads
        # NOTE: unpickling data from an untrusted source is unsafe
        return pickle.loads(ba)

    def init_BYTES(b):
        return bytes(b)

    def char_to_byte(ch):
        return ord(ch)

    def byte_to_ord(b):
        return b
else:
    BIN_TYPE = str
    LONG_TYPE = long
    comp_id = char_eq_byte

    def str_to_bytes(s):
        return s

    def bytes_to_str(b):
        return str(b)

    def np_load(ba):
        # pickle.loads is the documented replacement of the removed np.loads
        # NOTE: unpickling data from an untrusted source is unsafe
        return pickle.loads(str(ba))

    def init_BYTES(b):
        return str(bytearray(b))

    def char_to_byte(ch):
        return ch

    def byte_to_ord(b):
        return ord(b)


# use the int methods if available, otherwise use the fallbacks from above
int_to_bytes = lambda i: i.to_bytes(ceil(i.bit_length() / 8), 'big')
bytes_to_int = lambda ba: int.from_bytes(ba, 'big')

try:
    int_to_bytes(2**77)
except AttributeError:
    int_to_bytes = __int_to_bytes

__b_tmp = int_to_bytes(2**77)

try:
    bytes_to_int(__b_tmp)
except AttributeError:
    bytes_to_int = __bytes_to_int

assert bytes_to_int(__b_tmp) == 2**77


class BFLoadError(Exception):
    """Raised when the binary data can not be interpreted."""
    pass


class BFUnkownClassError(Exception):
    """Raised when the lookup 'classes' has no entry for a class name found in the binary data."""
    def __init__(self, classname):
        Exception.__init__(self, "could not load object of type '{}', no class definition found in classes\n".format(classname)+
                                 "Please provide the lookup 'classes' when calling load, that maps the class name of the object to the actual "+
                                 "class definition (class object).")


# All _dump_* functions return the bytes representing the object.
# All _load_* functions return the tuple (object, number of consumed bytes).

def _dump_spec(ob):
    """Dump True, False or None as identifier followed by 'T', 'F' or 'N'."""
    if ob is True:
        code = 'T'
    elif ob is False:
        code = 'F'
    elif ob is None:
        code = 'N'
    else:
        raise RuntimeError("object is not of 'special' kind!")
    return init_BYTES([_SPEC, char_to_byte(code)])


def _load_spec(b):
    assert comp_id(b[0], _SPEC)
    if b[1] == char_to_byte('T'):
        return True, 2
    elif b[1] == char_to_byte('F'):
        return False, 2
    elif b[1] == char_to_byte('N'):
        return None, 2
    else:
        raise BFLoadError("internal error (unknown code for 'special' {})".format(b[1]))


def _dump_int_32(ob):
    """Dump an integer that fits into 4 bytes (signed, big endian)."""
    return init_BYTES([_INT_32]) + struct.pack('>i', ob)


def _load_int_32(b):
    assert comp_id(b[0], _INT_32)
    i = struct.unpack('>i', b[1:5])[0]
    return i, 5


def _dump_int(ob):
    """Dump an integer of arbitrary size, the sign is encoded by the identifier."""
    if ob < 0:
        ident = _INT_NEG
        ob = -ob
    else:
        ident = _INT
    ib = int_to_bytes(ob)
    return init_BYTES([ident]) + struct.pack('>I', len(ib)) + ib


def _load_int(b):
    if comp_id(b[0], _INT):
        m = 1
    elif comp_id(b[0], _INT_NEG):
        m = -1
    else:
        raise BFLoadError("internal error (unknown int id {})".format(b[0]))
    num_bytes = struct.unpack('>I', b[1:5])[0]
    i = m*bytes_to_int(b[5:5+num_bytes])
    return i, num_bytes + 5


def _dump_float(ob):
    return init_BYTES([_FLOAT]) + struct.pack('>d', ob)


def _load_float(b):
    assert comp_id(b[0], _FLOAT)
    f = struct.unpack('>d', b[1:9])[0]
    return f, 9


def _dump_complex(ob):
    """Dump a complex number as two doubles (real part first)."""
    return init_BYTES([_COMPLEX]) + struct.pack('>d', ob.real) + struct.pack('>d', ob.imag)


def _load_complex(b):
    assert comp_id(b[0], _COMPLEX)
    re = struct.unpack('>d', b[1:9])[0]
    im = struct.unpack('>d', b[9:17])[0]
    # complex(re, im) keeps both parts exactly, 're + 1j*im' would turn the
    # real part into nan if im is infinite
    # 17 bytes are consumed: 1 (identifier) + 8 (real) + 8 (imag)
    return complex(re, im), 17


def _dump_str(ob):
    str_bytes = str_to_bytes(ob)
    return init_BYTES([_STR]) + struct.pack('>I', len(str_bytes)) + str_bytes


def _load_str(b):
    assert comp_id(b[0], _STR)
    num_bytes = struct.unpack('>I', b[1:5])[0]
    s = bytes_to_str(b[5:5+num_bytes])
    return s, 5+num_bytes


def _dump_bytes(ob):
    return init_BYTES([_BYTES]) + struct.pack('>I', len(ob)) + ob


def _load_bytes(b):
    assert comp_id(b[0], _BYTES)
    num_bytes = struct.unpack('>I', b[1:5])[0]
    b_ = b[5:5+num_bytes]
    return b_, 5+num_bytes


def _load_items(b, idx, size, classes):
    """Load 'size' consecutive objects from b starting at idx, return (list of objects, new idx)."""
    items = []
    for _ in range(size):
        ob, len_ob = _load(b[idx:], classes)
        items.append(ob)
        idx += len_ob
    return items, idx


def _dump_tuple(t):
    parts = [init_BYTES([_TUPLE]), struct.pack('>I', len(t))]
    parts.extend(_dump(ti) for ti in t)
    return b"".join(parts)


def _load_tuple(b, classes):
    assert comp_id(b[0], _TUPLE)
    size = struct.unpack('>I', b[1:5])[0]
    items, idx = _load_items(b, 5, size, classes)
    return tuple(items), idx


def _dump_namedtuple(t):
    """Dump class name followed by alternating field name and value."""
    parts = [init_BYTES([_NAMEDTUPLE]),
             struct.pack('>I', len(t)),
             _dump(t.__class__.__name__)]
    for field, value in zip(t._fields, t):
        parts.append(_dump(field))
        parts.append(_dump(value))
    return b"".join(parts)


def _load_namedtuple(b, classes):
    """Restore a namedtuple, its class is created from the stored class name and field names."""
    assert comp_id(b[0], _NAMEDTUPLE)
    size = struct.unpack('>I', b[1:5])[0]
    class_name, len_ob = _load_str(b[5:])
    idx = 5 + len_ob
    t = []
    fields = []
    for _ in range(size):
        ob, len_ob = _load(b[idx:], classes)
        fields.append(ob)
        idx += len_ob
        ob, len_ob = _load(b[idx:], classes)
        t.append(ob)
        idx += len_ob
    np_class = namedtuple(class_name, fields)
    np_obj = np_class(*t)
    return np_obj, idx


def _dump_list(t):
    parts = [init_BYTES([_LIST]), struct.pack('>I', len(t))]
    parts.extend(_dump(ti) for ti in t)
    return b"".join(parts)


def _load_list(b, classes):
    assert comp_id(b[0], _LIST)
    size = struct.unpack('>I', b[1:5])[0]
    return _load_items(b, 5, size, classes)


def _dump_np_array(np_array):
    """Dump the pickle of the array (np.ndarray.dumps) preceded by its length."""
    nparray_bytes = np.ndarray.dumps(np_array)
    return init_BYTES([_NPARRAY]) + struct.pack('>I', len(nparray_bytes)) + nparray_bytes


def _load_np_array(b):
    assert comp_id(b[0], _NPARRAY)
    size = struct.unpack('>I', b[1:5])[0]
    npa = np_load(b[5: size+5])
    return npa, size+5


def _dump_bfkey(ob):
    """Dump the class name and the value returned by ob.__bfkey__()."""
    bfkey = ob.__bfkey__()
    obj_type = ob.__class__.__name__
    return init_BYTES([_BFKEY]) + _dump(str(obj_type)) + _dump(bfkey)


def _load_bfkey(b, classes):
    """Return the tuple (class name, bfkey), the original object is not restored."""
    assert comp_id(b[0], _BFKEY)
    obj_type, l_obj_type = _load_str(b[1:])
    bfkey, l_state = _load(b[l_obj_type+1:], classes)
    return (obj_type, bfkey), l_obj_type+l_state+1


def _dump_getstate(ob):
    """Dump the class name and the value returned by ob.__getstate__()."""
    state = ob.__getstate__()
    obj_type = ob.__class__.__name__
    return init_BYTES([_GETSTATE]) + _dump(str(obj_type)) + _dump(state)


def _load_getstate(b, classes):
    """Restore an object via classes[class name].__new__ and __setstate__."""
    assert comp_id(b[0], _GETSTATE)
    obj_type, l_obj_type = _load_str(b[1:])
    state, l_state = _load(b[l_obj_type+1:], classes)
    try:
        cls = classes[obj_type]
    except KeyError:
        raise BFUnkownClassError(obj_type)
    obj = cls.__new__(cls)
    obj.__setstate__(state)
    return obj, l_obj_type+l_state+1


def _dump_dict(ob):
    """Dump a dictionary as sorted list of (binary key, binary value) tuples.

    Sorting the binary representations makes the result independent of the
    order of the items in the dictionary.
    """
    bin_keys = []
    for k in ob:
        try:
            bin_keys.append((_dump(k), _dump(ob[k])))
        except Exception:
            print("failed to dump key '{}'".format(k))
            raise
    return init_BYTES([_DICT]) + _dump_list(sorted(bin_keys))


def _load_dict(b, classes):
    assert comp_id(b[0], _DICT)
    sorted_keys_value, l = _load_list(b[1:], classes)
    res_dict = {}
    for bin_key, bin_value in sorted_keys_value:
        key = _load(bin_key, classes)[0]
        value = _load(bin_value, classes)[0]
        res_dict[key] = value
    return res_dict, l+1


def _dump_scipy_csc_matrix(ob):
    """Dump data, indices, indptr and shape of a scipy csc matrix."""
    return b"".join([init_BYTES([_SP_CSC_MAT]),
                     _dump_np_array(ob.data),
                     _dump_np_array(ob.indices),
                     _dump_np_array(ob.indptr),
                     _dump_tuple(ob.shape)])


def _load_scipy_csc_matrix(b):
    assert comp_id(b[0], _SP_CSC_MAT)
    l = 0
    data, _l = _load_np_array(b[1:])
    l += _l
    indices, _l = _load_np_array(b[1 + l:])
    l += _l
    indptr, _l = _load_np_array(b[1 + l:])
    l += _l
    shape, _l = _load_tuple(b[1 + l:], classes={})
    l += _l
    return csc_matrix((data, indices, indptr), shape=shape), l+1


def _dump(ob):
    """Dispatch ob to the dump function of its type (current version).

    The order of the checks matters: bool is a subclass of int, a namedtuple
    is a tuple, and since python 3.11 every object provides a default
    __getstate__, so the scipy check has to precede the __getstate__ check.
    """
    if isinstance(ob, _spec_types):
        return _dump_spec(ob)
    elif isinstance(ob, (int, LONG_TYPE)):
        if (__min_int32 <= ob) and (ob <= __max_int32):
            return _dump_int_32(ob)
        else:
            return _dump_int(ob)
    elif isinstance(ob, float):
        return _dump_float(ob)
    elif isinstance(ob, complex):
        return _dump_complex(ob)
    elif isinstance(ob, str):
        return _dump_str(ob)
    elif isinstance(ob, bytes):
        return _dump_bytes(ob)
    elif isinstance(ob, tuple):
        if hasattr(ob, '_fields'):
            return _dump_namedtuple(ob)
        else:
            return _dump_tuple(ob)
    elif isinstance(ob, list):
        return _dump_list(ob)
    elif isinstance(ob, np.ndarray):
        return _dump_np_array(ob)
    elif isinstance(ob, dict):
        return _dump_dict(ob)
    elif (scipy is not None) and scipy.sparse.isspmatrix_csc(ob):
        return _dump_scipy_csc_matrix(ob)
    elif hasattr(ob, '__bfkey__'):
        return _dump_bfkey(ob)
    elif hasattr(ob, '__getstate__'):
        return _dump_getstate(ob)
    else:
        raise TypeError("unsupported type for dump '{}' ({})".format(type(ob), ob))


def _load(b, classes):
    """Dispatch on the identifier b[0] (current version), return (object, consumed bytes)."""
    identifier = b[0]
    if isinstance(identifier, str):
        # python2, indexing a str yields a length-1 str
        identifier = ord(identifier)

    if identifier == _SPEC:
        return _load_spec(b)
    elif identifier == _INT_32:
        return _load_int_32(b)
    elif (identifier == _INT) or (identifier == _INT_NEG):
        return _load_int(b)
    elif identifier == _FLOAT:
        return _load_float(b)
    elif identifier == _COMPLEX:
        return _load_complex(b)
    elif identifier == _STR:
        return _load_str(b)
    elif identifier == _BYTES:
        return _load_bytes(b)
    elif identifier == _TUPLE:
        return _load_tuple(b, classes)
    elif identifier == _NAMEDTUPLE:
        return _load_namedtuple(b, classes)
    elif identifier == _LIST:
        return _load_list(b, classes)
    elif identifier == _NPARRAY:
        return _load_np_array(b)
    elif identifier == _DICT:
        return _load_dict(b, classes)
    elif identifier == _BFKEY:
        return _load_bfkey(b, classes)
    elif identifier == _GETSTATE:
        return _load_getstate(b, classes)
    elif identifier == _SP_CSC_MAT:
        return _load_scipy_csc_matrix(b)
    else:
        raise BFLoadError("internal error (unknown identifier '{}')".format(hex(identifier)))


def dump(ob, vers=_VERS):
    """
        returns the binary footprint of the object 'ob' as bytes

        vers == 0x80 (default): current format, the version tag is the first byte
        vers <  0x80:           very first format, without version tag

        raises ValueError for any other value of 'vers'
    """
    global _dump    # allows to temporally overwrite the global _dump
    if vers == _VERS:
        return init_BYTES([_VERS]) + _dump(ob)
    elif vers < 0x80:
        # the nested _dump_* functions look up the global name _dump, so
        # rebinding it makes the whole recursion use the old format
        # (the rebinding is visible module wide while the dump is running)
        __dump_tmp = _dump
        _dump = _dump_00
        try:
            res = _dump(ob)
        finally:
            _dump = __dump_tmp
        return res
    else:
        raise ValueError("can not dump using unknown version tag {}".format(vers))


def load(b, classes=None):
    """
        reconstruct the object from the binary footprint given as bytes 'b'

        'classes' maps class names to class objects, needed to restore objects
        dumped via __getstate__ (and namedtuples dumped with the very first
        format), defaults to an empty lookup

        raises BFLoadError if the version tag or an identifier is unknown and
        BFUnkownClassError if a needed class is missing in 'classes'
    """
    global _load
    if classes is None:
        classes = {}
    vers = b[0]
    if byte_to_ord(vers) == _VERS:
        return _load(b[1:], classes)[0]
    elif byte_to_ord(vers) < 0x80:
        # very first version
        # has not even a version tag
        __load_tmp = _load
        _load = _load_00
        try:
            res = _load(b, classes)[0]
        finally:
            _load = __load_tmp
        return res
    else:
        raise BFLoadError("internal error (unknown version tag {})".format(vers))


##################################################################
####
####  VERY FIRST VERSION -- NO VERSION TAG
####
##################################################################
#
# so the first byte is already a type identifier, all identifiers
# are < 128 = 0x80 and can therefore be told apart from the version tag

def _load_namedtuple_00(b, classes):
    """
        need to explicitly know the named tuple class for reconstruction

        later version creates its own named tuple
    """
    assert comp_id(b[0], _NAMEDTUPLE)
    size = struct.unpack('>I', b[1:5])[0]
    class_name, len_ob = _load_str(b[5:])
    t, idx = _load_items(b, 5 + len_ob, size, classes)
    try:
        np_class = classes[class_name]
    except KeyError:
        raise BFUnkownClassError(class_name)
    obj = np_class(*t)
    return obj, idx


def _dump_namedtuple_00(t):
    """Dump class name and values only, the field names are not stored."""
    parts = [init_BYTES([_NAMEDTUPLE]),
             struct.pack('>I', len(t)),
             _dump(t.__class__.__name__)]
    parts.extend(_dump(ti) for ti in t)
    return b"".join(parts)


def _load_00(b, classes):
    """Dispatch on the identifier b[0] (very first version), return (object, consumed bytes)."""
    identifier = b[0]
    if isinstance(identifier, str):
        identifier = ord(identifier)

    if identifier == _SPEC:
        return _load_spec(b)
    elif identifier == _INT_32:
        return _load_int_32(b)
    elif (identifier == _INT) or (identifier == _INT_NEG):
        return _load_int(b)
    elif identifier == _FLOAT:
        return _load_float(b)
    elif identifier == _COMPLEX:
        return _load_complex(b)
    elif identifier == _STR:
        return _load_str(b)
    elif identifier == _BYTES:
        return _load_bytes(b)
    elif identifier == _TUPLE:
        return _load_tuple(b, classes)
    elif identifier == _NAMEDTUPLE:
        return _load_namedtuple_00(b, classes)
    elif identifier == _LIST:
        return _load_list(b, classes)
    elif identifier == _NPARRAY:
        return _load_np_array(b)
    elif identifier == _DICT:
        return _load_dict(b, classes)
    elif identifier == _GETSTATE:
        return _load_getstate(b, classes)
    else:
        raise BFLoadError("unknown identifier '{}'".format(hex(identifier)))


def _dump_00(ob):
    """Dispatch ob to the dump function of its type (very first version)."""
    if isinstance(ob, _spec_types):
        return _dump_spec(ob)
    elif isinstance(ob, (int, LONG_TYPE)):
        if (__min_int32 <= ob) and (ob <= __max_int32):
            return _dump_int_32(ob)
        else:
            return _dump_int(ob)
    elif isinstance(ob, float):
        return _dump_float(ob)
    elif isinstance(ob, complex):
        return _dump_complex(ob)
    elif isinstance(ob, str):
        return _dump_str(ob)
    elif isinstance(ob, bytes):
        return _dump_bytes(ob)
    elif isinstance(ob, tuple):
        if hasattr(ob, '_fields'):
            return _dump_namedtuple_00(ob)
        else:
            return _dump_tuple(ob)
    elif isinstance(ob, list):
        return _dump_list(ob)
    elif isinstance(ob, np.ndarray):
        return _dump_np_array(ob)
    elif isinstance(ob, dict):
        return _dump_dict(ob)
    elif hasattr(ob, '__getstate__'):
        return _dump_getstate(ob)
    else:
        raise RuntimeError("unsupported type for dump '{}'".format(type(ob)))