Serialization of multiple NumPy arrays
- HDF5 allows you to access a specific array without loading the whole data structure into memory (see the sketch after this list).
- The same is possible with SQLite, but the resulting file is much bigger.
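
Here is a minimal sketch of such a partial read, assuming a file called demo.h5 with a dataset called "data" holding a stack of equally shaped arrays (the full example below creates exactly such a file):

import h5py

# Indexing the dataset reads only the selected slice from disk,
# not the whole dataset.
with h5py.File('demo.h5', 'r') as hdf:
    first = hdf['data'][0]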
examples/serialization/multiple_numpy_arrays.py
import os
import sys
import glob
import json
import sqlite3
import pickle

import numpy as np
import h5py
import scipy.io

def main():
    # Remove the files left behind by a previous run
    for path in glob.glob("demo*"):
        os.unlink(path)

    if len(sys.argv) != 4:
        exit(f"Usage: {sys.argv[0]} ROWS COLS COUNT")
    size = (int(sys.argv[1]), int(sys.argv[2]))
    count = int(sys.argv[3])
    print(f"size: {size} count: {count}\n")

    originals = [np.random.random(size) for _ in range(count)]

    try_json(originals)
    try_pickle(originals)
    try_matlab(originals)
    try_hdf5(originals)
    try_hdf5_separate(originals)
    try_sqlite(originals)

def try_json(originals):
    with open('demo.json', 'w') as fh:
        json.dump(originals, fh, default=lambda obj: obj.tolist())
    with open('demo.json') as fh:
        loaded = np.array(json.load(fh))
    assert np.array_equal(originals, loaded)
    print(f"json:   {os.path.getsize('demo.json'):7}")

def try_pickle(originals):
    with open('demo.pickle', 'wb') as fh:
        pickle.dump(originals, fh, pickle.HIGHEST_PROTOCOL)
    with open('demo.pickle', 'rb') as fh:
        loaded = pickle.load(fh)
    assert np.array_equal(originals, loaded)
    print(f"pickle: {os.path.getsize('demo.pickle'):7}")

def try_matlab(originals):
    scipy.io.savemat('demo.mat', {'data': originals})
    mat = scipy.io.loadmat('demo.mat')
    loaded = mat['data']
    assert np.array_equal(originals, loaded)
    print(f"matlab: {os.path.getsize('demo.mat'):7}")

def try_hdf5(originals):
    with h5py.File('demo.h5', 'w') as hdf:
        hdf['data'] = originals
    with h5py.File('demo.h5', 'r') as hdf:
        loaded = hdf['data'][:]   # [:] is needed to copy the content
    assert np.array_equal(originals, loaded)
    print(f"hdf5:   {os.path.getsize('demo.h5'):7}")

# Read the arrays one by one instead of loading all the data into memory
def try_hdf5_separate(originals):
    with h5py.File('demo.hdf5', 'w') as hdf:
        hdf['data'] = originals
    for ix in range(len(originals)):
        with h5py.File('demo.hdf5', 'r') as hdf:
            loaded = hdf['data'][ix]   # reads only this slice from disk
        assert np.array_equal(originals[ix], loaded)
    print(f"hdf5:   {os.path.getsize('demo.hdf5'):7}")

# Store each array as a pickled BLOB and fetch them back one by one
def try_sqlite(originals):
    conn = sqlite3.connect("demo.db")
    curs = conn.cursor()
    try:
        curs.execute('''CREATE TABLE arrays (
                            id    INTEGER PRIMARY KEY AUTOINCREMENT,
                            array BLOB NOT NULL
                        )''')
        sql = '''INSERT INTO arrays (array) VALUES (?)'''
        pickled = [pickle.dumps(arr, pickle.HIGHEST_PROTOCOL) for arr in originals]
        # executemany needs a list of tuples for the placeholder
        curs.executemany(sql, [(arr,) for arr in pickled])
        conn.commit()
    except sqlite3.OperationalError as err:
        print(f'sqlite error: {err.args[0]}')
    conn.close()

    for ix in range(1, len(originals) + 1):
        try:
            # Reconnect for every read to show that each array
            # can be fetched independently
            conn = sqlite3.connect("demo.db")
            curs = conn.cursor()
            sql = '''SELECT array FROM arrays WHERE id = ?'''
            curs.execute(sql, (ix,))
            loaded = pickle.loads(curs.fetchone()[0])
        except sqlite3.OperationalError as err:
            print(f'sqlite error: {err.args[0]}')
            exit()
        assert np.array_equal(originals[ix - 1], loaded)
    print(f"sqlite: {os.path.getsize('demo.db'):7}")

main()
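
Run the script with the number of rows and columns of each array and the number of arrays to generate, for example: python multiple_numpy_arrays.py 100 100 5. It prints the size of the file each format produced, so the formats can be compared.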