Calculate Genome metrics - vectorized numpy
examples/pandas/genome_calculation_vectorized_numpy.py
import datetime
import sys

import numpy as np
import pandas as pd

# Rows whose log2 ratio is strictly greater than this value are kept.
THRESHOLD = 0.2


def calculate_averages(df_numpy):
    """Return the per-row log2 ratio of two column-group means.

    The first group is columns 0-2 and the second group is columns 3-5.
    For every row the mean of each group is taken and the result is
    ``log2(mean_first / mean_second)``.

    Args:
        df_numpy: 2-D numeric array (or anything ``np.asarray`` can turn
            into one) with at least six columns.

    Returns:
        1-D ``numpy.ndarray`` with one value per input row. Rows where the
        second group's mean is zero, or where the ratio is not positive,
        produce ``inf``/``-inf``/``nan`` following NumPy's floating-point
        rules.
    """
    # asarray is a no-op for ndarrays and lets callers pass nested lists.
    data = np.asarray(df_numpy)
    first_mean = data[:, 0:3].mean(axis=1)
    second_mean = data[:, 3:6].mean(axis=1)
    return np.log2(first_mean / second_mean)


def main(argv=None):
    """Load an Excel sheet, compute the log2 ratios and print filtered rows.

    Prints the head of the loaded frame, the head of the rows whose ratio
    exceeds ``THRESHOLD``, and the cumulative elapsed time after each stage.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).
            Defaults to ``sys.argv`` when omitted.

    Raises:
        SystemExit: If no filename argument was supplied.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        # sys.exit instead of the site-provided exit(): same message and
        # exit status, but available even without the site module.
        sys.exit("Need filename")
    filename = argv[1]

    start = datetime.datetime.now()
    # The sheet must contain a 'genome name' column; it becomes the index.
    df = pd.read_excel(filename, index_col='genome name')
    print(df.head())
    print(datetime.datetime.now() - start)

    # Converting to a plain numpy array leaves pandas behind
    # (no more iloc, no headers, no index) so the maths is fully vectorized.
    calculated_value = calculate_averages(df.to_numpy())
    print(datetime.datetime.now() - start)

    # Boolean-mask the original frame; NaN ratios compare False and are
    # therefore dropped.
    filtered_df = df[calculated_value > THRESHOLD]
    print(filtered_df.head())
    print(datetime.datetime.now() - start)


if __name__ == "__main__":
    main()