Create Excel file for experiment with random data


Input is an Excel file with the following columns:


genome name, c1, c2, c3, c4, c5, c6

We would like to keep only the rows that satisfy both of the following conditions:


log2(avg(c1..c3) / avg(c4..c6)) > limit
other_limit > p.value( )


examples/pandas/genome_create_excel.py
import numpy as np
import pandas as pd
import datetime
import sys

# Number of measurement columns generated for every genome.
COLS_NUM = 6


def parse_rows_num(argv: list) -> int:
    """Return the requested number of rows taken from the command line.

    ``argv`` has the same layout as ``sys.argv``: the program name followed
    by the arguments. The process is terminated via ``sys.exit`` with a
    short message when the argument is missing, is not an integer, or is
    negative, so the user sees a usage hint instead of a traceback.
    """
    if len(argv) < 2:
        # sys.exit is used instead of the builtin exit(): the latter is
        # injected by the site module and is not guaranteed to exist.
        sys.exit("Need number of rows")

    try:
        rows_num = int(argv[1])
    except ValueError:
        sys.exit(f"Number of rows must be an integer, got {argv[1]!r}")

    if rows_num < 0:
        sys.exit(f"Number of rows must not be negative, got {rows_num}")

    return rows_num


def build_frame(rows_num: int, cols_num: int = COLS_NUM) -> pd.DataFrame:
    """Build a DataFrame of uniform random values in [0, 1).

    Rows are labelled ``g0 .. g{rows_num-1}`` (index name ``genome name``)
    and columns are labelled ``m0 .. m{cols_num-1}``.
    """
    matrix = np.random.rand(rows_num, cols_num)

    genome_names = [f'g{i}' for i in range(rows_num)]
    column_names = [f'm{i}' for i in range(cols_num)]

    df = pd.DataFrame(matrix, index=genome_names, columns=column_names)
    df.index.name = 'genome name'
    return df


def main(argv=None):
    """Generate the random data, save it as ``raw_data.xlsx`` and report timings.

    Prints the first rows of the generated table, then the time spent
    generating the data, then the time spent writing the Excel file.
    ``argv`` defaults to ``sys.argv``.
    """
    rows_num = parse_rows_num(sys.argv if argv is None else argv)

    start_time = datetime.datetime.now()
    df = build_frame(rows_num)

    print(df.head())

    # The generation timing deliberately includes printing the preview,
    # matching the measurement boundaries of the original script.
    end_generate_time = datetime.datetime.now()
    print(end_generate_time - start_time)

    df.to_excel('raw_data.xlsx')

    end_save_time = datetime.datetime.now()
    print(end_save_time - end_generate_time)


if __name__ == "__main__":
    main()

                   m0        m1        m2        m3        m4        m5
genome name                                                            
g0           0.775167  0.120102  0.813921  0.284670  0.074309  0.978062
g1           0.449572  0.556647  0.851609  0.711773  0.052198  0.543029
g2           0.592324  0.350038  0.273521  0.248995  0.773113  0.998779
0:00:00.007476
0:00:00.140074