Create Excel file for experiment with random data
Input is an Excel file with the following columns:
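genome name, c1, c2, c3, c4, c5, c6
- c1-c3 are the measurements of condition 1 (cond1)
- c4-c6 are the measurements of condition 2 (cond2)
(The script below names the six measurement columns m0-m5, so m0-m2 correspond to c1-c3 and m3-m5 to c4-c6.)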
We would like to keep only the rows that satisfy the following conditions:
log2(avg(c1..c3) / avg(c4..c6)) > limit
other_limit > p.value( )
import numpy as np
import pandas as pd
import datetime
import sys
def build_random_frame(rows_num, cols_num=6):
    """Return a DataFrame of uniform random values in [0, 1).

    The frame has ``rows_num`` rows labelled ``g0, g1, ...`` (the index is
    named ``'genome name'``) and ``cols_num`` columns labelled
    ``m0, m1, ...``.
    """
    matrix = np.random.rand(rows_num, cols_num)
    genome_names = [f'g{i}' for i in range(rows_num)]
    column_names = [f'm{i}' for i in range(cols_num)]
    df = pd.DataFrame(matrix, index=genome_names, columns=column_names)
    df.index.name = 'genome name'
    return df


def main(argv=None):
    """Generate random data and save it to ``raw_data.xlsx``.

    ``argv`` defaults to ``sys.argv``; ``argv[1]`` is the number of rows to
    generate. Prints the first rows of the data, then the time spent
    generating (including that print) and the time spent saving.

    Exits with a message (status 1) when the row count is missing, is not
    an integer, or is negative.
    """
    argv = sys.argv if argv is None else argv
    # sys.exit rather than the bare exit(): the latter is injected by the
    # `site` module for interactive use and is not always available.
    if len(argv) < 2:
        sys.exit("Need number of rows")
    try:
        rows_num = int(argv[1])
    except ValueError:
        sys.exit(f"Number of rows must be an integer, got {argv[1]!r}")
    if rows_num < 0:
        sys.exit(f"Number of rows must not be negative, got {rows_num}")

    start_time = datetime.datetime.now()
    df = build_random_frame(rows_num)
    print(df.head())
    end_generate_time = datetime.datetime.now()
    print(end_generate_time - start_time)

    df.to_excel('raw_data.xlsx')
    end_save_time = datetime.datetime.now()
    print(end_save_time - end_generate_time)


if __name__ == "__main__":
    main()
Output:
m0 m1 m2 m3 m4 m5
genome name
g0 0.775167 0.120102 0.813921 0.284670 0.074309 0.978062
g1 0.449572 0.556647 0.851609 0.711773 0.052198 0.543029
g2 0.592324 0.350038 0.273521 0.248995 0.773113 0.998779
0:00:00.007476
0:00:00.140074