TODO: Stack Overflow example
examples/pandas/stack_overflow_pandas.py
"""Chart Stack Overflow 2020 survey responses per country and per first-coding age.

Run as a script: reads ``./developer_survey_2020/survey_results_public.csv``
and opens two matplotlib figures (a bar chart and a scatter plot).
"""
import numpy as np
import pandas as pd

SURVEY_CSV = './developer_survey_2020/survey_results_public.csv'


def count_by_value(df, column):
    """Count how many rows of *df* hold each distinct value of *column*.

    Args:
        df: DataFrame containing *column*.
        column: Name of the column to tally.

    Returns:
        A ``(labels, totals)`` pair of equal-length lists ordered by label.
        Missing values are left out.
    """
    # value_counts() tallies every label in one pass instead of filtering the
    # whole frame once per label, and it drops NaN -- which
    # np.unique(list(...)) would have turned into a bogus 'nan' label whose
    # count is always zero.
    counts = df[column].value_counts().sort_index()
    return list(counts.index), [int(total) for total in counts]


def main():
    """Load the survey and draw the per-country and per-age charts."""
    # Imported here so count_by_value() stays importable on machines that
    # have no plotting backend installed.
    import matplotlib.pyplot as plt

    df = pd.read_csv(SURVEY_CSV, sep=',')

    # Bar chart: number of responses per country.
    countries, totals = count_by_value(df, 'Country')
    positions = np.arange(len(countries))
    plt.figure()
    plt.bar(positions, totals)
    plt.xticks(positions, countries)

    # Scatter plot: number of responses per "age at first code" answer.
    ages, age_totals = count_by_value(df, 'Age1stCode')
    plt.figure()
    plt.scatter(ages, age_totals)
    plt.xticks(fontsize=5)

    # Without an explicit show() a plain `python script.py` run exits
    # before any figure is displayed.
    plt.show()


if __name__ == "__main__":
    main()
examples/pandas/pandas_stackoverflow.py
"""Summarise Stack Overflow survey respondents, focusing on the United States.

Run as a script: reads ``survey_results_public.csv`` from the working
directory and prints several distributions.
"""
import pandas as pd

SURVEY_CSV = "survey_results_public.csv"
TOP_COUNTRY = 'United States'


def experience_by_open_sourcer(df, country):
    """Summarise ``YearsCode`` per ``OpenSourcer`` answer for one country.

    Args:
        df: Survey DataFrame with ``Country``, ``OpenSourcer`` and
            ``YearsCode`` columns.
        country: Value of ``Country`` to keep.

    Returns:
        DataFrame indexed by ``OpenSourcer`` with ``count``, ``mean`` and
        ``median`` columns computed over the numeric ``YearsCode`` answers.
    """
    subset = df.loc[df['Country'] == country, ['OpenSourcer', 'YearsCode']].copy()
    # Answers that are not plain numbers are coerced to NaN so they are
    # ignored by the statistics rather than breaking them.
    subset['YearsCode'] = pd.to_numeric(subset['YearsCode'], errors='coerce')
    # The original called .agg('YearsCode'), which is not an aggregation: it
    # only selected the column and printed a SeriesGroupBy object.
    return subset.groupby('OpenSourcer')['YearsCode'].agg(['count', 'mean', 'median'])


def main():
    """Load the survey and print the country / open-source / experience summaries."""
    df = pd.read_csv(SURVEY_CSV)

    countrey_dist = df['Country'].value_counts()
    print("top 10 response countries:\n", countrey_dist.head(10))

    in_top_country = df['Country'] == TOP_COUNTRY
    open_sourcers_dist_top = df.loc[in_top_country, 'OpenSourcer'].value_counts()
    experience_dist_top = df.loc[in_top_country, 'YearsCode'].value_counts()
    print('distribution of open sourcer is the top country USA is :\n', open_sourcers_dist_top)
    print('distribution of experience in the top country USA is:\n', experience_dist_top)

    df_agg = experience_by_open_sourcer(df, TOP_COUNTRY)
    print("relationship betwen OpenSourcer to coding exprience in US is :\n", df_agg)


if __name__ == "__main__":
    main()
examples/pandas/orig.py
"""Flag which developer roles each respondent from one country reported.

Usage: ``python orig.py [survey_results_public.csv]``

Reads only the ``Country`` and ``DevType`` columns of the survey in chunks,
keeps the rows for ``COUNTRY_NAME`` and adds one boolean column per role.
"""
import sys

import pandas as pd

DEFAULT_CSV = "survey_results_public.csv"
COUNTRY_NAME = 'Israel'
COLUMNS = ['Country', 'DevType']
DEV_TYPES = (
    'Academic researcher',
    'Data or business analyst',
    'Data scientist or machine learning specialist',
    'Database administrator',
    'Designer',
    'Developer, back-end',
    'Developer, desktop or enterprise applications',
    'Developer, embedded applications or devices',
    'Developer, front-end',
    'Developer, full-stack',
    'Developer, game or graphics',
    'Developer, mobile',
    'Developer, QA or test',
    'DevOps specialist',
    'Educator',
    'Engineer, data',
    'Engineer, site reliability',
    'Engineering manager',
    'Marketing or sales professional',
    'Product manager',
    'Scientist',
    'Senior Executive (C-Suite, VP, etc.)',
    'System administrator',
)


def load_country_rows(filename, country_name, chunksize=10000):
    """Read the survey in chunks and keep only the rows of one country.

    Prints the size of every chunk and of its matching part as a progress
    trace.

    Args:
        filename: Path (or file-like object) of the survey CSV.
        country_name: Value of ``Country`` to keep.
        chunksize: Number of rows read per chunk.

    Returns:
        DataFrame with the ``Country`` and ``DevType`` columns of the
        matching rows.
    """
    parts = []
    for chunk in pd.read_csv(filename, usecols=COLUMNS, chunksize=chunksize):
        part = chunk[chunk['Country'] == country_name]
        print(chunk.size)
        print(part.size)
        print('--')
        parts.append(part)
    if not parts:
        # pd.concat() raises ValueError on an empty list, which would happen
        # if the reader yields no chunks at all.
        return pd.DataFrame(columns=COLUMNS)
    return pd.concat(parts)


def has_dev_type(dev_type_column, value):
    """Return a boolean Series: does each ``DevType`` answer mention *value*?

    Args:
        dev_type_column: Series of ``DevType`` answers; may contain NaN.
        value: Role name to look for (plain substring, case-sensitive).

    Returns:
        Boolean Series aligned with *dev_type_column*; missing answers
        yield ``False``.
    """
    # regex=False: role names such as 'Senior Executive (C-Suite, VP, etc.)'
    # contain regex metacharacters and must be matched literally.
    # na=False: a missing answer is "not this role"; the row-wise
    # `value in row['DevType']` raised TypeError on NaN instead.
    return dev_type_column.str.contains(value, regex=False, na=False)


def main():
    """Load the country's rows, add one flag column per role and print a summary."""
    filename = DEFAULT_CSV
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    df = load_country_rows(filename, COUNTRY_NAME)
    print(df.dtypes)

    for value in DEV_TYPES:
        print(value)
        df[value] = has_dev_type(df['DevType'], value)

    print(df.count())
    print(df.size)


if __name__ == "__main__":
    main()
examples/pandas/panda_file.py
"""Flag which developer roles each respondent from one country reported.

Usage: ``python panda_file.py [survey_results_public.csv]``

Reads only the ``Country`` and ``DevType`` columns of the survey in chunks,
keeps the rows for ``COUNTRY_NAME``, adds one boolean column per role and
prints the resulting table.
"""
import sys

import pandas as pd

DEFAULT_CSV = "survey_results_public.csv"
COUNTRY_NAME = 'Israel'
COLUMNS = ['Country', 'DevType']
DEV_TYPES = (
    'Academic researcher',
    'Data or business analyst',
    'Data scientist or machine learning specialist',
    'Database administrator',
    'Designer',
    'Developer, back-end',
    'Developer, desktop or enterprise applications',
    'Developer, embedded applications or devices',
    'Developer, front-end',
    'Developer, full-stack',
    'Developer, game or graphics',
    'Developer, mobile',
    'Developer, QA or test',
    'DevOps specialist',
    'Educator',
    'Engineer, data',
    'Engineer, site reliability',
    'Engineering manager',
    'Marketing or sales professional',
    'Product manager',
    'Scientist',
    'Senior Executive (C-Suite, VP, etc.)',
    'System administrator',
)


def load_country_rows(filename, country_name, chunksize=10000):
    """Read the survey in chunks and keep only the rows of one country.

    Args:
        filename: Path (or file-like object) of the survey CSV.
        country_name: Value of ``Country`` to keep.
        chunksize: Number of rows read per chunk.

    Returns:
        DataFrame with the ``Country`` and ``DevType`` columns of the
        matching rows.
    """
    parts = [
        chunk[chunk['Country'] == country_name]
        for chunk in pd.read_csv(filename, usecols=COLUMNS, chunksize=chunksize)
    ]
    if not parts:
        # pd.concat() raises ValueError on an empty list, which would happen
        # if the reader yields no chunks at all.
        return pd.DataFrame(columns=COLUMNS)
    return pd.concat(parts)


def has_dev_type(dev_type_column, value):
    """Return a boolean Series: does each ``DevType`` answer mention *value*?

    Args:
        dev_type_column: Series of ``DevType`` answers; may contain NaN.
        value: Role name to look for (plain substring, case-sensitive).

    Returns:
        Boolean Series aligned with *dev_type_column*; missing answers
        yield ``False``.
    """
    # Vectorised replacement for the per-row
    # `pd.notnull(x) and value in x` lambda.
    # regex=False keeps names such as 'Senior Executive (C-Suite, VP, etc.)'
    # literal; na=False maps missing answers to False.
    return dev_type_column.str.contains(value, regex=False, na=False)


def main():
    """Load the country's rows, add one flag column per role and print the table."""
    filename = DEFAULT_CSV
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    df = load_country_rows(filename, COUNTRY_NAME)
    print(df.dtypes)

    for value in DEV_TYPES:
        print(value)
        df[value] = has_dev_type(df['DevType'], value)

    print(df.count())
    print(df.size)
    print(df)


if __name__ == "__main__":
    main()
examples/pandas/another_pandas.py
"""Explore the Stack Overflow survey: column types, top country, study fields.

Run as a script: reads ``survey_results_public.csv`` from the working
directory, prints per-column information and shows a horizontal bar chart
of the undergraduate-major distribution.
"""
import pandas as pd

SURVEY_CSV = r'survey_results_public.csv'


def most_responsive_country(df):
    """Return the ``Country`` value that occurs most often in *df*.

    Raises:
        IndexError: If the ``Country`` column has no non-missing values.
    """
    return df['Country'].value_counts().index[0]


def mean_work_week_hours(df, country):
    """Return the mean of ``WorkWeekHrs`` over the rows whose ``Country`` is *country*."""
    return df.loc[df['Country'] == country, 'WorkWeekHrs'].mean()


def study_field_percentages(df):
    """Return the share of each ``UndergradMajor`` value, in percent.

    Returns:
        Series indexed by major, most common first; missing answers are
        excluded from the total.
    """
    return df['UndergradMajor'].value_counts(normalize=True) * 100


def main():
    """Print the column overview and country statistics, then plot study fields."""
    # Imported here so the helpers above stay importable on machines that
    # have no plotting backend installed.
    import matplotlib.pyplot as plt

    df = pd.read_csv(SURVEY_CSV)
    print("The dataframe columns are:\n", list(df.columns))
    print('-' * 30)

    # Report every column's dtype; for float columns also print statistics.
    # Iterating items() avoids positional `df.dtypes[i]` look-ups on a
    # label-indexed Series, which pandas has deprecated.
    for column, dtype in df.dtypes.items():
        print(column, 'is of type ', dtype)
        if dtype == 'float64':
            print('*' * 10, "\nAnd it's statistics:")
            print(df[column].describe())

    # Which country answered most, and how long is its average work week?
    # These values were previously computed but never reported.
    top_country = most_responsive_country(df)
    average_working_time_weekly = mean_work_week_hours(df, top_country)
    print("Most responsive country:", top_country)
    print("Average weekly working hours there:", average_working_time_weekly)

    # Distribution of study fields over the whole survey, as a bar chart.
    study_fields = study_field_percentages(df)
    fig, ax = plt.subplots()
    ax.barh(list(study_fields.index), list(study_fields))
    ax.set_xlabel("Relative Distribution")
    # plt.show() blocks until the window is closed; fig.show() returns
    # immediately, so in a script the figure disappears when the process exits.
    plt.show()


if __name__ == "__main__":
    main()