import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the survey results CSV into a DataFrame.
file = './developer_survey_2020/survey_results_public.csv'
df = pd.read_csv(file, sep=',')

# Count respondents per country, ordered by country name.
# value_counts() does this in a single pass and ignores missing values;
# the previous np.unique(list(...)) approach turned missing countries into a
# spurious 'nan' category with a count of 0 and rescanned the whole frame
# once per country.
country_counts = df['Country'].value_counts().sort_index()
countries = country_counts.index.tolist()
totals = country_counts.tolist()

# One bar per country, with the country name as the tick label.
positions = np.arange(len(countries))
plt.figure()
plt.bar(positions, totals)
plt.xticks(positions, countries)
#%%
# Count respondents per 'Age1stCode' answer, ordered by the answer value.
# value_counts() skips missing answers, so no artificial 'nan' point with a
# count of 0 ends up in the plot, and the column is scanned only once.
age_counts = df['Age1stCode'].value_counts().sort_index()
ages = age_counts.index.tolist()
age_totals = age_counts.tolist()

# Scatter of answer -> number of respondents; small tick font because there
# is one tick label per distinct answer.
plt.figure()
plt.scatter(ages, age_totals)
plt.xticks(fontsize=5)
import pandas
df = pandas.read_csv("survey_results_public.csv")

# Response counts per country and per 'OpenSourcer' answer, over all rows.
countrey_dist = df['Country'].value_counts()
open_sourcers_dist = df['OpenSourcer'].value_counts()
print("top 10 response countries:\n", countrey_dist.head(10))

# Same distributions restricted to respondents from the United States.
# A single boolean mask with .loc replaces the repeated chained indexing.
is_us = df['Country'] == 'United States'
open_sourcers_dist_top = df.loc[is_us, 'OpenSourcer'].value_counts()
experience_dist_top = df.loc[is_us, 'YearsCode'].value_counts()
print('distribution of open sourcer is the top country USA is :\n', open_sourcers_dist_top)
print('distribution of experience in the top country USA is:\n', experience_dist_top)

# From here on df holds only the two columns of interest for US respondents.
df = df.loc[is_us, ['OpenSourcer', 'YearsCode']]

# Summarise coding experience per 'OpenSourcer' answer.
# The previous .agg('YearsCode') named a column instead of an aggregation
# function, so no statistic was computed and only a groupby object was
# printed. 'YearsCode' is converted to numbers first; any answer that is not
# a plain number becomes NaN and is ignored by count/mean/median.
years_code_numeric = pandas.to_numeric(df['YearsCode'], errors='coerce')
df_agg = years_code_numeric.groupby(df['OpenSourcer']).agg(['count', 'mean', 'median'])
print("relationship betwen OpenSourcer to coding exprience in US is :\n", df_agg)
import sys
import pandas as pd
# Path of the survey CSV; a single command-line argument overrides the default.
filename = "survey_results_public.csv"
if len(sys.argv) == 2:
    filename = sys.argv[1]

country_name = 'Israel'

chunks = []
dev_chunks = []  # never filled in this script; kept so the name stays defined

# Read the file in 10,000-row chunks, loading only the two columns used here,
# and keep just the rows of the selected country from each chunk.
for chunk in pd.read_csv(filename, usecols=['Country', 'DevType'], chunksize=10000):
    part = chunk[chunk['Country'] == country_name]
    # Progress trace: number of cells (rows * columns) before and after filtering.
    print(chunk.size)
    print(part.size)
    print('--')
    chunks.append(part)

df = pd.concat(chunks)
print(df.dtypes)

# Role names that are searched for inside the 'DevType' text.
dev_types = [
    'Academic researcher',
    'Data or business analyst',
    'Data scientist or machine learning specialist',
    'Database administrator',
    'Designer',
    'Developer, back-end',
    'Developer, desktop or enterprise applications',
    'Developer, embedded applications or devices',
    'Developer, front-end',
    'Developer, full-stack',
    'Developer, game or graphics',
    'Developer, mobile',
    'Developer, QA or test',
    'DevOps specialist',
    'Educator',
    'Engineer, data',
    'Engineer, site reliability',
    'Engineering manager',
    'Marketing or sales professional',
    'Product manager',
    'Scientist',
    'Senior Executive (C-Suite, VP, etc.)',
    'System administrator',
]

# Add one boolean column per role: True where the 'DevType' text contains the
# role name. The previous row-wise `value in row['DevType']` raised TypeError
# for rows whose 'DevType' is missing (NaN is a float); na=False maps those
# rows to False instead. regex=False keeps the parentheses and dots in the
# role names literal.
for value in dev_types:
    print(value)
    df[value] = df['DevType'].str.contains(value, regex=False, na=False)

print(df.count())
print(df.size)
import sys
import pandas as pd
# Path of the survey CSV; a single command-line argument overrides the default.
filename = "survey_results_public.csv"
if len(sys.argv) == 2:
    filename = sys.argv[1]

country_name = 'Israel'

chunks = []
dev_chunks = []  # never filled in this script; kept so the name stays defined

# Read the file in 10,000-row chunks, loading only the two columns used here,
# and keep just the rows of the selected country from each chunk.
for chunk in pd.read_csv(filename, usecols=['Country', 'DevType'], chunksize=10000):
    part = chunk[chunk['Country'] == country_name]
    chunks.append(part)

df = pd.concat(chunks)
print(df.dtypes)

# Role names that are searched for inside the 'DevType' text.
dev_types = [
    'Academic researcher',
    'Data or business analyst',
    'Data scientist or machine learning specialist',
    'Database administrator',
    'Designer',
    'Developer, back-end',
    'Developer, desktop or enterprise applications',
    'Developer, embedded applications or devices',
    'Developer, front-end',
    'Developer, full-stack',
    'Developer, game or graphics',
    'Developer, mobile',
    'Developer, QA or test',
    'DevOps specialist',
    'Educator',
    'Engineer, data',
    'Engineer, site reliability',
    'Engineering manager',
    'Marketing or sales professional',
    'Product manager',
    'Scientist',
    'Senior Executive (C-Suite, VP, etc.)',
    'System administrator',
]

# Add one boolean column per role: True where the 'DevType' text contains the
# role name, False otherwise (including rows with a missing 'DevType').
# A vectorised substring test replaces the per-row apply: it gives the same
# True/False values, avoids a Python call per row, and also behaves sensibly
# when no row matched the country. regex=False keeps the parentheses and dots
# in the role names literal.
for value in dev_types:
    print(value)
    df[value] = df['DevType'].str.contains(value, regex=False, na=False)

print(df.count())
print(df.size)
print(df)
import pandas as pd
import matplotlib.pyplot as plt
filepath = r'survey_results_public.csv'
df = pd.read_csv(filepath)

print("The dataframe columns are:\n", list(df.columns))
print('-' * 30)

# Report the dtype of every column; for float64 columns also print the
# summary statistics from describe().
# Iterating over (column, dtype) pairs avoids df.dtypes[i], which indexes a
# label-indexed Series by integer position (deprecated in recent pandas).
for column, dtype in df.dtypes.items():
    print(column, 'is of type ', dtype)
    if dtype == 'float64':
        print('*' * 10, "\nAnd it's statistics:")
        print(df[column].describe())

# Country with the most responses: value_counts() sorts by count, descending,
# so the first index entry is the most frequent country.
most_responsive_country = df['Country'].value_counts().index[0]

# Mean of 'WorkWeekHrs' over the rows of that country.
most_responsive_country_df = df[df['Country'] == most_responsive_country]
average_working_time_weekly = most_responsive_country_df['WorkWeekHrs'].mean()

# Back on the full frame: share of each 'UndergradMajor' answer, drawn as a
# horizontal bar chart with the shares scaled by 100.
study_fields_normalized = df['UndergradMajor'].value_counts(normalize=True)
fig, ax = plt.subplots()
ax.barh(list(study_fields_normalized.index), list(study_fields_normalized * 100))
ax.set_xlabel("Relative Distribution")

# plt.show() runs the GUI event loop so the window stays open when this is
# executed as a plain script; fig.show() returns immediately in that case.
plt.show()