# You are on page 1 of 4

# Data Analysis in Python - 2

'''
Exploratory data analysis:
1. Getting to know the data
2. Data preprocessing (missing values)
3. Cross tables and data visualization
'''
# Check the data type of each column. info() prints its report itself and
# returns None, so it is not wrapped in print().
data.info()

# Element-wise null check (False = value present).
print(data.isnull())

# Number of null values in each column.
print('Data columns with null values:\n', data.isnull().sum())
# So there are no missing values in any column.

# Descriptive summary of the numerical variables, in statistical form.
summary_num = data.describe()
print(summary_num)

# Descriptive summary of the categorical (object-dtype) variables.
summary_cat = data.describe(include='O')
print(summary_cat)

# Frequency of each category under every categorical variable of interest.
# Printed explicitly: a bare expression is only echoed in an interactive
# session and is silently discarded when the file runs as a script.
for column in ('JobType', 'occupation', 'EdType', 'nativecountry',
               'SalStat', 'maritalstatus'):
    print(data[column].value_counts())
# Inspect the distinct values of a column to spot special characters that
# stand in for missing values / unexpected classes.
print(np.unique(data['JobType']))     # there exists ' ?' instead of NaN
print(np.unique(data['occupation']))  # there exists ' ?' instead of NaN
print(data.isnull().sum())            # ...but no null values are reported

# Re-read the data so that ' ?' is parsed as a missing value.
data = pd.read_csv('income.csv', na_values=[' ?'])
print(data.isnull().sum())            # the null counts now show up

# Data pre-processing: collect every row with at least one missing value.
missing = data[data.isnull().any(axis=1)]
'''
Points to note up to now:
1. Missing values in JobType = 1809
2. Missing values in occupation = 1816
3. There are 1809 rows where two specific
columns, i.e. occupation and JobType, have missing
values
4. You still have occupation unfilled for 7 rows (1816 - 1809)
because JobType is Never worked.
So, in this case we can delete all the missing data or
replace the missing values with alternate reasonable values.
But in this case it is very complex to replace the missing values
by modeling a mechanism to produce these values.
So, we go for deleting these values.
'''
# Drop every row that contains a missing value (axis=0 denotes rows).
data2 = data.dropna(axis=0)
# Relationship between the independent numerical variables, using the
# correlation measure. Only the numeric columns are passed on: since
# pandas 2.0, DataFrame.corr() no longer drops non-numeric columns silently
# and raises on the categorical (string) columns instead.
correlation = data2.select_dtypes(include='number').corr()
# Here none of the values is near 1, i.e. no correlation.

# Now consider the categorical variables and their relationships.
print(data2.columns)  # column names, for reference in the analysis below

# Gender proportions using a cross table.
gender = pd.crosstab(index=data2['gender'], columns='count', normalize=True)
# The variable of interest, 'gender', is the index column;
# we found males are more than females.
print(gender)

# Relationship between gender and SalStat using a two-way table.
gender_salstat = pd.crosstab(index=data2['gender'],
                             columns=data2['SalStat'],
                             margins=True,
                             normalize='index')
# The index column is 'gender' with its categories along the rows, the other
# columns are the 'SalStat' categories; normalize='index' is used to get row
# proportions that sum to 1.
print(gender_salstat)
# Proportions of the output variable using a frequency distribution plot.
# x/data are passed by keyword: recent seaborn releases no longer accept the
# plotted variable as the first positional argument.
SalStat = sb.countplot(x='SalStat', data=data2)  # frequency dist. of SalStat
'''
therefore we found 75% are above 50000 and 25% are below/equal to 50000
'''
# NOTE(review): confirm the proportions stated above against the count plot
# and the gender/SalStat cross table; they cannot be verified from this file.

# Frequency of the age variable using a histogram
# (kde=False keeps frequencies on the y-axis).
# NOTE: distplot is deprecated since seaborn 0.11; histplot is its
# replacement once older seaborn versions need not be supported.
sb.distplot(data2['age'], bins=10, kde=False)
# People aged 20-45 are the most frequent.

# Relationship between age and SalStat using a box plot.
sb.boxplot(x='SalStat', y='age', data=data2)
print(data2.groupby('SalStat')['age'].median())
'''
therefore we found that 35-50 age have greater than 50000 sal.
while 25-35 age have less or equal to 50000 sal.
similarly we can check the relationship between other variables.
'''
# Further exploratory data analysis.

# Relationship between JobType and SalStat using a bar plot
# (JobType along the y-axis).
sb.countplot(y='JobType', data=data2, hue='SalStat')
# Relationship between JobType and SalStat using a cross table of row
# proportions.
print(pd.crosstab(index=data2['JobType'], columns=data2['SalStat'],
                  margins=True, normalize='index'))
# Therefore, JobType is an important variable in avoiding the misuse of
# subsidies.

# Relationship between education and SalStat using a bar plot
# (EdType along the y-axis).
sb.countplot(y='EdType', data=data2, hue='SalStat')
# Relationship between EdType and SalStat using a cross table of row
# proportions.
print(pd.crosstab(index=data2['EdType'], columns=data2['SalStat'],
                  margins=True, normalize='index'))
# Therefore, EdType is an important variable in avoiding the misuse of
# subsidies.

# Relationship between occupation and SalStat using a bar plot
# (occupation along the y-axis).
sb.countplot(y='occupation', data=data2, hue='SalStat')
# Relationship between occupation and SalStat using a cross table of row
# proportions.
print(pd.crosstab(index=data2['occupation'], columns=data2['SalStat'],
                  margins=True, normalize='index'))

# Histograms of the frequency distribution of capital gain and capital loss.
# NOTE: distplot is deprecated since seaborn 0.11; histplot is its
# replacement once older seaborn versions need not be supported.
sb.distplot(data2['capitalgain'], bins=10, kde=False)
sb.distplot(data2['capitalloss'], bins=10, kde=False)

# Box plot of SalStat vs hoursperweek. x/y are passed by keyword because
# recent seaborn releases reject them as positional arguments.
sb.boxplot(x='SalStat', y='hoursperweek', data=data2)

# You might also like