You are on page 1of 2

# ## 3.6 Describe Categorical Variables


# Standard imports
# In[25]:
import numpy as np
import scipy.stats
import pandas as pd
# In[26]:
import matplotlib
import matplotlib.pyplot as pp
import pandas.plotting
from IPython import display
from ipywidgets import interact, widgets
get_ipython().run_line_magic('matplotlib', 'inline')
# In[27]:
import re
import mailbox
import csv
# The table records interviews with women in Whickham, England, in 1973, who were asked whether
# they were smokers. The interviews were followed up 20 years later, when it was recorded whether
# each woman was still alive.
#
#
# In[28]:
# Read the survey table from the CSV file in the working directory.
smoking = pd.read_csv(filepath_or_buffer='whickham.csv')
# In[29]:
# Print a summary of the frame's columns and dtypes.
smoking.info()
# The categorical variables in this case, smoker and outcome, are both binary: yes or no.
# In[30]:
# Preview every 50th row among positions 0..1313.
smoking.iloc[0:1314:50]
# In[47]:
# Largest value in the age column.
smoking['age'].max()
# In[31]:
# The raw smoker column.
smoking['smoker']
# In[32]:
# Number of records for each smoker value.
smoking['smoker'].value_counts()
# In[48]:
# The same breakdown as fractions of the total, displayed as a table.
smoking['smoker'].value_counts(normalize=True).to_frame()
# In[49]:
# Absolute smoker counts, displayed as a table.
smoking['smoker'].value_counts().to_frame()
# In[34]:
# Absolute outcome counts, displayed as a table.
smoking['outcome'].value_counts().to_frame()
# If we want to see the values as fractions of the total number of records, we add
# normalize=True. This is useful because we know that the fractions sum to one, so later we can
# drop the 'Dead' column.
# In[35]:
# Outcome frequencies as fractions of all records, displayed as a table.
smoking['outcome'].value_counts(normalize=True).to_frame()
# We break down the proportion of outcomes by smoker group. We can do this with groupby.
#
#
# In[36]:
# Outcome counts within each smoker group.
smoking.groupby(by="smoker")["outcome"].value_counts()
# In[50]:
# The same breakdown, normalized within each smoker group.
smoking.groupby(by="smoker")["outcome"].value_counts(normalize=True)
# In[37]:
# Keep the normalized per-group breakdown for the cells that follow.
bysmoker = smoking.groupby(by="smoker")["outcome"].value_counts(normalize=True)
bysmoker
# The index has two levels. So we can move one of the index levels to columns using unstack.
# In[38]:
# Inspect the two-level index of the grouped result.
bysmoker.index
# In[51]:
# Pivot the innermost index level into columns, then remove the 'Dead' column.
bysmoker.unstack(level=-1).drop(columns='Dead')
# We divide cases into age groups using pandas' cut, which generates categorical levels based on a
# set of bins, and we'll make an entirely new column for that.
# In[53]:
smoking.head()
# In[54]:
# Bin each record's age into one of five labelled groups and store the result
# as a new categorical column. The five labels correspond, in order, to the
# five intervals defined by the six bin edges.
# NOTE: the '53-64' label was previously split across two physical lines, which
# left an unterminated string literal; the call is now wrapped inside brackets.
smoking['ageGroup'] = pd.cut(
    smoking.age,
    [0, 30, 40, 53, 64, 85],
    labels=['0-30', '30-40', '40-53', '53-64', '64-85'],
)
# In[55]:
# The frame now carries the extra ageGroup column.
smoking.head()
# In[56]:
smoking['ageGroup'].head()
# In[57]:
# Compare against the raw ages the groups were derived from.
smoking['age'].head()
# Grouping by age group, and then smoker status. Use value counts to get proportions.
#
# And we make a nicer display by unstacking the series and dropping 'Dead' from the
# columns.
# In[58]:
# Normalized outcome frequencies within each (ageGroup, smoker) group.
byage = smoking.groupby(by=['ageGroup', 'smoker'])['outcome'].value_counts(normalize=True)
byage
# In[59]:
# Inspect the multi-level index of the grouped result.
byage.index
# In[60]:
# Pivot the innermost index level into columns.
byage.unstack(level=-1)
# In[62]:
# Pivot the innermost level into columns, remove the 'Dead' column, then pivot
# the next index level into columns as well.
byage.unstack(level=-1).drop(columns='Dead').unstack(level=-1)

You might also like