# Exploratory analysis of the Whickham smoking data set (notebook export).
#
# A bare expression at the end of a cell (e.g. ``smoking.smoker``) is the
# value the notebook displayed for that cell; these are kept as-is.

# Standard imports

# In[25]:

import numpy as np
import scipy.stats
import pandas as pd

# In[26]:

import matplotlib
import matplotlib.pyplot as pp

import pandas.plotting

from IPython import display
from ipywidgets import interact, widgets

# NOTE: get_ipython() only exists when running under IPython/Jupyter.
get_ipython().run_line_magic('matplotlib', 'inline')

# In[27]:

import re
import mailbox
import csv

# The table records interviews with women in Whickham, England, in 1973 who
# were asked if they were smokers. The interviews were followed up 20 years
# later, when it was recorded whether each woman was still alive.

# In[28]:

smoking = pd.read_csv('whickham.csv')

# In[29]:

smoking.info()

# The categorical variables in this case, smoker and outcome, are both
# binary: each takes one of two values.

# In[30]:

# Every 50th record, as a quick sample of the table.
smoking[0:1314:50]

# In[47]:

smoking.age.max()

# In[31]:

smoking.smoker

# In[32]:

smoking.smoker.value_counts()

# In[48]:

pd.DataFrame(smoking.smoker.value_counts(normalize=True))

# In[49]:

pd.DataFrame(smoking.smoker.value_counts())

# In[34]:

pd.DataFrame(smoking.outcome.value_counts())

# To see the values as fractions of the total number of records, add
# normalize=True. This is useful because we know that the fractions sum to
# one, so later we can drop the 'Dead' column without losing information.

# In[35]:

pd.DataFrame(smoking.outcome.value_counts(normalize=True))

# We break down the proportion of outcomes by smoker group. We can do this
# with groupby.

# In[36]:

smoking.groupby("smoker").outcome.value_counts()

# In[50]:

smoking.groupby("smoker").outcome.value_counts(normalize=True)

# In[37]:

bysmoker = smoking.groupby("smoker").outcome.value_counts(normalize=True)
bysmoker

# The index has two levels, so we can move one of the index levels to the
# columns using unstack.

# In[38]:

bysmoker.index

# In[51]:

bysmoker.unstack().drop('Dead', axis=1)

# We divide cases into age groups using pandas' cut, which generates
# categorical levels based on a set of bins. We store the result in an
# entirely new column.

# In[53]:

smoking.head()

# In[54]:

# Bin edges for the age groups. pd.cut uses right-closed intervals by default,
# so these define (0, 30], (30, 40], (40, 53], (53, 64], (64, 85].
age_bins = [0, 30, 40, 53, 64, 85]

# Build the labels from the edges so they always match the bins:
# '0-30', '30-40', '40-53', '53-64', '64-85'.
age_labels = ['{}-{}'.format(lo, hi)
              for lo, hi in zip(age_bins[:-1], age_bins[1:])]

smoking['ageGroup'] = pd.cut(smoking.age, age_bins, labels=age_labels)

# In[55]:

smoking.head()

# In[56]:

smoking['ageGroup'].head()

# In[57]:

smoking['age'].head()

# Group by age group and then by smoker status, using value_counts to get
# proportions.
#
# We then make a nicer display by unstacking the series and dropping the
# 'Dead' column.

# In[58]:

byage = smoking.groupby(['ageGroup', 'smoker']).outcome.value_counts(normalize=True)
byage

# In[59]:

byage.index

# In[60]:

byage.unstack()

# In[62]:

# After dropping 'Dead', the second unstack moves smoker status into the
# columns, leaving one row per age group.
byage.unstack().drop('Dead', axis=1).unstack()