In [3]:  import numpy as np

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings

In [14]:  # Load the CSV into `data`, the DataFrame the later cells analyse.

data = pd.read_csv('CardioGoodFitness-1.csv')

In [15]:  # Display the first 5 rows of the data


Out[15]: Product Age Gender Education MaritalStatus Usage Fitness Income Miles

0 TM195 18 Male 14 Single 3 4 29562 112

1 TM195 19 Male 15 Single 2 3 31836 75

2 TM195 19 Female 14 Partnered 4 3 30699 66

3 TM195 19 Male 12 Single 3 3 32973 85

4 TM195 20 Male 13 Partnered 4 2 35247 47

In [16]:  # analytical summary of the dataset


Out[16]: Product Age Gender Education MaritalStatus Usage Fitness

count 180 180.000000 180 180.000000 180 180.000000 180.000000 1

unique 3 NaN 2 NaN 2 NaN NaN

top TM195 NaN Male NaN Partnered NaN NaN

freq 80 NaN 104 NaN 107 NaN NaN

mean NaN 28.788889 NaN 15.572222 NaN 3.455556 3.311111 537

std NaN 6.943498 NaN 1.617055 NaN 1.084797 0.958869 165

min NaN 18.000000 NaN 12.000000 NaN 2.000000 1.000000 295

25% NaN 24.000000 NaN 14.000000 NaN 3.000000 3.000000 440

50% NaN 26.000000 NaN 16.000000 NaN 3.000000 3.000000 505

75% NaN 33.000000 NaN 16.000000 NaN 4.000000 4.000000 586

max NaN 50.000000 NaN 21.000000 NaN 7.000000 5.000000 1045

In [18]:  # Show the number of records (rows) and variables (columns)

Out[18]: (180, 9)

In [20]: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Product 180 non-null object
1 Age 180 non-null int64
2 Gender 180 non-null object
3 Education 180 non-null int64
4 MaritalStatus 180 non-null object
5 Usage 180 non-null int64
6 Fitness 180 non-null int64
7 Income 180 non-null int64
8 Miles 180 non-null int64
dtypes: int64(6), object(3)
memory usage: 12.8+ KB

In [22]:  # Percentage of missing values in columns

data_missing_columns = (round(((data.isnull().sum()/len(data.index))*100),2).

Out[22]: null

Product 0.0

Age 0.0

Gender 0.0

Education 0.0

MaritalStatus 0.0

Usage 0.0

Fitness 0.0

Income 0.0

Miles 0.0
In [23]:  #summary of distribution for relevant variables

Out[23]: array([[<AxesSubplot:title={'center':'Age'}>,
<AxesSubplot:title={'center':'Miles'}>]], dtype=object)
In [24]:  # Relationship between categorical and continuous variable
# Age distribution of gym goers with respect to gender
sns.boxplot(x="Gender", y="Age", data = data)

Out[24]: <AxesSubplot:xlabel='Gender', ylabel='Age'>

In [25]:  # Relationship between categorical and continuous variable

# Product distribution of gym goers with respect to gender
sns.boxplot(x="Product", y="Age", data = data)

Out[25]: <AxesSubplot:xlabel='Product', ylabel='Age'>

In [26]:  #Relationship between two categorical variables
pd.crosstab(data['Product'], data['Gender'])

Out[26]: Gender Female Male


TM195 40 40

TM498 29 31

TM798 7 33

In [27]:  #Relationship between two categorical variables

sns.countplot(x="Product", hue = "Gender", data = data)

Out[27]: <AxesSubplot:xlabel='Product', ylabel='count'>

In [29]:  #pivot table

pd.pivot_table(data, index=['Product', 'Gender'], columns = ['MaritalStatus']

Out[29]: Age Education Fitness Inco

MaritalStatus Partnered Single Partnered Single Partnered Single Partnered Sin

Product Gender

Female 27 13 27 13 27 13 27
Male 21 19 21 19 21 19 21

Female 15 14 15 14 15 14 15
Male 21 10 21 10 21 10 21

Female 4 3 4 3 4 3 4
Male 19 14 19 14 19 14 19
In [30]:  # Count the number of records for each Gender value.

Gender = data['Gender'].value_counts()

Out[30]: Male 104

Female 76
Name: Gender, dtype: int64

In [34]:  sns.pairplot(data)

Out[34]: <seaborn.axisgrid.PairGrid at 0x2219ae0feb0>

In [35]:  sns.distplot(data['Age'])

Out[35]: <AxesSubplot:xlabel='Age', ylabel='Density'>

In [36]:  sns.distplot(data['Fitness'])

Out[36]: <AxesSubplot:xlabel='Fitness', ylabel='Density'>

In [37]:  data.hist(by='Gender', column = 'Income')

Out[37]: array([<AxesSubplot:title={'center':'Female'}>,
<AxesSubplot:title={'center':'Male'}>], dtype=object)

In [38]:  corr= data.corr()

sns.heatmap(corr, annot=True)

Out[38]: <AxesSubplot:>
In [43]:  # Identify missing values of dataframe

Out[43]: Product Age Gender Education MaritalStatus Usage Fitness Income Miles

0 False False False False False False False False False

1 False False False False False False False False False

2 False False False False False False False False False

3 False False False False False False False False False

4 False False False False False False False False False

... ... ... ... ... ... ... ... ... ...

175 False False False False False False False False False

176 False False False False False False False False False

177 False False False False False False False False False

178 False False False False False False False False False

179 False False False False False False False False False

180 rows × 9 columns

In [45]:  import os

