You are on page 1of 1

In [38]: import numpy as np

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statistics as st

In [26]: df=pd.read_csv("processed.cleveland.data")
df

Out[26]: 63.0 1.0 1.0.1 145.0 233.0 1.0.2 2.0 150.0 0.0 2.3 3.0 0.0.1 6.0 0

0 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 2

1 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 1

2 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0

3 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0

4 56.0 1.0 2.0 120.0 236.0 0.0 0.0 178.0 0.0 0.8 1.0 0.0 3.0 0

... ... ... ... ... ... ... ... ... ... ... ... ... ... ...

297 45.0 1.0 1.0 110.0 264.0 0.0 0.0 132.0 0.0 1.2 2.0 0.0 7.0 1

298 68.0 1.0 4.0 144.0 193.0 1.0 0.0 141.0 0.0 3.4 2.0 2.0 7.0 2

299 57.0 1.0 4.0 130.0 131.0 0.0 0.0 115.0 1.0 1.2 2.0 1.0 7.0 3

300 57.0 0.0 2.0 130.0 236.0 0.0 2.0 174.0 0.0 0.0 2.0 1.0 3.0 1

301 38.0 1.0 3.0 138.0 175.0 0.0 0.0 173.0 0.0 0.0 1.0 ? 3.0 0

302 rows × 14 columns

In [27]: df = df.rename({"63.0":"age","1.0":"gender","1.0.1":"chest_pain","145.0":"bp","233.0":"chol","1.0.2":"fbs","2.0":"restecg","150.0":"max_heart_rate","0.0":"exercise_angina","2.3":"oldpeak","
df

Out[27]: age gender chest_pain bp chol fbs restecg max_heart_rate exercise_angina oldpeak slope vessels_colored thal healthy

0 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 2

1 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 1

2 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0

3 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0

4 56.0 1.0 2.0 120.0 236.0 0.0 0.0 178.0 0.0 0.8 1.0 0.0 3.0 0

... ... ... ... ... ... ... ... ... ... ... ... ... ... ...

297 45.0 1.0 1.0 110.0 264.0 0.0 0.0 132.0 0.0 1.2 2.0 0.0 7.0 1

298 68.0 1.0 4.0 144.0 193.0 1.0 0.0 141.0 0.0 3.4 2.0 2.0 7.0 2

299 57.0 1.0 4.0 130.0 131.0 0.0 0.0 115.0 1.0 1.2 2.0 1.0 7.0 3

300 57.0 0.0 2.0 130.0 236.0 0.0 2.0 174.0 0.0 0.0 2.0 1.0 3.0 1

301 38.0 1.0 3.0 138.0 175.0 0.0 0.0 173.0 0.0 0.0 1.0 ? 3.0 0

302 rows × 14 columns

In [28]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 302 non-null float64
1 gender 302 non-null float64
2 chest_pain 302 non-null float64
3 bp 302 non-null float64
4 chol 302 non-null float64
5 fbs 302 non-null float64
6 restecg 302 non-null float64
7 max_heart_rate 302 non-null float64
8 exercise_angina 302 non-null float64
9 oldpeak 302 non-null float64
10 slope 302 non-null float64
11 vessels_colored 302 non-null object
12 thal 302 non-null object
13 healthy 302 non-null int64
dtypes: float64(11), int64(1), object(2)
memory usage: 33.2+ KB

In [29]: df.describe()

Out[29]: age gender chest_pain bp chol fbs restecg max_heart_rate exercise_angina oldpeak slope healthy

count 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000

mean 54.410596 0.678808 3.165563 131.645695 246.738411 0.145695 0.986755 149.605960 0.327815 1.035430 1.596026 0.940397

std 9.040163 0.467709 0.953612 17.612202 51.856829 0.353386 0.994916 22.912959 0.470196 1.160723 0.611939 1.229384

min 29.000000 0.000000 1.000000 94.000000 126.000000 0.000000 0.000000 71.000000 0.000000 0.000000 1.000000 0.000000

25% 48.000000 0.000000 3.000000 120.000000 211.000000 0.000000 0.000000 133.250000 0.000000 0.000000 1.000000 0.000000

50% 55.500000 1.000000 3.000000 130.000000 241.500000 0.000000 0.500000 153.000000 0.000000 0.800000 2.000000 0.000000

75% 61.000000 1.000000 4.000000 140.000000 275.000000 0.000000 2.000000 166.000000 1.000000 1.600000 2.000000 2.000000

max 77.000000 1.000000 4.000000 200.000000 564.000000 1.000000 2.000000 202.000000 1.000000 6.200000 3.000000 4.000000

In [30]: df['age'].var()

81.72453851400411
Out[30]:

In [31]: df.var()

C:\Users\admin\AppData\Local\Temp\ipykernel_9656\1568254755.py:1: FutureWarning: The default value of numeric_only in DataFrame.var is deprecated. In a future version, it will default to Fa
lse. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
df.var()
age 81.724539
Out[31]:
gender 0.218752
chest_pain 0.909375
bp 310.189666
chol 2689.130679
fbs 0.124882
restecg 0.989857
max_heart_rate 525.003685
exercise_angina 0.221084
oldpeak 1.347279
slope 0.374469
healthy 1.511386
dtype: float64

In [40]: # Let's check the correlation coefficients to see which variables are highly correlated

# 'oldpeak' , 'chest_pain', 'excercise_angina' coulmns are having high correlation with our final column 'healthy'.
plt.figure(figsize = (16, 10))
sns.heatmap(df.corr(), annot = True, cmap="YlGnBu")
plt.show()

C:\Users\admin\AppData\Local\Temp\ipykernel_9656\1276077287.py:4: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to F
alse. Select only valid columns or specify the value of numeric_only to silence this warning.
sns.heatmap(df.corr(), annot = True, cmap="YlGnBu")

In [39]: # Here, 'age' column is symmetric in nature. And, 'bp' , 'old_peak' , 'chol' columns are slightly left skewed.
# 'max_heart_rate' looks right skewed.
# Other are categorical variables like 'healthy','slope','excercise_angina'.

df.hist()
plt.show

<function matplotlib.pyplot.show(close=None, block=None)>


Out[39]:

In [47]: # 0 means healthy and 1,2,3,4 are un-healthy

df["healthy"].value_counts().plot(kind="bar")

<Axes: >
Out[47]:

In [48]: # 1 for male and 0 for female

df["gender"].value_counts().plot(kind="bar")

<Axes: >
Out[48]:

In [50]: # here, we got outliers in the 'bp'


sns.boxplot(df['bp'])

<Axes: >
Out[50]:

In [52]: # Here, I got outlier in it.


sns.boxplot(df['max_heart_rate'])

<Axes: >
Out[52]:

In [161… sns.pairplot(df)
plt.show

<function matplotlib.pyplot.show(close=None, block=None)>


Out[161]:

In [ ]:

In [ ]:

You might also like