Professional Documents
Culture Documents
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statistics as st
In [26]: df=pd.read_csv("processed.cleveland.data")
df
Out[26]: 63.0 1.0 1.0.1 145.0 233.0 1.0.2 2.0 150.0 0.0 2.3 3.0 0.0.1 6.0 0
0 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 2
1 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 1
2 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0
3 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0
4 56.0 1.0 2.0 120.0 236.0 0.0 0.0 178.0 0.0 0.8 1.0 0.0 3.0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
297 45.0 1.0 1.0 110.0 264.0 0.0 0.0 132.0 0.0 1.2 2.0 0.0 7.0 1
298 68.0 1.0 4.0 144.0 193.0 1.0 0.0 141.0 0.0 3.4 2.0 2.0 7.0 2
299 57.0 1.0 4.0 130.0 131.0 0.0 0.0 115.0 1.0 1.2 2.0 1.0 7.0 3
300 57.0 0.0 2.0 130.0 236.0 0.0 2.0 174.0 0.0 0.0 2.0 1.0 3.0 1
301 38.0 1.0 3.0 138.0 175.0 0.0 0.0 173.0 0.0 0.0 1.0 ? 3.0 0
In [27]: df = df.rename({"63.0":"age","1.0":"gender","1.0.1":"chest_pain","145.0":"bp","233.0":"chol","1.0.2":"fbs","2.0":"restecg","150.0":"max_heart_rate","0.0":"exercise_angina","2.3":"oldpeak","
df
Out[27]: age gender chest_pain bp chol fbs restecg max_heart_rate exercise_angina oldpeak slope vessels_colored thal healthy
0 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 2
1 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 1
2 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0
3 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0
4 56.0 1.0 2.0 120.0 236.0 0.0 0.0 178.0 0.0 0.8 1.0 0.0 3.0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
297 45.0 1.0 1.0 110.0 264.0 0.0 0.0 132.0 0.0 1.2 2.0 0.0 7.0 1
298 68.0 1.0 4.0 144.0 193.0 1.0 0.0 141.0 0.0 3.4 2.0 2.0 7.0 2
299 57.0 1.0 4.0 130.0 131.0 0.0 0.0 115.0 1.0 1.2 2.0 1.0 7.0 3
300 57.0 0.0 2.0 130.0 236.0 0.0 2.0 174.0 0.0 0.0 2.0 1.0 3.0 1
301 38.0 1.0 3.0 138.0 175.0 0.0 0.0 173.0 0.0 0.0 1.0 ? 3.0 0
In [28]: df.info()  # per-column non-null counts and dtypes, plus memory usage
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 302 non-null float64
1 gender 302 non-null float64
2 chest_pain 302 non-null float64
3 bp 302 non-null float64
4 chol 302 non-null float64
5 fbs 302 non-null float64
6 restecg 302 non-null float64
7 max_heart_rate 302 non-null float64
8 exercise_angina 302 non-null float64
9 oldpeak 302 non-null float64
10 slope 302 non-null float64
11 vessels_colored 302 non-null object
12 thal 302 non-null object
13 healthy 302 non-null int64
dtypes: float64(11), int64(1), object(2)
memory usage: 33.2+ KB
In [29]: df.describe()  # count, mean, std, min, quartiles and max of the numeric columns
Out[29]: age gender chest_pain bp chol fbs restecg max_heart_rate exercise_angina oldpeak slope healthy
count 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000
mean 54.410596 0.678808 3.165563 131.645695 246.738411 0.145695 0.986755 149.605960 0.327815 1.035430 1.596026 0.940397
std 9.040163 0.467709 0.953612 17.612202 51.856829 0.353386 0.994916 22.912959 0.470196 1.160723 0.611939 1.229384
min 29.000000 0.000000 1.000000 94.000000 126.000000 0.000000 0.000000 71.000000 0.000000 0.000000 1.000000 0.000000
25% 48.000000 0.000000 3.000000 120.000000 211.000000 0.000000 0.000000 133.250000 0.000000 0.000000 1.000000 0.000000
50% 55.500000 1.000000 3.000000 130.000000 241.500000 0.000000 0.500000 153.000000 0.000000 0.800000 2.000000 0.000000
75% 61.000000 1.000000 4.000000 140.000000 275.000000 0.000000 2.000000 166.000000 1.000000 1.600000 2.000000 2.000000
max 77.000000 1.000000 4.000000 200.000000 564.000000 1.000000 2.000000 202.000000 1.000000 6.200000 3.000000 4.000000
In [30]: df.loc[:, "age"].var()  # variance of the 'age' column
81.72453851400411
Out[30]:
In [31]: df.var()
C:\Users\admin\AppData\Local\Temp\ipykernel_9656\1568254755.py:1: FutureWarning: The default value of numeric_only in DataFrame.var is deprecated. In a future version, it will default to Fa
lse. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
df.var()
age 81.724539
Out[31]:
gender 0.218752
chest_pain 0.909375
bp 310.189666
chol 2689.130679
fbs 0.124882
restecg 0.989857
max_heart_rate 525.003685
exercise_angina 0.221084
oldpeak 1.347279
slope 0.374469
healthy 1.511386
dtype: float64
In [40]: # Let's check the correlation coefficients to see which variables are highly correlated
# 'oldpeak' , 'chest_pain', 'excercise_angina' coulmns are having high correlation with our final column 'healthy'.
plt.figure(figsize = (16, 10))
sns.heatmap(df.corr(), annot = True, cmap="YlGnBu")
plt.show()
C:\Users\admin\AppData\Local\Temp\ipykernel_9656\1276077287.py:4: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to F
alse. Select only valid columns or specify the value of numeric_only to silence this warning.
sns.heatmap(df.corr(), annot = True, cmap="YlGnBu")
In [39]: # Here, 'age' column is symmetric in nature. And, 'bp' , 'old_peak' , 'chol' columns are slightly left skewed.
# 'max_heart_rate' looks right skewed.
# Other are categorical variables like 'healthy','slope','excercise_angina'.
df.hist()
plt.show
# Bar chart of how many rows fall into each 'healthy' value.
df["healthy"].value_counts().plot.bar()
<Axes: >
Out[47]:
# Bar chart of how many rows fall into each 'gender' value.
df["gender"].value_counts().plot.bar()
<Axes: >
Out[48]:
<Axes: >
Out[50]:
<Axes: >
Out[52]:
In [161… sns.pairplot(df)
plt.show
In [ ]:
In [ ]: