You are on page 1 of 1

Import Necessary Libraries

In [2]: pip install imbalanced-learn

Collecting imbalanced-learn
Downloading imbalanced_learn-0.12.2-py3-none-any.whl (257 kB)
------------------------------------ 258.0/258.0 kB 931.3 kB/s eta 0:00:00
Collecting joblib>=1.1.1
Downloading joblib-1.4.0-py3-none-any.whl (301 kB)
------------------------------------ 301.2/301.2 kB 808.2 kB/s eta 0:00:00
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\murali\anaconda\lib\site-packages (from imbalanced-learn) (2.2.0)
Requirement already satisfied: numpy>=1.17.3 in c:\users\murali\anaconda\lib\site-packages (from imbalanced-learn) (1.23.5)
Requirement already satisfied: scipy>=1.5.0 in c:\users\murali\anaconda\lib\site-packages (from imbalanced-learn) (1.9.1)
Requirement already satisfied: scikit-learn>=1.0.2 in c:\users\murali\anaconda\lib\site-packages (from imbalanced-learn) (1.0.2)
Installing collected packages: joblib, imbalanced-learn
Attempting uninstall: joblib
Found existing installation: joblib 1.1.0
Uninstalling joblib-1.1.0:
Successfully uninstalled joblib-1.1.0
Successfully installed imbalanced-learn-0.12.2 joblib-1.4.0
Note: you may need to restart the kernel to use updated packages.

In [4]: pip install xgboost

Collecting xgboost
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
---------------------------------------- 99.8/99.8 MB 1.9 MB/s eta 0:00:00
Requirement already satisfied: scipy in c:\users\murali\anaconda\lib\site-packages (from xgboost) (1.9.1)
Requirement already satisfied: numpy in c:\users\murali\anaconda\lib\site-packages (from xgboost) (1.23.5)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3
Note: you may need to restart the kernel to use updated packages.

In [6]: pip install eli5

Collecting eli5
Downloading eli5-0.13.0.tar.gz (216 kB)
-------------------------------------- 216.2/216.2 kB 1.9 MB/s eta 0:00:00
Preparing metadata (setup.py): started
Preparing metadata (setup.py): finished with status 'done'
Requirement already satisfied: attrs>17.1.0 in c:\users\murali\anaconda\lib\site-packages (from eli5) (21.4.0)
Collecting jinja2>=3.0.0
Downloading Jinja2-3.1.3-py3-none-any.whl (133 kB)
-------------------------------------- 133.2/133.2 kB 2.6 MB/s eta 0:00:00
Requirement already satisfied: numpy>=1.9.0 in c:\users\murali\anaconda\lib\site-packages (from eli5) (1.23.5)
Requirement already satisfied: scipy in c:\users\murali\anaconda\lib\site-packages (from eli5) (1.9.1)
Requirement already satisfied: six in c:\users\murali\anaconda\lib\site-packages (from eli5) (1.16.0)
Requirement already satisfied: scikit-learn>=0.20 in c:\users\murali\anaconda\lib\site-packages (from eli5) (1.0.2)
Collecting graphviz
Downloading graphviz-0.20.3-py3-none-any.whl (47 kB)
---------------------------------------- 47.1/47.1 kB 1.2 MB/s eta 0:00:00
Requirement already satisfied: tabulate>=0.7.7 in c:\users\murali\anaconda\lib\site-packages (from eli5) (0.8.10)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\murali\anaconda\lib\site-packages (from jinja2>=3.0.0->eli5) (2.0.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\murali\anaconda\lib\site-packages (from scikit-learn>=0.20->eli5) (2.2.0)
Requirement already satisfied: joblib>=0.11 in c:\users\murali\anaconda\lib\site-packages (from scikit-learn>=0.20->eli5) (1.4.0)
Building wheels for collected packages: eli5
Building wheel for eli5 (setup.py): started
Building wheel for eli5 (setup.py): finished with status 'done'
Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107728 sha256=a9bbbe7805554ab8e133709ec599fbcb2fee892906f5a41ad5b7d4a8616d2739
Stored in directory: c:\users\murali\appdata\local\pip\cache\wheels\7b\26\a5\8460416695a992a2966b41caa5338e5e7fcea98c9d032d055c
Successfully built eli5
Installing collected packages: jinja2, graphviz, eli5
Attempting uninstall: jinja2
Found existing installation: Jinja2 2.11.3
Uninstalling Jinja2-2.11.3:
Successfully uninstalled Jinja2-2.11.3
Successfully installed eli5-0.13.0 graphviz-0.20.3 jinja2-3.1.3
Note: you may need to restart the kernel to use updated packages.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-project 0.11.1 requires ruamel-yaml, which is not installed.

In [37]: import pandas as pd


import numpy as np

import matplotlib.pyplot as plt


import seaborn as sns

from sklearn.feature_selection import SelectKBest, chi2


from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier

from imblearn.under_sampling import RandomUnderSampler

from xgboost import XGBClassifier

import eli5

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides warnings that may matter; consider a narrower filter.
warnings.filterwarnings("ignore")

# Global plot styling. The matplotlib style sheet is applied after sns.set(),
# so for any option both of them set, the later call takes effect.
sns.set(font_scale=1.3)
plt.style.use('fivethirtyeight')

In [38]: df = pd.read_csv(r"C:\Users\MURALI\Documents\covid19\Covid Data.csv")

In [39]: df.head()  # first five rows, to check that the file parsed into the expected columns

Out[39]: USMER MEDICAL_UNIT SEX PATIENT_TYPE DATE_DIED INTUBED PNEUMONIA AGE PREGNANT DIABETES ... ASTHMA INMSUPR HIPERTENSION OTHER_DISEASE CARDIOVASCULAR OBESITY RENAL_CHRONIC T

0 2 1 1 1 03/05/2020 97 1 65 2 2 ... 2 2 1 2 2 2 2

1 2 1 2 1 03/06/2020 97 1 72 97 2 ... 2 2 1 2 2 1 1

2 2 1 2 2 09/06/2020 1 2 55 97 1 ... 2 2 2 2 2 2 2

3 2 1 1 1 12/06/2020 97 2 53 2 2 ... 2 2 2 2 2 2 2

4 2 1 2 1 21/06/2020 97 2 68 97 1 ... 2 2 1 2 2 2 2

5 rows × 21 columns

Data Analysis
In [40]: df.info()  # per-column dtype and non-null count, plus the frame's memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 USMER 1048575 non-null int64
1 MEDICAL_UNIT 1048575 non-null int64
2 SEX 1048575 non-null int64
3 PATIENT_TYPE 1048575 non-null int64
4 DATE_DIED 1048575 non-null object
5 INTUBED 1048575 non-null int64
6 PNEUMONIA 1048575 non-null int64
7 AGE 1048575 non-null int64
8 PREGNANT 1048575 non-null int64
9 DIABETES 1048575 non-null int64
10 COPD 1048575 non-null int64
11 ASTHMA 1048575 non-null int64
12 INMSUPR 1048575 non-null int64
13 HIPERTENSION 1048575 non-null int64
14 OTHER_DISEASE 1048575 non-null int64
15 CARDIOVASCULAR 1048575 non-null int64
16 OBESITY 1048575 non-null int64
17 RENAL_CHRONIC 1048575 non-null int64
18 TOBACCO 1048575 non-null int64
19 CLASIFFICATION_FINAL 1048575 non-null int64
20 ICU 1048575 non-null int64
dtypes: int64(20), object(1)
memory usage: 168.0+ MB

In [41]: df.nunique()  # number of distinct values in each column

USMER 2
Out[41]:
MEDICAL_UNIT 13
SEX 2
PATIENT_TYPE 2
DATE_DIED 401
INTUBED 4
PNEUMONIA 3
AGE 121
PREGNANT 4
DIABETES 3
COPD 3
ASTHMA 3
INMSUPR 3
HIPERTENSION 3
OTHER_DISEASE 3
CARDIOVASCULAR 3
OBESITY 3
RENAL_CHRONIC 3
TOBACCO 3
CLASIFFICATION_FINAL 7
ICU 4
dtype: int64

In [42]: # Filter to separate columns with 2 and 4 values where it should have 2
for c in df.columns:
if len(df[c].unique()) == 3 or len(df[c].unique()) == 4:
print(f"{c}: {len(df[c].unique())}")

INTUBED: 4
PNEUMONIA: 3
PREGNANT: 4
DIABETES: 3
COPD: 3
ASTHMA: 3
INMSUPR: 3
HIPERTENSION: 3
OTHER_DISEASE: 3
CARDIOVASCULAR: 3
OBESITY: 3
RENAL_CHRONIC: 3
TOBACCO: 3
ICU: 4

In [43]: for c in df.columns:


if len(df[c].unique()) == 3 or len(df[c].unique()) == 4:
print(df[c].value_counts(normalize=True))
print('--'*50)

97 0.809235
2 0.151682
1 0.032097
99 0.006986
Name: INTUBED, dtype: float64
----------------------------------------------------------------------------------------------------
2 0.851188
1 0.133551
99 0.015262
Name: PNEUMONIA, dtype: float64
----------------------------------------------------------------------------------------------------
97 0.499259
2 0.489406
1 0.007754
98 0.003580
Name: PREGNANT, dtype: float64
----------------------------------------------------------------------------------------------------
2 0.877618
1 0.119199
98 0.003183
Name: DIABETES, dtype: float64
----------------------------------------------------------------------------------------------------
2 0.982772
1 0.014364
98 0.002864
Name: COPD, dtype: float64
----------------------------------------------------------------------------------------------------
2 0.967050
1 0.030109
98 0.002841
Name: ASTHMA, dtype: float64
----------------------------------------------------------------------------------------------------
2 0.983240
1 0.013514
98 0.003246
Name: INMSUPR, dtype: float64
----------------------------------------------------------------------------------------------------
2 0.841849
1 0.155191
98 0.002960
Name: HIPERTENSION, dtype: float64
----------------------------------------------------------------------------------------------------
2 0.968448
1 0.026741
98 0.004811
Name: OTHER_DISEASE, dtype: float64
----------------------------------------------------------------------------------------------------
2 0.977260
1 0.019807
98 0.002934
Name: CARDIOVASCULAR, dtype: float64
----------------------------------------------------------------------------------------------------
2 0.844696
1 0.152413
98 0.002892
Name: OBESITY, dtype: float64
----------------------------------------------------------------------------------------------------
2 0.979105
1 0.018028
98 0.002867
Name: RENAL_CHRONIC, dtype: float64
----------------------------------------------------------------------------------------------------
2 0.916462
1 0.080467
98 0.003071
Name: TOBACCO, dtype: float64
----------------------------------------------------------------------------------------------------
97 0.809235
2 0.167546
1 0.016077
99 0.007141
Name: ICU, dtype: float64
----------------------------------------------------------------------------------------------------

Clean Dataset
In [44]: df['INTUBED'] = df['INTUBED'].replace(97,0)
df['INTUBED'] = df['INTUBED'].replace([2,99],1)
df['PNEUMONIA'] = df['PNEUMONIA'].replace(2,0)
df['PNEUMONIA'] = df['PNEUMONIA'].replace(99,1)
df['PREGNANT'] = df['PREGNANT'].replace(97,0)
df['PREGNANT'] = df['PREGNANT'].replace([2,98],1)
df['DIABETES'] = df['DIABETES'].replace(2,0)
df['DIABETES'] = df['DIABETES'].replace(98,1)
df['COPD'] = df['COPD'].replace(2,0)
df['COPD'] = df['COPD'].replace(98,1)
df['ASTHMA'] = df['ASTHMA'].replace(2,0)
df['ASTHMA'] = df['ASTHMA'].replace(98,1)
df['INMSUPR'] = df['INMSUPR'].replace(2,0)
df['INMSUPR'] = df['INMSUPR'].replace(98,1)
df['HIPERTENSION'] = df['HIPERTENSION'].replace(2,0)
df['HIPERTENSION'] = df['HIPERTENSION'].replace(98,1)
df['OTHER_DISEASE'] = df['OTHER_DISEASE'].replace(2,0)
df['OTHER_DISEASE'] = df['OTHER_DISEASE'].replace(98,1)
df['CARDIOVASCULAR'] = df['CARDIOVASCULAR'].replace(2,0)
df['CARDIOVASCULAR'] = df['CARDIOVASCULAR'].replace(98,1)
df['OBESITY'] = df['OBESITY'].replace(2,0)
df['OBESITY'] = df['OBESITY'].replace(98,1)
df['TOBACCO'] = df['TOBACCO'].replace(2,0)
df['TOBACCO'] = df['TOBACCO'].replace(98,1)
df['ICU'] = df['ICU'].replace(97,0)
df['ICU'] = df['ICU'].replace([2,99],1)
df['RENAL_CHRONIC'] = df['RENAL_CHRONIC'].replace(2,0)
df['RENAL_CHRONIC'] = df['RENAL_CHRONIC'].replace(98,1)

Plot Analysis
In [45]: fig = plt.figure(figsize=(15,25))
fig.subplots_adjust(hspace=.5, wspace=.5)
ax = fig.add_subplot(5,3,1)
sns.countplot(df['INTUBED'])
abs_values = df['INTUBED'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,2)
sns.countplot(df['PNEUMONIA'])
abs_values = df['PNEUMONIA'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,3)
sns.countplot(df['PREGNANT'])
abs_values = df['PREGNANT'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,4)
sns.countplot(df['DIABETES'])
abs_values = df['DIABETES'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,5)
sns.countplot(df['COPD'])
abs_values = df['COPD'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,6)
sns.countplot(df['ASTHMA'])
abs_values = df['ASTHMA'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,7)
sns.countplot(df['INMSUPR'])
abs_values = df['INMSUPR'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,8)
sns.countplot(df['HIPERTENSION'])
abs_values = df['HIPERTENSION'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,8)
sns.countplot(df['OTHER_DISEASE'])
abs_values = df['OTHER_DISEASE'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,9)
sns.countplot(df['CARDIOVASCULAR'])
abs_values = df['CARDIOVASCULAR'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,10)
sns.countplot(df['OBESITY'])
abs_values = df['OBESITY'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,11)
sns.countplot(df['RENAL_CHRONIC'])
abs_values = df['RENAL_CHRONIC'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,12)
sns.countplot(df['TOBACCO'])
abs_values = df['TOBACCO'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)

ax = fig.add_subplot(5,3,13)
sns.countplot(df['ICU'])
abs_values = df['ICU'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)
plt.show()

Survivors
In [46]: df.DATE_DIED.value_counts()  # how often each DATE_DIED value occurs, most frequent first

9999-99-99 971633
Out[46]:
06/07/2020 1000
07/07/2020 996
13/07/2020 990
16/06/2020 979
...
24/11/2020 1
17/12/2020 1
08/12/2020 1
16/03/2021 1
22/04/2021 1
Name: DATE_DIED, Length: 401, dtype: int64

In [47]: df["DEATH"] = [0 if each=="9999-99-99" else 1 for each in df.DATE_DIED]

In [48]: ax = sns.countplot(df['DEATH']);
abs_values = df['DEATH'].value_counts(ascending=False).values
ax.bar_label(container=ax.containers[0], labels=abs_values)
plt.show()

Median Age of Cases


In [49]: plt.figure(figsize=(13,8))
sns.histplot(x=df['AGE'])
plt.axvline(df['AGE'].median(), color='green')
plt.title("Age Distribution");

Correlation
In [50]: plt.figure(figsize=(18,15))
sns.heatmap(df.corr(),annot=True, fmt=".2f")
plt.title("Correlation Between Features",fontsize=18,color="red");

Model
In [51]: #GradientBoostingClassifier
#XGBClassifier

Train and Test


In [52]: model = []  # names of the evaluated models, appended to by the training cells below
result = []  # score (in percent) recorded for each entry of `model`, in the same order

In [53]: x = SelectKBest(chi2, k=10).fit_transform(df.drop(['DEATH','DATE_DIED'],axis=1), df['DEATH'])


y = df['DEATH']

In [54]: x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [56]: clf = GradientBoostingClassifier(n_estimators=10, learning_rate=1.0, max_depth=1).fit(x_train, y_train)


vl = cross_val_score(clf,x_test,y_test,cv=10,n_jobs=1)
print(round((vl.mean()*100),2))

model.append('GradientBoostingClassifier')
result.append(round((vl.mean()*100),2))

93.58

In [57]: sns.heatmap(confusion_matrix(y_test, clf.predict(x_test)), annot=True, fmt=".0f",linewidths=4);

In [58]: eli5.show_weights(clf)  # weight table for the gradient-boosting model fitted above

Weight Feature
Out[58]:
0.7973 ± 0.8000 x9
0.1563 ± 0.8000 x3
0.0311 ± 0.8000 x4
0.0086 ± 0.8000 x8
0.0045 ± 0.6000 x0
0.0022 ± 0.6000 x5
0 ± 0.0000 x7
0 ± 0.0000 x6
0 ± 0.0000 x2
0 ± 0.0000 x1

In [59]: xgb = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1)


xgb.fit(x_train,y_train)
vl = cross_val_score(xgb, x_test,y_test, cv=10, n_jobs=1)
print(round((vl.mean()*100),2))
model.append('XGBClassifier')
result.append(round((vl.mean()*100),2))

93.69

In [60]: sns.heatmap(confusion_matrix(y_test, xgb.predict(x_test)), annot=True, fmt=".0f",linewidths=4);

In [61]: eli5.show_weights(xgb)  # weight table for the XGBoost model fitted above

Weight Feature
Out[61]:
0.7810 f1
0.1489 f4
0.0376 f8
0.0287 f3
0.0038 f0
0 f9
0 f7
0 f6
0 f5
0 f2

Data Balancing
Under-sample the majority class by randomly picking samples with or without replacement.
In [62]: rus = RandomUnderSampler()
x_resampled,y_resampled = rus.fit_resample(x,y)

In [63]: x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)

In [64]: clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1).fit(x_train, y_train)


vl = cross_val_score(clf, x_test,y_test, cv=10, n_jobs=1)
print(round((vl.mean()*100),2))

model.append('GradientBoostingClassifier RandomUnderSampler')
result.append(round((vl.mean()*100),2))

90.78

In [65]: sns.heatmap(confusion_matrix(y_test, clf.predict(x_test)), annot=True, fmt=".0f",linewidths=4);

In [66]: eli5.show_weights(clf)  # weight table for the gradient-boosting model fitted above

Weight Feature
Out[66]:
0.8619 ± 0.2800 x9
0.0747 ± 0.8285 x4
0.0345 ± 0.3412 x3
0.0159 ± 0.7846 x8
0.0098 ± 1.0000 x0
0.0021 ± 0.1990 x7
0.0010 ± 0.1990 x5
0.0002 ± 0.1990 x2
0.0001 ± 0.1990 x6
0 ± 0.0000 x1

In [68]: xgb = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1)


xgb.fit(x_train,y_train)
vl = cross_val_score(xgb, x_test,y_test, cv=10, n_jobs=1)
print(round((vl.mean()*100),2))

model.append('XGBClassifier RandomUnderSampler')
result.append(round((vl.mean()*100),2))

90.43

In [69]: sns.heatmap(confusion_matrix(y_test, xgb.predict(x_test)), annot=True, fmt=".0f",linewidths=4);

In [70]: eli5.show_weights(xgb)  # weight table for the XGBoost model fitted above

Weight Feature
Out[70]:
0.7790 f1
0.1032 f3
0.0864 f4
0.0314 f8
0 f9
0 f7
0 f6
0 f5
0 f2
0 f0

Conclusion
In [71]: df_result = pd.DataFrame()
df_result['model'] = model
df_result['acc'] = result

df_result.sort_values(by=['acc'], ascending=False)

Out[71]: model acc

1 XGBClassifier 93.69

0 GradientBoostingClassifier 93.58

2 GradientBoostingClassifier RandomUnderSampler 90.78

3 XGBClassifier RandomUnderSampler 90.43

In [ ]:

You might also like