You are on page 1of 24

I .

IMPORTS
In [1]: import pandas as pd

import numpy as np

import math

import seaborn as sns

import matplotlib.pyplot as plt

from IPython.display import Image


import warnings

warnings.filterwarnings('ignore')

II . LOAD DATA
1- importer le fichier de données

In [2]: df = pd.read_csv("healthcare-dataset-stroke-data.csv", sep =',', header =0)

print(type(df))

<class 'pandas.core.frame.DataFrame'>

III . DATA DESCRIPTION


Afficher une description générale des données (premières et dernières lignes)

In [3]: df.head(-1)

Out[3]: id gender age hypertension heart_disease ever_married work_type Residence_type a

0 9046 Male 67.0 0 1 Yes Private Urban

Self-
1 51676 Female 61.0 0 0 Yes Rural
employed

2 31112 Male 80.0 0 1 Yes Private Rural

3 60182 Female 49.0 0 0 Yes Private Urban

Self-
4 1665 Female 79.0 1 0 Yes Rural
employed

... ... ... ... ... ... ... ... ...

5104 14180 Female 13.0 0 0 No children Rural

5105 18234 Female 80.0 1 0 Yes Private Urban

Self-
5106 44873 Female 81.0 0 0 Yes Urban
employed

Self-
5107 19723 Female 35.0 0 0 Yes Rural
employed

5108 37544 Male 51.0 0 0 Yes Private Rural

5109 rows × 12 columns


In [4]: df.describe()

Out[4]: id age hypertension heart_disease avg_glucose_level bmi

count 5110.000000 5110.000000 5110.000000 5110.000000 5110.000000 4909.000000 5110

mean 36517.829354 43.226614 0.097456 0.054012 106.147677 28.893237 0

std 21161.721625 22.612647 0.296607 0.226063 45.283560 7.854067 0

min 67.000000 0.080000 0.000000 0.000000 55.120000 10.300000 0

25% 17741.250000 25.000000 0.000000 0.000000 77.245000 23.500000 0

50% 36932.000000 45.000000 0.000000 0.000000 91.885000 28.100000 0

75% 54682.000000 61.000000 0.000000 0.000000 114.090000 33.100000 0

max 72940.000000 82.000000 1.000000 1.000000 271.740000 97.600000 1

In [5]: df.describe(include=['object'])

Out[5]: gender ever_married work_type Residence_type smoking_status

count 5110 5110 5110 5110 5110

unique 3 2 5 2 4

top Female Yes Private Urban never smoked

freq 2994 3353 2925 2596 1892

In [6]: df.groupby('age').size()

Out[6]: age

0.08 2

0.16 3

0.24 5

0.32 5

0.40 2

...

78.00 102

79.00 85

80.00 70

81.00 60

82.00 56

Length: 104, dtype: int64

In [7]: len(df['age']. unique())

Out[7]: 104

In [8]: #last n rows from df

df.tail(-4)

Out[8]: id gender age hypertension heart_disease ever_married work_type Residence_type a

Self-
4 1665 Female 79.0 1 0 Yes Rural
employed

5 56669 Male 81.0 0 0 Yes Private Urban

6 53882 Male 74.0 1 1 Yes Private Rural


id gender age hypertension heart_disease ever_married work_type Residence_type a

7 10434 Female 69.0 0 0 No Private Urban

8 27419 Female 59.0 0 0 Yes Private Rural

... ... ... ... ... ... ... ... ...

5105 18234 Female 80.0 1 0 Yes Private Urban

Self-
5106 44873 Female 81.0 0 0 Yes Urban
employed

Self-
5107 19723 Female 35.0 0 0 Yes Rural
employed

5108 37544 Male 51.0 0 0 Yes Private Rural

5109 44679 Female 44.0 0 0 Yes Govt_job Urban

5106 rows × 12 columns

1. Data Dimension
Afficher le shape des lignes et des colonnes du dataframe
nombrede ligne 5110*nombre
de colonnes: 12*

In [9]: print("Le nombre de lignes",df.shape[0])

print("Le nombre de colonnes",df.shape[1])

Le nombre de lignes 5110

Le nombre de colonnes 12

2. Data types
2.1 - afficher les informations du dataframe

In [10]: df.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 5110 entries, 0 to 5109

Data columns (total 12 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 id 5110 non-null int64

1 gender 5110 non-null object

2 age 5110 non-null float64

3 hypertension 5110 non-null int64

4 heart_disease 5110 non-null int64

5 ever_married 5110 non-null object

6 work_type 5110 non-null object

7 Residence_type 5110 non-null object

8 avg_glucose_level 5110 non-null float64

9 bmi 4909 non-null float64

10 smoking_status 5110 non-null object

11 stroke 5110 non-null int64

dtypes: float64(3), int64(4), object(5)

memory usage: 479.2+ KB

2.2 - afficher les types de données dans chaque colonne

In [11]: df.dtypes

Out[11]: id int64

gender object

age float64

hypertension int64

heart_disease int64

ever_married object

work_type object

Residence_type object

avg_glucose_level float64

bmi float64

smoking_status object

stroke int64

dtype: object
2.3 - parcourir les colonnes du dataframe (for) pour afficher leur type et le nombre de valeurs null (ou
manquantes)
In [12]: for var in df.columns:

print('Variable:',var, ' ',df[var].isnull().count())

Variable: id 5110

Variable: gender 5110

Variable: age 5110

Variable: hypertension 5110

Variable: heart_disease 5110

Variable: ever_married 5110

Variable: work_type 5110

Variable: Residence_type 5110

Variable: avg_glucose_level 5110

Variable: bmi 5110

Variable: smoking_status 5110

Variable: stroke 5110

2.4 - Supprimer la colonne Id qui n'est pas intéressante (df.drop(['col'], axis=1, inplace=True))

Vérifier.

In [13]: df.drop(['id'], axis=1, inplace=True)

2.5 - vérifier la suppression

In [14]: df.head()

Out[14]: gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose

0 Male 67.0 0 1 Yes Private Urban 2

Self-
1 Female 61.0 0 0 Yes Rural 2
employed

2 Male 80.0 0 1 Yes Private Rural 1

3 Female 49.0 0 0 Yes Private Urban 1

Self-
4 Female 79.0 1 0 Yes Rural 1
employed

2.5 - convertir le type de la colonne 'age' en int (df.astype())


In [15]: df['age'] = df['age'].astype('int64')

df.head()

Out[15]: gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_


gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_

0 Male 67 0 1 Yes Private Urban 2

Self-
1 Female 61 0 0 Yes Rural 2
employed

2 Male 80 0 1 Yes Private Rural 1

3 Female 49 0 0 Yes Private Urban 1

Self-
4 Female 79 1 0 Yes Rural 1
employed

In [16]: df

Out[16]: gender age hypertension heart_disease ever_married work_type Residence_type avg_gluc

0 Male 67 0 1 Yes Private Urban

Self-
1 Female 61 0 0 Yes Rural
employed

2 Male 80 0 1 Yes Private Rural

3 Female 49 0 0 Yes Private Urban

Self-
4 Female 79 1 0 Yes Rural
employed

... ... ... ... ... ... ... ...

5105 Female 80 1 0 Yes Private Urban

Self-
5106 Female 81 0 0 Yes Urban
employed

Self-
5107 Female 35 0 0 Yes Rural
employed

5108 Male 51 0 0 Yes Private Rural

5109 Female 44 0 0 Yes Govt_job Urban

5110 rows × 11 columns

3. Check NaN
3.1 - Vérifier l'existance de valeurs manquantes

In [17]: df.isnull().any()

Out[17]: gender False

age False

hypertension False

heart_disease False

ever_married False

work_type False

Residence_type False

avg_glucose_level False

bmi True

smoking_status False

stroke False

dtype: bool
3.2 - chercher le nombre de valeurs NaN dans chaque colonne et son nombre de valeurs

In [18]: df.isna().sum()

Out[18]: gender 0

age 0

hypertension 0

heart_disease 0

ever_married 0

work_type 0

Residence_type 0

avg_glucose_level 0

bmi 201

smoking_status 0

stroke 0

dtype: int64

In [19]: df.bmi.isna().value_counts()

Out[19]: False 4909

True 201

Name: bmi, dtype: int64


3.3 - Supprimer les nan dans la variable 'bmi'pour les remplacer par la moyenne des 'bmi'
NB: utiliser la méthode
fillna(...)
si nous souhaitons supprimer toutes les lignes contanant des NaN

In [20]: dd=df.copy()

In [21]: dd.dropna(subset=['bmi'],axis=0,inplace=False)

Out[21]: gender age hypertension heart_disease ever_married work_type Residence_type avg_gluc

0 Male 67 0 1 Yes Private Urban

2 Male 80 0 1 Yes Private Rural

3 Female 49 0 0 Yes Private Urban

Self-
4 Female 79 1 0 Yes Rural
employed

5 Male 81 0 0 Yes Private Urban

... ... ... ... ... ... ... ...

5104 Female 13 0 0 No children Rural

Self-
5106 Female 81 0 0 Yes Urban
employed

Self-
5107 Female 35 0 0 Yes Rural
employed

5108 Male 51 0 0 Yes Private Rural

5109 Female 44 0 0 Yes Govt_job Urban

4909 rows × 11 columns


Nous n'allons pas supprimer les NaN mais plutot les remplacer par la moyenne de la
colonne 'bmi'

In [22]: df.bmi.fillna(df['bmi'].mean(), inplace=True)

3.4 - Afficher le nombre lignes pour chaque classe de la colonne Stroke

In [23]: df.stroke.value_counts()

Out[23]: 0 4861

1 249

Name: stroke, dtype: int64


3.5 - afficher le nombre de femme et d'homme (gender) dans le dataframe

In [24]: df.gender.value_counts()

Out[24]: Female 2994

Male 2115

Other 1

Name: gender, dtype: int64


3.6 - afficher le nombre de personnes mariées

In [25]: df.ever_married.value_counts()

Out[25]: Yes 3353

No 1757

Name: ever_married, dtype: int64

4. Descriptive Statistical
4.1 - examiner le rapport entre le type de travail et le stroke (pandas.crosstab(...))

In [26]: pd.crosstab(df['work_type'],df['stroke'])

Out[26]: stroke 0 1

work_type

Govt_job 624 33

Never_worked 22 0

Private 2776 149

Self-employed 754 65

children 685 2

4.2 - visualiser ce rapport avec df.plot.scatter(...) puis avec sns.countplot(..)

In [27]: plt.figure(figsize=(15,5))

sns.countplot( x='work_type', hue=df['stroke'], palette='Set2', data=df);

5. Numerical Variable
5.1 - créer un dataframe 'df_num' intermediaire contenant les variables numérique seulement

In [28]: df_num = df[['age', 'avg_glucose_level', 'bmi']]

5.2- créer un dataframe 'df_cat' intermediaire contenant les variables catégorielles seulement
In [29]: df_cat = df[['gender', 'hypertension', 'heart_disease', 'ever_married',

'work_type', 'Residence_type', 'smoking_status']]

5.3 - Afficher en histogrammes les statistiques des variables numériques

In [30]: df.hist(bins=25, figsize=(20,10));

5.4 - Afficher en barres le nombre d'observations de la variable catégorique 'stroke'.

In [31]: plt.figure(figsize=(5,5))

sns.countplot( x='stroke', palette='Set2', data=df)

Out[31]: <AxesSubplot:xlabel='stroke', ylabel='count'>


IV. Visualisation and cleaning data
In [32]: sns.set(style='whitegrid')

ax=sns.boxplot(data=df,orient='v')

1 - Afficher quelque boîtes à moustaches (boxplot)

In [33]: sns.set(style='whitegrid')

ax=sns.boxplot(data=df['age'],orient='v')

IV.1 - traitement des valeurs aberrantes de la colonne


'bmi'
1. diagramme en moustache (boxplot)

In [34]: sns.set(style='whitegrid')

ax=sns.boxplot(df['bmi'],orient='h')

2 - calculer les 4 quartiles (Q1,Q3,IRQ) et Sup, Inf

In [35]: Q1=df['bmi'].quantile(0.25)

Q3=df['bmi'].quantile(0.75)

IQR=Q3-Q1

Inf = df['bmi'].quantile(0.25)-1.5*IQR

Sup = df['bmi'].quantile(0.75)+1.5*IQR

In [36]: print(Sup)

46.29999999999999

3- déduire le nombre de outliers

In [37]: df[df['bmi']>=Sup].count()

Out[37]: gender 125

age 125

hypertension 125

heart_disease 125

ever_married 125

work_type 125

Residence_type 125

avg_glucose_level 125

bmi 125

smoking_status 125

stroke 125

dtype: int64
4 - Ecrire une méthode drop_outliers() qui supprime les outliers du dataframe

In [38]: def drop_outliers(data,field_name):

q1=data[field_name].quantile(0.25)

q3=data[field_name].quantile(0.75)

iqr=q3-q1

inf = data[field_name].quantile(0.25)-1.5*iqr

sup = data[field_name].quantile(0.75)+1.5*iqr

data.drop(data[data[field_name]>sup].index,inplace=True)

data.drop(data[data[field_name]<inf].index,inplace=True)

In [39]: drop_outliers(df,'bmi')

5 - retracer le diagramme en moustache pour les 'bmi'

In [40]: sns.set(style='whitegrid')

ax=sns.boxplot(df['bmi'],orient='h')

IV.2 - Traitement des valeurs aberrantes dans


'avg_glucose_level'
1- tracer le diagramme en moustache pour les 'avg_glucose_level'

In [41]: sns.set(style='whitegrid')

ax=sns.boxplot(df['avg_glucose_level'],orient='h')

2 - remplacer les outliers par la mediane

In [42]: Q1g=df['avg_glucose_level'].quantile(0.25)

Q3g=df['avg_glucose_level'].quantile(0.75)

IQRg=Q3g-Q1g

Infg = df['avg_glucose_level'].quantile(0.25)-1.5*IQRg

Supg = df['avg_glucose_level'].quantile(0.75)+1.5*IQRg

for i in df['avg_glucose_level']:

if i >=Supg or i<=Infg :

df['avg_glucose_level']=df['avg_glucose_level'].replace(i,np.mean(df['av

3- retracer le diagramme en moustache pour les 'avg_glucose_level'

In [43]: sns.set(style='whitegrid')

ax=sns.boxplot(x=df['avg_glucose_level'],orient='h') #, showfliers=False)

Donc, la médiane n'est pas la bonne valeur de remplacement


In [44]: df.loc[df['avg_glucose_level']>=Supg].count()

Out[44]: gender 0

age 0

hypertension 0

heart_disease 0

ever_married 0

work_type 0

Residence_type 0

avg_glucose_level 0

bmi 0

smoking_status 0

stroke 0

dtype: int64

In [45]: print(Supg)

print(Infg)

168.59124999999997

22.36125000000002

V. Autres visualisations statistiques


1- Afficher en barres le nombre d'observations de la variable catégorique 'stroke' classée par
'Gender'.

Interpréter.

In [46]: plt.figure(figsize=(15,5))

sns.countplot(data=df, x='gender',hue=df['stroke'], palette='Set2');

2- Afficher en barres le nombre d'observations de la variable catégorique 'stroke' classée par


'work_type'.

Interpréter.

In [47]: plt.figure(figsize=(15,5))

sns.countplot(x='stroke',hue=df['work_type'],palette='Set2',data=df);

3- Afficher en barres le nombre de lignes par type de résidence

In [48]: plt.figure(figsize=(5,5))

sns.countplot( x='Residence_type', palette='Set2', data=df);

4- Afficher en barres le nombre d'observations de la variable catégorique 'stroke' classée par


'ever_married'.

Interpréter.

In [49]: plt.figure(figsize=(5,5))

sns.countplot( x='ever_married', hue=df['stroke'], palette='Set2', data=df)

Out[49]: <AxesSubplot:xlabel='ever_married', ylabel='count'>

La plupart des personnes ayant subi un AVC sont mariées.


5- Afficher en barres le nombre d'observations de la variable catégorique 'stroke' classée par
'smoking_status'.

Interpréter.

In [50]: plt.figure(figsize=(15,5))

sns.countplot( x='smoking_status', hue=df['stroke'], palette='Set2', data=df)

Out[50]: <AxesSubplot:xlabel='smoking_status', ylabel='count'>

Les personnes qui ne fumaient pas avaient une incidence plus élevée d'AVC que les autres qui fumaient ou
avaient déjà fumé. Cependant, il y a une grande partie qui est inconnue et qui peut - ou non - changer ce
scénario.
6- Afficher la correlation entre les variables du dataframe (coefficient de pearson)

In [51]: plt.figure(figsize=(10,10))

correlation = df.corr( method='pearson' )

sns.heatmap( correlation, annot=True );

VI. Les variables en entrée X et la cible Y


1 - Afficher un rappel de dimension et un echantillon du dataframe

In [52]: df.shape

Out[52]: (4984, 11)

In [53]: df.head()

Out[53]: gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_

0 Male 67 0 1 Yes Private Urban 105.73

Self-
1 Female 61 0 0 Yes Rural 105.70
employed

2 Male 80 0 1 Yes Private Rural 105.92

3 Female 49 0 0 Yes Private Urban 105.66


gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_

Self-
4 Female 79 1 0 Yes Rural 105.65
employed

2 - affecter à Y (cible) la dernière colonne (stroke) et le reste du dataframe au entrées X

In [54]: X = df.iloc[:,0:10]

Y= df.iloc[:,10]

3 - afficher un echontillion de X et Y

In [55]: X.head()

Out[55]: gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_

0 Male 67 0 1 Yes Private Urban 105.73

Self-
1 Female 61 0 0 Yes Rural 105.70
employed

2 Male 80 0 1 Yes Private Rural 105.92

3 Female 49 0 0 Yes Private Urban 105.66

Self-
4 Female 79 1 0 Yes Rural 105.65
employed

In [56]: Y.head()

Out[56]: 0 1

1 1

2 1

3 1

4 1

Name: stroke, dtype: int64

VI.1 Transformer X et Y en tableau numpy


1 - lancer les commandes qui transforme X et Y en type numpyarray

In [57]: X=X.values

Y=Y.values

2 - Vérifier le type et le contenu de X et Y

In [58]: type(X)

Out[58]: numpy.ndarray

In [59]: X[:5,:]

Out[59]: array([['Male', 67, 0, 1, 'Yes', 'Private', 'Urban', 105.73068820224687,

36.6, 'formerly smoked'],

['Female', 61, 0, 0, 'Yes', 'Self-employed', 'Rural',

105.70601739329871, 28.893236911794673, 'never smoked'],

['Male', 80, 0, 1, 'Yes', 'Private', 'Rural', 105.92, 32.5,

'never smoked'],

['Female', 49, 0, 0, 'Yes', 'Private', 'Urban',

105.66729187860902, 34.4, 'smokes'],

['Female', 79, 1, 0, 'Yes', 'Self-employed', 'Rural',

105.65413724214807, 24.0, 'never smoked']], dtype=object)

VII. transformation des variables


catégoriques
1 - Nous commençons par les colonnes catégoriques à deux états
'gender','ever_married','Residence_type'

In [60]: from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()

In [61]: X[:,0] = label.fit_transform(X[:,0]) # colonne gender devient 1-->male ou 0-->female


X[:,4] = label.fit_transform(X[:,4]) # colonne ever_married devient no-->0 ou yes-->
X[:,6] = label.fit_transform(X[:,6]) # colonne Residence_type devient rural-->0 ou u

X contient les colonnes dans l'ordre : 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
'work_type',
'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status'
2 - Vérifier cette transformation

In [62]: X[:10,:]

Out[62]: array([[1, 67, 0, 1, 1, 'Private', 1, 105.73068820224687, 36.6,

'formerly smoked'],

[0, 61, 0, 0, 1, 'Self-employed', 0, 105.70601739329871,

28.893236911794673, 'never smoked'],

[1, 80, 0, 1, 1, 'Private', 0, 105.92, 32.5, 'never smoked'],

[0, 49, 0, 0, 1, 'Private', 1, 105.66729187860902, 34.4, 'smokes'],

[0, 79, 1, 0, 1, 'Self-employed', 0, 105.65413724214807, 24.0,

'never smoked'],

[1, 81, 0, 0, 1, 'Private', 1, 105.6404001107761, 29.0,

'formerly smoked'],

[1, 74, 1, 1, 1, 'Private', 0, 70.09, 27.4, 'never smoked'],

[0, 69, 0, 0, 0, 'Private', 1, 94.39, 22.8, 'never smoked'],

[0, 59, 0, 0, 1, 'Private', 0, 76.15, 28.893236911794673,

'Unknown'],

[0, 78, 0, 0, 1, 'Private', 1, 58.57, 24.2, 'Unknown']],

dtype=object)
3 - nous transformons la colonne work_type qui a 5 etats : 'private','self-employed','gov-
job','children','never-worked'

In [63]: from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer

In [64]: ct = ColumnTransformer([('work_type',OneHotEncoder(),[5])],\

remainder = 'passthrough')

In [65]: X = ct.fit_transform(X)

4 - Vérifier la transformation de la colonne 'work_type' par 5 colonnes ajouter en début du X


(shap et affichage)

In [66]: X.shape # ajout de 4 autres colonnes

#private ---> 0.0 0.0 1.0 0.0 0.0

Out[66]: (4984, 14)

In [67]: X[:,0:5]

Out[67]: array([[0.0, 0.0, 1.0, 0.0, 0.0],

[0.0, 0.0, 0.0, 1.0, 0.0],

[0.0, 0.0, 1.0, 0.0, 0.0],

...,

[0.0, 0.0, 0.0, 1.0, 0.0],

[0.0, 0.0, 1.0, 0.0, 0.0],

[1.0, 0.0, 0.0, 0.0, 0.0]], dtype=object)


5 - Nous remarquons que la première colonne peut être supprimée (on la supprime alors)

In [68]: X = X[:,1:]

6 - Vérifier cette suppression (shape et affichage d'un echantillion)

In [69]: X.shape

Out[69]: (4984, 13)

In [70]: X[:5,:]

Out[70]: array([[0.0, 1.0, 0.0, 0.0, 1, 67, 0, 1, 1, 1, 105.73068820224687, 36.6,

'formerly smoked'],

[0.0, 0.0, 1.0, 0.0, 0, 61, 0, 0, 1, 0, 105.70601739329871,

28.893236911794673, 'never smoked'],

[0.0, 1.0, 0.0, 0.0, 1, 80, 0, 1, 1, 0, 105.92, 32.5,

'never smoked'],

[0.0, 1.0, 0.0, 0.0, 0, 49, 0, 0, 1, 1, 105.66729187860902, 34.4,

'smokes'],

[0.0, 0.0, 1.0, 0.0, 0, 79, 1, 0, 1, 0, 105.65413724214807, 24.0,

'never smoked']], dtype=object)


X contient les colonnes dans l'ordre : 'work_type1', 'work_type2', 'work_type3', 'work_type4', 'gender', 'age',
'hypertension', 'heart_disease', 'ever_married','Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status'
7 - nous transformons la colonne 'smoking_status' qui a 4 etats :'formerly smoked','never
smoked','smokes','unknown'

In [71]: ct = ColumnTransformer([('smoking_status',OneHotEncoder(),[12])],\

remainder = 'passthrough')

In [72]: X = ct.fit_transform(X)

8 - Vérifier la transformation de la colonne 'smoking_status' par 4 colonne ajouter en début du X


(shape et affichage)

In [73]: X[:5,:] # smoking_status est remplacé par 4 colonnes au début

Out[73]: array([[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1, 67, 0, 1, 1, 1,

105.73068820224687, 36.6],
[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0, 61, 0, 0, 1, 0,

105.70601739329871, 28.893236911794673],

[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1, 80, 0, 1, 1, 0,

105.92, 32.5],

[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0, 49, 0, 0, 1, 1,

105.66729187860902, 34.4],
[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0, 79, 1, 0, 1, 0,

105.65413724214807, 24.0]], dtype=object)


9 - nous enlevons la première colonne

In [74]: X = X[:,1:]

10 - Vérifier cette suppression (shape et affichage)

In [75]: X.shape

Out[75]: (4984, 15)

In [76]: X[:5,:]

Out[76]: array([[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1, 67, 0, 1, 1, 1,

105.73068820224687, 36.6],
[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0, 61, 0, 0, 1, 0,

105.70601739329871, 28.893236911794673],

[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1, 80, 0, 1, 1, 0, 105.92,

32.5],

[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0, 49, 0, 0, 1, 1,

105.66729187860902, 34.4],
[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0, 79, 1, 0, 1, 0,

105.65413724214807, 24.0]], dtype=object)


X contient les colonnes dans l'ordre : 'smoking_status1', 'smoking_status2', 'smoking_status3', 'work_type1',
'work_type2', 'work_type3', 'work_type4', 'gender', 'age', 'hypertension', 'heart_disease',
'ever_married','Residence_type', 'avg_glucose_level', 'bmi'

VIII. Splitting dataset into train and test sets


In [77]: from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,\

random_state=0)

# random_state D'un autre côté, si vous utilisez random_state=some_number,

#vous pouvez garantir que la sortie de Run 1 sera égale à la sortie de Run 2,

#c'est-à-dire que votre division sera toujours la même. Peu importe le nombre réel d
# L'important est qu'à chaque fois que vous utilisez 42, vous obtiendrez toujours la
#vous effectuez le fractionnement.

In [78]: X_train.shape

Out[78]: (3987, 15)

In [79]: X_test.shape

Out[79]: (997, 15)

In [80]: Y.mean()

Out[80]: 0.049357945425361156

In [81]: print(y_train.mean())

print(y_test.mean())

0.04890895410082769

0.05115346038114343

IX. Mise à l'echelle des variables


1- Nous allons faire une standardisation du X_train et X_test

In [82]: from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [83]: X_train_sc = scaler.fit_transform(X_train)

X_test_sc = scaler.transform(X_test)

2 - Afficher un echantillon du X_train_sc


In [84]: X_train_sc[0:5,:]

Out[84]: array([[-0.45987593, -0.76339435, 2.35061019, -0.06543788, -1.15453161,

-0.43402253, -0.39876262, 1.17425648, -0.48865318, -0.3217376 ,

-0.24340395, 0.73078831, -1.02463236, -0.4345941 , -0.37874265],

[ 2.17449955, -0.76339435, -0.42542145, -0.06543788, 0.86615212,

-0.43402253, -0.39876262, 1.17425648, 0.87161861, 3.10812288,

-0.24340395, 0.73078831, 0.97595981, 0.04686664, 0.52470066],

[-0.45987593, -0.76339435, -0.42542145, -0.06543788, -1.15453161,

-0.43402253, 2.50775762, 1.17425648, -1.45400735, -0.3217376 ,

-0.24340395, -1.36838532, 0.97595981, 0.49137185, -1.38758767],

[-0.45987593, 1.30993895, -0.42542145, -0.06543788, 0.86615212,

-0.43402253, -0.39876262, 1.17425648, 0.60834019, -0.3217376 ,

-0.24340395, 0.73078831, 0.97595981, 0.8676035 , -0.06253749],

[-0.45987593, 1.30993895, -0.42542145, -0.06543788, -1.15453161,

2.30402784, -0.39876262, -0.85160271, 0.82773887, -0.3217376 ,

-0.24340395, 0.73078831, -1.02463236, 1.68083913, -0.52931653]])

X. Modélisation
X.1 - k Nearest Neighbors
In [85]: from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 3)

knn.fit(X_train_sc,y_train)

y_pred_knn = knn.predict(X_test_sc)

X.2 - Evaluation du modèle KNN


In [86]: from sklearn.metrics import accuracy_score, confusion_matrix,recall_score,precision_
print('Confusion matrix knn \n', confusion_matrix(y_test,y_pred_knn))

print('Accuracy knn', accuracy_score(y_test,y_pred_knn))

Confusion matrix knn

[[937 9]

[ 49 2]]

Accuracy knn 0.9418254764292878

In [87]: from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred_knn))

precision recall f1-score support

0 0.95 0.99 0.97 946

1 0.18 0.04 0.06 51

accuracy 0.94 997

macro avg 0.57 0.51 0.52 997

weighted avg 0.91 0.94 0.92 997

In [88]: from sklearn import metrics

print('Recall knn : ', recall_score(y_test,y_pred_knn))

print('Precision knn : ', precision_score(y_test,y_pred_knn))

Recall knn : 0.0392156862745098

Precision knn : 0.18181818181818182

In [89]: for i in range(10):

print(y_test[i],y_pred_knn[i])

0 0

0 0

0 0

0 0

0 0

1 0

0 0

0 0

0 0

0 1

X.3 - GRID SEARCH


In [90]: from sklearn.model_selection import GridSearchCV

parameters = {'n_neighbors' : [1,3,5,7,9,11,13]}

model = KNeighborsClassifier()

clf = GridSearchCV(model,parameters, scoring='accuracy', cv=5)

grille = clf.fit(X_train_sc,y_train)

# meilleur paramètre et meilleur score qui est l'accuracy

print(grille.best_params_)

print(grille.best_score_)

{'n_neighbors': 11}

0.9510910274431372

In [91]: y_pred_knn_o = grille.predict(X_test_sc)

In [92]: knn_op_acc=accuracy_score(y_test,y_pred_knn_o)

knn_op_rec=recall_score(y_test,y_pred_knn_o)

knn_op_prec=precision_score(y_test,y_pred_knn_o)

print('Confusion matrix knn op \n', confusion_matrix(y_test,y_pred_knn_o))


print('Accuracy knn op', knn_op_acc)

print('Recall knn op', knn_op_rec)

print('Precision knn op',knn_op_prec)

Confusion matrix knn op

[[946 0]

[ 51 0]]

Accuracy knn op 0.9488465396188566


Recall knn op 0.0

Precision knn op 0.0

X.4 DECISION TREE


In [93]: from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

In [94]: dt.fit(X_train_sc,y_train)

Out[94]: DecisionTreeClassifier()

In [95]: y_pred_dt = dt.predict(X_test_sc)

In [96]: for i in range(10):

print(y_test[i],y_pred_dt[i])

0 0

0 0

0 0

0 0

0 0

1 0

0 0

0 0

0 1

0 0

In [97]: dt_acc = accuracy_score(y_test,y_pred_dt)

dt_rec = recall_score(y_test,y_pred_dt)

dt_prec = precision_score(y_test,y_pred_dt)

print('Confusion matrix dt \n', confusion_matrix(y_test,y_pred_dt))

print('Accuracy dt', dt_acc)

print('Recall dt', dt_rec)

print('Precision dt', dt_prec)

Confusion matrix dt

[[903 43]

[ 31 20]]

Accuracy dt 0.925777331995988

Recall dt 0.39215686274509803

Precision dt 0.31746031746031744

In [98]: print(classification_report(y_test,y_pred_dt))

precision recall f1-score support

0 0.97 0.95 0.96 946

1 0.32 0.39 0.35 51

accuracy 0.93 997

macro avg 0.64 0.67 0.66 997

weighted avg 0.93 0.93 0.93 997

X contient les colonnes dans l'ordre : 'smoking_status1', 'smoking_status2', 'smoking_status3', 'work_type1',


'work_type2', 'work_type3', 'work_type4', 'gender', 'age', 'hypertension', 'heart_disease',
'ever_married','Residence_type', 'avg_glucose_level', 'bmi'
In [99]: features = ['smoking_status1', 'smoking_status2', 'smoking_status3', 'work_type1', \
'work_type2', 'work_type3', 'work_type4', 'gender', 'age', 'hypertension
'heart_disease', 'ever_married','Residence_type', 'avg_glucose_level', '

!pip install graphviz


In [100… import graphviz

from sklearn import tree

from sklearn.tree import export_graphviz

tree.export_graphviz(dt,feature_names = features,\

out_file = 'dt.dot',\

label = 'all',\

filled = True,\

rounded = True)

---------------------------------------------------------------------------

ModuleNotFoundError Traceback (most recent call last)

<ipython-input-100-1e7839a47ba8> in <module>

----> 1 import graphviz

2 from sklearn import tree

3 from sklearn.tree import export_graphviz

5 tree.export_graphviz(dt,feature_names = features,\

ModuleNotFoundError: No module named 'graphviz'


l'image générée est .dot. Ce format décrit trois types d'objets : graphe, sommet (noeuds) et arrête (branche).
Il
faut la convertir en png pour pouvoir la visualiser.
https://onlineconvertfree.com/fr/convert-format/dot-to-png/
et un convertisseur en ligne.
Convertir l'image et la placer dans votre répertoire pour la visualiser.
In [ ]: Image('dt.png')

X.5 - RANDOM FOREST


In [ ]: from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500)

In [ ]: rf.fit(X_train_sc,y_train)

# 500 nombre d'arbre et prend 80ù de X_train (8000)

In [ ]: y_pred_rf = rf.predict(X_test_sc)

In [ ]: rf_acc=accuracy_score(y_test,y_pred_rf)

rf_rec=recall_score(y_test,y_pred_rf)

rf_prec=precision_score(y_test,y_pred_rf)

print('Confusion matrix rf \n', confusion_matrix(y_test,y_pred_rf))

print('Accuracy rf', rf_acc)

print('Recall rf', rf_rec)

print('Precision rf', rf_prec)

NB : Vous pouvez de même visualiser votre random forest avec Graphiz.

X.6 - SVM
1. Linéaire SVM
Si le paramètre kernel='linear'
C'est à dire la séparation entre les classes est une ligne d'equation ax+b=0
In [ ]: Image(filename='linear_SVM.png')

In [ ]: from sklearn.svm import SVC

linear_SVM = SVC(kernel='linear')

linear_SVM.fit(X_train_sc,y_train)

In [ ]: y_predictSVM_l = linear_SVM.predict(X_test_sc)

In [ ]: svm_acc=accuracy_score(y_test,y_predictSVM_l)

svm_prec=precision_score(y_test,y_predictSVM_l)

svm_rec=recall_score(y_test,y_predictSVM_l)

print(confusion_matrix(y_test,y_predictSVM_l))

print('Accuracy linear SVM {0:.3f}'.format(svm_acc))

print('Precision linear SVM {0:.3f}'.format(svm_prec))

print('Recall linear SVM {0:.3f}'.format(svm_rec))

In [ ]: print(classification_report(y_test,y_predictSVM_l))

1. Kernel SVM
Si le paramètre kernel='rbf'
C'est à dire la séparation entre les classe est un plan, et la répartition des points est
vue selon une fonction appelée RBF.
In [ ]: import matplotlib.image as mpimg

fig = plt.figure()

a=fig.add_subplot(1,2,1)

img1 = mpimg.imread('rbf.gif')

img2 = mpimg.imread('kernel_svm.png')

plt.figure(1)

plt.subplot(211)

plt.imshow(img1)

plt.subplot(212)

plt.imshow(img2)

plt.show()

In [ ]: kernel_SVM = SVC(kernel='rbf')

kernel_SVM.fit(X_train_sc,y_train)

y_predictSVM_k = kernel_SVM.predict(X_test_sc)

Ksvm_acc=accuracy_score(y_test,y_predictSVM_k)

Ksvm_rec=precision_score(y_test,y_predictSVM_k)

Ksvm_prec=recall_score(y_test,y_predictSVM_k)

print(confusion_matrix(y_test,y_predictSVM_k))

print('Accuracy rbf SVM {0:.3f}'.format(Ksvm_acc))

print('Precision rbf SVM {0:.3f}'.format(Ksvm_rec))

print('Recall rbf SVM {0:.3f}'.format(Ksvm_prec))

In [ ]: print(classification_report(y_test,y_predictSVM_k))

NB : Remarquer dans cet exemple, le SVM linéaire et kernel ont donné presque les mêmes
résultats.

X.6 Logistic regression


In [ ]: Image(filename='logistic.png')

In [ ]: from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()

LR.fit(X_train_sc,y_train)

In [ ]: y_predictLR = LR.predict(X_test_sc)

LR_acc=accuracy_score(y_test,y_predictLR)

LR_prec=metrics.precision_score(y_test,y_predictLR)

LR_rec=metrics.recall_score(y_test,y_predictLR)

print(confusion_matrix(y_test,y_predictLR))

print('Accuracy Logistic Regression {0:.3f}'.format(LR_acc))

print('Precision Logistic Regression {0:.3f}'.format(LR_prec))

print('Recall Logistic Regression {0:.3f}'.format(LR_rec))

8. Summary
In [ ]: print("| Algorithm | Accuracy |Recall | Precision |")

print("| --------------- | -------------|-----------|------------|")

print("| Optimized kNN | {0:.3f} | {0:.3f} | {0:.3f} |".format(knn


print("| Decision Tree | {0:.3f} | {0:.3f} | {0:.3f} |".format(dt_
print("| Random Forest | {0:.3f} | {0:.3f} | {0:.3f} |".format(rf_
print("| Linear SVM | {0:.3f} | {0:.3f} | {0:.3f} |".format(svm
print("| Kernel SVM | {0:.3f} | {0:.3f} | {0:.3f} |".format(Ksv
print("| Logistic Reg | {0:.3f} | {0:.3f} | {0:.3f} |".format(LR_

You might also like