Professional Documents
Culture Documents
IMPORTS
In [1]: import pandas as pd
import numpy as np
import math
warnings.filterwarnings('ignore')
II . LOAD DATA
1- importer le fichier de données
print(type(df))
<class 'pandas.core.frame.DataFrame'>
In [3]: df.head(-1)
Self-
1 51676 Female 61.0 0 0 Yes Rural
employed
Self-
4 1665 Female 79.0 1 0 Yes Rural
employed
Self-
5106 44873 Female 81.0 0 0 Yes Urban
employed
Self-
5107 19723 Female 35.0 0 0 Yes Rural
employed
In [5]: df.describe(include=['object'])
unique 3 2 5 2 4
In [6]: df.groupby('age').size()
Out[6]: age
0.08 2
0.16 3
0.24 5
0.32 5
0.40 2
...
78.00 102
79.00 85
80.00 70
81.00 60
82.00 56
Out[7]: 104
df.tail(-4)
Self-
4 1665 Female 79.0 1 0 Yes Rural
employed
Self-
5106 44873 Female 81.0 0 0 Yes Urban
employed
Self-
5107 19723 Female 35.0 0 0 Yes Rural
employed
1. Data Dimension
Afficher le shape des lignes et des colonnes du dataframe
nombrede ligne 5110*nombre
de colonnes: 12*
Le nombre de colonnes 12
2. Data types
2.1 - afficher les informations du dataframe
In [10]: df.info()
<class 'pandas.core.frame.DataFrame'>
In [11]: df.dtypes
Out[11]: id int64
gender object
age float64
hypertension int64
heart_disease int64
ever_married object
work_type object
Residence_type object
avg_glucose_level float64
bmi float64
smoking_status object
stroke int64
dtype: object
2.3 - parcourir les colonnes du dataframe (for) pour afficher leur type et le nombre de valeurs null (ou
manquantes)
In [12]: for var in df.columns:
Variable: id 5110
2.4 - Supprimer la colonne Id qui n'est pas intéressante (df.drop(['col'], axis=1, inplace=True))
Vérifier.
In [14]: df.head()
Self-
1 Female 61.0 0 0 Yes Rural 2
employed
Self-
4 Female 79.0 1 0 Yes Rural 1
employed
df.head()
Self-
1 Female 61 0 0 Yes Rural 2
employed
Self-
4 Female 79 1 0 Yes Rural 1
employed
In [16]: df
Self-
1 Female 61 0 0 Yes Rural
employed
Self-
4 Female 79 1 0 Yes Rural
employed
Self-
5106 Female 81 0 0 Yes Urban
employed
Self-
5107 Female 35 0 0 Yes Rural
employed
3. Check NaN
3.1 - Vérifier l'existance de valeurs manquantes
In [17]: df.isnull().any()
age False
hypertension False
heart_disease False
ever_married False
work_type False
Residence_type False
avg_glucose_level False
bmi True
smoking_status False
stroke False
dtype: bool
3.2 - chercher le nombre de valeurs NaN dans chaque colonne et son nombre de valeurs
In [18]: df.isna().sum()
Out[18]: gender 0
age 0
hypertension 0
heart_disease 0
ever_married 0
work_type 0
Residence_type 0
avg_glucose_level 0
bmi 201
smoking_status 0
stroke 0
dtype: int64
In [19]: df.bmi.isna().value_counts()
True 201
In [20]: dd=df.copy()
In [21]: dd.dropna(subset=['bmi'],axis=0,inplace=False)
Self-
4 Female 79 1 0 Yes Rural
employed
Self-
5106 Female 81 0 0 Yes Urban
employed
Self-
5107 Female 35 0 0 Yes Rural
employed
In [23]: df.stroke.value_counts()
Out[23]: 0 4861
1 249
In [24]: df.gender.value_counts()
Male 2115
Other 1
In [25]: df.ever_married.value_counts()
No 1757
4. Descriptive Statistical
4.1 - examiner le rapport entre le type de travail et le stroke (pandas.crosstab(...))
In [26]: pd.crosstab(df['work_type'],df['stroke'])
Out[26]: stroke 0 1
work_type
Govt_job 624 33
Never_worked 22 0
Self-employed 754 65
children 685 2
In [27]: plt.figure(figsize=(15,5))
5. Numerical Variable
5.1 - créer un dataframe 'df_num' intermediaire contenant les variables numérique seulement
5.2- créer un dataframe 'df_cat' intermediaire contenant les variables catégorielles seulement
In [29]: df_cat = df[['gender', 'hypertension', 'heart_disease', 'ever_married',
In [31]: plt.figure(figsize=(5,5))
ax=sns.boxplot(data=df,orient='v')
In [33]: sns.set(style='whitegrid')
ax=sns.boxplot(data=df['age'],orient='v')
In [34]: sns.set(style='whitegrid')
ax=sns.boxplot(df['bmi'],orient='h')
In [35]: Q1=df['bmi'].quantile(0.25)
Q3=df['bmi'].quantile(0.75)
IQR=Q3-Q1
Inf = df['bmi'].quantile(0.25)-1.5*IQR
Sup = df['bmi'].quantile(0.75)+1.5*IQR
In [36]: print(Sup)
46.29999999999999
In [37]: df[df['bmi']>=Sup].count()
age 125
hypertension 125
heart_disease 125
ever_married 125
work_type 125
Residence_type 125
avg_glucose_level 125
bmi 125
smoking_status 125
stroke 125
dtype: int64
4 - Ecrire une méthode drop_outliers() qui supprime les outliers du dataframe
q1=data[field_name].quantile(0.25)
q3=data[field_name].quantile(0.75)
iqr=q3-q1
inf = data[field_name].quantile(0.25)-1.5*iqr
sup = data[field_name].quantile(0.75)+1.5*iqr
data.drop(data[data[field_name]>sup].index,inplace=True)
data.drop(data[data[field_name]<inf].index,inplace=True)
In [39]: drop_outliers(df,'bmi')
In [40]: sns.set(style='whitegrid')
ax=sns.boxplot(df['bmi'],orient='h')
In [41]: sns.set(style='whitegrid')
ax=sns.boxplot(df['avg_glucose_level'],orient='h')
In [42]: Q1g=df['avg_glucose_level'].quantile(0.25)
Q3g=df['avg_glucose_level'].quantile(0.75)
IQRg=Q3g-Q1g
Infg = df['avg_glucose_level'].quantile(0.25)-1.5*IQRg
Supg = df['avg_glucose_level'].quantile(0.75)+1.5*IQRg
for i in df['avg_glucose_level']:
if i >=Supg or i<=Infg :
df['avg_glucose_level']=df['avg_glucose_level'].replace(i,np.mean(df['av
In [43]: sns.set(style='whitegrid')
ax=sns.boxplot(x=df['avg_glucose_level'],orient='h') #, showfliers=False)
Out[44]: gender 0
age 0
hypertension 0
heart_disease 0
ever_married 0
work_type 0
Residence_type 0
avg_glucose_level 0
bmi 0
smoking_status 0
stroke 0
dtype: int64
In [45]: print(Supg)
print(Infg)
168.59124999999997
22.36125000000002
Interpréter.
In [46]: plt.figure(figsize=(15,5))
Interpréter.
In [47]: plt.figure(figsize=(15,5))
sns.countplot(x='stroke',hue=df['work_type'],palette='Set2',data=df);
In [48]: plt.figure(figsize=(5,5))
Interpréter.
In [49]: plt.figure(figsize=(5,5))
Interpréter.
In [50]: plt.figure(figsize=(15,5))
Les personnes qui ne fumaient pas avaient une incidence plus élevée d'AVC que les autres qui fumaient ou
avaient déjà fumé. Cependant, il y a une grande partie qui est inconnue et qui peut - ou non - changer ce
scénario.
6- Afficher la correlation entre les variables du dataframe (coefficient de pearson)
In [51]: plt.figure(figsize=(10,10))
In [52]: df.shape
In [53]: df.head()
Self-
1 Female 61 0 0 Yes Rural 105.70
employed
Self-
4 Female 79 1 0 Yes Rural 105.65
employed
In [54]: X = df.iloc[:,0:10]
Y= df.iloc[:,10]
3 - afficher un echontillion de X et Y
In [55]: X.head()
Self-
1 Female 61 0 0 Yes Rural 105.70
employed
Self-
4 Female 79 1 0 Yes Rural 105.65
employed
In [56]: Y.head()
Out[56]: 0 1
1 1
2 1
3 1
4 1
In [57]: X=X.values
Y=Y.values
In [58]: type(X)
Out[58]: numpy.ndarray
In [59]: X[:5,:]
'never smoked'],
label = LabelEncoder()
X contient les colonnes dans l'ordre : 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
'work_type',
'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status'
2 - Vérifier cette transformation
In [62]: X[:10,:]
'formerly smoked'],
'never smoked'],
'formerly smoked'],
'Unknown'],
dtype=object)
3 - nous transformons la colonne work_type qui a 5 etats : 'private','self-employed','gov-
job','children','never-worked'
In [64]: ct = ColumnTransformer([('work_type',OneHotEncoder(),[5])],\
remainder = 'passthrough')
In [65]: X = ct.fit_transform(X)
In [67]: X[:,0:5]
...,
In [68]: X = X[:,1:]
In [69]: X.shape
In [70]: X[:5,:]
'formerly smoked'],
'never smoked'],
'smokes'],
In [71]: ct = ColumnTransformer([('smoking_status',OneHotEncoder(),[12])],\
remainder = 'passthrough')
In [72]: X = ct.fit_transform(X)
Out[73]: array([[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1, 67, 0, 1, 1, 1,
105.73068820224687, 36.6],
[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0, 61, 0, 0, 1, 0,
105.70601739329871, 28.893236911794673],
105.92, 32.5],
105.66729187860902, 34.4],
[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0, 79, 1, 0, 1, 0,
In [74]: X = X[:,1:]
In [75]: X.shape
In [76]: X[:5,:]
105.73068820224687, 36.6],
[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0, 61, 0, 0, 1, 0,
105.70601739329871, 28.893236911794673],
32.5],
105.66729187860902, 34.4],
[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0, 79, 1, 0, 1, 0,
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,\
random_state=0)
#vous pouvez garantir que la sortie de Run 1 sera égale à la sortie de Run 2,
#c'est-à-dire que votre division sera toujours la même. Peu importe le nombre réel d
# L'important est qu'à chaque fois que vous utilisez 42, vous obtiendrez toujours la
#vous effectuez le fractionnement.
In [78]: X_train.shape
In [79]: X_test.shape
In [80]: Y.mean()
Out[80]: 0.049357945425361156
In [81]: print(y_train.mean())
print(y_test.mean())
0.04890895410082769
0.05115346038114343
scaler = StandardScaler()
X_test_sc = scaler.transform(X_test)
X. Modélisation
X.1 - k Nearest Neighbors
In [85]: from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train_sc,y_train)
y_pred_knn = knn.predict(X_test_sc)
[[937 9]
[ 49 2]]
print(classification_report(y_test,y_pred_knn))
print(y_test[i],y_pred_knn[i])
0 0
0 0
0 0
0 0
0 0
1 0
0 0
0 0
0 0
0 1
model = KNeighborsClassifier()
grille = clf.fit(X_train_sc,y_train)
print(grille.best_params_)
print(grille.best_score_)
{'n_neighbors': 11}
0.9510910274431372
In [92]: knn_op_acc=accuracy_score(y_test,y_pred_knn_o)
knn_op_rec=recall_score(y_test,y_pred_knn_o)
knn_op_prec=precision_score(y_test,y_pred_knn_o)
[[946 0]
[ 51 0]]
dt = DecisionTreeClassifier()
In [94]: dt.fit(X_train_sc,y_train)
Out[94]: DecisionTreeClassifier()
print(y_test[i],y_pred_dt[i])
0 0
0 0
0 0
0 0
0 0
1 0
0 0
0 0
0 1
0 0
dt_rec = recall_score(y_test,y_pred_dt)
dt_prec = precision_score(y_test,y_pred_dt)
Confusion matrix dt
[[903 43]
[ 31 20]]
Accuracy dt 0.925777331995988
Recall dt 0.39215686274509803
Precision dt 0.31746031746031744
In [98]: print(classification_report(y_test,y_pred_dt))
tree.export_graphviz(dt,feature_names = features,\
out_file = 'dt.dot',\
label = 'all',\
filled = True,\
rounded = True)
---------------------------------------------------------------------------
<ipython-input-100-1e7839a47ba8> in <module>
5 tree.export_graphviz(dt,feature_names = features,\
rf = RandomForestClassifier(n_estimators=500)
In [ ]: rf.fit(X_train_sc,y_train)
In [ ]: rf_acc=accuracy_score(y_test,y_pred_rf)
rf_rec=recall_score(y_test,y_pred_rf)
rf_prec=precision_score(y_test,y_pred_rf)
X.6 - SVM
1. Linéaire SVM
Si le paramètre kernel='linear'
C'est à dire la séparation entre les classes est une ligne d'equation ax+b=0
In [ ]: Image(filename='linear_SVM.png')
linear_SVM = SVC(kernel='linear')
linear_SVM.fit(X_train_sc,y_train)
In [ ]: svm_acc=accuracy_score(y_test,y_predictSVM_l)
svm_prec=precision_score(y_test,y_predictSVM_l)
svm_rec=recall_score(y_test,y_predictSVM_l)
print(confusion_matrix(y_test,y_predictSVM_l))
In [ ]: print(classification_report(y_test,y_predictSVM_l))
1. Kernel SVM
Si le paramètre kernel='rbf'
C'est à dire la séparation entre les classe est un plan, et la répartition des points est
vue selon une fonction appelée RBF.
In [ ]: import matplotlib.image as mpimg
fig = plt.figure()
a=fig.add_subplot(1,2,1)
img1 = mpimg.imread('rbf.gif')
img2 = mpimg.imread('kernel_svm.png')
plt.figure(1)
plt.subplot(211)
plt.imshow(img1)
plt.subplot(212)
plt.imshow(img2)
plt.show()
kernel_SVM.fit(X_train_sc,y_train)
y_predictSVM_k = kernel_SVM.predict(X_test_sc)
Ksvm_acc=accuracy_score(y_test,y_predictSVM_k)
Ksvm_rec=precision_score(y_test,y_predictSVM_k)
Ksvm_prec=recall_score(y_test,y_predictSVM_k)
print(confusion_matrix(y_test,y_predictSVM_k))
In [ ]: print(classification_report(y_test,y_predictSVM_k))
NB : Remarquer dans cet exemple, le SVM linéaire et kernel ont donné presque les mêmes
résultats.
LR = LogisticRegression()
LR.fit(X_train_sc,y_train)
LR_acc=accuracy_score(y_test,y_predictLR)
LR_prec=metrics.precision_score(y_test,y_predictLR)
LR_rec=metrics.recall_score(y_test,y_predictLR)
print(confusion_matrix(y_test,y_predictLR))
8. Summary
In [ ]: print("| Algorithm | Accuracy |Recall | Precision |")