Professional Documents
Culture Documents
October 9, 2023
1
if test_idx:
# plot all examples
X_test, y_test = X[test_idx, :], y[test_idx]
2
##29
##0 and 1
[0 1]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Time 284807 non-null float64
1 V1 284807 non-null float64
2 V2 284807 non-null float64
3 V3 284807 non-null float64
4 V4 284807 non-null float64
5 V5 284807 non-null float64
6 V6 284807 non-null float64
7 V7 284807 non-null float64
8 V8 284807 non-null float64
9 V9 284807 non-null float64
10 V10 284807 non-null float64
11 V11 284807 non-null float64
12 V12 284807 non-null float64
13 V13 284807 non-null float64
14 V14 284807 non-null float64
15 V15 284807 non-null float64
16 V16 284807 non-null float64
17 V17 284807 non-null float64
18 V18 284807 non-null float64
19 V19 284807 non-null float64
3
20 V20 284807 non-null float64
21 V21 284807 non-null float64
22 V22 284807 non-null float64
23 V23 284807 non-null float64
24 V24 284807 non-null float64
25 V25 284807 non-null float64
26 V26 284807 non-null float64
27 V27 284807 non-null float64
28 V28 284807 non-null float64
29 Amount 284807 non-null float64
30 Class 284807 non-null int64
dtypes: float64(30), int64(1)
memory usage: 67.4 MB
None
0
Time 0
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
4
V24 0
V25 0
V26 0
V27 0
V28 0
Amount 0
Class 0
plt.show()
df1.describe().T
5
0 284315
1 492
Name: Class, dtype: int64
V1 -0.920373
V2 -0.598550
V3 -0.890365
V4 -0.848640
V5 -0.691597
V6 -0.768296
V7 -0.554076
V8 -0.208630
V9 -0.643098
V10 -0.535426
V11 -0.762494
V12 -0.405571
V13 -0.648539
V14 -0.425574
V15 -0.582884
V16 -0.468037
V17 -0.483748
V18 -0.498850
V19 -0.456299
V20 -0.211721
V21 -0.228395
V22 -0.542350
V23 -0.161846
V24 -0.354586
V25 -0.317145
V26 -0.326984
V27 -0.070840
V28 -0.052960
Name: 0.25, dtype: float64 V1 1.315642
V2 0.803724
V3 1.027196
V4 0.743341
V5 0.611926
V6 0.398565
V7 0.570436
V8 0.327346
V9 0.597139
V10 0.453923
6
V11 0.739593
V12 0.618238
V13 0.662505
V14 0.493150
V15 0.648821
V16 0.523296
V17 0.399675
V18 0.500807
V19 0.458949
V20 0.133041
V21 0.186377
V22 0.528554
V23 0.147642
V24 0.439527
V25 0.350716
V26 0.240952
V27 0.091045
V28 0.078280
Name: 0.75, dtype: float64 V1 2.236015
V2 1.402274
V3 1.917560
V4 1.591981
V5 1.303524
V6 1.166861
V7 1.124512
V8 0.535976
V9 1.240237
V10 0.989349
V11 1.502088
V12 1.023810
V13 1.311044
V14 0.918724
V15 1.231705
V16 0.991333
V17 0.883423
V18 0.999657
V19 0.915248
V20 0.344762
V21 0.414772
V22 1.070904
V23 0.309488
V24 0.794113
V25 0.667861
V26 0.567936
V27 0.161885
V28 0.131240
dtype: float64
7
[7]: #creating dataframes for points outside the iqr
q1 = ((df1 < (Q1 - 1.5 * IQR)).sum())
q2 = ((df1 > (Q3 + 1.5 * IQR)).sum())
print(q1)
print(q2)
V1 7062
V2 8430
V3 3343
V4 2243
V5 3884
V6 1752
V7 4810
V8 12237
V9 2439
V10 3368
V11 119
V12 14579
V13 1139
V14 8757
V15 2465
V16 6521
V17 757
V18 3986
V19 5063
V20 8683
V21 6438
V22 926
V23 8170
V24 4638
V25 3688
V26 729
V27 19544
V28 18542
dtype: int64
V1 0
V2 5096
V3 20
V4 8905
V5 8411
V6 21213
V7 4138
V8 11897
8
V9 5844
V10 6128
V11 661
V12 769
V13 2229
V14 5392
V15 429
V16 1663
V17 6663
V18 3547
V19 5142
V20 19087
V21 8059
V22 391
V23 10371
V24 136
V25 1679
V26 4867
V27 19619
V28 11800
dtype: int64
V1 -4.274396
V2 -2.701961
V3 -3.766705
V4 -3.236612
V5 -2.646882
V6 -2.518586
V7 -2.240844
V8 -1.012593
V9 -2.503452
V10 -2.019449
V11 -3.015626
V12 -1.941286
V13 -2.615106
V14 -1.803660
V15 -2.430442
V16 -1.955036
V17 -1.808883
V18 -1.998335
V19 -1.829171
V20 -0.728865
V21 -0.850553
V22 -2.148706
V23 -0.626079
V24 -1.545755
V25 -1.318936
V26 -1.178888
V27 -0.313667
9
V28 -0.249819
dtype: float64
V1 4.669664
V2 2.907135
V3 3.903536
V4 3.131313
V5 2.567212
V6 2.148856
V7 2.257204
V8 1.131309
V9 2.457494
V10 1.937947
V11 2.992725
V12 2.153952
V13 2.629071
V14 1.871236
V15 2.496378
V16 2.010296
V17 1.724810
V18 2.000292
V19 1.831822
V20 0.650184
V21 0.808535
V22 2.134910
V23 0.611875
V24 1.630696
V25 1.352506
V26 1.092856
V27 0.333872
V28 0.275140
dtype: float64
10
-0.6430975702665915 0.5971390302822686 1.24023660054886
-0.5354257264933235 0.453923445139507 0.9893491716328305
-0.7624941955129775 0.739593407321606 1.5020876028345835
-0.40557148544041355 0.618238032946136 1.0238095183865497
-0.6485392991145684 0.662504959439974 1.3110442585545425
-0.4255740124549935 0.493149849218149 0.9187238616731425
-0.582884279157456 0.648820806317158 1.2317050854746139
-0.46803676671289796 0.523296312475344 0.991333079188242
/tmp/ipykernel_1165/417446864.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
11
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0
V28 0
dtype: int64
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0
12
V28 0
dtype: int64
plt.figure(figsize= (15,20))
plt.subplot(3, 3, 1)
plt.hist(df1['V1'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V1')
plt.subplot(3,3,2)
plt.hist(df1['V2'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V2')
plt.subplot(3,3,3)
plt.hist(df1['V3'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V3')
plt.subplot(3,3,4)
plt.hist(df1['V4'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V4')
plt.subplot(3, 3, 5)
plt.hist(df1['V5'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V5')
plt.subplot(3,3,6)
plt.hist(df1['V6'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V6')
plt.show()
13
[11]: plt.figure(figsize= (15,20))
plt.subplot(3,3,1)
plt.hist(df1['V7'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V7')
plt.subplot(3,3,2)
plt.hist(df1['V8'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V8')
plt.subplot(3,3,3)
plt.hist(df1['V9'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V9')
plt.subplot(3,3,4)
plt.hist(df1['V10'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V10')
14
plt.subplot(3,3,5)
plt.hist(df1['V11'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V11')
plt.subplot(3,3,6)
plt.hist(df1['V12'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V12')
plt.show()
plt.subplot(3,3, 2)
plt.hist(df1['V14'], color='lightblue', edgecolor = 'black', alpha = 0.7)
15
plt.xlabel('V14')
plt.subplot(3,3,3)
plt.hist(df1['V15'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V15')
plt.subplot(3,3,4)
plt.hist(df1['V16'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V16')
plt.subplot(3,3,5)
plt.hist(df1['V17'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V17')
plt.subplot(3,3,6)
plt.hist(df1['V18'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V18')
plt.show()
16
[13]: plt.figure(figsize= (15,20))
plt.subplot(3,3,1)
plt.hist(df1['V19'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V19')
plt.subplot(3,3,2)
plt.hist(df1['V20'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V20')
plt.subplot(3,3,3)
plt.hist(df1['V21'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V21')
plt.subplot(3,3,4)
plt.hist(df1['V22'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V22')
plt.subplot(3,3,5)
plt.hist(df1['V23'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V23')
plt.subplot(3,3,6)
plt.hist(df1['V24'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V24')
plt.show()
17
[14]: plt.figure(figsize= (15,20))
plt.subplot(3,3,1)
plt.hist(df1['V25'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V25')
plt.subplot(3,3,2)
plt.hist(df1['V26'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V26')
plt.subplot(3,3,3)
plt.hist(df1['V27'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V27')
plt.subplot(3,3,4)
plt.hist(df1['V28'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V28')
18
plt.show()
[ ]:
plt.figure(figsize= (25,25))
pairplot1 = sns.heatmap(df1.corr(),annot=True)
plt.show(pairplot1)
/tmp/ipykernel_1165/593985081.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
19
See the caveats in the documentation: https://pandas.pydata.org/pandas-
docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1['Class'] = df['Class']
[16]: # did you perform feature engineering (drop irrelevant features/columns, for␣
↪example)?
df2 = df1[['V2','V6','V7','V8','V21','V22','V27','V28','Class']]
plt.figure(figsize= (8,7))
pairplot2 = sns.heatmap(df2.corr(),annot=True)
plt.show(pairplot2)
# summarize your findings
20
#I chose the most correlated features to predict fraud and iterated through␣
↪them to see which ones were best
0 284315
1 492
Name: Class, dtype: int64
[18]: #balance data but keeping a little more non fraud cases becuase they are more␣
↪common
#10,000 was the best sample size for me to balance bias and variance
df3 = df2
index = df2[(df2['Class'] == 0)].sample(n=278000).index
df3.drop(index,inplace=True)
21
each_class = df3['Class'].value_counts()
print(each_class)
0 6315
1 492
Name: Class, dtype: int64
/tmp/ipykernel_1165/1221864664.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
[19]: X = df3.iloc[:,[2,3]]
y = df3.iloc[:,[8]]
y = pd.factorize(df3['Class'])[0]
[21]: print(np.bincount(y))
print(np.bincount(y_train))
print(np.bincount(y_test))
[6315 492]
[4420 344]
[1895 148]
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
tree_model.fit(X_train,y_train)
y_pred_test = tree_model.predict(X_test)
y_pred_train = tree_model.predict(X_train)
22
[24]: from sklearn.model_selection import KFold
# Splitting the training set from the validation set for this specific fold
X_train1, X_val1 = pd.DataFrame(X).iloc[train_index, :], pd.DataFrame(X).
↪iloc[val_index, :]
#print(X_train)
#print(y_train)
#print(X_val)
#print(y_val)
tree_model.fit(X_train1, y_train1)
score = tree_model.score(X_val1, y_val1)
print(f"fold :", i, "score: ", score)
y_pred_test1 = tree_model.predict(X_val1)
confmax = confusion_matrix(y_val1,y_pred_test1)
#print(confmax)
scores.append(score)
23
fold : 6 score: 0.9632892804698973
fold : 7 score: 0.95
fold : 8 score: 0.9661764705882353
fold : 9 score: 0.9426470588235294
Averge: 0.9544579770234087
#for spam emails we want to have less false positives so important emails dont␣
↪get flagged as spam
24
[[1870 25]
[ 71 77]]
print('Test:')
print('Test Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred_test))
print('Training Accuracy: %.3f' % accuracy_score(y_true=y_train,␣
↪y_pred=y_pred_train))
print('Error: {}'.format(round(1-accuracy_score(y_true=y_test,␣
↪y_pred=y_pred_test),3)))
print('Specificity: {}'.format(round(tree_confmat[0][1]/(tree_confmat[0][1] +␣
↪tree_confmat[1][0]),3)))
25
print()
null_acc = 6000/(6000
+each_class[1])
print('Null Accuracy: %.3f'% null_acc)
#i tried to get a better f1 score and recall becuase its important for fraud␣
↪detection
Test:
Test Accuracy: 0.953
Training Accuracy: 0.958
Error: 0.047
Specificity: 0.26
Precision: 0.755
Recall/Sensitivity: 0.520
F1: 0.616
#print(train_sizes)
#print(train_scores)
#print(test_scores)
plt.plot(train_sizes, train_mean,
color='blue', marker='o',
26
markersize=5, label='Training accuracy')
plt.fill_between(train_sizes,
train_mean + train_std,
train_mean - train_std,
alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean,
color='green', linestyle='--',
marker='s', markersize=5,
label='Validation accuracy')
plt.fill_between(train_sizes,
test_mean + test_std,
test_mean - test_std,
alpha=0.15, color='green')
plt.axis([750, 5000, 0, 1.3])
plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.0, 1.1])
plt.tight_layout()
# plt.savefig('images/06_05.png', dpi=300)
plt.show()
#i tried to keep a balance between variance and bias to prevent overfitting and␣
↪underfitting
27
[30]: from sklearn.metrics import roc_curve, auc
28
plt.step(fpr,
tpr,
label='Train ROC (area = %0.2f)'
% (roc_auc))
plt.step(fpr,
tpr,
where='post',
label='Test ROC (area = %0.2f)'
% (roc_auc))
plt.plot([0, 1],
[0, 1],
linestyle='--',
color=(0.6, 0.6, 0.6),
label='Random guessing')
plt.plot([0, 0, 1],
[0, 1, 1],
linestyle=':',
color='black',
label='Perfect performance')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()
29
[31]: #KNN
knn = KNeighborsClassifier(n_neighbors=15,
p=2,
metric='minkowski')
[34]: sc.fit(X_train_k)
X_train_std_k = sc.transform(X_train_k)
X_test_std_k = sc.transform(X_test_k)
30
knn.fit(X_train_std_k, y_train_k)
[35]: KNeighborsClassifier(n_neighbors=15)
[38]: scores = []
# Splitting the training set from the validation set for this specific fold
X_train, X_val = pd.DataFrame(X_knn).iloc[train_index, :], pd.
↪DataFrame(X_knn).iloc[val_index, :]
#print(X_train)
#print(y_train)
#print(X_val)
#print(y_val)
knn.fit(X_train, y_train)
score = knn.score(X_val, y_val)
print(f"fold :", i, "score: ", score)
y_pred_test = knn.predict(X_val)
confmax = confusion_matrix(y_val,y_pred_test)
#print(confmax)
scores.append(score)
31
print("Averge: ", sum(scores)/len(scores))
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
fold : 0 score: 0.9588839941262849
fold : 1 score: 0.9500734214390602
fold : 2 score: 0.9574155653450808
fold : 3 score: 0.9441997063142438
fold : 4 score: 0.9618208516886931
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
fold : 5 score: 0.9559471365638766
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
32
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
fold : 6 score: 0.9588839941262849
fold : 7 score: 0.9514705882352941
fold : 8 score: 0.9514705882352941
fold : 9 score: 0.9544117647058824
Averge: 0.9544577610779996
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
[40]: print('Test:')
print('Test Accuracy: %.3f' % accuracy_score(y_true=y_test_k,␣
↪y_pred=y_pred_test_k))
Test:
Test Accuracy: 0.954
Training Accuracy: 0.948
Precision: 0.762
Recall: 0.541
F1: 0.632
33
[41]: import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
#print(train_sizes)
#print(train_scores)
#print(test_scores)
plt.plot(train_sizes, train_mean,
color='blue', marker='o',
markersize=5, label='Training accuracy')
plt.fill_between(train_sizes,
train_mean + train_std,
train_mean - train_std,
alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean,
color='green', linestyle='--',
marker='s', markersize=5,
label='Validation accuracy')
plt.fill_between(train_sizes,
test_mean + test_std,
test_mean - test_std,
alpha=0.15, color='green')
plt.axis([750, 5000, .8, 1.3])
plt.grid()
34
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.0, 1.0])
plt.tight_layout()
# plt.savefig('images/06_05.png', dpi=300)
plt.show()
35
#print("train_prob: ", train_probas)
#print("train_fpr: ", fpr)
#print("train_tpr: ", tpr)
#print("train_thr: ",thresholds)
#print()
#print()
plt.step(fpr,
tpr,
label='Train ROC (area = %0.2f)'
% (roc_auc))
plt.step(fpr,
tpr,
where='post',
label='Test ROC (area = %0.2f)'
% (roc_auc))
plt.plot([0, 1],
[0, 1],
linestyle='--',
color=(0.6, 0.6, 0.6),
label='Random guessing')
plt.plot([0, 0, 1],
[0, 1, 1],
linestyle=':',
color='black',
label='Perfect performance')
plt.xlim([-0.05, 1.05])
36
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()
[43]: #SVM
#C 10 had better results when I tried 100 I got an inveted auc roc curve
svm = SVC(kernel='rbf', C = 1, random_state=1,probability=True)
37
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_svm, y_svm,␣
↪test_size=.3,random_state=1, stratify=y_svm)
[47]: scores = []
# Splitting the training set from the validation set for this specific fold
X_train3, X_val3 = pd.DataFrame(X_svm).iloc[train_index, :], pd.
↪DataFrame(X_svm).iloc[val_index, :]
#print(X_train)
#print(y_train)
#print(X_val)
#print(y_val)
knn.fit(X_train3, y_train3)
score = knn.score(X_val3, y_val3)
print(f"fold :", i, "score: ", score)
y_pred_test3 = knn.predict(X_val3)
confmax = confusion_matrix(y_val3,y_pred_test3)
#print(confmax)
scores.append(score)
38
fold : 4 score: 0.947136563876652
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
fold : 5 score: 0.9574155653450808
fold : 6 score: 0.9618208516886931
fold : 7 score: 0.9573529411764706
fold : 8 score: 0.95
fold : 9 score: 0.9676470588235294
Averge: 0.9559287812041115
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
39
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
[48]: sc.fit(X_train_s)
X_train_std_s = sc.transform(X_train_s)
X_test_std_s = sc.transform(X_test_s)
y_pred_test_s = svm.predict(X_test_std_s)
y_pred_train_s = svm.predict(X_train_s)
#plot_decision_regions(X_combined_std, y_combined,
#classifier=svm, test_idx=range(105, 150))
[50]: print('Test:')
print('Test Accuracy: %.3f' % accuracy_score(y_true=y_test_k,␣
↪y_pred=y_pred_test_k))
40
print('Null Accuracy: %.3f'% null_acc)
Test:
Test Accuracy: 0.954
Training Accuracy: 0.948
Precision: 0.762
Recall: 0.541
F1: 0.632
#print(train_sizes)
#print(train_scores)
#print(test_scores)
plt.plot(train_sizes, train_mean,
color='blue', marker='o',
markersize=5, label='Training accuracy')
plt.fill_between(train_sizes,
41
train_mean + train_std,
train_mean - train_std,
alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean,
color='green', linestyle='--',
marker='s', markersize=5,
label='Validation accuracy')
plt.fill_between(train_sizes,
test_mean + test_std,
test_mean - test_std,
alpha=0.15, color='green')
plt.axis([750, 5000, .8, 1.3])
plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.0, 1.0])
plt.tight_layout()
# plt.savefig('images/06_05.png', dpi=300)
plt.show()
42
[57]: from sklearn.metrics import roc_curve, auc
43
plt.step(fpr,
tpr,
label='Train ROC (area = %0.2f)'
% (roc_auc))
plt.step(fpr,
tpr,
where='post',
label='Test ROC (area = %0.2f)'
% (roc_auc))
plt.plot([0, 1],
[0, 1],
linestyle='--',
color=(0.6, 0.6, 0.6),
label='Random guessing')
plt.plot([0, 0, 1],
[0, 1, 1],
linestyle=':',
color='black',
label='Perfect performance')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()
44
[ ]: #summary
[ ]: # I would use the knn model because it the accuracy was good for the area under␣
↪the curve and the prrecision was also good as well
# DT and SVM seemed to be less accurate and probably underfit. I struggled to␣
↪get a good result while iterating through the parameters and hyperparameters
# the roc auc score was especially low for svm. svm was hard to get good␣
↪results with.
#6000 samples of class 0 was the best option for me becuase it took less time␣
↪to compute on my laptop. the data needed to be balanced anyways but␣
↪realistically.
[ ]:
45