
blankEDA

October 9, 2023

0.1 Exploratory Data Analysis


[1]: import numpy as np  # used below; imported here so the cell is self-contained
import matplotlib  # to check recent matplotlib compatibility
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from distutils.version import LooseVersion


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot class examples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.8,
                    color=colors[idx],
                    marker=markers[idx],
                    label=cl,
                    edgecolor='black')

    # highlight test examples
    if test_idx:
        # plot all examples
        X_test, y_test = X[test_idx, :], y[test_idx]

        # older matplotlib accepted c='' for unfilled markers; newer
        # versions want c='none'
        if LooseVersion(matplotlib.__version__) < LooseVersion('0.3.4'):
            plt.scatter(X_test[:, 0],
                        X_test[:, 1],
                        c='',
                        edgecolor='black',
                        alpha=1.0,
                        linewidth=1,
                        marker='o',
                        s=100,
                        label='test set')
        else:
            plt.scatter(X_test[:, 0],
                        X_test[:, 1],
                        c='none',
                        edgecolor='black',
                        alpha=1.0,
                        linewidth=1,
                        marker='o',
                        s=100,
                        label='test set')

0.1.1 Import libraries

[2]: import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
import copy
sns.set()

0.1.2 1. Read in the data set and perform preliminary checks


[3]: # read the data set
df = pd.read_csv('creditcard (1).csv')

# how many samples do we have?
## 284807

# how many features do we have?
## 30 (Time, V1-V28, and Amount); Class is the target

# what are the classes?
classes = pd.unique(df['Class'])
print(classes)
## 0 and 1

# how many classes do we have?
## 2

# how many samples of each class? are they balanced?
## class 0 has 284315 samples and class 1 has 492, so they are far from balanced

# what are the names of the features?
## Time, V1-V28, and Amount

# what are the types of the features?
print(df.info())
## all features are float64 and the class label is int64

# summarize your findings
## The data is very imbalanced, but the dtypes look good and there are no nulls.

[0 1]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Time 284807 non-null float64
1 V1 284807 non-null float64
2 V2 284807 non-null float64
3 V3 284807 non-null float64
4 V4 284807 non-null float64
5 V5 284807 non-null float64
6 V6 284807 non-null float64
7 V7 284807 non-null float64
8 V8 284807 non-null float64
9 V9 284807 non-null float64
10 V10 284807 non-null float64
11 V11 284807 non-null float64
12 V12 284807 non-null float64
13 V13 284807 non-null float64
14 V14 284807 non-null float64
15 V15 284807 non-null float64
16 V16 284807 non-null float64
17 V17 284807 non-null float64
18 V18 284807 non-null float64
19 V19 284807 non-null float64

20 V20 284807 non-null float64
21 V21 284807 non-null float64
22 V22 284807 non-null float64
23 V23 284807 non-null float64
24 V24 284807 non-null float64
25 V25 284807 non-null float64
26 V26 284807 non-null float64
27 V27 284807 non-null float64
28 V28 284807 non-null float64
29 Amount 284807 non-null float64
30 Class 284807 non-null int64
dtypes: float64(30), int64(1)
memory usage: 67.4 MB
None
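The imbalance called out above can be made precise with normalized value counts; a quick sketch, not part of the original run:

[ ]: # proportion of each class
print(df['Class'].value_counts(normalize=True))
## roughly 99.8% class 0 vs 0.2% class 1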

0.1.3 2. Check missing values, duplicates, outliers and statistics

[4]: # are there any missing/nan/null values?
# note: without reassignment or inplace=True, this drop returns a copy and
# leaves df unchanged
df.drop(df.index[148636])
display(df.isna().sum().to_frame())
## no missing values
# if so, what are you going to do about the missing/nan/null values?
## nothing is needed -- there are none

0
Time 0
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0

V24 0
V25 0
V26 0
V27 0
V28 0
Amount 0
Class 0
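The section heading also mentions duplicates, which the cell above never checks explicitly (the drop of row 148636 may have been aimed at one); a minimal sketch using pandas' duplicated():

[ ]: # count exact duplicate rows; drop them if desired
print(df.duplicated().sum())
# df = df.drop_duplicates()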

[5]: # are there outliers?
df1 = df.loc[:, ~df.columns.isin(['Time','Amount','Class'])]
df1.boxplot(figsize=(15, 6))
plt.show()
df1.describe().T

# if so, what are you going to do about the outliers?
## I will cap them at the IQR fences (replacing each outlier with the
## nearest fence value)

# are there anomalies in the statistics?
## Yes

# if so, what are you going to do about the anomalies?
## I will replace the anomalies the same way

# what is the frequency distribution of each class?
## class 0 occurs far more frequently than class 1
each_class = df['Class'].value_counts()
print(each_class)

# summarize your findings
## There are a lot of outliers and anomalies, and the data is very imbalanced.

0 284315
1 492
Name: Class, dtype: int64

[6]: # calculate Q1 and Q3 to get the interquartile range (IQR)
Q1 = df1.quantile(0.25)
Q3 = df1.quantile(0.75)
IQR = Q3 - Q1
print(Q1, Q3, IQR)

V1 -0.920373
V2 -0.598550
V3 -0.890365
V4 -0.848640
V5 -0.691597
V6 -0.768296
V7 -0.554076
V8 -0.208630
V9 -0.643098
V10 -0.535426
V11 -0.762494
V12 -0.405571
V13 -0.648539
V14 -0.425574
V15 -0.582884
V16 -0.468037
V17 -0.483748
V18 -0.498850
V19 -0.456299
V20 -0.211721
V21 -0.228395
V22 -0.542350
V23 -0.161846
V24 -0.354586
V25 -0.317145
V26 -0.326984
V27 -0.070840
V28 -0.052960
Name: 0.25, dtype: float64 V1 1.315642
V2 0.803724
V3 1.027196
V4 0.743341
V5 0.611926
V6 0.398565
V7 0.570436
V8 0.327346
V9 0.597139
V10 0.453923

V11 0.739593
V12 0.618238
V13 0.662505
V14 0.493150
V15 0.648821
V16 0.523296
V17 0.399675
V18 0.500807
V19 0.458949
V20 0.133041
V21 0.186377
V22 0.528554
V23 0.147642
V24 0.439527
V25 0.350716
V26 0.240952
V27 0.091045
V28 0.078280
Name: 0.75, dtype: float64 V1 2.236015
V2 1.402274
V3 1.917560
V4 1.591981
V5 1.303524
V6 1.166861
V7 1.124512
V8 0.535976
V9 1.240237
V10 0.989349
V11 1.502088
V12 1.023810
V13 1.311044
V14 0.918724
V15 1.231705
V16 0.991333
V17 0.883423
V18 0.999657
V19 0.915248
V20 0.344762
V21 0.414772
V22 1.070904
V23 0.309488
V24 0.794113
V25 0.667861
V26 0.567936
V27 0.161885
V28 0.131240
dtype: float64

[7]: # count the points outside the IQR fences in each column
n_below = ((df1 < (Q1 - 1.5 * IQR)).sum())
n_above = ((df1 > (Q3 + 1.5 * IQR)).sum())
print(n_below)
print(n_above)

# anything beyond these fences is an outlier
print(Q1 - 1.5 * IQR)
print(Q3 + 1.5 * IQR)

V1 7062
V2 8430
V3 3343
V4 2243
V5 3884
V6 1752
V7 4810
V8 12237
V9 2439
V10 3368
V11 119
V12 14579
V13 1139
V14 8757
V15 2465
V16 6521
V17 757
V18 3986
V19 5063
V20 8683
V21 6438
V22 926
V23 8170
V24 4638
V25 3688
V26 729
V27 19544
V28 18542
dtype: int64
V1 0
V2 5096
V3 20
V4 8905
V5 8411
V6 21213
V7 4138
V8 11897

V9 5844
V10 6128
V11 661
V12 769
V13 2229
V14 5392
V15 429
V16 1663
V17 6663
V18 3547
V19 5142
V20 19087
V21 8059
V22 391
V23 10371
V24 136
V25 1679
V26 4867
V27 19619
V28 11800
dtype: int64
V1 -4.274396
V2 -2.701961
V3 -3.766705
V4 -3.236612
V5 -2.646882
V6 -2.518586
V7 -2.240844
V8 -1.012593
V9 -2.503452
V10 -2.019449
V11 -3.015626
V12 -1.941286
V13 -2.615106
V14 -1.803660
V15 -2.430442
V16 -1.955036
V17 -1.808883
V18 -1.998335
V19 -1.829171
V20 -0.728865
V21 -0.850553
V22 -2.148706
V23 -0.626079
V24 -1.545755
V25 -1.318936
V26 -1.178888
V27 -0.313667

V28 -0.249819
dtype: float64
V1 4.669664
V2 2.907135
V3 3.903536
V4 3.131313
V5 2.567212
V6 2.148856
V7 2.257204
V8 1.131309
V9 2.457494
V10 1.937947
V11 2.992725
V12 2.153952
V13 2.629071
V14 1.871236
V15 2.496378
V16 2.010296
V17 1.724810
V18 2.000292
V19 1.831822
V20 0.650184
V21 0.808535
V22 2.134910
V23 0.611875
V24 1.630696
V25 1.352506
V26 1.092856
V27 0.333872
V28 0.275140
dtype: float64

[8]: # cap each column at its IQR fences
for col in df1.columns:
    q1 = df1[col].quantile(0.25)
    q3 = df1[col].quantile(0.75)
    iqr = q3 - q1
    print(q1, q3, iqr)
    df1.loc[(df1[col] < (q1 - 1.5 * iqr)), col] = (q1 - 1.5 * iqr)
    df1.loc[(df1[col] > (q3 + 1.5 * iqr)), col] = (q3 + 1.5 * iqr)

-0.920373384390322 1.315641693877865 2.236015078268187


-0.598549913464916 0.8037238712400945 1.4022737847050104
-0.8903648381551406 1.027195542465555 1.9175603806206956
-0.848640116331273 0.7433412894685876 1.5919814057998605
-0.6915970708876575 0.611926439735193 1.3035235106228504
-0.768295608460489 0.39856489635610504 1.166860504816594
-0.5540758790365226 0.5704360728775986 1.1245119519141211
-0.2086297440394665 0.327345861923449 0.5359756059629155

-0.6430975702665915 0.5971390302822686 1.24023660054886
-0.5354257264933235 0.453923445139507 0.9893491716328305
-0.7624941955129775 0.739593407321606 1.5020876028345835
-0.40557148544041355 0.618238032946136 1.0238095183865497
-0.6485392991145684 0.662504959439974 1.3110442585545425
-0.4255740124549935 0.493149849218149 0.9187238616731425
-0.582884279157456 0.648820806317158 1.2317050854746139
-0.46803676671289796 0.523296312475344 0.991333079188242
/tmp/ipykernel_1165/417446864.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-


docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1.loc[(df1[col] < (q1 - 1.5 * iqr)), col] = (q1 - 1.5 * iqr)
/tmp/ipykernel_1165/417446864.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-


docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1.loc[(df1[col] > (q3 + 1.5 * iqr)), col] = (q3 + 1.5 * iqr)
-0.483748313707048 0.3996749826503845 0.8834232963574324
-0.498849798665041 0.5008067468872159 0.9996565455522569
-0.4562989187444475 0.458949355762679 0.9152482745071264
-0.21172136467424701 0.1330408409942945 0.34476220566854154
-0.22839494677851702 0.1863772033785755 0.4147721501570925
-0.5423503726606616 0.5285536353339865 1.0709040079946481
-0.16184634501488449 0.14764206385605 0.3094884088709345
-0.3545861364094985 0.439526600168186 0.7941127365776846
-0.31714505406527 0.350715562867386 0.667860616932656
-0.3269839258807195 0.2409521737147555 0.567936099595475
-0.07083952930446921 0.09104511968580689 0.1618846489902761
-0.0529597930169809 0.07827995475782015 0.13123974777480105
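As an aside, the per-column loop above can be collapsed into one vectorized call; a sketch assuming the Q1/Q3/IQR Series from cell [6], where DataFrame.clip() caps every column at its fences and the explicit .copy() avoids the SettingWithCopyWarning:

[ ]: df1 = df.loc[:, ~df.columns.isin(['Time','Amount','Class'])].copy()
df1 = df1.clip(lower=Q1 - 1.5 * IQR, upper=Q3 + 1.5 * IQR, axis=1)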

[9]: # verify that no points remain outside the fences
n_below = ((df1 < (Q1 - 1.5 * IQR)).sum())
n_above = ((df1 > (Q3 + 1.5 * IQR)).sum())
print(n_below)
print(n_above)

V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0

V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0
V28 0
dtype: int64
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0

V28 0
dtype: int64

0.1.4 3. Check distributions and correlations


[10]: # what kind of distributions do the features have?
## fairly normal, bell-shaped distributions after the capping
# what is your interpretation?
## the features already look centered near zero and roughly scaled

plt.figure(figsize= (15,20))
plt.subplot(3, 3, 1)
plt.hist(df1['V1'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V1')

plt.subplot(3,3,2)
plt.hist(df1['V2'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V2')

plt.subplot(3,3,3)
plt.hist(df1['V3'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V3')

plt.subplot(3,3,4)
plt.hist(df1['V4'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V4')

plt.subplot(3, 3, 5)
plt.hist(df1['V5'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V5')

plt.subplot(3,3,6)
plt.hist(df1['V6'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V6')

plt.show()

[11]: plt.figure(figsize= (15,20))
plt.subplot(3,3,1)
plt.hist(df1['V7'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V7')

plt.subplot(3,3,2)
plt.hist(df1['V8'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V8')

plt.subplot(3,3,3)
plt.hist(df1['V9'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V9')

plt.subplot(3,3,4)
plt.hist(df1['V10'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V10')

plt.subplot(3,3,5)
plt.hist(df1['V11'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V11')

plt.subplot(3,3,6)
plt.hist(df1['V12'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V12')

plt.show()

[12]: plt.figure(figsize= (15,20))


plt.subplot(3,3,1)
plt.hist(df1['V13'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V13')

plt.subplot(3,3, 2)
plt.hist(df1['V14'], color='lightblue', edgecolor = 'black', alpha = 0.7)

plt.xlabel('V14')

plt.subplot(3,3,3)
plt.hist(df1['V15'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V15')

plt.subplot(3,3,4)
plt.hist(df1['V16'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V16')

plt.subplot(3,3,5)
plt.hist(df1['V17'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V17')

plt.subplot(3,3,6)
plt.hist(df1['V18'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V18')
plt.show()

[13]: plt.figure(figsize= (15,20))
plt.subplot(3,3,1)
plt.hist(df1['V19'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V19')

plt.subplot(3,3,2)
plt.hist(df1['V20'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V20')

plt.subplot(3,3,3)
plt.hist(df1['V21'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V21')

plt.subplot(3,3,4)
plt.hist(df1['V22'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V22')

plt.subplot(3,3,5)
plt.hist(df1['V23'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V23')

plt.subplot(3,3,6)
plt.hist(df1['V24'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V24')
plt.show()

[14]: plt.figure(figsize= (15,20))
plt.subplot(3,3,1)
plt.hist(df1['V25'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V25')

plt.subplot(3,3,2)
plt.hist(df1['V26'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V26')

plt.subplot(3,3,3)
plt.hist(df1['V27'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V27')

plt.subplot(3,3,4)
plt.hist(df1['V28'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V28')

plt.show()
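The five plotting cells above repeat the same pattern feature by feature; a sketch that draws all 28 histograms in one loop (a 7x4 grid matches the 28 V columns):

[ ]: fig, axes = plt.subplots(7, 4, figsize=(15, 25))
for ax, col in zip(axes.ravel(), df1.columns):
    ax.hist(df1[col], color='lightblue', edgecolor='black', alpha=0.7)
    ax.set_xlabel(col)
plt.tight_layout()
plt.show()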


[15]: # what kind of correlation does the data exhibit?
## most feature pairs are only weakly correlated with each other
# what is your interpretation?
## certain features are correlated with Class noticeably better than others,
## making them more useful for predicting the class
df1['Class'] = df['Class']

plt.figure(figsize=(25, 25))
pairplot1 = sns.heatmap(df1.corr(), annot=True)
plt.show()

/tmp/ipykernel_1165/593985081.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-
docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1['Class'] = df['Class']
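To make the claim above concrete, the features can be ranked by the absolute value of their correlation with Class; a minimal sketch:

[ ]: corr_with_class = df1.corr()['Class'].drop('Class').abs()
print(corr_with_class.sort_values(ascending=False).head(10))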

[16]: # did you perform feature engineering (drop irrelevant features/columns,
# for example)?
df2 = df1[['V2','V6','V7','V8','V21','V22','V27','V28','Class']]
plt.figure(figsize=(8, 7))
pairplot2 = sns.heatmap(df2.corr(), annot=True)
plt.show()

# summarize your findings
## I chose the features most correlated with Class to predict fraud and
## iterated through them to see which ones were best

[17]: each_class = df['Class'].value_counts()
print(each_class)

0 284315
1 492
Name: Class, dtype: int64

[18]: # balance the data, but keep somewhat more non-fraud cases because they
# are more common in reality; this sample size was the best trade-off
# between bias and variance for me

# note: df3 = df2 creates an alias, not a copy, so the in-place drop below
# also modifies df2; df2.copy() would avoid that (see the sketch after the
# output)
df3 = df2
index = df2[(df2['Class'] == 0)].sample(n=278000).index
df3.drop(index, inplace=True)

each_class = df3['Class'].value_counts()
print(each_class)

0 6315
1 492
Name: Class, dtype: int64
/tmp/ipykernel_1165/1221864664.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-


docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df3.drop(index,inplace=True)
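Because sample() above is unseeded and df3 = df2 is an alias rather than a copy, reruns give a different subset and mutate df2 as well; a hedged sketch of a reproducible, non-aliasing version (the random_state value is an arbitrary choice of mine):

[ ]: df3 = df2.copy()  # a real copy, so df2 is left untouched
drop_idx = df3[df3['Class'] == 0].sample(n=278000, random_state=42).index
df3 = df3.drop(drop_idx)
print(df3['Class'].value_counts())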

[19]: X = df3.iloc[:, [2, 3]]  # columns V7 and V8
y = pd.factorize(df3['Class'])[0]  # flat 0/1 label array

[20]: from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=1, stratify=y)

[21]: print(np.bincount(y))
print(np.bincount(y_train))
print(np.bincount(y_test))

[6315 492]
[4420 344]
[1895 148]
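The bincounts above can be turned into fractions to confirm that stratify=y preserved the fraud ratio across the splits; a quick sketch:

[ ]: for name, arr in [('all', y), ('train', y_train), ('test', y_test)]:
    print(name, np.bincount(arr)[1] / len(arr))
## about 7.2% fraud in each split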

[22]: from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

[23]: # Decision Tree
from sklearn.tree import DecisionTreeClassifier

# entropy gave better results; I experimented with depth (a depth of 6 was
# also tried), and max_depth=4 is what is used below
tree_model = DecisionTreeClassifier(criterion='entropy', max_depth=4,
                                    random_state=1)

tree_model.fit(X_train, y_train)
y_pred_test = tree_model.predict(X_test)
y_pred_train = tree_model.predict(X_train)

[24]: from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True)
print(kf)

KFold(n_splits=10, random_state=None, shuffle=True)

[25]: from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

scores = []

for i, (train_index, val_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    #print(f"  Train: index={train_index}")
    #print(f"  Test:  index={val_index}")

    # split the training set from the validation set for this specific fold
    X_train1 = pd.DataFrame(X).iloc[train_index, :]
    X_val1 = pd.DataFrame(X).iloc[val_index, :]
    y_train1 = pd.DataFrame(y).iloc[train_index]
    y_val1 = pd.DataFrame(y).iloc[val_index]

    tree_model.fit(X_train1, y_train1)
    score = tree_model.score(X_val1, y_val1)
    print("fold :", i, "score: ", score)

    y_pred_test1 = tree_model.predict(X_val1)
    confmax = confusion_matrix(y_val1, y_pred_test1)
    #print(confmax)

    scores.append(score)

print("Average: ", sum(scores) / len(scores))

# k-fold resulted in a slight increase of accuracy

fold : 0 score: 0.9676945668135095


fold : 1 score: 0.947136563876652
fold : 2 score: 0.9647577092511013
fold : 3 score: 0.9456681350954479
fold : 4 score: 0.947136563876652
fold : 5 score: 0.9500734214390602

fold : 6 score: 0.9632892804698973
fold : 7 score: 0.95
fold : 8 score: 0.9661764705882353
fold : 9 score: 0.9426470588235294
Average: 0.9544579770234087
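The manual fold loop above is equivalent (up to fold assignment) to scikit-learn's one-liner; a sketch using cross_val_score with the same kf splitter:

[ ]: from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(tree_model, X, y, cv=kf)
print(cv_scores.mean())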

[26]: # feature-subset experiments:
# using 8 features [0,1,2,3,4,5,6,7]: 0.9319663401331882
# using 7 features [0,1,2,3,4,5,6]:   0.9304331542665871
# using 6 features [0,1,2,3,4,5]:     0.9277530049272614
# using 5 features [0,1,2,3,4]:       0.9131350725044767
# using 4 features [0,1,2,3]:         0.9462799761244338
# using 4 other features [2,3,4,5]:   0.9397961213908688
# using 2 features [2,3]:             0.963004576150182
# using 2 other features [4,5]:       0.8590639373617499

[27]: from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# unlike spam filtering, where false positives (important mail flagged as
# spam) matter most, for credit card fraud we want to reduce the number of
# false negatives -- the bottom-left cell of the matrix below (frauds
# predicted as non-fraud)

tree_confmat = confusion_matrix(y_test, y_pred_test)

# class 0 (non-fraud) is the negative label, so it comes first
cmd = ConfusionMatrixDisplay(tree_confmat, display_labels=['N', 'P'])
cmd.plot()
plt.show()
print(tree_confmat)

[[1870 25]
[ 71 77]]
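With scikit-learn's ordering (rows are true labels, columns are predictions, class 0 first), the four cells of a binary confusion matrix can be unpacked by name; a minimal sketch:

[ ]: tn, fp, fn, tp = tree_confmat.ravel()
print('TN:', tn, 'FP:', fp, 'FN:', fn, 'TP:', tp)
## here FN = 71 -- the frauds the model missed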

[28]: from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print('Test:')
print('Test Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred_test))
print('Training Accuracy: %.3f' % accuracy_score(y_true=y_train, y_pred=y_pred_train))
print('Error: {}'.format(round(1 - accuracy_score(y_true=y_test, y_pred=y_pred_test), 3)))

# specificity = TN / (TN + FP); class 0 (non-fraud) is the negative class
print('Specificity: {}'.format(round(tree_confmat[0][0] /
                                     (tree_confmat[0][0] + tree_confmat[0][1]), 3)))
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_test))
print('Recall/Sensitivity: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_test))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_test))
print()

# null accuracy: always predicting the majority class
null_acc = each_class[0] / (each_class[0] + each_class[1])
print('Null Accuracy: %.3f' % null_acc)

# I tried to get a better F1 score and recall because they are important
# for fraud detection

Test:
Test Accuracy: 0.953
Training Accuracy: 0.958
Error: 0.047
Specificity: 0.987
Precision: 0.755
Recall/Sensitivity: 0.520
F1: 0.616

Null Accuracy: 0.928
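scikit-learn can print most of these metrics in one call; a sketch using classification_report (the target_names are my own labels):

[ ]: from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test,
                            target_names=['non-fraud', 'fraud']))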

[29]: import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = \
    learning_curve(estimator=tree_model,
                   X=X_train,
                   y=y_train,
                   train_sizes=np.linspace(0.1, 1.0, 10),
                   cv=10,
                   n_jobs=1)

#print(train_sizes)
#print(train_scores)
#print(test_scores)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.axhline(y=null_acc, color='black', linestyle='--', label='Null accuracy')

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')
plt.axis([750, 5000, 0, 1.3])

plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.0, 1.1])
plt.tight_layout()
# plt.savefig('images/06_05.png', dpi=300)
plt.show()

# I tried to keep a balance between variance and bias to prevent
# overfitting and underfitting

[30]: from sklearn.metrics import roc_curve, auc

fig = plt.figure(figsize=(7, 5))

### TRAINING ROC CURVE
train_probas = tree_model.fit(X_train, y_train).predict_proba(X_train)

fpr, tpr, thresholds = roc_curve(y_train,
                                 train_probas[:, 1],
                                 pos_label=1)

#print("X_train2: ", X_train)
#print("train_prob: ", train_probas)
#print("train_fpr: ", fpr)
#print("train_tpr: ", tpr)
#print("train_thr: ", thresholds)

roc_auc = auc(fpr, tpr)

plt.step(fpr,
         tpr,
         label='Train ROC (area = %0.2f)' % (roc_auc))

### TEST ROC CURVE
test_probas = tree_model.predict_proba(X_test)
#test_probas = pipe_knn.predict_proba(X_test[:, :])

fpr, tpr, thresholds = roc_curve(y_test,
                                 test_probas[:, 1],
                                 pos_label=1)

#print("test_fpr: ", fpr)
#print("test_tpr: ", tpr)
#print("test_thr: ", thresholds)

roc_auc = auc(fpr, tpr)

plt.step(fpr,
         tpr,
         where='post',
         label='Test ROC (area = %0.2f)' % (roc_auc))

plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='Random guessing')

plt.plot([0, 0, 1],
         [0, 1, 1],
         linestyle=':',
         color='black',
         label='Perfect performance')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")

plt.tight_layout()
plt.show()

[31]: #KNN

[32]: X_knn = df3.iloc[:, [2, 3]]  # V7 and V8, as before
y_knn = pd.factorize(df3['Class'])[0]

[33]: from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=15,
                           p=2,
                           metric='minkowski')

X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(
    X_knn, y_knn, test_size=.3, random_state=1, stratify=y_knn)

[34]: sc.fit(X_train_k)
X_train_std_k = sc.transform(X_train_k)
X_test_std_k = sc.transform(X_test_k)

[35]: X_combined_std_k = np.vstack((X_train_std_k, X_test_std_k))
y_combined_k = np.hstack((y_train_k, y_test_k))

knn.fit(X_train_std_k, y_train_k)

[35]: KNeighborsClassifier(n_neighbors=15)

[36]: y_pred_test_k = knn.predict(X_test_std_k)
# note: this predicts on the unscaled training data even though knn was fit
# on standardized data -- X_train_std_k should be used (hence the warning below)
y_pred_train_k = knn.predict(X_train_k)

/opt/conda/lib/python3.10/site-packages/sklearn/base.py:402: UserWarning: X has


feature names, but KNeighborsClassifier was fitted without feature names
warnings.warn(
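The warning appears because the model was fit on scaled arrays but asked to predict on an unscaled DataFrame. A Pipeline bundles the scaler and the classifier so every predict call sees consistently scaled input; a hedged sketch (the commented-out pipe_knn in a later cell hints at the same idea, and y_pred_train_k2 is a new name so the recorded results above are untouched):

[ ]: from sklearn.pipeline import make_pipeline
pipe_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=15))
pipe_knn.fit(X_train_k, y_train_k)
y_pred_train_k2 = pipe_knn.predict(X_train_k)  # scaling handled internally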

[37]: from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True)
print(kf)

[38]: scores = []

for i, (train_index, val_index) in enumerate(kf.split(X_knn)):
    #print(f"Fold {i}:")
    #print(f"  Train: index={train_index}")
    #print(f"  Test:  index={val_index}")

    # split the training set from the validation set for this specific fold
    X_train = pd.DataFrame(X_knn).iloc[train_index, :]
    X_val = pd.DataFrame(X_knn).iloc[val_index, :]
    y_train = pd.DataFrame(y_knn).iloc[train_index]
    y_val = pd.DataFrame(y_knn).iloc[val_index]

    # passing a column-vector y is what triggers the DataConversionWarnings
    # below; y_train.values.ravel() would silence them
    knn.fit(X_train, y_train)
    score = knn.score(X_val, y_val)
    print("fold :", i, "score: ", score)

    y_pred_test = knn.predict(X_val)
    confmax = confusion_matrix(y_val, y_pred_test)
    #print(confmax)

    scores.append(score)

print("Average: ", sum(scores) / len(scores))

/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
fold : 0 score: 0.9588839941262849
fold : 1 score: 0.9500734214390602
fold : 2 score: 0.9574155653450808
fold : 3 score: 0.9441997063142438
fold : 4 score: 0.9618208516886931
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
fold : 5 score: 0.9559471365638766
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-

packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
fold : 6 score: 0.9588839941262849
fold : 7 score: 0.9514705882352941
fold : 8 score: 0.9514705882352941
fold : 9 score: 0.9544117647058824
Average: 0.9544577610779996
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)

[39]: knn_confmat = confusion_matrix(y_test_k, y_pred_test_k)
knn_confmat

[39]: array([[1870,   25],
             [  68,   80]])

[40]: print('Test:')
print('Test Accuracy: %.3f' % accuracy_score(y_true=y_test_k, y_pred=y_pred_test_k))
print('Training Accuracy: %.3f' % accuracy_score(y_true=y_train_k, y_pred=y_pred_train_k))
print('Precision: %.3f' % precision_score(y_true=y_test_k, y_pred=y_pred_test_k))
print('Recall: %.3f' % recall_score(y_true=y_test_k, y_pred=y_pred_test_k))
print('F1: %.3f' % f1_score(y_true=y_test_k, y_pred=y_pred_test_k))
print()
print('Null Accuracy: %.3f' % null_acc)

Test:
Test Accuracy: 0.954
Training Accuracy: 0.948
Precision: 0.762
Recall: 0.541
F1: 0.632

Null Accuracy: 0.928

[41]: import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = \
    learning_curve(estimator=knn,
                   X=X_train_k,
                   y=y_train_k,
                   train_sizes=np.linspace(0.1, 1.0, 10),
                   cv=10,
                   n_jobs=1)

#print(train_sizes)
#print(train_scores)
#print(test_scores)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.axhline(y=null_acc, color='black', linestyle='--')

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')
plt.axis([750, 5000, .8, 1.3])

plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.0, 1.0])
plt.tight_layout()
# plt.savefig('images/06_05.png', dpi=300)
plt.show()

[42]: from sklearn.metrics import roc_curve, auc

fig = plt.figure(figsize=(7, 5))

### TRAINING ROC CURVE
train_probas = knn.fit(X_train_k, y_train_k).predict_proba(X_train_k)

fpr, tpr, thresholds = roc_curve(y_train_k,
                                 train_probas[:, 1],
                                 pos_label=1)

#print("X_train2: ", X_train_k)
#print("train_prob: ", train_probas)
#print("train_fpr: ", fpr)
#print("train_tpr: ", tpr)
#print("train_thr: ", thresholds)

roc_auc = auc(fpr, tpr)

plt.step(fpr,
         tpr,
         label='Train ROC (area = %0.2f)' % (roc_auc))

### TEST ROC CURVE
test_probas = knn.predict_proba(X_test_k)
#test_probas = pipe_knn.predict_proba(X_test[:, :])

fpr, tpr, thresholds = roc_curve(y_test_k,
                                 test_probas[:, 1],
                                 pos_label=1)

#print("test_fpr: ", fpr)
#print("test_tpr: ", tpr)
#print("test_thr: ", thresholds)

roc_auc = auc(fpr, tpr)

plt.step(fpr,
         tpr,
         where='post',
         label='Test ROC (area = %0.2f)' % (roc_auc))

plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='Random guessing')

plt.plot([0, 0, 1],
         [0, 1, 1],
         linestyle=':',
         color='black',
         label='Perfect performance')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")

plt.tight_layout()
plt.show()

[43]: #SVM

[44]: # balanced data from the earlier downsampling
X_svm = df3.iloc[:, [2, 3]]  # V7 and V8 again
y_svm = pd.factorize(df3['Class'])[0]

[45]: from sklearn.svm import SVC

# I experimented with C (.1, 1, 10, 100): C=10 had better results, and with
# C=100 I got an inverted ROC curve; C=1 is what is used below
svm = SVC(kernel='rbf', C=1, random_state=1, probability=True)

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_svm, y_svm, test_size=.3, random_state=1, stratify=y_svm)

[46]: from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True)
print(kf)

[47]: scores = []

for i, (train_index, val_index) in enumerate(kf.split(X_svm)):
    #print(f"Fold {i}:")
    #print(f"  Train: index={train_index}")
    #print(f"  Test:  index={val_index}")

    # split the training set from the validation set for this specific fold
    X_train3 = pd.DataFrame(X_svm).iloc[train_index, :]
    X_val3 = pd.DataFrame(X_svm).iloc[val_index, :]
    y_train3 = pd.DataFrame(y_svm).iloc[train_index]
    y_val3 = pd.DataFrame(y_svm).iloc[val_index]

    # note: this fits knn, not svm, so the fold scores below are still KNN
    # scores; swap in svm here to actually cross-validate the SVM
    knn.fit(X_train3, y_train3)
    score = knn.score(X_val3, y_val3)
    print("fold :", i, "score: ", score)

    y_pred_test3 = knn.predict(X_val3)
    confmax = confusion_matrix(y_val3, y_pred_test3)
    #print(confmax)

    scores.append(score)

print("Average: ", sum(scores) / len(scores))

fold : 0 score: 0.9515418502202643


fold : 1 score: 0.9632892804698973
fold : 2 score: 0.9530102790014684
fold : 3 score: 0.9500734214390602

fold : 4 score: 0.947136563876652
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
fold : 5 score: 0.9574155653450808
fold : 6 score: 0.9618208516886931
fold : 7 score: 0.9573529411764706
fold : 8 score: 0.95
fold : 9 score: 0.9676470588235294
Average: 0.9559287812041115
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().

return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)

[48]: sc.fit(X_train_s)
X_train_std_s = sc.transform(X_train_s)
X_test_std_s = sc.transform(X_test_s)
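Rather than hand-iterating over C as described in cell [45], GridSearchCV can search the same values with cross-validation; a hedged sketch (choosing F1 as the scoring metric is my own call, given the fraud setting):

[ ]: from sklearn.model_selection import GridSearchCV
param_grid = {'C': [.1, 1, 10, 100]}
gs = GridSearchCV(SVC(kernel='rbf', random_state=1),
                  param_grid, cv=5, scoring='f1')
gs.fit(X_train_std_s, y_train_s)
print(gs.best_params_, gs.best_score_)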

[49]: svm.fit(X_train_std_s, y_train_s)

X_combined_std_s = np.vstack((X_train_std_s, X_test_std_s))
y_combined_s = np.hstack((y_train_s, y_test_s))

y_pred_test_s = svm.predict(X_test_std_s)
# note: this predicts on the unscaled training data even though svm was fit
# on standardized data -- X_train_std_s should be used (hence the warning below)
y_pred_train_s = svm.predict(X_train_s)

#plot_decision_regions(X_combined_std, y_combined,
#                      classifier=svm, test_idx=range(105, 150))
#plt.xlabel('petal length [standardized]')
#plt.ylabel('petal width [standardized]')
#plt.legend(loc='upper left')
#plt.tight_layout()
##plt.savefig('images/03_24.png', dpi=300)
#plt.show()

/opt/conda/lib/python3.10/site-packages/sklearn/base.py:402: UserWarning: X has


feature names, but SVC was fitted without feature names
warnings.warn(

[50]: # note: these metrics re-evaluate the KNN predictions (the _k variables);
# to evaluate the SVM, use y_test_s and y_pred_test_s instead
print('Test:')
print('Test Accuracy: %.3f' % accuracy_score(y_true=y_test_k, y_pred=y_pred_test_k))
print('Training Accuracy: %.3f' % accuracy_score(y_true=y_train_k, y_pred=y_pred_train_k))
print('Precision: %.3f' % precision_score(y_true=y_test_k, y_pred=y_pred_test_k))
print('Recall: %.3f' % recall_score(y_true=y_test_k, y_pred=y_pred_test_k))
print('F1: %.3f' % f1_score(y_true=y_test_k, y_pred=y_pred_test_k))
print()
print('Null Accuracy: %.3f' % null_acc)

Test:
Test Accuracy: 0.954
Training Accuracy: 0.948
Precision: 0.762
Recall: 0.541
F1: 0.632

Null Accuracy: 0.928

[51]: svm_confmat = confusion_matrix(y_test_s, y_pred_test_s)
svm_confmat

[51]: array([[1868,   27],
             [  66,   82]])

[56]: import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = \
    learning_curve(estimator=svm,
                   X=X_train_s,
                   y=y_train_s,
                   train_sizes=np.linspace(0.1, 1.0, 10),
                   cv=10,
                   n_jobs=1)

#print(train_sizes)
#print(train_scores)
#print(test_scores)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.axhline(y=null_acc, color='black', linestyle='--', label='Null accuracy')

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')
plt.axis([750, 5000, .8, 1.3])

plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.0, 1.0])
plt.tight_layout()
# plt.savefig('images/06_05.png', dpi=300)
plt.show()

[57]: from sklearn.metrics import roc_curve, auc

fig = plt.figure(figsize=(7, 5))

### TRAINING ROC CURVE
train_probas = svm.fit(X_train_s, y_train_s).predict_proba(X_train_s)

fpr, tpr, thresholds = roc_curve(y_train_s,
                                 train_probas[:, 1],
                                 pos_label=1)

#print("X_train2: ", X_train_s)
#print("train_prob: ", train_probas)
#print("train_fpr: ", fpr)
#print("train_tpr: ", tpr)
#print("train_thr: ", thresholds)

roc_auc = auc(fpr, tpr)

plt.step(fpr,
         tpr,
         label='Train ROC (area = %0.2f)' % (roc_auc))

### TEST ROC CURVE
test_probas = svm.predict_proba(X_test_s)
#test_probas = pipe_knn.predict_proba(X_test[:, :])

fpr, tpr, thresholds = roc_curve(y_test_s,
                                 test_probas[:, 1],
                                 pos_label=1)

#print("test_fpr: ", fpr)
#print("test_tpr: ", tpr)
#print("test_thr: ", thresholds)

roc_auc = auc(fpr, tpr)

plt.step(fpr,
         tpr,
         where='post',
         label='Test ROC (area = %0.2f)' % (roc_auc))

plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='Random guessing')

plt.plot([0, 0, 1],
         [0, 1, 1],
         linestyle=':',
         color='black',
         label='Perfect performance')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")

plt.tight_layout()
plt.show()

[ ]: # summary

[ ]: # I would use the KNN model: its accuracy was good, the area under its
# ROC curve was good, and its precision was solid as well.

# The model is also not over- or underfit.

# DT and SVM seemed to be less accurate and probably underfit. I struggled
# to get a good result while iterating through the parameters and
# hyperparameters.

# The ROC AUC score was especially low for SVM; SVM was hard to get good
# results with.

# Keeping about 6,000 samples of class 0 was the best option for me because
# it took less time to compute on my laptop, and the data needed to be
# balanced anyway.
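To back the model choice with numbers, the three test-set F1 scores can be tabulated from the predictions still in memory; a minimal sketch (note the SVM row uses the _s predictions that cell [50] should have evaluated):

[ ]: summary = pd.DataFrame({
    'model': ['Decision Tree', 'KNN', 'SVM'],
    'test F1': [f1_score(y_test, y_pred_test),
                f1_score(y_test_k, y_pred_test_k),
                f1_score(y_test_s, y_pred_test_s)]})
print(summary)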
