
blankEDA

October 9, 2023

0.1 Exploratory Data Analysis


[1]: import numpy as np  # used below; imported here so the cell is self-contained
import matplotlib  # to check recent matplotlib compatibility
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from distutils.version import LooseVersion


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot class examples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.8,
                    color=colors[idx],
                    marker=markers[idx],
                    label=cl,
                    edgecolor='black')

    # highlight test examples
    if test_idx:
        # plot all examples
        X_test, y_test = X[test_idx, :], y[test_idx]

        # older matplotlib accepted c='' for unfilled markers; newer
        # versions want c='none'
        if LooseVersion(matplotlib.__version__) < LooseVersion('0.3.4'):
            plt.scatter(X_test[:, 0],
                        X_test[:, 1],
                        c='',
                        edgecolor='black',
                        alpha=1.0,
                        linewidth=1,
                        marker='o',
                        s=100,
                        label='test set')
        else:
            plt.scatter(X_test[:, 0],
                        X_test[:, 1],
                        c='none',
                        edgecolor='black',
                        alpha=1.0,
                        linewidth=1,
                        marker='o',
                        s=100,
                        label='test set')

0.1.1 Import libraries

[2]: import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
import copy
sns.set()

0.1.2 1. Read in the data set and perform preliminary checks


[3]: # read the data set
df = pd.read_csv('creditcard (1).csv')

# how many samples do we have?
## 284807

# how many features do we have?
## 30 (Time, V1-V28, and Amount); Class is the target

# what are the classes?
classes = pd.unique(df['Class'])
print(classes)
## 0 and 1

# how many classes do we have?
## 2

# how many samples of each class? are they balanced?
## class 0 has 284315 samples and class 1 has 492, so they are far from balanced

# what are the names of the features?
## Time, V1-V28, and Amount

# what are the types of the features?
print(df.info())
## all features are float64 and the class label is int64

# summarize your findings
## The data is very imbalanced, but the dtypes look good and there are no nulls.

[0 1]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Time 284807 non-null float64
1 V1 284807 non-null float64
2 V2 284807 non-null float64
3 V3 284807 non-null float64
4 V4 284807 non-null float64
5 V5 284807 non-null float64
6 V6 284807 non-null float64
7 V7 284807 non-null float64
8 V8 284807 non-null float64
9 V9 284807 non-null float64
10 V10 284807 non-null float64
11 V11 284807 non-null float64
12 V12 284807 non-null float64
13 V13 284807 non-null float64
14 V14 284807 non-null float64
15 V15 284807 non-null float64
16 V16 284807 non-null float64
17 V17 284807 non-null float64
18 V18 284807 non-null float64
19 V19 284807 non-null float64

20 V20 284807 non-null float64
21 V21 284807 non-null float64
22 V22 284807 non-null float64
23 V23 284807 non-null float64
24 V24 284807 non-null float64
25 V25 284807 non-null float64
26 V26 284807 non-null float64
27 V27 284807 non-null float64
28 V28 284807 non-null float64
29 Amount 284807 non-null float64
30 Class 284807 non-null int64
dtypes: float64(30), int64(1)
memory usage: 67.4 MB
None
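The imbalance called out above can be made precise with normalized value counts; a quick sketch, not part of the original run:

[ ]: # proportion of each class
print(df['Class'].value_counts(normalize=True))
## roughly 99.8% class 0 vs 0.2% class 1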

0.1.3 2. Check missing values, duplicates, outliers and statistics

[4]: # are there any missing/nan/null values?
# note: without reassignment or inplace=True, this drop returns a copy and
# leaves df unchanged
df.drop(df.index[148636])
display(df.isna().sum().to_frame())
## no missing values
# if so, what are you going to do about the missing/nan/null values?
## nothing is needed -- there are none

0
Time 0
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0

V24 0
V25 0
V26 0
V27 0
V28 0
Amount 0
Class 0
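The section heading also mentions duplicates, which the cell above never checks explicitly (the drop of row 148636 may have been aimed at one); a minimal sketch using pandas' duplicated():

[ ]: # count exact duplicate rows; drop them if desired
print(df.duplicated().sum())
# df = df.drop_duplicates()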

[5]: # are there outliers?
df1 = df.loc[:, ~df.columns.isin(['Time','Amount','Class'])]
df1.boxplot(figsize=(15, 6))
plt.show()
df1.describe().T

# if so, what are you going to do about the outliers?
## I will cap them at the IQR fences (replacing each outlier with the
## nearest fence value)

# are there anomalies in the statistics?
## Yes

# if so, what are you going to do about the anomalies?
## I will replace the anomalies the same way

# what is the frequency distribution of each class?
## class 0 occurs far more frequently than class 1
each_class = df['Class'].value_counts()
print(each_class)

# summarize your findings
## There are a lot of outliers and anomalies, and the data is very imbalanced.

0 284315
1 492
Name: Class, dtype: int64

[6]: # calculate Q1 and Q3 to get the interquartile range (IQR)
Q1 = df1.quantile(0.25)
Q3 = df1.quantile(0.75)
IQR = Q3 - Q1
print(Q1, Q3, IQR)

V1 -0.920373
V2 -0.598550
V3 -0.890365
V4 -0.848640
V5 -0.691597
V6 -0.768296
V7 -0.554076
V8 -0.208630
V9 -0.643098
V10 -0.535426
V11 -0.762494
V12 -0.405571
V13 -0.648539
V14 -0.425574
V15 -0.582884
V16 -0.468037
V17 -0.483748
V18 -0.498850
V19 -0.456299
V20 -0.211721
V21 -0.228395
V22 -0.542350
V23 -0.161846
V24 -0.354586
V25 -0.317145
V26 -0.326984
V27 -0.070840
V28 -0.052960
Name: 0.25, dtype: float64 V1 1.315642
V2 0.803724
V3 1.027196
V4 0.743341
V5 0.611926
V6 0.398565
V7 0.570436
V8 0.327346
V9 0.597139
V10 0.453923

V11 0.739593
V12 0.618238
V13 0.662505
V14 0.493150
V15 0.648821
V16 0.523296
V17 0.399675
V18 0.500807
V19 0.458949
V20 0.133041
V21 0.186377
V22 0.528554
V23 0.147642
V24 0.439527
V25 0.350716
V26 0.240952
V27 0.091045
V28 0.078280
Name: 0.75, dtype: float64 V1 2.236015
V2 1.402274
V3 1.917560
V4 1.591981
V5 1.303524
V6 1.166861
V7 1.124512
V8 0.535976
V9 1.240237
V10 0.989349
V11 1.502088
V12 1.023810
V13 1.311044
V14 0.918724
V15 1.231705
V16 0.991333
V17 0.883423
V18 0.999657
V19 0.915248
V20 0.344762
V21 0.414772
V22 1.070904
V23 0.309488
V24 0.794113
V25 0.667861
V26 0.567936
V27 0.161885
V28 0.131240
dtype: float64

[7]: # count the points outside the IQR fences in each column
n_below = ((df1 < (Q1 - 1.5 * IQR)).sum())
n_above = ((df1 > (Q3 + 1.5 * IQR)).sum())
print(n_below)
print(n_above)

# anything beyond these fences is an outlier
print(Q1 - 1.5 * IQR)
print(Q3 + 1.5 * IQR)

V1 7062
V2 8430
V3 3343
V4 2243
V5 3884
V6 1752
V7 4810
V8 12237
V9 2439
V10 3368
V11 119
V12 14579
V13 1139
V14 8757
V15 2465
V16 6521
V17 757
V18 3986
V19 5063
V20 8683
V21 6438
V22 926
V23 8170
V24 4638
V25 3688
V26 729
V27 19544
V28 18542
dtype: int64
V1 0
V2 5096
V3 20
V4 8905
V5 8411
V6 21213
V7 4138
V8 11897

V9 5844
V10 6128
V11 661
V12 769
V13 2229
V14 5392
V15 429
V16 1663
V17 6663
V18 3547
V19 5142
V20 19087
V21 8059
V22 391
V23 10371
V24 136
V25 1679
V26 4867
V27 19619
V28 11800
dtype: int64
V1 -4.274396
V2 -2.701961
V3 -3.766705
V4 -3.236612
V5 -2.646882
V6 -2.518586
V7 -2.240844
V8 -1.012593
V9 -2.503452
V10 -2.019449
V11 -3.015626
V12 -1.941286
V13 -2.615106
V14 -1.803660
V15 -2.430442
V16 -1.955036
V17 -1.808883
V18 -1.998335
V19 -1.829171
V20 -0.728865
V21 -0.850553
V22 -2.148706
V23 -0.626079
V24 -1.545755
V25 -1.318936
V26 -1.178888
V27 -0.313667

V28 -0.249819
dtype: float64
V1 4.669664
V2 2.907135
V3 3.903536
V4 3.131313
V5 2.567212
V6 2.148856
V7 2.257204
V8 1.131309
V9 2.457494
V10 1.937947
V11 2.992725
V12 2.153952
V13 2.629071
V14 1.871236
V15 2.496378
V16 2.010296
V17 1.724810
V18 2.000292
V19 1.831822
V20 0.650184
V21 0.808535
V22 2.134910
V23 0.611875
V24 1.630696
V25 1.352506
V26 1.092856
V27 0.333872
V28 0.275140
dtype: float64

[8]: # cap each column at its IQR fences
for col in df1.columns:
    q1 = df1[col].quantile(0.25)
    q3 = df1[col].quantile(0.75)
    iqr = q3 - q1
    print(q1, q3, iqr)
    df1.loc[(df1[col] < (q1 - 1.5 * iqr)), col] = (q1 - 1.5 * iqr)
    df1.loc[(df1[col] > (q3 + 1.5 * iqr)), col] = (q3 + 1.5 * iqr)

-0.920373384390322 1.315641693877865 2.236015078268187


-0.598549913464916 0.8037238712400945 1.4022737847050104
-0.8903648381551406 1.027195542465555 1.9175603806206956
-0.848640116331273 0.7433412894685876 1.5919814057998605
-0.6915970708876575 0.611926439735193 1.3035235106228504
-0.768295608460489 0.39856489635610504 1.166860504816594
-0.5540758790365226 0.5704360728775986 1.1245119519141211
-0.2086297440394665 0.327345861923449 0.5359756059629155

-0.6430975702665915 0.5971390302822686 1.24023660054886
-0.5354257264933235 0.453923445139507 0.9893491716328305
-0.7624941955129775 0.739593407321606 1.5020876028345835
-0.40557148544041355 0.618238032946136 1.0238095183865497
-0.6485392991145684 0.662504959439974 1.3110442585545425
-0.4255740124549935 0.493149849218149 0.9187238616731425
-0.582884279157456 0.648820806317158 1.2317050854746139
-0.46803676671289796 0.523296312475344 0.991333079188242
/tmp/ipykernel_1165/417446864.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-


docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1.loc[(df1[col] < (q1 - 1.5 * iqr)), col] = (q1 - 1.5 * iqr)
/tmp/ipykernel_1165/417446864.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-


docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1.loc[(df1[col] > (q3 + 1.5 * iqr)), col] = (q3 + 1.5 * iqr)
-0.483748313707048 0.3996749826503845 0.8834232963574324
-0.498849798665041 0.5008067468872159 0.9996565455522569
-0.4562989187444475 0.458949355762679 0.9152482745071264
-0.21172136467424701 0.1330408409942945 0.34476220566854154
-0.22839494677851702 0.1863772033785755 0.4147721501570925
-0.5423503726606616 0.5285536353339865 1.0709040079946481
-0.16184634501488449 0.14764206385605 0.3094884088709345
-0.3545861364094985 0.439526600168186 0.7941127365776846
-0.31714505406527 0.350715562867386 0.667860616932656
-0.3269839258807195 0.2409521737147555 0.567936099595475
-0.07083952930446921 0.09104511968580689 0.1618846489902761
-0.0529597930169809 0.07827995475782015 0.13123974777480105
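As an aside, the per-column loop above can be collapsed into one vectorized call; a sketch assuming the Q1/Q3/IQR Series from cell [6], where DataFrame.clip() caps every column at its fences and the explicit .copy() avoids the SettingWithCopyWarning:

[ ]: df1 = df.loc[:, ~df.columns.isin(['Time','Amount','Class'])].copy()
df1 = df1.clip(lower=Q1 - 1.5 * IQR, upper=Q3 + 1.5 * IQR, axis=1)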

[9]: # verify that no points remain outside the fences
n_below = ((df1 < (Q1 - 1.5 * IQR)).sum())
n_above = ((df1 > (Q3 + 1.5 * IQR)).sum())
print(n_below)
print(n_above)

V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0

V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0
V28 0
dtype: int64
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0

V28 0
dtype: int64

0.1.4 3. Check distributions and correlations


[10]: # what kind of distributions do the features have?
## fairly normal, bell-shaped distributions after the capping
# what is your interpretation?
## the features already look centered near zero and roughly scaled

plt.figure(figsize= (15,20))
plt.subplot(3, 3, 1)
plt.hist(df1['V1'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V1')

plt.subplot(3,3,2)
plt.hist(df1['V2'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V2')

plt.subplot(3,3,3)
plt.hist(df1['V3'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V3')

plt.subplot(3,3,4)
plt.hist(df1['V4'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V4')

plt.subplot(3, 3, 5)
plt.hist(df1['V5'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V5')

plt.subplot(3,3,6)
plt.hist(df1['V6'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V6')

plt.show()

[11]: plt.figure(figsize= (15,20))
plt.subplot(3,3,1)
plt.hist(df1['V7'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V7')

plt.subplot(3,3,2)
plt.hist(df1['V8'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V8')

plt.subplot(3,3,3)
plt.hist(df1['V9'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V9')

plt.subplot(3,3,4)
plt.hist(df1['V10'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V10')

plt.subplot(3,3,5)
plt.hist(df1['V11'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V11')

plt.subplot(3,3,6)
plt.hist(df1['V12'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V12')

plt.show()

[12]: plt.figure(figsize= (15,20))


plt.subplot(3,3,1)
plt.hist(df1['V13'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V13')

plt.subplot(3,3, 2)
plt.hist(df1['V14'], color='lightblue', edgecolor = 'black', alpha = 0.7)

plt.xlabel('V14')

plt.subplot(3,3,3)
plt.hist(df1['V15'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V15')

plt.subplot(3,3,4)
plt.hist(df1['V16'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V16')

plt.subplot(3,3,5)
plt.hist(df1['V17'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V17')

plt.subplot(3,3,6)
plt.hist(df1['V18'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V18')
plt.show()

[13]: plt.figure(figsize= (15,20))
plt.subplot(3,3,1)
plt.hist(df1['V19'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V19')

plt.subplot(3,3,2)
plt.hist(df1['V20'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V20')

plt.subplot(3,3,3)
plt.hist(df1['V21'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V21')

plt.subplot(3,3,4)
plt.hist(df1['V22'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V22')

plt.subplot(3,3,5)
plt.hist(df1['V23'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V23')

plt.subplot(3,3,6)
plt.hist(df1['V24'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V24')
plt.show()

[14]: plt.figure(figsize= (15,20))
plt.subplot(3,3,1)
plt.hist(df1['V25'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V25')

plt.subplot(3,3,2)
plt.hist(df1['V26'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V26')

plt.subplot(3,3,3)
plt.hist(df1['V27'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V27')

plt.subplot(3,3,4)
plt.hist(df1['V28'], color='lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('V28')

plt.show()
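The five plotting cells above repeat the same pattern feature by feature; a sketch that draws all 28 histograms in one loop (a 7x4 grid matches the 28 V columns):

[ ]: fig, axes = plt.subplots(7, 4, figsize=(15, 25))
for ax, col in zip(axes.ravel(), df1.columns):
    ax.hist(df1[col], color='lightblue', edgecolor='black', alpha=0.7)
    ax.set_xlabel(col)
plt.tight_layout()
plt.show()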


[15]: # what kind of correlation does the data exhibit?
## most feature pairs are only weakly correlated with each other
# what is your interpretation?
## certain features are correlated with Class noticeably better than others,
## making them more useful for predicting the class
df1['Class'] = df['Class']

plt.figure(figsize=(25, 25))
pairplot1 = sns.heatmap(df1.corr(), annot=True)
plt.show()

/tmp/ipykernel_1165/593985081.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-
docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1['Class'] = df['Class']
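To make the claim above concrete, the features can be ranked by the absolute value of their correlation with Class; a minimal sketch:

[ ]: corr_with_class = df1.corr()['Class'].drop('Class').abs()
print(corr_with_class.sort_values(ascending=False).head(10))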

[16]: # did you perform feature engineering (drop irrelevant features/columns,
# for example)?
df2 = df1[['V2','V6','V7','V8','V21','V22','V27','V28','Class']]
plt.figure(figsize=(8, 7))
pairplot2 = sns.heatmap(df2.corr(), annot=True)
plt.show()

# summarize your findings
## I chose the features most correlated with Class to predict fraud and
## iterated through them to see which ones were best

[17]: each_class = df['Class'].value_counts()
print(each_class)

0 284315
1 492
Name: Class, dtype: int64

[18]: # balance the data, but keep somewhat more non-fraud cases because they
# are more common in reality; this sample size was the best trade-off
# between bias and variance for me

# note: df3 = df2 creates an alias, not a copy, so the in-place drop below
# also modifies df2; df2.copy() would avoid that (see the sketch after the
# output)
df3 = df2
index = df2[(df2['Class'] == 0)].sample(n=278000).index
df3.drop(index, inplace=True)

each_class = df3['Class'].value_counts()
print(each_class)

0 6315
1 492
Name: Class, dtype: int64
/tmp/ipykernel_1165/1221864664.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-


docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df3.drop(index,inplace=True)
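Because sample() above is unseeded and df3 = df2 is an alias rather than a copy, reruns give a different subset and mutate df2 as well; a hedged sketch of a reproducible, non-aliasing version (the random_state value is an arbitrary choice of mine):

[ ]: df3 = df2.copy()  # a real copy, so df2 is left untouched
drop_idx = df3[df3['Class'] == 0].sample(n=278000, random_state=42).index
df3 = df3.drop(drop_idx)
print(df3['Class'].value_counts())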

[19]: X = df3.iloc[:, [2, 3]]  # columns V7 and V8
y = pd.factorize(df3['Class'])[0]  # flat 0/1 label array

[20]: from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=1, stratify=y)

[21]: print(np.bincount(y))
print(np.bincount(y_train))
print(np.bincount(y_test))

[6315 492]
[4420 344]
[1895 148]
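The bincounts above can be turned into fractions to confirm that stratify=y preserved the fraud ratio across the splits; a quick sketch:

[ ]: for name, arr in [('all', y), ('train', y_train), ('test', y_test)]:
    print(name, np.bincount(arr)[1] / len(arr))
## about 7.2% fraud in each split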

[22]: from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

[23]: # Decision Tree
from sklearn.tree import DecisionTreeClassifier

# entropy gave better results; I experimented with depth (a depth of 6 was
# also tried), and max_depth=4 is what is used below
tree_model = DecisionTreeClassifier(criterion='entropy', max_depth=4,
                                    random_state=1)

tree_model.fit(X_train, y_train)
y_pred_test = tree_model.predict(X_test)
y_pred_train = tree_model.predict(X_train)

[24]: from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True)
print(kf)

KFold(n_splits=10, random_state=None, shuffle=True)

[25]: from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

scores = []

for i, (train_index, val_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    #print(f"  Train: index={train_index}")
    #print(f"  Test:  index={val_index}")

    # split the training set from the validation set for this specific fold
    X_train1 = pd.DataFrame(X).iloc[train_index, :]
    X_val1 = pd.DataFrame(X).iloc[val_index, :]
    y_train1 = pd.DataFrame(y).iloc[train_index]
    y_val1 = pd.DataFrame(y).iloc[val_index]

    tree_model.fit(X_train1, y_train1)
    score = tree_model.score(X_val1, y_val1)
    print("fold :", i, "score: ", score)

    y_pred_test1 = tree_model.predict(X_val1)
    confmax = confusion_matrix(y_val1, y_pred_test1)
    #print(confmax)

    scores.append(score)

print("Average: ", sum(scores) / len(scores))

# k-fold resulted in a slight increase of accuracy

fold : 0 score: 0.9676945668135095


fold : 1 score: 0.947136563876652
fold : 2 score: 0.9647577092511013
fold : 3 score: 0.9456681350954479
fold : 4 score: 0.947136563876652
fold : 5 score: 0.9500734214390602

fold : 6 score: 0.9632892804698973
fold : 7 score: 0.95
fold : 8 score: 0.9661764705882353
fold : 9 score: 0.9426470588235294
Average: 0.9544579770234087
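The manual fold loop above is equivalent (up to fold assignment) to scikit-learn's one-liner; a sketch using cross_val_score with the same kf splitter:

[ ]: from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(tree_model, X, y, cv=kf)
print(cv_scores.mean())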

[26]: # feature-subset experiments:
# using 8 features [0,1,2,3,4,5,6,7]: 0.9319663401331882
# using 7 features [0,1,2,3,4,5,6]:   0.9304331542665871
# using 6 features [0,1,2,3,4,5]:     0.9277530049272614
# using 5 features [0,1,2,3,4]:       0.9131350725044767
# using 4 features [0,1,2,3]:         0.9462799761244338
# using 4 other features [2,3,4,5]:   0.9397961213908688
# using 2 features [2,3]:             0.963004576150182
# using 2 other features [4,5]:       0.8590639373617499

[27]: from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# unlike spam filtering, where false positives (important mail flagged as
# spam) matter most, for credit card fraud we want to reduce the number of
# false negatives -- the bottom-left cell of the matrix below (frauds
# predicted as non-fraud)

tree_confmat = confusion_matrix(y_test, y_pred_test)

# class 0 (non-fraud) is the negative label, so it comes first
cmd = ConfusionMatrixDisplay(tree_confmat, display_labels=['N', 'P'])
cmd.plot()
plt.show()
print(tree_confmat)

[[1870 25]
[ 71 77]]
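With scikit-learn's ordering (rows are true labels, columns are predictions, class 0 first), the four cells of a binary confusion matrix can be unpacked by name; a minimal sketch:

[ ]: tn, fp, fn, tp = tree_confmat.ravel()
print('TN:', tn, 'FP:', fp, 'FN:', fn, 'TP:', tp)
## here FN = 71 -- the frauds the model missed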

[28]: from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print('Test:')
print('Test Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred_test))
print('Training Accuracy: %.3f' % accuracy_score(y_true=y_train, y_pred=y_pred_train))
print('Error: {}'.format(round(1 - accuracy_score(y_true=y_test, y_pred=y_pred_test), 3)))

# specificity = TN / (TN + FP); class 0 (non-fraud) is the negative class
print('Specificity: {}'.format(round(tree_confmat[0][0] /
                                     (tree_confmat[0][0] + tree_confmat[0][1]), 3)))
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_test))
print('Recall/Sensitivity: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_test))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_test))
print()

# null accuracy: always predicting the majority class
null_acc = each_class[0] / (each_class[0] + each_class[1])
print('Null Accuracy: %.3f' % null_acc)

# I tried to get a better F1 score and recall because they are important
# for fraud detection

Test:
Test Accuracy: 0.953
Training Accuracy: 0.958
Error: 0.047
Specificity: 0.987
Precision: 0.755
Recall/Sensitivity: 0.520
F1: 0.616

Null Accuracy: 0.928
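scikit-learn can print most of these metrics in one call; a sketch using classification_report (the target_names are my own labels):

[ ]: from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test,
                            target_names=['non-fraud', 'fraud']))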

[29]: import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = \
    learning_curve(estimator=tree_model,
                   X=X_train,
                   y=y_train,
                   train_sizes=np.linspace(0.1, 1.0, 10),
                   cv=10,
                   n_jobs=1)

#print(train_sizes)
#print(train_scores)
#print(test_scores)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.axhline(y=null_acc, color='black', linestyle='--', label='Null accuracy')

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')
plt.axis([750, 5000, 0, 1.3])

plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.0, 1.1])
plt.tight_layout()
# plt.savefig('images/06_05.png', dpi=300)
plt.show()

# I tried to keep a balance between variance and bias to prevent
# overfitting and underfitting

[30]: from sklearn.metrics import roc_curve, auc

fig = plt.figure(figsize=(7, 5))

### TRAINING ROC CURVE
train_probas = tree_model.fit(X_train, y_train).predict_proba(X_train)

fpr, tpr, thresholds = roc_curve(y_train,
                                 train_probas[:, 1],
                                 pos_label=1)

#print("X_train2: ", X_train)
#print("train_prob: ", train_probas)
#print("train_fpr: ", fpr)
#print("train_tpr: ", tpr)
#print("train_thr: ", thresholds)

roc_auc = auc(fpr, tpr)

plt.step(fpr,
         tpr,
         label='Train ROC (area = %0.2f)' % (roc_auc))

### TEST ROC CURVE
test_probas = tree_model.predict_proba(X_test)
#test_probas = pipe_knn.predict_proba(X_test[:, :])

fpr, tpr, thresholds = roc_curve(y_test,
                                 test_probas[:, 1],
                                 pos_label=1)

#print("test_fpr: ", fpr)
#print("test_tpr: ", tpr)
#print("test_thr: ", thresholds)

roc_auc = auc(fpr, tpr)

plt.step(fpr,
         tpr,
         where='post',
         label='Test ROC (area = %0.2f)' % (roc_auc))

plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='Random guessing')

plt.plot([0, 0, 1],
         [0, 1, 1],
         linestyle=':',
         color='black',
         label='Perfect performance')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")

plt.tight_layout()
plt.show()

[31]: #KNN

[32]: X_knn = df3.iloc[:, [2, 3]]  # V7 and V8, as before
y_knn = pd.factorize(df3['Class'])[0]

[33]: from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=15,
                           p=2,
                           metric='minkowski')

X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(
    X_knn, y_knn, test_size=.3, random_state=1, stratify=y_knn)

[34]: sc.fit(X_train_k)
X_train_std_k = sc.transform(X_train_k)
X_test_std_k = sc.transform(X_test_k)

[35]: X_combined_std_k = np.vstack((X_train_std_k, X_test_std_k))
y_combined_k = np.hstack((y_train_k, y_test_k))

knn.fit(X_train_std_k, y_train_k)

[35]: KNeighborsClassifier(n_neighbors=15)

[36]: y_pred_test_k = knn.predict(X_test_std_k)
# note: this predicts on the unscaled training data even though knn was fit
# on standardized data -- X_train_std_k should be used (hence the warning below)
y_pred_train_k = knn.predict(X_train_k)

/opt/conda/lib/python3.10/site-packages/sklearn/base.py:402: UserWarning: X has


feature names, but KNeighborsClassifier was fitted without feature names
warnings.warn(
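The warning appears because the model was fit on scaled arrays but asked to predict on an unscaled DataFrame. A Pipeline bundles the scaler and the classifier so every predict call sees consistently scaled input; a hedged sketch (the commented-out pipe_knn in a later cell hints at the same idea, and y_pred_train_k2 is a new name so the recorded results above are untouched):

[ ]: from sklearn.pipeline import make_pipeline
pipe_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=15))
pipe_knn.fit(X_train_k, y_train_k)
y_pred_train_k2 = pipe_knn.predict(X_train_k)  # scaling handled internally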

[37]: from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True)
print(kf)

[38]: scores = []

for i, (train_index, val_index) in enumerate(kf.split(X_knn)):
    #print(f"Fold {i}:")
    #print(f"  Train: index={train_index}")
    #print(f"  Test:  index={val_index}")

    # split the training set from the validation set for this specific fold
    X_train = pd.DataFrame(X_knn).iloc[train_index, :]
    X_val = pd.DataFrame(X_knn).iloc[val_index, :]
    y_train = pd.DataFrame(y_knn).iloc[train_index]
    y_val = pd.DataFrame(y_knn).iloc[val_index]

    # passing a column-vector y is what triggers the DataConversionWarnings
    # below; y_train.values.ravel() would silence them
    knn.fit(X_train, y_train)
    score = knn.score(X_val, y_val)
    print("fold :", i, "score: ", score)

    y_pred_test = knn.predict(X_val)
    confmax = confusion_matrix(y_val, y_pred_test)
    #print(confmax)

    scores.append(score)

print("Average: ", sum(scores) / len(scores))

/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
fold : 0 score: 0.9588839941262849
fold : 1 score: 0.9500734214390602
fold : 2 score: 0.9574155653450808
fold : 3 score: 0.9441997063142438
fold : 4 score: 0.9618208516886931
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
fold : 5 score: 0.9559471365638766
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-

packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
fold : 6 score: 0.9588839941262849
fold : 7 score: 0.9514705882352941
fold : 8 score: 0.9514705882352941
fold : 9 score: 0.9544117647058824
Average: 0.9544577610779996
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)

[39]: knn_confmat = confusion_matrix(y_test_k, y_pred_test_k)
knn_confmat

[39]: array([[1870,   25],
             [  68,   80]])

[40]: print('Test:')
print('Test Accuracy: %.3f' % accuracy_score(y_true=y_test_k, y_pred=y_pred_test_k))
print('Training Accuracy: %.3f' % accuracy_score(y_true=y_train_k, y_pred=y_pred_train_k))
print('Precision: %.3f' % precision_score(y_true=y_test_k, y_pred=y_pred_test_k))
print('Recall: %.3f' % recall_score(y_true=y_test_k, y_pred=y_pred_test_k))
print('F1: %.3f' % f1_score(y_true=y_test_k, y_pred=y_pred_test_k))
print()
print('Null Accuracy: %.3f' % null_acc)

Test:
Test Accuracy: 0.954
Training Accuracy: 0.948
Precision: 0.762
Recall: 0.541
F1: 0.632

Null Accuracy: 0.928

[41]: import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = \
    learning_curve(estimator=knn,
                   X=X_train_k,
                   y=y_train_k,
                   train_sizes=np.linspace(0.1, 1.0, 10),
                   cv=10,
                   n_jobs=1)

#print(train_sizes)
#print(train_scores)
#print(test_scores)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.axhline(y=null_acc, color='black', linestyle='--')

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')
plt.axis([750, 5000, .8, 1.3])

plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.0, 1.0])
plt.tight_layout()
# plt.savefig('images/06_05.png', dpi=300)
plt.show()

[42]: from sklearn.metrics import roc_curve, auc

fig = plt.figure(figsize=(7, 5))

### TRAINING ROC CURVE
train_probas = knn.fit(X_train_k, y_train_k).predict_proba(X_train_k)

fpr, tpr, thresholds = roc_curve(y_train_k,
                                 train_probas[:, 1],
                                 pos_label=1)

#print("X_train2: ", X_train_k)
#print("train_prob: ", train_probas)
#print("train_fpr: ", fpr)
#print("train_tpr: ", tpr)
#print("train_thr: ", thresholds)

roc_auc = auc(fpr, tpr)

plt.step(fpr,
         tpr,
         label='Train ROC (area = %0.2f)' % (roc_auc))

### TEST ROC CURVE
test_probas = knn.predict_proba(X_test_k)
#test_probas = pipe_knn.predict_proba(X_test[:, :])

fpr, tpr, thresholds = roc_curve(y_test_k,
                                 test_probas[:, 1],
                                 pos_label=1)

#print("test_fpr: ", fpr)
#print("test_tpr: ", tpr)
#print("test_thr: ", thresholds)

roc_auc = auc(fpr, tpr)

plt.step(fpr,
         tpr,
         where='post',
         label='Test ROC (area = %0.2f)' % (roc_auc))

plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='Random guessing')

plt.plot([0, 0, 1],
         [0, 1, 1],
         linestyle=':',
         color='black',
         label='Perfect performance')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")

plt.tight_layout()
plt.show()

[43]: #SVM

[44]: # balanced data from the earlier downsampling
X_svm = df3.iloc[:, [2, 3]]  # V7 and V8 again
y_svm = pd.factorize(df3['Class'])[0]

[45]: from sklearn.svm import SVC

# I experimented with C (.1, 1, 10, 100): C=10 had better results, and with
# C=100 I got an inverted ROC curve; C=1 is what is used below
svm = SVC(kernel='rbf', C=1, random_state=1, probability=True)

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_svm, y_svm, test_size=.3, random_state=1, stratify=y_svm)

[46]: from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True)
print(kf)

[47]: scores = []

for i, (train_index, val_index) in enumerate(kf.split(X_svm)):
    #print(f"Fold {i}:")
    #print(f"  Train: index={train_index}")
    #print(f"  Test:  index={val_index}")

    # split the training set from the validation set for this specific fold
    X_train3 = pd.DataFrame(X_svm).iloc[train_index, :]
    X_val3 = pd.DataFrame(X_svm).iloc[val_index, :]
    y_train3 = pd.DataFrame(y_svm).iloc[train_index]
    y_val3 = pd.DataFrame(y_svm).iloc[val_index]

    # note: this fits knn, not svm, so the fold scores below are still KNN
    # scores; swap in svm here to actually cross-validate the SVM
    knn.fit(X_train3, y_train3)
    score = knn.score(X_val3, y_val3)
    print("fold :", i, "score: ", score)

    y_pred_test3 = knn.predict(X_val3)
    confmax = confusion_matrix(y_val3, y_pred_test3)
    #print(confmax)

    scores.append(score)

print("Average: ", sum(scores) / len(scores))

fold : 0 score: 0.9515418502202643


fold : 1 score: 0.9632892804698973
fold : 2 score: 0.9530102790014684
fold : 3 score: 0.9500734214390602

fold : 4 score: 0.947136563876652
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
fold : 5 score: 0.9574155653450808
fold : 6 score: 0.9618208516886931
fold : 7 score: 0.9573529411764706
fold : 8 score: 0.95
fold : 9 score: 0.9676470588235294
Average: 0.9559287812041115
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().

return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)
/opt/conda/lib/python3.10/site-
packages/sklearn/neighbors/_classification.py:215: DataConversionWarning: A
column-vector y was passed when a 1d array was expected. Please change the shape
of y to (n_samples,), for example using ravel().
return self._fit(X, y)

[48]: sc.fit(X_train_s)
X_train_std_s = sc.transform(X_train_s)
X_test_std_s = sc.transform(X_test_s)
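Rather than hand-iterating over C as described in cell [45], GridSearchCV can search the same values with cross-validation; a hedged sketch (choosing F1 as the scoring metric is my own call, given the fraud setting):

[ ]: from sklearn.model_selection import GridSearchCV
param_grid = {'C': [.1, 1, 10, 100]}
gs = GridSearchCV(SVC(kernel='rbf', random_state=1),
                  param_grid, cv=5, scoring='f1')
gs.fit(X_train_std_s, y_train_s)
print(gs.best_params_, gs.best_score_)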

[49]: svm.fit(X_train_std_s, y_train_s)

X_combined_std_s = np.vstack((X_train_std_s, X_test_std_s))
y_combined_s = np.hstack((y_train_s, y_test_s))

y_pred_test_s = svm.predict(X_test_std_s)
# note: this predicts on the unscaled training data even though svm was fit
# on standardized data -- X_train_std_s should be used (hence the warning below)
y_pred_train_s = svm.predict(X_train_s)

#plot_decision_regions(X_combined_std, y_combined,
#                      classifier=svm, test_idx=range(105, 150))
#plt.xlabel('petal length [standardized]')
#plt.ylabel('petal width [standardized]')
#plt.legend(loc='upper left')
#plt.tight_layout()
##plt.savefig('images/03_24.png', dpi=300)
#plt.show()

/opt/conda/lib/python3.10/site-packages/sklearn/base.py:402: UserWarning: X has


feature names, but SVC was fitted without feature names
warnings.warn(

[50]: # note: these metrics re-evaluate the KNN predictions (the _k variables);
# to evaluate the SVM, use y_test_s and y_pred_test_s instead
print('Test:')
print('Test Accuracy: %.3f' % accuracy_score(y_true=y_test_k, y_pred=y_pred_test_k))
print('Training Accuracy: %.3f' % accuracy_score(y_true=y_train_k, y_pred=y_pred_train_k))
print('Precision: %.3f' % precision_score(y_true=y_test_k, y_pred=y_pred_test_k))
print('Recall: %.3f' % recall_score(y_true=y_test_k, y_pred=y_pred_test_k))
print('F1: %.3f' % f1_score(y_true=y_test_k, y_pred=y_pred_test_k))
print()
print('Null Accuracy: %.3f' % null_acc)

Test:
Test Accuracy: 0.954
Training Accuracy: 0.948
Precision: 0.762
Recall: 0.541
F1: 0.632

Null Accuracy: 0.928

[51]: svm_confmat = confusion_matrix(y_test_s, y_pred_test_s)
svm_confmat

[51]: array([[1868,   27],
             [  66,   82]])

[56]: import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = \
    learning_curve(estimator=svm,
                   X=X_train_s,
                   y=y_train_s,
                   train_sizes=np.linspace(0.1, 1.0, 10),
                   cv=10,
                   n_jobs=1)

#print(train_sizes)
#print(train_scores)
#print(test_scores)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.axhline(y=null_acc, color='black', linestyle='--', label='Null accuracy')

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')
plt.axis([750, 5000, .8, 1.3])

plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.0, 1.0])
plt.tight_layout()
# plt.savefig('images/06_05.png', dpi=300)
plt.show()

[57]: from sklearn.metrics import roc_curve, auc

fig = plt.figure(figsize=(7, 5))

### TRAINING ROC CURVE
train_probas = svm.fit(X_train_s, y_train_s).predict_proba(X_train_s)

fpr, tpr, thresholds = roc_curve(y_train_s,
                                 train_probas[:, 1],
                                 pos_label=1)

#print("X_train2: ", X_train_s)
#print("train_prob: ", train_probas)
#print("train_fpr: ", fpr)
#print("train_tpr: ", tpr)
#print("train_thr: ", thresholds)

roc_auc = auc(fpr, tpr)

plt.step(fpr,
         tpr,
         label='Train ROC (area = %0.2f)' % (roc_auc))

### TEST ROC CURVE
test_probas = svm.predict_proba(X_test_s)
#test_probas = pipe_knn.predict_proba(X_test[:, :])

fpr, tpr, thresholds = roc_curve(y_test_s,
                                 test_probas[:, 1],
                                 pos_label=1)

#print("test_fpr: ", fpr)
#print("test_tpr: ", tpr)
#print("test_thr: ", thresholds)

roc_auc = auc(fpr, tpr)

plt.step(fpr,
         tpr,
         where='post',
         label='Test ROC (area = %0.2f)' % (roc_auc))

plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='Random guessing')

plt.plot([0, 0, 1],
         [0, 1, 1],
         linestyle=':',
         color='black',
         label='Perfect performance')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")

plt.tight_layout()
plt.show()

[ ]: # summary

[ ]: # I would use the KNN model: its accuracy was good, the area under its
# ROC curve was good, and its precision was solid as well.

# The model is also not over- or underfit.

# DT and SVM seemed to be less accurate and probably underfit. I struggled
# to get a good result while iterating through the parameters and
# hyperparameters.

# The ROC AUC score was especially low for SVM; SVM was hard to get good
# results with.

# Keeping about 6,000 samples of class 0 was the best option for me because
# it took less time to compute on my laptop, and the data needed to be
# balanced anyway.
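To back the model choice with numbers, the three test-set F1 scores can be tabulated from the predictions still in memory; a minimal sketch (note the SVM row uses the _s predictions that cell [50] should have evaluated):

[ ]: summary = pd.DataFrame({
    'model': ['Decision Tree', 'KNN', 'SVM'],
    'test F1': [f1_score(y_test, y_pred_test),
                f1_score(y_test_k, y_pred_test_k),
                f1_score(y_test_s, y_pred_test_s)]})
print(summary)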
