You are on page 1 of 14

PROJECT - CREDIT CARD FRAUD DETECTION

CODE OVERVIEW

1) Data Preprocessing
● Handling missing values by filling with the mean.
● Dropping duplicate rows.
● Standardizing features using different scalers: StandardScaler, RobustScaler, and MinMaxScaler
● Detecting outliers using box plots, the KMeans clustering algorithm, and RobustScaler.
● Resampling the minority class for balancing.

2) Modeling
● Splitting the balanced data into training and testing sets.
● Implementing three classifiers: Naive Bayes, KNN, and Decision Tree.
● Evaluating each model's performance using classification reports.
● Plotting confusion matrices and ROC curves for model evaluation.

Code Output

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_curve)
from sklearn.model_selection import KFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import (Binarizer, MinMaxScaler, RobustScaler,
                                   StandardScaler)
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

# Read the dataset and take a first look at it.
# NOTE(review): '/creditcard_2023.csv' is an absolute path at the
# filesystem root -- confirm this matches where the data actually lives.
creditcard_dataset = pd.read_csv('/creditcard_2023.csv')
print(creditcard_dataset.head())

# Alias for convenience; both names refer to the SAME DataFrame object,
# so in-place edits on `df` below also mutate `creditcard_dataset`.
df = creditcard_dataset

# [Code Output]
df.describe()

# Handle missing values by filling with the column mean, then drop
# duplicate rows.
null_values = df.isnull().sum().sum()
print("Null values: ", null_values)
# numeric_only=True keeps this working on pandas >= 2.0, where
# DataFrame.mean() raises if any non-numeric column is present
# (the original called df.mean() with no guard).
df.fillna(df.mean(numeric_only=True), inplace=True)
df.drop_duplicates(inplace=True)

# Separate the feature matrix from the target variable.
# 'id' is a row identifier with no predictive value; 'Class' is the label.
features = creditcard_dataset.drop(columns=['id', 'Class'])
target = creditcard_dataset['Class']

# Standardize the features (zero mean, unit variance) and wrap the
# resulting array back into a DataFrame with the original column names.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
X = pd.DataFrame(scaled_features, columns=features.columns)
Code Output

# Box plot is inefficient for detecting outliers here: removing the
# flagged points loses data, and the macro avg drops to 0.50.
plt.figure(figsize=(12, 6))
X.boxplot(vert=False)
plt.title('Boxplot of Features')
plt.show()

# Remove outliers using the IQR rule: any row containing a value more
# than 1.5 * IQR outside the quartiles is flagged and dropped.
Q1 = X.quantile(0.25)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outlier_mask = ((X < lower_fence) | (X > upper_fence)).any(axis=1)
X_filtered = X[~outlier_mask]
y_filtered = target[~outlier_mask]
Code Output

# KMeans-based outlier detection is inefficient here since two clusters
# do not give a true picture of the data.
# fit_predict() is equivalent to fit() followed by reading labels_.
kmeans = KMeans(n_clusters=2, random_state=42)
creditcard_dataset['Cluster'] = kmeans.fit_predict(X)

# Visualize the cluster assignments on a V1-vs-V2 scatter plot to eyeball
# potential outliers.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=creditcard_dataset, x='V1', y='V2',
                hue='Cluster', palette='Set1')
plt.title('Outlier Detection using KMeans - V1 vs V2')
plt.show()
Code Output
Code Output

# Handle outliers by scaling with RobustScaler, which centers on the
# median and scales by the IQR so extreme values have less influence.
robust_scaler = RobustScaler()
X_robust = pd.DataFrame(robust_scaler.fit_transform(features),
                        columns=features.columns)

# Handle inconsistent value ranges by rescaling everything into [0, 1].
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
X_min_max = pd.DataFrame(min_max_scaler.fit_transform(X_robust),
                         columns=X_robust.columns)

# Balance the classes by upsampling the minority (Class == 1) rows with
# replacement until they match the majority class count.
# NOTE(review): resampling BEFORE the train/test split lets duplicated
# minority rows land in both train and test sets (leakage) -- consider
# splitting first and resampling only the training portion.
minority = creditcard_dataset[creditcard_dataset['Class'] == 1]
majority = creditcard_dataset[creditcard_dataset['Class'] == 0]
minority_upsampled = resample(minority,
                              replace=True,
                              n_samples=len(majority),
                              random_state=42)
balanced_df = pd.concat([majority, minority_upsampled])

# Split the balanced data into training (80%) and test (20%) sets.
x_balanced = balanced_df.drop(columns=['id', 'Class'])
y_balanced = balanced_df['Class']
x_train, x_test, y_train, y_test = train_test_split(
    x_balanced, y_balanced, test_size=0.2, random_state=42)

# Define a function to plot a confusion matrix.
def plot_confusion_matrix(model, x_test, y_test, title):
    """Plot the confusion matrix of `model` evaluated on (x_test, y_test).

    Calls model.predict(x_test) and renders the raw-count matrix as a
    seaborn heatmap. Requires `confusion_matrix` from sklearn.metrics,
    which the original file used without importing.
    """
    plt.figure(figsize=(6, 4))
    cm = confusion_matrix(y_test, model.predict(x_test))
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title(title)
    plt.show()

# Define a function to plot a ROC curve.
def plot_roc_curve(model, x_test, y_test, title):
    """Plot the ROC curve of `model` evaluated on (x_test, y_test).

    Uses model.predict_proba(...)[:, 1], so `model` must expose class
    probabilities. Requires `roc_curve` from sklearn.metrics, which the
    original file used without importing.
    """
    plt.figure(figsize=(8, 6))
    fpr, tpr, _thresholds = roc_curve(y_test,
                                      model.predict_proba(x_test)[:, 1])
    plt.plot(fpr, tpr, label='ROC Curve')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend()
    plt.show()
Code Output

# Naive Bayes: fit on the training split, then report and plot.
nb_classifier = GaussianNB()
nb_classifier.fit(x_train, y_train)
y_pred_nb = nb_classifier.predict(x_test)
print("Naive Bayes Classifier Report:")
print(classification_report(y_test, y_pred_nb))
plot_confusion_matrix(nb_classifier, x_test, y_test,
                      'Confusion Matrix - Naive Bayes')
plot_roc_curve(nb_classifier, x_test, y_test,
               'ROC Curve - Naive Bayes')
Code Output

# K-Nearest Neighbors: fit on the training split, then report and plot.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)
print("KNN Classifier Report:")
print(classification_report(y_test, y_pred_knn))
plot_confusion_matrix(knn, x_test, y_test,
                      'Confusion Matrix - KNN')
plot_roc_curve(knn, x_test, y_test,
               'ROC Curve - KNN')
Code Output

# Decision Tree: fit on the training split, then report and plot.
dec_tree = DecisionTreeClassifier()
dec_tree.fit(x_train, y_train)
y_pred_tree = dec_tree.predict(x_test)
print("Decision Tree Classifier Report:")
print(classification_report(y_test, y_pred_tree))
plot_confusion_matrix(dec_tree, x_test, y_test,
                      'Confusion Matrix - Decision Tree')
plot_roc_curve(dec_tree, x_test, y_test,
               'ROC Curve - Decision Tree')
Code Output

# Function for k-fold cross-validation and per-fold evaluation.
def kfold_cross_validation(model, X, y, title):
    """Run 5-fold cross-validation of `model` on (X, y).

    For each fold: refit the model, print a classification report, and
    plot the fold's confusion matrix and ROC curve. X and y must be
    pandas objects (rows are selected with .iloc). Requires `KFold`
    from sklearn.model_selection, which the original file used without
    importing.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    # enumerate(..., 1) gives human-friendly fold numbers 1..5.
    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print(f"Fold {fold} - {title} - Classification Report:")
        print(classification_report(y_test, y_pred))
        plot_confusion_matrix(model, X_test, y_test,
                              f'Fold {fold} - Confusion Matrix - {title}')
        plot_roc_curve(model, X_test, y_test,
                       f'Fold {fold} - ROC Curve - {title}')
Code Output
Code Output

CONCLUSION

1. Naive Bayes
● Performs reasonably well but has slightly lower recall for fraudulent transactions.

2. KNN:
● Shows better performance in identifying both classes with higher precision and recall.

3. Decision Tree:
● Achieves perfect scores, indicating a likely overfitting issue.

These models offer different trade-offs between precision and recall for identifying fraudulent transactions. While KNN
seems balanced, Decision Tree's perfect scores might indicate overfitting and might not generalize well on new data.
Further validation — for example, k-fold cross-validation and pruning or regularizing the Decision Tree — could improve the models' performance and help them generalize better to new datasets.

You might also like