You are on page 1 of 14

PROJECT - CREDIT CARD FRAUD DETECTION

CODE OVERVIEW

1) Data Preprocessing
● Handling missing values by filling with the mean.
● Dropping duplicate rows.
● Standardizing features using different scalers: StandardScaler, RobustScaler, and MinMaxScaler
● Detecting outliers using box plots, the KMeans clustering algorithm, and RobustScaler.
● Resampling the minority class for balancing.

2) Modeling
● Splitting the balanced data into training and testing sets.
● Implementing three classifiers: Naive Bayes, KNN, and Decision Tree.
● Evaluating each model's performance using classification reports.
● Plotting confusion matrices and ROC curves for model evaluation.

Code Output

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_curve)
from sklearn.model_selection import KFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import (Binarizer, MinMaxScaler, RobustScaler,
                                   StandardScaler)
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

# Read the dataset and take a first look at it.
# NOTE(review): '/creditcard_2023.csv' is an absolute path at the
# filesystem root -- confirm this matches where the data actually lives.
creditcard_dataset = pd.read_csv('/creditcard_2023.csv')
print(creditcard_dataset.head())

# Alias for convenience; both names refer to the SAME DataFrame object,
# so in-place edits on `df` below also mutate `creditcard_dataset`.
df = creditcard_dataset

# [Code Output]
df.describe()

# Handle missing values by filling with the column mean, then drop
# duplicate rows.
null_values = df.isnull().sum().sum()
print("Null values: ", null_values)
# numeric_only=True keeps this working on pandas >= 2.0, where
# DataFrame.mean() raises if any non-numeric column is present
# (the original called df.mean() with no guard).
df.fillna(df.mean(numeric_only=True), inplace=True)
df.drop_duplicates(inplace=True)

# Separate the feature matrix from the target variable.
# 'id' is a row identifier with no predictive value; 'Class' is the label.
features = creditcard_dataset.drop(columns=['id', 'Class'])
target = creditcard_dataset['Class']

# Standardize the features (zero mean, unit variance) and wrap the
# resulting array back into a DataFrame with the original column names.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
X = pd.DataFrame(scaled_features, columns=features.columns)
Code Output

# Box plot is inefficient for detecting outliers here: removing the
# flagged points loses data, and the macro avg drops to 0.50.
plt.figure(figsize=(12, 6))
X.boxplot(vert=False)
plt.title('Boxplot of Features')
plt.show()

# Remove outliers using the IQR rule: any row containing a value more
# than 1.5 * IQR outside the quartiles is flagged and dropped.
Q1 = X.quantile(0.25)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outlier_mask = ((X < lower_fence) | (X > upper_fence)).any(axis=1)
X_filtered = X[~outlier_mask]
y_filtered = target[~outlier_mask]
Code Output

# KMeans-based outlier detection is inefficient here since two clusters
# do not give a true picture of the data.
# fit_predict() is equivalent to fit() followed by reading labels_.
kmeans = KMeans(n_clusters=2, random_state=42)
creditcard_dataset['Cluster'] = kmeans.fit_predict(X)

# Visualize the cluster assignments on a V1-vs-V2 scatter plot to eyeball
# potential outliers.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=creditcard_dataset, x='V1', y='V2',
                hue='Cluster', palette='Set1')
plt.title('Outlier Detection using KMeans - V1 vs V2')
plt.show()
Code Output
Code Output

# Handle outliers by scaling with RobustScaler, which centers on the
# median and scales by the IQR so extreme values have less influence.
robust_scaler = RobustScaler()
X_robust = pd.DataFrame(robust_scaler.fit_transform(features),
                        columns=features.columns)

# Handle inconsistent value ranges by rescaling everything into [0, 1].
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
X_min_max = pd.DataFrame(min_max_scaler.fit_transform(X_robust),
                         columns=X_robust.columns)

# Balance the classes by upsampling the minority (Class == 1) rows with
# replacement until they match the majority class count.
# NOTE(review): resampling BEFORE the train/test split lets duplicated
# minority rows land in both train and test sets (leakage) -- consider
# splitting first and resampling only the training portion.
minority = creditcard_dataset[creditcard_dataset['Class'] == 1]
majority = creditcard_dataset[creditcard_dataset['Class'] == 0]
minority_upsampled = resample(minority,
                              replace=True,
                              n_samples=len(majority),
                              random_state=42)
balanced_df = pd.concat([majority, minority_upsampled])

# Split the balanced data into training (80%) and test (20%) sets.
x_balanced = balanced_df.drop(columns=['id', 'Class'])
y_balanced = balanced_df['Class']
x_train, x_test, y_train, y_test = train_test_split(
    x_balanced, y_balanced, test_size=0.2, random_state=42)

# Define a function to plot a confusion matrix.
def plot_confusion_matrix(model, x_test, y_test, title):
    """Plot the confusion matrix of `model` evaluated on (x_test, y_test).

    Calls model.predict(x_test) and renders the raw-count matrix as a
    seaborn heatmap. Requires `confusion_matrix` from sklearn.metrics,
    which the original file used without importing.
    """
    plt.figure(figsize=(6, 4))
    cm = confusion_matrix(y_test, model.predict(x_test))
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title(title)
    plt.show()

# Define a function to plot a ROC curve.
def plot_roc_curve(model, x_test, y_test, title):
    """Plot the ROC curve of `model` evaluated on (x_test, y_test).

    Uses model.predict_proba(...)[:, 1], so `model` must expose class
    probabilities. Requires `roc_curve` from sklearn.metrics, which the
    original file used without importing.
    """
    plt.figure(figsize=(8, 6))
    fpr, tpr, _thresholds = roc_curve(y_test,
                                      model.predict_proba(x_test)[:, 1])
    plt.plot(fpr, tpr, label='ROC Curve')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend()
    plt.show()
Code Output

# Naive Bayes: fit on the training split, then report and plot.
nb_classifier = GaussianNB()
nb_classifier.fit(x_train, y_train)
y_pred_nb = nb_classifier.predict(x_test)
print("Naive Bayes Classifier Report:")
print(classification_report(y_test, y_pred_nb))
plot_confusion_matrix(nb_classifier, x_test, y_test,
                      'Confusion Matrix - Naive Bayes')
plot_roc_curve(nb_classifier, x_test, y_test,
               'ROC Curve - Naive Bayes')
Code Output

# K-Nearest Neighbors: fit on the training split, then report and plot.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)
print("KNN Classifier Report:")
print(classification_report(y_test, y_pred_knn))
plot_confusion_matrix(knn, x_test, y_test,
                      'Confusion Matrix - KNN')
plot_roc_curve(knn, x_test, y_test,
               'ROC Curve - KNN')
Code Output

# Decision Tree: fit on the training split, then report and plot.
dec_tree = DecisionTreeClassifier()
dec_tree.fit(x_train, y_train)
y_pred_tree = dec_tree.predict(x_test)
print("Decision Tree Classifier Report:")
print(classification_report(y_test, y_pred_tree))
plot_confusion_matrix(dec_tree, x_test, y_test,
                      'Confusion Matrix - Decision Tree')
plot_roc_curve(dec_tree, x_test, y_test,
               'ROC Curve - Decision Tree')
Code Output

# Function for k-fold cross-validation and per-fold evaluation.
def kfold_cross_validation(model, X, y, title):
    """Run 5-fold cross-validation of `model` on (X, y).

    For each fold: refit the model, print a classification report, and
    plot the fold's confusion matrix and ROC curve. X and y must be
    pandas objects (rows are selected with .iloc). Requires `KFold`
    from sklearn.model_selection, which the original file used without
    importing.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    # enumerate(..., 1) gives human-friendly fold numbers 1..5.
    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print(f"Fold {fold} - {title} - Classification Report:")
        print(classification_report(y_test, y_pred))
        plot_confusion_matrix(model, X_test, y_test,
                              f'Fold {fold} - Confusion Matrix - {title}')
        plot_roc_curve(model, X_test, y_test,
                       f'Fold {fold} - ROC Curve - {title}')
Code Output
Code Output

CONCLUSION

1. Naive Bayes
● Performs reasonably well but has slightly lower recall for fraudulent transactions.

2. KNN:
● Shows better performance in identifying both classes with higher precision and recall.

3. Decision Tree:
● Achieves perfect scores, indicating a likely overfitting issue.

These models offer different trade-offs between precision and recall for identifying fraudulent transactions. While KNN
seems balanced, Decision Tree's perfect scores might indicate overfitting and might not generalize well on new data.
Further validation — for example, k-fold cross-validation and pruning or regularizing the Decision Tree — could improve the models' performance and help them generalize better to new datasets.

You might also like