You are on page 1 of 4

MailSpam

April 7, 2024

[1]: import pandas as pd

[2]: df_train=pd.read_csv('/content/drive/MyDrive/email.csv')
df_train.head()

[2]: Category Message


0 ham Go until jurong point, crazy.. Available only …
1 ham Ok lar… Joking wif u oni…
2 spam Free entry in 2 a wkly comp to win FA Cup fina…
3 ham U dun say so early hor… U c already then say…
4 ham Nah I don't think he goes to usf, he lives aro…

[3]: df_train.dtypes  # column dtypes as loaded, checked before deriving a numeric label

[3]: Category object


Message object
dtype: object

[5]: df_train['spam']=df_train['Category'].apply(lambda x: 1 if x=='spam' else 0)


df_train.head()

[5]: Category Message spam


0 ham Go until jurong point, crazy.. Available only … 0
1 ham Ok lar… Joking wif u oni… 0
2 spam Free entry in 2 a wkly comp to win FA Cup fina… 1
3 ham U dun say so early hor… U c already then say… 0
4 ham Nah I don't think he goes to usf, he lives aro… 0

[6]: df_train.isnull().sum()  # number of missing values in each column

[6]: Category 0
Message 0
spam 0
dtype: int64

[7]: from sklearn.model_selection import train_test_split

# Features are the raw message text; the target is the 0/1 spam flag.
X=df_train['Message']
y=df_train['spam']

# Hold out 20% of the messages for testing.
# random_state pins the shuffle, so the split (and every score computed from
# it) is the same on each run. stratify=y keeps the spam/ham ratio equal in the
# train and test parts.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

[20]: from sklearn import svm


from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

# Naive Bayes model: bag-of-words token counts fed to a multinomial NB
# classifier, bundled so raw message strings can be passed to fit/predict.
clf_NaiveBayes = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])

# Random forest on bag-of-words token counts. random_state is pinned so the
# forest, and every score derived from it, is reproducible between runs.
# The step is named 'classifier' because the random-forest parameter grid
# addresses it as 'classifier__...'.
clf_rfc = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Linear-kernel SVM on bag-of-words token counts. gamma is only used by the
# 'rbf', 'poly' and 'sigmoid' kernels, so the former gamma=0.001 had no effect
# with kernel='linear' and is dropped. The step is named 'svm' (not
# 'classifier') because the SVM parameter grid addresses it as 'svm__...'.
clf_svm = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('svm', svm.SVC(kernel='linear', C=100)),
])

# Fit the Naive Bayes pipeline and score it on the held-out messages.
clf_NaiveBayes.fit(X_train,y_train)
y_pred=clf_NaiveBayes.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). In that order the confusion
# matrix has the actual class on the rows and the predicted class on the
# columns. The arguments used to be swapped, so output saved from earlier runs
# shows the transposed matrix. Accuracy does not depend on the order.
acc=metrics.accuracy_score(y_test,y_pred)
cm=metrics.confusion_matrix(y_test,y_pred)
print("Navies Bayes")
print(cm)
print(acc)

# Fit the SVM pipeline and score it on the held-out messages.
clf_svm.fit(X_train,y_train)
y_pred=clf_svm.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). In that order the confusion
# matrix has the actual class on the rows and the predicted class on the
# columns. The arguments used to be swapped, so output saved from earlier runs
# shows the transposed matrix. Accuracy does not depend on the order.
acc_svm=metrics.accuracy_score(y_test,y_pred)
cm_svm=metrics.confusion_matrix(y_test,y_pred)
print("SVM:")
print(cm_svm)
print(acc_svm)

# Fit the random-forest pipeline and score it on the held-out messages.
clf_rfc.fit(X_train,y_train)
y_pred=clf_rfc.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). In that order the confusion
# matrix has the actual class on the rows and the predicted class on the
# columns. The arguments used to be swapped, so output saved from earlier runs
# shows the transposed matrix. Accuracy does not depend on the order.
acc_rfc=metrics.accuracy_score(y_test,y_pred)
cm_rfc=metrics.confusion_matrix(y_test,y_pred)
print("Random Forest Classifier:")
print(cm_rfc)
print(acc_rfc)

Navies Bayes
[[974 13]
[ 9 119]]
0.9802690582959641
SVM:
[[981 11]
[ 2 121]]
0.9883408071748879
Random Forest Classifier:
[[983 23]
[ 0 109]]
0.979372197309417

[23]: from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of each pipeline on the training split only.
# The per-fold score arrays are kept; the mean of each is printed in the
# order SVM, random forest, Naive Bayes.

# Linear SVM
svm_cross_valid = cross_val_score(estimator=clf_svm, X=X_train, y=y_train, cv=5)
mean_acc_svm = svm_cross_valid.mean()
print(mean_acc_svm)

# Random forest
rfc_cross_valid = cross_val_score(estimator=clf_rfc, X=X_train, y=y_train, cv=5)
mean_acc_rfc = rfc_cross_valid.mean()
print(mean_acc_rfc)

# Naive Bayes
nb_cross_valid = cross_val_score(estimator=clf_NaiveBayes, X=X_train, y=y_train, cv=5)
mean_acc_nb = nb_cross_valid.mean()
print(mean_acc_nb)

0.9802599990940799
0.9755489624697399
0.9860931185295907

[37]: from sklearn.model_selection import GridSearchCV

# Search space for the SVM step, addressed as 'svm__<param>' inside the pipeline.
svm_param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': [0.1, 0.01, 0.001],
    'svm__kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
}

# Exhaustive 3-fold search over the grid. n_jobs=-1 evaluates the candidates on
# all available cores, as the random-forest search does. It affects run time
# only, not which parameters are selected.
svm_model_param = GridSearchCV(clf_svm, svm_param_grid, cv=3, n_jobs=-1)
svm_model_param.fit(X_train, y_train)

# Pipeline configured with the best parameter combination found by the search.
svm_best_estimator = svm_model_param.best_estimator_

# Score the tuned model on the held-out test split.
# scikit-learn metrics take (y_true, y_pred).
y_pred_svm = svm_best_estimator.predict(X_test)
svm_acc_ht = metrics.accuracy_score(y_test, y_pred_svm)

print("SVM Accuracy:", svm_acc_ht)

SVM Accuracy: 0.9883408071748879

[36]: rfc_param_grid = {
'classifier__n_estimators': [100, 200, 300],
'classifier__max_depth': [None, 10, 50],
'classifier__min_samples_split': [2, 5],
'classifier__min_samples_leaf': [1, 2]
}

# Perform RandomizedSearchCV for Random Forest with reduced search space and␣
↪fewer CV folds

rfc_model_param = GridSearchCV(clf_rfc, rfc_param_grid, cv=3, n_jobs=-1)


rfc_model_param.fit(X_train, y_train)

# Get best estimator


rfc_best_estimator = rfc_model_param.best_estimator_

# Make predictions on the test data


y_pred_rfc = rfc_best_estimator.predict(X_test)

# Calculate accuracy
rfc_acc_ht = metrics.accuracy_score(y_pred_rfc, y_test)

print("Random Forest Accuracy:", rfc_acc_ht)

Random Forest Accuracy: 0.9829596412556054

You might also like