Professional Documents
Culture Documents
Mail Spam
Mail Spam
April 7, 2024
[2]: df_train=pd.read_csv('/content/drive/MyDrive/email.csv')  # load the e-mail dataset; hardcoded path, presumably a mounted Colab Drive
df_train.head()  # preview the first rows of the frame
[3]: df_train.dtypes  # dtype of each column
[6]: df_train.isnull().sum()  # number of missing values per column
[6]: Category 0
Message 0
spam 0
dtype: int64
# Features are the raw message text; the target is the `spam` column.
X = df_train['Message']
y = df_train['spam']
# Hold out 20% of the rows for testing. The seed is pinned so the split is the
# same on every Restart & Run All (the value itself is arbitrary), and the
# split is stratified on y so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Naive Bayes pipeline: token counts from CountVectorizer feed MultinomialNB.
clf_NaiveBayes = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])
# Random forest pipeline: token counts from CountVectorizer feed the forest.
# random_state is pinned so fitting is reproducible from run to run; the
# value itself is arbitrary.
clf_rfc = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', RandomForestClassifier(random_state=42)),
])
# SVM pipeline: token counts from CountVectorizer feed a linear-kernel SVC.
# The estimator step is named 'svm' here (the other pipelines use 'classifier'),
# so its parameters are addressed as 'svm__<param>'.
# NOTE(review): per the scikit-learn docs gamma applies only to the 'rbf',
# 'poly' and 'sigmoid' kernels, so it appears to have no effect with
# kernel='linear' - confirm before removing it.
clf_svm = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('svm', svm.SVC(kernel='linear', C=100, gamma=0.001)),
])
# Fit the Naive Bayes pipeline on the training split and score it on the test split.
clf_NaiveBayes.fit(X_train, y_train)
y_pred = clf_NaiveBayes.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). In that order the confusion
# matrix has true labels on the rows and predicted labels on the columns;
# passing them the other way round transposes it.
acc = metrics.accuracy_score(y_test, y_pred)
cm = metrics.confusion_matrix(y_test, y_pred)
print("Navies Bayes")
print(cm)
print(acc)
# Fit the SVM pipeline on the training split and score it on the test split.
clf_svm.fit(X_train, y_train)
y_pred = clf_svm.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). In that order the confusion
# matrix has true labels on the rows and predicted labels on the columns;
# passing them the other way round transposes it.
acc_svm = metrics.accuracy_score(y_test, y_pred)
cm_svm = metrics.confusion_matrix(y_test, y_pred)
print("SVM:")
print(cm_svm)
print(acc_svm)
# Fit the random forest pipeline on the training split and score it on the test split.
clf_rfc.fit(X_train, y_train)
y_pred = clf_rfc.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). In that order the confusion
# matrix has true labels on the rows and predicted labels on the columns;
# passing them the other way round transposes it.
acc_rfc = metrics.accuracy_score(y_test, y_pred)
cm_rfc = metrics.confusion_matrix(y_test, y_pred)
print("Random Forest Classifier:")
print(cm_rfc)
print(acc_rfc)
Navies Bayes
[[974 13]
[ 9 119]]
0.9802690582959641
SVM:
[[981 11]
[ 2 121]]
0.9883408071748879
Random Forest Classifier:
[[983 23]
[ 0 109]]
0.979372197309417
# 5-fold cross-validation of each pipeline on the training split only.
# For every model the per-fold scores are kept and their mean is printed,
# in the order: SVM, random forest, Naive Bayes.

# --- SVM ---
svm_cross_valid = cross_val_score(clf_svm, X_train, y_train, cv=5)
mean_acc_svm = svm_cross_valid.mean()
print(mean_acc_svm)

# --- Random forest ---
rfc_cross_valid = cross_val_score(clf_rfc, X_train, y_train, cv=5)
mean_acc_rfc = rfc_cross_valid.mean()
print(mean_acc_rfc)

# --- Naive Bayes ---
nb_cross_valid = cross_val_score(clf_NaiveBayes, X_train, y_train, cv=5)
mean_acc_nb = nb_cross_valid.mean()
print(mean_acc_nb)
0.9802599990940799
0.9755489624697399
0.9860931185295907
# Score the tuned SVM on the test split.
# NOTE(review): svm_model_param must already be fitted by an earlier cell -
# presumably a GridSearchCV/RandomizedSearchCV over clf_svm; confirm.
svm_best_estimator = svm_model_param.best_estimator_
y_pred_svm = svm_best_estimator.predict(X_test)
# Metrics take (y_true, y_pred); accuracy is the same either way, but the
# conventional order keeps this call consistent with the other metrics.
svm_acc_ht = metrics.accuracy_score(y_test, y_pred_svm)
[36]: rfc_param_grid = {  # hyper-parameter search space for the random forest pipeline
# Keys use the '<step>__<param>' form; 'classifier' is the name of the
# RandomForestClassifier step in clf_rfc. The full grid has 3*3*2*2 = 36 combinations.
'classifier__n_estimators': [100, 200, 300],
'classifier__max_depth': [None, 10, 50],
'classifier__min_samples_split': [2, 5],
'classifier__min_samples_leaf': [1, 2]
}
# Perform RandomizedSearchCV for Random Forest with a reduced search space and
# fewer CV folds
# Calculate accuracy; metrics take (y_true, y_pred), so the true labels go first.
# Assumes y_pred_rfc holds the tuned random forest's predictions for X_test - TODO confirm.
rfc_acc_ht = metrics.accuracy_score(y_test, y_pred_rfc)