You are on page 1 of 2

Naive Bayes classification for TEXT Classification

import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LogisticRegression
import numpy as np
import seaborn as sns
#% matplotlib inline
import math
import sklearn.datasets as skd
from sklearn.utils import Bunch

# Newsgroups to load; load_files only reads the sub-folders named here.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

# Each call returns a Bunch; this script reads its `data`, `target` and
# `target_names` entries.
news_train = skd.load_files(
    'C:\\Users\\OCHIN\\Desktop\\Fetch20newsgroup\\train',
    categories=categories,
    encoding='ISO-8859-1',
)  # type: Bunch

# NOTE(review): the original loaded the *train* folder here as well, so the
# accuracy reported later was measured on the very data the model was fitted
# on. This assumes a sibling `test` folder exists next to `train` -- confirm
# the dataset layout on disk.
news_test = skd.load_files(
    'C:\\Users\\OCHIN\\Desktop\\Fetch20newsgroup\\test',
    categories=categories,
    encoding='ISO-8859-1',
)  # type: Bunch

print(news_train.keys())
print()
print(news_train['target_names'])
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Term frequency tells how often a given word appears in each document
# (a frequent word might be important).
count_vect = CountVectorizer()
X_train_tf = count_vect.fit_transform(news_train.data)
print('\n', X_train_tf.shape)  # (rows, columns) of the count matrix

# Inverse document frequency accounts for words that appear a lot across
# documents: a distinctive word matters more for classification than a
# ubiquitous one such as 'the'.
tfidf_transformer = TfidfTransformer()  # type: TfidfTransformer
# X_train_tf holds the raw frequency counts in rows-and-columns form.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)
print(X_train_tfidf.shape)
#print(X_train_tfidf)

from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
clf = MultinomialNB().fit(X_train_tfidf, news_train.target)

docs_new = ['God is everywhere', 'OpenGL on the GPU is fast']

# Use transform (not fit_transform) so the new documents are encoded with the
# vocabulary and IDF weights already learned from the training set.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Each prediction is a label taken from news_train.target; presumably an index
# into news_train.target_names -- verify against the load_files docs.
for x in predicted:
    print(x)

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Encode the evaluation documents with the already-fitted vectorizer and
# TF-IDF transformer, then classify them.
X_test_tf = count_vect.transform(news_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)
predicted = clf.predict(X_test_tfidf)

print("Accuracy : ", accuracy_score(news_test.target, predicted))
#print(metrics.classification_report(news_test.target,predicted,target_names=news_test.target_names))
print("Confusion Matrix")
print(metrics.confusion_matrix(news_test.target, predicted))
# For multiclass the confusion matrix will be
# (number of classes) x (number of classes), i.e. 4 x 4 for 4 classes.
# So for the 1st class it would be
# 416 0
# 0 64
# 1st row is alt.atheism, 2nd row is computer graphics, 3rd is sci.med,
# 4th is religion.
# 1st column is alt.atheism, 2nd column is computer graphics, 3rd is sci.med,
# 4th is religion.

You might also like