
11. Write a Python program to prepare a Bag of Words model.

Code :-

import pandas as pd
import numpy as np
import re

doc1 = "Game of Thrones is an amazing tv series"

doc2 = "Game of Thrones is best tv series"


doc3 = "Game of Thrones is so great"

# Remove punctuation, lowercase, and split each document into tokens
l_doc1 = re.sub(r"[^a-zA-Z0-9]", " ", doc1.lower()).split()
l_doc2 = re.sub(r"[^a-zA-Z0-9]", " ", doc2.lower()).split()
l_doc3 = re.sub(r"[^a-zA-Z0-9]", " ", doc3.lower()).split()

# Next we build the vocabulary (word set): the unique words found across the three documents
wordset12 = np.union1d(l_doc1, l_doc2)
wordset = np.union1d(wordset12, l_doc3)
print(wordset)
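
For the three documents above the union is deterministic, so the print should show:

# Expected output (np.union1d returns a sorted array):
# ['amazing' 'an' 'best' 'game' 'great' 'is' 'of' 'series' 'so' 'thrones' 'tv']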

def calculateBOW(wordset, l_doc):
    # start every vocabulary word at 0, then count its occurrences in the document
    tf_diz = dict.fromkeys(wordset, 0)
    for word in l_doc:
        tf_diz[word] = l_doc.count(word)
    return tf_diz

# We can finally obtain the bag-of-words representation for the reviews.
# The result is a DataFrame where each row corresponds to the extracted features of one document.

bow1 = calculateBOW(wordset,l_doc1)

bow2 = calculateBOW(wordset,l_doc2)

bow3 = calculateBOW(wordset,l_doc3)

df_bow = pd.DataFrame([bow1,bow2,bow3])
df_bow.head()
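
With the documents above, df_bow should hold these counts (columns in sorted vocabulary order):

#    amazing  an  best  game  great  is  of  series  so  thrones  tv
# 0        1   1     0     1      0   1   1       1   0        1   1
# 1        0   0     1     1      0   1   1       1   0        1   1
# 2        0   0     0     1      1   1   1       0   1        1   0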

from sklearn.feature_extraction.text import CountVectorizer


vectorizer = CountVectorizer()

x = vectorizer.fit_transform([doc1, doc2, doc3])
# get_feature_names() was removed in recent scikit-learn; get_feature_names_out() is the replacement
print(vectorizer.get_feature_names_out())

df_bow_sklearn = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())

df_bow_sklearn.head()

vectorizer = CountVectorizer(stop_words="english")

x = vectorizer.fit_transform([doc1, doc2, doc3])
print(vectorizer.get_feature_names_out())

df_bow_sklearn = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())

df_bow_sklearn.head()

vectorizer = CountVectorizer(stop_words="english", ngram_range=(2, 2))

x = vectorizer.fit_transform([doc1, doc2, doc3])
print(vectorizer.get_feature_names_out())

df_bow_sklearn = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())

df_bow_sklearn.head()
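
Because stop words are removed before the bigrams are formed, the bigram features should be (assuming scikit-learn's built-in English stop word list):

# ['amazing tv' 'best tv' 'game thrones' 'thrones amazing' 'thrones best'
#  'thrones great' 'tv series']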

import pandas as pd
dataset = pd.read_csv(r"c:\Users\HP\data.csv", encoding="ISO-8859-1")
dataset.head()
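
The rest of this listing assumes data.csv is a two-column message dataset, with the class label (e.g. spam/ham) in column 0 and the message text in column 1, as in the SMS Spam Collection.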

import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize as wt

stemmer = PorterStemmer()

data = []

for i in range(dataset.shape[0]):
    sms = dataset.iloc[i, 1]

    # remove non-alphabetic characters
    sms = re.sub('[^A-Za-z]', ' ', sms)

    # make words lowercase, because "Go" and "go" would otherwise be counted as two words
    sms = sms.lower()

    # tokenising
    tokenized_sms = wt(sms)

    # remove stop words and stem the remaining words
    sms_processed = []
    for word in tokenized_sms:
        if word not in set(stopwords.words('english')):
            sms_processed.append(stemmer.stem(word))

    sms_text = " ".join(sms_processed)
    data.append(sms_text)

# creating the feature matrix
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features=1000)
x = matrix.fit_transform(data).toarray()
y = dataset.iloc[:,0]
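
Note: max_features=1000 keeps only the 1000 most frequent terms in the corpus, which caps the dimensionality of the feature matrix x.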

# split train and test data (train_test_split holds out 25% of the rows by default)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)

# predict class
y_pred = classifier.predict(x_test)

# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

accuracy
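
A note on the model choice: GaussianNB runs here only because the counts were densified with .toarray(); for discrete count features, MultinomialNB is the more conventional Naive Bayes variant for text and usually performs better.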

12. Write a Python program to prepare a TF-IDF model.
Code :-

from sklearn.feature_extraction.text import TfidfVectorizer

d0 = "The car is driven on the road"


d1 = "The truck is driven on the highway"
d2 = "The bike is run on road"

string = [d0,d1,d2]

tfidf = TfidfVectorizer()

result = tfidf.fit_transform(string)
result

print("\nword indices : ")


print(tfidf.vocabulary_)

print("\nidf values : ")
for ele1,ele2 in zip(tfidf.get_feature_names(),tfidf.idf_) :
print(ele1,":",ele2)
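
The printed values follow scikit-learn's default smoothed idf formula, idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing t. A quick check by hand for two of the terms above (a sketch, assuming the three documents d0-d2):

import numpy as np

n = 3  # number of documents
for term, df in [("the", 3), ("car", 1)]:
    # smoothed idf, as computed by TfidfVectorizer with default settings
    print(term, ":", np.log((1 + n) / (1 + df)) + 1)
# "the" occurs in all three documents -> ln(4/4) + 1 = 1.0
# "car" occurs in only one document   -> ln(4/2) + 1 = 1.693...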

print("\ntf.idf values : ")


print(result)

print("\ntf.idf values i matrix form : ")
print(result.toarray())

13. Write a Python program to prepare a CountVectorizer model.
Code :-
from sklearn.feature_extraction.text import CountVectorizer

# To create a Count Vectorizer, we simply need to instantiate one.
# There are special parameters we can set here when making the vectorizer,
# but for the most basic example, it is not needed

vectorizer = CountVectorizer()

# For our text, we are going to take some text from our previous blog post
# about count vectorization

sample_text = ["One of the most basic ways we can numerically represent words "
               "is through the one-hot encoding method (also sometimes called "
               "count vectorizing)."]

# To actually create the vectorizer, we simply need to call fit on the text
# data that we wish to fit

vectorizer.fit(sample_text)

# Now, we can inspect how our vectorizer vectorized the text
# This will print out a list of words used, and their index in the vectors
print("Vacabulary : ")
print(vectorizer.vocabulary_)

# If we would like to actually create a vector, we can do so by passing the
# text into the vectorizer to get back counts
vector = vectorizer.transform(sample_text)

# Our final vector :
print("Full vector : ")
print(vector.toarray())

# Or if we wanted to get the vector for one word :
print("Hot vector : ")
print(vectorizer.transform(['hot']).toarray())

# or if we wanted to get multiple vectors at once to build matrices
print("Hot and One : ")
print(vectorizer.transform(['hot','one']).toarray())

# We could also do the whole thing at once with the fit_transform method :
print('One swoop : ')
new_text = ["Today is the day that I do the thing today, today"]
new_vectorizer = CountVectorizer()

print(new_vectorizer.fit_transform(new_text).toarray())
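
With CountVectorizer's default tokenization (tokens of two or more characters, so the lone "I" is dropped), the vocabulary is ['day', 'do', 'is', 'that', 'the', 'thing', 'today'] and the printed matrix should be:

# Expected output:
# [[1 1 1 1 2 1 3]]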

14. Write a Python program to perform text classification with NLTK using a Naive Bayes classifier.
Code :-
import numpy as np
import pandas as pd

df = pd.read_csv("C:\\Users\\Admin\\Downloads\\BBC_News_Train.csv")
df.head()

df.shape

df['Category'].value_counts()

import nltk
from nltk.corpus import stopwords
import string

def text_cleaning(a):
    remove_punctuation = [char for char in a if char not in string.punctuation]
    remove_punctuation = ''.join(remove_punctuation)
    return [word for word in remove_punctuation.split()
            if word.lower() not in stopwords.words('english')]

print(df.iloc[:,1].apply(text_cleaning))

from sklearn.feature_extraction.text import CountVectorizer


bow_transformer = CountVectorizer(analyzer=text_cleaning).fit(df['Text'])
bow_transformer.vocabulary_
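
Passing analyzer=text_cleaning makes CountVectorizer delegate tokenisation to our function, so punctuation removal and stop word filtering happen inside the vectorizer itself rather than as a separate preprocessing pass.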

title_bow = bow_transformer.transform(df['Text'])
print(title_bow)

x = title_bow.toarray()
print(x)
x.shape

from sklearn.feature_extraction.text import TfidfTransformer


tfidf_transformer = TfidfTransformer().fit(title_bow)
print(tfidf_transformer)

title_tfidf = tfidf_transformer.transform(title_bow)
print(title_tfidf)
print(title_tfidf.shape)

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(title_tfidf,df['Category'])

all_predictions = model.predict(title_tfidf)
print(all_predictions)

from sklearn.metrics import confusion_matrix


confusion_matrix(df['Category'],all_predictions)

from sklearn.metrics import classification_report


print(classification_report(df['Category'], all_predictions))
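
Note that these predictions are made on the same documents the model was trained on, so the confusion matrix and classification report measure training fit rather than generalisation; holding out a test split (as in exercise 11) would give a more realistic estimate.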

15. Write a Python program to convert words to features with NLTK.
Code :-
import nltk
nltk.download('movie_reviews')
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

# keep the 3000 most frequent words as features
word_features = [w for (w, count) in all_words.most_common(3000)]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

featuresets = [(find_features(rev),category) for (rev,category) in documents]


featuresets
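
A natural continuation (a sketch, not part of the original listing, assuming the usual 1900/100 split over the 2000 movie reviews): train NLTK's built-in Naive Bayes classifier on these feature sets.

# split the labelled feature sets and train NLTK's Naive Bayes classifier
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Accuracy :", nltk.classify.accuracy(classifier, testing_set))
classifier.show_most_informative_features(15)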
