
11. Write a Python program to prepare a Bag of Words model.

Code :-

import pandas as pd
import numpy as np
import re

doc1 = "Game of Thrones is an amazing tv series"

doc2 = "Game of Thrones is best tv series"


doc3 = "Game of Thrones is so great"

# Remove punctuation, lowercase, and split each document into tokens
l_doc1 = re.sub(r"[^a-zA-Z0-9]", " ", doc1.lower()).split()
l_doc2 = re.sub(r"[^a-zA-Z0-9]", " ", doc2.lower()).split()
l_doc3 = re.sub(r"[^a-zA-Z0-9]", " ", doc3.lower()).split()

# Next we build the vocabulary (word set): the unique words found across the three documents
wordset12 = np.union1d(l_doc1, l_doc2)
wordset = np.union1d(wordset12, l_doc3)
print(wordset)
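
For the three documents above the union is deterministic, so the print should show:

# Expected output (np.union1d returns a sorted array):
# ['amazing' 'an' 'best' 'game' 'great' 'is' 'of' 'series' 'so' 'thrones' 'tv']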

def calculateBOW(wordset, l_doc):
    # start every vocabulary word at 0, then count its occurrences in the document
    tf_diz = dict.fromkeys(wordset, 0)
    for word in l_doc:
        tf_diz[word] = l_doc.count(word)
    return tf_diz

# We can finally obtain the bag-of-words representation for the reviews.
# The result is a DataFrame where each row corresponds to the extracted features of one document.

bow1 = calculateBOW(wordset,l_doc1)

bow2 = calculateBOW(wordset,l_doc2)

bow3 = calculateBOW(wordset,l_doc3)

df_bow = pd.DataFrame([bow1,bow2,bow3])
df_bow.head()
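
With the documents above, df_bow should hold these counts (columns in sorted vocabulary order):

#    amazing  an  best  game  great  is  of  series  so  thrones  tv
# 0        1   1     0     1      0   1   1       1   0        1   1
# 1        0   0     1     1      0   1   1       1   0        1   1
# 2        0   0     0     1      1   1   1       0   1        1   0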

from sklearn.feature_extraction.text import CountVectorizer


vectorizer = CountVectorizer()

x = vectorizer.fit_transform([doc1, doc2, doc3])
# get_feature_names() was removed in recent scikit-learn; get_feature_names_out() is the replacement
print(vectorizer.get_feature_names_out())

df_bow_sklearn = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())

df_bow_sklearn.head()

vectorizer = CountVectorizer(stop_words="english")

x = vectorizer.fit_transform([doc1, doc2, doc3])
print(vectorizer.get_feature_names_out())

df_bow_sklearn = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())

df_bow_sklearn.head()

vectorizer = CountVectorizer(stop_words="english", ngram_range=(2, 2))

x = vectorizer.fit_transform([doc1, doc2, doc3])
print(vectorizer.get_feature_names_out())

df_bow_sklearn = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())

df_bow_sklearn.head()
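
Because stop words are removed before the bigrams are formed, the bigram features should be (assuming scikit-learn's built-in English stop word list):

# ['amazing tv' 'best tv' 'game thrones' 'thrones amazing' 'thrones best'
#  'thrones great' 'tv series']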

import pandas as pd
dataset = pd.read_csv(r"c:\Users\HP\data.csv", encoding="ISO-8859-1")
dataset.head()
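
The rest of this listing assumes data.csv is a two-column message dataset, with the class label (e.g. spam/ham) in column 0 and the message text in column 1, as in the SMS Spam Collection.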

import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize as wt

stemmer = PorterStemmer()

data = []

for i in range(dataset.shape[0]):
    sms = dataset.iloc[i, 1]

    # remove non-alphabetic characters
    sms = re.sub('[^A-Za-z]', ' ', sms)

    # make words lowercase, because "Go" and "go" would otherwise be counted as two words
    sms = sms.lower()

    # tokenising
    tokenized_sms = wt(sms)

    # remove stop words and stem the remaining words
    sms_processed = []
    for word in tokenized_sms:
        if word not in set(stopwords.words('english')):
            sms_processed.append(stemmer.stem(word))

    sms_text = " ".join(sms_processed)
    data.append(sms_text)

# creating the feature matrix
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features=1000)
x = matrix.fit_transform(data).toarray()
y = dataset.iloc[:,0]
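
Note: max_features=1000 keeps only the 1000 most frequent terms in the corpus, which caps the dimensionality of the feature matrix x.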

# split train and test data (train_test_split holds out 25% of the rows by default)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)

# predict class
y_pred = classifier.predict(x_test)

# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

accuracy
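
A note on the model choice: GaussianNB runs here only because the counts were densified with .toarray(); for discrete count features, MultinomialNB is the more conventional Naive Bayes variant for text and usually performs better.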

12. Write a Python program to prepare a TF-IDF model.
Code :-

from sklearn.feature_extraction.text import TfidfVectorizer

d0 = "The car is driven on the road"


d1 = "The truck is driven on the highway"
d2 = "The bike is run on road"

string = [d0,d1,d2]

tfidf = TfidfVectorizer()

result = tfidf.fit_transform(string)
result

print("\nword indices : ")


print(tfidf.vocabulary_)

print("\nidf values : ")
for ele1,ele2 in zip(tfidf.get_feature_names(),tfidf.idf_) :
print(ele1,":",ele2)
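
The printed values follow scikit-learn's default smoothed idf formula, idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing t. A quick check by hand for two of the terms above (a sketch, assuming the three documents d0-d2):

import numpy as np

n = 3  # number of documents
for term, df in [("the", 3), ("car", 1)]:
    # smoothed idf, as computed by TfidfVectorizer with default settings
    print(term, ":", np.log((1 + n) / (1 + df)) + 1)
# "the" occurs in all three documents -> ln(4/4) + 1 = 1.0
# "car" occurs in only one document   -> ln(4/2) + 1 = 1.693...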

print("\ntf.idf values : ")


print(result)

print("\ntf.idf values i matrix form : ")
print(result.toarray())

13. Write a Python program to prepare a CountVectorizer model.
Code :-
from sklearn.feature_extraction.text import CountVectorizer

# To create a Count Vectorizer, we simply need to instantiate one.
# There are special parameters we can set here when making the vectorizer,
# but for the most basic example, it is not needed

vectorizer = CountVectorizer()

# For our text, we are going to take some text from our previous blog post
# about count vectorization

sample_text = ["One of the most basic ways we can numerically represent words "
               "is through the one-hot encoding method (also sometimes called "
               "count vectorizing)."]

# To actually create the vectorizer, we simply need to call fit on the text
# data that we wish to fit

vectorizer.fit(sample_text)

# Now, we can inspect how our vectorizer vectorized the text
# This will print out a list of words used, and their index in the vectors
print("Vacabulary : ")
print(vectorizer.vocabulary_)

# If we would like to actually create a vector, we can do so by passing the
# text into the vectorizer to get back counts
vector = vectorizer.transform(sample_text)

# Our final vector :
print("Full vector : ")
print(vector.toarray())

# Or if we wanted to get the vector for one word :
print("Hot vector : ")
print(vectorizer.transform(['hot']).toarray())

# or if we wanted to get multiple vectors at once to build matrices
print("Hot and One : ")
print(vectorizer.transform(['hot','one']).toarray())

# We could also do the whole thing at once with the fit_transform method :
print('One swoop : ')
new_text = ["Today is the day that I do the thing today, today"]
new_vectorizer = CountVectorizer()

print(new_vectorizer.fit_transform(new_text).toarray())
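
With CountVectorizer's default tokenization (tokens of two or more characters, so the lone "I" is dropped), the vocabulary is ['day', 'do', 'is', 'that', 'the', 'thing', 'today'] and the printed matrix should be:

# Expected output:
# [[1 1 1 1 2 1 3]]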

14. Write a Python program to perform text classification with NLTK using a Naive Bayes classifier.
Code :-
import numpy as np
import pandas as pd

df = pd.read_csv("C:\\Users\\Admin\\Downloads\\BBC_News_Train.csv")
df.head()

df.shape

df['Category'].value_counts()

import nltk
from nltk.corpus import stopwords
import string

def text_cleaning(a):
    remove_punctuation = [char for char in a if char not in string.punctuation]
    remove_punctuation = ''.join(remove_punctuation)
    return [word for word in remove_punctuation.split()
            if word.lower() not in stopwords.words('english')]

print(df.iloc[:,1].apply(text_cleaning))

from sklearn.feature_extraction.text import CountVectorizer


bow_transformer = CountVectorizer(analyzer=text_cleaning).fit(df['Text'])
bow_transformer.vocabulary_
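
Passing analyzer=text_cleaning makes CountVectorizer delegate tokenisation to our function, so punctuation removal and stop word filtering happen inside the vectorizer itself rather than as a separate preprocessing pass.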

title_bow = bow_transformer.transform(df['Text'])
print(title_bow)

x = title_bow.toarray()
print(x)
x.shape

from sklearn.feature_extraction.text import TfidfTransformer


tfidf_transformer = TfidfTransformer().fit(title_bow)
print(tfidf_transformer)

title_tfidf = tfidf_transformer.transform(title_bow)
print(title_tfidf)
print(title_tfidf.shape)

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(title_tfidf,df['Category'])

all_predictions = model.predict(title_tfidf)
print(all_predictions)

from sklearn.metrics import confusion_matrix


confusion_matrix(df['Category'],all_predictions)

from sklearn.metrics import classification_report


print(classification_report(df['Category'], all_predictions))
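
Note that these predictions are made on the same documents the model was trained on, so the confusion matrix and classification report measure training fit rather than generalisation; holding out a test split (as in exercise 11) would give a more realistic estimate.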

15. Write a Python program to convert words to features with NLTK.
Code :-
import nltk
nltk.download('movie_reviews')
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

# keep the 3000 most frequent words as features
word_features = [w for (w, count) in all_words.most_common(3000)]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

featuresets = [(find_features(rev),category) for (rev,category) in documents]


featuresets
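
A natural continuation (a sketch, not part of the original listing, assuming the usual 1900/100 split over the 2000 movie reviews): train NLTK's built-in Naive Bayes classifier on these feature sets.

# split the labelled feature sets and train NLTK's Naive Bayes classifier
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Accuracy :", nltk.classify.accuracy(classifier, testing_set))
classifier.show_most_informative_features(15)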
