20BCP123 - NLP Lab Manual
SCHOOL OF TECHNOLOGY
CSE DEPARTMENT
Submitted by: Khushi Shah (20BCP123)
Lab 1: Tokenization
1. Sentence Tokenization:
def extract_sentences(paragraph):
    # Replace sentence-ending punctuation (followed by a space) with a marker, then split.
    paragraph = paragraph.replace("? ", "|")
    paragraph = paragraph.replace("! ", "|")
    paragraph = paragraph.replace(". ", "|")
    sentences = paragraph.split("|")
    return sentences

paragraph = "This is the first sentence. The second one follows? And here comes the third sentence! My email is abc@gmail.com, you can contact here."
sentences = extract_sentences(paragraph)
print(sentences)
Output:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

text = "Dr. Ami Shah is a MD doctor. Please feel free to contact her on ami@gmail.com or etc., many ways are there. " + \
       "Khushi said: 'Today is her birthday. So she is very happy.' So everyone wished her. "
print("Sentence tokenization:", sent_tokenize(text))
Output:
Sentence tokenization: ['Dr. Ami Shah is a MD doctor.', 'Please feel free to contact her on
ami@gmail.com or etc., many ways are there.', "Khushi said: 'Today is her birthday.", "So she is
very happy.'", 'So everyone wished her.']
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

text = "Dr. Ami Shah is a MD doctor. Please feel free to contact her on ami@gmail.com etc. many ways are there. " + \
       "Don't do this. " + \
       "Researcher's said AI will change the word. " + \
       "You can't go there. "
print("Word tokenization:", word_tokenize(text))
Output:
Word tokenization: ['Dr.', 'Ami', 'Shah', 'is', 'a', 'MD', 'doctor', '.', 'Please', 'feel', 'free', 'to',
'contact', 'her', 'on', 'ami', '@', 'gmail.com', 'etc', '.', 'many', 'ways', 'are', 'there', '.', 'Do', "n't", 'do',
'this', '.', 'Researcher', "'s", 'said', 'AI', 'will', 'change', 'the', 'word', '.', 'You', 'ca', "n't", 'go', 'there',
'.']
def word_tokenize(sentence):
    words = []  # Create an empty list to store the words.
    current_word = ""  # Initialize an empty string for the current word.
    for char in sentence:
        # Keep letters, digits, apostrophes and '@' as part of the current word;
        # any other character (space, '.', ',') ends the current word.
        if char.isalnum() or char in ("'", "@"):
            current_word += char
        elif current_word:
            words.append(current_word)
            current_word = ""
    if current_word:
        words.append(current_word)  # Append the last word (if any) to the list.
    return words

sentence = "Dr. Ami is not available today. Please feel free to contact her on ami@gmail.com. Don't come. You can't go there."
tokens = word_tokenize(sentence)
print(tokens)
Output:
['Dr', 'Ami', 'is', 'not', 'available', 'today', 'Please', 'feel', 'free', 'to', 'contact', 'her', 'on', 'ami@gmail',
'com', "Don't", 'come', 'You', "can't", 'go', 'there']
Lab 2 : Stemming and Lemmatization
1. Stemming:
def stemx(word):
    suffix = ['ed', 'es', 'ing', 'ship', 's', 'less', 'ion', 'ly', 'ions', 'full', 'able']
    prefix = ['un', 'pre', 'dis', 're', 'il', 'im', 'in', 'de', 'mis', 'sub', 'non', 'anti']
    # Strip any known prefix, then any known suffix.
    for x in prefix:
        if word.startswith(x):
            word = word[len(x):]
    for y in suffix:
        if word.endswith(y):
            word = word[:-len(y)]
    return word
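The output below was produced by running stemx over a small word list; the exact driver is not shown, so this is a minimal sketch with the words taken from that output:

words = ['going', 'friends', 'friendship', 'happily', 'friendly', 'programming']
for w in words:
    print(w, " : ", stemx(w))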
Output:
going : go
friends : friend
friendship : friend
happily : happi
friendly : friend
programming : programm
from nltk.stem import PorterStemmer

ps = PorterStemmer()
# Word list inferred from the output below.
words = ['friendship', 'programs', 'programmer', 'programming', 'programmers', 'predefined', 'happily']
for w in words:
    print(w, " : ", ps.stem(w))
Output:
friendship : friendship
programs : program
programmer : programm
programming : program
programmers : programm
predefined : predefin
happily : happili
from PyDictionary import PyDictionary

class BasicLemmatizer:
    def __init__(self):
        self.dictionary = PyDictionary()
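    # The rest of the class is not shown; presumably it exposes a lookup-based
    # lemmatize method roughly like the following sketch (illustrative mapping,
    # not the original PyDictionary-based logic):
    def lemmatize(self, word):
        irregular = {'went': 'go', 'bought': 'buy', 'mice': 'mouse', 'feet': 'foot'}
        return irregular.get(word.lower(), word)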
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words1 = ['smiling', 'died', 'purchased', 'went', 'bought']
for w in words1:
    print(w + " : " + lemmatizer.lemmatize(w, pos="v"))  # v - verb
Output:
smiling : smile
died : die
purchased : purchase
went : go
bought : buy
better : good
worst : bad
happiest : happy
babies : baby
mice : mouse
feet : foot
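The adjective and noun lemmas above ('better : good' through 'feet : foot') are presumably produced by further lemmatize calls with the appropriate part-of-speech tags, for example:

for w in ['better', 'worst', 'happiest']:
    print(w + " : " + lemmatizer.lemmatize(w, pos="a"))  # a - adjective
for w in ['babies', 'mice', 'feet']:
    print(w + " : " + lemmatizer.lemmatize(w))           # default pos is noun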
5. Porter Stemmer:
class PorterStemmer:
    def __init__(self):
        pass

    # m: count of VC pairs in the word's vowel/consonant pattern.
    def _measure(self, word):
        return ''.join(['V' if char in "aeiou" else 'C' for char in word]).count('VC')

def test_stemmer():
    stemmer = PorterStemmer()
    # Sample words are illustrative; report the measure m for each.
    results = {w: stemmer._measure(w) for w in ['tree', 'trouble', 'oats', 'trees', 'by']}
    print(results)

test_stemmer()
Output:
Lab 3 : POS Tagging using Hidden Markov Model (Viterbi)
Code:
train_li_words = ['']
train_li_words *= num_words_train
train_li_tags = ['']
train_li_tags *= num_words_train

for i in range(num_words_train):
    temp_li = tr_li[i].split("/")
    train_li_words[i] = temp_li[0]
    train_li_tags[i] = temp_li[1]
dict2_tag_follow_tag_ = {}
dict2_word_tag = {}

for i in range(num_words_train-1):
    outer_key = train_li_tags[i]
    inner_key = train_li_tags[i+1]
    dict2_tag_follow_tag_[outer_key] = dict2_tag_follow_tag_.get(outer_key, {})
    dict2_tag_follow_tag_[outer_key][inner_key] = dict2_tag_follow_tag_[outer_key].get(inner_key, 0)
    dict2_tag_follow_tag_[outer_key][inner_key] += 1

    outer_key = train_li_words[i]
    inner_key = train_li_tags[i]
    dict2_word_tag[outer_key] = dict2_word_tag.get(outer_key, {})
    dict2_word_tag[outer_key][inner_key] = dict2_word_tag[outer_key].get(inner_key, 0)
    dict2_word_tag[outer_key][inner_key] += 1

# Treat the sentence-final '.' tag as the start state for the first word.
dict2_tag_follow_tag_['.'] = dict2_tag_follow_tag_.get('.', {})
dict2_tag_follow_tag_['.'][train_li_tags[0]] = dict2_tag_follow_tag_['.'].get(train_li_tags[0], 0)
dict2_tag_follow_tag_['.'][train_li_tags[0]] += 1

last_index = num_words_train-1
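Between the counting above and the testing phase below, the count dictionaries are evidently normalised into probabilities sorted by tag, and a baseline most-frequent-tag lookup is built, because the test loop indexes them that way; none of that code survives here, so the following is only a sketch of the assumed step:

# Assumed normalisation: turn raw counts into probabilities and sort by tag,
# since the Viterbi loop below walks both lists as sorted (tag, prob) tuples.
for outer_key in dict2_tag_follow_tag_:
    total = sum(dict2_tag_follow_tag_[outer_key].values())
    dict2_tag_follow_tag_[outer_key] = sorted(
        (tag, count / total) for tag, count in dict2_tag_follow_tag_[outer_key].items())

for outer_key in dict2_word_tag:
    total = sum(dict2_word_tag[outer_key].values())
    dict2_word_tag[outer_key] = sorted(
        (tag, count / total) for tag, count in dict2_word_tag[outer_key].items())

# Assumed baseline: tag each known word with its most frequent training tag.
dict_word_tag_baseline = {word: max(tags, key=lambda t: t[1])[0]
                          for word, tags in dict2_word_tag.items()}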
###Testing Phase###
te_li = te_str.split()
num_words_test = len(te_li)
test_li_words = ['']
test_li_words*= num_words_test
test_li_tags = ['']
test_li_tags*= num_words_test
output_li = ['']
output_li*= num_words_test
output_li_baseline = ['']
output_li_baseline*= num_words_test
num_errors = 0
num_errors_baseline = 0
from operator import itemgetter

for i in range(num_words_test):
    temp_li = te_li[i].split("/")
    test_li_words[i] = temp_li[0]
    test_li_tags[i] = temp_li[1]

    # Baseline: tag each word with its most frequent training tag.
    output_li_baseline[i] = dict_word_tag_baseline.get(temp_li[0], '')
    # If unknown word - tag = 'NNP'
    if output_li_baseline[i] == '':
        output_li_baseline[i] = 'NNP'
    if output_li_baseline[i] != test_li_tags[i]:
        num_errors_baseline += 1

    # Accounting for the 1st word in the test document for the Viterbi
    if i == 0:
        di_transition_probs = dict2_tag_follow_tag_['.']
    else:
        di_transition_probs = dict2_tag_follow_tag_[output_li[i-1]]
    di_emission_probs = dict2_word_tag.get(test_li_words[i], '')

    if di_emission_probs == '':  # Unknown word for the Viterbi tagger as well - tag = 'NNP'
        output_li[i] = 'NNP'
    else:
        max_prod_prob = 0
        counter_trans = 0
        counter_emis = 0
        prod_prob = 0
        # Both lists are sorted by tag, so walk them like a sorted merge.
        while counter_trans < len(di_transition_probs) and counter_emis < len(di_emission_probs):
            tag_tr = di_transition_probs[counter_trans][0]
            tag_em = di_emission_probs[counter_emis][0]
            if tag_tr < tag_em:
                counter_trans += 1
            elif tag_tr > tag_em:
                counter_emis += 1
            else:
                prod_prob = di_transition_probs[counter_trans][1] * di_emission_probs[counter_emis][1]
                if prod_prob > max_prod_prob:
                    max_prod_prob = prod_prob
                    output_li[i] = tag_tr
                    # print("i=", i, " and output=", output_li[i])
                counter_trans += 1
                counter_emis += 1
        # In case there are no matching entries between the transition tags and
        # emission tags, we choose the most frequent emission tag.
        if output_li[i] == '':
            output_li[i] = max(di_emission_probs, key=itemgetter(1))[0]

    if output_li[i] != test_li_tags[i]:
        num_errors += 1
print("Correct tags:",test_li_tags)
Output:
Lab 4 : Sentiment Analysis using Naive Bayes Classifier
import nltk
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))
Senti_tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    Senti_tweets.append((words_filtered, sentiment))
print(Senti_tweets)
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
word_features=get_word_features(get_words_in_tweets(Senti_tweets))
print(word_features)
word_features_filtered = []
for w in word_features:
    if w not in stopwords:
        word_features_filtered.append(w)
print(word_features_filtered)
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features_filtered:
        features['contains(%s)' % word] = (word in document_words)
    return features

training_set = nltk.classify.apply_features(extract_features, Senti_tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)
Output:
[(['not', 'impossible'], 'positive'), (['you', 'are', 'lovely', 'friend'], 'Positive'), (['she', 'beautiful',
'girl'], 'Positive'), (['looking', 'handsome'], 'Positive'), (['exercise', 'good', 'for', 'health'], 'Positive'),
(["today's", 'weather', 'fantastic'], 'Positive'), (['love', 'mango'], 'Positive'), (['you', 'are', 'enemy',
'friend'], 'Negative'), (['she', 'looking', 'ugly'], 'Negative'), (['looking', 'horrible'], 'Negative'),
(['sleeping', 'more', 'makes', 'you', 'lazy'], 'Negative'), (["today's", 'weather', 'very', 'bad'],
'Negative'), (['hate', 'banana'], 'Negative')]
dict_keys(['not', 'impossible', 'you', 'are', 'lovely', 'friend', 'she', 'beautiful', 'girl', 'looking',
'handsome', 'exercise', 'good', 'for', 'health', "today's", 'weather', 'fantastic', 'love', 'mango',
'enemy', 'ugly', 'horrible', 'sleeping', 'more', 'makes', 'lazy', 'very', 'bad', 'hate', 'banana'])
['impossible', 'lovely', 'friend', 'beautiful', 'girl', 'looking', 'handsome', 'exercise', 'good', 'health',
"today's", 'weather', 'fantastic', 'love', 'mango', 'enemy', 'ugly', 'horrible', 'sleeping', 'makes', 'lazy',
'bad', 'hate', 'banana']
import pandas as pd
df = pd.read_csv('full-corpus.csv')
df.head()
import re
import nltk
from nltk.corpus import stopwords

def preprocess(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # Convert URLs to the word URL
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # Convert @username to AT_USER
    tweet = re.sub(r'[\s]+', ' ', tweet)  # Remove additional whitespaces
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # Replace #word with word
    tweet = tweet.strip('\'"')  # Trim surrounding quotes
    return tweet
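For example, on an illustrative raw tweet the preprocessing above lowercases the text, masks the URL and the mention, and strips the hashtag symbol:

print(preprocess("Loving the new phone! https://example.com @ami #awesome"))
# -> loving the new phone! URL AT_USER awesome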
# Word tokenizer
def tokenize(tweet):
    return nltk.word_tokenize(tweet)
# Extract features
def extract_features(document):
    document_words = set(tokenize(document))
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
import nltk
nltk.download('punkt')

# Test the classifier (optional, if you want to test with a sample tweet)
test_tweet = "It’s easy to be brave when you’re hiding behind a keyboard. You and your Hamas friends will regret your barbaric actions very soon."
print(classifier.classify(extract_features(preprocess(test_tweet))))
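The accuracy, precision and recall printed below are not computed anywhere in the surviving code; they presumably come from evaluating the classifier on a labelled test set. A minimal sketch, assuming a hypothetical list of (tweet, label) pairs called test_data and scikit-learn's metrics:

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Hypothetical held-out set of (tweet, label) pairs.
actual = [label for (_, label) in test_data]
predictions = [classifier.classify(extract_features(preprocess(tweet))) for (tweet, _) in test_data]

accuracy = accuracy_score(actual, predictions)
precision = precision_score(actual, predictions, average='weighted', zero_division=0)
recall = recall_score(actual, predictions, average='weighted', zero_division=0)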
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
# If you want a detailed report for each class (Positive, Negative, etc.)
print(classification_report(actual, predictions))
Output:
Negative
Lab 5 : Turney’s method of Sentiment Analysis
Code:
import math
import re
import json
import nltk
# nltk.download("averaged_perceptron_tagger", quiet=True)
def loadReviews(fileName):
    file = open(fileName)
    list_pos = []
    list_neg = []
    data = json.load(file)
    for elem in data:
        if float(elem["overall"]) >= 3.0:
            list_pos.append(elem["reviewText"])
        else:
            list_neg.append(elem["reviewText"])
    file.close()
    return list_pos, list_neg
def make_datasets(fileName):
    all_positive_reviews, all_negative_reviews = loadReviews(fileName)
    dataset = {"train": {"neg": [], "pos": []}, "test": {"neg": [], "pos": []}}
    dataset["train"]["pos"] = all_positive_reviews[:20000]
    dataset["train"]["neg"] = all_negative_reviews[:20000]
    dataset["test"]["pos"] = all_positive_reviews[-50:]
    dataset["test"]["neg"] = all_negative_reviews[-50:]
    return dataset
def find_pattern(postag):
    tag_pattern = []
    for k in range(len(postag) - 2):
        if postag[k][1] == "JJ" and (
            postag[k + 1][1] == "NN" or postag[k + 1][1] == "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
        elif (
            (postag[k][1] == "RB" or postag[k][1] == "RBR" or postag[k][1] == "RBS")
            and postag[k + 1][1] == "JJ"
            and postag[k + 2][1] != "NN"
            and postag[k + 2][1] != "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
        elif (
            postag[k][1] == "JJ"
            and postag[k + 1][1] == "JJ"
            and postag[k + 2][1] != "NN"
            and postag[k + 2][1] != "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
        elif (
            (postag[k][1] == "NN" or postag[k][1] == "NNS")
            and postag[k + 1][1] == "JJ"
            and postag[k + 2][1] != "NN"
            and postag[k + 2][1] != "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
        elif (
            postag[k][1] == "RB" or postag[k][1] == "RBR" or postag[k][1] == "RBS"
        ) and (
            postag[k + 1][1] == "VB"
            or postag[k + 1][1] == "VBD"
            or postag[k + 1][1] == "VBN"
            or postag[k + 1][1] == "VBG"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
    return tag_pattern
class Turney(object):
    def __init__(self, dataset):
        self.datasets = dataset
        self.pos_phrases_hits = []
        self.neg_phrases_hits = []
        self.pos_hits = 0.01
        self.neg_hits = 0.01
        self.accuracy = 0

    def turney(self):
        tp = 0
        fp = 0
        tn = 0
        fn = 0
        for boolean, test_klass in enumerate(["pos", "neg"]):
            for i, data in enumerate(self.datasets["test"][test_klass]):
                print(
                    str(i)
                    + " out of "
                    + str(len(self.datasets["test"][test_klass]))
                    + " --> round "
                    + str(boolean)
                )
                phrases = find_pattern(nltk.pos_tag(nltk.word_tokenize(data)))
                if len(phrases) == 0:
                    continue
                self.pos_phrases_hits = [0.01] * len(phrases)
                self.neg_phrases_hits = [0.01] * len(phrases)
                self.pos_hits = 0.01
                self.neg_hits = 0.01
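The rest of the turney method is cut off above. Turney's method scores each extracted phrase by its association with the seed words "excellent" and "poor" (pointwise mutual information estimated from hit counts) and classifies the review by the sign of the average semantic orientation. A minimal sketch of that missing scoring step, written as a method inside the Turney class and assuming the hit-count lists have been filled in:

    # Sketch of the semantic-orientation step (assumed; not part of the surviving code).
    def semantic_orientation(self):
        so_total = 0.0
        for k in range(len(self.pos_phrases_hits)):
            # SO(phrase) = log2( hits(phrase NEAR "excellent") * hits("poor")
            #                    / (hits(phrase NEAR "poor") * hits("excellent")) )
            so_total += math.log2(
                (self.pos_phrases_hits[k] * self.neg_hits)
                / (self.neg_phrases_hits[k] * self.pos_hits))
        # A positive average orientation predicts "pos", otherwise "neg".
        return "pos" if so_total / len(self.pos_phrases_hits) > 0 else "neg"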
Output:
Lab 6 : Text Summarization
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()
    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = ps.stem(word.lower())
            if word in stopWords:
                continue
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
        frequency_matrix[sent[:15]] = freq_table
    return frequency_matrix
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}
    for sent, f_table in freq_matrix.items():
        tf_table = {}
        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence
        tf_matrix[sent] = tf_table
    return tf_matrix
def _create_documents_per_words(freq_matrix):
    # Count in how many sentences ("documents") each word appears.
    word_per_doc_table = {}
    for sent, f_table in freq_matrix.items():
        for word in f_table.keys():
            word_per_doc_table[word] = word_per_doc_table.get(word, 0) + 1
    return word_per_doc_table

def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}
    for sent, f_table in freq_matrix.items():
        idf_table = {word: math.log10(total_documents / float(count_doc_per_words[word])) for word in f_table}
        idf_matrix[sent] = idf_table
    return idf_matrix

def _score_sentences(tf_idf_matrix):
    sentenceValue = {}
    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0
        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score
        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence
    return sentenceValue

def _find_average_score(sentenceValue):
    average = sum(sentenceValue.values()) / len(sentenceValue)
    return average
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''
    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= threshold:
            summary += " " + sentence
            sentence_count += 1
    return summary
# 1 Sentence Tokenization
sentences = sent_tokenize(text)
total_documents = len(sentences)
print(total_documents)
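Only step 1 of the summarization pipeline survives above; the remaining steps presumably chain the helper functions together, roughly as follows (the 1.3 threshold factor is an assumption):

# Assumed continuation of the pipeline using the helpers defined above.
freq_matrix = _create_frequency_matrix(sentences)
tf_matrix = _create_tf_matrix(freq_matrix)
count_doc_per_words = _create_documents_per_words(freq_matrix)
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

# TF-IDF score = TF * IDF for every word in every sentence.
tf_idf_matrix = {sent: {word: tf_matrix[sent][word] * idf_matrix[sent][word] for word in f_table}
                 for sent, f_table in tf_matrix.items()}

sentence_scores = _score_sentences(tf_idf_matrix)
threshold = _find_average_score(sentence_scores)
summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
print(summary)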
Output:
41
Israel has published photos of dead infants after the terror attack. But the reports are still
unconfirmed, and in some cases have been retracted. Follow-ups that retract or add context are
less likely to be repeated or reach the same audience. It may turn out that the slaughter was done
in a particularly barbaric way. The video has been viewed more than 11 million times on X,
according to its view counter. Baby stories are very emotive.
Lab 7 : Multi document text summarization
articles = []
import pandas as pd
import numpy as np
articles.append({"Title": "Article1", "Content": """ """})
articles.append({"Title": "Article2", "Content": """ """})
articles.append({"Title": "Article3", "Content": """ """})
articles.append({"Title": "Article4", "Content": """ """})
articles.append({"Title": "Article5", "Content": """ """})
articles.append({"Title": "Article6", "Content": """ """})
articles.append({"Title": "Article7", "Content": """ """})
df = pd.DataFrame(articles)
print(df)
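The code below relies on a fitted TF-IDF vectorizer and an article-level TF-IDF matrix that are not shown; a minimal sketch of that missing setup, assuming scikit-learn, is:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Assumed setup: one TF-IDF vector per full article.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Content'])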
# Extract the most representative sentence from each article (as we did before)
summaries = []
for idx, article in enumerate(df['Content']):
    sentences = article.split('. ')
    sentence_vectors = tfidf_vectorizer.transform(sentences)
    cosine_similarities_sentences = linear_kernel(tfidf_matrix[idx:idx+1], sentence_vectors).flatten()
    top_sentence_idx = cosine_similarities_sentences.argsort()[-1]
    top_sentence = sentences[top_sentence_idx]
    summaries.append(top_sentence)

# Create a merged summary by concatenating the most representative sentences from each article
merged_summary = '. '.join(summaries)
print("Merged Summary:")
print(merged_summary)
Output:
Title Content
0 Article1 Unverified reports of ‘40 babies beheaded’ in ...
1 Article2 The 'horrendous toll' on children caught in th...
2 Article3 What we actually know about the viral report o...
3 Article4 Israel releases horrific images of slain child...
4 Article5 ‘I would see and have confirmed pictures of te...
5 Article6 At least 40 babies killed, beheaded in Israeli...
6 Article7 ‘I would see and have confirmed pictures of te…
Title Summary
0 Article1 In another tweet, Zedeck wrote that soldiers t...
1 Article2 "Depriving children of access to food and esse...
2 Article3 What happened in Kibbutz Kfar Aza is a massacr...
3 Article4 Secretary of State Antony Blinken and NATO def...
4 Article5 I never thought I’d ever — anyway.” He did not...
5 Article6 At least 40 babies killed, beheaded in Israeli...
6 Article7 I never thought I’d ever — anyway.” He did not...
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
Output:
Tokens in Doc_1: ['Unverified', 'reports', 'of', '‘', '40', 'babies', 'beheaded', '’', 'in', 'Israel-Hamas', 'war', 'inflame', 'social', 'media', 'No', 'photo', 'evidence', 'had', 'been', 'made', 'public', 'as', 'of', '.', ...]
Tokens in Doc_2: ['The', "'horrendous", 'toll', "'", 'on', 'children', 'caught', 'in', 'the', 'Israel-Gaza', 'conflict', 'Hundreds', 'of', 'children', 'have', 'been', 'killed', 'so', 'far', ',', 'with', 'the', 'true', 'total', 'Russell', 'said', ...]
Tokens in Doc_3: ['What', 'we', 'actually', 'know', 'about', 'the', 'viral', 'report', 'of', 'beheaded', 'babies', 'in', 'Israel', 'One', 'journalist', 'from', 'the', 'Tel', 'Aviv-based', 'news', 'channel', 'i24', 'said', 'war', '.', ...]
Tokens in Doc_4: ['Israel', 'releases', 'horrific', 'images', 'of', 'slain', 'children', 'after', 'Hamas', 'attack', 'JERUSALEM', '/', 'TEL', 'AVIV', '/', 'BRUSSELS', '-', 'CONTENT', 'WARNING', ':', 'right', 'things', '.', ...]
Tokens in Doc_5: ['‘', 'I', 'would', 'see', 'and', 'have', 'confirmed', 'pictures', 'of', 'terrorists', 'beheading', 'children', ',', '’', 'Joe', 'Biden', 'decries', 'Hamas', 'atrocity', 'in', 'Israel', 'Reports', 'attack', ...]
Tokens in Doc_6: ['At', 'least', '40', 'babies', 'killed', ',', 'beheaded', 'in', 'Israeli', 'kibbutz', 'outside', 'Gaza', 'Strip', ',', 'reports', 'say', 'KFAR', 'AZA', ',', 'Israel', '(', 'TND', ')', '—', 'Dozens', 'of', 'Monday', '.', ...]
Tokens in Doc_7: ['‘', 'I', 'would', 'see', 'and', 'have', 'confirmed', 'pictures', 'of', 'terrorists', 'beheading', 'children', ',', '’', 'Joe', 'Biden', 'decries', 'Hamas', 'atrocity', 'in', 'Israel', 'Reports', 'the', 'attack', ...]
contentious,: [1, 0, 0, 0, 0, 0, 0]
personnel: [0, 1, 0, 0, 0, 0, 0]
photo: [1, 0, 0, 1, 0, 0, 0]
expressed: [0, 0, 0, 0, 1, 0, 1]
body: [0, 0, 1, 1, 0, 0, 0]
killing,: [0, 1, 0, 0, 0, 0, 0]
particularly: [1, 0, 0, 1, 0, 0, 0]
spoke: [1, 1, 1, 0, 1, 0, 1]
Coordination: [0, 1, 0, 0, 0, 0, 0]
"unconfirmed".: [0, 0, 1, 0, 0, 0, 0]
caught: [0, 1, 0, 0, 0, 0, 0]
I'm: [0, 1, 0, 0, 0, 1, 0]
reposts.: [1, 0, 0, 0, 0, 0, 0]
"Please: [0, 0, 1, 0, 0, 0, 0]
still: [1, 1, 0, 0, 0, 1, 0]
thousands: [0, 0, 0, 1, 0, 0, 0]
targeted: [0, 0, 0, 1, 0, 0, 0]
linking: [0, 0, 1, 0, 0, 0, 0]
report.: [1, 0, 0, 0, 1, 0, 1]
babies”: [1, 0, 0, 0, 0, 0, 0]
Unverified: [1, 0, 0, 0, 0, 0, 0]
called: [0, 1, 0, 0, 0, 0, 0]
comparing: [0, 0, 0, 1, 0, 0, 0]
"To: [0, 0, 1, 0, 0, 0, 0]
bitter: [1, 0, 0, 0, 0, 0, 0]
260: [0, 1, 0, 0, 0, 0, 0]
small: [0, 0, 1, 0, 0, 1, 0]
Kibbutz: [0, 0, 1, 0, 0, 1, 0]
retract: [1, 0, 0, 0, 0, 0, 0]
decapitated: [1, 0, 0, 0, 1, 0, 1]
another: [1, 1, 1, 0, 1, 0, 1]
war: [1, 1, 1, 0, 0, 1, 0]
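The v1 and v2 used below are never defined in the surviving code; from the printed values they appear to be the binary document-incidence vectors of two terms from the table above (for example 'personnel' and 'war'), so that v1 & v2 marks documents containing both terms and v1 | v2 documents containing either. A minimal sketch, assuming NumPy arrays:

import numpy as np

# Hypothetical incidence vectors taken from the word/document table above.
v1 = np.array([0, 1, 0, 0, 0, 0, 0])  # e.g. 'personnel'
v2 = np.array([1, 1, 1, 0, 0, 1, 0])  # e.g. 'war'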
print(v1)
print(v2)
print('----------------')
v3 = v1 & v2
print(v3)
[0 1 0 0 0 0 0]
[1 1 1 0 0 1 0]
[0 1 0 0 0 0 0]
print(v1)
print(v2)
print('----------------')
v = v1 | v2
print(v)
[0 1 0 0 0 0 0]
[1 1 1 0 0 1 0]
[1 1 1 0 0 1 0]