
PANDIT DEENDAYAL ENERGY UNIVERSITY

SCHOOL OF TECHNOLOGY

CSE DEPARTMENT

Natural Language Processing Lab Manual

Submitted to,
Dr. Santosh Bharti

Submitted by,
Khushi Shah
(20BCP123)
Lab 1: Tokenization

1. Sentence Tokenization:

def extract_sentences(paragraph):
    paragraph = paragraph.replace("? ", "|")
    paragraph = paragraph.replace("! ", "|")
    paragraph = paragraph.replace(". ", "|")
    sentences = paragraph.split("|")

    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

    return sentences

paragraph = ("This is the first sentence. The second one follows? And here comes "
             "the third sentence! My email is abc@gmail.com, you can contact here.")
sentences = extract_sentences(paragraph)

for idx, sentence in enumerate(sentences, start=1):
    print(f"Sentence {idx}: {sentence}")

Output:

2. Sentence Tokenization using NLTK:

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

text = "Dr. Ami Shah is a MD doctor. Please feel free to contact her on
ami@gmail.com or etc., many ways are there. " + \
"Khushi said: 'Today is her birthday. So she is very happy.' So everyone
wished her. "

print("Sentence tokenization: ", sent_tokenize(text))

Output:

Sentence tokenization: ['Dr. Ami Shah is a MD doctor.', 'Please feel free to contact her on
ami@gmail.com or etc., many ways are there.', "Khushi said: 'Today is her birthday.", "So she is
very happy.'", 'So everyone wished her.']

3. Word Tokenization using punkt:

import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

text = "Dr. Ami Shah is a MD doctor. Please feel free to contact her on
ami@gmail.com etc. many ways are there. " + \
"Don't do this. " + \
"Researcher's said AI will change the word. " + \
"You can't go there. "

print("\n\nWord tokenization: " ,word_tokenize(text))

Output:

Word tokenization: ['Dr.', 'Ami', 'Shah', 'is', 'a', 'MD', 'doctor', '.', 'Please', 'feel', 'free', 'to',
'contact', 'her', 'on', 'ami', '@', 'gmail.com', 'etc', '.', 'many', 'ways', 'are', 'there', '.', 'Do', "n't", 'do',
'this', '.', 'Researcher', "'s", 'said', 'AI', 'will', 'change', 'the', 'word', '.', 'You', 'ca', "n't", 'go', 'there',
'.']

4. Word tokenization without NLTK:

def word_tokenize(sentence):
    words = []         # Create an empty list to store the words.
    current_word = ""  # Initialize an empty string for the current word.

    for char in sentence:
        if char.isalnum() or char == "'" or char == "@":  # Keep letters, digits, apostrophes and '@'.
            current_word += char            # Add the character to the current word.
        else:
            if current_word:                # Check if there's a current word (not empty).
                words.append(current_word)  # Append the current word to the list of words.
            current_word = ""               # Reset the current word.

    if current_word:
        words.append(current_word)          # Append the last word (if any) to the list.

    return words

sentence = ("Dr. Ami is not available today. Please feel free to contact her on "
            "ami@gmail.com. Don't come. You can't go there.")
tokens = word_tokenize(sentence)
print(tokens)

Output:

['Dr', 'Ami', 'is', 'not', 'available', 'today', 'Please', 'feel', 'free', 'to', 'contact', 'her', 'on', 'ami@gmail',
'com', "Don't", 'come', 'You', "can't", 'go', 'there']
Lab 2 : Stemming and Lemmatization

1. Stemming:

def stemx(word):
    suffix = ['ed', 'es', 'ing', 'ship', 's', 'less', 'ion', 'ly', 'ions',
              'full', 'able']
    prefix = ['un', 'pre', 'dis', 're', 'il', 'im', 'in', 'de', 'mis', 'sub',
              'non', 'anti']

    for x in prefix:
        if word.startswith(x):
            word = word[len(x):]

    for y in suffix:
        if word.endswith(y):
            word = word[:-len(y)]

    return word

words = ['going', 'friends', 'friendship', 'happily', 'friendly', 'programming']
for word in words:
    print(word + " : ", stemx(word))

Output:

going : go
friends : friend
friendship : friend
happily : happi
friendly : friend
programming : programm

2. Stemming using Porter Stemmer from NLTK:

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

words = ["friendship", "programs", "programmer", "programming",
         "programmers", "predefined", "happily"]

for w in words:
    print(w, " : ", ps.stem(w))

Output:

friendship : friendship
programs : program
programmer : programm
programming : program
programmers : programm
predefined : predefin
happily : happili

3. Lemmatization without NLTK:

from PyDictionary import PyDictionary

class BasicLemmatizer:
    def __init__(self):
        self.dictionary = PyDictionary()

    def lemmatize(self, word):
        synonyms = self.dictionary.synonym(word)
        if not synonyms:
            return word
        # If the word itself is in the synonyms list, return it
        if word in synonyms:
            return word
        # Otherwise, return the first synonym as a potential base form
        return synonyms[0]

words = ['smiling', 'died', 'purchased', 'went', 'bought', 'better', 'mice']
lemmatizer = BasicLemmatizer()
for word in words:
    print(f"Lemmatized word (basic): {lemmatizer.lemmatize(word)}")

4. Lemmatization using NLTK:

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words1 = ['smiling', 'died', 'purchased', 'went', 'bought']

for w in words1:
    print(w + " : " + lemmatizer.lemmatize(w, pos="v"))  # v - verb

words2 = ['better', 'worst', 'happiest']

for x in words2:
    print(x + " : ", lemmatizer.lemmatize(x, pos="a"))  # a - adjective

words3 = ['babies', 'mice', 'feet']

for y in words3:
    print(y + " : ", lemmatizer.lemmatize(y, pos="n"))  # n - noun

Output:

smiling : smile
died : die
purchased : purchase
went : go
bought : buy
better : good
worst : bad
happiest : happy
babies : baby
mice : mouse
feet : foot

5. Porter Stemmer:

class PorterStemmer:
    def __init__(self):
        pass

    def _contains_vowel(self, word):
        return any(char in "aeiou" for char in word)

    # m: count of VC pairs
    def _measure(self, word):
        return ''.join(['V' if char in "aeiou" else 'C' for char in word]).count('VC')

    def _double_consonant(self, word):
        if len(word) >= 2 and word[-1] == word[-2] and word[-1] not in "aeiou":
            return True
        return False

    def _replace_suffix(self, word, old, new):
        if word.endswith(old):
            return word[:-len(old)] + new
        return word

    def _ends_with_cvc(self, word):
        if len(word) < 3:
            return False
        consonants = "bcdfghjklmnpqrstvwxyz"
        vowels = "aeiou"
        return word[-3] in consonants and word[-2] in vowels and word[-1] in consonants

    def _step1(self, word):
        if word.endswith("sses"):
            return word[:-4] + "ss"
        elif word.endswith("ies"):
            return word[:-3] + "i"
        elif word.endswith("ss"):
            return word
        elif word.endswith("s"):
            return word[:-1]
        return word

    def _step2(self, word):
        if word.endswith("eed"):
            stem = word[:-3]
            if self._measure(stem) > 0:
                return stem + "ee"
        elif word.endswith("ed"):
            stem = word[:-2]
            if self._contains_vowel(stem):
                return stem
        elif word.endswith("ing"):
            stem = word[:-3]
            if self._contains_vowel(stem):
                return stem
        return word

    def _step3(self, word):
        if word.endswith("y") and self._contains_vowel(word[:-1]):
            return word[:-1] + "i"
        return word

    def _step4(self, word):
        suffixes = {
            "ational": "ate",
            "tional": "tion",
            "izer": "ize",
            "abli": "able",
            "eli": "e",
            "ization": "ize",
            "ation": "ate",
            "biliti": "ble",
        }
        for key in suffixes:
            if word.endswith(key):
                stem = word[:-len(key)]
                if self._measure(stem) > 0:
                    return stem + suffixes[key]
        return word

    def _step5(self, word):
        suffixes = {
            "icate": "ic",
            "ative": "",
            "alize": "al",
            "iciti": "ic",
            "ical": "ic",
            "ful": "",
            "ness": ""
        }
        for key in suffixes:
            if word.endswith(key):
                stem = word[:-len(key)]
                if self._measure(stem) > 0:
                    return stem + suffixes[key]
        return word

    def _step6(self, word):
        suffixes = ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement",
                    "ment", "ent", "ou", "ism", "ate", "iti", "ous", "ive", "ize"]
        for suffix in suffixes:
            if word.endswith(suffix):
                stem = word[:-len(suffix)]
                if self._measure(stem) > 1:
                    return stem
        return word

    def _step7a(self, word):
        if word.endswith("e"):
            stem = word[:-1]
            if self._measure(stem) > 1:
                return stem
            elif self._measure(stem) == 1 and not self._ends_with_cvc(stem):
                return stem
        return word

    def _step7b(self, word):
        if self._measure(word) > 1 and self._double_consonant(word) and word.endswith("l"):
            return word[:-1]
        return word

    def stem(self, word):
        word = self._step1(word)
        word = self._step2(word)
        word = self._step3(word)
        word = self._step4(word)
        word = self._step5(word)
        word = self._step6(word)
        word = self._step7a(word)
        word = self._step7b(word)
        return word

def test_stemmer():
    stemmer = PorterStemmer()

    words = ["computers", "singing", "controlling", "generalizations",
             "elephants", "doing", "relational", "caresses"]
    results = []

    for word in words:
        results.append((word, stemmer.stem(word)))

    print(results)

test_stemmer()

Output:

[('computers', 'comput'), ('singing', 'sing'), ('controlling', 'control'), ('generalizations', 'gener'),
('elephants', 'eleph'), ('doing', 'do'), ('relational', 'relat'), ('caresses', 'caress')]
Lab 3 : POS Tagging

Code:

from __future__ import division  # To avoid integer division
from operator import itemgetter

### Training Phase ###

with open("wsj_training.txt", "r") as myfile:
    tr_str = myfile.read()

tr_li = tr_str.split()
num_words_train = len(tr_li)

train_li_words = ['']
train_li_words *= num_words_train

train_li_tags = ['']
train_li_tags *= num_words_train

for i in range(num_words_train):
    temp_li = tr_li[i].split("/")
    train_li_words[i] = temp_li[0]
    train_li_tags[i] = temp_li[1]
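
The WSJ training and test files are not included in the manual. The parsing above assumes whitespace-separated word/tag tokens; a small illustrative check with a made-up line (not the actual corpus):

# Hypothetical sample line in the assumed wsj_training.txt / wsj_test.txt format
sample = "The/DT stock/NN market/NN fell/VBD sharply/RB ./."
print([tok.split("/") for tok in sample.split()])
# [['The', 'DT'], ['stock', 'NN'], ['market', 'NN'], ['fell', 'VBD'], ['sharply', 'RB'], ['.', '.']]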

dict2_tag_follow_tag_ = {}    # tag -> {next tag: count}, later normalised to transition probabilities

dict2_word_tag = {}           # word -> {tag: count}, later normalised to emission probabilities

dict_word_tag_baseline = {}   # word -> most frequent tag (baseline tagger)

for i in range(num_words_train - 1):
    outer_key = train_li_tags[i]
    inner_key = train_li_tags[i + 1]
    dict2_tag_follow_tag_[outer_key] = dict2_tag_follow_tag_.get(outer_key, {})
    dict2_tag_follow_tag_[outer_key][inner_key] = dict2_tag_follow_tag_[outer_key].get(inner_key, 0)
    dict2_tag_follow_tag_[outer_key][inner_key] += 1

    outer_key = train_li_words[i]
    inner_key = train_li_tags[i]
    dict2_word_tag[outer_key] = dict2_word_tag.get(outer_key, {})
    dict2_word_tag[outer_key][inner_key] = dict2_word_tag[outer_key].get(inner_key, 0)
    dict2_word_tag[outer_key][inner_key] += 1

dict2_tag_follow_tag_['.'] = dict2_tag_follow_tag_.get('.', {})
dict2_tag_follow_tag_['.'][train_li_tags[0]] = dict2_tag_follow_tag_['.'].get(train_li_tags[0], 0)
dict2_tag_follow_tag_['.'][train_li_tags[0]] += 1

last_index = num_words_train - 1

# Accounting for the last word-tag pair
outer_key = train_li_words[last_index]
inner_key = train_li_tags[last_index]
dict2_word_tag[outer_key] = dict2_word_tag.get(outer_key, {})
dict2_word_tag[outer_key][inner_key] = dict2_word_tag[outer_key].get(inner_key, 0)
dict2_word_tag[outer_key][inner_key] += 1

# Normalise counts to probabilities and sort each inner table by tag name
for key in dict2_tag_follow_tag_:
    di = dict2_tag_follow_tag_[key]
    s = sum(di.values())
    for innkey in di:
        di[innkey] /= s
    di = di.items()
    di = sorted(di, key=lambda x: x[0])
    dict2_tag_follow_tag_[key] = di

for key in dict2_word_tag:
    di = dict2_word_tag[key]
    dict_word_tag_baseline[key] = max(di, key=di.get)
    s = sum(di.values())
    for innkey in di:
        di[innkey] /= s
    di = di.items()
    di = sorted(di, key=lambda x: x[0])
    dict2_word_tag[key] = di

### Testing Phase ###

with open("wsj_test.txt", "r") as myfile:
    te_str = myfile.read()

te_li = te_str.split()
num_words_test = len(te_li)

test_li_words = ['']
test_li_words *= num_words_test

test_li_tags = ['']
test_li_tags *= num_words_test

output_li = ['']
output_li *= num_words_test

output_li_baseline = ['']
output_li_baseline *= num_words_test

num_errors = 0
num_errors_baseline = 0

for i in range(num_words_test):
    temp_li = te_li[i].split("/")
    test_li_words[i] = temp_li[0]
    test_li_tags[i] = temp_li[1]

    output_li_baseline[i] = dict_word_tag_baseline.get(temp_li[0], '')
    # If unknown word - tag = 'NNP'
    if output_li_baseline[i] == '':
        output_li_baseline[i] = 'NNP'
    if output_li_baseline[i] != test_li_tags[i]:
        num_errors_baseline += 1

    if i == 0:  # Accounting for the 1st word in the test document for the Viterbi
        di_transition_probs = dict2_tag_follow_tag_['.']
    else:
        di_transition_probs = dict2_tag_follow_tag_[output_li[i - 1]]

    di_emission_probs = dict2_word_tag.get(test_li_words[i], '')

    # If unknown word - tag = 'NNP'
    if di_emission_probs == '':
        output_li[i] = 'NNP'
    else:
        max_prod_prob = 0
        counter_trans = 0
        counter_emis = 0
        prod_prob = 0
        # Both lists are sorted by tag name, so walk them together and pick the
        # tag with the highest transition * emission probability
        while counter_trans < len(di_transition_probs) and counter_emis < len(di_emission_probs):
            tag_tr = di_transition_probs[counter_trans][0]
            tag_em = di_emission_probs[counter_emis][0]
            if tag_tr < tag_em:
                counter_trans += 1
            elif tag_tr > tag_em:
                counter_emis += 1
            else:
                prod_prob = di_transition_probs[counter_trans][1] * di_emission_probs[counter_emis][1]
                if prod_prob > max_prod_prob:
                    max_prod_prob = prod_prob
                    output_li[i] = tag_tr
                # print("i=", i, " and output=", output_li[i])
                counter_trans += 1
                counter_emis += 1
        if output_li[i] == '':
            # In case there are no matching entries between the transition tags and
            # emission tags, we choose the most frequent emission tag
            output_li[i] = max(di_emission_probs, key=itemgetter(1))[0]

    if output_li[i] != test_li_tags[i]:
        num_errors += 1

print("Fraction of errors (Baseline):", num_errors_baseline / num_words_test)
print("Fraction of errors (Viterbi):", num_errors / num_words_test)

print("Tags suggested by Baseline Algorithm:", output_li_baseline)

print("Tags suggested by Viterbi Algorithm:", output_li)

print("Correct tags:", test_li_tags)

Output:

Fraction of errors (Baseline): 0.15384615384615385
Fraction of errors (Viterbi): 0.07692307692307693
Tags suggested by Baseline Algorithm: ['NNP', 'VBD', 'IN', 'VBN', 'NNP', 'VBN', 'TO', 'NNS',
'RB', 'CD', 'NN', 'IN', '.']
Tags suggested by Viterbi Algorithm: ['NNP', 'VBD', 'IN', 'VBD', 'NNP', 'VBN', 'TO', 'NNS',
'RB', 'CD', 'NN', 'IN', '.']
Correct tags: ['NNP', 'VBD', 'WDT', 'VBD', 'NNP', 'VBN', 'TO', 'NNS', 'RB', 'CD', 'NN', 'IN', '.']
Lab 4 : Sentiment Analysis

1. Sentiment Analysis using Supervised Learning Model

import nltk
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

pos_tweets = [('It is not impossible', 'positive'),
              ('You are my lovely friend', 'Positive'),
              ('She is beautiful girl', 'Positive'),
              ('He is looking handsome', 'Positive'),
              ('Exercise is good for health', 'Positive'),
              ('Today\'s weather is fantastic', 'Positive'),
              ('I love Mango', 'Positive')]

neg_tweets = [('You are my enemy friend', 'Negative'),
              ('She is looking ugly ', 'Negative'),
              ('He is looking horrible', 'Negative'),
              ('Sleeping more makes you lazy', 'Negative'),
              ('Today\'s weather is very bad', 'Negative'),
              ('I hate Banana', 'Negative')]

Senti_tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    Senti_tweets.append((words_filtered, sentiment))
print(Senti_tweets)

def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in Senti_tweets:
        all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

word_features = get_word_features(get_words_in_tweets(Senti_tweets))
print(word_features)

word_features_filtered = []
for w in word_features:
    if w not in stopwords:
        word_features_filtered.append(w)

print(word_features_filtered)

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features_filtered:
        features['contains(%s)' % word] = (word in document_words)
    return features

training_set = nltk.classify.apply_features(extract_features, Senti_tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

test_tweet = 'This is a horrible book'

print()
print("{}: Sentiment = {}".format(test_tweet,
                                  classifier.classify(extract_features(test_tweet.split()))))

Output:

[(['not', 'impossible'], 'positive'), (['you', 'are', 'lovely', 'friend'], 'Positive'), (['she', 'beautiful',
'girl'], 'Positive'), (['looking', 'handsome'], 'Positive'), (['exercise', 'good', 'for', 'health'], 'Positive'),
(["today's", 'weather', 'fantastic'], 'Positive'), (['love', 'mango'], 'Positive'), (['you', 'are', 'enemy',
'friend'], 'Negative'), (['she', 'looking', 'ugly'], 'Negative'), (['looking', 'horrible'], 'Negative'),
(['sleeping', 'more', 'makes', 'you', 'lazy'], 'Negative'), (["today's", 'weather', 'very', 'bad'],
'Negative'), (['hate', 'banana'], 'Negative')]
dict_keys(['not', 'impossible', 'you', 'are', 'lovely', 'friend', 'she', 'beautiful', 'girl', 'looking',
'handsome', 'exercise', 'good', 'for', 'health', "today's", 'weather', 'fantastic', 'love', 'mango',
'enemy', 'ugly', 'horrible', 'sleeping', 'more', 'makes', 'lazy', 'very', 'bad', 'hate', 'banana'])
['impossible', 'lovely', 'friend', 'beautiful', 'girl', 'looking', 'handsome', 'exercise', 'good', 'health',
"today's", 'weather', 'fantastic', 'love', 'mango', 'enemy', 'ugly', 'horrible', 'sleeping', 'makes', 'lazy',
'bad', 'hate', 'banana']

This is a horrible book: Sentiment = Negative

2. Sentiment Analysis on csv data

import pandas as pd

df = pd.read_csv('full-corpus.csv')
df.head()

tweets_data = list(zip(df['TweetText'], df['Sentiment']))

# Split the data into positive and negative tweets
pos_tweets = [(text, 'Positive') for text, sentiment in tweets_data if sentiment.lower() == 'positive']
neg_tweets = [(text, 'Negative') for text, sentiment in tweets_data if sentiment.lower() == 'negative']

import re
import nltk
from nltk.corpus import stopwords

def preprocess(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # Convert URLs to the word URL
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # Convert @username to AT_USER
    tweet = re.sub(r'[\s]+', ' ', tweet)          # Remove additional whitespaces
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)    # Replace #word with word
    tweet = tweet.strip('\'"')                    # Trim
    return tweet
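
A quick illustration of what preprocess produces, using a made-up tweet (not taken from the dataset):

# Hypothetical example, for illustration only
print(preprocess("Check https://t.co/abc by @user #Israel"))
# check URL by AT_USER israel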

# Extract word features
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

import re
import nltk

# Word tokenizer
def tokenize(tweet):
    return nltk.word_tokenize(tweet)

# Remove stopwords and get words from tweets
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(tokenize(words))
    return all_words

# Extract features
def extract_features(document):
    document_words = set(tokenize(document))
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

# Replace the hardcoded sample tweets with the extracted tweets
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    tweets.append((preprocess(words), sentiment))

import nltk
nltk.download('punkt')

# Create word features and train the classifier
word_features = get_word_features(get_words_in_tweets(tweets))
training_set = nltk.classify.apply_features(extract_features, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Test the classifier (optional, if you want to test with a sample tweet)
test_tweet = ("It’s easy to be brave when you’re hiding behind a keyboard. "
              "You and your Hamas friends will regret your barbaric actions very soon.")
print(classifier.classify(extract_features(preprocess(test_tweet))))

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Assuming tweets is a list of (tweet, sentiment) pairs

# Splitting data into 80% training and 20% testing
train_size = int(len(tweets) * 0.8)
train_tweets = tweets[:train_size]
test_tweets = tweets[train_size:]

# Train the classifier with the training data
training_set = nltk.classify.apply_features(extract_features, train_tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Predict the sentiments of the test set
predictions = [classifier.classify(extract_features(tweet)) for tweet, _ in test_tweets]
actual = [sentiment for _, sentiment in test_tweets]

# Calculate accuracy, precision, and recall
accuracy = accuracy_score(actual, predictions)
precision = precision_score(actual, predictions, average='weighted')
recall = recall_score(actual, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

# If you want a detailed report for each class (Positive, Negative, etc.)
print(classification_report(actual, predictions))

Output:

Negative
Lab 5 : Turney’s method of Sentiment Analysis

Code:

import math
import re
import json
import nltk

# nltk.download("averaged_perceptron_tagger", quiet=True)

def loadReviews(fileName):
    file = open(fileName)
    list_pos = []
    list_neg = []
    data = json.load(file)
    for elem in data:
        if float(elem["overall"]) >= 3.0:
            list_pos.append(elem["reviewText"])
        else:
            list_neg.append(elem["reviewText"])
    file.close()
    return list_pos, list_neg

def make_datasets(fileName):
    all_positive_reviews, all_negative_reviews = loadReviews(fileName)
    dataset = {"train": {"neg": [], "pos": []}, "test": {"neg": [], "pos": []}}
    dataset["train"]["pos"] = all_positive_reviews[:20000]
    dataset["train"]["neg"] = all_negative_reviews[:20000]
    dataset["test"]["pos"] = all_positive_reviews[-50:]
    dataset["test"]["neg"] = all_negative_reviews[-50:]

    return dataset

def find_pattern(postag):
    # Extract two-word phrases matching Turney's POS patterns, e.g. JJ + NN/NNS,
    # RB/RBR/RBS + JJ not followed by a noun, JJ + JJ not followed by a noun, etc.
    tag_pattern = []
    for k in range(len(postag) - 2):
        if postag[k][1] == "JJ" and (
            postag[k + 1][1] == "NN" or postag[k + 1][1] == "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))

        elif (
            (postag[k][1] == "RB" or postag[k][1] == "RBR" or postag[k][1] == "RBS")
            and postag[k + 1][1] == "JJ"
            and postag[k + 2][1] != "NN"
            and postag[k + 2][1] != "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))

        elif (
            postag[k][1] == "JJ"
            and postag[k + 1][1] == "JJ"
            and postag[k + 2][1] != "NN"
            and postag[k + 2][1] != "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))

        elif (
            (postag[k][1] == "NN" or postag[k][1] == "NNS")
            and postag[k + 1][1] == "JJ"
            and postag[k + 2][1] != "NN"
            and postag[k + 2][1] != "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))

        elif (
            postag[k][1] == "RB" or postag[k][1] == "RBR" or postag[k][1] == "RBS"
        ) and (
            postag[k + 1][1] == "VB"
            or postag[k + 1][1] == "VBD"
            or postag[k + 1][1] == "VBN"
            or postag[k + 1][1] == "VBG"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
    return tag_pattern

def near_operator(phrase, word, text):
    # Count co-occurrences of `phrase` and `word` within roughly 400 words of each other
    try:
        string = (
            word
            + r"\W+(?:\w+\W+){0,400}?"
            + phrase
            + r"|"
            + phrase
            + r"\W+(?:\w+\W+){0,400}?"
            + word
        )
        freq_phrase_near_word = len(re.findall(string, text))
        return freq_phrase_near_word
    except:
        return 0

class Turney(object):
    def __init__(self, dataset):
        self.datasets = dataset
        self.pos_phrases_hits = []
        self.neg_phrases_hits = []
        self.pos_hits = 0.01
        self.neg_hits = 0.01
        self.accuracy = 0

    def turney(self):
        tp = 0
        fp = 0
        tn = 0
        fn = 0
        for boolean, test_klass in enumerate(["pos", "neg"]):
            for i, data in enumerate(self.datasets["test"][test_klass]):
                print(
                    str(i)
                    + " out of "
                    + str(len(self.datasets["test"][test_klass]))
                    + " --> round "
                    + str(boolean)
                )

                phrases = find_pattern(nltk.pos_tag(nltk.word_tokenize(data)))
                if len(phrases) == 0:
                    continue
                # 0.01 acts as a small smoothing constant to avoid zero counts
                self.pos_phrases_hits = [0.01] * len(phrases)
                self.neg_phrases_hits = [0.01] * len(phrases)
                self.pos_hits = 0.01
                self.neg_hits = 0.01

                for train_klass in ["pos", "neg"]:
                    for text in self.datasets["train"][train_klass]:
                        for ind, phrase in enumerate(phrases):
                            self.pos_phrases_hits[ind] += near_operator(phrase, "excellent", text)
                            self.neg_phrases_hits[ind] += near_operator(phrase, "poor", text)
                        self.pos_hits += text.count("excellent")
                        self.neg_hits += text.count("poor")
                res = self.calculate_sentiment(boolean)
                # compute if correct prediction
                if res == 1 and boolean == 0:
                    fp += 1
                elif res == 1 and boolean == 1:
                    tp += 1
                elif res == 0 and boolean == 0:
                    fn += 1
                elif res == 0 and boolean == 1:
                    tn += 1
        print("Accuracy: " + str(self.accuracy / 100))
        print("True positive: " + str(tp))
        print("False positive: " + str(fp))
        print("True negative: " + str(tn))
        print("False negative: " + str(fn))
        print("Recall-positive: " + str(tp / (tp + fn)))
        print("Precision-positive: " + str(tp / (tp + fp)))
        print("Recall-negative: " + str(tn / (tn + fp)))
        print("Precision-negative: " + str(tn / (tn + fn)))

    def calculate_sentiment(self, is_negative=0):
        # Semantic orientation of each phrase (Turney's SO-PMI):
        # SO(phrase) = log2( hits(phrase NEAR "excellent") * hits("poor")
        #                    / (hits(phrase NEAR "poor") * hits("excellent")) )
        polarities = [0] * len(self.pos_phrases_hits)
        for i in range(len(self.pos_phrases_hits)):
            polarities[i] = math.log(
                (self.pos_phrases_hits[i] * self.neg_hits)
                / (self.neg_phrases_hits[i] * self.pos_hits),
                2,
            )
        pmi = sum(polarities) / len(polarities)
        if (pmi > 0 and is_negative == 0) or (pmi < 0 and is_negative == 1):
            self.accuracy += 1
            return 1
        return 0

if __name__ == "__main__":
    FILE_PATH = "/content/Cell_Phones_and_Accessories_5.json"
    datasets = make_datasets(FILE_PATH)
    turney = Turney(datasets)
    turney.turney()

Output:
Lab 6 : Text Summarization

1. Single document summarization using TF-IDF:

import nltk
nltk.download('punkt')
nltk.download('stopwords')

import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

text = """ Unverified reports of ‘40 babies beheaded’ in Israel-Hamas war


inflame social media No photo evidence had been made public as of Thursday
morning corroborating claims that babies had been beheaded. Israel has
published photos of dead infants after the terror attack.
Editor’s note: This story includes graphic descriptions of violent acts
that some readers may find disturbing.
A series of shocking reports have spread horrific claims of baby
beheadings by Hamas militants across social and mainstream media in recent
days, adding a particularly incendiary element to an already violent and
bitter war. But the reports are still unconfirmed, and in some cases have
been retracted.
The most high-profile claim came Wednesday night when President Joe Biden
said that he had seen photographic evidence of terrorists beheading
children. The White House later clarified that Biden was referring to news
reports about beheadings, which have not included or referred to
photographic evidence.
Photos have been published by Hamas showing beheaded soldiers and the X
account belonging to Israeli Prime Minister Benjamin Netanyahu posted
pictures on Thursday of babies killed and burned by Hamas. No photo
evidence had been made public as of Thursday morning corroborating claims
that babies had been beheaded.
Unverified information spreads quickly on social media, particularly
around breaking news events, reaching even larger audiences when it is
shared by mainstream news outlets, politicians and people with large
followings. Follow-ups that retract or add context are less likely to be
repeated or reach the same audience.
Biden’s statement followed a series of news reports and comments from
Israeli officials, most of which have since been softened or walked back.
Easily debunked misinformation like fake press releases have circulated
widely since the start of the war, but such stories often die down quickly
once proven false. The claims about beheadings, difficult to verify, have
continued to spread thanks in part to the lack of clarity.
Alexei Abrahams, a disinformation researcher at McGill University in
Montreal, said that even without the allegations of beheaded babies, just
the facts themselves are horrifying enough to have the kind of effect you
expect.
It may turn out that the slaughter was done in a particularly barbaric
way. But one way or another, this is an absolutely shocking, unprecedented
event of violence, Abrahams said. The general concern, of course, is that
it’s going to exacerbate what is already a very fraught situation.
On Wednesday, a spokesman for Israeli Prime Minister Benjamin Netanyahu
told CNN that babies and toddlers were found with their 'heads
decapitated' in southern Israel after Hamas’ attack. By Thursday morning,
an Israeli official told CNN the government had not confirmed claims of
the beheadings.
A senior State Department official said Thursday morning that the agency
was not in a position to confirm the beheading claims.
Many of the reports appear to have originated from Israeli soldiers and
people affiliated with the Israel Defense Force (IDF).
An IDF spokesperson told Business Insider on Tuesday that soldiers had
found decapitated babies, but said Wednesday it would not investigate or
provide further evidence regarding the claim. Late Wednesday, an IDF
spokesperson said in a video on X that the IDF had relative confidence of
the claims.
On Thursday, in a call with a group of international journalists, Colonel
Golan Vach, the head of the IDF’s national search and rescue unit, said
that he had “found one baby with his head cut.'
Marc Owen Jones, an associate professor of Middle East studies at Hamad
Bin Khalifa University in Qatar who studies misinformation, told NBC News
that he found that the source of the '40 babies beheaded” allegations
largely stemmed from a viral Israeli news broadcast clip that did not
specifically refer to the allegation.
Nicole Zedeck, a correspondent for the privately owned Israeli news outlet
i24NEWS, said in the video that Israeli soldiers told her they’d found
“babies, their heads cut off.' The video has been viewed more than 11
million times on X, according to its view counter. In another tweet,
Zedeck wrote that soldiers told her they believe “40 babies/children were
killed.
Somehow those two bits of information were connected, the story became ‘40
babies were beheaded,’ and in the British press today, about six or seven
newspapers had it on their front pages,' Jones said.
An IDF spokesperson, Doron Spielman, told NBC News on Tuesday that he
could not confirm i24NEWS’s report.
Yossi Landau, the head of operations for the southern region of Zaka,
Israel’s volunteer civilian emergency response organization, told CBS News
that he saw the bodies of beheaded children and babies, parents and
children who had been tortured and had their hands bound, and “a lot more
that cannot be described for now, because it’s very hard to describe.
By Wednesday, the claims, though still contentious, were going viral
online — being used as evidence of Hamas’ depravity. On Wednesday, the
phrase “Did Hamas kill babies” saw the biggest increase in search interest
on Google of anything related to the war.
Stranger Things star Noah Schnapp posted the shocking claim to his 25
million Instagram followers: '40 babies were beheaded and burned alive in
front of their parents by Hamas.' Sen. Ted Cruz, R-Texas, mentioned
beheaded babies in a post on X, and Rep. Mike McCaul, R-Texas, echoed the
allegations on CNN.
Jones found that the '40 babies beheaded' claim had over 44 million
impressions on X, with over 300,000 likes and more than 100,000 reposts.
The main accounts propagating the claims were i24NEWS and the official
Israel account, Jones’ data showed.
Baby stories are very emotive. Historically, they’re stories that can be
used to rationalize a very brutal response, Jones said. It’s such a
volatile information environment that such claims will inevitably be taken
out of context, both deliberately and accidentally.
"""

def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix
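
Note: step 6 below calls _create_tf_idf_matrix(tf_matrix, idf_matrix), but the manual never lists that helper. A minimal sketch, assuming it simply multiplies the TF and IDF values word by word for each sentence key:

def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    # The two matrices share the same sentence keys, so walk them in parallel
    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):
        tf_idf_table = {}

        for (word1, tf_value), (word2, idf_value) in zip(f_table1.items(), f_table2.items()):
            tf_idf_table[word1] = float(tf_value * idf_value)  # tf-idf = tf * idf

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix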

def _score_sentences(tf_idf_matrix) -> dict:
    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

def _find_average_score(sentenceValue) -> float:
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average score of a sentence in the original text
    average = sumValues / len(sentenceValue)

    return average

def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= threshold:
            summary += " " + sentence
            sentence_count += 1

    return summary

# 1 Sentence Tokenization
sentences = sent_tokenize(text)
total_documents = len(sentences)
print(total_documents)

41

# 2 Creating the Frequency matrix of the words in each sentence
freq_matrix = _create_frequency_matrix(sentences)

# 3 Calculate Term Frequency matrix
tf_matrix = _create_tf_matrix(freq_matrix)

# 4 Create the table of documents per word
count_doc_per_words = _create_documents_per_words(freq_matrix)

# 5 Calculate IDF and generate a matrix
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

# 6 Calculate TF-IDF and generate a matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

# 7 Important Algorithm: score the sentences
sentence_scores = _score_sentences(tf_idf_matrix)

# 8 Find the threshold
threshold = _find_average_score(sentence_scores)
print(threshold)

0.07720921963564482

# 9 Important step: Generate the summary
summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
print(summary)

Output:

Israel has published photos of dead infants after the terror attack. But the reports are still
unconfirmed, and in some cases have been retracted. Follow-ups that retract or add context are
less likely to be repeated or reach the same audience. It may turn out that the slaughter was done
in a particularly barbaric way. The video has been viewed more than 11 million times on X,
according to its view counter. Baby stories are very emotive.
Lab 7 : Multi-document Text Summarization

1. Multi-document summarization

articles = []
import pandas as pd
import numpy as np

articles.append({"Title": "Article1", "Content": """ """})
articles.append({"Title": "Article2", "Content": """ """})
articles.append({"Title": "Article3", "Content": """ """})
articles.append({"Title": "Article4", "Content": """ """})
articles.append({"Title": "Article5", "Content": """ """})
articles.append({"Title": "Article6", "Content": """ """})
articles.append({"Title": "Article7", "Content": """ """})

df = pd.DataFrame(articles)
print(df)

from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the vectorizer with stop_words set to 'english'
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Compute the TF-IDF vectors for the articles
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Content'])

from sklearn.metrics.pairwise import linear_kernel

# Compute cosine similarities between all pairs of articles
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_similarities)

# Extract the most representative sentence from each article
summaries = []
for idx, article in enumerate(df['Content']):
    # Tokenize the article into sentences
    sentences = article.split('. ')

    # Compute the TF-IDF vectors for the sentences
    sentence_vectors = tfidf_vectorizer.transform(sentences)

    # Compute cosine similarities between the article and its sentences
    cosine_similarities_sentences = linear_kernel(tfidf_matrix[idx:idx+1], sentence_vectors).flatten()

    # Find the sentence with the highest cosine similarity
    top_sentence_idx = cosine_similarities_sentences.argsort()[-1]
    top_sentence = sentences[top_sentence_idx]

    summaries.append(top_sentence)

# Add summaries to the dataframe
df['Summary'] = summaries
print(df[['Title', 'Summary']])

# Extract the most representative sentence from each article (as we did before)
summaries = []
for idx, article in enumerate(df['Content']):
    sentences = article.split('. ')
    sentence_vectors = tfidf_vectorizer.transform(sentences)
    cosine_similarities_sentences = linear_kernel(tfidf_matrix[idx:idx+1], sentence_vectors).flatten()
    top_sentence_idx = cosine_similarities_sentences.argsort()[-1]
    top_sentence = sentences[top_sentence_idx]
    summaries.append(top_sentence)

# Create a merged summary by concatenating the most representative sentences from each article
merged_summary = '. '.join(summaries)
print("Merged Summary:")
print(merged_summary)

Output:

Title Content
0 Article1 Unverified reports of ‘40 babies beheaded’ in ...
1 Article2 The 'horrendous toll' on children caught in th...
2 Article3 What we actually know about the viral report o...
3 Article4 Israel releases horrific images of slain child...
4 Article5 ‘I would see and have confirmed pictures of te...
5 Article6 At least 40 babies killed, beheaded in Israeli...
6 Article7 ‘I would see and have confirmed pictures of te…

[[1.         0.14483033 0.39585    0.2582229  0.26818106 0.20422504 0.27895569]
 [0.14483033 1.         0.22168557 0.24798395 0.2299564  0.21169182 0.23901851]
 [0.39585    0.22168557 1.         0.28336111 0.30100884 0.31592353 0.31263709]
 [0.2582229  0.24798395 0.28336111 1.         0.20396822 0.18262805 0.21117037]
 [0.26818106 0.2299564  0.30100884 0.20396822 1.         0.18920684 0.96356329]
 [0.20422504 0.21169182 0.31592353 0.18262805 0.18920684 1.         0.1972829 ]
 [0.27895569 0.23901851 0.31263709 0.21117037 0.96356329 0.1972829  1.        ]]

Title Summary
0 Article1 In another tweet, Zedeck wrote that soldiers t...
1 Article2 "Depriving children of access to food and esse...
2 Article3 What happened in Kibbutz Kfar Aza is a massacr...
3 Article4 Secretary of State Antony Blinken and NATO def...
4 Article5 I never thought I’d ever — anyway.” He did not...
5 Article6 At least 40 babies killed, beheaded in Israeli...
6 Article7 I never thought I’d ever — anyway.” He did not…

Merged Summary:
In another tweet, Zedeck wrote that soldiers told her they believe “40 babies/children were
killed.”
“Somehow those two bits of information were connected, the story became ‘40 babies were
beheaded,’ and in the British press today, about six or seven newspapers had it on their front
pages,” Jones said.
An IDF spokesperson, Doron Spielman, told NBC News on Tuesday that he could not confirm
i24NEWS’s report.
Yossi Landau, the head of operations for the southern region of Zaka, Israel’s volunteer civilian
emergency response organization, told CBS News that he saw the bodies of beheaded children
and babies, parents and children who had been tortured and had their hands bound, and “a lot
more that cannot be described for now, because it’s very hard to describe.”
By Wednesday, the claims, though still contentious, were going viral online — being used as
evidence of Hamas’ depravity. "Depriving children of access to food and essential services puts
their lives at risk, as do attacks on civilian areas and infrastructure."
According to UNICEF, 80% of those who live in the Gaza Strip rely on some form of
humanitarian assistance.
The conflict has led to "grave humanitarian consequences," Lynn Hastings, a humanitarian
coordinator for the Gaza Strip for the United Nations Office for the Coordination of
Humanitarian Affairs, while calling for all parties to abide by international humanitarian law.
"Civilians, especially children, medical facilities, humanitarian personnel health workers, and
journalists must be protected," Hastings said in a statement Tuesday. What happened in Kibbutz
Kfar Aza is a massacre in which women, children and toddlers and elderly were brutally
butchered in an ISIS way of action."
What happened at the kibbutz?
The Kfar Aza kibbutz is one of several self-contained Israeli settlements close to the Gaza
border.
It is located between Netivot and Sderot - around three miles from the border in southern Israel.
Because of its proximity to Gaza and the unprecedented nature of last weekend's incursion,
which saw Hamas militants breach the usually heavily guarded border on foot - it was one of the
first sites they reached on Saturday.
Four days later, journalists got to see the destruction left behind.
Ramsay said the scene "can only be described as a massacre".
"The stories here are shocking - families being woken without warning to voices outside their
houses, mums and dads hiding their children in cupboards, wine cellars and basements, husbands
and wives becoming separated in the fight," he said.
He added it took 17 hours for help to arrive, as the IDF focused on urban areas first - leaving
residents defenceless and numbers of dead high.
Why are there reports of 'babies being beheaded'?
Claims Hamas fighters beheaded babies have only been reported by one journalist - Nicole
Zedek from i24 - and have not been verified by Sky News.
Ms Zedek was among the reporters invited to see what was left at the kibbutz on Tuesday.
In one live broadcast, which has since been viewed millions of times on X, formerly known as
Twitter, she says: "Talking to some of the soldiers here, they say what they witnessed as they've
been walking through these communities is bodies of babies with their heads cut off and families
gunned down in their beds.
"We can see some of these soldiers right now, comforting each other."
She is also filmed speaking to the deputy commander of the IDF's unit 71, David Ben Zion, who
describes Hamas fighters as "aggressive" and "very bad".
He says: "They cut off heads… of children, of women."
And in another live broadcast, Zedek describes "40 babies at least were taken out on gurneys" -
which is where the widely shared 40 figure comes from.
[output truncated]
On Wednesday, Heinrich quoted CNN that infants and young children had been discovered with
"decapitated" bodies in the community of Kfar Aza.
An Israeli Defense Forces spokesperson also told The Intercept that they could not confirm it
officially, but they believed the report.
Yossi Landau, a representative from Israel's volunteer civilian emergency response organization,
Zaka, shared with CBS News that he witnessed the gruesome sight of children and infants who
had been decapitated.
“I saw a lot more that cannot be described for now, because it’s very hard to describe,” he said.
Nicole Zedek, a television reporter for i24NEWS based in Tel Aviv, was the initial source to
report the allegations of child beheadings on Tuesday
Lab 8 : Term Incidence Matrix
Code:

Doc_1 = """ """

Doc_2 = """ """

Doc_3 = """ """

Doc_4 = """ """

Doc_5 = """ """

Doc_6 = """ """

Doc_7 = """ """

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
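
Note: the loop below iterates over a list named docs, which is never defined in the manual; presumably it collects the seven document strings declared above (an assumption):

docs = [Doc_1, Doc_2, Doc_3, Doc_4, Doc_5, Doc_6, Doc_7]  # assumed: the seven documents above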

# getting unique terms in the documents
# set - does not allow duplicate elements
unique_terms = set()  # creating an empty set

for doc in docs:
    # tokenize words
    for term in doc.split():
        if term.lower() not in stop_words:
            unique_terms.add(term)

print(unique_terms)
print(len(unique_terms))

# Initialize NLTK's word tokenizer
nltk.download('punkt')  # Download the required dataset (only need to do this once)

# Tokenize words in each document
tokenized_docs = [word_tokenize(doc) for doc in docs]

# Print tokenized documents
for i, doc_tokens in enumerate(tokenized_docs, 1):
    print(f"Tokens in Doc_{i}: {doc_tokens}")

# creating term-document incidence matrix in the form of a dictionary
doc_term_matrix = {}

for term in unique_terms:
    doc_term_matrix[term] = []

    for doc in docs:
        # note: this is a substring check, so e.g. 'war' also matches 'warning'
        if term in doc:
            doc_term_matrix[term].append(1)
        else:
            doc_term_matrix[term].append(0)

for term, term_list in doc_term_matrix.items():
    print(f"{term}: {term_list}")

Output:

{'contentious,', 'personnel', 'photo', 'expressed', 'body', 'killing,', 'particularly', 'spoke',
'Coordination', '"unconfirmed".', 'caught', "I'm", 'reposts.', '"Please', 'still', 'thousands', 'targeted',
'linking', 'report.', 'babies”', 'Unverified', 'called', 'comparing', '"To', 'bitter', '260', 'small',
'Kibbutz', 'retract', 'decapitated', 'another', 'large', 'unconfirmed,', 'United', 'related', 'harmed',
'point', 'suffering', 'founded', 'possible', 'located', 'believed', 'mums', 'Wednesday,', 'Barkat',
……..
}
1534

Tokens in Doc_1: ['Unverified', 'reports', 'of', '‘', '40', 'babies', 'beheaded', '’', 'in', 'Israel-Hamas',
'war', 'inflame', 'social', 'media', 'No', 'photo', 'evidence', 'had', 'been', 'made', 'public', 'as', 'of',,
'.',...... ']
Tokens in Doc_2: ['The', "'horrendous", 'toll', "'", 'on', 'children', 'caught', 'in', 'the', 'Israel-Gaza',
'conflict', 'Hundreds', 'of', 'children', 'have', 'been', 'killed', 'so', 'far', ',', 'with', 'the', 'true', 'total',
'Russell', 'said', '.......']
Tokens in Doc_3: ['What', 'we', 'actually', 'know', 'about', 'the', 'viral', 'report', 'of', 'beheaded',
'babies', 'in', 'Israel', 'One', 'journalist', 'from', 'the', 'Tel', 'Aviv-based', 'news', 'channel', 'i24', 'said',
'war', '........']
Tokens in Doc_4: ['Israel', 'releases', 'horrific', 'images', 'of', 'slain', 'children', 'after', 'Hamas',
'attack', 'JERUSALEM', '/', 'TEL', 'AVIV', '/', 'BRUSSELS', '-', 'CONTENT', 'WARNING', ':',
'right', 'things', '.', ……’]
Tokens in Doc_5: ['‘', 'I', 'would', 'see', 'and', 'have', 'confirmed', 'pictures', 'of', 'terrorists',
'beheading', 'children', ',', '’', 'Joe', 'Biden', 'decries', 'Hamas', 'atrocity', 'in', 'Israel', 'Reports',
'attack', '........']
Tokens in Doc_6: ['At', 'least', '40', 'babies', 'killed', ',', 'beheaded', 'in', 'Israeli', 'kibbutz', 'outside',
'Gaza', 'Strip', ',', 'reports', 'say', 'KFAR', 'AZA', ',', 'Israel', '(', 'TND', ')', '—', 'Dozens', 'of',
'Monday', …..'.']
Tokens in Doc_7: ['‘', 'I', 'would', 'see', 'and', 'have', 'confirmed', 'pictures', 'of', 'terrorists',
'beheading', 'children', ',', '’', 'Joe', 'Biden', 'decries', 'Hamas', 'atrocity', 'in', 'Israel', 'Reports',, 'the',
'attack', ‘…….']

contentious,: [1, 0, 0, 0, 0, 0, 0]
personnel: [0, 1, 0, 0, 0, 0, 0]
photo: [1, 0, 0, 1, 0, 0, 0]
expressed: [0, 0, 0, 0, 1, 0, 1]
body: [0, 0, 1, 1, 0, 0, 0]
killing,: [0, 1, 0, 0, 0, 0, 0]
particularly: [1, 0, 0, 1, 0, 0, 0]
spoke: [1, 1, 1, 0, 1, 0, 1]
Coordination: [0, 1, 0, 0, 0, 0, 0]
"unconfirmed".: [0, 0, 1, 0, 0, 0, 0]
caught: [0, 1, 0, 0, 0, 0, 0]
I'm: [0, 1, 0, 0, 0, 1, 0]
reposts.: [1, 0, 0, 0, 0, 0, 0]
"Please: [0, 0, 1, 0, 0, 0, 0]
still: [1, 1, 0, 0, 0, 1, 0]
thousands: [0, 0, 0, 1, 0, 0, 0]
targeted: [0, 0, 0, 1, 0, 0, 0]
linking: [0, 0, 1, 0, 0, 0, 0]
report.: [1, 0, 0, 0, 1, 0, 1]
babies”: [1, 0, 0, 0, 0, 0, 0]
Unverified: [1, 0, 0, 0, 0, 0, 0]
called: [0, 1, 0, 0, 0, 0, 0]
comparing: [0, 0, 0, 1, 0, 0, 0]
"To: [0, 0, 1, 0, 0, 0, 0]
bitter: [1, 0, 0, 0, 0, 0, 0]
260: [0, 1, 0, 0, 0, 0, 0]
small: [0, 0, 1, 0, 0, 1, 0]
Kibbutz: [0, 0, 1, 0, 0, 1, 0]
retract: [1, 0, 0, 0, 0, 0, 0]
decapitated: [1, 0, 0, 0, 1, 0, 1]
another: [1, 1, 1, 0, 1, 0, 1]
war: [1, 1, 1, 0, 0, 1, 0]

docs = ["Doc_1", "Doc_2", "Doc_3", "Doc_4", "Doc_5", "Doc_6", "Doc_7"]

# Convert the document list to a NumPy array


docs_array = np.array(docs, dtype='object')

# query to find documents containing killing and war


import numpy as np
docs_array = np.array(docs, dtype='object')
v1 = np.array(doc_term_matrix['killing'])
v2 = np.array(doc_term_matrix['war'])

print(v1)
print(v2)
print('---------')
v3 = v1 & v2
print(v3)

[0 1 0 0 0 0 0]
[1 1 1 0 0 1 0]
---------
[0 1 0 0 0 0 0]

matching_doc_indices = np.where(v3 == 1)[0]

print("Documents where 'killing' and 'war' are both present:")
for doc_index in matching_doc_indices:
    print(docs_array[doc_index])

Documents where 'killing' and 'war' are both present:
Doc_2

# query to find documents containing killing or war
v1 = np.array(doc_term_matrix['killing'])
v2 = np.array(doc_term_matrix['war'])

print(v1)
print(v2)
print('---------')
v = v1 | v2
print(v)

[0 1 0 0 0 0 0]
[1 1 1 0 0 1 0]
---------
[1 1 1 0 0 1 0]

matching_doc_indices = np.where(v == 1)[0]

print("Documents where either 'killing' or 'war' are present:")
for doc_index in matching_doc_indices:
    print(docs_array[doc_index])

Documents where either 'killing' or 'war' are present:
Doc_1
Doc_2
Doc_3
Doc_6
