
PANDIT DEENDAYAL ENERGY UNIVERSITY

SCHOOL OF TECHNOLOGY

CSE DEPARTMENT

Natural Language Processing Lab Manual

Submitted to,
Dr. Santosh Bharti

Submitted by,
Khushi Shah
(20BCP123)
Lab 1: Tokenization

1. Sentence Tokenization:

def extract_sentences(paragraph):
    paragraph = paragraph.replace("? ", "|")
    paragraph = paragraph.replace("! ", "|")
    paragraph = paragraph.replace(". ", "|")
    sentences = paragraph.split("|")

    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

    return sentences

paragraph = ("This is the first sentence. The second one follows? And here comes "
             "the third sentence! My email is abc@gmail.com, you can contact here.")
sentences = extract_sentences(paragraph)

for idx, sentence in enumerate(sentences, start=1):
    print(f"Sentence {idx}: {sentence}")

Output:

2. Sentence Tokenization using NLTK:

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

text = "Dr. Ami Shah is a MD doctor. Please feel free to contact her on
ami@gmail.com or etc., many ways are there. " + \
"Khushi said: 'Today is her birthday. So she is very happy.' So everyone
wished her. "

print("Sentence tokenization: ", sent_tokenize(text))

Output:

Sentence tokenization: ['Dr. Ami Shah is a MD doctor.', 'Please feel free to contact her on
ami@gmail.com or etc., many ways are there.', "Khushi said: 'Today is her birthday.", "So she is
very happy.'", 'So everyone wished her.']

3. Word Tokenization using punkt:

import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

text = "Dr. Ami Shah is a MD doctor. Please feel free to contact her on
ami@gmail.com etc. many ways are there. " + \
"Don't do this. " + \
"Researcher's said AI will change the word. " + \
"You can't go there. "

print("\n\nWord tokenization: " ,word_tokenize(text))

Output:

Word tokenization: ['Dr.', 'Ami', 'Shah', 'is', 'a', 'MD', 'doctor', '.', 'Please', 'feel', 'free', 'to',
'contact', 'her', 'on', 'ami', '@', 'gmail.com', 'etc', '.', 'many', 'ways', 'are', 'there', '.', 'Do', "n't", 'do',
'this', '.', 'Researcher', "'s", 'said', 'AI', 'will', 'change', 'the', 'word', '.', 'You', 'ca', "n't", 'go', 'there',
'.']

4. Word tokenization without NLTK:

def word_tokenize(sentence):
    words = []         # Create an empty list to store the words.
    current_word = ""  # Initialize an empty string for the current word.

    for char in sentence:
        if char.isalnum() or char == "'" or char == "@":  # Keep letters, digits, apostrophes and '@'.
            current_word += char            # Add the character to the current word.
        else:
            if current_word:                # Check if there's a current word (not empty).
                words.append(current_word)  # Append the current word to the list of words.
            current_word = ""               # Reset the current word.

    if current_word:
        words.append(current_word)          # Append the last word (if any) to the list.

    return words

sentence = ("Dr. Ami is not available today. Please feel free to contact her on "
            "ami@gmail.com. Don't come. You can't go there.")
tokens = word_tokenize(sentence)
print(tokens)

Output:

['Dr', 'Ami', 'is', 'not', 'available', 'today', 'Please', 'feel', 'free', 'to', 'contact', 'her', 'on', 'ami@gmail',
'com', "Don't", 'come', 'You', "can't", 'go', 'there']
Lab 2 : Stemming and Lemmatization

1. Stemming:

def stemx(word):
    suffix = ['ed', 'es', 'ing', 'ship', 's', 'less', 'ion', 'ly', 'ions',
              'full', 'able']
    prefix = ['un', 'pre', 'dis', 're', 'il', 'im', 'in', 'de', 'mis', 'sub',
              'non', 'anti']

    for x in prefix:
        if word.startswith(x):
            word = word[len(x):]

    for y in suffix:
        if word.endswith(y):
            word = word[:-len(y)]

    return word

words = ['going', 'friends', 'friendship', 'happily', 'friendly', 'programming']
for word in words:
    print(word + " : ", stemx(word))

Output:

going : go
friends : friend
friendship : friend
happily : happi
friendly : friend
programming : programm

2. Stemming using Porter Stemmer from NLTK:

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

words = ["friendship", "programs", "programmer", "programming",
         "programmers", "predefined", "happily"]

for w in words:
    print(w, " : ", ps.stem(w))

Output:

friendship : friendship
programs : program
programmer : programm
programming : program
programmers : programm
predefined : predefin
happily : happili

3. Lemmatization without NLTK:

from PyDictionary import PyDictionary

class BasicLemmatizer:
    def __init__(self):
        self.dictionary = PyDictionary()

    def lemmatize(self, word):
        synonyms = self.dictionary.synonym(word)
        if not synonyms:
            return word
        # If the word itself is in the synonyms list, return it
        if word in synonyms:
            return word
        # Otherwise, return the first synonym as a potential base form
        return synonyms[0]

words = ['smiling', 'died', 'purchased', 'went', 'bought', 'better', 'mice']
lemmatizer = BasicLemmatizer()
for word in words:
    print(f"Lemmatized word (basic): {lemmatizer.lemmatize(word)}")

4. Lemmatization using NLTK:

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words1 = ['smiling', 'died', 'purchased', 'went', 'bought']

for w in words1:
    print(w + " : " + lemmatizer.lemmatize(w, pos="v"))  # v - verb

words2 = ['better', 'worst', 'happiest']

for x in words2:
    print(x + " : ", lemmatizer.lemmatize(x, pos="a"))  # a - adjective

words3 = ['babies', 'mice', 'feet']

for y in words3:
    print(y + " : ", lemmatizer.lemmatize(y, pos="n"))  # n - noun

Output:

smiling : smile
died : die
purchased : purchase
went : go
bought : buy
better : good
worst : bad
happiest : happy
babies : baby
mice : mouse
feet : foot

5. Porter Stemmer:

class PorterStemmer:
    def __init__(self):
        pass

    def _contains_vowel(self, word):
        return any(char in "aeiou" for char in word)

    # m: count of VC pairs
    def _measure(self, word):
        return ''.join(['V' if char in "aeiou" else 'C' for char in word]).count('VC')

    def _double_consonant(self, word):
        if len(word) >= 2 and word[-1] == word[-2] and word[-1] not in "aeiou":
            return True
        return False

    def _replace_suffix(self, word, old, new):
        if word.endswith(old):
            return word[:-len(old)] + new
        return word

    def _ends_with_cvc(self, word):
        if len(word) < 3:
            return False
        consonants = "bcdfghjklmnpqrstvwxyz"
        vowels = "aeiou"
        return word[-3] in consonants and word[-2] in vowels and word[-1] in consonants

    def _step1(self, word):
        if word.endswith("sses"):
            return word[:-4] + "ss"
        elif word.endswith("ies"):
            return word[:-3] + "i"
        elif word.endswith("ss"):
            return word
        elif word.endswith("s"):
            return word[:-1]
        return word

    def _step2(self, word):
        if word.endswith("eed"):
            stem = word[:-3]
            if self._measure(stem) > 0:
                return stem + "ee"
        elif word.endswith("ed"):
            stem = word[:-2]
            if self._contains_vowel(stem):
                return stem
        elif word.endswith("ing"):
            stem = word[:-3]
            if self._contains_vowel(stem):
                return stem
        return word

    def _step3(self, word):
        if word.endswith("y") and self._contains_vowel(word[:-1]):
            return word[:-1] + "i"
        return word

    def _step4(self, word):
        suffixes = {
            "ational": "ate",
            "tional": "tion",
            "izer": "ize",
            "abli": "able",
            "eli": "e",
            "ization": "ize",
            "ation": "ate",
            "biliti": "ble",
        }
        for key in suffixes:
            if word.endswith(key):
                stem = word[:-len(key)]
                if self._measure(stem) > 0:
                    return stem + suffixes[key]
        return word

    def _step5(self, word):
        suffixes = {
            "icate": "ic",
            "ative": "",
            "alize": "al",
            "iciti": "ic",
            "ical": "ic",
            "ful": "",
            "ness": ""
        }
        for key in suffixes:
            if word.endswith(key):
                stem = word[:-len(key)]
                if self._measure(stem) > 0:
                    return stem + suffixes[key]
        return word

    def _step6(self, word):
        suffixes = ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement",
                    "ment", "ent", "ou", "ism", "ate", "iti", "ous", "ive", "ize"]
        for suffix in suffixes:
            if word.endswith(suffix):
                stem = word[:-len(suffix)]
                if self._measure(stem) > 1:
                    return stem
        return word

    def _step7a(self, word):
        if word.endswith("e"):
            stem = word[:-1]
            if self._measure(stem) > 1:
                return stem
            elif self._measure(stem) == 1 and not self._ends_with_cvc(stem):
                return stem
        return word

    def _step7b(self, word):
        if self._measure(word) > 1 and self._double_consonant(word) and word.endswith("l"):
            return word[:-1]
        return word

    def stem(self, word):
        word = self._step1(word)
        word = self._step2(word)
        word = self._step3(word)
        word = self._step4(word)
        word = self._step5(word)
        word = self._step6(word)
        word = self._step7a(word)
        word = self._step7b(word)
        return word

def test_stemmer():
    stemmer = PorterStemmer()

    words = ["computers", "singing", "controlling", "generalizations",
             "elephants", "doing", "relational", "caresses"]
    results = []

    for word in words:
        results.append((word, stemmer.stem(word)))

    print(results)

test_stemmer()

Output:

[('computers', 'comput'), ('singing', 'sing'), ('controlling', 'control'), ('generalizations', 'gener'),
('elephants', 'eleph'), ('doing', 'do'), ('relational', 'relat'), ('caresses', 'caress')]
Lab 3 : POS Tagging

Code:

from __future__ import division  # To avoid integer division
from operator import itemgetter

### Training Phase ###

with open("wsj_training.txt", "r") as myfile:
    tr_str = myfile.read()

tr_li = tr_str.split()
num_words_train = len(tr_li)

train_li_words = ['']
train_li_words *= num_words_train

train_li_tags = ['']
train_li_tags *= num_words_train

for i in range(num_words_train):
    temp_li = tr_li[i].split("/")
    train_li_words[i] = temp_li[0]
    train_li_tags[i] = temp_li[1]
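
The WSJ training and test files are not included in the manual. The parsing above assumes whitespace-separated word/tag tokens; a small illustrative check with a made-up line (not the actual corpus):

# Hypothetical sample line in the assumed wsj_training.txt / wsj_test.txt format
sample = "The/DT stock/NN market/NN fell/VBD sharply/RB ./."
print([tok.split("/") for tok in sample.split()])
# [['The', 'DT'], ['stock', 'NN'], ['market', 'NN'], ['fell', 'VBD'], ['sharply', 'RB'], ['.', '.']]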

dict2_tag_follow_tag_ = {}    # tag -> {next tag: count}, later normalised to transition probabilities

dict2_word_tag = {}           # word -> {tag: count}, later normalised to emission probabilities

dict_word_tag_baseline = {}   # word -> most frequent tag (baseline tagger)

for i in range(num_words_train - 1):
    outer_key = train_li_tags[i]
    inner_key = train_li_tags[i + 1]
    dict2_tag_follow_tag_[outer_key] = dict2_tag_follow_tag_.get(outer_key, {})
    dict2_tag_follow_tag_[outer_key][inner_key] = dict2_tag_follow_tag_[outer_key].get(inner_key, 0)
    dict2_tag_follow_tag_[outer_key][inner_key] += 1

    outer_key = train_li_words[i]
    inner_key = train_li_tags[i]
    dict2_word_tag[outer_key] = dict2_word_tag.get(outer_key, {})
    dict2_word_tag[outer_key][inner_key] = dict2_word_tag[outer_key].get(inner_key, 0)
    dict2_word_tag[outer_key][inner_key] += 1

dict2_tag_follow_tag_['.'] = dict2_tag_follow_tag_.get('.', {})
dict2_tag_follow_tag_['.'][train_li_tags[0]] = dict2_tag_follow_tag_['.'].get(train_li_tags[0], 0)
dict2_tag_follow_tag_['.'][train_li_tags[0]] += 1

last_index = num_words_train - 1

# Accounting for the last word-tag pair
outer_key = train_li_words[last_index]
inner_key = train_li_tags[last_index]
dict2_word_tag[outer_key] = dict2_word_tag.get(outer_key, {})
dict2_word_tag[outer_key][inner_key] = dict2_word_tag[outer_key].get(inner_key, 0)
dict2_word_tag[outer_key][inner_key] += 1

# Normalise counts to probabilities and sort each inner table by tag name
for key in dict2_tag_follow_tag_:
    di = dict2_tag_follow_tag_[key]
    s = sum(di.values())
    for innkey in di:
        di[innkey] /= s
    di = di.items()
    di = sorted(di, key=lambda x: x[0])
    dict2_tag_follow_tag_[key] = di

for key in dict2_word_tag:
    di = dict2_word_tag[key]
    dict_word_tag_baseline[key] = max(di, key=di.get)
    s = sum(di.values())
    for innkey in di:
        di[innkey] /= s
    di = di.items()
    di = sorted(di, key=lambda x: x[0])
    dict2_word_tag[key] = di

### Testing Phase ###

with open("wsj_test.txt", "r") as myfile:
    te_str = myfile.read()

te_li = te_str.split()
num_words_test = len(te_li)

test_li_words = ['']
test_li_words *= num_words_test

test_li_tags = ['']
test_li_tags *= num_words_test

output_li = ['']
output_li *= num_words_test

output_li_baseline = ['']
output_li_baseline *= num_words_test

num_errors = 0
num_errors_baseline = 0

for i in range(num_words_test):
    temp_li = te_li[i].split("/")
    test_li_words[i] = temp_li[0]
    test_li_tags[i] = temp_li[1]

    output_li_baseline[i] = dict_word_tag_baseline.get(temp_li[0], '')
    # If unknown word - tag = 'NNP'
    if output_li_baseline[i] == '':
        output_li_baseline[i] = 'NNP'
    if output_li_baseline[i] != test_li_tags[i]:
        num_errors_baseline += 1

    if i == 0:  # Accounting for the 1st word in the test document for the Viterbi
        di_transition_probs = dict2_tag_follow_tag_['.']
    else:
        di_transition_probs = dict2_tag_follow_tag_[output_li[i - 1]]

    di_emission_probs = dict2_word_tag.get(test_li_words[i], '')

    # If unknown word - tag = 'NNP'
    if di_emission_probs == '':
        output_li[i] = 'NNP'
    else:
        max_prod_prob = 0
        counter_trans = 0
        counter_emis = 0
        prod_prob = 0
        # Both lists are sorted by tag name, so walk them together and pick the
        # tag with the highest transition * emission probability
        while counter_trans < len(di_transition_probs) and counter_emis < len(di_emission_probs):
            tag_tr = di_transition_probs[counter_trans][0]
            tag_em = di_emission_probs[counter_emis][0]
            if tag_tr < tag_em:
                counter_trans += 1
            elif tag_tr > tag_em:
                counter_emis += 1
            else:
                prod_prob = di_transition_probs[counter_trans][1] * di_emission_probs[counter_emis][1]
                if prod_prob > max_prod_prob:
                    max_prod_prob = prod_prob
                    output_li[i] = tag_tr
                # print("i=", i, " and output=", output_li[i])
                counter_trans += 1
                counter_emis += 1
        if output_li[i] == '':
            # In case there are no matching entries between the transition tags and
            # emission tags, we choose the most frequent emission tag
            output_li[i] = max(di_emission_probs, key=itemgetter(1))[0]

    if output_li[i] != test_li_tags[i]:
        num_errors += 1

print("Fraction of errors (Baseline):", num_errors_baseline / num_words_test)
print("Fraction of errors (Viterbi):", num_errors / num_words_test)

print("Tags suggested by Baseline Algorithm:", output_li_baseline)

print("Tags suggested by Viterbi Algorithm:", output_li)

print("Correct tags:", test_li_tags)

Output:

Fraction of errors (Baseline): 0.15384615384615385
Fraction of errors (Viterbi): 0.07692307692307693
Tags suggested by Baseline Algorithm: ['NNP', 'VBD', 'IN', 'VBN', 'NNP', 'VBN', 'TO', 'NNS',
'RB', 'CD', 'NN', 'IN', '.']
Tags suggested by Viterbi Algorithm: ['NNP', 'VBD', 'IN', 'VBD', 'NNP', 'VBN', 'TO', 'NNS',
'RB', 'CD', 'NN', 'IN', '.']
Correct tags: ['NNP', 'VBD', 'WDT', 'VBD', 'NNP', 'VBN', 'TO', 'NNS', 'RB', 'CD', 'NN', 'IN', '.']
Lab 4 : Sentiment Analysis

1. Sentiment Analysis using Supervised Learning Model

import nltk
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

pos_tweets = [('It is not impossible', 'positive'),
              ('You are my lovely friend', 'Positive'),
              ('She is beautiful girl', 'Positive'),
              ('He is looking handsome', 'Positive'),
              ('Exercise is good for health', 'Positive'),
              ('Today\'s weather is fantastic', 'Positive'),
              ('I love Mango', 'Positive')]

neg_tweets = [('You are my enemy friend', 'Negative'),
              ('She is looking ugly ', 'Negative'),
              ('He is looking horrible', 'Negative'),
              ('Sleeping more makes you lazy', 'Negative'),
              ('Today\'s weather is very bad', 'Negative'),
              ('I hate Banana', 'Negative')]

Senti_tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    Senti_tweets.append((words_filtered, sentiment))
print(Senti_tweets)

def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in Senti_tweets:
        all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

word_features = get_word_features(get_words_in_tweets(Senti_tweets))
print(word_features)

word_features_filtered = []
for w in word_features:
    if w not in stopwords:
        word_features_filtered.append(w)

print(word_features_filtered)

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features_filtered:
        features['contains(%s)' % word] = (word in document_words)
    return features

training_set = nltk.classify.apply_features(extract_features, Senti_tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

test_tweet = 'This is a horrible book'

print()
print("{}: Sentiment = {}".format(test_tweet,
                                  classifier.classify(extract_features(test_tweet.split()))))

Output:

[(['not', 'impossible'], 'positive'), (['you', 'are', 'lovely', 'friend'], 'Positive'), (['she', 'beautiful',
'girl'], 'Positive'), (['looking', 'handsome'], 'Positive'), (['exercise', 'good', 'for', 'health'], 'Positive'),
(["today's", 'weather', 'fantastic'], 'Positive'), (['love', 'mango'], 'Positive'), (['you', 'are', 'enemy',
'friend'], 'Negative'), (['she', 'looking', 'ugly'], 'Negative'), (['looking', 'horrible'], 'Negative'),
(['sleeping', 'more', 'makes', 'you', 'lazy'], 'Negative'), (["today's", 'weather', 'very', 'bad'],
'Negative'), (['hate', 'banana'], 'Negative')]
dict_keys(['not', 'impossible', 'you', 'are', 'lovely', 'friend', 'she', 'beautiful', 'girl', 'looking',
'handsome', 'exercise', 'good', 'for', 'health', "today's", 'weather', 'fantastic', 'love', 'mango',
'enemy', 'ugly', 'horrible', 'sleeping', 'more', 'makes', 'lazy', 'very', 'bad', 'hate', 'banana'])
['impossible', 'lovely', 'friend', 'beautiful', 'girl', 'looking', 'handsome', 'exercise', 'good', 'health',
"today's", 'weather', 'fantastic', 'love', 'mango', 'enemy', 'ugly', 'horrible', 'sleeping', 'makes', 'lazy',
'bad', 'hate', 'banana']

This is a horrible book: Sentiment = Negative

2. Sentiment Analysis on csv data

import pandas as pd

df = pd.read_csv('full-corpus.csv')
df.head()

tweets_data = list(zip(df['TweetText'], df['Sentiment']))

# Split the data into positive and negative tweets
pos_tweets = [(text, 'Positive') for text, sentiment in tweets_data if sentiment.lower() == 'positive']
neg_tweets = [(text, 'Negative') for text, sentiment in tweets_data if sentiment.lower() == 'negative']

import re
import nltk
from nltk.corpus import stopwords

def preprocess(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # Convert URLs to the word URL
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # Convert @username to AT_USER
    tweet = re.sub(r'[\s]+', ' ', tweet)          # Remove additional whitespaces
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)    # Replace #word with word
    tweet = tweet.strip('\'"')                    # Trim
    return tweet
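
A quick illustration of what preprocess produces, using a made-up tweet (not taken from the dataset):

# Hypothetical example, for illustration only
print(preprocess("Check https://t.co/abc by @user #Israel"))
# check URL by AT_USER israel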

# Extract word features
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

import re
import nltk

# Word tokenizer
def tokenize(tweet):
    return nltk.word_tokenize(tweet)

# Remove stopwords and get words from tweets
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(tokenize(words))
    return all_words

# Extract features
def extract_features(document):
    document_words = set(tokenize(document))
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

# Replace the hardcoded sample tweets with the extracted tweets
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    tweets.append((preprocess(words), sentiment))

import nltk
nltk.download('punkt')

# Create word features and train the classifier
word_features = get_word_features(get_words_in_tweets(tweets))
training_set = nltk.classify.apply_features(extract_features, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Test the classifier (optional, if you want to test with a sample tweet)
test_tweet = ("It’s easy to be brave when you’re hiding behind a keyboard. "
              "You and your Hamas friends will regret your barbaric actions very soon.")
print(classifier.classify(extract_features(preprocess(test_tweet))))

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Assuming tweets is a list of (tweet, sentiment) pairs

# Splitting data into 80% training and 20% testing
train_size = int(len(tweets) * 0.8)
train_tweets = tweets[:train_size]
test_tweets = tweets[train_size:]

# Train the classifier with the training data
training_set = nltk.classify.apply_features(extract_features, train_tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Predict the sentiments of the test set
predictions = [classifier.classify(extract_features(tweet)) for tweet, _ in test_tweets]
actual = [sentiment for _, sentiment in test_tweets]

# Calculate accuracy, precision, and recall
accuracy = accuracy_score(actual, predictions)
precision = precision_score(actual, predictions, average='weighted')
recall = recall_score(actual, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

# If you want a detailed report for each class (Positive, Negative, etc.)
print(classification_report(actual, predictions))

Output:

Negative
Lab 5 : Turney’s method of Sentiment Analysis

Code:

import math
import re
import json
import nltk

# nltk.download("averaged_perceptron_tagger", quiet=True)

def loadReviews(fileName):
    file = open(fileName)
    list_pos = []
    list_neg = []
    data = json.load(file)
    for elem in data:
        if float(elem["overall"]) >= 3.0:
            list_pos.append(elem["reviewText"])
        else:
            list_neg.append(elem["reviewText"])
    file.close()
    return list_pos, list_neg

def make_datasets(fileName):
    all_positive_reviews, all_negative_reviews = loadReviews(fileName)
    dataset = {"train": {"neg": [], "pos": []}, "test": {"neg": [], "pos": []}}
    dataset["train"]["pos"] = all_positive_reviews[:20000]
    dataset["train"]["neg"] = all_negative_reviews[:20000]
    dataset["test"]["pos"] = all_positive_reviews[-50:]
    dataset["test"]["neg"] = all_negative_reviews[-50:]

    return dataset

def find_pattern(postag):
    # Extract two-word phrases matching Turney's POS patterns, e.g. JJ + NN/NNS,
    # RB/RBR/RBS + JJ not followed by a noun, JJ + JJ not followed by a noun, etc.
    tag_pattern = []
    for k in range(len(postag) - 2):
        if postag[k][1] == "JJ" and (
            postag[k + 1][1] == "NN" or postag[k + 1][1] == "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))

        elif (
            (postag[k][1] == "RB" or postag[k][1] == "RBR" or postag[k][1] == "RBS")
            and postag[k + 1][1] == "JJ"
            and postag[k + 2][1] != "NN"
            and postag[k + 2][1] != "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))

        elif (
            postag[k][1] == "JJ"
            and postag[k + 1][1] == "JJ"
            and postag[k + 2][1] != "NN"
            and postag[k + 2][1] != "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))

        elif (
            (postag[k][1] == "NN" or postag[k][1] == "NNS")
            and postag[k + 1][1] == "JJ"
            and postag[k + 2][1] != "NN"
            and postag[k + 2][1] != "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))

        elif (
            postag[k][1] == "RB" or postag[k][1] == "RBR" or postag[k][1] == "RBS"
        ) and (
            postag[k + 1][1] == "VB"
            or postag[k + 1][1] == "VBD"
            or postag[k + 1][1] == "VBN"
            or postag[k + 1][1] == "VBG"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
    return tag_pattern

def near_operator(phrase, word, text):
    # Count co-occurrences of `phrase` and `word` within roughly 400 words of each other
    try:
        string = (
            word
            + r"\W+(?:\w+\W+){0,400}?"
            + phrase
            + r"|"
            + phrase
            + r"\W+(?:\w+\W+){0,400}?"
            + word
        )
        freq_phrase_near_word = len(re.findall(string, text))
        return freq_phrase_near_word
    except:
        return 0

class Turney(object):
    def __init__(self, dataset):
        self.datasets = dataset
        self.pos_phrases_hits = []
        self.neg_phrases_hits = []
        self.pos_hits = 0.01
        self.neg_hits = 0.01
        self.accuracy = 0

    def turney(self):
        tp = 0
        fp = 0
        tn = 0
        fn = 0
        for boolean, test_klass in enumerate(["pos", "neg"]):
            for i, data in enumerate(self.datasets["test"][test_klass]):
                print(
                    str(i)
                    + " out of "
                    + str(len(self.datasets["test"][test_klass]))
                    + " --> round "
                    + str(boolean)
                )

                phrases = find_pattern(nltk.pos_tag(nltk.word_tokenize(data)))
                if len(phrases) == 0:
                    continue
                # 0.01 acts as a small smoothing constant to avoid zero counts
                self.pos_phrases_hits = [0.01] * len(phrases)
                self.neg_phrases_hits = [0.01] * len(phrases)
                self.pos_hits = 0.01
                self.neg_hits = 0.01

                for train_klass in ["pos", "neg"]:
                    for text in self.datasets["train"][train_klass]:
                        for ind, phrase in enumerate(phrases):
                            self.pos_phrases_hits[ind] += near_operator(phrase, "excellent", text)
                            self.neg_phrases_hits[ind] += near_operator(phrase, "poor", text)
                        self.pos_hits += text.count("excellent")
                        self.neg_hits += text.count("poor")
                res = self.calculate_sentiment(boolean)
                # compute if correct prediction
                if res == 1 and boolean == 0:
                    fp += 1
                elif res == 1 and boolean == 1:
                    tp += 1
                elif res == 0 and boolean == 0:
                    fn += 1
                elif res == 0 and boolean == 1:
                    tn += 1
        print("Accuracy: " + str(self.accuracy / 100))
        print("True positive: " + str(tp))
        print("False positive: " + str(fp))
        print("True negative: " + str(tn))
        print("False negative: " + str(fn))
        print("Recall-positive: " + str(tp / (tp + fn)))
        print("Precision-positive: " + str(tp / (tp + fp)))
        print("Recall-negative: " + str(tn / (tn + fp)))
        print("Precision-negative: " + str(tn / (tn + fn)))

    def calculate_sentiment(self, is_negative=0):
        # Semantic orientation of each phrase (Turney's SO-PMI):
        # SO(phrase) = log2( hits(phrase NEAR "excellent") * hits("poor")
        #                    / (hits(phrase NEAR "poor") * hits("excellent")) )
        polarities = [0] * len(self.pos_phrases_hits)
        for i in range(len(self.pos_phrases_hits)):
            polarities[i] = math.log(
                (self.pos_phrases_hits[i] * self.neg_hits)
                / (self.neg_phrases_hits[i] * self.pos_hits),
                2,
            )
        pmi = sum(polarities) / len(polarities)
        if (pmi > 0 and is_negative == 0) or (pmi < 0 and is_negative == 1):
            self.accuracy += 1
            return 1
        return 0

if __name__ == "__main__":
    FILE_PATH = "/content/Cell_Phones_and_Accessories_5.json"
    datasets = make_datasets(FILE_PATH)
    turney = Turney(datasets)
    turney.turney()

Output:
Lab 6 : Text Summarization

1. Single document summarization using TF-IDF:

import nltk
nltk.download('punkt')
nltk.download('stopwords')

import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

text = """ Unverified reports of ‘40 babies beheaded’ in Israel-Hamas war


inflame social media No photo evidence had been made public as of Thursday
morning corroborating claims that babies had been beheaded. Israel has
published photos of dead infants after the terror attack.
Editor’s note: This story includes graphic descriptions of violent acts
that some readers may find disturbing.
A series of shocking reports have spread horrific claims of baby
beheadings by Hamas militants across social and mainstream media in recent
days, adding a particularly incendiary element to an already violent and
bitter war. But the reports are still unconfirmed, and in some cases have
been retracted.
The most high-profile claim came Wednesday night when President Joe Biden
said that he had seen photographic evidence of terrorists beheading
children. The White House later clarified that Biden was referring to news
reports about beheadings, which have not included or referred to
photographic evidence.
Photos have been published by Hamas showing beheaded soldiers and the X
account belonging to Israeli Prime Minister Benjamin Netanyahu posted
pictures on Thursday of babies killed and burned by Hamas. No photo
evidence had been made public as of Thursday morning corroborating claims
that babies had been beheaded.
Unverified information spreads quickly on social media, particularly
around breaking news events, reaching even larger audiences when it is
shared by mainstream news outlets, politicians and people with large
followings. Follow-ups that retract or add context are less likely to be
repeated or reach the same audience.
Biden’s statement followed a series of news reports and comments from
Israeli officials, most of which have since been softened or walked back.
Easily debunked misinformation like fake press releases have circulated
widely since the start of the war, but such stories often die down quickly
once proven false. The claims about beheadings, difficult to verify, have
continued to spread thanks in part to the lack of clarity.
Alexei Abrahams, a disinformation researcher at McGill University in
Montreal, said that even without the allegations of beheaded babies, just
the facts themselves are horrifying enough to have the kind of effect you
expect.
It may turn out that the slaughter was done in a particularly barbaric
way. But one way or another, this is an absolutely shocking, unprecedented
event of violence, Abrahams said. The general concern, of course, is that
it’s going to exacerbate what is already a very fraught situation.
On Wednesday, a spokesman for Israeli Prime Minister Benjamin Netanyahu
told CNN that babies and toddlers were found with their 'heads
decapitated' in southern Israel after Hamas’ attack. By Thursday morning,
an Israeli official told CNN the government had not confirmed claims of
the beheadings.
A senior State Department official said Thursday morning that the agency
was not in a position to confirm the beheading claims.
Many of the reports appear to have originated from Israeli soldiers and
people affiliated with the Israel Defense Force (IDF).
An IDF spokesperson told Business Insider on Tuesday that soldiers had
found decapitated babies, but said Wednesday it would not investigate or
provide further evidence regarding the claim. Late Wednesday, an IDF
spokesperson said in a video on X that the IDF had relative confidence of
the claims.
On Thursday, in a call with a group of international journalists, Colonel
Golan Vach, the head of the IDF’s national search and rescue unit, said
that he had “found one baby with his head cut.'
Marc Owen Jones, an associate professor of Middle East studies at Hamad
Bin Khalifa University in Qatar who studies misinformation, told NBC News
that he found that the source of the '40 babies beheaded” allegations
largely stemmed from a viral Israeli news broadcast clip that did not
specifically refer to the allegation.
Nicole Zedeck, a correspondent for the privately owned Israeli news outlet
i24NEWS, said in the video that Israeli soldiers told her they’d found
“babies, their heads cut off.' The video has been viewed more than 11
million times on X, according to its view counter. In another tweet,
Zedeck wrote that soldiers told her they believe “40 babies/children were
killed.
Somehow those two bits of information were connected, the story became ‘40
babies were beheaded,’ and in the British press today, about six or seven
newspapers had it on their front pages,' Jones said.
An IDF spokesperson, Doron Spielman, told NBC News on Tuesday that he
could not confirm i24NEWS’s report.
Yossi Landau, the head of operations for the southern region of Zaka,
Israel’s volunteer civilian emergency response organization, told CBS News
that he saw the bodies of beheaded children and babies, parents and
children who had been tortured and had their hands bound, and “a lot more
that cannot be described for now, because it’s very hard to describe.
By Wednesday, the claims, though still contentious, were going viral
online — being used as evidence of Hamas’ depravity. On Wednesday, the
phrase “Did Hamas kill babies” saw the biggest increase in search interest
on Google of anything related to the war.
Stranger Things star Noah Schnapp posted the shocking claim to his 25
million Instagram followers: '40 babies were beheaded and burned alive in
front of their parents by Hamas.' Sen. Ted Cruz, R-Texas, mentioned
beheaded babies in a post on X, and Rep. Mike McCaul, R-Texas, echoed the
allegations on CNN.
Jones found that the '40 babies beheaded' claim had over 44 million
impressions on X, with over 300,000 likes and more than 100,000 reposts.
The main accounts propagating the claims were i24NEWS and the official
Israel account, Jones’ data showed.
Baby stories are very emotive. Historically, they’re stories that can be
used to rationalize a very brutal response, Jones said. It’s such a
volatile information environment that such claims will inevitably be taken
out of context, both deliberately and accidentally.
"""

def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix
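
Note: step 6 below calls _create_tf_idf_matrix(tf_matrix, idf_matrix), but the manual never lists that helper. A minimal sketch, assuming it simply multiplies the TF and IDF values word by word for each sentence key:

def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    # The two matrices share the same sentence keys, so walk them in parallel
    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):
        tf_idf_table = {}

        for (word1, tf_value), (word2, idf_value) in zip(f_table1.items(), f_table2.items()):
            tf_idf_table[word1] = float(tf_value * idf_value)  # tf-idf = tf * idf

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix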

def _score_sentences(tf_idf_matrix) -> dict:
    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

def _find_average_score(sentenceValue) -> float:
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average score of a sentence in the original text
    average = sumValues / len(sentenceValue)

    return average

def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= threshold:
            summary += " " + sentence
            sentence_count += 1

    return summary

# 1 Sentence Tokenization
sentences = sent_tokenize(text)
total_documents = len(sentences)
print(total_documents)

41

# 2 Creating the Frequency matrix of the words in each sentence
freq_matrix = _create_frequency_matrix(sentences)

# 3 Calculate Term Frequency matrix
tf_matrix = _create_tf_matrix(freq_matrix)

# 4 Create the table of documents per word
count_doc_per_words = _create_documents_per_words(freq_matrix)

# 5 Calculate IDF and generate a matrix
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

# 6 Calculate TF-IDF and generate a matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

# 7 Important Algorithm: score the sentences
sentence_scores = _score_sentences(tf_idf_matrix)

# 8 Find the threshold
threshold = _find_average_score(sentence_scores)
print(threshold)

0.07720921963564482

# 9 Important step: Generate the summary
summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
print(summary)

Output:

Israel has published photos of dead infants after the terror attack. But the reports are still
unconfirmed, and in some cases have been retracted. Follow-ups that retract or add context are
less likely to be repeated or reach the same audience. It may turn out that the slaughter was done
in a particularly barbaric way. The video has been viewed more than 11 million times on X,
according to its view counter. Baby stories are very emotive.
Lab 7 : Multi-document Text Summarization

1. Multi-document summarization

articles = []
import pandas as pd
import numpy as np

articles.append({"Title": "Article1", "Content": """ """})
articles.append({"Title": "Article2", "Content": """ """})
articles.append({"Title": "Article3", "Content": """ """})
articles.append({"Title": "Article4", "Content": """ """})
articles.append({"Title": "Article5", "Content": """ """})
articles.append({"Title": "Article6", "Content": """ """})
articles.append({"Title": "Article7", "Content": """ """})

df = pd.DataFrame(articles)
print(df)

from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the vectorizer with stop_words set to 'english'
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Compute the TF-IDF vectors for the articles
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Content'])

from sklearn.metrics.pairwise import linear_kernel

# Compute cosine similarities between all pairs of articles
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_similarities)

# Extract the most representative sentence from each article
summaries = []
for idx, article in enumerate(df['Content']):
    # Tokenize the article into sentences
    sentences = article.split('. ')

    # Compute the TF-IDF vectors for the sentences
    sentence_vectors = tfidf_vectorizer.transform(sentences)

    # Compute cosine similarities between the article and its sentences
    cosine_similarities_sentences = linear_kernel(tfidf_matrix[idx:idx+1], sentence_vectors).flatten()

    # Find the sentence with the highest cosine similarity
    top_sentence_idx = cosine_similarities_sentences.argsort()[-1]
    top_sentence = sentences[top_sentence_idx]

    summaries.append(top_sentence)

# Add summaries to the dataframe
df['Summary'] = summaries
print(df[['Title', 'Summary']])

# Extract the most representative sentence from each article (as we did before)
summaries = []
for idx, article in enumerate(df['Content']):
    sentences = article.split('. ')
    sentence_vectors = tfidf_vectorizer.transform(sentences)
    cosine_similarities_sentences = linear_kernel(tfidf_matrix[idx:idx+1], sentence_vectors).flatten()
    top_sentence_idx = cosine_similarities_sentences.argsort()[-1]
    top_sentence = sentences[top_sentence_idx]
    summaries.append(top_sentence)

# Create a merged summary by concatenating the most representative sentences from each article
merged_summary = '. '.join(summaries)
print("Merged Summary:")
print(merged_summary)

Output:

Title Content
0 Article1 Unverified reports of ‘40 babies beheaded’ in ...
1 Article2 The 'horrendous toll' on children caught in th...
2 Article3 What we actually know about the viral report o...
3 Article4 Israel releases horrific images of slain child...
4 Article5 ‘I would see and have confirmed pictures of te...
5 Article6 At least 40 babies killed, beheaded in Israeli...
6 Article7 ‘I would see and have confirmed pictures of te…

[[1.         0.14483033 0.39585    0.2582229  0.26818106 0.20422504 0.27895569]
 [0.14483033 1.         0.22168557 0.24798395 0.2299564  0.21169182 0.23901851]
 [0.39585    0.22168557 1.         0.28336111 0.30100884 0.31592353 0.31263709]
 [0.2582229  0.24798395 0.28336111 1.         0.20396822 0.18262805 0.21117037]
 [0.26818106 0.2299564  0.30100884 0.20396822 1.         0.18920684 0.96356329]
 [0.20422504 0.21169182 0.31592353 0.18262805 0.18920684 1.         0.1972829 ]
 [0.27895569 0.23901851 0.31263709 0.21117037 0.96356329 0.1972829  1.        ]]

Title Summary
0 Article1 In another tweet, Zedeck wrote that soldiers t...
1 Article2 "Depriving children of access to food and esse...
2 Article3 What happened in Kibbutz Kfar Aza is a massacr...
3 Article4 Secretary of State Antony Blinken and NATO def...
4 Article5 I never thought I’d ever — anyway.” He did not...
5 Article6 At least 40 babies killed, beheaded in Israeli...
6 Article7 I never thought I’d ever — anyway.” He did not…

Merged Summary:
In another tweet, Zedeck wrote that soldiers told her they believe “40 babies/children were
killed.”
“Somehow those two bits of information were connected, the story became ‘40 babies were
beheaded,’ and in the British press today, about six or seven newspapers had it on their front
pages,” Jones said.
An IDF spokesperson, Doron Spielman, told NBC News on Tuesday that he could not confirm
i24NEWS’s report.
Yossi Landau, the head of operations for the southern region of Zaka, Israel’s volunteer civilian
emergency response organization, told CBS News that he saw the bodies of beheaded children
and babies, parents and children who had been tortured and had their hands bound, and “a lot
more that cannot be described for now, because it’s very hard to describe.”
By Wednesday, the claims, though still contentious, were going viral online — being used as
evidence of Hamas’ depravity. "Depriving children of access to food and essential services puts
their lives at risk, as do attacks on civilian areas and infrastructure."
According to UNICEF, 80% of those who live in the Gaza Strip rely on some form of
humanitarian assistance.
The conflict has led to "grave humanitarian consequences," Lynn Hastings, a humanitarian
coordinator for the Gaza Strip for the United Nations Office for the Coordination of
Humanitarian Affairs, while calling for all parties to abide by international humanitarian law.
"Civilians, especially children, medical facilities, humanitarian personnel health workers, and
journalists must be protected," Hastings said in a statement Tuesday. What happened in Kibbutz
Kfar Aza is a massacre in which women, children and toddlers and elderly were brutally
butchered in an ISIS way of action."
What happened at the kibbutz?
The Kfar Aza kibbutz is one of several self-contained Israeli settlements close to the Gaza
border.
It is located between Netivot and Sderot - around three miles from the border in southern Israel.
Because of its proximity to Gaza and the unprecedented nature of last weekend's incursion,
which saw Hamas militants breach the usually heavily guarded border on foot - it was one of the
first sites they reached on Saturday.
Four days later, journalists got to see the destruction left behind.
Ramsay said the scene "can only be described as a massacre".
"The stories here are shocking - families being woken without warning to voices outside their
houses, mums and dads hiding their children in cupboards, wine cellars and basements, husbands
and wives becoming separated in the fight," he said.
He added it took 17 hours for help to arrive, as the IDF focused on urban areas first - leaving
residents defenceless and numbers of dead high.
Why are there reports of 'babies being beheaded'?
Claims Hamas fighters beheaded babies have only been reported by one journalist - Nicole
Zedek from i24 - and have not been verified by Sky News.
Ms Zedek was among the reporters invited to see what was left at the kibbutz on Tuesday.
In one live broadcast, which has since been viewed millions of times on X, formerly known as
Twitter, she says: "Talking to some of the soldiers here, they say what they witnessed as they've
been walking through these communities is bodies of babies with their heads cut off and families
gunned down in their beds.
"We can see some of these soldiers right now, comforting each other."
She is also filmed speaking to the deputy commander of the IDF's unit 71, David Ben Zion, who
describes Hamas fighters as "aggressive" and "very bad".
He says: "They cut off heads… of children, of women."
And in another live broadcast, Zedek describes "40 babies at least were taken out on gurneys" -
which is where the widely shared 40 figure comes from.
[output truncated]
On Wednesday, Heinrich quoted CNN that infants and young children had been discovered with
"decapitated" bodies in the community of Kfar Aza.
An Israeli Defense Forces spokesperson also told The Intercept that they could not confirm it
officially, but they believed the report.
Yossi Landau, a representative from Israel's volunteer civilian emergency response organization,
Zaka, shared with CBS News that he witnessed the gruesome sight of children and infants who
had been decapitated.
“I saw a lot more that cannot be described for now, because it’s very hard to describe,” he said.
Nicole Zedek, a television reporter for i24NEWS based in Tel Aviv, was the initial source to
report the allegations of child beheadings on Tuesday
Lab 8 : Term Incidence Matrix
Code:

Doc_1 = """ """

Doc_2 = """ """

Doc_3 = """ """

Doc_4 = """ """

Doc_5 = """ """

Doc_6 = """ """

Doc_7 = """ """

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
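
Note: the loop below iterates over a list named docs, which is never defined in the manual; presumably it collects the seven document strings declared above (an assumption):

docs = [Doc_1, Doc_2, Doc_3, Doc_4, Doc_5, Doc_6, Doc_7]  # assumed: the seven documents above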

# getting unique terms in the documents
# set - does not allow duplicate elements
unique_terms = set()  # creating an empty set

for doc in docs:
    # tokenize words
    for term in doc.split():
        if term.lower() not in stop_words:
            unique_terms.add(term)

print(unique_terms)
print(len(unique_terms))

# Initialize NLTK's word tokenizer
nltk.download('punkt')  # Download the required dataset (only need to do this once)

# Tokenize words in each document
tokenized_docs = [word_tokenize(doc) for doc in docs]

# Print tokenized documents
for i, doc_tokens in enumerate(tokenized_docs, 1):
    print(f"Tokens in Doc_{i}: {doc_tokens}")

# creating term-document incidence matrix in the form of a dictionary
doc_term_matrix = {}

for term in unique_terms:
    doc_term_matrix[term] = []

    for doc in docs:
        # note: this is a substring check, so e.g. 'war' also matches 'warning'
        if term in doc:
            doc_term_matrix[term].append(1)
        else:
            doc_term_matrix[term].append(0)

for term, term_list in doc_term_matrix.items():
    print(f"{term}: {term_list}")

Output:

{'contentious,', 'personnel', 'photo', 'expressed', 'body', 'killing,', 'particularly', 'spoke',
'Coordination', '"unconfirmed".', 'caught', "I'm", 'reposts.', '"Please', 'still', 'thousands', 'targeted',
'linking', 'report.', 'babies”', 'Unverified', 'called', 'comparing', '"To', 'bitter', '260', 'small',
'Kibbutz', 'retract', 'decapitated', 'another', 'large', 'unconfirmed,', 'United', 'related', 'harmed',
'point', 'suffering', 'founded', 'possible', 'located', 'believed', 'mums', 'Wednesday,', 'Barkat',
……..
}
1534

Tokens in Doc_1: ['Unverified', 'reports', 'of', '‘', '40', 'babies', 'beheaded', '’', 'in', 'Israel-Hamas',
'war', 'inflame', 'social', 'media', 'No', 'photo', 'evidence', 'had', 'been', 'made', 'public', 'as', 'of',,
'.',...... ']
Tokens in Doc_2: ['The', "'horrendous", 'toll', "'", 'on', 'children', 'caught', 'in', 'the', 'Israel-Gaza',
'conflict', 'Hundreds', 'of', 'children', 'have', 'been', 'killed', 'so', 'far', ',', 'with', 'the', 'true', 'total',
'Russell', 'said', '.......']
Tokens in Doc_3: ['What', 'we', 'actually', 'know', 'about', 'the', 'viral', 'report', 'of', 'beheaded',
'babies', 'in', 'Israel', 'One', 'journalist', 'from', 'the', 'Tel', 'Aviv-based', 'news', 'channel', 'i24', 'said',
'war', '........']
Tokens in Doc_4: ['Israel', 'releases', 'horrific', 'images', 'of', 'slain', 'children', 'after', 'Hamas',
'attack', 'JERUSALEM', '/', 'TEL', 'AVIV', '/', 'BRUSSELS', '-', 'CONTENT', 'WARNING', ':',
'right', 'things', '.', ……’]
Tokens in Doc_5: ['‘', 'I', 'would', 'see', 'and', 'have', 'confirmed', 'pictures', 'of', 'terrorists',
'beheading', 'children', ',', '’', 'Joe', 'Biden', 'decries', 'Hamas', 'atrocity', 'in', 'Israel', 'Reports',
'attack', '........']
Tokens in Doc_6: ['At', 'least', '40', 'babies', 'killed', ',', 'beheaded', 'in', 'Israeli', 'kibbutz', 'outside',
'Gaza', 'Strip', ',', 'reports', 'say', 'KFAR', 'AZA', ',', 'Israel', '(', 'TND', ')', '—', 'Dozens', 'of',
'Monday', …..'.']
Tokens in Doc_7: ['‘', 'I', 'would', 'see', 'and', 'have', 'confirmed', 'pictures', 'of', 'terrorists',
'beheading', 'children', ',', '’', 'Joe', 'Biden', 'decries', 'Hamas', 'atrocity', 'in', 'Israel', 'Reports',, 'the',
'attack', ‘…….']

contentious,: [1, 0, 0, 0, 0, 0, 0]
personnel: [0, 1, 0, 0, 0, 0, 0]
photo: [1, 0, 0, 1, 0, 0, 0]
expressed: [0, 0, 0, 0, 1, 0, 1]
body: [0, 0, 1, 1, 0, 0, 0]
killing,: [0, 1, 0, 0, 0, 0, 0]
particularly: [1, 0, 0, 1, 0, 0, 0]
spoke: [1, 1, 1, 0, 1, 0, 1]
Coordination: [0, 1, 0, 0, 0, 0, 0]
"unconfirmed".: [0, 0, 1, 0, 0, 0, 0]
caught: [0, 1, 0, 0, 0, 0, 0]
I'm: [0, 1, 0, 0, 0, 1, 0]
reposts.: [1, 0, 0, 0, 0, 0, 0]
"Please: [0, 0, 1, 0, 0, 0, 0]
still: [1, 1, 0, 0, 0, 1, 0]
thousands: [0, 0, 0, 1, 0, 0, 0]
targeted: [0, 0, 0, 1, 0, 0, 0]
linking: [0, 0, 1, 0, 0, 0, 0]
report.: [1, 0, 0, 0, 1, 0, 1]
babies”: [1, 0, 0, 0, 0, 0, 0]
Unverified: [1, 0, 0, 0, 0, 0, 0]
called: [0, 1, 0, 0, 0, 0, 0]
comparing: [0, 0, 0, 1, 0, 0, 0]
"To: [0, 0, 1, 0, 0, 0, 0]
bitter: [1, 0, 0, 0, 0, 0, 0]
260: [0, 1, 0, 0, 0, 0, 0]
small: [0, 0, 1, 0, 0, 1, 0]
Kibbutz: [0, 0, 1, 0, 0, 1, 0]
retract: [1, 0, 0, 0, 0, 0, 0]
decapitated: [1, 0, 0, 0, 1, 0, 1]
another: [1, 1, 1, 0, 1, 0, 1]
war: [1, 1, 1, 0, 0, 1, 0]

docs = ["Doc_1", "Doc_2", "Doc_3", "Doc_4", "Doc_5", "Doc_6", "Doc_7"]

# Convert the document list to a NumPy array


docs_array = np.array(docs, dtype='object')

# query to find documents containing killing and war


import numpy as np
docs_array = np.array(docs, dtype='object')
v1 = np.array(doc_term_matrix['killing'])
v2 = np.array(doc_term_matrix['war'])

print(v1)
print(v2)
print('---------')
v3 = v1 & v2
print(v3)

[0 1 0 0 0 0 0]
[1 1 1 0 0 1 0]
---------
[0 1 0 0 0 0 0]

matching_doc_indices = np.where(v3 == 1)[0]

print("Documents where 'killing' and 'war' are both present:")
for doc_index in matching_doc_indices:
    print(docs_array[doc_index])

Documents where 'killing' and 'war' are both present:
Doc_2

# query to find documents containing killing or war
v1 = np.array(doc_term_matrix['killing'])
v2 = np.array(doc_term_matrix['war'])

print(v1)
print(v2)
print('---------')
v = v1 | v2
print(v)

[0 1 0 0 0 0 0]
[1 1 1 0 0 1 0]
---------
[1 1 1 0 0 1 0]

matching_doc_indices = np.where(v == 1)[0]

print("Documents where either 'killing' or 'war' are present:")
for doc_index in matching_doc_indices:
    print(docs_array[doc_index])

Documents where either 'killing' or 'war' are present:
Doc_1
Doc_2
Doc_3
Doc_6
