20BCP123 - NLP Lab Manual
SCHOOL OF TECHNOLOGY
CSE DEPARTMENT
Submitted to,
Dr. Santosh Bharti
Submitted by,
Khushi Shah
(20BCP123)
Lab 1: Tokenization
1. Sentence Tokenization:
def extract_sentences(paragraph):
    # Treat ". ", "? " and "! " as sentence boundaries and split on them.
    paragraph = paragraph.replace("? ", "|")
    paragraph = paragraph.replace("! ", "|")
    paragraph = paragraph.replace(". ", "|")
    sentences = paragraph.split("|")
    return sentences

paragraph = "This is the first sentence. The second one follows? And here comes the third sentence! My email is abc@gmail.com, you can contact here."
sentences = extract_sentences(paragraph)
print(sentences)
Output:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

text = "Dr. Ami Shah is a MD doctor. Please feel free to contact her on ami@gmail.com or etc., many ways are there. " + \
       "Khushi said: 'Today is her birthday. So she is very happy.' So everyone wished her. "
print("Sentence tokenization:", sent_tokenize(text))
Output:
Sentence tokenization: ['Dr. Ami Shah is a MD doctor.', 'Please feel free to contact her on
ami@gmail.com or etc., many ways are there.', "Khushi said: 'Today is her birthday.", "So she is
very happy.'", 'So everyone wished her.']
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

text = "Dr. Ami Shah is a MD doctor. Please feel free to contact her on ami@gmail.com etc. many ways are there. " + \
       "Don't do this. " + \
       "Researcher's said AI will change the word. " + \
       "You can't go there. "
print("Word tokenization:", word_tokenize(text))
Output:
Word tokenization: ['Dr.', 'Ami', 'Shah', 'is', 'a', 'MD', 'doctor', '.', 'Please', 'feel', 'free', 'to',
'contact', 'her', 'on', 'ami', '@', 'gmail.com', 'etc', '.', 'many', 'ways', 'are', 'there', '.', 'Do', "n't", 'do',
'this', '.', 'Researcher', "'s", 'said', 'AI', 'will', 'change', 'the', 'word', '.', 'You', 'ca', "n't", 'go', 'there',
'.']
def word_tokenize(sentence):
    words = []            # Create an empty list to store the words.
    current_word = ""     # Initialize an empty string for the current word.
    for char in sentence:
        if char.isalnum() or char in "@'":    # keep letters, digits, '@' and apostrophes
            current_word += char
        elif current_word:                    # spaces and other punctuation end the current word
            words.append(current_word)
            current_word = ""
    if current_word:
        words.append(current_word)            # Append the last word (if any) to the list.
    return words

sentence = "Dr. Ami is not available today. Please feel free to contact her on ami@gmail.com. Don't come. You can't go there."
tokens = word_tokenize(sentence)
print(tokens)
Output:
['Dr', 'Ami', 'is', 'not', 'available', 'today', 'Please', 'feel', 'free', 'to', 'contact', 'her', 'on', 'ami@gmail',
'com', "Don't", 'come', 'You', "can't", 'go', 'there']
Lab 2 : Stemming and Lemmatization
1. Stemming:
def stemx(word):
    suffix = ['ed', 'es', 'ing', 'ship', 's', 'less', 'ion', 'ly', 'ions', 'full', 'able']
    prefix = ['un', 'pre', 'dis', 're', 'il', 'im', 'in', 'de', 'mis', 'sub', 'non', 'anti']
    for x in prefix:
        if word.startswith(x):
            word = word[len(x):]
    for y in suffix:
        if word.endswith(y):
            word = word[:-len(y)]
    return word
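The driver loop that produces the output below is not shown in the excerpt; a minimal sketch, with the word list inferred from the printed results:

words = ['going', 'friends', 'friendship', 'happily', 'friendly', 'programming']
for w in words:
    print(w, " : ", stemx(w))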
Output:
going : go
friends : friend
friendship : friend
happily : happi
friendly : friend
programming : programm
from nltk.stem import PorterStemmer
ps = PorterStemmer()
words = ['friendship', 'programs', 'programmer', 'programming', 'programmers', 'predefined', 'happily']  # inferred from the output below
for w in words:
    print(w, " : ", ps.stem(w))
Output:
friendship : friendship
programs : program
programmer : programm
programming : program
programmers : programm
predefined : predefin
happily : happili
# Custom dictionary-based lemmatizer (only the constructor survives in this excerpt; requires PyDictionary).
class BasicLemmatizer:
    def __init__(self):
        self.dictionary = PyDictionary()

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
words1 = ['smiling', 'died', 'purchased', 'went', 'bought']
for w in words1:
    print(w + " : " + lemmatizer.lemmatize(w, pos="v"))  # v - verb
Output:
smiling : smile
died : die
purchased : purchase
went : go
bought : buy
better : good
worst : bad
happiest : happy
babies : baby
mice : mouse
feet : foot
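The verb examples above come from the code shown; the adjective and noun results (better/worst/happiest and babies/mice/feet) were presumably produced by similar calls with pos="a" and the default noun POS. A minimal sketch:

for w in ['better', 'worst', 'happiest']:
    print(w + " : " + lemmatizer.lemmatize(w, pos="a"))  # a - adjective
for w in ['babies', 'mice', 'feet']:
    print(w + " : " + lemmatizer.lemmatize(w))           # default POS is noun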
5. Porter Stemmer:
class PorterStemmer:
    def __init__(self):
        pass

    # m: count of VC pairs in the word's vowel/consonant pattern
    def _measure(self, word):
        return ''.join(['V' if char in "aeiou" else 'C' for char in word]).count('VC')

def test_stemmer():
    stemmer = PorterStemmer()
    # The stemming rules themselves are not included in this excerpt, so as a
    # placeholder the test just reports the measure m for a few sample words.
    results = {w: stemmer._measure(w) for w in ["tree", "trouble", "troubles"]}
    print(results)

test_stemmer()
Output:
Lab 3 : POS Tagging with the Viterbi Algorithm
Code:
# tr_li is the tagged training corpus as a list of "word/TAG" tokens and
# num_words_train = len(tr_li); both are built earlier and are not shown in this excerpt.
train_li_words = ['']
train_li_words *= num_words_train
train_li_tags = ['']
train_li_tags *= num_words_train
for i in range(num_words_train):
    temp_li = tr_li[i].split("/")
    train_li_words[i] = temp_li[0]
    train_li_tags[i] = temp_li[1]

dict2_tag_follow_tag_ = {}   # tag -> {next tag: count}
dict2_word_tag = {}          # word -> {tag: count}
for i in range(num_words_train-1):
    outer_key = train_li_tags[i]
    inner_key = train_li_tags[i+1]
    dict2_tag_follow_tag_[outer_key] = dict2_tag_follow_tag_.get(outer_key, {})
    dict2_tag_follow_tag_[outer_key][inner_key] = dict2_tag_follow_tag_[outer_key].get(inner_key, 0)
    dict2_tag_follow_tag_[outer_key][inner_key] += 1
    outer_key = train_li_words[i]
    inner_key = train_li_tags[i]
    dict2_word_tag[outer_key] = dict2_word_tag.get(outer_key, {})
    dict2_word_tag[outer_key][inner_key] = dict2_word_tag[outer_key].get(inner_key, 0)
    dict2_word_tag[outer_key][inner_key] += 1

# Treat '.' as the tag that precedes the first word of the corpus.
dict2_tag_follow_tag_['.'] = dict2_tag_follow_tag_.get('.', {})
dict2_tag_follow_tag_['.'][train_li_tags[0]] = dict2_tag_follow_tag_['.'].get(train_li_tags[0], 0)
dict2_tag_follow_tag_['.'][train_li_tags[0]] += 1
last_index = num_words_train-1
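The testing loop below indexes di_transition_probs and di_emission_probs as sorted lists of (tag, probability) pairs, so the count dictionaries built above are presumably normalized and sorted in a step not included in this excerpt. A minimal sketch (normalize_and_sort is a hypothetical helper name):

def normalize_and_sort(count_dicts):
    # Turn each inner {key: count} dict into an alphabetically sorted list of (key, probability) pairs.
    result = {}
    for outer_key, inner_counts in count_dicts.items():
        total = sum(inner_counts.values())
        result[outer_key] = sorted((k, v / total) for k, v in inner_counts.items())
    return result

dict2_tag_follow_tag_ = normalize_and_sort(dict2_tag_follow_tag_)
dict2_word_tag = normalize_and_sort(dict2_word_tag)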
###Testing Phase###
from operator import itemgetter   # used below to pick the most probable tag

# te_str is the tagged test corpus ("word/TAG" tokens separated by spaces) and
# dict_word_tag_baseline maps each word to its most frequent training tag;
# both are built earlier and are not shown in this excerpt.
te_li = te_str.split()
num_words_test = len(te_li)
test_li_words = ['']
test_li_words *= num_words_test
test_li_tags = ['']
test_li_tags *= num_words_test
output_li = ['']
output_li *= num_words_test
output_li_baseline = ['']
output_li_baseline *= num_words_test
num_errors = 0
num_errors_baseline = 0

for i in range(num_words_test):
    temp_li = te_li[i].split("/")
    test_li_words[i] = temp_li[0]
    test_li_tags[i] = temp_li[1]

    # Baseline: most frequent training tag for the word.
    output_li_baseline[i] = dict_word_tag_baseline.get(temp_li[0], '')
    # If unknown word - tag = 'NNP'
    if output_li_baseline[i] == '':
        output_li_baseline[i] = 'NNP'
    if output_li_baseline[i] != test_li_tags[i]:
        num_errors_baseline += 1

    # Accounting for the 1st word in the test document for the Viterbi
    if i == 0:
        di_transition_probs = dict2_tag_follow_tag_['.']
    else:
        di_transition_probs = dict2_tag_follow_tag_[output_li[i-1]]
    di_emission_probs = dict2_word_tag.get(test_li_words[i], '')
    if di_emission_probs == '':
        # Unknown test word: fall back to 'NNP', mirroring the baseline
        # (this branch is truncated in the excerpt, so the fallback is an assumption).
        output_li[i] = 'NNP'
    else:
        max_prod_prob = 0
        counter_trans = 0
        counter_emis = 0
        prod_prob = 0
        # Both lists are sorted by tag, so they can be walked in parallel like a sorted merge.
        while counter_trans < len(di_transition_probs) and counter_emis < len(di_emission_probs):
            tag_tr = di_transition_probs[counter_trans][0]
            tag_em = di_emission_probs[counter_emis][0]
            if tag_tr < tag_em:
                counter_trans += 1
            elif tag_tr > tag_em:
                counter_emis += 1
            else:
                prod_prob = di_transition_probs[counter_trans][1] * di_emission_probs[counter_emis][1]
                if prod_prob > max_prod_prob:
                    max_prod_prob = prod_prob
                    output_li[i] = tag_tr
                    # print("i=", i, " and output=", output_li[i])
                counter_trans += 1
                counter_emis += 1
        if output_li[i] == '':
            # In case there are no matching entries between the transition tags and the
            # emission tags, we choose the most frequent emission tag.
            output_li[i] = max(di_emission_probs, key=itemgetter(1))[0]
    if output_li[i] != test_li_tags[i]:
        num_errors += 1

print("Correct tags:", test_li_tags)
Output:
Lab 4 : Sentiment Analysis using Naive Bayes Classifier
Code:
import nltk
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

# pos_tweets and neg_tweets are lists of (sentence, sentiment) pairs defined earlier (not shown in this excerpt).
Senti_tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    Senti_tweets.append((words_filtered, sentiment))
print(Senti_tweets)
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

word_features = get_word_features(get_words_in_tweets(Senti_tweets))
print(word_features)

word_features_filtered = []
for w in word_features:
    if w not in stopwords:
        word_features_filtered.append(w)
print(word_features_filtered)

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features_filtered:
        features['contains(%s)' % word] = (word in document_words)
    return features

training_set = nltk.classify.apply_features(extract_features, Senti_tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)
Output:
[(['not', 'impossible'], 'positive'), (['you', 'are', 'lovely', 'friend'], 'Positive'), (['she', 'beautiful',
'girl'], 'Positive'), (['looking', 'handsome'], 'Positive'), (['exercise', 'good', 'for', 'health'], 'Positive'),
(["today's", 'weather', 'fantastic'], 'Positive'), (['love', 'mango'], 'Positive'), (['you', 'are', 'enemy',
'friend'], 'Negative'), (['she', 'looking', 'ugly'], 'Negative'), (['looking', 'horrible'], 'Negative'),
(['sleeping', 'more', 'makes', 'you', 'lazy'], 'Negative'), (["today's", 'weather', 'very', 'bad'],
'Negative'), (['hate', 'banana'], 'Negative')]
dict_keys(['not', 'impossible', 'you', 'are', 'lovely', 'friend', 'she', 'beautiful', 'girl', 'looking',
'handsome', 'exercise', 'good', 'for', 'health', "today's", 'weather', 'fantastic', 'love', 'mango',
'enemy', 'ugly', 'horrible', 'sleeping', 'more', 'makes', 'lazy', 'very', 'bad', 'hate', 'banana'])
['impossible', 'lovely', 'friend', 'beautiful', 'girl', 'looking', 'handsome', 'exercise', 'good', 'health',
"today's", 'weather', 'fantastic', 'love', 'mango', 'enemy', 'ugly', 'horrible', 'sleeping', 'makes', 'lazy',
'bad', 'hate', 'banana']
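As a quick check (not part of the printed output above), the trained classifier can be applied to a new sentence; the example sentence below is made up:

test_sentence = "she is a lovely friend"
print(classifier.classify(extract_features(test_sentence.split())))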
import pandas as pd
df = pd.read_csv('full-corpus.csv')
df.head()
import re
import nltk
from nltk.corpus import stopwords

def preprocess(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # Convert URLs to the word URL
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)                        # Convert @username to AT_USER
    tweet = re.sub(r'[\s]+', ' ', tweet)                                # Remove additional whitespaces
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)                          # Replace #word with word
    tweet = tweet.strip('\'"')                                          # Trim surrounding quotes
    return tweet
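A quick check of the preprocessing on a made-up tweet (the example text and the expected result are illustrative):

print(preprocess("Loving the new phone! http://example.com @alice #excited"))
# -> 'loving the new phone! URL AT_USER excited'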
import re
import nltk

# Word tokenizer
def tokenize(tweet):
    return nltk.word_tokenize(tweet)

# Extract features
# (word_features is the vocabulary built from the training tweets of this dataset; that step is not shown in the excerpt.)
def extract_features(document):
    document_words = set(tokenize(document))
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
import nltk
nltk.download('punkt')

# Test the classifier (optional, if you want to test with a sample tweet)
test_tweet = "It’s easy to be brave when you’re hiding behind a keyboard. You and your Hamas friends will regret your barbaric actions very soon."
print(classifier.classify(extract_features(preprocess(test_tweet))))
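The accuracy, precision and recall values printed below are not computed in the code shown above. A minimal sketch using scikit-learn, assuming `actual` and `predictions` are lists of gold and predicted labels collected over the test split:

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
accuracy = accuracy_score(actual, predictions)
precision = precision_score(actual, predictions, average='weighted', zero_division=0)
recall = recall_score(actual, predictions, average='weighted', zero_division=0)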
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
# If you want a detailed report for each class (Positive, Negative, etc.)
print(classification_report(actual, predictions))
Output:
Negative
Lab 5 : Turney’s method of Sentiment Analysis
Code:
import math
import re
import json
import nltk
# nltk.download("averaged_perceptron_tagger", quiet=True)
def loadReviews(fileName):
    file = open(fileName)
    list_pos = []
    list_neg = []
    data = json.load(file)
    for elem in data:
        if float(elem["overall"]) >= 3.0:
            list_pos.append(elem["reviewText"])
        else:
            list_neg.append(elem["reviewText"])
    file.close()
    return list_pos, list_neg
def make_datasets(fileName):
    all_positive_reviews, all_negative_reviews = loadReviews(fileName)
    dataset = {"train": {"neg": [], "pos": []}, "test": {"neg": [], "pos": []}}
    dataset["train"]["pos"] = all_positive_reviews[:20000]
    dataset["train"]["neg"] = all_negative_reviews[:20000]
    dataset["test"]["pos"] = all_positive_reviews[-50:]
    dataset["test"]["neg"] = all_negative_reviews[-50:]
    return dataset
def find_pattern(postag):
    tag_pattern = []
    for k in range(len(postag) - 2):
        if postag[k][1] == "JJ" and (
            postag[k + 1][1] == "NN" or postag[k + 1][1] == "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
        elif (
            (postag[k][1] == "RB" or postag[k][1] == "RBR" or postag[k][1] == "RBS")
            and postag[k + 1][1] == "JJ"
            and postag[k + 2][1] != "NN"
            and postag[k + 2][1] != "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
        elif (
            postag[k][1] == "JJ"
            and postag[k + 1][1] == "JJ"
            and postag[k + 2][1] != "NN"
            and postag[k + 2][1] != "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
        elif (
            (postag[k][1] == "NN" or postag[k][1] == "NNS")
            and postag[k + 1][1] == "JJ"
            and postag[k + 2][1] != "NN"
            and postag[k + 2][1] != "NNS"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
        elif (
            postag[k][1] == "RB" or postag[k][1] == "RBR" or postag[k][1] == "RBS"
        ) and (
            postag[k + 1][1] == "VB"
            or postag[k + 1][1] == "VBD"
            or postag[k + 1][1] == "VBN"
            or postag[k + 1][1] == "VBG"
        ):
            tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
    return tag_pattern
class Turney(object):
    def __init__(self, dataset):
        self.datasets = dataset
        self.pos_phrases_hits = []
        self.neg_phrases_hits = []
        self.pos_hits = 0.01
        self.neg_hits = 0.01
        self.accuracy = 0

    def turney(self):
        tp = 0
        fp = 0
        tn = 0
        fn = 0
        for boolean, test_klass in enumerate(["pos", "neg"]):
            for i, data in enumerate(self.datasets["test"][test_klass]):
                print(
                    str(i)
                    + " out of "
                    + str(len(self.datasets["test"][test_klass]))
                    + " --> round "
                    + str(boolean)
                )
                phrases = find_pattern(nltk.pos_tag(nltk.word_tokenize(data)))
                if len(phrases) == 0:
                    continue
                self.pos_phrases_hits = [0.01] * len(phrases)
                self.neg_phrases_hits = [0.01] * len(phrases)
                self.pos_hits = 0.01
                self.neg_hits = 0.01
                # (The hit counting, semantic-orientation scoring and accuracy
                #  computation continue here but are not included in this excerpt.)

if __name__ == "__main__":
    FILE_PATH = "/content/Cell_Phones_and_Accessories_5.json"
    datasets = make_datasets(FILE_PATH)
    turney = Turney(datasets)
    turney.turney()
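The turney() method above stops after collecting candidate phrases; the semantic-orientation step of Turney's method (comparing each phrase's hits near a positive and a negative reference word) is not included in the excerpt. A minimal sketch of that scoring, assuming per-phrase hit counts have been gathered into the *_phrases_hits lists:

def semantic_orientation(pos_phrase_hits, neg_phrase_hits, pos_hits, neg_hits):
    # SO(phrase) = log2( hits(phrase NEAR "excellent") * hits("poor")
    #                    / (hits(phrase NEAR "poor") * hits("excellent")) )
    return math.log2((pos_phrase_hits * neg_hits) / (neg_phrase_hits * pos_hits))

# A review is typically labelled positive when the average SO of its phrases is >= 0.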
Output:
Lab 6 : Text Summarization
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        # Lower-case, stem and drop stopwords before counting word frequencies.
        for word in word_tokenize(sent):
            word = ps.stem(word.lower())
            if word in stopWords:
                continue
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
        frequency_matrix[sent[:15]] = freq_table
    return frequency_matrix
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}
    for sent, f_table in freq_matrix.items():
        tf_table = {}
        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence
        tf_matrix[sent] = tf_table
    return tf_matrix
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}
    for sent, f_table in freq_matrix.items():
        for word in f_table.keys():
            word_per_doc_table[word] = word_per_doc_table.get(word, 0) + 1
    return word_per_doc_table
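The IDF step of the pipeline is not included in the excerpt (only its last two lines survived in the original fragment); a minimal sketch of its usual form:

def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}
    for sent, f_table in freq_matrix.items():
        idf_table = {}
        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))
        idf_matrix[sent] = idf_table
    return idf_matrix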
def _score_sentences(tf_idf_matrix):
    sentenceValue = {}
    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0
        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score
        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence
    return sentenceValue
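Two more helpers are referenced by the pipeline but missing from the excerpt: the combination of TF and IDF into one matrix, and the average-score threshold. A minimal sketch of both:

def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}
    for (sent, tf_table), (_, idf_table) in zip(tf_matrix.items(), idf_matrix.items()):
        tf_idf_matrix[sent] = {word: tf_table[word] * idf_table[word] for word in tf_table}
    return tf_idf_matrix

def _find_average_score(sentenceValue):
    # Average sentence score, used as the summary threshold.
    average = sum(sentenceValue.values()) / len(sentenceValue)
    return average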
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''
    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= threshold:
            summary += " " + sentence
            sentence_count += 1
    return summary
# 1 Sentence Tokenization
# (text holds the article to be summarized; it is loaded earlier and not shown in this excerpt.)
sentences = sent_tokenize(text)
total_documents = len(sentences)
print(total_documents)
41
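Only step 1 of the pipeline is shown above; a minimal sketch of how the remaining steps would typically be chained using the helpers above (including the sketched ones), with the 1.3 threshold factor as an assumption:

freq_matrix = _create_frequency_matrix(sentences)
tf_matrix = _create_tf_matrix(freq_matrix)
count_doc_per_words = _create_documents_per_words(freq_matrix)
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
sentence_scores = _score_sentences(tf_idf_matrix)
threshold = _find_average_score(sentence_scores)
summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
print(summary)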
Output:
Israel has published photos of dead infants after the terror attack. But the reports are still
unconfirmed, and in some cases have been retracted. Follow-ups that retract or add context are
less likely to be repeated or reach the same audience. It may turn out that the slaughter was done
in a particularly barbaric way. The video has been viewed more than 11 million times on X,
according to its view counter. Baby stories are very emotive.
Lab 7 : Multi document text summarization
articles = []
import pandas as pd
import numpy as np
articles.append({
"Title": "Article1",
"Content":""" """})
articles.append({
"Title": "Article2",
"Content":""" """})
articles.append({
"Title": "Article3",
"Content":""" """})
articles.append({
"Title": "Article4",
"Content":""" """})
articles.append({
"Title": "Article5",
"Content":""" """})
articles.append({
"Title": "Article6",
"Content":""" """})
articles.append({
"Title": "Article7",
"Content":""" """})
df = pd.DataFrame(articles)
print(df)
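The loop below uses tfidf_vectorizer, tfidf_matrix and linear_kernel, none of which are defined in the excerpt; a minimal sketch of how they are presumably set up with scikit-learn:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Content'])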
# Extract the most representative sentence from each article (as we did before)
summaries = []
for idx, article in enumerate(df['Content']):
    sentences = article.split('. ')
    sentence_vectors = tfidf_vectorizer.transform(sentences)
    cosine_similarities_sentences = linear_kernel(tfidf_matrix[idx:idx+1], sentence_vectors).flatten()
    top_sentence_idx = cosine_similarities_sentences.argsort()[-1]
    top_sentence = sentences[top_sentence_idx]
    summaries.append(top_sentence)

# Create a merged summary by concatenating the most representative sentences from each article
merged_summary = '. '.join(summaries)
print("Merged Summary:")
print(merged_summary)
Output:
Title Content
0 Article1 Unverified reports of ‘40 babies beheaded’ in ...
1 Article2 The 'horrendous toll' on children caught in th...
2 Article3 What we actually know about the viral report o...
3 Article4 Israel releases horrific images of slain child...
4 Article5 ‘I would see and have confirmed pictures of te...
5 Article6 At least 40 babies killed, beheaded in Israeli...
6 Article7 ‘I would see and have confirmed pictures of te…
Title Summary
0 Article1 In another tweet, Zedeck wrote that soldiers t...
1 Article2 "Depriving children of access to food and esse...
2 Article3 What happened in Kibbutz Kfar Aza is a massacr...
3 Article4 Secretary of State Antony Blinken and NATO def...
4 Article5 I never thought I’d ever — anyway.” He did not...
5 Article6 At least 40 babies killed, beheaded in Israeli...
6 Article7 I never thought I’d ever — anyway.” He did not…
Merged Summary:
In another tweet, Zedeck wrote that soldiers told her they believe “40 babies/children were
killed.”
“Somehow those two bits of information were connected, the story became ‘40 babies were
beheaded,’ and in the British press today, about six or seven newspapers had it on their front
pages,” Jones said.
An IDF spokesperson, Doron Spielman, told NBC News on Tuesday that he could not confirm
i24NEWS’s report.
Yossi Landau, the head of operations for the southern region of Zaka, Israel’s volunteer civilian
emergency response organization, told CBS News that he saw the bodies of beheaded children
and babies, parents and children who had been tortured and had their hands bound, and “a lot
more that cannot be described for now, because it’s very hard to describe.”
By Wednesday, the claims, though still contentious, were going viral online — being used as
evidence of Hamas’ depravity. "Depriving children of access to food and essential services puts
their lives at risk, as do attacks on civilian areas and infrastructure."
According to UNICEF, 80% of those who live in the Gaza Strip rely on some form of
humanitarian assistance.
The conflict has led to "grave humanitarian consequences," Lynn Hastings, a humanitarian
coordinator for the Gaza Strip for the United Nations Office for the Coordination of
Humanitarian Affairs, while calling for all parties to abide by international humanitarian law.
"Civilians, especially children, medical facilities, humanitarian personnel health workers, and
journalists must be protected," Hastings said in a statement Tuesday. What happened in Kibbutz
Kfar Aza is a massacre in which women, children and toddlers and elderly were brutally
butchered in an ISIS way of action."
What happened at the kibbutz?
The Kfar Aza kibbutz is one of several self-contained Israeli settlements close to the Gaza
border.
It is located between Netivot and Sderot - around three miles from the border in southern Israel.
Because of its proximity to Gaza and the unprecedented nature of last weekend's incursion,
which saw Hamas militants breach the usually heavily guarded border on foot - it was one of the
first sites they reached on Saturday.
Four days later, journalists got to see the destruction left behind.
Ramsay said the scene "can only be described as a massacre".
"The stories here are shocking - families being woken without warning to voices outside their
houses, mums and dads hiding their children in cupboards, wine cellars and basements, husbands
and wives becoming separated in the fight," he said.
He added it took 17 hours for help to arrive, as the IDF focused on urban areas first - leaving
residents defenceless and numbers of dead high.
Why are there reports of 'babies being beheaded'?
Claims Hamas fighters beheaded babies have only been reported by one journalist - Nicole
Zedek from i24 - and have not been verified by Sky News.
Ms Zedek was among the reporters invited to see what was left at the kibbutz on Tuesday.
In one live broadcast, which has since been viewed millions of times on X, formerly known as
Twitter, she says: "Talking to some of the soldiers here, they say what they witnessed as they've
been walking through these communities is bodies of babies with their heads cut off and families
gunned down in their beds.
"We can see some of these soldiers right now, comforting each other."
She is also filmed speaking to the deputy commander of the IDF's unit 71, David Ben Zion, who
describes Hamas fighters as "aggressive" and "very bad".
He says: "They cut off heads… of children, of women."
And in another live broadcast, Zedek describes "40 babies at least were taken out on gurneys" -
which is where the widely shared 40 figure comes from.
On Wednesday, Heinrich quoted CNN that infants and young children had been discovered with
"decapitated" bodies in the community of Kfar Aza.
An Israeli Defense Forces spokesperson also told The Intercept that they could not confirm it
officially, but they believed the report.
Yossi Landau, a representative from Israel's volunteer civilian emergency response organization,
Zaka, shared with CBS News that he witnessed the gruesome sight of children and infants who
had been decapitated.
“I saw a lot more that cannot be described for now, because it’s very hard to describe,” he said.
Nicole Zedek, a television reporter for i24NEWS based in Tel Aviv, was the initial source to
report the allegations of child beheadings on Tuesday
Lab 8 : Term Incidence Matrix
Code:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
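The construction of the term incidence matrix itself is not shown in the excerpt; a minimal sketch, assuming the seven article texts are collected in a hypothetical list named docs. (The terms in the output keep their punctuation attached, which suggests simple whitespace splitting rather than word_tokenize.)

docs = ["..."] * 7   # hypothetical placeholders for the seven article texts
tokenized = [d.split() for d in docs]
terms = sorted(set(w for tokens in tokenized for w in tokens))
incidence = {term: [1 if term in tokens else 0 for tokens in tokenized] for term in terms}
for term, row in incidence.items():
    print(term + ":", row)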
Output:
Tokens in Doc_1: ['Unverified', 'reports', 'of', '‘', '40', 'babies', 'beheaded', '’', 'in', 'Israel-Hamas', 'war', 'inflame', 'social', 'media', 'No', 'photo', 'evidence', 'had', 'been', 'made', 'public', 'as', 'of', '.', ......']
Tokens in Doc_2: ['The', "'horrendous", 'toll', "'", 'on', 'children', 'caught', 'in', 'the', 'Israel-Gaza',
'conflict', 'Hundreds', 'of', 'children', 'have', 'been', 'killed', 'so', 'far', ',', 'with', 'the', 'true', 'total',
'Russell', 'said', '.......']
Tokens in Doc_3: ['What', 'we', 'actually', 'know', 'about', 'the', 'viral', 'report', 'of', 'beheaded',
'babies', 'in', 'Israel', 'One', 'journalist', 'from', 'the', 'Tel', 'Aviv-based', 'news', 'channel', 'i24', 'said',
'war', '........']
Tokens in Doc_4: ['Israel', 'releases', 'horrific', 'images', 'of', 'slain', 'children', 'after', 'Hamas',
'attack', 'JERUSALEM', '/', 'TEL', 'AVIV', '/', 'BRUSSELS', '-', 'CONTENT', 'WARNING', ':',
'right', 'things', '.', ……’]
Tokens in Doc_5: ['‘', 'I', 'would', 'see', 'and', 'have', 'confirmed', 'pictures', 'of', 'terrorists',
'beheading', 'children', ',', '’', 'Joe', 'Biden', 'decries', 'Hamas', 'atrocity', 'in', 'Israel', 'Reports',
'attack', '........']
Tokens in Doc_6: ['At', 'least', '40', 'babies', 'killed', ',', 'beheaded', 'in', 'Israeli', 'kibbutz', 'outside',
'Gaza', 'Strip', ',', 'reports', 'say', 'KFAR', 'AZA', ',', 'Israel', '(', 'TND', ')', '—', 'Dozens', 'of',
'Monday', …..'.']
Tokens in Doc_7: ['‘', 'I', 'would', 'see', 'and', 'have', 'confirmed', 'pictures', 'of', 'terrorists', 'beheading', 'children', ',', '’', 'Joe', 'Biden', 'decries', 'Hamas', 'atrocity', 'in', 'Israel', 'Reports', 'the', 'attack', '…….']
contentious,: [1, 0, 0, 0, 0, 0, 0]
personnel: [0, 1, 0, 0, 0, 0, 0]
photo: [1, 0, 0, 1, 0, 0, 0]
expressed: [0, 0, 0, 0, 1, 0, 1]
body: [0, 0, 1, 1, 0, 0, 0]
killing,: [0, 1, 0, 0, 0, 0, 0]
particularly: [1, 0, 0, 1, 0, 0, 0]
spoke: [1, 1, 1, 0, 1, 0, 1]
Coordination: [0, 1, 0, 0, 0, 0, 0]
"unconfirmed".: [0, 0, 1, 0, 0, 0, 0]
caught: [0, 1, 0, 0, 0, 0, 0]
I'm: [0, 1, 0, 0, 0, 1, 0]
reposts.: [1, 0, 0, 0, 0, 0, 0]
"Please: [0, 0, 1, 0, 0, 0, 0]
still: [1, 1, 0, 0, 0, 1, 0]
thousands: [0, 0, 0, 1, 0, 0, 0]
targeted: [0, 0, 0, 1, 0, 0, 0]
linking: [0, 0, 1, 0, 0, 0, 0]
report.: [1, 0, 0, 0, 1, 0, 1]
babies”: [1, 0, 0, 0, 0, 0, 0]
Unverified: [1, 0, 0, 0, 0, 0, 0]
called: [0, 1, 0, 0, 0, 0, 0]
comparing: [0, 0, 0, 1, 0, 0, 0]
"To: [0, 0, 1, 0, 0, 0, 0]
bitter: [1, 0, 0, 0, 0, 0, 0]
260: [0, 1, 0, 0, 0, 0, 0]
small: [0, 0, 1, 0, 0, 1, 0]
Kibbutz: [0, 0, 1, 0, 0, 1, 0]
retract: [1, 0, 0, 0, 0, 0, 0]
decapitated: [1, 0, 0, 0, 1, 0, 1]
another: [1, 1, 1, 0, 1, 0, 1]
war: [1, 1, 1, 0, 0, 1, 0]
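The Boolean retrieval below operates on v1 and v2, which are not defined in the shown code; judging from the matrix above, they match the incidence rows for 'caught' and 'war'. A minimal sketch (the query terms are an assumption):

import numpy as np
v1 = np.array([0, 1, 0, 0, 0, 0, 0])   # e.g. the incidence row for 'caught'
v2 = np.array([1, 1, 1, 0, 0, 1, 0])   # e.g. the incidence row for 'war'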
print(v1)
print(v2)
print('---------')
v3 = v1 & v2
print(v3)
[0 1 0 0 0 0 0]
[1 1 1 0 0 1 0]
---------
[0 1 0 0 0 0 0]
print(v1)
print(v2)
print('---------')
v = v1 | v2
print(v)
[0 1 0 0 0 0 0]
[1 1 1 0 0 1 0]
---------
[1 1 1 0 0 1 0]