You are on page 1 of 3

In [68]: #TASK 1

#import all required libraray import pandas as pd import math import numpy
as np from scipy import sparse from scipy.stats import uniform from
sklearn.feature_extraction.text import TfidfVectorizer

# input data string corpus = ['this is the first document',


'this document is the second document', 'and this is the third one', 'is
this the first document'] # use fit method to compute Bag of words

vectorizer = TfidfVectorizer() vectorizer.fit(corpus) skl_output =


vectorizer.transform(corpus) bow=vectorizer.get_feature_names() print(bow)
IDF_reference=vectorizer.idf_ print(IDF_reference) #compute IDF using
custom method

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer() vectors = vectorizer.fit_transform(corpus)

matrix = CountVectorizer() matrix.fit(corpus) # after this statement the


matrix will build the vocabulary with all th e unique words

Create PDF in your applications with the Pdfcrowd HTML to PDF API PDFCROWD
# you should call this function only after fit()

# to convert the sentance into numerical vectors, we will call transfor


m() # the first feature name will corresponds to first column in
transforme d matrix # the 2nd feature name will corresponds to 2nd column
in transformed ma trix

print(matrix.transform(corpus).toarray())

# Here we will print the sklearn tfidf vectorizer idf values after appl
ying the fit method # After using the fit function on the corpus the vocab
has 9 words in i t, and each has its idf value.

#compute IDF using custom method

for i in range(len(bow)):
Y=0 word=bow[i] for j in range(len(corpus)):
list[j]=corpus[j].split()

if(word in list[j]): #print(word) #print(list[j]) Y=Y+1 X=len(corpus)


XY=math.log((1+X)/(1+Y)) IDF_custom=XY+1 print(IDF_custom)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
1.916290731874155
1.2231435513142097
1.5108256237659907
1.0
1.916290731874155
1.916290731874155
1.0
1.916290731874155
1.0

In [15]: #TASK2
import pickle import numpy as np with
open("E:\Applied_AI\Assignments\cleaned_strings","rb") as f:
data = pickle.load(f) # printing the length of the corpus loaded
print("Number of documents in data = ",len(data))

#call all usique words using fit and tranform function from
sklearn.feature_extraction.text import TfidfVectorizer vectorizer =
TfidfVectorizer() vectorizer.fit(data) skl_output =
vectorizer.transform(data) bow=vectorizer.get_feature_names()

#compute IDF IDF=vectorizer.idf_

#sort IDF in descending order sorted_IDF=np.sort(IDF)


required_IDF=sorted_IDF[::-1]

#print top 50 IDF values

Create PDF in your applications with the Pdfcrowd HTML to PDF API PDFCROWD
print(required_IDF[0:49])

Number of documents in data =  746
[6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918
 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918
 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918
 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918
 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918
 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918
 6.922918]

Create PDF in your applications with the Pdfcrowd HTML to PDF API PDFCROWD

You might also like