You are on page 1of 18

M.

Sri Phani Bhushan

AP19110010110

In [34]: 

import nltk

Splitting the text file into 1400 different text files

In [58]: 

i=1
punctuations='''()-[]{};:\/,.<>@$''^+1234567890*%&=?'''

with open('cran/cran.all.1400','r') as f:
for line in f:
if line[:2]=='.I':
file=open(f'Dataset assignment-1\{line[3:len(line)-1]}.txt','x')
elif line[:2] in ['.T','.B','.A','.W']:
continue

elif line=='':
file.close()
else:
no_punct_line=""
for char in line:
if(char not in punctuations):
no_punct_line+=char

file.write(no_punct_line)

In [59]: 

file=open(f'Dataset assignment-1\1400.txt','w')
file.close()

Tokenizing and stemming the documents


In [114]: 

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
tokens = {}

for i in range(1, 1401):


file = open(f'Dataset assignment-1\{i}.txt')
for line in file:
for word in line.split():
w = str(ps.stem(word))
if w in tokens:
tokens[w][0] = len(tokens[w][1])
tokens[w][1].add(i)
else:
tokens[w] = [1,set()]
tokens[w][1].add(i)
# tokens[w] = 1
file.close()

In [115]: 

print(tokens.keys())

dict_keys(['experiment', 'investig', 'of', 'the', 'aerodynam', 'a', 'win


g', 'in', 'slipstream', 'brenckmanm', 'j', 'ae', 'sc', 'an', 'studi', 'pro
pel', 'wa', 'made', 'order', 'to', 'determin', 'spanwis', 'distribut', 'li
ft', 'increas', 'due', 'at', 'differ', 'angl', 'attack', 'and', 'free', 's
tream', 'veloc', 'ratio', 'result', 'were', 'intend', 'part', 'as', 'eval
u', 'basi', 'for', 'theoret', 'treatment', 'thi', 'problem', 'compar', 'sp
an', 'load', 'curv', 'togeth', 'with', 'support', 'evid', 'show', 'that',
'substanti', 'increment', 'produc', 'by', 'destal', 'or', 'boundarylayerco
ntrol', 'effect', 'integr', 'remain', 'after', 'subtract', 'found', 'agr
e', 'well', 'potenti', 'flow', 'theori', 'empir', 'specif', 'configur', 'e
xperi', 'simpl', 'shear', 'past', 'flat', 'plate', 'incompress', 'fluid',
'small', 'viscos', 'tingyili', 'depart', 'aeronaut', 'engin', 'renssela',
'polytechn', 'institut', 'troy', 'ny', 'highspe', 'viscou', 'twodimensio
n', 'bodi', 'it', 'is', 'usual', 'necessari', 'consid', 'shock', 'wave',
'emit', 'from', 'nose', 'lead', 'edg', 'consequ', 'there', 'exist', 'invis
cid', 'rotat', 'region', 'between', 'boundari', 'layer', 'such', 'situat',
'aris', 'instanc', 'hyperson', 'somewhat', 'prandtl', 'classic', 'boundary
lay', 'origin', 'outsid', 'irrot', 'while', 'must', 'be', 'possibl', 'vort
ic', 'have', 'been', 'recent', 'discuss', 'ferri', 'libbi', 'present', 'pa

In [116]: 

len(tokens)

Out[116]:

7480
In [488]: 

#Sorting the tokens

sorted_tokens=dict(sorted(tokens.items(),key=lambda x:x[1][0],reverse=True))
print(sorted_tokens)

{'of': [1395, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 3
6, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 7
3, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 10
8, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 1
23, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137,
138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,
153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182,
183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242,
243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257,
258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287,
288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302,

In [118]: 

print(sorted_tokens.keys())
'environment', 'page', 'philosophi', 'cut', 'socal', 'paramount', 'countr
i', 'drastic', 'lester', 'classif', 'log', 'jmathphi', 'hypergeometr', 'ow
enpr', 'insert', 'grid', 'grade', 'typifi', 'neighbourhood', 'signal', 'fi
lter', 'db', 'lubric', 'mathieu', 'stratiform', 'apprais', 'elabor', 'wort
h', 'quadrupol', 'shapiro', 'telemet', 'track', 'interplanetari', 'refract
ori', 'aeroquart', 'ward', 'biconvex', 'xy', 'gaussian', 'immers', 'millik
ancb', 'belong', 'lowturbul', 'unfortun', 'ship', 'hodograph', 'zeroord',
'miss', 'mar', 'overshoot', 'brake', 'walker', 'dispers', 'inert', 'cell',
'lowdens', 'photographi', 'annulu', 'holderdw', 'cravenah', 'entrain', 'de
duct', 'utia', 'maxwellian', 'molyneuxwg', 'tnstruct', 'compressibleflow',
'monatom', 'liter', 'unusu', 'yashuram', 'jphyssoc', 'lengthwis', 'aj', 'p
ercentthick', 'idealga', 'rankin', 'seal', 'lowfrequ', 'omit', 'british',
'constrain', 'screen', 'stand', 'sideforc', 'manufactur', 'stratfordb', 'c
oncurr', 'bubbl', 'singlestag', 'stator', 'powel', 'astronaut', 'ensur',
'smoke', 'quasicylindr', 'quasi', 'bank', 'angularli', 'weapon', 'tangen
c', 'tangentcon', 'quasicylind', 'referenc', 'code', 'kettledj', 'strut',
'gooderumpb', 'woodgp', 'visualis', 'visibl', 'royal', 'deposit', 'oil',
'compact', 'white', 'hard', 'eventu', 'said', 'weberj', 'squir', 'sweptw',
'lilleygm', 'civil', 'poiseuil', 'nuclear', 'jappphi', 'eckhausw', 'con',
'hour', 'jd', 'polish', 'stagnationtowal', 'mission', 'evapor', 'entail',
In [119]: 

# creating the dataframe


import pandas as pd
words = list(sorted_tokens.keys())
temp = list(sorted_tokens.values())
count, freq = [], []
for i in range(len(temp)):
count.append(len(temp[i][1]))
freq.append(list(temp[i][1]))

In [120]: 

dict_tokens = {'Tokens': words, 'DOC Frequency':count, 'Document ID': freq}


df = pd.DataFrame.from_dict(dict_tokens)
df.head()

Out[120]:

Tokens DOC Frequency Document ID

0 of 1395 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...

1 the 1391 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...

2 and 1342 [1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...

3 a 1307 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...

4 to 1252 [1, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1...

In [121]: 

#stopwords removal

dict_tokens = {'Tokens': words, 'DOC Frequency':count, 'DocumentID': freq}


df = pd.DataFrame.from_dict(dict_tokens).set_index("Tokens")
stopwords = df.head(30)
df.drop(index=df.index[:30], axis=0, inplace=True)
stopwords.to_csv('stopwords.csv')

In [122]: 

# printing tokens

df=df.sort_values('Tokens')
df.to_csv('index.csv')
In [123]: 

# printing tokens
index_df=pd.read_csv("index.csv")
index_df.head()

Out[123]:

Tokens DOC Frequency DocumentID

0 aaaero 1 [1111]

1 aaaeroconf 1 [899]

2 aasu 1 [722]

3 ab 3 [744, 924, 1381]

4 abbott 1 [1340]

In [124]: 

# Stopwords csv

stopwords_df=pd.read_csv("stopwords.csv")
stopwords_df.head(15)

Out[124]:

Tokens DOC Frequency DocumentID

0 of 1395 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...

1 the 1391 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...

2 and 1342 [1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...

3 a 1307 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...

4 to 1252 [1, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1...

5 in 1241 [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15...

6 is 1151 [2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...

7 for 1145 [1, 2, 3, 4, 5, 6, 9, 10, 13, 14, 15, 16, 17, ...

8 are 1029 [3, 4, 5, 6, 7, 11, 12, 14, 15, 17, 18, 19, 20...

9 with 1010 [1, 3, 4, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17,...

10 on 913 [7, 8, 9, 11, 13, 14, 15, 17, 18, 19, 21, 22, ...

11 by 854 [1, 2, 4, 6, 7, 9, 13, 14, 15, 16, 17, 20, 21,...

12 that 805 [1, 2, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 18,...

13 an 796 [1, 2, 8, 9, 10, 11, 14, 15, 16, 17, 19, 21, 2...

14 at 771 [1, 5, 6, 7, 8, 9, 10, 11, 13, 14, 18, 19, 24,...

Query processing
In [125]: 

i=1
punctuations='''()-[]{};:\/,.<>@$^*%&'''

with open('cran/cran.qry','r') as f:
for line in f:
if line[:2]=='.I':
file=open(f'Cran Query\{i}.txt','x')
i+=1
elif line[:2] in ['.T','.B','.A','.W']:
continue

elif line=='':
file.close()
else:
no_punct_line=""
for char in line:
if(char not in punctuations):
no_punct_line+=char

file.write(no_punct_line)

...

In [126]: 

file=open(f'Cran Query\225.txt','w')
file.close()
In [505]: 

stopwords_list

Out[505]:

['of',

'the',

'and',

'a',

'to',

'in',

'is',

'for',

'are',

'with',

'on',

'by',

'that',

'an',

'at',

'be',

'flow',

'result',

'thi',

'as',

'from',

'it',

'which',

'number',

'effect',

'pressur',

'use',

'present',

'j',

'obtain']

Query execution-AND
In [506]: 

output = dict()
for i in range(1,226):
with open(f'Cran Query\{i}.txt', 'r') as f:
mat = []
for line in f:
for word in line.split():
if word in sorted_tokens:
mat.append(sorted_tokens[word][1])
k = mat[0]
for i in range(1, len(mat)):
k = k.intersection(mat[i])
if len(k)!=0:
output[i]=list(k)
output

Out[506]:

{9: [329, 142, 1263, 625, 1107, 1300, 1204, 983, 666, 1307, 1213],

3: [1040, 185, 1250, 486],

4: [315, 1323, 131],

1: [2,

3,

4,

1029,

8,

9,

522,

13,

525,

15,

527,

1042,

21,

22,

In [531]: 

# QUERY-OR
output = dict()
for i in range(1,226):
with open(f'Cran Query\{i}.txt', 'r') as f:
mat = []
for line in f:
for word in line.split():
if word in sorted_tokens:
mat.append(sorted_tokens[word][1])
k = mat[0]
for i in range(1, len(mat)):
k = k.union(mat[i])
if len(k)!=0:
output[i]=list(k)
output

Out[531]:

{7: [1,

2,

3,

4,

5,

6,

7,

8,

9,

10,

11,

12,

13,

14,

15,

16,

17,

Query-1
In [507]: 

# Creating a dataframe for the documents of the query-9


query_df=pd.DataFrame(output[1],columns=["docid"])
query_df["qid"]=1
query_df

Out[507]:

docid qid

0 2 1

1 3 1

2 4 1

3 1029 1

4 8 1

... ... ...

211 493 1

212 1008 1

213 500 1

214 1013 1

215 1014 1

216 rows × 2 columns


In [508]: 

# creating a dataframe for the query and relevance scores

import pandas as pd
re_pd=pd.read_csv('cran/cranqrel.csv',names=["qid","docid","rel"])
re_pd

Out[508]:

qid docid rel

0 1 184 2

1 1 29 2

2 1 31 2

3 1 12 3

4 1 51 3

... ... ... ...

1832 225 1062 3

1833 225 1074 3

1834 225 1075 3

1835 225 1213 3

1836 225 1188 -1

1837 rows × 3 columns

In [509]: 

# merging the dataframe for the relevant retives docs

final_df=pd.merge(re_pd, query_df, on=["qid","docid"], how='inner')


final_df=final_df[final_df.rel<3]

final_df

Out[509]:

qid docid rel

0 1 29 2

1 1 31 2

5 1 486 -1

In [510]: 

# defining values for calculating precision recall

relevant_retrived=len(final_df)
total_relevant=len(re_pd[re_pd.qid == 1])
total_retrived=len(query_df)
In [511]: 

print(relevant_retrived)
print(total_relevant)
print(total_retrived)

29

216

In [512]: 

def precision(a,b):
return a/b

def recall(c,d):
return c/d

In [513]: 

precision_9=precision(relevant_retrived,total_retrived)
recall_9=recall(relevant_retrived,total_relevant)

measures_dict={}
measures_dict["Query1"]=[precision_9,recall_9]

In [514]: 

measures_dict

Out[514]:

{'Query1': [0.013888888888888888, 0.10344827586206896]}

QID-3
In [515]: 

def score(a):
query_df=pd.DataFrame(output[a],columns=["docid"])
query_df["qid"]=a
final_df=pd.merge(re_pd, query_df, on=["qid","docid"], how='inner')
final_df=final_df[final_df.rel<3]
relevant_retrived=len(final_df)
print(final_df)
total_relevant=len(re_pd[re_pd.qid == a])
total_retrived=len(query_df)
precision_3=precision(relevant_retrived,total_retrived)
recall_3=recall(relevant_retrived,total_relevant)
measures_dict["Query"+str(a)]=[precision_3,recall_3]
print("Total relevant documents are {}".format(total_relevant))
print("Total relevant retrived documents are {}".format(relevant_retrived))
print("Total retrived documents are {}".format(total_retrived))
return measures_dict

In [516]: 

score(3)

Empty DataFrame

Columns: [qid, docid, rel]

Index: []

Total relevant documents are 9

Total relevant retrived documents are 0

Total retrived documents are 4

Out[516]:

{'Query1': [0.013888888888888888, 0.10344827586206896], 'Query3': [0.0, 0.


0]}

In [517]: 

measures_dict

Out[517]:

{'Query1': [0.013888888888888888, 0.10344827586206896], 'Query3': [0.0, 0.


0]}

QID-15
In [518]: 

score(15)

qid docid rel

0 15 463 1

2 15 497 -1

Total relevant documents are 3

Total relevant retrived documents are 2

Total retrived documents are 1395

Out[518]:

{'Query1': [0.013888888888888888, 0.10344827586206896],

'Query3': [0.0, 0.0],

'Query15': [0.0014336917562724014, 0.6666666666666666]}

In [519]: 

measures_dict

Out[519]:

{'Query1': [0.013888888888888888, 0.10344827586206896],

'Query3': [0.0, 0.0],

'Query15': [0.0014336917562724014, 0.6666666666666666]}

QID-71
In [520]: 

score(71)

qid docid rel

0 71 569 1

1 71 571 1

2 71 1355 2

6 71 572 1

Total relevant documents are 9

Total relevant retrived documents are 4

Total retrived documents are 913

Out[520]:

{'Query1': [0.013888888888888888, 0.10344827586206896],

'Query3': [0.0, 0.0],

'Query15': [0.0014336917562724014, 0.6666666666666666],

'Query71': [0.004381161007667032, 0.4444444444444444]}


In [521]: 

measures_dict

Out[521]:

{'Query1': [0.013888888888888888, 0.10344827586206896],

'Query3': [0.0, 0.0],

'Query15': [0.0014336917562724014, 0.6666666666666666],

'Query71': [0.004381161007667032, 0.4444444444444444]}

QID-2
In [522]: 

score(2)

Empty DataFrame

Columns: [qid, docid, rel]

Index: []

Total relevant documents are 25

Total relevant retrived documents are 0

Total retrived documents are 99

Out[522]:

{'Query1': [0.013888888888888888, 0.10344827586206896],

'Query3': [0.0, 0.0],

'Query15': [0.0014336917562724014, 0.6666666666666666],

'Query71': [0.004381161007667032, 0.4444444444444444],

'Query2': [0.0, 0.0]}

In [523]: 

measures_dict

Out[523]:

{'Query1': [0.013888888888888888, 0.10344827586206896],

'Query3': [0.0, 0.0],

'Query15': [0.0014336917562724014, 0.6666666666666666],

'Query71': [0.004381161007667032, 0.4444444444444444],

'Query2': [0.0, 0.0]}

QID-109
In [524]: 

score(109)

qid docid rel

0 109 860 1

1 109 861 1

5 109 766 -1

Total relevant documents are 6

Total relevant retrived documents are 3

Total retrived documents are 1252

Out[524]:

{'Query1': [0.013888888888888888, 0.10344827586206896],

'Query3': [0.0, 0.0],

'Query15': [0.0014336917562724014, 0.6666666666666666],

'Query71': [0.004381161007667032, 0.4444444444444444],

'Query2': [0.0, 0.0],

'Query109': [0.0023961661341853034, 0.5]}

In [525]: 

measures_dict

Out[525]:

{'Query1': [0.013888888888888888, 0.10344827586206896],

'Query3': [0.0, 0.0],

'Query15': [0.0014336917562724014, 0.6666666666666666],

'Query71': [0.004381161007667032, 0.4444444444444444],

'Query2': [0.0, 0.0],

'Query109': [0.0023961661341853034, 0.5]}

QID-6
In [526]: 

score(6)

Empty DataFrame

Columns: [qid, docid, rel]

Index: []

Total relevant documents are 5

Total relevant retrived documents are 0

Total retrived documents are 10

Out[526]:

{'Query1': [0.013888888888888888, 0.10344827586206896],

'Query3': [0.0, 0.0],

'Query15': [0.0014336917562724014, 0.6666666666666666],

'Query71': [0.004381161007667032, 0.4444444444444444],

'Query2': [0.0, 0.0],

'Query109': [0.0023961661341853034, 0.5],

'Query6': [0.0, 0.0]}


QID-192
In [527]: 

score(192)

qid docid rel

0 192 733 1

1 192 734 1

2 192 735 1

3 192 736 1

4 192 641 -1

Total relevant documents are 5

Total relevant retrived documents are 5

Total retrived documents are 1010

Out[527]:

{'Query1': [0.013888888888888888, 0.10344827586206896],

'Query3': [0.0, 0.0],

'Query15': [0.0014336917562724014, 0.6666666666666666],

'Query71': [0.004381161007667032, 0.4444444444444444],

'Query2': [0.0, 0.0],

'Query109': [0.0023961661341853034, 0.5],

'Query6': [0.0, 0.0],

'Query192': [0.0049504950495049506, 1.0]}

QID-204
In [528]: 

score(204)

Empty DataFrame

Columns: [qid, docid, rel]

Index: []

Total relevant documents are 15

Total relevant retrived documents are 0

Total retrived documents are 27

Out[528]:

{'Query1': [0.013888888888888888, 0.10344827586206896],

'Query3': [0.0, 0.0],

'Query15': [0.0014336917562724014, 0.6666666666666666],

'Query71': [0.004381161007667032, 0.4444444444444444],

'Query2': [0.0, 0.0],

'Query109': [0.0023961661341853034, 0.5],

'Query6': [0.0, 0.0],

'Query192': [0.0049504950495049506, 1.0],

'Query204': [0.0, 0.0]}


In [529]: 

# creating a dataframe with precision and recall values

values_df=pd.DataFrame.from_dict(measures_dict, orient='index',columns=["Precision","Recall

In [530]: 

values_df

Out[530]:

Precision Recall

Query1 0.013889 0.103448

Query3 0.000000 0.000000

Query15 0.001434 0.666667

Query71 0.004381 0.444444

Query2 0.000000 0.000000

Query109 0.002396 0.500000

Query6 0.000000 0.000000

Query192 0.004950 1.000000

Query204 0.000000 0.000000

In [ ]: 

In [ ]: 

You might also like