Professional Documents
Culture Documents
AP19110010110
In [34]:
import nltk
In [58]:
i=1
punctuations='''()-[]{};:\/,.<>@$''^+1234567890*%&=?'''
with open('cran/cran.all.1400','r') as f:
for line in f:
if line[:2]=='.I':
file=open(f'Dataset assignment-1\{line[3:len(line)-1]}.txt','x')
elif line[:2] in ['.T','.B','.A','.W']:
continue
elif line=='':
file.close()
else:
no_punct_line=""
for char in line:
if(char not in punctuations):
no_punct_line+=char
file.write(no_punct_line)
In [59]:
file=open(f'Dataset assignment-1\1400.txt','w')
file.close()
ps = PorterStemmer()
tokens = {}
In [115]:
print(tokens.keys())
In [116]:
len(tokens)
Out[116]:
7480
In [488]:
sorted_tokens=dict(sorted(tokens.items(),key=lambda x:x[1][0],reverse=True))
print(sorted_tokens)
{'of': [1395, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 3
6, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 7
3, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 10
8, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 1
23, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137,
138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,
153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182,
183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242,
243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257,
258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287,
288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302,
In [118]:
print(sorted_tokens.keys())
'environment', 'page', 'philosophi', 'cut', 'socal', 'paramount', 'countr
i', 'drastic', 'lester', 'classif', 'log', 'jmathphi', 'hypergeometr', 'ow
enpr', 'insert', 'grid', 'grade', 'typifi', 'neighbourhood', 'signal', 'fi
lter', 'db', 'lubric', 'mathieu', 'stratiform', 'apprais', 'elabor', 'wort
h', 'quadrupol', 'shapiro', 'telemet', 'track', 'interplanetari', 'refract
ori', 'aeroquart', 'ward', 'biconvex', 'xy', 'gaussian', 'immers', 'millik
ancb', 'belong', 'lowturbul', 'unfortun', 'ship', 'hodograph', 'zeroord',
'miss', 'mar', 'overshoot', 'brake', 'walker', 'dispers', 'inert', 'cell',
'lowdens', 'photographi', 'annulu', 'holderdw', 'cravenah', 'entrain', 'de
duct', 'utia', 'maxwellian', 'molyneuxwg', 'tnstruct', 'compressibleflow',
'monatom', 'liter', 'unusu', 'yashuram', 'jphyssoc', 'lengthwis', 'aj', 'p
ercentthick', 'idealga', 'rankin', 'seal', 'lowfrequ', 'omit', 'british',
'constrain', 'screen', 'stand', 'sideforc', 'manufactur', 'stratfordb', 'c
oncurr', 'bubbl', 'singlestag', 'stator', 'powel', 'astronaut', 'ensur',
'smoke', 'quasicylindr', 'quasi', 'bank', 'angularli', 'weapon', 'tangen
c', 'tangentcon', 'quasicylind', 'referenc', 'code', 'kettledj', 'strut',
'gooderumpb', 'woodgp', 'visualis', 'visibl', 'royal', 'deposit', 'oil',
'compact', 'white', 'hard', 'eventu', 'said', 'weberj', 'squir', 'sweptw',
'lilleygm', 'civil', 'poiseuil', 'nuclear', 'jappphi', 'eckhausw', 'con',
'hour', 'jd', 'polish', 'stagnationtowal', 'mission', 'evapor', 'entail',
In [119]:
In [120]:
Out[120]:
2 and 1342 [1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
In [121]:
#stopwords removal
In [122]:
# printing tokens
df=df.sort_values('Tokens')
df.to_csv('index.csv')
In [123]:
# printing tokens
index_df=pd.read_csv("index.csv")
index_df.head()
Out[123]:
0 aaaero 1 [1111]
1 aaaeroconf 1 [899]
2 aasu 1 [722]
4 abbott 1 [1340]
In [124]:
# Stopwords csv
stopwords_df=pd.read_csv("stopwords.csv")
stopwords_df.head(15)
Out[124]:
2 and 1342 [1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
7 for 1145 [1, 2, 3, 4, 5, 6, 9, 10, 13, 14, 15, 16, 17, ...
8 are 1029 [3, 4, 5, 6, 7, 11, 12, 14, 15, 17, 18, 19, 20...
9 with 1010 [1, 3, 4, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17,...
10 on 913 [7, 8, 9, 11, 13, 14, 15, 17, 18, 19, 21, 22, ...
12 that 805 [1, 2, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 18,...
13 an 796 [1, 2, 8, 9, 10, 11, 14, 15, 16, 17, 19, 21, 2...
Query processing
In [125]:
i=1
punctuations='''()-[]{};:\/,.<>@$^*%&'''
with open('cran/cran.qry','r') as f:
for line in f:
if line[:2]=='.I':
file=open(f'Cran Query\{i}.txt','x')
i+=1
elif line[:2] in ['.T','.B','.A','.W']:
continue
elif line=='':
file.close()
else:
no_punct_line=""
for char in line:
if(char not in punctuations):
no_punct_line+=char
file.write(no_punct_line)
...
In [126]:
file=open(f'Cran Query\225.txt','w')
file.close()
In [505]:
stopwords_list
Out[505]:
['of',
'the',
'and',
'a',
'to',
'in',
'is',
'for',
'are',
'with',
'on',
'by',
'that',
'an',
'at',
'be',
'flow',
'result',
'thi',
'as',
'from',
'it',
'which',
'number',
'effect',
'pressur',
'use',
'present',
'j',
'obtain']
Query execution-AND
In [506]:
output = dict()
for i in range(1,226):
with open(f'Cran Query\{i}.txt', 'r') as f:
mat = []
for line in f:
for word in line.split():
if word in sorted_tokens:
mat.append(sorted_tokens[word][1])
k = mat[0]
for i in range(1, len(mat)):
k = k.intersection(mat[i])
if len(k)!=0:
output[i]=list(k)
output
Out[506]:
{9: [329, 142, 1263, 625, 1107, 1300, 1204, 983, 666, 1307, 1213],
1: [2,
3,
4,
1029,
8,
9,
522,
13,
525,
15,
527,
1042,
21,
22,
In [531]:
# QUERY-OR
output = dict()
for i in range(1,226):
with open(f'Cran Query\{i}.txt', 'r') as f:
mat = []
for line in f:
for word in line.split():
if word in sorted_tokens:
mat.append(sorted_tokens[word][1])
k = mat[0]
for i in range(1, len(mat)):
k = k.union(mat[i])
if len(k)!=0:
output[i]=list(k)
output
Out[531]:
{7: [1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
Query-1
In [507]:
Out[507]:
docid qid
0 2 1
1 3 1
2 4 1
3 1029 1
4 8 1
211 493 1
212 1008 1
213 500 1
214 1013 1
215 1014 1
import pandas as pd
re_pd=pd.read_csv('cran/cranqrel.csv',names=["qid","docid","rel"])
re_pd
Out[508]:
0 1 184 2
1 1 29 2
2 1 31 2
3 1 12 3
4 1 51 3
In [509]:
final_df
Out[509]:
0 1 29 2
1 1 31 2
5 1 486 -1
In [510]:
relevant_retrived=len(final_df)
total_relevant=len(re_pd[re_pd.qid == 1])
total_retrived=len(query_df)
In [511]:
print(relevant_retrived)
print(total_relevant)
print(total_retrived)
29
216
In [512]:
def precision(a,b):
return a/b
def recall(c,d):
return c/d
In [513]:
precision_9=precision(relevant_retrived,total_retrived)
recall_9=recall(relevant_retrived,total_relevant)
measures_dict={}
measures_dict["Query1"]=[precision_9,recall_9]
In [514]:
measures_dict
Out[514]:
QID-3
In [515]:
def score(a):
query_df=pd.DataFrame(output[a],columns=["docid"])
query_df["qid"]=a
final_df=pd.merge(re_pd, query_df, on=["qid","docid"], how='inner')
final_df=final_df[final_df.rel<3]
relevant_retrived=len(final_df)
print(final_df)
total_relevant=len(re_pd[re_pd.qid == a])
total_retrived=len(query_df)
precision_3=precision(relevant_retrived,total_retrived)
recall_3=recall(relevant_retrived,total_relevant)
measures_dict["Query"+str(a)]=[precision_3,recall_3]
print("Total relevant documents are {}".format(total_relevant))
print("Total relevant retrived documents are {}".format(relevant_retrived))
print("Total retrived documents are {}".format(total_retrived))
return measures_dict
In [516]:
score(3)
Empty DataFrame
Index: []
Out[516]:
In [517]:
measures_dict
Out[517]:
QID-15
In [518]:
score(15)
0 15 463 1
2 15 497 -1
Out[518]:
In [519]:
measures_dict
Out[519]:
QID-71
In [520]:
score(71)
0 71 569 1
1 71 571 1
2 71 1355 2
6 71 572 1
Out[520]:
measures_dict
Out[521]:
QID-2
In [522]:
score(2)
Empty DataFrame
Index: []
Out[522]:
In [523]:
measures_dict
Out[523]:
QID-109
In [524]:
score(109)
0 109 860 1
1 109 861 1
5 109 766 -1
Out[524]:
In [525]:
measures_dict
Out[525]:
QID-6
In [526]:
score(6)
Empty DataFrame
Index: []
Out[526]:
score(192)
0 192 733 1
1 192 734 1
2 192 735 1
3 192 736 1
4 192 641 -1
Out[527]:
QID-204
In [528]:
score(204)
Empty DataFrame
Index: []
Out[528]:
values_df=pd.DataFrame.from_dict(measures_dict, orient='index',columns=["Precision","Recall
In [530]:
values_df
Out[530]:
Precision Recall
In [ ]:
In [ ]: