You are on page 1 of 1

from collections import Counter

import preprocess_index as pi
import booleanrm as bm
import os, docx2txt, re

def atoi(text):
    """Convert *text* to an ``int`` when it consists solely of decimal digits.

    Args:
        text: The string to convert.

    Returns:
        ``int(text)`` if every character of *text* is a decimal digit,
        otherwise *text* itself, unchanged (this includes the empty string).
    """
    # str.isdecimal() only accepts characters that int() can parse, whereas
    # str.isdigit() is also true for e.g. superscript digits ("²"), which
    # would make int() raise ValueError.
    return int(text) if text.isdecimal() else text

def nkey(text):
    """Build a sort key that orders embedded numbers by numeric value.

    *text* is split around runs of digits (the capturing group keeps the
    digit runs in the result) and every piece is passed through ``atoi``,
    so digit runs become ints while the remaining pieces stay strings.

    Args:
        text: The string to build a key for (e.g. a file name).

    Returns:
        A list alternating between string pieces and integer pieces,
        always starting with a (possibly empty) string piece.
    """
    return [atoi(piece) for piece in re.split(r'(\d+)', text)]

def __open(nfile, pathf):
    """Read one document through ``docx2txt``.

    Args:
        nfile: File name of the document, relative to *pathf*.
        pathf: Directory that contains the document.

    Returns:
        Whatever ``docx2txt.process`` returns for the joined path
        (the text extracted from the document).
    """
    return docx2txt.process(os.path.join(pathf, nfile))

def __mydict(li):
    """Remove duplicates from *li* while keeping first-seen order.

    Args:
        li: An iterable of hashable items.

    Returns:
        A new list holding each distinct item once, in the order in which
        it first appeared in *li*.
    """
    # dict keys are unique and preserve insertion order, which gives an
    # order-preserving de-duplication in a single pass.
    return list(dict.fromkeys(li))

# --- Build the index from every document in the corpus directory -----------
# Raw string: the backslashes of the Windows path are literal characters.
# Written as a normal string, "\P", "\B" and "\D" are invalid escape
# sequences, which newer Python versions warn about.
pathf = os.path.abspath(r"C:\Pemrograman\Python\Boolean\Dokumen")
files = os.listdir(pathf)
files.sort(key=nkey)  # numeric-aware order, e.g. "2.docx" before "10.docx"

# The preprocessing/indexing helpers are created inside the loop below; with
# no documents they would never exist, so stop here with a clear message
# instead of failing later with a NameError.
if not files:
    raise SystemExit("No documents found in " + pathf)

cont = Counter()  # accumulator handed to and returned by IP.countword()
title = {}        # document number (1-based) -> title taken from line 1
termloc = {}      # accumulator handed to and returned by IP.setloc()

for i, f in enumerate(files, start=1):
    content = __open(f, pathf)

    # Title: first line with all spaces removed, keeping what follows the
    # first colon. If the line has no colon, the whole line is used rather
    # than raising IndexError.
    first_line = content.split('\n', 1)[0].replace(" ", "")
    _, colon, after_colon = first_line.partition(':')
    title[i] = after_colon if colon else first_line

    # Body: everything from the third line onward. Lines are joined with a
    # space so the last word of one line is not fused with the first word
    # of the next one.
    contb = " ".join(content.split("\n")[2:])
    PP = pi.preproc()
    contb = PP.run_pp(contb)
    # Keep each term once per document, in first-seen order
    # (assumes run_pp returns an iterable of hashable terms -- TODO confirm).
    contb = __mydict(contb)

    IP = pi.indeks()
    cont = IP.countword(contb, cont)
    termloc = IP.setloc(i, cont, contb, termloc)

data = IP.todict(cont)

# --- Query handling ---------------------------------------------------------
query = input("Masukkan query : ")

# The query goes through the case-folding and tokenisation steps of the
# preprocessor before it is handed to the Boolean retrieval model.
query = PP.casefolding(query)
query = PP.tokenisasi(query)

BRM = bm.brm(data, termloc, title)
BRM.searchbrm(query)

You might also like