Professional Documents
Culture Documents
import os
import re
from collections import Counter

import docx2txt

import booleanrm as bm
import preprocess_index as pi
def atoi(text):
    """Convert ``text`` to an int when it consists solely of decimal digits.

    Args:
        text: A string chunk, typically one piece of a file name.

    Returns:
        ``int(text)`` if every character is a decimal digit, otherwise
        ``text`` unchanged (this includes the empty string).
    """
    # isdecimal() rather than isdigit(): isdigit() is also true for
    # characters such as superscript digits, which int() cannot parse and
    # would therefore raise ValueError.
    return int(text) if text.isdecimal() else text
def nkey(text):
    """Build a sort key that orders embedded numbers numerically.

    ``text`` is split on runs of digits; the capturing group keeps the
    digit runs in the result. Every piece is then passed through ``atoi``,
    so the returned list alternates between text pieces and numbers.
    """
    pieces = re.split(r'(\d+)', text)
    return list(map(atoi, pieces))
def __open(nfile, pathf):
    """Return what ``docx2txt.process`` yields for file ``nfile`` in ``pathf``.

    Args:
        nfile: File name inside the directory.
        pathf: Directory containing the file.
    """
    full_path = os.path.join(pathf, nfile)
    return docx2txt.process(full_path)
def __mydict(li):
    """Return the items of ``li`` as a list without duplicates.

    The first occurrence of each item is kept and the original order of
    those first occurrences is preserved. Items must be hashable.
    """
    first_seen = {}
    for item in li:
        # Dict keys remember insertion order; later duplicates are ignored.
        first_seen.setdefault(item, None)
    return list(first_seen)
# ---------------------------------------------------------------------------
# Script body: index every document in the corpus directory, then run one
# boolean-retrieval query against the index.
# ---------------------------------------------------------------------------

# Raw string: the previous literal only worked because "\P", "\B" and "\D"
# are not recognised escape sequences, which Python now warns about.
pathf = os.path.abspath(r"C:\Pemrograman\Python\Boolean\Dokumen")
files = os.listdir(pathf)
files.sort(key=nkey)  # natural order, e.g. "doc2" before "doc10"

# Without at least one document the indexer object below is never created,
# so stop with a clear message instead of a NameError later on.
if not files:
    raise SystemExit(f"No documents found in {pathf}")

cont = Counter()  # running term counter threaded through IP.countword
data = {}
title = {}        # document number -> title taken from the first line
termloc = {}      # structure threaded through IP.setloc

# Documents are numbered from 1 in sorted file order.
for doc_id, fname in enumerate(files, start=1):
    content = __open(fname, pathf)

    # Title: first line with spaces removed, text after the first ':'.
    # Fall back to the whole (space-stripped) line when there is no colon
    # rather than failing with IndexError.
    first_line = content.split('\n', 1)[0].replace(" ", "")
    _, colon, after_colon = first_line.partition(':')
    title[doc_id] = after_colon if colon else first_line

    # Body: everything from the third line onwards. Lines are joined with a
    # space so the last word of one line is not fused with the first word
    # of the next.
    body_text = " ".join(content.split("\n")[2:])

    PP = pi.preproc()
    terms = PP.run_pp(body_text)  # presumably a list of terms - confirm in preprocess_index
    terms = __mydict(terms)       # each distinct term once per document

    IP = pi.indeks()
    cont = IP.countword(terms, cont)
    termloc = IP.setloc(doc_id, cont, terms, termloc)

# NOTE(review): the source's indentation was lost; this conversion is assumed
# to run once after all documents are processed - confirm.
data = IP.todict(cont)

# `query` was never assigned anywhere in the script, so the final call
# raised NameError. Read it from the user instead.
# NOTE(review): assumes searchbrm accepts the raw query string - confirm.
query = input("Query: ")

BRM = bm.brm(data, termloc, title)
BRM.searchbrm(query)