You are on page 1 of 4

JURUSAN INFORMATIKA

UNIVERSITAS SANATA DHARMA

PRAKTIKUM PEMEROLEHAN INFORMASI


VECTOR SPACE MODEL

A. TUJUAN PRAKTIKUM
a. Mahasiswa mampu mengimplementasikan vector space model

B. PRAKTIKUM
I. Perbaiki kode program berikut sehingga program dapat menampilkan nama file dokumen terurut
yang sesuai dengan rangking relevansi query.
!pip install PySastrawi

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
import glob
import re
import os
import numpy as np
import sys
Stopwords = set(stopwords.words('indonesian'))
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def finding_all_unique_words_and_freq(words):
    """Return a dict mapping each distinct word in *words* to its frequency.

    Single pass over the input; the original built a list of unique
    words and then called words.count() for each one, which is O(n^2).
    First-occurrence insertion order is preserved, matching the original.
    """
    word_freq = {}
    for word in words:
        # dict.get with a default folds membership test and update into one step
        word_freq[word] = word_freq.get(word, 0) + 1
    return word_freq

def remove_special_characters(text):
    """Strip every character that is not a letter, digit, or whitespace.

    Uses a raw-string pattern: the original '[^a-zA-Z0-9\\s]' relied on an
    invalid escape sequence in a normal string, which newer Python versions
    flag with a DeprecationWarning/SyntaxWarning.
    """
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# --- Read every document in the corpus folder, normalise its text and
# --- collect the global vocabulary plus an index -> filename mapping.
all_words = []
dict_global = {}
file_folder = '/content/drive/MyDrive/PI2023/dok/*'
idx = 1
files_with_index = {}
for file in glob.glob(file_folder):
    fname = file
    # 'with' guarantees the handle is closed (the original leaked it)
    with open(file, "r") as fh:
        text = fh.read()
    text = remove_special_characters(text)
    text = re.sub(re.compile(r'\d'), '', text)
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    # BUG FIX: the original tested len(words) > 1 (the whole token list),
    # so one-character tokens were never dropped; test each token instead
    words = [word for word in words if len(word) > 1]
    words = [word.lower() for word in words]
    words = [word for word in words if word not in Stopwords]
    words = [stemmer.stem(word) for word in words]
    dict_global.update(finding_all_unique_words_and_freq(words))
    # 1-based document ids; used later to translate ranked rows to names
    files_with_index[idx] = os.path.basename(fname)
    idx = idx + 1

unique_words_all = set(dict_global.keys())

class Node:
    """One posting in an inverted-index list: a document id together with
    the term's raw frequency and (filled in later) its tf-idf weight."""

    def __init__(self, docId, freq=0):
        self.doc = docId        # document identifier
        self.freq = freq        # raw term frequency in that document
        self.tfidf = 0          # set by the weighting pass
        self.nextval = None     # link to the next posting in the chain

class SlinkedList:
    """Singly linked posting list for one term, carrying the term's
    document frequency and idf weight alongside the chain head."""

    def __init__(self, head=None, nDoc=0):
        self.head = head    # (dummy) head node of the posting chain
        self.ndoc = nDoc    # number of documents containing the term
        self.idf = 0        # inverse document frequency, set later

# --- Build the inverted index: for every vocabulary term, a posting list
# --- of (document id, term frequency) nodes behind a dummy head.
linked_list_data = {}
for word in unique_words_all:
    linked_list_data[word] = SlinkedList()
    # dummy head node; real postings are appended after it
    # (the original passed the Node *class* as the freq argument)
    linked_list_data[word].head = Node(0, 0)
word_freq_in_doc = {}
# BUG FIX: start at 1 so posting doc-ids match the 1-based keys of
# files_with_index and the vecD[doc - 1] row lookup later on
# (the original started at 0, shifting every document id)
idx = 1
for file in glob.glob(file_folder):
    with open(file, "r") as fh:
        text = fh.read()
    text = remove_special_characters(text)
    text = re.sub(re.compile(r'\d'), '', text)
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    # BUG FIX: filter on each token's length, not the list's length
    words = [word for word in words if len(word) > 1]
    words = [word.lower() for word in words]
    words = [word for word in words if word not in Stopwords]
    words = [stemmer.stem(word) for word in words]
    word_freq_in_doc = finding_all_unique_words_and_freq(words)
    for word in word_freq_in_doc.keys():
        linked_list = linked_list_data[word].head
        linked_list_data[word].ndoc = linked_list_data[word].ndoc + 1
        # walk to the tail of the chain, then append this doc's posting
        while linked_list.nextval is not None:
            linked_list = linked_list.nextval
        linked_list.nextval = Node(idx, word_freq_in_doc[word])
    idx = idx + 1

import math
import operator

# --- Weight every posting: tfidf = tf * idf with a smoothed idf so a
# --- term present in every document still gets a non-zero weight.
nTot = len(files_with_index)
for word in unique_words_all:
    linkedlist = linked_list_data[word].head
    df = linked_list_data[word].ndoc
    # (the original's dead 'idf = 0' assignment has been removed)
    idf = math.log2(nTot / df) + 1
    linked_list_data[word].idf = idf
    # skip the dummy head, then weight each posting of this term
    while linkedlist.nextval is not None:
        linkedlist = linkedlist.nextval
        linkedlist.tfidf = idf * linkedlist.freq

# --- Materialise the document-term matrix: one row per document, one
# --- column per vocabulary term, cells holding tf-idf weights.
dict_words = list(unique_words_all)
total_files = len(files_with_index)
total_vocab = len(dict_words)
vecD = np.zeros((total_files, total_vocab))
for col, term in enumerate(dict_words):
    node = linked_list_data[term].head
    # skip the dummy head, then copy each posting's weight into its row
    # (doc ids are assumed 1-based, hence the doc - 1 row index)
    while node.nextval is not None:
        node = node.nextval
        vecD[node.doc - 1][col] = node.tfidf

# --- Build the query vector in the same vocabulary space as vecD.
query = input('Enter your query:')

dict_query = {}
vecQ = np.zeros(total_vocab)

wordsq = word_tokenize(query)
# BUG FIX: filter on each token's length, not the list's length
wordsq = [word for word in wordsq if len(word) > 1]
wordsq = [word.lower() for word in wordsq]
wordsq = [word for word in wordsq if word not in Stopwords]
wordsq = [stemmer.stem(word) for word in wordsq]
dict_query.update(finding_all_unique_words_and_freq(wordsq))

for word in dict_query.keys():
    # ROBUSTNESS: a query term absent from the corpus vocabulary would
    # make dict_words.index(word) raise ValueError; skip such terms
    if word in linked_list_data:
        vecQ[dict_words.index(word)] = linked_list_data[word].idf * dict_query[word]

def cosine_sim(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Returns 0.0 when either vector has zero norm; the original divided
    by zero there, producing nan for an all-zero query or document.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

# --- Score every document against the query and print the ranking.
d_cosines = []
for d in vecD:
    d_cosines.append(cosine_sim(vecQ, d))
# indices of all documents sorted by descending similarity
out = np.array(d_cosines).argsort()[-total_files:][::-1]
# TASK FIX: the exercise asks for ranked document FILE NAMES, not raw
# row indices; row i of vecD holds document id i + 1, which keys
# files_with_index, so translate before printing
for rank, i in enumerate(out, start=1):
    print(rank, files_with_index[i + 1], d_cosines[i])

II. Ujilah menggunakan dokumen yang ada di teori, dan cocokkan hasil cosine similarity yang
didapat

C. LAPORAN PRAKTIKUM

1. Struktur data apa yang digunakan untuk menyimpan inverted index. Gambarkan dan
jelaskan isi dari inverted index
2. Jelaskan kode program untuk menghitung term frequency
3. Jelaskan kode program untuk menghitung index document frequency
4. Jelaskan kode program untuk menghitung cosine similarity
5. Bagaimana cara program merangking dokumen hasil kueri ?
6. Capture hasil dokumen dari kueri dan ujilah untuk beberapa kata kueri

D. PUSTAKA

https://github.com/williamscott701/Information-Retrieval/blob/master/2.%20TF-
IDF%20Ranking%20-%20Cosine%20Similarity%2C%20Matching%20Score/TF-
IDF.ipynb
https://belajarpython.com/2018/05/sastrawi-natural-language-processing-bahasa-indonesia/
https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-
world-dataset-796d339a4089
@ Salam sehat selalu @

You might also like