Praktikum 2 PI Genap2023

JURUSAN INFORMATIKA
UNIVERSITAS SANATA DHARMA
PRAKTIKUM PEMEROLEHAN INFORMASI

VECTOR SPACE MODEL
A. TUJUAN PRAKTIKUM
a. Mahasiswa mampu mengimplementasikan vector space model
B. PRAKTIKUM
I. Perbaiki kode program berikut sehingga program dapat menampilkan nama file dokumen terurut
yang sesuai dengan rangking relevansi query.
!pip install PySastrawi
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
import glob
import re
import os
import numpy as np
import sys
Stopwords = set(stopwords.words('indonesian'))
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def finding_all_unique_words_and_freq(words):
words_unique = []
word_freq = {}
for word in words:
if word not in words_unique:
words_unique.append(word)
for word in words_unique:
word_freq[word] = words.count(word)
return word_freq
def remove_special_characters(text):
regex = re.compile('[^a-zA-Z0-9\s]')
text_returned = re.sub(regex,'',text)
return text_returned
1
all_words = []
dict_global = {}
file_folder = '/content/drive/MyDrive/PI2023/dok/*'
idx = 1
files_with_index = {}
for file in glob.glob(file_folder):
#print(file)
fname = file
file = open(file , "r")
text = file.read()
text = remove_special_characters(text)
text = re.sub(re.compile('\d'),'',text)
sentences = sent_tokenize(text)
words = word_tokenize(text)
words = [word for word in words if len(words)>1]
words = [word.lower() for word in words]
words = [word for word in words if word not in Stopwords]
words = [stemmer.stem(word) for word in words]
dict_global.update(finding_all_unique_words_and_freq(words))
files_with_index[idx] = os.path.basename(fname)
idx = idx + 1
unique_words_all = set(dict_global.keys())
class Node:
def __init__(self ,docId, freq = 0):
self.freq = freq
self.tfidf = 0
self.doc = docId
self.nextval = None
class SlinkedList:
def __init__(self ,head = None, nDoc=0):
self.head = head
self.ndoc = nDoc
self.idf = 0
linked_list_data = {}
for word in unique_words_all:
linked_list_data[word] = SlinkedList()
linked_list_data[word].head = Node(1,Node)
word_freq_in_doc = {}
idx = 0
for file in glob.glob(file_folder):
file = open(file, "r")
text = file.read()
text = remove_special_characters(text)
2
text = re.sub(re.compile('\d'),'',text)
sentences = sent_tokenize(text)
words = word_tokenize(text)
words = [word for word in words if len(words)>1]
words = [word.lower() for word in words]
words = [word for word in words if word not in Stopwords]
words = [stemmer.stem(word) for word in words]
word_freq_in_doc = finding_all_unique_words_and_freq(words)
for word in word_freq_in_doc.keys():
linked_list = linked_list_data[word].head
linked_list_data[word].ndoc=linked_list_data[word].ndoc+1
while linked_list.nextval is not None:
linked_list = linked_list.nextval
linked_list.nextval = Node(idx ,word_freq_in_doc[word])
idx = idx + 1
import math
import operator
nTot=len(files_with_index)
for word in unique_words_all:
linkedlist = linked_list_data[word].head
df=linked_list_data[word].ndoc
idf=0
idf = math.log2(nTot/df)+1
linked_list_data[word].idf=idf
while linkedlist.nextval is not None:
linkedlist = linkedlist.nextval
linkedlist.tfidf = idf * linkedlist.freq
dict_words=list(unique_words_all)
total_files = len(files_with_index)
total_vocab = len(dict_words)
vecD = np.zeros((total_files,total_vocab))
for i in range(len(dict_words)):
linkedlist = linked_list_data[dict_words[i]].head
while linkedlist.nextval is not None:
linkedlist = linkedlist.nextval
vecD[linkedlist.doc - 1][i] = linkedlist.tfidf
query = input('Enter your query:')

dict_query = {}
vecQ = np.zeros(total_vocab)
wordsq = word_tokenize(query)
wordsq = [word for word in wordsq if len(wordsq)>1]
wordsq = [word.lower() for word in wordsq]
wordsq = [word for word in wordsq if word not in Stopwords]
wordsq = [stemmer.stem(word) for word in wordsq]
dict_query.update(finding_all_unique_words_and_freq(wordsq))
3
for word in dict_query.keys():
vecQ[dict_words.index(word)] = linked_list_data[word].idf * dict_qu
ery[word]
def cosine_sim(a, b):

cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
return cos_sim
d_cosines = []
for d in vecD:
d_cosines.append(cosine_sim(vecQ,d))
out=np.array(d_cosines).argsort()[-total_files:][::-1]
print(out)
II. Ujilah menggunakan dokumen yang ada di teori, dan cocokkan hasil cosine similarity yang
didapat
C. LAPORAN PRAKTIKUM
1. Struktur data apa yang digunakan untuk menyimpan inverted index. Gambarkan dan
jelaskan isi dari inverted index
2. Jelaskan kode program untuk menghitung term frequency
3. Jelaskan kode program untuk menghitung index document frequency
4. Jelaskan kode program untuk menghitung cosine similarity
5. Bagaimana cara program merangking dokumen hasil kueri ?
6. Capture hasil dokumen dari kueri dan ujilah untuk beberapa kata kueri
D. PUSTAKA
https://github.com/williamscott701/Information-Retrieval/blob/master/2.%20TF-
IDF%20Ranking%20-%20Cosine%20Similarity%2C%20Matching%20Score/TF-
IDF.ipynb
https://belajarpython.com/2018/05/sastrawi-natural-language-processing-bahasa-indonesia/
https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-
world-dataset-796d339a4089
@ Salam sehat selalu @

Praktikum 2 PI Genap2023

Uploaded by

Document Information

Original Description:

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Praktikum 2 PI Genap2023

Uploaded by

Copyright:

Available Formats

JURUSAN INFORMATIKA

UNIVERSITAS SANATA DHARMA

PRAKTIKUM PEMEROLEHAN INFORMASI

query = input('Enter your query:')

def cosine_sim(a, b):

You might also like