Professional Documents
Culture Documents
(BTL IT4853) Nhom 3-Bao CaoTongKet
(BTL IT4853) Nhom 3-Bao CaoTongKet
BÀI TẬP LỚN
MÔN: TÌM KIẾM VÀ TRÌNH DIỄN THÔNG TIN
:
Nguyn Quang Tun
20093005
V nh Tng
20090173
20092146
Mục lục
Tài liệu mẫu sử dụng.....................................................................................4
1.1.1.
Tài liệu mẫu:..............................................................................................4
1.1.2.
Kết quả chiết xuất......................................................................................5
1.2.
1.3.
Cấu hình máy.................................................................................................7
1.4.
Mã nguồn........................................................................................................8
1.5.
Kết quả thực nghiệm...................................................................................23
1.6.
Trang 2
Trang 3
CHƯƠNG 1: GIAI ĐOẠN 1
1.1.
Tài liệu mẫu sử dụng
Trang 4
<link>http://vi.wikipedia.org/wiki/Ti%E1%BA%BFn_qu
%C3%A2n_ca#Xem_th.C3.AAm</link>
</sublink>
<sublink linktype="nav">
<anchor>Ch thch</anchor>
<link>http://vi.wikipedia.org/wiki/Ti%E1%BA%BFn_qu
%C3%A2n_ca#Ch.C3.BA_th.C3.ADch</link>
</sublink>
<sublink linktype="nav"><anchor>Lin kt ngoi</anchor>
<link>http://vi.wikipedia.org/wiki/Ti%E1%BA%BFn_qu
%C3%A2n_ca#Li.C3.AAn_k.E1.BA.BFt_ngo.C3.A0i</link>
</sublink>
</links>
</doc>
</feed>
Trang 5
Trang 6
1.3. Cấu hình máy
Processor: Intel Core i5-3210M CPU @ 2.50GHz
OS type: 32-bit
Memory: 2GB RAM
Operating System: Ubuntu 13.04
Trang 7
1.4.
Mã nguồn
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
<xapian.h>
<stdio.h>
<iostream>
<string.h>
<cstring>
<cstdlib> // For exit().
<sys/types.h>
<sys/stat.h>
<fcntl.h>
<unistd.h>
<sys/mman.h>
<stdlib.h>
#include "XapianIndex.h"
#include "Timer.h"
// The one and only application object
using namespace std;
static
static
static
static
//test
int iAbstractNumber = 0;
int iLinksNumber = 0;
int iAuthorNumber = 0;
int iTitleNumber = 0;
int main()
{
int iType;
printf("1. File Test with ProcessXML\n2. File data
with ProcessXML\n\n");
scanf("%d",&iType);
switch(iType)
{
Trang 8
return 1;
Trang 9
if(!S_ISREG(sb.st_mode))
{
cout << "Error" << endl;
close(hFile);
return 0;
}
dwFileSize = sb.st_size;
lpMapFile = (char*)mmap(0, sb.st_size, PROT_READ,
MAP_SHARED, hFile, 0);
if(lpMapFile == MAP_FAILED)
{
cout << "error map" << endl;
close(hFile);
return 0;
}
if(close(hFile) == -1)
{
cout << "Error close" << endl;
return 0;
}
//Start Analysis
int iStartElement = 0, iEndElement;
iSeekData = 0;
int iLenDoc, iLenDocTotal = 0;
int iStartTitle, iEndTile;
int iDocNumber = 0;
int iItemSize = 0, iPointer = 0;
char * pValue = new char[0x100000];
char szUrl[10000];
char szTitle[10000];
db.begin_transaction();
Trang 10
Trang 11
Trang 12
szTitle);
doc.add_value(XAPIAN_FIELD_CONTENT, para);
// Add the document to the database.
db.replace_document(szId,doc);
continue;
}
Trang 13
bFlagSublink = true;
while(bFlagSublink == true)
{
//c anchor
SetSeekData(iStartSublink +
strlen("<sublink"));
int iStartAnchor =
GetOffsetElement("<anchor");
int iEndAnchor =
GetOffsetElement("</anchor>");
if (iStartAnchor < 0
|| iEndAnchor < 0
|| iStartAnchor > iEndAnchor
|| iEndAnchor > iEndSubLink)
{
}
else
{
iAuthorNumber++;
iItemSize =
GetLentDataOnElement_(iStartAnchor,iEndAnchor,iPointer);
iLenDoc += iItemSize + 1; //them ki
tu \n
memcpy(pValue,(char*)(lpMapFile +
iPointer),iItemSize );
pValue[iItemSize] = '\n';
pValue[iItemSize + 1] = 0;
para += pValue;
}
//tm sublink tip theo
SetSeekData(iEndSubLink +
strlen("</sublink>"));
bFlagSublink = false;
iStartSublink =
GetOffsetElement("<sublink");
iEndSubLink =
GetOffsetElement("</sublink>");
Trang 14
Trang 15
int iAvgLenDoc;
if (iDocNumber > 0)
{
iAvgLenDoc = iLenDocTotal/iDocNumber;
}
else
{
iAvgLenDoc = 0;
}
finish = ti.getElapsedTime();
double duration = (double)(finish - start);
printf("Thoi gian thuc thi: %.10lf\n", duration);
printf("So van ban: %d\n",iDocNumber);
printf("Tong do dai: %d\n", iLenDocTotal);
printf("Do dai trung binh: %d\n",iAvgLenDoc);
//Thong ke 30 tu xuat hien nhieu nhat
Trang 17
}
}
else{
m = m + i - pTable[i];
if (pTable[i] > -1) i = pTable[i];
else i = 0;
}
}
free(pTable);
free(pSource);
return -1;
}
/**
 * Convenience entry point for a Knuth-Morris-Pratt search: builds the
 * failure table for pSample with preKMP() and then hands the search to KMP().
 *
 * @param pBuffer      text to search in
 * @param pSample      pattern to look for
 * @param iBufferSize  number of characters of pBuffer to consider
 * @param iSampleSize  number of characters of pSample
 * @return whatever KMP() returns for this input; -1 when the pattern is too
 *         long for the fixed-size failure table.
 *         NOTE(review): -1 is assumed to be the shared "not found" value of
 *         the search routines in this file - confirm against KMP().
 */
int KMP_Search(char* pBuffer, char* pSample, int
iBufferSize, int iSampleSize)
{
    // preKMP() fills pNext[0] .. pNext[iSampleSize] (iSampleSize + 1 slots),
    // so the pattern length must be strictly smaller than the table size;
    // otherwise the write runs past the end of the stack array.
    if (iSampleSize >= KMP_MAX_SAMPLE_SIZE)
    {
        return -1;
    }

    int pNext[KMP_MAX_SAMPLE_SIZE] = {0};
    preKMP(pSample, iSampleSize, pNext);
    return KMP(pBuffer, pSample, iBufferSize, iSampleSize, pNext);
}
/**
 * Fills the KMP failure table for a pattern.
 *
 * pNext[0] is set to -1; for every position p in 1..iSampleSize, pNext[p]
 * receives the length of the longest proper prefix of the first p pattern
 * characters that is also a suffix of them.
 *
 * @param pSample      pattern characters
 * @param iSampleSize  number of pattern characters
 * @param pNext        output table; must have room for iSampleSize + 1 ints
 */
void preKMP(char* pSample, int iSampleSize, int* pNext)
{
    int border = -1;
    pNext[0] = border;

    for (int pos = 0; pos < iSampleSize; ++pos)
    {
        // Fall back through shorter borders until one can be extended by
        // the current character (or none is left).
        while (border >= 0 && pSample[pos] != pSample[border])
        {
            border = pNext[border];
        }
        ++border;
        pNext[pos + 1] = border;
    }
}
Trang 20
Trang 21
return
KMPi(pBuffer,pSample,iBufferSize,iSampleSize,pNext);
}
Trang 23
CHƯƠNG 2: GIAI ĐOẠN 2
2.1. Thuật toán sử dụng
Bước 1. Trích rút văn bản (GĐ I không sử dụng thư viện)
Bước 2. Đánh chỉ mục cho văn bản vừa được trích rút
Bước 3. Quay trở lại Bước 1 cho tới khi đánh chỉ mục cho toàn bộ các văn bản
trích rút được trong tài liệu
Mã nguồn
Xapian::WritableDatabase db("MyIndex",
Xapian::DB_CREATE_OR_OPEN);
Xapian::TermGenerator indexer;
Xapian::Stem stemmer("english");
db.begin_transaction();
For(int i=0;i<iNumberDoc;i++){
//para: ni dung ca 1 vn bn trch rt
indexer.set_stemmer(stemmer);
Xapian::Document doc;
indexer.set_document(doc);
sprintf(szId, "UID:%d" , iDocNumber);
indexer.index_text(para);
doc.add_term(szId);
doc.add_value(XAPIAN_FIELD_CONTENT, para);
db.replace_document(szId,doc);
}
db.commit_transaction();
Trang 24
2.2. Cấu trúc chỉ mục
2.2.1. Giới thiệu
- Xapian là một thư viện mã nguồn mở được viết bằng C++ có thể sử dụng
với các ngôn ngữ Perl, Python, PHP, Java, Tcl, C#, Ruby, Lua để đánh chỉ
mục và tìm kiếm một cách tối ưu cho ứng dụng.
2.2.2. Đánh chỉ mục với Xapian
- Thông tin chỉ mục của database được đánh bởi Xapian sẽ được lưu trong
một database dưới dạng một ánh xạ từ mỗi term (một từ hoặc một nhóm
từ) đến một danh sách tất cả các document chứa nó, với một vài các
thống kê về tần suất xuất hiện.
- Xapian lưu dữ liệu với một định dạng được thiết lập để cho phép tìm kiếm
với một hiệu năng nhanh nhất. Xapian không sử dụng liên kết dữ liệu
trong database gốc.
- Cấu trúc lưu trữ dữ liệu của Xapian là một backend sử dụng một cấu trúc
copy-on-write B+ tree (là một sao chép đặc biệt của cấu trúc B tree với
nhiều hiệu năng chuyên biệt).
- Xapian cho phép đánh chỉ mục một cách đồng thời các tài liệu.
2.2.3. Các thành phần của Xapian
- Document (tài liệu): document trong Xapian chính là một tài liệu được trả
về bởi một kết quả tìm kiếm. Một document gồm một định
danh (document ID) để định danh document trong database và 3 thành
phần: data, term và values.
- Document Data: là một khối dữ liệu tuỳ ý được liên kết với document.
Nó có thể được sử dụng để lưu một tham chiếu đến một document
khác (như một khoá ngoại) hoặc có thể được sử dụng để lưu toàn bộ nội
dung của document. Thông thường, document data được sử dụng để lưu
bất kỳ thông tin được dùng cho việc sắp xếp hiển thị kết quả tìm kiếm của
document.
- Term (gồm một từ hoặc một nhóm từ): là thành phần quan trọng nhất khi
tìm kiếm với Xapian. Một câu truy vấn tìm kiếm sẽ được so sánh với các
term để đưa ra danh sách các document khớp nhất với câu truy vấn.
Xapian sẽ lưu lại tần suất xuất hiện của các term cũng như vị trí xuất hiện
của nó trong mỗi document.
- Stemmers: là dạng chuẩn của một từ. Stemming là một việc thực hiện
chuyển đổi các dạng khác nhau của một từ về dạng chuẩn của từ những từ
đó. Ví dụ từ birds sẽ được chuyển thành bird. Điều này chỉ áp dụng được
với một số ngôn ngữ mà Xapian hỗ trợ.
- Values: là một dạng phức tạp hơn của term. Mỗi document có thể có một
tập các values kết nối với nó. Nó có thể được sử dụng như là một khoá
Trang 25
loại bỏ từ dừng
1148
1148
498970
498970
568
568
Tốc độ đánh chỉ mục trung bình
(MB/s)
1.8
1.8
Trang 26
CHƯƠNG 3: GIAI ĐOẠN 3
3.1. Giao diện tìm kiếm
Tùy chọn kiểu tìm kiếm:
FLAG_BOOLEAN
FLAG_PHRASE
FLAG_LOVEHATE
FLAG_WILDCARD
Sau đó nhập câu truy vấn
Kết quả truy vấn
Trang 27
Trang 28
Trang 29
Trang 30
Trang 31
Truy vấn
-dat +anh AND pham
"cong nghe" +viet
-dat +cong -tin AND "tin 3"
ban cong nghe
bach AND khoa OR (luoi AND
simple)
Ba*
Ye*
-a +b -c + 5
title:ong
"wi" AND "phong"
Thời gian trung bình
Trang 32
Tiu
URL
im
xp
hng
Ph chnh
hp xc
(P)
ph
(R)
Wikipedia:
Lng ng
http://vi.wikipedia.org/wi
ki/L%C4%83ng_
%C3%94ng
8.00917 0
0/1
0/27
Wikipedia:
n ng l
n ng
http://vi.wikipedia.org/wi
ki/%C4%90%C3%A0n_
%C3%B4ng_l%C3%A0_ 7.66269 0
%C4%91%C3%A0n_
%C3%B4ng
0/2
0/27
Wikipedia:
C Tu
http://vi.wikipedia.org/wi
ki/C%E1%BB%95_T
%E1%BA%A9u
7.50975 1
1/3
1/27
Wikipedia:
Bt Hi Cao
Vng
http://vi.wikipedia.org/wi
ki/B%E1%BB%99t_H
%E1%BA%A3i_Cao_V
%C6%B0%C6%A1ng
7.44531 1
2/4
2/27
Wikipedia:
Gio hong
Calixt I
http://vi.wikipedia.org/wi
ki/Gi%C3%A1o_ho
%C3%A0ng_Calixt
%C3%B4_I
7.43055 1
3/5
3/27
Wikipedia:
L hi
nghinh ng
http://vi.wikipedia.org/wi
ki/L%E1%BB%85_h
%E1%BB%99i_nghinh_
%C3%94ng
7.39914 0
3/6
3/27
Wikipedia:
Eurycratides
http://vi.wikipedia.org/wi
ki/Eurycratides
7.37856 1
4/7
4/27
Wikipedia:
Dng
Chiu
http://vi.wikipedia.org/wi
ki/D
%C6%B0%C6%A1ng_C
7.31823 1
5/8
5/27
Trang 33
hi%C3%AAu
Wikipedia:
Sh Shish
http://vi.wikipedia.org/wi
ki/Sh%C5%8D_Shish
%C5%8D
7.23303 0
5/9
5/27
Wikipedia:
Vologases
10
IV ca
Parthia
http://vi.wikipedia.org/wi
ki/Vologases_IV_c
%E1%BB%A7a_Parthia
7.23303 1
6/10
6/27
Wikipedia:
11 T M
Phng
http://vi.wikipedia.org/wi
ki/T%C6%B0_M
%C3%A3_Ph
%C3%B2ng
7.17449 1
7/11
7/27
Wikipedia:
Sh Shitsu
http://vi.wikipedia.org/wi
ki/Sh%C5%8D_Shitsu
7.17176 1
8/12
8/27
Wikipedia:
13
Tn Ha
http://vi.wikipedia.org/wi
ki/T%C3%B4n_H
%C3%B2a
7.14302 1
9/13
9/27
Wikipedia:
14 Chu Do
Tung
http://vi.wikipedia.org/wi
ki/Chu_Do_Tung
7.13674 1
10/14 10/27
Wikipedia:
15
Thnh Anr
http://vi.wikipedia.org/wi
ki/Th%C3%A1nh_Anr
%C3%AA
7.08647 1
11/15 11/27
Wikipedia:
16 Alcetas II
ca Ipiros
http://vi.wikipedia.org/wi
ki/Alcetas_II_c%E1%BB 7.0755
%A7a_Ipiros
Wikipedia:
17
Trn Hnh
12
11/16 11/27
http://vi.wikipedia.org/wi
ki/Tr%E1%BA%A7n_H
%E1%BA%A1nh
6.96684 1
12/17 12/27
Wikipedia:
18 Srindravarm
an
http://vi.wikipedia.org/wi
ki/Srindravarman
6.90373 0
12/18 12/27
19 Wikipedia:
Ng nh
Thc
http://vi.wikipedia.org/wi
ki/Ng%C3%B4_
%C4%90%C3%ACnh_T
6.90373 1
13/19 13/27
Trang 34
h%E1%BA%A1c
Wikipedia:
20 Nguyn
Qunh
http://vi.wikipedia.org/wi
ki/Nguy%E1%BB
%85n_Qu%E1%BB
%B3nh
6.88635 1
14/20 14/27
Wikipedia:
21 Chris
Wooding
http://vi.wikipedia.org/wi
ki/Chris_Wooding
6.86876 0
14/21 14/27
Wikipedia:
22 i Huyn
Tch
http://vi.wikipedia.org/wi
ki/%C4%90%E1%BA
%A1i_Huy%E1%BB
%81n_T%C3%ADch
6.86402 1
15/22 15/27
Wikipedia:
23 Vua ca
chu
http://vi.wikipedia.org/wi
ki/Vua_c%E1%BB
%A7a_ch%C3%A2u_
%C3%81
6.85264 1
16/23 16/27
Wikipedia:
http://vi.wikipedia.org/wi
24 Alexandros I ki/Alexandros_I_c
ca Ipiros
%E1%BB%A7a_Ipiros
6.84578 1
17/24 17/27
Wikipedia:
25 Bc Ngy
Tit Mn
http://vi.wikipedia.org/wi
ki/B%E1%BA%AFc_Ng
%E1%BB%A5y_Ti
%E1%BA%BFt_M
%E1%BA%ABn_
%C4%90%E1%BA%BF
6.82631 0
17/25 17/27
Wikipedia:
26 Nam ng
mng lc
http://vi.wikipedia.org/wi
ki/Nam_%C3%94ng_m
%E1%BB%99ng_l
%E1%BB%A5c
6.80549 0
17/26 17/27
Wikipedia:
27 Egbert ca
Wessex
http://vi.wikipedia.org/wi
ki/Egbert_c%E1%BB
%A7a_Wessex
6.7476
17/27 17/27
Wikipedia:
28 Taisei
(Ryukyu)
http://vi.wikipedia.org/wi
ki/Taisei_(Ryukyu)
6.7342
17/28 17/27
29 Wikipedia:
http://vi.wikipedia.org/wi
6.71275 1
18/29 18/27
Trang 35
H Vn Hu
ki/H%E1%BB%93_V
%C4%83n_Hu%C3%AA
Wikipedia:
30
Ai
http://vi.wikipedia.org/wi
ki/%C4%90%E1%BA
%BF_Ai
6.71178 1
19/30 19/27
Wikipedia:
Johnathan
31
Hnh
Nguyn
http://vi.wikipedia.org/wi
ki/Johnathan_H
%E1%BA%A1nh_Nguy
%E1%BB%85n
6.70706 1
20/31 20/27
Wikipedia:
32 Thc Pht
Li Lc C
http://vi.wikipedia.org/wi
ki/Th%E1%BB%91c_Ph
%C3%A1t_L%E1%BB
%A3i_L%E1%BB
%99c_C%C3%B4
6.69786 1
21/32 21/27
Wikipedia:
33 on Mnh
Giao
http://vi.wikipedia.org/wi
ki/%C4%90o
%C3%A0n_M%E1%BA
%A1nh_Giao
6.69247 1
22/33 22/27
Wikipedia:
Tn Ph
http://vi.wikipedia.org/wi
ki/T%E1%BA%A5n_Ph
%E1%BA%BF_
%C4%90%E1%BA%BF
6.68517 1
23/34 23/27
http://vi.wikipedia.org/wi
ki/Tr%C3%AC_H
Wikipedia:
35
%E1%BA%A1o_
Tr Ho in
%C4%90i%E1%BB
%81n
6.67215 1
24/35 24/27
34
Wikipedia:
John Parnell
http://vi.wikipedia.org/wi
ki/John_Parnell
6.67215 1
25/36 25/27
Wikipedia:
37 Luitpold ca
Bayern
http://vi.wikipedia.org/wi
ki/Luitpold_c%E1%BB
%A7a_Bayern
6.61511 0
25/37 25/27
36
38
Wikipedia:
Neferkara I
http://vi.wikipedia.org/wi
ki/Neferkara_I
6.59837 1
26/38 26/27
39
Wikipedia:
Samsu-Iluna
http://vi.wikipedia.org/wi
ki/Samsu-Iluna
6.5262
27/39 27/27
Trang 36
Wikipedia:
40
Trn Kin
http://vi.wikipedia.org/wi
ki/Tr%E1%BA%A7n_Ki
%E1%BA%BFn
6.5262
27/40 27/27
3.4.2. Đồ thị PR
0.8
0.7
Precision
0.6
0.5
0.4
0.3
0.2
0.1
0
0
0.2
0.4
0.6
0.8
1.2
Recall
Trang 37
case 1:
iFlag = Xapian::QueryParser::FLAG_BOOLEAN;
break;
case 2:
iFlag = Xapian::QueryParser::FLAG_PHRASE;
break;
case 3:
iFlag = Xapian::QueryParser::FLAG_LOVEHATE;
break;
case 4:
iFlag = Xapian::QueryParser::FLAG_WILDCARD;
break;
case 5:
iFlag = Xapian::QueryParser::FLAG_DEFAULT;
break;
default:
return 0;
Trang 38
3.6. Mã nguồn
#include
#include
#include
#include
#include
<xapian.h>
<stdio.h>
<iostream>
<ostream>
<string>
Trang 39
"MYINDEX_Data"
XAPIAN_FIELD_UID 0
XAPIAN_FIELD_URL 0
XAPIAN_FIELD_TITLE 1
XAPIAN_FIELD_CONTENT 2
"Kiu tm kim:\n"
"1. FLAG_BOOLEAN " << endl
"2. FLAG_PHRASE" << endl
"3. FLAG_LOVEHATE " << endl
"4. FLAG_WILDCARD" << endl
"5. FLAG_DEFAULT " << endl;
scanf("%d",&iTypeSearch);
fgets (szQuery,1000,stdin);
printf( "Cu truy vn:");
fgets (szQuery,1000,stdin);
switch(iTypeSearch)
{
case 1:
iFlag = Xapian::QueryParser::FLAG_BOOLEAN;
break;
case 2:
iFlag = Xapian::QueryParser::FLAG_PHRASE;
break;
case 3:
iFlag = Xapian::QueryParser::FLAG_LOVEHATE;
Trang 40
break;
case 4:
iFlag = Xapian::QueryParser::FLAG_WILDCARD;
break;
case 5:
iFlag = Xapian::QueryParser::FLAG_DEFAULT;
break;
default:
return 0;
Trang 41
Trang 42