Professional Documents
Culture Documents
Tai Lieu
Tai Lieu
TRN C THUN
TRN C THUN
LỜI CAM ĐOAN
Trn c Thun
MỤC LỤC
Lời cam đoan .......................................................................................................................... i
Mục lục ..................................................................................................................................ii
Danh mục bảng biểu .............................................................................................................. v
Danh mục các hình ................................................................................................................ v
Mở đầu ................................................................................................................................... 1
1. Lý do chọn đề tài. .............................................................................................................. 1
2. Mục tiêu nghiên cứu ...................................................................................................... 1
3. Phương pháp nghiên cứu ............................................................................................... 2
4. Tổng quan luận văn........................................................................................................ 2
CHƯƠNG 1-TỔNG QUAN LÝ THUYẾT VỀ PHÂN CỤM DỮ LIỆU ............................. 3
1.1. Tổng quan về phân cụm dữ liệu.................................................................................. 3
1.2. Phân cụm trong phân loại dữ liệu ............................................................................... 4
1.3. Các yêu cầu của phân cụm dữ liệu............................................................................. 6
1.4. Các kiểu dữ liệu trong phân cụm ................................................................................ 8
1.4.1. Phân loại kiểu dữ liệu dựa trên kích thước miền ................................................. 9
1.4.2. Phân loại kiểu dữ liệu dựa trên hệ đo ................................................................. 9
1.5. Các phép đo độ tương tự và khoảng cách đối với các kiểu dữ liệu .......................... 10
1.5.1. Khái niệm tương tự và phi tương tự .................................................................. 10
1.5.2. Thuộc tính khoảng cách .................................................................................... 11
1.5.3. Thuộc tính nhị phân .......................................................................................... 13
1.5.4. Thuộc tính định danh ........................................................................................ 15
1.5.5. Thuộc tính có thứ tự .......................................................................................... 16
1.5.6. Thuộc tính tỉ lệ ................................................................................................... 16
1.6. Kết luận chương ........................................................................................................ 17
CHƯƠNG 2 - KỸ THUẬT PHÂN CỤM DỮ LIỆU ỨNG DỤNG TRONG PHÂN LOẠI
CẤU TRÚC PROTEIN ....................................................................................................... 18
2.1. Giới thiệu .................................................................................................................. 18
2.2. Thuật toán K-means .................................................................................................. 18
2.3. Thuật toán PAM........................................................................................................ 22
2.4. Thuật toán CLARA ................................................................................................... 24
2.5. Thuật toán CLARANS.............................................................................................. 26
DANH MỤC CÁC HÌNH
Hình 1.1. Phân cụm các vector truy vấn .................................................... 5
Hình 1.2. Hình thành cụm cha ................................................................... 6
Hình 1.3. Các tỷ lệ khác nhau có thể dẫn tới các cụm khác nhau .............. 12
Hình 2.1 Sơ đồ phân loại các phương pháp phân cụm.. 18
Hình 2.2. Các thiết lập để xác định ranh giới các cụm ban đầu ................ 19
Hình 2.3. Tính toán trọng tâm của các cụm mới ........................................ 20
Hình 2.4 Ví dụ minh họa thuật toán K-means ........................................... 21
Hình 2.5 Ví dụ minh họa thuật toán PAM ................................................ 24
Hình 3.1. Chủ thuyết trung tâm của sinh học phân tử ............................... 30
Hình 3.2. Cấu trúc DNA ............................................................................ 30
Hình 3.3. Các kiểu cấu trúc của Protein ..................................................... 32
Hình 3.4. Cấu trúc bậc 2 thường thấy của protein ..................................... 33
Hình 3.5. Hai ví dụ về protein màng .......................................................... 34
Hình 3.6. Sự phát triển của cấu trúc dữ liệu protein .................................. 35
Hình 4.1 Đầu vào dữ liệu... 48
Hình 4.2 Giao diện chọn tệp đầu vào. 49
Hình 4.3 Giao diện tab Lọc dữ liệu.. 49
Hình 4.4 Giao diện tab chỉnh sửa dữ liệu. 50
Hình 4.5 Giao diện Tab K-Means, sử dụng K-means hoặc K-medians 51
để phân cụm
Hình 4.6 Đầu ra dữ liệu. 52
MỞ ĐẦU
1. LÝ DO CHỌN ĐỀ TÀI
Với sự phát triển vượt bậc của công nghệ thông tin, đặc biệt là ứng dụng
công nghệ thông tin vào các ngành sinh học đã giúp ích rất nhiều cho việc tìm
hiểu nghiên cứu về sinh học phân tử. Chính vì vậy Tin sinh học, một lĩnh vực
còn khá mới, đã ra đời, sử dụng các công nghệ của các ngành toán học ứng
dụng, tin học, thống kê, khoa học máy tính, trí tuệ nhân tạo, hóa học, sinh học
để giải quyết các vấn đề của sinh học.
Như chúng ta đã biết, các cơ sở phân tử của cuộc sống dựa trên hoạt
động của phân tử sinh học, bao gồm axit nucleic (DNA và RNA),
carbohydrate, chất béo, và protein. Mặc dù mỗi loại đều đóng một vai trò thiết
yếu trong cuộc sống, nhưng protein có một sự nổi bật bởi chúng là thành phần
biểu diễn chính các chức năng của tế bào. Chính vì vậy, tìm hiểu và nghiên
cứu cấu trúc phân tử sinh học nổi lên như một hướng đi mới với những trải
nghiệm hướng vào việc khám phá cấu trúc của các phân tử sinh học. Hướng
phát triển này của sinh học đã trải qua với sự phát triển cao thông qua nghiên
cứu cấu trúc với mục đích có cái nhìn toàn diện về không gian cấu trúc
protein, thông tin lưu trữ trong dữ liệu cấu trúc protein là chìa khóa thành
công nằm trong khả năng tổ chức, phân tích thông tin chứa trong cơ sở dữ
liệu, tích hợp những thông tin đó với những nỗ lực khác nhằm giải quyết
những bí ẩn của chức năng tế bào.
Nhận thấy tính thiết thực của vấn đề này và được sự gợi ý của giảng viên
hướng dẫn, em đã chọn đề tài "Phân cụm dữ liệu và ứng dụng trong phân
loại cấu trúc protein"
2. MỤC TIÊU NGHIÊN CỨU
- Tìm hiểu tổng quan về lý thuyết phân cụm dữ liệu.
- Nghiên cứu một số kỹ thuật phân cụm dữ liệu ứng dụng trong phân loại
cấu trúc protein.
CHƯƠNG 1
TỔNG QUAN LÝ THUYẾT VỀ PHÂN CỤM DỮ LIỆU
mềm, trên ổ cứng, trên mạng internet thành giúp tạo lập và hoàn chỉnh kho dữ
liệu khổng lồ về tri thức của loài người.
Là một chức năng khai phá dữ liệu, phân cụm có thể sử dụng như một
công cụ độc lập để quan sát đặc trưng của mỗi cụm thu được bên trong sự
phân bố dữ liệu và tập trung vào một tập riêng biệt của các cụm để phân
tích. Phân cụm có thể dùng như một bước tiền xử lý cho các thuật toán như
phân loại, mô tả đặc điểm, phát hiện ra các cụm với các đặc trưng, tính chất
khác nhau.
1.2. PHÂN CỤM TRONG PHÂN LOẠI DỮ LIỆU
Các mục dữ liệu tương tự nhau được nhóm lại để hình thành các cụm
trên cơ sở độ đo mức độ tương tự nào đó. Mỗi cụm được biểu diễn bởi trọng tâm
vector đặc trưng của cụm. Trong khi truy vấn, ta tính toán độ tương tự giữa
vector truy vấn và từng cụm (đại diện bởi trọng tâm cụm). Các cụm mà độ
tương tự của nó với vector truy vấn mà lớn hơn ngưỡng nào đó thì được lựa
chọn. Sau đó, độ tương tự giữa vector truy vấn với từng vector đặc trưng
trong cụm được tính toán và k mục gần nhất được xếp hạng và được xem như
kết quả.
Ví dụ, các vector đặc trưng trên hình 1.1 được nhóm vào 11 cụm. Trong
khi truy tìm, vector truy vấn được so sánh với lần lượt 11 trọng tâm cụm. Nếu
tìm thấy trọng tâm cụm 2 gần giống vector truy vấn nhất thì ta tính khoảng
cách giữa vector truy vấn với từng vector đặc trưng trong cụm 2. Tổng số tính
toán khoảng cách đòi hỏi phải nhỏ hơn nhiều tổng các vector đặc trưng trong
cơ sở dữ liệu.
$dist(x, y) = \sqrt{\sum_{i=1}^{n} (x_i - y_i)^2}$;
$d(i, j) = \frac{b + c}{a + b + c + d}$
Độ đo khoảng cách bất đối xứng của biến nhị phân:
$d(i, j) = \frac{b + c}{a + b + c}$
Các phép đo độ tương tự của các trường hợp với dữ liệu thuộc tính nhị
phân được thực hiện bằng các cách sau [8]:
+ Hệ số Jaccard:
$d(x, y) = \frac{a}{a + b + c}$; tham số này bỏ qua số các đối sánh 0 - 0.
$d(x, y) = \frac{p - m}{p}$;
trong đó, m là số thuộc tính đối sánh tương ứng trùng nhau, và p là tổng
số các thuộc tính.
1.5.5. Thuộc tính có thứ tự
Phép đo độ phi tương tự giữa các đối tượng dữ liệu với thuộc tính thứ tự
được thực hiện như sau [8]:
Giả sử i là thuộc tính thứ tự có $M_i$ giá trị ($M_i$ là kích thước miền giá trị):
Các trạng thái $M_i$ được sắp thứ tự như nhau: $[1...M_i]$, có thể thay thế mỗi
giá trị của thuộc tính bằng giá trị cùng loại $r_i$ với $r_i \in \{1...M_i\}$.
Mỗi một thuộc tính có thứ tự có các miền giá trị khác nhau, vì vậy phải
chuyển đổi chúng về cùng miền giá trị [0, 1] bằng cách thực hiện phép biến
đổi sau cho mỗi thuộc tính:
$Z_i^{(j)} = \frac{r_i^{(j)} - 1}{M_i - 1}$;
$d(x, y) = \sqrt{\sum_{i=1}^{n} w_i (x_i - y_i)^2}$
CHƯƠNG 2
KỸ THUẬT PHÂN CỤM DỮ LIỆU ỨNG DỤNG TRONG
PHÂN LOẠI CẤU TRÚC PROTEIN
tối thiểu [8].
Trong đó: $m_i$ là trung bình cụm của cụm $C_i$, x là điểm dữ liệu đại diện
cho một đối tượng. Trọng tâm của cụm là một vector, trong đó giá trị của mỗi
phần tử của nó là trung bình cộng của các thành phần tương ứng của các đối
tượng vector dữ liệu trong cụm đang xét. Tham số đầu vào của thuật toán là
số cụm k, và tham số đầu ra của thuật toán là các trọng tâm của các cụm dữ
liệu. Độ đo khoảng cách giữa các đối tượng dữ liệu thường được sử dụng là
Input: Số cụm k và các trọng tâm cụm $\{m_j\}_{j=1}^{k}$
độ đo khoảng cách, thuật toán PAM được áp dụng cho dữ liệu không gian.
Để xác định các đại diện, PAM bắt đầu bằng cách lựa chọn k đối tượng đại diện
bất kỳ. Sau mỗi bước thực hiện, PAM cố gắng hoán chuyển giữa đối tượng
đại diện Om và một đối tượng Op, không phải là đại diện, miễn là sự hoán
chuyển này nhằm cải tiến chất lượng của phân cụm, quá trình này kết thúc khi
chất lượng phân cụm không thay đổi. Chất lượng phân cụm được đánh giá
thông qua hàm tiêu chuẩn, chất lượng phân cụm tốt nhất khi hàm tiêu chuẩn
đạt giá trị tối thiểu. PAM tính giá trị Cjmp cho tất cả
các đối tượng Oj để làm căn cứ cho việc hoán chuyển giữa Om và Op.
Om: là đối tượng đại diện hiện thời cần được thay thế:
Op: là đối tượng đại diện mới thay thế cho Om;
Oj: Là đối tượng dữ liệu (Không phải đại diện) có thể được di chuyển
sang cụm khác;
Oj,2: Là đối tượng đại diện hiện thời gần đối tượng Oj nhất
TCmp là tổng khoảng cách giữa đối tượng đại diện hiện thời Op và đối
tượng đại diện Om mới thay thế Op
Các bước thực hiện thuật toán PAM
Input: Tập dữ liệu có n phần tử, số cụm k.
Output: k cụm dữ liệu sao cho chất lượng phân hoạch là tốt nhất.
BEGIN
1. Chọn k đối tượng đại diện bất kỳ;
2. Tính TCmp cho tất cả các cặp đối tượng Om, Op. Trong đó, Om là đối
tượng đại diện và Op là đối tượng không phải đại diện;
3. Chọn cặp đối tượng Om và Op. Tính MinOm, MinOp, TCmp, nếu TCmp là
âm thay thế Om bởi Op và quay lại bước 2. Nếu TCmp dương, chuyển sang
bước 4;
CHƯƠNG 3
TIN SINH HỌC VÀ PHÂN LOẠI CẤU TRÚC PROTEIN
CHƯƠNG 4
CHƯƠNG TRÌNH DEMO VỚI PHẦN MỀM CLUSTERS 3.0
KẾT LUẬN
Trong bản luận văn này tôi đã tìm hiểu, nghiên cứu một số vấn đề sau:
- Luận văn trình bày lý thuyết cơ bản về phân cụm dữ liệu, và một số thuật
toán phân cụm dữ liệu dựa vào cụm trung tâm ứng dụng vào phân loại cấu trúc
protein.
- Giới thiệu về Protein, cấu trúc, chức năng của protein, một số phương
pháp phân loại cấu trúc protein
- Luận văn sử dụng chương trình Cluster 3.0 với mục đích biểu diễn
phân cụm dữ liệu sử dụng thuật toán K-means và K-medians. Chương trình
có sử dụng thư viện phân cụm C áp dụng các thuật toán K-means, và K-
medians để tìm trung tâm cụm, trung vị cụm và xử lý phân cụm dựa trên hai
thuật toán trên.
Với nhiệm vụ là nghiên cứu tìm hiểu việc áp dụng một số thuật toán phân
cụm để phân loại cấu trúc protein. Tuy nhiên chương trình có nhiều hạn chế:
- Chương trình mới chỉ sử dụng đầu vào là tập tin .txt (có thể mở với
excel) chưa sử dụng định dạng của protein từ các ngân hàng dữ liệu protein
PDB để phân loại.
- Chương trình mới chỉ dừng ở việc sử dụng 2 thuật toán K-means và
K-medians.
HƯỚNG NGHIÊN CỨU TRONG THỜI GIAN TỚI
Trong tương lai đề tài phát triển theo hướng nghiên cứu phân loại Protein
với phân loại trình tự, phân loại cấu trúc của protein.
Tìm hiểu ngân hàng dữ liệu protein và sử dụng nguồn dữ liệu này trong
chương trình.
Cơ sở dữ liệu protein là rất lớn và đa dạng cho nhiều loại, vì vậy cần sử
dụng các thuật toán có tốc độ xử lý tốt hơn.
Tiếng Việt
[1] Lưu Thị Hải Yến (2008), Luận văn Thạc sĩ - Đề tài: Nghiên cứu phát
triển hệ thống đa phương tiện trên cơ sở phân cụm dữ liệu, Đại học Thái
Nguyên- Khoa công nghệ thông tin.
[2] Nguyễn Q.D., Trần Đ.H., Trần T.T.B., Phạm T.H. (2011). Dự đoán
chức năng protein bằng phương pháp phân cụm dữ liệu, Đặc san CNTT, Tạp
chí Khoa học ĐHSPHN, 56, 3-16.
Tiếng Anh
[3] A.K. Jain, M.N. Murty and P.J. Flynn (1999), Data Clustering: A
Review, ACM Computing Surveys.
[4] A.K Jain, Richard C. Dubes (1988), Algorithms for Clustering Data,
Prentice Hall, Englewood Cliffs, New Jersey 07632.
[5] Patrice Koehl (2006), Protein Structure Classification, Department of
Computer Science and Genome Center, University of California, Davis,
California.
[6] Michiel Jan Laurens de Hoon (2010), The C Clustering Library for
cDNA microarray data, The University of Tokyo, Institute of Medical
Science, Human Genome Center, University of Tokyo, 4-6-1 Shirokanedai,
Minato-ku, Tokyo 108-8639, Japan.
[7] Michael Eisen; updated by Michiel de Hoon (2002), Cluster 3.0
Manual, Human Genome Center, University of Tokyo, 4-6-1 Shirokanedai,
Minato-ku, Tokyo 108-8639, Japan.
[8] Micheline Kamber and Jiawei Han (2001), Data Mining: Concepts
and Techniques. Morgan Kaufmann Publishers, University of Illinois at
Urbana-Champaign, 500 Sansome Street, Suite 400, San Francisco, CA 94111
PHỤ LỤC
/* ********HÀM GETCLUSTERCENTROIDS*******/
/* **********THỦ TỤC GETCLUSTERMEANS******* */
}
}
}
else
{ for (i = 0; i < nrows; i++)
{ for (j = 0; j < nclusters; j++)
{ cdata[i][j] = 0.;
cmask[i][j] = 0;
}
}
for (k = 0; k < ncolumns; k++)
{ i = clusterid[k];
for (j = 0; j < nrows; j++)
{ if (mask[j][k] != 0)
{ cdata[j][i]+=data[j][k];
cmask[j][i]++;
}
}
}
for (i = 0; i < nrows; i++)
{ for (j = 0; j < nclusters; j++)
{ if (cmask[i][j]>0)
{ cdata[i][j] /= cmask[i][j];
cmask[i][j] = 1;
}
}
}
}
}
/* ****************THỦ TỤC GETCLUSTERMEDIANS***********/
static void
getclustermedians(int nclusters, int nrows, int ncolumns, double** data,
int** mask, int clusterid[], double** cdata, int** cmask,int transpose,
double cache[])
{ int i, j, k;
if (transpose==0)
{ for (i = 0; i < nclusters; i++)
{ for (j = 0; j < ncolumns; j++)
{ int count = 0;
for (k = 0; k < nrows; k++)
{ if (i==clusterid[k] && mask[k][j])
{ cache[count] = data[k][j];
count++;
}
}
if (count>0)
{ cdata[i][j] = median(count,cache);
cmask[i][j] = 1;
}
else
{ cdata[i][j] = 0.;
cmask[i][j] = 0;
}
}
}
}
else
int i;
int ok;
int* tclusterid;
int* mapping = NULL;
double** cdata;
int** cmask;
int* counts;
*ifound = -1;
return;
}
mapping = malloc(nclusters*sizeof(int));
if (!mapping)
{ free(counts);
free(tclusterid);
return;
}
for (i = 0; i < nelements; i++) clusterid[i] = 0;
}
if (method=='m')
{ double* cache = malloc(nelements*sizeof(double));
if(cache)
{ *ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight,
transpose, npass, dist, cdata, cmask, clusterid,
error,
tclusterid, counts, mapping, cache);
free(cache);
}
}
else
*ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight,
transpose, npass, dist, cdata, cmask, clusterid,
error,
tclusterid, counts, mapping);
free(counts);
}
/* *********************************************************************
*/
static int
kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
*error = DBL_MAX;
do
{ double total = DBL_MAX;
int counter = 0;
int period = 10;
if (npass<=1)
{ *error = total;
break;
}
free(saved);
return ifound;
}
/* ----------------------------------------------------------------------
*/
static int
kmedians(int nclusters, int nrows, int ncolumns, double** data, int**
mask,
double weight[], int transpose, int npass, char dist,
double** cdata, int** cmask, int clusterid[], double* error,
*error = DBL_MAX;
do
{ double total = DBL_MAX;
int counter = 0;
int period = 10;
{ double tdistance;
if (j==k) continue;
tdistance =
metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
if (tdistance < distance)
{ distance = tdistance;
counts[tclusterid[i]]--;
tclusterid[i] = j;
counts[j]++;
}
}
total += distance;
}
if (total>=previous) break;
/* total>=previous is FALSE on some machines even if total and
previous
* are bitwise identical. */
for (i = 0; i < nelements; i++)
if (saved[i]!=tclusterid[i]) break;
if (i==nelements)
break; /* Identical solution found; break out of this loop */
}
if (npass<=1)
{ *error = total;
break;
}
free(saved);
return ifound;
}
/* *********************************************************************
*/