%Matlab Program

%last updated: 11/01/2010
%Copyright: Kevin Bleakley 2010.
%Purpose: script to do drug-target interaction prediction. Uses 10 fold cross-validation.
%Requires: the libsvm package (https://www.csie.ntu.edu.tw/~cjlin/libsvm/).
%The actual datasets we used, in the correct format, are downloadable
%from the same page you got this file, so you can see them run before trying to use your own data.
%Output: gives two continuously-valued scores for each putative drug-target interaction. Also,
%we calculate ROC curves and Area under them (AUC).
%General remarks:
%1) The code should work on similarity matrices also, and not just positive semi-definite
% kernel matrices.
%2) You have to have pre-calculated your similarity/kernel matrices in advance.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%
%here we load the dataset(s) and, as they are textfiles converted from a format needing to be read by
%"R", they have one extra column to be removed before proceeding. You have to comment or
%uncomment the data set that you want to use.
% %1) ENZYMES
% %adjacency matrix:
% load e_admat_dgc2.txt
% y = e_admat_dgc2(:,2:(size(e_admat_dgc2,2)));
% %compound similarity matrix:
% load e_simmat_dc2.txt
% kCompound = e_simmat_dc2(:,2:(size(e_simmat_dc2,2)));
% %target similarity matrix:
% load e_simmat_dg2.txt
% kTarget = e_simmat_dg2(:,2:(size(e_simmat_dg2,2)));
% %2) ION CHANNELS

% load ic_admat_dgc2.txt
% y = ic_admat_dgc2(:,2:(size(ic_admat_dgc2,2)));
% load ic_simmat_dc2.txt
% kCompound = ic_simmat_dc2(:,2:(size(ic_simmat_dc2,2)));
% load ic_simmat_dg2.txt
% kTarget = ic_simmat_dg2(:,2:(size(ic_simmat_dg2,2)));
% %3)GPCRs
% load gpcr_admat_dgc2.txt
% y = gpcr_admat_dgc2(:,2:(size(gpcr_admat_dgc2,2)));
% load gpcr_simmat_dc2.txt
% kCompound = gpcr_simmat_dc2(:,2:(size(gpcr_simmat_dc2,2)));
% load gpcr_simmat_dg2.txt
% kTarget = gpcr_simmat_dg2(:,2:(size(gpcr_simmat_dg2,2)));
%4)NUCLEAR RECEPTORS
%Each text file is loaded into a variable named after the file. The first
%column is the extra column left over from the "R" export, so only columns
%2:end are kept.
%adjacency matrix (rows are treated as targets and columns as compounds by
%the cross-validation loops further down):
load nr_admat_dgc2.txt
y = nr_admat_dgc2(:,2:end);
%compound similarity matrix:
%(file name written without quotes, like every other load call in this script)
load nr_simmat_dc2.txt
kCompound = nr_simmat_dc2(:,2:end);
%target similarity matrix:
load nr_simmat_dg2.txt
kTarget = nr_simmat_dg2(:,2:end);
%We had problems with kCompound having imaginary eigenvalues. Here, we symmetrize kCompound
%before continuing:
kCompound = (kCompound + kCompound')/2;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Checking for positive semi-definite. This is because, at present, these are "similarity" matrices rather
%than "kernel" matrices. For now, we just add increments of a small epsilon to the diagonal, until the
%matrix becomes positive semi-definite.
%
%kTarget is symmetrized first. Adding epsilon*I only shifts eigenvalues along
%the real axis, so a non-symmetric matrix with complex eigenvalues would never
%pass the isreal() test below and the loop would not terminate. For a matrix
%that is already exactly symmetric this line changes nothing.
kTarget = (kTarget + kTarget')/2;
epsilon = .1;

%Compound kernel: shift the diagonal until every eigenvalue is real and >= 0.
eigCompound = eig(kCompound);
while ~isreal(eigCompound) || ~all(eigCompound >= 0)
    kCompound = kCompound + epsilon*eye(size(kCompound,1));
    eigCompound = eig(kCompound);
end

%Target kernel: same procedure.
eigTarget = eig(kTarget);
while ~isreal(eigTarget) || ~all(eigTarget >= 0)
    kTarget = kTarget + epsilon*eye(size(kTarget,1));
    eigTarget = eig(kTarget);
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%initialise some variables:
%kernel sizes (number of rows of each similarity matrix)
lengthKCompound = size(kCompound,1);
lengthKTarget = size(kTarget,1);

%score matrices filled by the two cross-validation passes, and matching
%matrices that record how many positive training labels each prediction had
myPredictions = zeros(lengthKCompound, size(y,1));
numbEdges1 = zeros(lengthKCompound, size(y,1));
myPredictions2 = zeros(lengthKTarget, size(y,2));
numbEdges2 = zeros(lengthKTarget, size(y,2));

%randomly permute the order of the compounds (firstRP) and of the targets
%(secondRP), and apply the same re-ordering to both kernels and to y:
firstRP = randperm(lengthKCompound);
secondRP = randperm(lengthKTarget);
y = y(secondRP,firstRP);
kTarget = kTarget(secondRP,secondRP);
kCompound = kCompound(firstRP,firstRP);

%and we have to save the 'inverse' of these permutations: the second output
%of sort gives, for each original position, where it ended up.
[myback1, mb1] = sort(firstRP);
[myback2, mb2] = sort(secondRP);
%define the number of cross-validation splits:
numOfSplits = 10;
%0,1,...,numOfSplits-1 as a column; fold r starts at size*(r-1)+1 and ends at size*r
foldOffsets = (0:(numOfSplits-1))';

%theseIndices: [first last] index of each fold along the rows of y.
%Every fold has floor(rows/numOfSplits) items; the last fold is stretched
%to the final row so that any remainder is absorbed there.
thebasic = floor(size(y,1)/numOfSplits);
theseIndices = [thebasic*foldOffsets + 1, thebasic*(foldOffsets + 1)];
theseIndices(numOfSplits,2) = size(y,1);

%theseIndices2: the same construction along the columns of y.
thebasic2 = floor(size(y,2)/numOfSplits);
theseIndices2 = [thebasic2*foldOffsets + 1, thebasic2*(foldOffsets + 1)];
theseIndices2(numOfSplits,2) = size(y,2);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Fixing each target in turn, then doing ten fold cross validation
%on the set of compounds, to predict which ones
%are/are not targeting the target in question.
%Results: myPredictions(j,i) is the score for compound j against target i;
%numbEdges1(j,i) is the number of positive labels in the training set used.

%these are the weights for positive and negative examples. They alter the behavior of libsvm.
%NOTE(review): the ROC code later compares the labels against 0 and 1; if y is
%0/1 then '-w-1' names a class that is not present. With both weights at 1
%this makes no difference - confirm before changing the weights.
m2a = 1;
m2b = 1;
%'-t 4' selects libsvm's precomputed-kernel mode.
svmOptions = ['-t 4 -c 1 -w1 ',num2str(m2a),' -w-1 ',num2str(m2b)];

for i = 1:size(y,1)
    %progress display
    i
    pause(.3)
    currentY = y(i,:)';
    for j = 1:lengthKCompound
        %fold that compound j belongs to, and the compounds left for training
        whatSplit = sum(j >= theseIndices2(:,1));
        heldOut = theseIndices2(whatSplit,1):theseIndices2(whatSplit,2);
        trainIdx = setdiff(1:lengthKCompound, heldOut);
        numTrainPos = sum(currentY(trainIdx) == 1);
        if numTrainPos > 0
            %defining the train and test matrices that will be input into libsvm.
            %Precomputed-kernel input carries a leading column of sample
            %serial numbers in front of the kernel values.
            K1 = [(1:numel(trainIdx))' kCompound(trainIdx,trainIdx)];
            K2 = [j kCompound(j,trainIdx)];
            %training:
            model = svmtrain(currentY(trainIdx), K1, svmOptions);
            %testing:
            [predict_label,accuracy,dec_values] = svmpredict(currentY(j), K2, model);
            %this line is to fix a weakness in libsvm, the first training label is always
            %automatically labeled +1, even if it's -1 ! trainIdx is sorted, so
            %trainIdx(1) is the first training sample handed to svmtrain.
            firstLabel = currentY(trainIdx(1));
            myPredictions(j,i) = dec_values*sign(firstLabel - 1/2);
            numbEdges1(j,i) = numTrainPos;
        else
            %no positive example to learn from: fall back to a fixed score
            myPredictions(j,i) = -1;
            numbEdges1(j,i) = 0;
        end
    end
end
%un-permuting the row and column orders:
%myPredictions is indexed (compound, target); ytemp is y put back into its
%original order, indexed (target, compound).
myPredictions = myPredictions(mb1,mb2);
ytemp = y(mb2,mb1);

%Turning myPredictions and "y" into vectors:
%both row vectors list all targets for compound 1, then all targets for
%compound 2, and so on, so scores and labels line up element by element.
myScores = reshape(myPredictions', 1, []);
myTrueLabels = reshape(ytemp, 1, []);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Fixing each compound in turn, then doing ten fold cross validation
%on the set of targets, to predict which ones
%are/are not targeted by the compound in question.
%Results: myPredictions2(j,i) is the score for target j against compound i;
%numbEdges2(j,i) is the number of positive labels in the training set used.

%these are the weights for positive and negative examples. They alter the behavior of
%libsvm.
m2a = 1;
m2b = 1;
%'-t 4' selects libsvm's precomputed-kernel mode.
svmOptions = ['-t 4 -c 1 -w1 ',num2str(m2a),' -w-1 ',num2str(m2b)];

for i = 1:size(y,2)
    currentY = y(:,i);
    for j = 1:lengthKTarget
        %fold that target j belongs to, and the targets left for training.
        %The target folds live in theseIndices (theseIndices2 holds the
        %compound folds and must not be used here).
        whatSplit = sum(j >= theseIndices(:,1));
        heldOut = theseIndices(whatSplit,1):theseIndices(whatSplit,2);
        trainIdx = setdiff(1:lengthKTarget, heldOut);
        numTrainPos = sum(currentY(trainIdx) == 1);
        if numTrainPos > 0
            %matrices to input to libsvm (leading column = sample serial numbers):
            K1 = [(1:numel(trainIdx))' kTarget(trainIdx,trainIdx)];
            K2 = [j kTarget(j,trainIdx)];
            %training:
            model = svmtrain(currentY(trainIdx), K1, svmOptions);
            %testing:
            [predict_label,accuracy,dec_values] = svmpredict(currentY(j), K2, model);
            %fixing libsvms desire to label -1 examples as +1 examples if they are the first
            %item in the training set. trainIdx is sorted, so trainIdx(1) is that first item.
            firstLabel = currentY(trainIdx(1));
            myPredictions2(j,i) = dec_values*sign(firstLabel - 1/2);
            numbEdges2(j,i) = numTrainPos;
        else
            %no positive example to learn from: fall back to a fixed score
            myPredictions2(j,i) = -1;
            numbEdges2(j,i) = 0;
        end
    end
end
%un-permuting the row and column orders:
%myPredictions2 and y are both indexed (target, compound); mb2 / mb1 undo the
%target / compound permutations.
myPredictions2 = myPredictions2(mb2,mb1);
y = y(mb2,mb1);

%Turning myPredictions and "y" into vectors:
%row by row - all compounds for target 1, then for target 2, ...
myScores2 = reshape(myPredictions2', 1, []);
myTrueLabels2 = reshape(y', 1, []);

%and again, but this time by column, (for later):
%all targets for compound 1, then for compound 2, ...
myScores2b = reshape(myPredictions2, 1, []);
myTrueLabels2b = reshape(y, 1, []);
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%AREA UNDER ROC CURVE for fixing targets in turn:
%Sweep an offset over [-2.5, 2.5]; at each offset a pair is called positive
%when score + offset > 0, and the true/false positives are counted.
%NOTE(review): scores outside (-2.5, 2.5) are never separated by this sweep -
%confirm the range suits the decision values produced.
myNumb = length((-2.5):.001:(2.5));
TruPos = zeros(1,myNumb);
FalPos = zeros(1,myNumb);
countMan = 0;
for moveit = (-2.5):.001:(2.5)
    countMan = countMan + 1;
    calledPositive = sign(myScores + moveit)==1;
    TruPos(countMan) = sum(calledPositive & myTrueLabels==1);
    FalPos(countMan) = sum(calledPositive & myTrueLabels==0);
end
plot(FalPos/max(FalPos),TruPos/max(TruPos),'r');
%explicit "on": a bare "hold" toggles, so its effect would depend on the
%state the figure was left in before this script ran.
hold on;
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %calculating the area under the ROC curve:
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
xx = FalPos/max(FalPos);
yy = TruPos/max(TruPos);
areaUnderROC = 0;
old = [0 0];
%Walk along the curve from (0,0), remembering the last accepted point.
for bb = 1:myNumb
    new = [xx(bb) yy(bb)];
    if new(1) == old(1) && new(2) > old(2)
        %vertical step: no area
        old = new;
    elseif new(1) > old(1) && new(2) > old(2)
        %diagonal step: trapezoid
        areaUnderROC = areaUnderROC + ((old(2) + new(2))/2) * (new(1) - old(1));
        old = new;
    elseif new(1) > old(1) && new(2) == old(2)
        %horizontal step: rectangle
        areaUnderROC = areaUnderROC + old(2) * (new(1) - old(1));
        old = new;
    end
end
areaUnderROC
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %AREA UNDER ROC CURVE for fixing compounds in turn:
%Same offset sweep as for the target-wise scores, applied to myScores2.
myNumb = length((-2.5):.001:(2.5));
TruPos2 = zeros(1,myNumb);
FalPos2 = zeros(1,myNumb);
countMan = 0;
for moveit = (-2.5):.001:(2.5)
    %advance the output slot; without this the first write would go to index 0
    countMan = countMan + 1;
    calledPositive = sign(myScores2 + moveit)==1;
    TruPos2(countMan) = sum(calledPositive & myTrueLabels2==1);
    FalPos2(countMan) = sum(calledPositive & myTrueLabels2==0);
end
%keep whatever curve is already on the axes and draw this one on top
hold on;
plot(FalPos2/max(FalPos2),TruPos2/max(TruPos2),'b');
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
xx = FalPos2/max(FalPos2);
yy = TruPos2/max(TruPos2);
%note: this re-uses (and overwrites) the variable areaUnderROC
areaUnderROC = 0;
old = [0 0];
%Walk along the curve from (0,0), remembering the last accepted point.
for bb = 1:myNumb
    new = [xx(bb) yy(bb)];
    if new(1) == old(1) && new(2) > old(2)
        %vertical step: no area
        old = new;
    elseif new(1) > old(1) && new(2) > old(2)
        %diagonal step: trapezoid
        areaUnderROC = areaUnderROC + ((old(2) + new(2))/2) * (new(1) - old(1));
        old = new;
    elseif new(1) > old(1) && new(2) == old(2)
        %horizontal step: rectangle
        areaUnderROC = areaUnderROC + old(2) * (new(1) - old(1));
        old = new;
    end
end
areaUnderROC
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %AREA UNDER ROC CURVE for both situations added together:
%various ways of combining the two predictions made for each edge:
%1) adding them:
%myPredictions3 = myPredictions + myPredictions2';
%2) taking the largest:
%(myPredictions2 is transposed so that both operands are compound-by-target)
myPredictions3 = max(myPredictions,myPredictions2');

%Flatten scores and labels in the same order: all targets for compound 1,
%then all targets for compound 2, and so on.
myScores3 = reshape(myPredictions3', 1, []);
myTrueLabels3 = reshape(y, 1, []);

%Offset sweep over [-5, 5] (wider than for the single passes).
myNumb = length((-5):.001:(5));
TruPos3 = zeros(1,myNumb);
FalPos3 = zeros(1,myNumb);
countMan = 0;
for moveit = (-5):.001:(5)
    %advance the output slot; without this the first write would go to index 0
    countMan = countMan + 1;
    calledPositive = sign(myScores3 + moveit)==1;
    TruPos3(countMan) = sum(calledPositive & myTrueLabels3==1);
    FalPos3(countMan) = sum(calledPositive & myTrueLabels3==0);
end
%draw on top of the curves already on the axes, then release the hold
hold on;
plot(FalPos3/max(FalPos3),TruPos3/max(TruPos3),'g');
hold off;
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
xx3 = FalPos3/max(FalPos3);
yy3 = TruPos3/max(TruPos3);
areaUnderROC3 = 0;
old3 = [0 0];
%Walk along the curve from (0,0), remembering the last accepted point.
for bb = 1:length(xx3)
    new3 = [xx3(bb) yy3(bb)];
    if new3(1) == old3(1) && new3(2) > old3(2)
        %vertical step: no area
        old3 = new3;
    elseif new3(1) > old3(1) && new3(2) > old3(2)
        %diagonal step: trapezoid
        areaUnderROC3 = areaUnderROC3 + ((old3(2) + new3(2))/2) * (new3(1) - old3(1));
        old3 = new3;
    elseif new3(1) > old3(1) && new3(2) == old3(2)
        %horizontal step: rectangle
        areaUnderROC3 = areaUnderROC3 + old3(2) * (new3(1) - old3(1));
        old3 = new3;
    end
end
areaUnderROC3
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%here we calculate area under precision-recall curve, only for the 3rd "max" data set.
%by copying and pasting and changing all the '3' to '1' or '2',
%you can do this for the TruePos1 etc., and TruePos2 etc.
%FDR curves and/or precision-recall curves:

%x axis: TruPos3 scaled by its maximum; yFDR: share of called positives that
%are false; yPR: its complement.
xFDRPR = TruPos3/max(TruPos3);
yFDR = FalPos3./(FalPos3 + TruPos3);
yPR = 1 - yFDR;
%Both curves go into a fresh figure with hold on; plotting them one after the
%other on released axes would leave only the second one visible and would
%replace whatever was drawn there before.
figure;
plot(xFDRPR,yFDR);
hold on;
plot(xFDRPR,yPR);
hold off;

%area under precision-recall curve: (actually it's a code for area under the FDR curve, so
%in the last line we do "1 - result".
xxPR = TruPos3/max(TruPos3);
%the tiny term only kicks in where nothing is called positive, turning 0/0 into 0
yyPR = FalPos3./(FalPos3 + TruPos3 + .000000001*(FalPos3==0 & TruPos3==0));
xRun = 0;       %largest x value seen so far
addToSum = 0;   %running area under the FDR curve
myXa = 0;       %x value at which the last trapezoid ended
for myCounta = 1:numel(xxPR)
    if xxPR(myCounta) > xRun && myCounta > 1
        %x moved to the right: add a trapezoid between this point and the previous one
        myXb = xxPR(myCounta);
        addToSum = addToSum + (myXb - myXa)*(yyPR(myCounta - 1) + yyPR(myCounta))/2;
        myXa = myXb;
        xRun = xxPR(myCounta);
    elseif xxPR(myCounta) > xRun && myCounta == 1
        %first point already to the right of 0: start integrating from here
        myXa = xxPR(1);
        xRun = xxPR(myCounta);
    end
end
AUPR = 1 - addToSum
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%

Matlab Program

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Matlab Program

Uploaded by

Copyright:

Available Formats

%last updated: 11/01/2010z

%Copyright: Kevin Bleakley 2010.

%Purpose: script to do drug-target interaction prediction. Uses 10 fold cross-validation.

%Requires: the libsvm package. [GIVE URL!]

%The actual datasets we used in the correct format are downloadable

% %2) ION CHANNELS

%intialise some variables:

%randomly permute the order of the drugs and compounds.:

%and we have to save the 'inverse' of these permutations:

%define the number of cross-validation splits:

%un-permuting the row and column orders:

%Turning myPredictions and "y" into vectors:

%un-permuting the row and column orders:

%Turning myPredictions and "y" into vectors:

%and again, but this time by column, (for later):

if new(1) == old(1) & new(2) > old(2)

elseif new(1) > old(1) & new(2) > old(2)

elseif new(1) > old(1) & new(2) == old(2)

if new(1) == old(1) & new(2) > old(2)

elseif new(1) > old(1) & new(2) > old(2)

elseif new(1) > old(1) & new(2) == old(2)

if new3(1) == old3(1) & new3(2) > old3(2)

elseif new3(1) > old3(1) & new3(2) > old3(2)

%FDR curves and/or precision-recall curves:

You might also like