// You are on page 1 of 2

#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <conio.h>
/*
 * Element-wise sum of two float arrays: c[i] = a[i] + b[i].
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread handles the single
 * element at its global index blockIdx.x * blockDim.x + threadIdx.x, so the
 * kernel gives the same result for a single-block launch as for a
 * multi-block one.
 *
 * Precondition: the signature carries no element count, so there is no
 * bounds check. The caller must launch no more threads than there are
 * elements in a, b and c (gridDim.x * blockDim.x <= array length).
 *
 * a, b : device pointers to the input arrays (read only).
 * c    : device pointer to the output array.
 */
__global__ void addmatrix(const float *a, const float *b, float *c)
{
    /* Widen before multiplying so large grids cannot overflow 32 bits. */
    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    c[idx] = a[idx] + b[idx];
}
/*
 * Abort with a readable message when a CUDA runtime call fails.
 * Process exit releases any host/device memory still held.
 */
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

/*
 * Prompt for and read a rows x cols matrix into the row-major buffer m.
 * name is the label shown in each prompt.
 * Returns 1 on success, 0 if an element could not be parsed.
 */
static int readMatrix(const char *name, float *m, int rows, int cols)
{
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("Enter the %s[%d][%d]>>", name, i, j);
            if (scanf("%f", &m[(size_t)i * cols + j]) != 1) {
                return 0;
            }
        }
    }
    return 1;
}

/*
 * Reads two row x col float matrices from stdin, adds them on the GPU with
 * the addmatrix kernel, and prints the sum one matrix row per line.
 *
 * Matrices are stored as flat row-major buffers so each one moves between
 * host and device in a single cudaMemcpy. The kernel is launched once per
 * matrix row with one block of col threads, so col must not exceed the
 * device's maximum threads per block.
 *
 * Returns 0 on success, EXIT_FAILURE on invalid input or allocation failure.
 */
int main()
{
    int row = 0, col = 0;

    printf("Enter the size(rows space col) of the matrix>>");
    if (scanf("%d %d", &row, &col) != 2 || row <= 0 || col <= 0) {
        fprintf(stderr, "Invalid matrix size\n");
        return EXIT_FAILURE;
    }

    /* One block per matrix row: the row length must fit in a single block. */
    int device = 0, maxThreads = 0;
    CUDA_CHECK(cudaGetDevice(&device));
    CUDA_CHECK(cudaDeviceGetAttribute(&maxThreads,
                                      cudaDevAttrMaxThreadsPerBlock, device));
    if (col > maxThreads) {
        fprintf(stderr, "Column count %d exceeds the device limit of %d threads per block\n",
                col, maxThreads);
        return EXIT_FAILURE;
    }

    const size_t count = (size_t)row * (size_t)col;
    const size_t bytes = count * sizeof(float);

    /* Host matrices, row-major: element (i, j) lives at index i * col + j. */
    float *a_h = (float *)malloc(bytes);
    float *b_h = (float *)malloc(bytes);
    float *c_h = (float *)malloc(bytes);
    if (a_h == NULL || b_h == NULL || c_h == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(a_h);
        free(b_h);
        free(c_h);
        return EXIT_FAILURE;
    }

    /* Data input for the arrays. */
    printf("Enter the first matrix\n");
    int ok = readMatrix("a_h", a_h, row, col);
    if (ok) {
        printf("Enter the second matrix\n");
        ok = readMatrix("b_h", b_h, row, col);
    }
    if (!ok) {
        fprintf(stderr, "Invalid matrix element\n");
        free(a_h);
        free(b_h);
        free(c_h);
        return EXIT_FAILURE;
    }

    /* Device buffers sized for the whole matrix. */
    float *a_d = NULL, *b_d = NULL, *c_d = NULL;
    CUDA_CHECK(cudaMalloc((void **)&a_d, bytes));
    CUDA_CHECK(cudaMalloc((void **)&b_d, bytes));
    CUDA_CHECK(cudaMalloc((void **)&c_d, bytes));

    /* Copy both inputs to the device, one transfer each. */
    CUDA_CHECK(cudaMemcpy(a_d, a_h, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_d, b_h, bytes, cudaMemcpyHostToDevice));

    /* Queue one single-block launch per row; launches are asynchronous. */
    for (int i = 0; i < row; i++) {
        const size_t offset = (size_t)i * col;
        addmatrix<<<1, col>>>(a_d + offset, b_d + offset, c_d + offset);
        CUDA_CHECK(cudaGetLastError());
    }

    /* Wait for all rows, surfacing any in-kernel fault, then fetch the result. */
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemcpy(c_h, c_d, bytes, cudaMemcpyDeviceToHost));

    /* Print result. */
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            printf("%f\t", c_h[(size_t)i * col + j]);
        }
        printf("\n");
    }

    /* Free the memory. */
    free(a_h);
    free(b_h);
    free(c_h);
    CUDA_CHECK(cudaFree(a_d));
    CUDA_CHECK(cudaFree(b_d));
    CUDA_CHECK(cudaFree(c_d));

    getch(); /* keep the console window open until a key is pressed */
    return 0;
}

// You might also like