You are on page 1 of 2

// Matrix Transpose with single block

#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#include <cuda.h>

// Transposes a row-major matrix, one element per thread, within one block.
//
// Expected launch layout: a single block whose blockDim.x equals the number
// of input columns and whose blockDim.y equals the number of input rows.
// No shared memory is used.
//
// input  : device pointer to the rows x col source matrix (row-major).
// output : device pointer to the col x rows destination (row-major); must
//          hold at least rows * col floats and must not alias `input`.
// col    : number of columns in `input`.
__global__ void Matrix_Transpose(float *input , float *output, int col)
{
    int idx = threadIdx.x;          // column of the input element
    int idy = threadIdx.y;          // row of the input element
    // The row count is not a parameter; it is taken from the launch shape.
    int row = blockDim.y;

    // Guard against a block that is wider than the matrix.
    if (idx < col)
    {
        // Element (idy, idx) of the input becomes element (idx, idy) of the
        // output. The output has `row` values per line, so its stride is
        // `row`, not `col`.
        output[idx * row + idy] = input[idy * col + idx];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is an error, false on cudaSuccess.
static bool cudaFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return false;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return true;
}

// Reads a row x col matrix from stdin, transposes it on the GPU with a
// single thread block (one thread per element) and prints both matrices.
//
// Returns EXIT_SUCCESS when the transpose was computed and printed, and
// EXIT_FAILURE on invalid input, allocation failure or any CUDA error.
int main()
{
    float *a_h = NULL, *c_h = NULL;   // host input / host result
    float *a_d = NULL, *c_d = NULL;   // device input / device result
    int row = 0, col = 0;
    int exitCode = EXIT_FAILURE;

    // Single-pass loop: `break` jumps to the shared cleanup code below.
    do
    {
        printf("Enter the size of the matrix>>");
        if (scanf("%d %d", &row, &col) != 2 || row <= 0 || col <= 0)
        {
            fprintf(stderr, "Invalid matrix size\n");
            break;
        }

        // The whole matrix is mapped onto one block, so it must respect the
        // device's per-block thread limits.
        int device = 0;
        cudaDeviceProp prop;
        if (cudaFailed(cudaGetDevice(&device), "cudaGetDevice"))
            break;
        if (cudaFailed(cudaGetDeviceProperties(&prop, device), "cudaGetDeviceProperties"))
            break;
        if (col > prop.maxThreadsDim[0] || row > prop.maxThreadsDim[1] ||
            row > prop.maxThreadsPerBlock / col)
        {
            fprintf(stderr, "Matrix too large for a single block (at most %d elements)\n",
                    prop.maxThreadsPerBlock);
            break;
        }

        const size_t count = (size_t)row * (size_t)col;
        const size_t bytes = count * sizeof(float);

        // Memory allocation on host
        a_h = (float*)malloc(bytes);
        c_h = (float*)malloc(bytes);
        if (a_h == NULL || c_h == NULL)
        {
            fprintf(stderr, "Host memory allocation failed\n");
            break;
        }

        // Memory allocation on device
        if (cudaFailed(cudaMalloc((void**)&a_d, bytes), "cudaMalloc(a_d)"))
            break;
        if (cudaFailed(cudaMalloc((void**)&c_d, bytes), "cudaMalloc(c_d)"))
            break;

        // Input from user
        printf("Enter the matix\n");
        bool inputOk = true;
        for (size_t i = 0; i < count; i++)
        {
            if (scanf("%f", &a_h[i]) != 1)
            {
                inputOk = false;
                break;
            }
        }
        if (!inputOk)
        {
            fprintf(stderr, "Invalid matrix element\n");
            break;
        }

        // Display the matrix
        for (int i = 0; i < row; i++)
        {
            for (int j = 0; j < col; j++)
            {
                printf("%f\t", a_h[i * col + j]);
            }
            printf("\n");
        }

        // Copying the data to device
        if (cudaFailed(cudaMemcpy(a_d, a_h, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)"))
            break;

        // Defining block size: x spans the columns, y spans the rows
        dim3 dimBlock(col, row, 1);

        // Kernel call
        Matrix_Transpose<<< 1 , dimBlock >>>(a_d, c_d, col);
        // Launch-configuration errors are only visible through cudaGetLastError.
        if (cudaFailed(cudaGetLastError(), "Matrix_Transpose launch"))
            break;

        // Sending data to host from device. This blocking copy also reports
        // any error raised while the kernel was running.
        if (cudaFailed(cudaMemcpy(c_h, c_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)"))
            break;

        // Displaying transpose matrix: it has `col` rows of `row` values each
        printf("\n");
        for (int i = 0; i < col; i++)
        {
            for (int j = 0; j < row; j++)
            {
                printf("%f\t", c_h[i * row + j]);
            }
            printf("\n");
        }

        exitCode = EXIT_SUCCESS;
    } while (0);

    // Wait for a key press so the console output stays visible.
    getch();

    // Deallocating the memory (free and cudaFree both accept NULL)
    free(a_h);
    free(c_h);
    cudaFree(a_d);
    cudaFree(c_d);
    return exitCode;
}

You might also like