// Matrix multiplication using multiple blocks
//
// Reads two matrices (typed in or auto-generated), multiplies them on the GPU
// with one thread per output element, and prints the inputs and the product.
#include <stdio.h>
#include <stdlib.h>
#ifdef _WIN32
#include <conio.h>   // getch(); only available with Windows toolchains
#endif
#include <cuda.h>
#include <cuda_runtime.h>

// Edge length of the square thread block used for the launch. The kernel is
// told this value through its tile_width argument.
#define TILE_WIDTH 2

// Aborts the program with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Kernel function.
//
// Each thread computes one element of result = first * second, where all
// three matrices are stored row-major.
//
//   first      : left operand, width1 columns per row
//   second     : right operand, width1 rows and width2 columns
//   result     : output, width2 columns per row
//   tile_width : block edge length used by the launch (expected to equal
//                blockDim.x and blockDim.y)
//   width1     : columns of first == rows of second
//   width2     : columns of second == columns of result
//   height1    : rows of first/result; threads whose row index is >= height1
//                do nothing. A negative value (the default) disables the row
//                guard, in which case the grid must cover the rows exactly.
//
// Expected launch: 2D grid of tile_width x tile_width blocks covering
// width2 columns (x) and height1 rows (y). No shared memory is used.
__global__ void matrix_mul_blocks(float *first, float *second, float *result,
                                  int tile_width, int width1, int width2,
                                  int height1 = -1)
{
    int uidx = blockIdx.x * tile_width + threadIdx.x;   // output column
    int uidy = blockIdx.y * tile_width + threadIdx.y;   // output row

    // The grid is rounded up to whole tiles, so threads past the matrix edge
    // must not read or write anything.
    if (uidx >= width2)
        return;
    if (height1 >= 0 && uidy >= height1)
        return;

    float sum = 0.0f;
    for (int k = 0; k < width1; k++)
    {
        float a = first[uidy * width1 + k];
        float b = second[uidx + k * width2];
        sum = sum + (a * b);
    }
    result[uidy * width2 + uidx] = sum;
}

// Prints a row-major rows x cols matrix, one row per line, tab separated.
static void print_matrix(const float *m, int rows, int cols)
{
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            printf("%f\t", m[i * cols + j]);
        }
        printf("\n");
    }
}

// Prompts for every element of a rows x cols matrix. `label` is the name shown
// in the prompt. Returns 1 on success, 0 if an element could not be parsed.
static int read_matrix(float *m, int rows, int cols, const char *label)
{
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            printf("Enter the element %s[%d][%d]>>", label, i, j);
            if (scanf("%f", &m[i * cols + j]) != 1)
                return 0;
        }
    }
    return 1;
}

int main()
{
    float *a_h = NULL, *b_h = NULL, *c_h = NULL;
    float *a_d = NULL, *b_d = NULL, *c_d = NULL;
    int row1, col1, row2, col2;
    int tile_width, width1, width2;
    char option;

    // acquiring the size
    printf("Enter the size of the first matrix(row col)>>");
    if (scanf("%d %d", &row1, &col1) != 2 || row1 <= 0 || col1 <= 0)
    {
        fprintf(stderr, "Invalid size for the first matrix\n");
        return EXIT_FAILURE;
    }
    printf("Enter the size of the second matrix(row col)>>");
    if (scanf("%d %d", &row2, &col2) != 2 || row2 <= 0 || col2 <= 0)
    {
        fprintf(stderr, "Invalid size for the second matrix\n");
        return EXIT_FAILURE;
    }
    // The kernel walks col1 elements down each column of the second matrix,
    // so the inner dimensions have to agree.
    if (col1 != row2)
    {
        fprintf(stderr,
                "Cannot multiply: first matrix has %d columns but second has %d rows\n",
                col1, row2);
        return EXIT_FAILURE;
    }

    // Byte counts are computed in size_t so large shapes do not overflow int.
    size_t bytes_a = sizeof(float) * (size_t)row1 * (size_t)col1;
    size_t bytes_b = sizeof(float) * (size_t)row2 * (size_t)col2;
    size_t bytes_c = sizeof(float) * (size_t)row1 * (size_t)col2;

    // memory allocation on host
    a_h = (float *)malloc(bytes_a);
    b_h = (float *)malloc(bytes_b);
    c_h = (float *)malloc(bytes_c);
    if (a_h == NULL || b_h == NULL || c_h == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(a_h);
        free(b_h);
        free(c_h);
        return EXIT_FAILURE;
    }

    // memory allocation on device
    CUDA_CHECK(cudaMalloc((void **)&a_d, bytes_a));
    CUDA_CHECK(cudaMalloc((void **)&b_d, bytes_b));
    CUDA_CHECK(cudaMalloc((void **)&c_d, bytes_c));

    // getting the data on host
    printf("Want to enter automatically(Y/N)>>");
    if (scanf(" %c", &option) != 1)
        option = '\0';

    if (option == 'Y' || option == 'y')
    {
        for (int i = 0; i < row1; i++)
        {
            for (int j = 0; j < col1; j++)
            {
                a_h[i * col1 + j] = (float)(i + j);
            }
        }
        for (int i = 0; i < row2; i++)
        {
            for (int j = 0; j < col2; j++)
            {
                b_h[i * col2 + j] = (float)(i * j);
            }
        }
    }
    else if (option == 'N' || option == 'n')
    {
        int ok;
        printf("Enter the data for the first matrix\n");
        ok = read_matrix(a_h, row1, col1, "a_h");
        if (ok)
        {
            printf("Enter the data for the second matrix\n");
            ok = read_matrix(b_h, row2, col2, "b_h");
        }
        if (!ok)
        {
            fprintf(stderr, "Invalid matrix element\n");
            free(a_h);
            free(b_h);
            free(c_h);
            cudaFree(a_d);
            cudaFree(b_d);
            cudaFree(c_d);
            return EXIT_FAILURE;
        }
    }
    else
    {
        // Any other answer would leave both input matrices uninitialised.
        fprintf(stderr, "Please answer Y or N\n");
        free(a_h);
        free(b_h);
        free(c_h);
        cudaFree(a_d);
        cudaFree(b_d);
        cudaFree(c_d);
        return EXIT_FAILURE;
    }

    // printing the data entered
    print_matrix(a_h, row1, col1);
    printf("\n");
    print_matrix(b_h, row2, col2);
    printf("\n");

    // copying the data to device
    CUDA_CHECK(cudaMemcpy(a_d, a_h, bytes_a, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_d, b_h, bytes_b, cudaMemcpyHostToDevice));

    // determining width
    width1 = col1;
    width2 = col2;

    // defining the no of blocks and threads per block; the grid is rounded up
    // so that sizes that are not a multiple of the tile are fully covered.
    tile_width = TILE_WIDTH;
    dim3 dimGrid((col2 + tile_width - 1) / tile_width,
                 (row1 + tile_width - 1) / tile_width, 1);
    dim3 dimBlock(tile_width, tile_width, 1);

    // call to the kernel function
    matrix_mul_blocks<<<dimGrid, dimBlock>>>(a_d, b_d, c_d, tile_width,
                                             width1, width2, row1);
    CUDA_CHECK(cudaGetLastError());   // reports an invalid launch configuration

    // retrieving data from device (the blocking copy also surfaces any error
    // raised while the kernel was running)
    CUDA_CHECK(cudaMemcpy(c_h, c_d, bytes_c, cudaMemcpyDeviceToHost));

    // displaying the result
    print_matrix(c_h, row1, col2);

#ifdef _WIN32
    getch();   // keep the console window open until a key is pressed
#endif

    // deallocating the memory
    free(a_h);
    free(b_h);
    free(c_h);
    CUDA_CHECK(cudaFree(a_d));
    CUDA_CHECK(cudaFree(b_d));
    CUDA_CHECK(cudaFree(c_d));

    return 0;
}

// Sign up to vote on this title
// Useful / Not useful