Welcome to Scribd, the world's digital library. Read, publish, and share books and documents. See more
Standard view
Full view
of .
Look up keyword
Like this
0 of .
Results for:
No results containing your search query
P. 1
Shared Blocks Matrix Mul.cu

Shared Blocks Matrix Mul.cu

Ratings: (0)|Views: 116|Likes:
Published by Farhan Ahmad
CUDA C program for multiplying two matrices. Uses shared memory and multiple thread blocks.
CUDA C program for multiplying two matrices. Uses shared memory and multiple thread blocks.

More info:

Categories:Types, School Work
Published by: Farhan Ahmad on Oct 04, 2010
Copyright:Attribution Non-commercial


Read on Scribd mobile: iPhone, iPad and Android.
download as TXT, PDF, TXT or read online from Scribd
See more
See less





// Matrix multiplication using shared memory and multiple blocks
#include <stdio.h>
#include <conio.h>
#include <cuda.h>
#include <stdlib.h>

// Edge length of the statically sized shared-memory tiles used by the kernel.
#define TILE_DIM 4

// Aborts with a readable message when a CUDA runtime call fails.
// Device/host buffers are not released explicitly on this path; the process
// exits immediately.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Kernel function: result = first * second, all matrices row-major.
//
//   first   : height1 x width1 matrix
//   second  : width1  x width2 matrix
//   result  : height1 x width2 matrix
//   tile_width : edge of the square tile each block computes
//   width1  : number of columns of `first` (== number of rows of `second`)
//   width2  : number of columns of `second` and of `result`
//   height1 : number of rows of `first`/`result`; when negative (the default)
//             the rows are taken to be exactly gridDim.y * tile_width, which
//             is what the original 6-argument launch implied.
//
// Launch layout: 2D grid of 2D blocks with
//   blockDim.x == blockDim.y == tile_width, and tile_width <= TILE_DIM
//   (the shared tiles are TILE_DIM x TILE_DIM floats each).
//   gridDim.x covers the columns of `result`, gridDim.y covers its rows.
// Sizes need not be multiples of tile_width: out-of-range tile entries are
// loaded as 0 and out-of-range result elements are not written.
__global__ void matrix_mul_blocks(float *first, float *second, float *result,
                                  int tile_width, int width1, int width2,
                                  int height1 = -1)
{
    // Per-block staging tiles in shared memory.
    __shared__ float temp1[TILE_DIM][TILE_DIM];
    __shared__ float temp2[TILE_DIM][TILE_DIM];

    const int idx = threadIdx.x;
    const int idy = threadIdx.y;
    const int uidx = blockIdx.x * tile_width + idx;  // column of result
    const int uidy = blockIdx.y * tile_width + idy;  // row of result

    if (height1 < 0) {
        height1 = gridDim.y * tile_width;
    }

    // Number of tiles needed to walk the shared (inner) dimension, rounded up.
    const int num_tiles = (width1 + tile_width - 1) / tile_width;
    float sum = 0.0f;

    for (int i = 0; i < num_tiles; i++) {
        const int a_col = i * tile_width + idx;  // column of `first` to load
        const int b_row = i * tile_width + idy;  // row of `second` to load

        // Copy one tile of each operand to shared memory, zero-padding the
        // parts that fall outside the matrices.
        temp1[idy][idx] = (uidy < height1 && a_col < width1)
                              ? first[uidy * width1 + a_col]
                              : 0.0f;
        temp2[idy][idx] = (b_row < width1 && uidx < width2)
                              ? second[b_row * width2 + uidx]
                              : 0.0f;
        // Every thread must see the complete tiles before multiplying.
        __syncthreads();

        // Multiply the two tiles held in shared memory.
        for (int k = 0; k < tile_width; k++) {
            sum += temp1[idy][k] * temp2[k][idx];
        }
        // Do not let any thread overwrite the tiles while others still read.
        __syncthreads();
    }

    if (uidy < height1 && uidx < width2) {
        result[uidy * width2 + uidx] = sum;
    }
}

// Sets every element of a rows x cols matrix to 1.
static void fill_ones(float *m, int rows, int cols)
{
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            m[i * cols + j] = 1.0f;
        }
    }
}

// Prompts for every element of a rows x cols matrix; `name` is the label
// shown in the prompt. Returns 1 on success, 0 if input could not be parsed.
static int read_matrix(const char *name, float *m, int rows, int cols)
{
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("Enter the element %s[%d][%d]>>", name, i, j);
            if (scanf("%f", &m[i * cols + j]) != 1) {
                return 0;
            }
        }
    }
    return 1;
}

// Prints a rows x cols matrix, tab-separated, one row per line.
static void print_matrix(const float *m, int rows, int cols)
{
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("%f\t", m[i * cols + j]);
        }
        printf("\n");
    }
}

// Releases the host and device buffers (NULL pointers are accepted).
static void release_all(float *a_h, float *b_h, float *c_h,
                        float *a_d, float *b_d, float *c_d)
{
    free(a_h);
    free(b_h);
    free(c_h);
    cudaFree(a_d);
    cudaFree(b_d);
    cudaFree(c_d);
}

// Reads two matrices (or fills them with ones), multiplies them on the GPU
// and prints the product. Returns EXIT_FAILURE on invalid input or
// allocation failure.
int main(void)
{
    float *a_h = NULL, *b_h = NULL, *c_h = NULL;
    float *a_d = NULL, *b_d = NULL, *c_d = NULL;
    int row1, col1, row2, col2;
    int tile_width, width1, width2;
    char option;

    // acquiring the size
    printf("Enter the size of the first matrix(row col)>>");
    if (scanf("%d %d", &row1, &col1) != 2 || row1 <= 0 || col1 <= 0) {
        fprintf(stderr, "Invalid size for the first matrix\n");
        return EXIT_FAILURE;
    }
    printf("Enter the size of the second matrix(row col)>>");
    if (scanf("%d %d", &row2, &col2) != 2 || row2 <= 0 || col2 <= 0) {
        fprintf(stderr, "Invalid size for the second matrix\n");
        return EXIT_FAILURE;
    }
    // The product is only defined when the inner dimensions agree.
    if (col1 != row2) {
        fprintf(stderr,
                "Cannot multiply: first matrix has %d columns but second has %d rows\n",
                col1, row2);
        return EXIT_FAILURE;
    }

    const size_t bytes_a = sizeof(float) * (size_t)row1 * (size_t)col1;
    const size_t bytes_b = sizeof(float) * (size_t)row2 * (size_t)col2;
    const size_t bytes_c = sizeof(float) * (size_t)row1 * (size_t)col2;

    // memory allocation on host
    a_h = (float *)malloc(bytes_a);
    b_h = (float *)malloc(bytes_b);
    c_h = (float *)malloc(bytes_c);
    if (a_h == NULL || b_h == NULL || c_h == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        release_all(a_h, b_h, c_h, a_d, b_d, c_d);
        return EXIT_FAILURE;
    }

    // memory allocation on device
    CUDA_CHECK(cudaMalloc((void **)&a_d, bytes_a));
    CUDA_CHECK(cudaMalloc((void **)&b_d, bytes_b));
    CUDA_CHECK(cudaMalloc((void **)&c_d, bytes_c));

    // getting the data on host
    printf("Want to enter automatically(Y/N)>>");
    if (scanf(" %c", &option) != 1) {
        option = '\0';
    }
    if (option == 'Y' || option == 'y') {
        fill_ones(a_h, row1, col1);
        fill_ones(b_h, row2, col2);
    } else if (option == 'N' || option == 'n') {
        int ok;
        printf("Enter the data for the first matrix\n");
        ok = read_matrix("a_h", a_h, row1, col1);
        if (ok) {
            printf("Enter the data for the second matrix\n");
            ok = read_matrix("b_h", b_h, row2, col2);
        }
        if (!ok) {
            fprintf(stderr, "Invalid matrix element\n");
            release_all(a_h, b_h, c_h, a_d, b_d, c_d);
            return EXIT_FAILURE;
        }
    } else {
        // Anything else would leave the matrices uninitialised.
        fprintf(stderr, "Unrecognised option; expected Y or N\n");
        release_all(a_h, b_h, c_h, a_d, b_d, c_d);
        return EXIT_FAILURE;
    }

    // printing the data entered
    printf("First Matrix\n\n");
    print_matrix(a_h, row1, col1);
    printf("\nSecond Matrix\n");
    print_matrix(b_h, row2, col2);
    printf("\n");

    // copying the data to device
    CUDA_CHECK(cudaMemcpy(a_d, a_h, bytes_a, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_d, b_h, bytes_b, cudaMemcpyHostToDevice));

    // determining width
    width1 = col1;
    width2 = col2;

    // defining the no of blocks and threads per block; the grid is rounded
    // up so that sizes which are not multiples of the tile are fully covered
    tile_width = TILE_DIM;
    dim3 dimBlock(tile_width, tile_width, 1);
    dim3 dimGrid((col2 + tile_width - 1) / tile_width,
                 (row1 + tile_width - 1) / tile_width, 1);

    // call to the kernel function
    matrix_mul_blocks<<<dimGrid, dimBlock>>>(a_d, b_d, c_d, tile_width,
                                             width1, width2, row1);
    CUDA_CHECK(cudaGetLastError());  // reports an invalid launch configuration

    // retrieving data from device (blocking copy; also surfaces kernel faults)
    CUDA_CHECK(cudaMemcpy(c_h, c_d, bytes_c, cudaMemcpyDeviceToHost));

    // displaying the result
    for (int i = 0; i < row1; i++) {
        for (int j = 0; j < col2; j++) {
            printf(" %f", c_h[i * col2 + j]);
        }
        printf("\n\n");
    }
    getch();

    // deallocating the memory
    release_all(a_h, b_h, c_h, a_d, b_d, c_d);
    return 0;
}

You're Reading a Free Preview

/*********** DO NOT ALTER ANYTHING BELOW THIS LINE ! ************/ var s_code=s.t();if(s_code)document.write(s_code)//-->