Professional Documents
Culture Documents
Q1(b) With an appropriate example, show how memory can be a limiting factor to parallelism. (5M)
All threads access global memory for their input matrix elements
OR
Q3(a) Elaborate upon synchronization and transparent scalability in CUDA GPUs. (5M)
A barrier is a synchronization point:
Q3(b) Write a CUDA code to add two matrices of order N, involving the kernel definition and the launch of the kernel from the host code. (5M)
#include<stdio.h>
#include<cuda.h>
#define N 3
/* Kernel: element-wise addition of two N x N matrices stored as flat,
 * row-major arrays: n[id] = l[id] + m[id].
 *
 * Expected launch configuration (see main): a 2-D grid of N x N blocks
 * with ONE thread per block (<<<grid, 1>>>), so blockIdx alone identifies
 * the matrix element and no threadIdx term is needed.
 *
 * NOTE(review): the original document lost this signature line during
 * extraction; it is reconstructed from the matadd<<<grid,1>>>(d,e,f)
 * call site and the pointers l, m, n used in the body below.
 */
__global__ void matadd(int *l, int *m, int *n)
{
    int x = blockIdx.x;          /* column index (grid x-dimension) */
    int y = blockIdx.y;          /* row index (grid y-dimension)    */
    int id = gridDim.x * y + x;  /* flat row-major index            */
    n[id] = l[id] + m[id];
}
/* Host driver: fills two N x N integer matrices, copies them to the GPU,
 * launches one block per matrix element, copies the sum back, prints it,
 * and releases the device buffers. Returns 0 on completion.
 *
 * Fixes vs. original: the opening brace after main() was missing (syntax
 * error), and a/b were copied to the device uninitialized, so the printed
 * result was undefined. They are now filled with deterministic values.
 */
int main()
{
    int a[N][N];
    int b[N][N];
    int c[N][N];
    int *d, *e, *f;  /* device copies of a, b, and the result */
    int i, j;

    /* BUG FIX: initialize the host inputs before copying them to the
     * device (the original read them uninitialized). */
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            a[i][j] = i + j;
            b[i][j] = i * j;
        }
    }

    cudaMalloc((void **)&d, N * N * sizeof(int));
    cudaMalloc((void **)&e, N * N * sizeof(int));
    cudaMalloc((void **)&f, N * N * sizeof(int));

    cudaMemcpy(d, a, N * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(e, b, N * N * sizeof(int), cudaMemcpyHostToDevice);

    /* Two-dimensional grid (collection of blocks), one thread per block.
     * Syntax: dim3 grid(no. of columns, no. of rows). */
    dim3 grid(N, N);
    matadd<<<grid, 1>>>(d, e, f);

    /* The blocking device-to-host cudaMemcpy implicitly waits for the
     * kernel launched above to finish before copying the result. */
    cudaMemcpy(c, f, N * N * sizeof(int), cudaMemcpyDeviceToHost);

    printf("\nSum of two matrices:\n ");
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            printf("%d\t", c[i][j]);
        }
        printf("\n");
    }

    cudaFree(d);
    cudaFree(e);
    cudaFree(f);
    return 0;
}
Q. No. 1a 1b 2a 2b 3a 3b