Professional Documents
Culture Documents
OF TECHNOLOGY
PROBLEM STATEMENT
In this assignment, kernel code for offset access and stride access is
implemented using OpenCL to see the impact on performance. In addition, a simple
uncached version and a cached version of matrix multiplication are implemented using
OpenCL to see the benefit of caching in local memory.
However, because the machine used for this experiment did not produce the expected
output, the benefit of memory coalescing and the benefit of caching in local
memory cannot be seen in the results. To observe these effects, a properly
configured machine is needed.
INDEX A OpenCL code for offset access
#include<stdio.h>
#include<CL/cl.h>
#include<ctime>
#include<iostream>
using namespace std;
// Host driver for the offset-access experiment (visible portion only).
// For each `offset` from 0 to 16 inclusive it prints the offset, queries one
// platform and one GPU device, fills `a` and `b`, enqueues `kernel` over
// `global_work_size` work-items, waits with clFinish, and prints the time
// measured with clock().
// NOTE(review): this listing looks truncated — `a`, `b`, `context`, `queue`,
// `program`, `kernel`, `a_buffer`, `b_buffer` and `global_work_size` are used
// but never declared here, and only one closing brace follows the two scopes
// opened by main and the offset loop. Restore the missing lines before building.
// NOTE(review): standard C++ requires `int main`; `void main` is a compiler
// extension.
void main () {
// 67108864 == 2^26; used as the bound of the fill loop below.
int N = 67108864;
for(int offset = 0; offset <= 16; offset++) {
cout <<"with offset value: " << offset << endl ;
// Platform and device are looked up again on every pass of the offset loop.
// The return codes of both calls are ignored, so a missing GPU goes unnoticed.
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
// Fill the host inputs: a counts up from 0, b counts down from N.
int i;
for(i = 0; i < N; i++){
a[i] = i;
b[i] = N - i;
}
double elapsed ;
// Time the launch plus the wait for completion; clFinish blocks until the
// queue has drained.
// NOTE(review): clock() is specified as processor time, which may differ from
// wall-clock time while the host waits — confirm on the target platform.
clock_t start=clock();
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, NULL, 0,
NULL, NULL);
clFinish(queue);
clock_t end=clock();
// Convert clock ticks to seconds.
elapsed = double (end - start) / CLOCKS_PER_SEC ;
cout<<"the time it take to run the kernel is : " << elapsed <<endl <<endl;
// Free the host arrays and release the OpenCL objects.
free(a);
free(b);
clReleaseMemObject(a_buffer);
clReleaseMemObject(b_buffer);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseContext(context);
clReleaseCommandQueue(queue);
// Runs the shell command "PAUSE" (presumably the Windows "press any key" prompt).
system("PAUSE");
}
INDEX B OpenCL code for stride access
#include<stdio.h>
#include<CL/cl.h>
#include<ctime>
#include<iostream>
using namespace std;
// Host driver for the stride-access experiment (visible portion only).
// For each `stride` from 0 to 16 inclusive it prints the stride, queries one
// platform and one GPU device, fills `a` and `b`, enqueues `kernel` over
// `global_work_size` work-items, waits with clFinish, and prints the time
// measured with clock().
// NOTE(review): this listing looks truncated — `a`, `b`, `context`, `queue`,
// `program`, `kernel`, `a_buffer`, `b_buffer` and `global_work_size` are used
// but never declared here, and only one closing brace follows the two scopes
// opened by main and the stride loop. Restore the missing lines before building.
// NOTE(review): standard C++ requires `int main`; `void main` is a compiler
// extension.
void main () {
// 67108864 == 2^26; used as the bound of the fill loop below.
int N = 67108864;
// NOTE(review): the loop starts at stride 0; the kernel is not shown here, so
// confirm that a zero stride is a meaningful case.
for(int stride = 0; stride <= 16; stride++) {
cout <<"with stride value: " << stride << endl ;
// Platform and device are looked up again on every pass of the stride loop.
// The return codes of both calls are ignored, so a missing GPU goes unnoticed.
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
// Fill the host inputs: a counts up from 0, b counts down from N.
int i;
for(i = 0; i < N; i++){
a[i] = i;
b[i] = N - i;
}
double elapsed ;
// Time the launch plus the wait for completion; clFinish blocks until the
// queue has drained.
// NOTE(review): clock() is specified as processor time, which may differ from
// wall-clock time while the host waits — confirm on the target platform.
clock_t start=clock();
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, NULL, 0,
NULL, NULL);
clFinish(queue);
clock_t end=clock();
// Convert clock ticks to seconds.
elapsed = double (end - start) / CLOCKS_PER_SEC ;
cout<<"the time it take to run the kernel is : " << elapsed <<endl <<endl;
// Free the host arrays and release the OpenCL objects.
free(a);
free(b);
clReleaseMemObject(a_buffer);
clReleaseMemObject(b_buffer);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseContext(context);
clReleaseCommandQueue(queue);
// Runs the shell command "PAUSE" (presumably the Windows "press any key" prompt).
system("PAUSE");
}
INDEX C OpenCL code for simple matrix multiply
#include<stdio.h>
#include<CL/cl.h>
#include<ctime>
#include<iostream>
using namespace std;
// Host driver for the simple (uncached) matrix-multiply experiment (visible
// portion only). For each `workgroup` value 4, 8, ..., 256 it queries one
// platform and one GPU device, fills `a` and `b` with N*N values, and creates
// two read-only input buffers and one write-only output buffer.
// NOTE(review): this listing looks truncated — `a`, `b`, `c`, `context`,
// `queue`, `program` and `kernel` are used but never declared here, no kernel
// launch or timing code is visible, and only one closing brace follows the two
// scopes opened by main and the workgroup loop. Restore the missing lines
// before building.
// NOTE(review): standard C++ requires `int main`; `void main` is a compiler
// extension.
void main () {
// N*N is the bound of the fill loop and the element count of each buffer.
int N = 1024;
// `workgroup` doubles each pass; it is not used anywhere in the visible text.
for(int workgroup = 4; workgroup <= 256; workgroup*=2) {
//cout <<"with stride value: " << stride << endl ;
// Platform and device are looked up again on every pass of the loop.
// The return codes of both calls are ignored, so a missing GPU goes unnoticed.
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
// Fill the host inputs: a counts up from 0, b holds N - i (negative once i > N).
int i;
for(i = 0; i < N*N; i++){
a[i] = i;
b[i] = N - i;
}
// Declared but not used in the visible text.
double elapsed ;
// Device buffers of N*N cl_float each; the inputs are copied from host memory
// at creation time (CL_MEM_COPY_HOST_PTR). The error-code out-parameter is
// NULL, so creation failures are not reported.
// NOTE(review): a_buffer is initialised from host array `b` and b_buffer from
// host array `a` — confirm whether this swap is intended.
cl_mem a_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, N*N*sizeof(cl_float), b, NULL);
cl_mem b_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, N*N*sizeof(cl_float), a, NULL);
cl_mem c_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
N*N*sizeof(cl_float), NULL, NULL);
// Free the host arrays and release the OpenCL objects.
free(a);
free(b);
free(c);
clReleaseMemObject(a_buffer);
clReleaseMemObject(b_buffer);
clReleaseMemObject(c_buffer);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseContext(context);
clReleaseCommandQueue(queue);
// Runs the shell command "PAUSE" (presumably the Windows "press any key" prompt).
system("PAUSE");
}
INDEX D OpenCL code for cached matrix multiply
#include<stdio.h>
#include<CL/cl.h>
#include<ctime>
#include<iostream>
using namespace std;
// Tail of the host driver for the cached matrix-multiply experiment.
// NOTE(review): the function header is missing from this listing; the trailing
// `return 0;` and closing brace suggest these statements belong to an
// `int main` — confirm against the original source. `a`, `b`, `c`, the buffers,
// `context`, `queue`, `program` and `kernel` are used but never declared in the
// visible text, and no buffer setup or kernel launch is shown.
// Query one platform and one GPU device; both return codes are ignored.
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
// Declared but not used in the visible text.
double elapsed ;
// Free the host arrays and release the OpenCL objects.
free(a);
free(b);
free(c);
clReleaseMemObject(a_buffer);
clReleaseMemObject(b_buffer);
clReleaseMemObject(c_buffer);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseContext(context);
clReleaseCommandQueue(queue);
// Runs the shell command "PAUSE" (presumably the Windows "press any key" prompt).
system("PAUSE");
return 0;
}