You are on page 1 of 16

1.

dot product

#include<stdio.h>
#include<cuda.h>
#define size 4

/*
 * Dot-product kernel.
 *
 * Each thread multiplies one pair of elements and atomically adds the
 * product to *c, so *c must be zeroed by the caller before the launch.
 *
 * a, b : device arrays holding `size` ints each
 * c    : device pointer to the single int accumulator
 *
 * Launch: any 1-D grid; threads whose global index is >= size do nothing.
 */
__global__ void dotProduct(int *a, int *b, int *c)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;

    /* Guard the tail so an over-sized launch cannot read past the arrays. */
    if (i < size)
    {
        atomicAdd(c, a[i] * b[i]);
    }
}

/*
 * Host driver for the dot product: uploads two 4-element vectors and a
 * zeroed accumulator, launches dotProduct, and prints the result.
 */
int main(){
    int a[size] = {1,2,3,4};
    int b[size] = {1,2,3,4};
    int c = 0;

    int *da = NULL, *db = NULL, *dc = NULL;
    int s = size*sizeof(int);

    cudaMalloc((void **) &da, s);
    cudaMemcpy(da, a, s, cudaMemcpyHostToDevice);
    cudaMalloc((void **) &db, s);
    cudaMemcpy(db, b, s, cudaMemcpyHostToDevice);
    cudaMalloc((void **) &dc, sizeof(int));
    /* The kernel accumulates into *dc, so it must start at zero. */
    cudaMemcpy(dc, &c, sizeof(int), cudaMemcpyHostToDevice);

    dotProduct<<<2,2>>>(da, db, dc);

    /* Launch errors are only visible through cudaGetLastError(); execution
       errors surface at the next blocking call (the copy below). */
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(&c, dc, sizeof(int), cudaMemcpyDeviceToHost);

    /* Release device memory on every path. */
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);

    if (err != cudaSuccess)
    {
        printf("\n CUDA error: %s ", cudaGetErrorString(err));
        return 1;
    }

    printf("\n dot product is %d ",c);
    return 0;
}

2. prefix sum

#include<stdio.h>
#include<cuda.h>
#define size 4

/*
 * In-place inclusive prefix sum (scan) over the first `size` elements of a.
 *
 * The scan runs in log2(size) doubling steps. In every step each thread
 * first reads the value `stride` positions to its left, then - after a
 * barrier - adds it to its own element. Reading and writing are separated
 * by __syncthreads() so no thread sees a partially updated array.
 *
 * Launch: exactly ONE block with at least `size` threads. __syncthreads()
 * only synchronises threads of the same block, so a multi-block launch
 * would race.
 */
__global__ void prefixSum(int *a)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;

    for (int stride = 1; stride < size; stride *= 2)
    {
        int addend = 0;

        /* i >= stride keeps the read inside the array (no a[-1]). */
        if (i < size && i >= stride)
            addend = a[i - stride];

        /* Every thread reaches both barriers: they sit outside the guards. */
        __syncthreads();

        if (i < size)
            a[i] += addend;

        __syncthreads();
    }
}

/*
 * Host driver: prints the input, runs the scan on the device and prints
 * the resulting prefix sums.
 */
int main(){
    int a[size] = {1,2,3,4};
    int *da = NULL;
    int s = size*sizeof(int);

    for(int i = 0; i < size; i++)
    {
        printf("\n the element are %d ",a[i]);
    }

    cudaMalloc((void **) &da, s);
    cudaMemcpy(da, a, s, cudaMemcpyHostToDevice);

    /* Single block: the kernel relies on block-wide barriers. */
    prefixSum<<<1,size>>>(da);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(a, da, s, cudaMemcpyDeviceToHost);

    cudaFree(da);

    if (err != cudaSuccess)
    {
        printf("\n CUDA error: %s ", cudaGetErrorString(err));
        return 1;
    }

    for(int i = 0; i < size; i++)
    {
        printf("\n prefix sum is %d ",a[i]);
    }

    return 0;
}

3. odd even sum

#include<stdio.h>
#include<cuda.h>
#define size 4

/*
 * Adds every odd element of a to *odd.
 *
 * a   : device array of `size` ints
 * odd : device accumulator; must be zeroed by the caller before the launch
 *
 * Launch: any 1-D grid; threads with a global index >= size do nothing.
 */
__global__ void oddSum(int *a, int *odd)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;

    /* Bounds guard first, so a[i] is never read past the array. */
    if (i < size && a[i] % 2 != 0)
        atomicAdd(odd, a[i]);
}

/*
 * Adds every even element of a to *even.
 *
 * a    : device array of `size` ints
 * even : device accumulator; must be zeroed by the caller before the launch
 *
 * Launch: any 1-D grid; threads with a global index >= size do nothing.
 */
__global__ void evenSum(int *a,int *even )
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;

    /* Bounds guard first, so a[i] is never read past the array. */
    if (i < size && a[i] % 2 == 0)
        atomicAdd(even, a[i]);
}

/*
 * Host driver: prints the input array, sums its odd and even elements on
 * the device with two kernels, and prints both sums.
 */
int main(){
    int a[size] = {1,2,3,4};
    int *da = NULL;
    int s = size*sizeof(int);
    int *odd = NULL, *even = NULL;
    int o = 0, e = 0;

    printf("\n the element are ");
    for(int i = 0; i < size; i++)
    {
        printf("\n %d ",a[i]);
    }

    cudaMalloc((void **) &da, s);
    cudaMemcpy(da, a, s, cudaMemcpyHostToDevice);

    /* Both accumulators start at zero because the kernels add into them. */
    cudaMalloc((void **) &odd, sizeof(int));
    cudaMemcpy(odd, &o, sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void **) &even, sizeof(int));
    cudaMemcpy(even, &e, sizeof(int), cudaMemcpyHostToDevice);

    oddSum<<<2,2>>>(da, odd);
    evenSum<<<2,2>>>(da, even);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(&o, odd, sizeof(int), cudaMemcpyDeviceToHost);
    if (err == cudaSuccess)
        err = cudaMemcpy(&e, even, sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(da);
    cudaFree(odd);
    cudaFree(even);

    if (err != cudaSuccess)
    {
        printf("\n CUDA error: %s ", cudaGetErrorString(err));
        return 1;
    }

    printf("\n odd sum is %d ",o);
    printf("\n even sum is %d ",e);
    return 0;
}

4,5. change case - upper to lower and lower to upper


#include<stdio.h>
#include<string.h>
#include<cuda.h>
#define size 4

/*
 * Swaps the case of every ASCII letter in a: lower-case letters become
 * upper-case and vice versa; all other characters are left untouched.
 *
 * a   : device character buffer
 * len : device pointer to the number of characters in a
 *
 * Launch: any 1-D grid; threads with a global index >= *len do nothing.
 */
__global__ void caseChange(char *a, int *len)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;

    /* The launch may contain more threads than characters. */
    if (i >= *len)
        return;

    /* Distance between the lower- and upper-case ASCII ranges (32). */
    const char offset = 'a' - 'A';
    char ch = a[i];

    if (ch >= 'a' && ch <= 'z')
        a[i] = ch - offset;
    else if (ch >= 'A' && ch <= 'Z')
        a[i] = ch + offset;
}

/*
 * Host driver: reads one line of text, swaps the case of its letters on
 * the device and prints the string before and after.
 */
int main(){
    char a[50];

    printf("enter a string in lowercase : ");
    /* Width 49 leaves room for the terminator in the 50-byte buffer.
       An empty line or EOF yields an empty string instead of garbage. */
    if (scanf("%49[^\n]", a) != 1)
        a[0] = '\0';

    int l = strlen(a);
    int *dl = NULL;
    char *da = NULL;
    int s = l*sizeof(char);

    printf("\nThe input string is : ");
    printf(" %s ",a);

    /* A zero-thread launch is invalid, so skip the device work when the
       string is empty. */
    if (l > 0)
    {
        cudaMalloc((void **) &da, s);
        cudaMemcpy(da, a, s, cudaMemcpyHostToDevice);
        cudaMalloc((void **) &dl, sizeof(int));
        cudaMemcpy(dl, &l, sizeof(int), cudaMemcpyHostToDevice);

        /* One thread per character (l < 50 fits in one block). */
        caseChange<<<1,l>>>(da, dl);

        cudaError_t err = cudaGetLastError();
        if (err == cudaSuccess)
            /* Only the l characters are copied back; the terminator already
               in a[l] is left in place. */
            err = cudaMemcpy(a, da, s, cudaMemcpyDeviceToHost);

        cudaFree(da);
        cudaFree(dl);

        if (err != cudaSuccess)
        {
            printf("\n CUDA error: %s ", cudaGetErrorString(err));
            return 1;
        }
    }

    printf("\nThe string after changing the case : ");
    printf("%s ",a);

    return 0;
}

6.odd even transposition sort


#include<stdio.h>
#include<cuda.h>
#define size 6
/*
 * Even phase of odd-even transposition sort: thread k compares the pair
 * (a[2k], a[2k+1]) and swaps it into ascending order.
 *
 * a : device array of `size` ints
 *
 * Launch: any 1-D grid; threads whose pair does not lie fully inside the
 * array do nothing. Pairs are disjoint, so no synchronisation is needed
 * within the phase.
 */
__global__ void evenSort(int *a)
{
    int id = threadIdx.x + blockDim.x * blockIdx.x;
    id = id*2;

    /* Both a[id] and a[id+1] must exist. */
    if (id + 1 < size)
    {
        if (a[id] > a[id+1])
        {
            int t = a[id];
            a[id] = a[id+1];
            a[id+1] = t;
        }
    }
}

/*
 * Odd phase of odd-even transposition sort: thread k compares the pair
 * (a[2k+1], a[2k+2]) and swaps it into ascending order.
 *
 * a : device array of `size` ints
 *
 * Launch: any 1-D grid; threads whose pair does not lie fully inside the
 * array do nothing. Pairs are disjoint, so no synchronisation is needed
 * within the phase.
 */
__global__ void oddSort(int *a)
{
    int id = threadIdx.x + blockDim.x * blockIdx.x;
    id = id*2+1;

    /* Both a[id] and a[id+1] must exist. */
    if (id + 1 < size)
    {
        if (a[id] > a[id+1])
        {
            int t = a[id];
            a[id] = a[id+1];
            a[id+1] = t;
        }
    }
}

/*
 * Host driver for odd-even transposition sort: prints the array, runs
 * alternating even/odd phases on the device and prints the sorted result.
 */
int main(){
    int a[size] = {9,1,8,4,7,2};

    printf("The elements before sorting are: ");
    for(int i = 0; i < size; i++)
    {
        printf("%d ",a[i]);
    }

    int *da = NULL;
    int s = size*sizeof(int);
    cudaMalloc((void **) &da, s);
    cudaMemcpy(da, a, s, cudaMemcpyHostToDevice);

    /* n elements need n phases; each iteration runs two (even + odd).
       Kernels on the same stream execute in order, so the launch boundary
       acts as the barrier between phases. */
    for(int k = 0; k < (size + 1) / 2; k++){
        evenSort<<<2,3>>>(da);
        oddSort<<<2,3>>>(da);
    }

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(a, da, s, cudaMemcpyDeviceToHost);

    cudaFree(da);

    if (err != cudaSuccess)
    {
        printf("\n CUDA error: %s ", cudaGetErrorString(err));
        return 1;
    }

    printf("\n After sorting ");
    for(int i = 0; i < size; i++)
    {
        printf("%d ",a[i]);
    }
    return 0;
}
7. armstrong numbers from 1 to 1000

#include<stdio.h>
#include<cuda.h>
#define size 20

/*
 * Marks which elements of a equal the sum of the cubes of their decimal
 * digits. Elements that pass the test keep their value; all others are
 * overwritten with -1.
 *
 * a : device array of `size` ints
 *
 * Launch: any 1-D grid; threads with a global index >= size do nothing.
 * The device printf calls are debugging output and serialise execution.
 */
__global__ void armstrong(int *a)
{
    int id = threadIdx.x + blockDim.x * blockIdx.x;

    if (id >= size)
        return;

    int digi, sum = 0;
    int num, cube = 0;

    num = a[id];
    printf(" num is %d\n ",num);

    /* Peel off the decimal digits and accumulate their cubes. */
    while (num != 0)
    {
        digi = num % 10;
        cube = digi*digi*digi;
        sum = sum + cube;
        num = num / 10;
    }
    printf("%d the sum is %d \n ",a[id],sum);

    /* Non-matching values are flagged with -1 for the host to skip. */
    if (sum != a[id])
        a[id] = -1;
}

/*
 * Host driver: fills an array with 1..size, lets the armstrong kernel
 * flag the non-matching entries with -1 and prints the survivors.
 */
int main(){
    int a[size];
    for(int i = 0; i < size; i++)
    {
        a[i] = i+1;
    }

    int *da = NULL;
    int s = size*sizeof(int);
    cudaMalloc((void **) &da, s);
    cudaMemcpy(da, a, s, cudaMemcpyHostToDevice);

    armstrong<<<4,5>>>(da);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(a, da, s, cudaMemcpyDeviceToHost);

    cudaFree(da);

    if (err != cudaSuccess)
    {
        printf("\n CUDA error: %s ", cudaGetErrorString(err));
        return 1;
    }

    printf("\n The armstrong numbers are : ");
    for(int i = 0; i < size; i++)
    {
        /* -1 is the kernel's "rejected" marker. */
        if (a[i] != -1)
            printf("%d ",a[i]);
    }

    return 0;
}

8. prime numbers
#include<stdio.h>
#include<cuda.h>
#define size 20

/*
 * Primality filter: every element of a that is not a prime number is
 * overwritten with -1; primes keep their value.
 *
 * a : device array of `size` ints
 *
 * Launch: any 1-D grid; threads with a global index >= size do nothing.
 */
__global__ void prime(int *a)
{
    int id = threadIdx.x + blockDim.x * blockIdx.x;

    if (id >= size)
        return;

    int value = a[id];

    /* 0, 1 and negative numbers are not prime. */
    bool is_prime = (value >= 2);

    /* Trial division starting at 2 (never 0, which would divide by zero).
       `d <= value / d` is d*d <= value without the overflow risk. */
    for (int d = 2; is_prime && d <= value / d; d++)
    {
        if (value % d == 0)
            is_prime = false;
    }

    if (!is_prime)
        a[id] = -1;
}

/*
 * Host driver: fills an array with 1..size, lets the prime kernel flag
 * the non-primes with -1 and prints the remaining values.
 */
int main(){
    int a[size];
    for(int i = 0; i < size; i++)
    {
        a[i] = i+1;
    }

    int *da = NULL;
    int s = size*sizeof(int);
    cudaMalloc((void **) &da, s);
    cudaMemcpy(da, a, s, cudaMemcpyHostToDevice);

    prime<<<4,5>>>(da);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(a, da, s, cudaMemcpyDeviceToHost);

    cudaFree(da);

    if (err != cudaSuccess)
    {
        printf("\n CUDA error: %s ", cudaGetErrorString(err));
        return 1;
    }

    printf("\n The prime numbers are : ");
    for(int i = 0; i < size; i++)
    {
        /* -1 is the kernel's "not prime" marker. */
        if (a[i] != -1)
            printf("%d ",a[i]);
    }

    return 0;
}

9.matrix transpose
#include<stdio.h>
#include<cuda.h>
/*
 * Matrix transpose through a shared-memory tile.
 *
 * in  : device matrix with ny rows and nx columns, row-major
 * out : device matrix with nx rows and ny columns, row-major
 *       (out[x][y] = in[y][x])
 * nx  : number of columns of in
 * ny  : number of rows of in
 *
 * Launch: 2-D grid of 2-D blocks covering the nx x ny input; each block
 * dimension must not exceed TILE_DIM (blocks that violate this return
 * without writing). Uses TILE_DIM*TILE_DIM ints of static shared memory.
 */
__global__ void trans(int *in, int *out, int nx, int ny) {
    enum { TILE_DIM = 16 };
    __shared__ int tile[TILE_DIM][TILE_DIM];

    /* Uniform per block, so returning here cannot split a barrier. */
    if (blockDim.x > TILE_DIM || blockDim.y > TILE_DIM)
        return;

    /* Coordinates of the element this thread loads from `in`. */
    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;

    if (ix < (unsigned int)nx && iy < (unsigned int)ny)
        tile[threadIdx.y][threadIdx.x] = in[iy * nx + ix];

    /* The barrier sits outside every data-dependent branch so that all
       threads of the block reach it. */
    __syncthreads();

    /* Re-map the thread onto the transposed block. */
    unsigned int bidx = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int irow = bidx / blockDim.y;
    unsigned int icol = bidx % blockDim.y;

    /* Coordinates in `out`, which has nx rows and ny columns. */
    unsigned int ox = blockIdx.y * blockDim.y + icol;
    unsigned int oy = blockIdx.x * blockDim.x + irow;

    /* In-range outputs always correspond to a tile cell loaded above. */
    if (ox < (unsigned int)ny && oy < (unsigned int)nx)
        out[oy * ny + ox] = tile[icol][irow];
}

/*
 * Host driver: transposes a 2x2 matrix on the device and prints it before
 * and after.
 */
int main(){
    int a[2][2] = {{1, 2}, {3, 4}};
    int res[2][2] = {{0, 0}, {0, 0}};
    int *da = NULL;
    int *dc = NULL;

    cudaMalloc((void**)&da, 2*2*sizeof(int));
    cudaMalloc((void**)&dc, 2*2*sizeof(int));
    cudaMemcpy(da, &a, 2*2*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dc, &res, 2*2*sizeof(int), cudaMemcpyHostToDevice);

    /* One single-thread block per matrix element. */
    dim3 grid(2, 2);

    printf("before transpose: \n");
    for(int i = 0; i < 2; i++){
        for(int j = 0; j < 2; j++) {
            printf("%d ", a[i][j]);
        }
        printf("\n");
    }

    trans<<<grid, 1>>>(da, dc, 2, 2);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(&res, dc, 2*2*sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(da);
    cudaFree(dc);

    if (err != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("after transpose: \n");
    for(int i = 0; i < 2; i++){
        for(int j = 0; j < 2; j++) {
            printf("%d ", res[i][j]);
        }
        printf("\n");
    }

    return 0;
}

10,11. factorial , product of first n natural numbers


#include<stdio.h>
#include<cuda.h>
/*
 * Computes (*gpu_num)! and stores it in *gpu_res.
 *
 * gpu_num : device pointer to the input n
 * gpu_res : device pointer receiving n! (1 for n <= 1)
 *
 * The product is a sequential computation, so only the thread with global
 * index 0 performs it; every other launched thread exits immediately.
 * The result overflows `long int` once n! no longer fits in that type.
 */
__global__ void Factorial(int *gpu_num,long int *gpu_res)
{
    /* Letting every thread write *gpu_res would be a data race. */
    if (threadIdx.x + blockDim.x*blockIdx.x != 0)
        return;

    /* Accumulate locally and store once. */
    long int product = 1;
    for (int i = 2; i <= *gpu_num; i++)
    {
        product = product * i;
    }
    *gpu_res = product;
}

/*
 * Host driver: reads a non-negative integer, computes its factorial on
 * the device and prints it.
 */
int main()
{
    int Number;
    int *dev_number = NULL;
    long int *res = NULL, result = 0;

    system("clear");
    printf("\n\t Enter the number : ");
    /* Reject non-numeric input and negative numbers (factorial undefined). */
    if (scanf("%d",&Number) != 1 || Number < 0)
    {
        printf("\n\t Please enter a non-negative integer \n");
        return 1;
    }

    cudaMalloc((void**)&dev_number,sizeof(int));
    cudaMalloc((void**)&res,sizeof(long int));

    cudaMemcpy(dev_number,&Number,sizeof(int),cudaMemcpyHostToDevice);

    /* The computation is sequential: a single thread is enough and avoids
       concurrent writes to the result. */
    Factorial<<<1,1>>>(dev_number,res);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(&result,res,sizeof(long int),cudaMemcpyDeviceToHost);

    cudaFree(dev_number);
    cudaFree(res);

    if (err != cudaSuccess)
    {
        printf("\n\t CUDA error: %s \n", cudaGetErrorString(err));
        return 1;
    }

    printf("\n\t Factorial of number %d is %ld \n",Number,result);

    return 0;
}

12. stencil

#include <cuda_runtime.h>
#include<stdio.h>
#include<sys/time.h>
#define RADIUS 4
#define BDIM 8

__constant__ float coef[RADIUS + 1];


#define a0 0
#define a1 1
#define a2 2
#define a3 3
#define a4 4

/*
 * Returns the current wall-clock time in seconds (with microsecond
 * resolution) as reported by gettimeofday().
 */
double cpuSecond(){
    struct timeval now;
    gettimeofday(&now, NULL);

    const double whole_seconds = (double)now.tv_sec;
    const double micro_seconds = (double)now.tv_usec;

    return (whole_seconds + micro_seconds*1.e-6);
}
/*
 * Fills in[0..size-1] with the ramp 1, 2, ..., size.
 *
 * in   : host buffer with room for at least `size` floats
 * size : number of elements to write; nothing is written when size <= 0
 */
void initialData(float *in, const int size)
{
    /* Walk backwards: slot k-1 receives the value k. */
    int k = size;
    while (k > 0)
    {
        in[k - 1] = k;
        --k;
    }
}
/*
 * Prints in[RADIUS] .. in[size-1] on one line, separated by spaces and
 * followed by a newline. The first RADIUS entries are skipped.
 */
void printData(float *in, const int size)
{
    int pos = RADIUS;
    while (pos < size)
    {
        printf("%f ", in[pos]);
        ++pos;
    }
    printf("\n");
}
/*
 * CPU reference for the 1-D stencil of radius RADIUS (4).
 *
 * in    : padded input; must hold isize + 2*RADIUS floats, with the
 *         interior data at in[RADIUS .. RADIUS+isize-1]
 * out   : padded output of the same length; only the interior is written
 * isize : number of interior points
 *
 * For every interior point i the result is
 *   sum over k = 1..4 of a_k * (in[i+k] - in[i-k]).
 */
void cpu_stencil_1d (float *in, float *out, int isize)
{
    /* Cover all isize interior points, i.e. the same index range the GPU
       kernel writes when it is launched on `out + RADIUS`. */
    for (int i = RADIUS; i < isize + RADIUS; i++)
    {
        float tmp = a1 * (in[i + 1] - in[i - 1])
                  + a2 * (in[i + 2] - in[i - 2])
                  + a3 * (in[i + 3] - in[i - 3])
                  + a4 * (in[i + 4] - in[i - 4]);
        out[i] = tmp;
    }
}
/*
 * 1-D stencil of radius RADIUS using a shared-memory tile with halos.
 *
 * in  : device pointer to the first INTERIOR input element; the caller
 *       must provide RADIUS readable elements before it and RADIUS after
 *       the N interior elements (valid indices: -RADIUS .. N+RADIUS-1)
 * out : device pointer to the first interior output element
 * N   : number of interior points
 *
 * Launch: 1-D blocks of exactly BDIM threads (the halo indexing assumes
 * blockDim.x == BDIM). Uses (BDIM + 2*RADIUS) floats of static shared
 * memory and reads the coefficients from the __constant__ array coef.
 * The device printf calls are debugging output and serialise execution.
 */
__global__ void stencil_1d(float *in, float *out, int N)
{
    __shared__ float smem[BDIM + 2 * RADIUS];

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int sidx = threadIdx.x + RADIUS;

    /* One past the last readable index of `in` (interior + right halo). */
    const int limit = N + RADIUS;

    /* Interior element of the tile; out-of-range slots are zero-filled. */
    float center = (idx < limit) ? in[idx] : 0.0f;
    smem[sidx] = center;
    printf("\nsmem[%d]=in[%d] by %d, value is %f",sidx,idx,threadIdx.x,center);

    /* The first RADIUS threads also load the left and right halos. */
    if (threadIdx.x < RADIUS)
    {
        float left = (idx - RADIUS < limit) ? in[idx - RADIUS] : 0.0f;
        float right = (idx + BDIM < limit) ? in[idx + BDIM] : 0.0f;

        smem[sidx - RADIUS] = left;
        smem[sidx + BDIM] = right;

        printf("\nsmem[%d]=in[%d] by %d, value is %f",sidx-RADIUS,idx-RADIUS,threadIdx.x,left);
        printf("\nsmem[%d]=in[%d] by %d,value is %f",sidx+BDIM,idx+BDIM,threadIdx.x,right);
    }

    /* All loads must be visible before any thread reads its neighbours.
       The barrier is outside every branch so the whole block reaches it. */
    __syncthreads();

    if (idx < N)
    {
        float tmp = 0.0f;
        for (int i = 1; i <= RADIUS; i++)
        {
            tmp += coef[i] * (smem[sidx + i] - smem[sidx - i]);
        }
        out[idx] = tmp;

        printf("\nin[%d] is %f",idx,center);
        printf("\nout[%d] = %f by %d", idx,tmp,threadIdx.x);
    }
}

/*
 * Host driver for the 1-D stencil: builds a padded input ramp, runs the
 * GPU kernel (timed with CUDA events) and the CPU reference (timed with
 * cpuSecond), prints the GPU results and both timings.
 */
int main(int argc, char **argv)
{
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("%s starting transpose at ", argv[0]);
    printf("device %d: %s ", dev, deviceProp.name);
    cudaSetDevice(dev);

    int isize = 1 << 3;
    /* Interior points plus RADIUS halo elements on each side. */
    size_t nBytes = (isize + 2 * RADIUS) * sizeof(float);
    printf("array size: %d ", isize);
    bool iprint = 1;

    float *h_in = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);
    if (h_in == NULL || hostRef == NULL || gpuRef == NULL)
    {
        printf("host allocation failed\n");
        free(h_in);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    float *d_in = NULL, *d_out = NULL;
    cudaMalloc((float**)&d_in, nBytes);
    cudaMalloc((float**)&d_out, nBytes);
    /* The kernel only writes the interior; zero the rest so the copy back
       does not contain indeterminate halo values. */
    cudaMemset(d_out, 0, nBytes);

    initialData(h_in, isize + 2 * RADIUS);
    cudaMemcpy(d_in, h_in, nBytes, cudaMemcpyHostToDevice);

    const float h_coef[] = {a0, a1, a2, a3, a4};
    cudaMemcpyToSymbol( coef, h_coef, (RADIUS + 1) * sizeof(float));

    dim3 block(BDIM, 1);
    dim3 grid(deviceProp.maxGridSize[0] < isize / block.x ? deviceProp.maxGridSize[0] :
              isize / block.x, 1);
    printf("(grid, block) %d,%d \n ", grid.x, block.x);

    cudaEvent_t start,stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    /* Offset by RADIUS so index 0 inside the kernel is the first interior
       element; launch with the configuration computed above. */
    stencil_1d<<<grid, block>>>(d_in + RADIUS, d_out + RADIUS, isize);
    cudaEventRecord(stop);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("\nkernel launch failed: %s\n", cudaGetErrorString(err));

    cudaEventSynchronize(stop);
    float t=0;
    cudaEventElapsedTime(&t,start,stop);

    cudaMemcpy(gpuRef, d_out, nBytes, cudaMemcpyDeviceToHost);

    double cpustart = cpuSecond();
    cpu_stencil_1d(h_in, hostRef, isize);
    double cpuelapsed = cpuSecond() - cpustart;

    if(iprint)
    {
        printf("\nisize is %d\n",isize);
        /* printData skips the first RADIUS entries, so the upper bound must
           include them to show all isize interior results. */
        printData(gpuRef, isize + RADIUS);
    }
    printf("GPU Elapsed Time %f\n",t);
    printf("CPU Elapsed Time %lf\n",cpuelapsed);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_in);
    cudaFree(d_out);
    free(h_in);
    free(hostRef);
    free(gpuRef);
    cudaDeviceReset();
    return EXIT_SUCCESS;
}

13. pre-order tree traversal


#include <stdio.h>
__device__ int temp[7][7];
/*
 * Assigns pre-order numbers to the vertices of a rooted tree; one thread
 * per directed edge.
 *
 * parent, child, sibling : per-vertex links, -1 meaning "none"
 * edge0, edge1           : endpoints of directed edge i (edge0[i] -> edge1[i])
 * succ0, succ1           : scratch; endpoints of the successor edge of edge i
 * position               : scratch; weight / suffix sum per edge
 * preorder               : output; pre-order number per vertex
 *
 * Steps:
 *   1. Each edge computes its successor edge and a weight (1 for an edge
 *      going down from parent to child, 0 for an edge going up).
 *   2. Four rounds of pointer jumping accumulate, for every edge, the sum
 *      of weights from that edge to the end of the list.
 *   3. Each downward edge writes preorder[child] = 7 + 1 - position.
 *
 * Relies on the device table temp[u][v] holding the index of edge (u, v).
 *
 * Launch: ONE block with one thread per edge (12 here). Threads exchange
 * data through succ0/succ1/position, and __syncthreads() only covers a
 * single block.
 */
__global__ void traverse(int *parent,int *child,int *sibling,int *edge0,int *edge1,
                         int *succ0,int *succ1,int *position,int *preorder) {
    int i = threadIdx.x;
    const int from = edge0[i];
    const int to = edge1[i];

    /* True when this edge goes up, from a child to its parent. */
    const bool upward = (parent[from] == to);

    if (upward) {
        if (sibling[from] != -1) {
            /* Next: descend from the parent into the next sibling. */
            succ0[i] = to;
            succ1[i] = sibling[from];
        }
        else if (parent[to] != -1) {
            /* No sibling left: keep climbing. */
            succ0[i] = to;
            succ1[i] = parent[to];
        }
        else {
            /* Last edge: it is its own successor, and the root is
               numbered 1. */
            succ0[i] = from;
            succ1[i] = to;
            preorder[to] = 1;
        }
    }
    else {
        if (child[to] != -1) {
            /* Next: descend into the first child. */
            succ0[i] = to;
            succ1[i] = child[to];
        }
        else {
            /* Leaf: turn around and go back up. */
            succ0[i] = to;
            succ1[i] = from;
        }
    }

    position[i] = upward ? 0 : 1;

    /* Every edge's successor and weight must be visible to the others. */
    __syncthreads();

    for (int k = 0; k < 4; k++) {
        /* Read phase: fetch the successor's state into registers. */
        const int x = temp[succ0[i]][succ1[i]];
        const int next_position = position[x];
        const int next_succ0 = succ0[x];
        const int next_succ1 = succ1[x];

        /* Nobody may overwrite its state until everybody has read. */
        __syncthreads();

        /* Write phase. */
        position[i] = position[i] + next_position;
        succ0[i] = next_succ0;
        succ1[i] = next_succ1;

        __syncthreads();
    }

    /* Only downward edges (parent -> child) number their child vertex. */
    if (from == parent[to]) {
        preorder[to] = 7 + 1 - position[i];
    }
}
/*
 * Builds the edge lookup table: for each of the 12 directed edges e,
 * stores e in temp[edge0[e]][edge1[e]]. Every launched thread performs
 * the full loop.
 */
__global__ void initialize(int *edge0,int *edge1){
    int e = 0;
    while (e < 12) {
        const int u = edge0[e];
        const int v = edge1[e];
        temp[u][v] = e;
        ++e;
    }
}
/*
 * Host driver: describes a 7-vertex binary tree by its parent / first
 * child / next sibling links and its 12 directed edges, computes the
 * pre-order numbering on the device and prints it.
 */
int main()
{
    char vertices[7]={'a','b','c','d','e','f','g'};
    int parent[7]={-1,0,0,1,1,2,2};
    int child[7]={1,3,5,-1,-1,-1,-1};
    int sibling[7]={-1,2,-1,4,-1,6,-1};
    int edge0[12]={0,1,0,2,1,3,1,4,2,5,2,6};
    int edge1[12]={1,0,2,0,3,1,4,1,5,2,6,2};
    int preorder[7];

    int *dparent,*dchild,*dsibling,*dedge0,*dedge1,*dsucc0,*dsucc1,*dposition,*dpreorder;

    cudaMalloc((void**)&dparent,7*sizeof(int));
    cudaMalloc((void**)&dchild,7*sizeof(int));
    cudaMalloc((void**)&dsibling,7*sizeof(int));
    cudaMalloc((void**)&dedge0,12*sizeof(int));
    cudaMalloc((void**)&dedge1,12*sizeof(int));
    cudaMalloc((void**)&dsucc0,12*sizeof(int));
    cudaMalloc((void**)&dsucc1,12*sizeof(int));
    cudaMalloc((void**)&dposition,12*sizeof(int));
    cudaMalloc((void**)&dpreorder,7*sizeof(int));

    cudaMemcpy(dparent,parent,7*sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(dchild,child,7*sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(dsibling,sibling,7*sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(dedge0,edge0,12*sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(dedge1,edge1,12*sizeof(int),cudaMemcpyHostToDevice);

    /* Scratch and output buffers are written by the kernel; start them
       from a defined state instead of uploading uninitialised host data. */
    cudaMemset(dsucc0,0,12*sizeof(int));
    cudaMemset(dsucc1,0,12*sizeof(int));
    cudaMemset(dposition,0,12*sizeof(int));
    cudaMemset(dpreorder,0,7*sizeof(int));

    /* Build the edge lookup table first, then traverse with one thread
       per directed edge. */
    initialize<<<1,1>>>(dedge0,dedge1);
    traverse<<<1,12>>>(dparent,dchild,dsibling,dedge0,dedge1,dsucc0,dsucc1,dposition,dpreorder);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(preorder,dpreorder,7*sizeof(int),cudaMemcpyDeviceToHost);

    cudaFree(dparent);
    cudaFree(dchild);
    cudaFree(dsibling);
    cudaFree(dedge0);
    cudaFree(dedge1);
    cudaFree(dsucc0);
    cudaFree(dsucc1);
    cudaFree(dposition);
    cudaFree(dpreorder);

    if (err != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("Preorder Traversal numbering to the vertices: \n");
    for(int i=0;i<7;i++){
        printf("%c -> %d\n",vertices[i],preorder[i]);
    }
    return 0;
}

14. quick sort, 15. max and min in an array, 16. second max and second min,
17. kth largest element in an unsorted array

#include<cuda.h>
#include <stdio.h>
/*
 * Computes, for every element, the index it occupies once the array is
 * sorted in ascending order.
 *
 * a    : device array of `size` ints (read only)
 * pos  : device array of `size` ints; pos[t] receives the final index of a[t]
 * size : number of elements
 *
 * pos[t] is (number of elements <= a[t]) - 1, which is where a partition
 * around a[t] places the pivot. Equal elements therefore all receive the
 * highest index of their run; the caller is expected to fill the gaps
 * below it.
 *
 * Counting directly replaces the per-thread copy of the array, which
 * needed a device-heap allocation per thread that was never checked or
 * released.
 *
 * Launch: one block with at least `size` threads (indexing uses
 * threadIdx.x only); surplus threads do nothing.
 */
__global__ void part(int *a,int *pos,int size){
    if (threadIdx.x >= size)
        return;

    const int pivot = a[threadIdx.x];

    /* Counts the pivot itself as well, hence the -1 below. */
    int not_greater = 0;
    for (int i = 0; i < size; i++) {
        if (a[i] <= pivot)
            not_greater++;
    }

    pos[threadIdx.x] = not_greater - 1;
}
/*
 * Host driver: reads an array, asks the device for the sorted position of
 * every element, scatters the elements to those positions and prints the
 * sorted array.
 */
int main(){
    int *a,*dev_a = NULL,*pos,*dev_pos = NULL,i,*fin,sz;

    printf("\n Enter the size of the array : ");
    /* One thread per element in a single block: 1024 is the usual
       per-block thread limit. */
    if (scanf("%d",&sz) != 1 || sz < 1 || sz > 1024) {
        printf("\n The size must be between 1 and 1024\n");
        return 1;
    }

    a = (int*) malloc(sizeof(int)*sz);
    pos = (int*) malloc(sizeof(int)*sz);
    fin = (int*) malloc(sizeof(int)*sz);
    if (a == NULL || pos == NULL || fin == NULL) {
        printf("\n Out of memory\n");
        free(a);
        free(pos);
        free(fin);
        return 1;
    }

    for(i = 0;i < sz;i++){
        printf("\n Enter the %dth element : ",i);
        if (scanf("%d",a + i) != 1)
            a[i] = 0;
        /* INT_MAX marks a slot that no element has claimed yet. */
        fin[i] = INT_MAX;
    }

    cudaMalloc((void**)&dev_a,sizeof(int)*sz);
    cudaMalloc((void**)&dev_pos,sizeof(int)*sz);
    /* Start from a defined state in case the kernel does not run. */
    cudaMemset(dev_pos,0,sizeof(int)*sz);

    cudaMemcpy(dev_a,a,sz*sizeof(int),cudaMemcpyHostToDevice);
    part<<<1,sz>>>(dev_a,dev_pos,sz);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(pos,dev_pos,sz*sizeof(int),cudaMemcpyDeviceToHost);

    cudaFree(dev_a);
    cudaFree(dev_pos);

    if (err != cudaSuccess) {
        printf("\n CUDA error: %s\n", cudaGetErrorString(err));
        free(a);
        free(pos);
        free(fin);
        return 1;
    }

    /* Scatter each element to its sorted position (ignore bad indices). */
    for(i = 0; i < sz;i++)
        if (pos[i] >= 0 && pos[i] < sz)
            fin[pos[i]] = a[i];

    /* Duplicates all land on the highest slot of their run; copy the value
       down into the unclaimed slots. Start at sz-2 so fin[i+1] stays
       inside the array. */
    for(i = sz - 2;i >= 0;i--)
        if(fin[i] == INT_MAX)
            fin[i] = fin[i+1];

    for(i = 0;i < sz;i++)
        printf("\n %dth element : %d",i,*(fin+i));
    printf("\n\n");

    free(a);
    free(pos);
    free(fin);
    return 0;
}

18. sum of n natural numbers


#include<stdio.h>
#include<cuda.h>

/*
 * Each thread atomically adds its element a[id] to *c, where id is the
 * thread's global index. No bounds check is performed: the launch must
 * supply exactly one thread per element.
 */
__global__ void sumnatural(int *a,int *c){
    const int id = blockIdx.x*blockDim.x + threadIdx.x;
    const int value = a[id];
    atomicAdd(c, value);
}
/*
 * Host driver: sums the first 100 natural numbers on the device with
 * atomic adds and prints the result.
 */
int main(){
    int a[100];
    int size=100*sizeof(int);
    int *da = NULL,*res = NULL;
    int ans = 0;

    cudaMalloc((void**)&da,size);
    cudaMalloc((void**)&res,sizeof(int));
    /* cudaMalloc does not clear memory; the kernel accumulates into *res,
       so it has to start at zero. */
    cudaMemset(res,0,sizeof(int));

    for(int i=0;i<100;i++){
        a[i]=i+1;
    }
    cudaMemcpy(da,a,size,cudaMemcpyHostToDevice);

    /* 2 blocks x 50 threads = one thread per element. */
    sumnatural<<<2,50>>>(da,res);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(&ans,res,sizeof(int),cudaMemcpyDeviceToHost);

    cudaFree(da);
    cudaFree(res);

    if (err != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("The sum of first 100 natural numbers: %d\n",ans);
    return 0;
}
********************************************************
#include<stdio.h>
#include<cuda.h>
/*
 * Stores the natural numbers 1..n in c[0..n-1].
 *
 * c : device array with room for at least n ints
 * n : how many numbers to write
 *
 * Launch: any 1-D grid; threads with a global index >= n do nothing, so
 * no element beyond c[n-1] is ever touched.
 */
__global__ void fill(int *c,int n)
{
    int tid=blockDim.x*blockIdx.x+threadIdx.x;

    /* Strict '<': index n would lie outside an n-element buffer. */
    if (tid<n)
    {
        c[tid]=tid+1;
    }
}

/*
 * In-place tree reduction: after the kernel, c[0] holds the sum of
 * c[0 .. blockDim.x-1].
 *
 * Launch: ONE block whose thread count equals the number of elements.
 * The levels of the tree are separated by __syncthreads(), which only
 * synchronises a single block; any additional block exits immediately.
 */
__global__ void parallel_red(int *c)
{
    /* Uniform per block, so the early exit cannot split a barrier. */
    if (blockIdx.x != 0)
        return;

    const int t = threadIdx.x;
    const int count = blockDim.x;

    for (int a=1;a<count;a*=2)
    {
        /* t + a < count keeps the partner inside the array when count is
           not a power of two. */
        if (t%(2*a)==0 && t+a<count)
        {
            c[t]+=c[t+a];
        }
        /* Finish this level before the next one reads its results. */
        __syncthreads();
    }
}

/*
 * Host driver: reads n and prints 1 + 2 + ... + n computed by filling a
 * device array and reducing it in parallel.
 */
int main()
{
    int n;
    printf("enter n");
    /* n + 1 threads must fit into one block (at most 1024 threads). */
    if (scanf("%d",&n) != 1 || n < 1 || n > 1023)
    {
        printf("n must be between 1 and 1023\n");
        return 1;
    }

    /* One spare slot: the sum comes out right whether the fill kernel
       writes 0..n into n+1 slots or 1..n into n slots; a slot left
       unwritten stays zero. */
    const int count = n + 1;
    int result = 0;
    int *p = NULL;

    cudaMalloc((void **) &p,count*sizeof(int));
    cudaMemset(p,0,count*sizeof(int));

    fill<<<1,count>>>(p,n);
    parallel_red<<<1,count>>>(p);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        /* Only c[0] carries the result. */
        err = cudaMemcpy(&result,p,sizeof(int),cudaMemcpyDeviceToHost);

    cudaFree(p);

    if (err != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("in main");
    printf("%d\n",result);

    return 0;
}

19.Addition of two vectors

#include<stdio.h>
#include<cuda.h>
#define n 10
/*
 * Element-wise vector addition: c[k] = a[k] + b[k] for every k below n.
 * Threads whose global index is n or larger return without touching
 * memory.
 */
__global__ void add(int *a,int *b,int *c)
{
    const int k = blockIdx.x * blockDim.x + threadIdx.x;

    if (k >= n)
        return;

    const int lhs = a[k];
    const int rhs = b[k];
    c[k] = lhs + rhs;
}
/*
 * Host driver: adds the vectors a = {0,1,...,n-1} and b = {5,5,...,5} on
 * the device and prints the element-wise sums.
 */
int main()
{
    int a[n],b[n],c[n];
    int *dev_a = NULL,*dev_b = NULL,*dev_c = NULL;

    cudaMalloc((void **)&dev_a,n*sizeof(int));
    cudaMalloc((void **)&dev_b,n*sizeof(int));
    cudaMalloc((void **)&dev_c,n*sizeof(int));

    for(int i=0;i<n;i++)
    {
        a[i]=i;
        b[i]=5;
    }

    cudaMemcpy(dev_a,a,n*sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b,b,n*sizeof(int),cudaMemcpyHostToDevice);

    /* dev_c is output only, so nothing is uploaded into it. */
    add<<<2,5>>>(dev_a,dev_b,dev_c);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(c,dev_c,n*sizeof(int),cudaMemcpyDeviceToHost);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    if (err != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("After the addition of two vectors \n");
    for(int i=0;i<n;i++)
    {
        printf("%d \n",c[i]);
    }
    return 0;
}

20. arithmetic operations

#include<stdio.h>
#include<cuda.h>
/*
 * Four arithmetic results computed by four different threads.
 * The thread with global index 0 stores the sum of *a and *b in res[0],
 * index 1 the difference in res[1], index 2 the product in res[2] and
 * index 3 the integer quotient in res[3]. All other threads do nothing.
 */
__global__ void calc(int *a,int *b,int *res)
{
    const int which = blockDim.x*blockIdx.x + threadIdx.x;

    switch (which)
    {
    case 0:
        res[which] = (*a) + (*b);
        break;
    case 1:
        res[which] = (*a) - (*b);
        break;
    case 2:
        res[which] = (*a) * (*b);
        break;
    case 3:
        res[which] = (*a) / (*b);
        break;
    default:
        break;
    }
}
/*
 * Host driver: computes a+b, a-b, a*b and a/b on the device (one thread
 * per operation) and prints the four results.
 */
int main()
{
    int a=2;
    int b=1;
    int res[4] = {0, 0, 0, 0};
    int *dev_a = NULL,*dev_b = NULL,*dev_c = NULL;

    cudaMalloc((void**)&dev_a,sizeof(int));
    cudaMalloc((void**)&dev_b,sizeof(int));
    cudaMalloc((void**)&dev_c,4*sizeof(int));
    cudaMemcpy(dev_a,&a,sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b,&b,sizeof(int),cudaMemcpyHostToDevice);

    /* Only thread indices 0..3 do any work, so four threads suffice. */
    calc<<<1,4>>>(dev_a,dev_b,dev_c);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(res,dev_c,4*sizeof(int),cudaMemcpyDeviceToHost);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    if (err != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    for(int i=0;i<4;i++)
    {
        printf("%d",res[i]);
    }
    return 0;
}

You might also like