Professional Documents
Culture Documents
1. dot product
#include<stdio.h>
#include<cuda.h>
#define size 4
// Host driver for the dot-product example.
// Uploads two `size`-element vectors and a zeroed accumulator to the device,
// launches dotProduct (defined elsewhere) with 2 blocks of 2 threads, copies
// the accumulator back and prints it.
// Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
int main(){
    int a[size] = {1,2,3,4};
    int b[size] = {1,2,3,4};
    int c = 0;
    int *da = NULL, *db = NULL, *dc = NULL;
    int s = size*sizeof(int);
    cudaError_t err;

    // Each step runs only if everything before it succeeded, so a single
    // cleanup path at the bottom is enough.
    err = cudaMalloc((void **)&da, s);
    if (err == cudaSuccess) err = cudaMalloc((void **)&db, s);
    if (err == cudaSuccess) err = cudaMalloc((void **)&dc, sizeof(int));
    if (err == cudaSuccess) err = cudaMemcpy(da, a, s, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(db, b, s, cudaMemcpyHostToDevice);
    // The device accumulator starts from the host value of c (0).
    if (err == cudaSuccess) err = cudaMemcpy(dc, &c, sizeof(int), cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        dotProduct<<<2,2>>>(da, db, dc);
        // A launch does not return a status; configuration errors are
        // reported through cudaGetLastError().
        err = cudaGetLastError();
    }
    // This blocking copy also surfaces errors raised while the kernel ran.
    if (err == cudaSuccess) err = cudaMemcpy(&c, dc, sizeof(int), cudaMemcpyDeviceToHost);

    if (err == cudaSuccess)
        printf("\n dot product is %d ", c);
    else
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));

    // cudaFree accepts NULL, so this is safe after a partial failure.
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return err == cudaSuccess ? 0 : 1;
}
2. prefix sum
#include<stdio.h>
#include<cuda.h>
#define size 4
a[i] =a[i]+a[i-1];
// Host driver for the prefix-sum example.
// Prints the input array, round-trips it through a device buffer and prints
// the array again.
// The original copied from `da` without ever allocating it or uploading the
// input; the buffer is now allocated, filled and released here.
// Returns 0 on success, 1 if a CUDA call fails.
int main(){
    int a[size] = {1,2,3,4};
    int *da = NULL;
    int s = size*sizeof(int);
    cudaError_t err;

    for(int i = 0;i<size;i++)
    {
        printf("\n the element are %d ",a[i]);
    }

    err = cudaMalloc((void **)&da, s);
    if (err == cudaSuccess) err = cudaMemcpy(da, a, s, cudaMemcpyHostToDevice);

    // NOTE(review): the kernel launch that computes the prefix sum on `da`
    // is not present in this listing (only a fragment of its body survives
    // above). Restore the launch here; until then the values printed below
    // are the unchanged input.

    if (err == cudaSuccess) err = cudaMemcpy(a, da, s, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        for(int i = 0;i<size;i++)
        {
            printf("\n prefix sum is %d ",a[i]);
        }
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(da);   // safe on NULL
    return err == cudaSuccess ? 0 : 1;
}
#include<stdio.h>
#include<cuda.h>
#define size 4
if (a[i] %2 != 0)
atomicAdd(odd,a[i]);
}
// Host driver for the odd/even sum example.
// Prints the input array, then launches oddSum and evenSum (defined
// elsewhere) with 2 blocks of 2 threads each and prints both totals.
// The original passed `da`, `odd` and `even` to the kernels without
// allocating them; they are now allocated, initialised and freed here.
// Returns 0 on success, 1 if any CUDA call or launch fails.
int main(){
    int a[size] = {1,2,3,4};
    int *da = NULL;
    int s = size*sizeof(int);
    int *odd = NULL,*even = NULL;
    int o = 0,e = 0;
    cudaError_t err;

    printf("\n the element are ");
    for(int i = 0;i<size;i++)
    {
        printf("\n %d ",a[i]);
    }

    err = cudaMalloc((void **)&da, s);
    if (err == cudaSuccess) err = cudaMalloc((void **)&odd, sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void **)&even, sizeof(int));
    if (err == cudaSuccess) err = cudaMemcpy(da, a, s, cudaMemcpyHostToDevice);
    // The visible kernel fragment accumulates with atomicAdd, so both totals
    // must start at zero on the device.
    if (err == cudaSuccess) err = cudaMemcpy(odd, &o, sizeof(int), cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(even, &e, sizeof(int), cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        oddSum<<<2,2>>>(da,odd);
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        evenSum<<<2,2>>>(da,even);
        err = cudaGetLastError();
    }

    // Blocking copies: they wait for both kernels and report execution errors.
    if (err == cudaSuccess) err = cudaMemcpy(&o,odd,sizeof(int),cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) err = cudaMemcpy(&e,even,sizeof(int),cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        printf("\n odd sum is %d ",o);
        printf("\n even sum is %d ",e);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(da);
    cudaFree(odd);
    cudaFree(even);
    return err == cudaSuccess ? 0 : 1;
}
// Host driver for the case-change example.
// Reads one line (at most 49 characters) from stdin, uploads it together
// with its length, launches caseChange (defined elsewhere) with 2 blocks of
// `l` threads, then copies the characters back and prints them.
// Fixes over the original: the read is bounded to the buffer, an empty line
// no longer leaves `a` uninitialised, and `da`/`dl` are actually allocated,
// filled and freed.
// Returns 0 on success, 1 if any CUDA call or the launch fails.
int main(){
    char a[50];
    int l;
    int *dl = NULL;
    char *da = NULL;
    int s;
    cudaError_t err;

    printf("enter a string in lowercase : ");
    // %49[^\n] leaves room for the terminating NUL in a[50].
    if (scanf("%49[^\n]", a) != 1)
        a[0] = '\0';            // empty line or EOF: treat as empty string

    l = strlen(a);
    s = l*sizeof(char);

    if (l == 0) {
        // A launch with zero threads per block is invalid; nothing to convert.
        printf("%s ",a);
        return 0;
    }

    err = cudaMalloc((void **)&da, s);
    if (err == cudaSuccess) err = cudaMalloc((void **)&dl, sizeof(int));
    if (err == cudaSuccess) err = cudaMemcpy(da, a, s, cudaMemcpyHostToDevice);
    // NOTE(review): caseChange is not visible here; its second argument is
    // assumed to point at the string length - confirm against the kernel.
    if (err == cudaSuccess) err = cudaMemcpy(dl, &l, sizeof(int), cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        caseChange<<<2,l>>>(da,dl);
        err = cudaGetLastError();
    }
    // Only the first s bytes are overwritten, so the NUL written by scanf
    // still terminates `a`.
    if (err == cudaSuccess) err = cudaMemcpy(a,da,s,cudaMemcpyDeviceToHost);

    if (err == cudaSuccess)
        printf("%s ",a);
    else
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));

    cudaFree(da);
    cudaFree(dl);
    return err == cudaSuccess ? 0 : 1;
}
// Host driver for the odd-even sort example.
// Prints six input values, uploads the array, runs four rounds of
// evenSort followed by oddSort (both defined elsewhere, 2 blocks of 3
// threads each), then downloads and prints the first six values.
// Returns 0 on success, 1 if any CUDA call or launch fails.
int main(){
    int a[size] = {9,1,8,4,7,2};
    int *da = NULL;
    int s = size*sizeof(int);
    cudaError_t err;

    printf("The elements before sorting are: ");
    for(int i = 0;i<6;i++)
    {
        printf("%d ",a[i]);
    }

    err = cudaMalloc((void **) &da,s);
    if (err == cudaSuccess) err = cudaMemcpy(da,a,s, cudaMemcpyHostToDevice);

    // Launches on the same (default) stream execute in issue order, so each
    // phase sees the result of the previous one.
    for(int k = 0;k<=3 && err == cudaSuccess;k++){
        evenSort<<<2,3>>>(da);
        err = cudaGetLastError();
        if (err != cudaSuccess) break;
        oddSort<<<2,3>>>(da);
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) err = cudaMemcpy(a,da,s,cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        printf("\n After sorting ");
        for(int i = 0;i<6;i++)
        {
            printf("%d ",a[i]);
        }
    } else {
        fprintf(stderr, "\nCUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(da);   // previously leaked; safe on NULL
    return err == cudaSuccess ? 0 : 1;
}
7. armstrong numbers from 1 to 1000
#include<stdio.h>
#include<cuda.h>
#define size 20
int num,cube=0;
num = a[id];
printf(" num is %d\n ",num);
while(num != 0){
digi = num%10;
cube = digi*digi*digi;
sum = sum + cube;
num = num/10;
}
printf("%d the sum is %d \n ",a[id],sum);
if (sum == a[id])
a[id] = a[id];
else
a[id] = -1;
// Host driver for the Armstrong-number example.
// Fills a[] with 1..size, lets the armstrong kernel (defined elsewhere,
// 4 blocks of 5 threads) process the array in place, and prints every entry
// that is not -1 afterwards.
// Returns 0 on success, 1 if any CUDA call or the launch fails.
int main(){
    int a[size];
    int *da = NULL;
    int s = size*sizeof(int);
    cudaError_t err;

    for(int i = 0;i<size;i++)
    {
        a[i] = i+1;
    }

    err = cudaMalloc((void **) &da,s);
    if (err == cudaSuccess) err = cudaMemcpy(da,a,s, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        armstrong<<<4,5>>>(da);
        // Launch errors are only visible through cudaGetLastError().
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaMemcpy(a,da,s,cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        printf("\n The armstrong numbers are : ");
        for(int i = 0;i<size;i++)
        {
            // -1 is the marker the kernel fragment writes for rejected values.
            if(a[i] != -1)
                printf("%d ",a[i]);
        }
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(da);   // previously leaked; safe on NULL
    return err == cudaSuccess ? 0 : 1;
}
8. prime numbers
#include<stdio.h>
#include<cuda.h>
#define size 20
// Host driver for the prime-number example.
// Fills a[] with 1..size, lets the prime kernel (defined elsewhere, 4 blocks
// of 5 threads) process the array in place, and prints every entry that is
// not -1 afterwards.
// Returns 0 on success, 1 if any CUDA call or the launch fails.
int main(){
    int a[size];
    int *da = NULL;
    int s = size*sizeof(int);
    cudaError_t err;

    for(int i = 0;i<size;i++)
    {
        a[i] = i+1;
    }

    err = cudaMalloc((void **) &da,s);
    if (err == cudaSuccess) err = cudaMemcpy(da,a,s, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        prime<<<4,5>>>(da);
        // Launch errors are only visible through cudaGetLastError().
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaMemcpy(a,da,s,cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        printf("\n The prime numbers are : ");
        for(int i = 0;i<size;i++)
        {
            // NOTE(review): the kernel is not visible; -1 is presumed to be
            // its "not prime" marker, as this filter implies.
            if(a[i] != -1)
                printf("%d ",a[i]);
        }
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(da);   // previously leaked; safe on NULL
    return err == cudaSuccess ? 0 : 1;
}
9. matrix transpose
#include<stdio.h>
#include<cuda.h>
// Transposes a row-major matrix through a shared-memory tile.
//
//   in  : ny rows x nx columns, row-major
//   out : nx rows x ny columns, row-major
//
// Launch layout: 2D grid and 2D block. The tile is statically sized 2x2, so
// blockDim.x and blockDim.y must each be at most 2. The grid must cover the
// matrix (gridDim.x*blockDim.x >= nx, gridDim.y*blockDim.y >= ny).
//
// Each thread loads the element at its own (ix, iy) into the tile, then
// stores the element for a remapped position so that stores walk along
// output rows.
__global__ void trans(int *in, int *out, int nx, int ny) {
    __shared__ int tile[2][2];

    // Coordinates of the element this thread reads from `in`.
    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int ti = iy * nx + ix;

    // Re-linearise the thread id inside the block and split it along the
    // transposed block shape.
    unsigned int bidx = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int irow = bidx / blockDim.y;
    unsigned int icol = bidx % blockDim.y;

    // Coordinates of the element this thread writes in `out`.
    unsigned int ox = blockIdx.y * blockDim.y + icol;
    unsigned int oy = blockIdx.x * blockDim.x + irow;
    unsigned int to = oy * ny + ox;

    if (ix < (unsigned int)nx && iy < (unsigned int)ny)
        tile[threadIdx.y][threadIdx.x] = in[ti];

    // The barrier must be reached by every thread of the block, so it sits
    // outside the bounds checks (the original had it inside the branch).
    __syncthreads();

    // `out` has ny columns and nx rows, so the output guard uses swapped
    // extents. The tile cell read here was written by the thread whose load
    // guard is this same condition, so it is always initialised.
    if (ox < (unsigned int)ny && oy < (unsigned int)nx)
        out[to] = tile[icol][irow];
}
// Host driver for the matrix-transpose example.
// Uploads a 2x2 matrix and a zeroed result buffer, prints the input, runs
// `trans` on a 2x2 grid of single-thread blocks, then downloads and prints
// the result.
// The original text never closed this function; the missing return and
// closing brace are restored.
// Returns 0 on success, 1 if any CUDA call or the launch fails.
int main(){
    int a[2][2] = {{1, 2}, {3, 4}};
    int res[2][2] = {{0, 0}, {0, 0}};
    int *da = NULL;
    int *dc = NULL;
    cudaError_t err;

    err = cudaMalloc((void**)&da, 2* 2*sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void**)&dc, 2* 2*sizeof(int));
    if (err == cudaSuccess) err = cudaMemcpy(da, a, 2 *2*sizeof(int), cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dc, res, 2 *2*sizeof(int), cudaMemcpyHostToDevice);

    dim3 grid(2, 2);    // one block per matrix element, one thread per block

    printf("before transpose: \n");
    for(int i = 0; i < 2; i++){
        for(int j = 0; j< 2; j++) {
            printf("%d ", a[i][j]);
        }
        printf("\n");
    }

    if (err == cudaSuccess) {
        trans<<<grid, 1>>>(da, dc, 2, 2);
        // Launch errors are only visible through cudaGetLastError().
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaMemcpy(res, dc, 2*2*sizeof(int), cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        printf("after transpose: \n");
        for(int i = 0; i < 2; i++){
            for(int j = 0; j< 2; j++) {
                printf("%d ", res[i][j]);
            }
            printf("\n");
        }
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(da);
    cudaFree(dc);
    return err == cudaSuccess ? 0 : 1;
}
// Host driver for the factorial example.
// Reads an integer, uploads it, launches Factorial (defined elsewhere) with
// 5 blocks of 5 threads, downloads the long result and prints it.
// The original downloaded the result but never displayed it and leaked both
// device buffers.
// Returns 0 on success, 1 on invalid input or if a CUDA call/launch fails.
int main()
{
    int Number;
    int *dev_number = NULL;
    long int *res = NULL, result = 0;
    cudaError_t err;

    system("clear");
    printf("\n\t Enter the number : ");
    if (scanf("%d",&Number) != 1) {
        fprintf(stderr, "\n\t Invalid number\n");
        return 1;
    }

    err = cudaMalloc((void**)&dev_number,sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void**)&res,sizeof(long int));
    if (err == cudaSuccess) err = cudaMemcpy(dev_number,&Number,sizeof(int),cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // NOTE(review): `res` is passed without being initialised on the
        // device, as in the original; confirm that the Factorial kernel
        // writes it before reading it.
        Factorial<<<5,5>>>(dev_number,res);
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaMemcpy(&result,res,sizeof(long int),cudaMemcpyDeviceToHost);

    if (err == cudaSuccess)
        printf("\n\t Factorial of %d is %ld\n", Number, result);
    else
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));

    cudaFree(dev_number);
    cudaFree(res);
    return err == cudaSuccess ? 0 : 1;
}
12. stencil
#include <cuda_runtime.h>
#include<stdio.h>
#include<sys/time.h>
#define RADIUS 4
#define BDIM 8
// Returns the current wall-clock time reported by gettimeofday() as a
// floating-point number of seconds (whole seconds plus microseconds).
double cpuSecond(){
    struct timeval now;
    gettimeofday(&now, NULL);
    const double whole = (double)now.tv_sec;
    const double micro = (double)now.tv_usec;
    return (whole + micro*1.e-6);
}
// Fills in[0..size-1] with the sequence 1, 2, 3, ... (in[k] = k + 1).
// Does nothing when size is zero or negative.
void initialData(float *in, const int size)
{
    int k = 0;
    while (k < size)
    {
        in[k] = k + 1;
        ++k;
    }
}
// Prints in[RADIUS] .. in[size-1] on one line, each value followed by a
// space, and ends the line. The first RADIUS entries are skipped.
void printData(float *in, const int size)
{
    int k = RADIUS;
    while (k < size)
    {
        printf("%f ", in[k]);
        ++k;
    }
    printf("\n");
}
// Host reference for the 1D stencil: for every i from RADIUS up to and
// including isize, writes to out[i] the weighted sum of the four symmetric
// neighbour differences in[i+k] - in[i-k], k = 1..4, using the coefficients
// a1..a4 (defined outside this block).
// Reads in[i-4] .. in[i+4], so `in` must be valid from index RADIUS-4
// through isize+4.
void cpu_stencil_1d (float *in, float *out, int isize)
{
    int centre = RADIUS;
    while (centre <= isize)
    {
        // View of the input centred on the current point.
        const float *w = in + centre;
        out[centre] = a1 * (w[1] - w[-1])
                    + a2 * (w[2] - w[-2])
                    + a3 * (w[3] - w[-3])
                    + a4 * (w[4] - w[-4]);
        ++centre;
    }
}
// 1D stencil using a shared-memory tile with halo cells (debug build: every
// step is traced with device printf).
//
// Each thread stages in[idx] in shared memory; the first RADIUS threads of
// the block also stage the RADIUS cells to the left and to the right of the
// block's range. Every thread then accumulates
//     coef[i] * (smem[sidx + i] - smem[sidx - i]),  i = 1..RADIUS
// and writes the sum to out[idx]. `coef` is defined outside this block.
//
// Preconditions visible from the indexing:
//   - blockDim.x must equal BDIM (tile size and the right-halo offset).
//   - `in` must be readable from idx-RADIUS to idx+BDIM for every thread.
//     NOTE(review): this presumably means the caller passes a pointer
//     already offset by RADIUS into a padded buffer - confirm at call site.
//   - N is not used; there is no bounds check, so the grid must cover the
//     data exactly.
//
// Fixes over the original text: a format string that was split across two
// lines (a compile error) is rejoined, and the trace labelled "in[idx]" now
// prints in[idx] instead of in[threadIdx.x].
__global__ void stencil_1d(float *in, float *out, int N)
{
    __shared__ float smem[BDIM + 2 * RADIUS];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Position in the tile, shifted right past the left halo.
    int sidx = threadIdx.x + RADIUS;

    smem[sidx] = in[idx];
    printf("\nsmem[%d]=in[%d] by %d, value is %f",sidx,idx,threadIdx.x,in[idx]);
    __syncthreads();

    if (threadIdx.x < RADIUS)
    {
        // Left and right halo cells for this block.
        smem[sidx - RADIUS] = in[idx - RADIUS];
        smem[sidx + BDIM] = in[idx + BDIM];
        printf("\nsmem[%d]=in[%d] by %d, value is %f",
               sidx-RADIUS,idx-RADIUS,threadIdx.x,in[idx-RADIUS]);
        printf("\nsmem[%d]=in[%d] by %d,value is %f",
               sidx+BDIM,idx+BDIM,threadIdx.x,in[idx+BDIM]);
    }
    // All shared writes must be visible before any thread reads neighbours.
    __syncthreads();

    float tmp = 0.0f;
    for (int i = 1; i <= RADIUS; i++)
    {
        tmp += coef[i] * (smem[sidx + i] - smem[sidx - i]);
    }
    out[idx] = tmp;

    printf("\nin[%d] is %f",idx,in[idx]);
    printf("\nout[%d] = %f by %d", idx,tmp,threadIdx.x);
}
14. quick sort, 15. max and min in an array, 16. second max and second min
17. kth largest element in an unsorted array
#include<cuda.h>
#include <stdio.h>
// For each thread t (t = threadIdx.x, t < size) stores in pos[t] the index
// that a[t] occupies when `a` is in ascending order.
//
// That index is the number of OTHER elements that are <= a[t]. This is what
// the original partition-around-a[t] computed (the final position of the
// pivot), so equal values all receive the same index: the highest slot of
// their group.
//
// Launch layout: a single 1D block with at least `size` threads; only
// threadIdx.x is used. `a` is read-only, `pos` needs `size` entries.
//
// Changes from the original:
//   - no per-thread device malloc (it was never freed and never checked for
//     NULL);
//   - correct result for size == 2, where the partition loop was skipped and
//     every thread reported position 1;
//   - threads with threadIdx.x >= size now exit instead of reading past `a`.
__global__ void part(int *a,int *pos,int size){
    int self = threadIdx.x;
    if (self >= size)
        return;

    int pivot = a[self];
    int rank = 0;
    for (int j = 0; j < size; j++) {
        // Count every element other than the pivot that sorts at or before it.
        if (j != self && a[j] <= pivot)
            rank++;
    }
    pos[self] = rank;
}
// Host driver for the position-based sort.
// Reads an array from stdin, asks `part` (one block of sz threads) for the
// sorted position of every element, scatters the elements into `fin` and
// prints the result in ascending order.
//
// Equal values share one position, which leaves some slots of `fin`
// untouched. Those slots keep the INT_MAX marker and are filled from their
// right-hand neighbour afterwards.
//
// Changes from the original:
//   - the fill pass starts at sz-2, so it can no longer read fin[sz] when
//     the last slot legitimately holds INT_MAX;
//   - input and allocations are validated, and the launch is checked (a
//     size that does not fit in one block is reported, not ignored);
//   - host and device buffers are released;
//   - a stray closing brace after the function is removed.
// Returns 0 on success, 1 on bad input, allocation failure or CUDA error.
int main(){
    int *a = NULL,*dev_a = NULL,*pos = NULL,*dev_pos = NULL,i,*fin = NULL,sz;
    int status = 1;
    cudaError_t err = cudaSuccess;

    printf("\n Enter the size of the array : ");
    if (scanf("%d",&sz) != 1 || sz < 1) {
        fprintf(stderr, "\n Invalid array size\n");
        return 1;
    }

    a = (int*) malloc(sizeof(int)*sz);
    pos = (int*) malloc(sizeof(int)*sz);
    fin = (int*) malloc(sizeof(int)*sz);
    if (a == NULL || pos == NULL || fin == NULL) {
        fprintf(stderr, "\n Out of host memory\n");
        free(a); free(pos); free(fin);
        return 1;
    }

    for(i = 0;i < sz;i++){
        printf("\n Enter the %dth element : ",i);
        if (scanf("%d",a + i) != 1) {
            fprintf(stderr, "\n Invalid element\n");
            free(a); free(pos); free(fin);
            return 1;
        }
        fin[i] = INT_MAX;   // marker for "slot not assigned yet"
    }

    err = cudaMalloc((void**)&dev_a,sizeof(int)*sz);
    if (err == cudaSuccess) err = cudaMalloc((void**)&dev_pos,sizeof(int)*sz);
    if (err == cudaSuccess) err = cudaMemcpy(dev_a,a,sz*sizeof(int),cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        part<<<1,sz>>>(dev_a,dev_pos,sz);
        // Reports, among other things, an sz larger than the block limit.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaMemcpy(pos,dev_pos,sz*sizeof(int),cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        for(i = 0; i < sz;i++) {
            // Guard the scatter against an out-of-range position.
            if (pos[i] >= 0 && pos[i] < sz)
                fin[pos[i]] = a[i];
        }
        // The last slot has no right neighbour, so start one before it.
        for(i = sz - 2;i >= 0;i--)
            if(fin[i] == INT_MAX)
                fin[i] = fin[i+1];
        for(i = 0;i < sz;i++)
            printf("\n %dth element : %d",i,*(fin+i));
        printf("\n\n");
        status = 0;
    } else {
        fprintf(stderr, "\n CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(dev_a);
    cudaFree(dev_pos);
    free(a);
    free(pos);
    free(fin);
    return status;
}
********************************************************
#include<stdio.h>
#include<cuda.h>
// Writes c[i] = i for every i in [0, n).
// Launch layout: 1D grid of 1D blocks; threads whose global index is >= n do
// nothing, so any launch with at least n threads works.
// The original guard was `tid <= n`, which also wrote c[n] - one element
// past a buffer of n ints.
__global__ void fill(int *c,int n)
{
    int tid=blockDim.x*blockIdx.x+threadIdx.x;
    if (tid<n)
    {
        c[tid]=tid;
    }
}
// Host driver for the reduction example.
// Reads n, fills a device array of n ints with `fill`, runs parallel_red
// (defined elsewhere) on it and prints element 0 of the result.
//
// Changes from the original: n and the allocations are validated, `fill` is
// launched with n threads instead of n*n, both launches are checked, and the
// host and device buffers are released.
// Returns 0 on success, 1 on bad input, allocation failure or CUDA error.
int main()
{
    int n;
    int *res = NULL;
    int *p = NULL;
    cudaError_t err;

    printf("enter n");
    if (scanf("%d",&n) != 1 || n < 1) {
        fprintf(stderr, "invalid n\n");
        return 1;
    }

    res=(int*)malloc(n*sizeof(int));
    if (res == NULL) {
        fprintf(stderr, "out of host memory\n");
        return 1;
    }

    err = cudaMalloc((void **) &p,n*sizeof(int));
    if (err == cudaSuccess) {
        // One thread per element is all `fill` needs.
        fill<<<1,n>>>(p,n);
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // NOTE(review): parallel_red is not visible here; its original
        // launch configuration is kept unchanged.
        parallel_red<<<n,n>>>(p);
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaMemcpy(res,p,n*sizeof(int),cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        printf("in main");
        printf("%d\n",res[0]);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(p);
    free(res);
    return err == cudaSuccess ? 0 : 1;
}
#include<stdio.h>
#include<cuda.h>
#define n 10
// Element-wise vector addition: c[i] = a[i] + b[i] for every global thread
// index i below n (a compile-time constant defined outside this block).
// Threads with an index of n or more do nothing.
__global__ void add(int *a,int *b,int *c)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n)
        return;
    c[gid] = a[gid] + b[gid];
}
// Host driver for the vector-addition example.
// Builds a[i] = i and b[i] = 5 for n elements, adds them on the device with
// `add` (2 blocks of 5 threads) and prints the sums one per line.
// Returns 0 on success, 1 if any CUDA call or the launch fails.
int main()
{
    int a[n],b[n],c[n];
    int *dev_a = NULL,*dev_b = NULL,*dev_c = NULL;
    cudaError_t err;

    for(int i=0;i<n;i++)
    {
        a[i]=i;
        b[i]=5;
    }

    err = cudaMalloc((void **)&dev_a,n*sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void **)&dev_b,n*sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void **)&dev_c,n*sizeof(int));
    if (err == cudaSuccess) err = cudaMemcpy(dev_a,a,n*sizeof(int),cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dev_b,b,n*sizeof(int),cudaMemcpyHostToDevice);
    // dev_c is output only, so nothing is uploaded into it.

    if (err == cudaSuccess) {
        add<<<2,5>>>(dev_a,dev_b,dev_c);
        // Launch errors are only visible through cudaGetLastError().
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaMemcpy(c,dev_c,n*sizeof(int),cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        printf("After the addition of two vectors \n");
        for(int i=0;i<n;i++)
        {
            printf("%d \n",c[i]);
        }
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Previously leaked; cudaFree is safe on NULL.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return err == cudaSuccess ? 0 : 1;
}
#include<stdio.h>
#include<cuda.h>
// Four-function calculator: the threads with global index 0, 1, 2 and 3
// store the sum, difference, product and integer quotient of *a and *b in
// res[0..3]. All other threads do nothing.
// The quotient is computed only by thread 3; *b is not checked for zero.
__global__ void calc(int *a,int *b,int *res)
{
    const int which = blockIdx.x * blockDim.x + threadIdx.x;
    switch (which)
    {
        case 0:
            res[which] = *a + *b;
            break;
        case 1:
            res[which] = *a - *b;
            break;
        case 2:
            res[which] = (*a) * (*b);
            break;
        case 3:
            res[which] = (*a) / (*b);
            break;
        default:
            break;
    }
}
// Host driver for the calculator example.
// Uploads a = 2 and b = 1, launches `calc` with 4 blocks of 4 threads (only
// global threads 0..3 write a result) and prints the four results back to
// back.
// Returns 0 on success, 1 if any CUDA call or the launch fails.
int main()
{
    int a=2;
    int b=1;
    int res[4];
    int *dev_a = NULL,*dev_b = NULL,*dev_c = NULL;
    cudaError_t err;

    err = cudaMalloc((void**)&dev_a,sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void**)&dev_b,sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void**)&dev_c,4*sizeof(int));
    if (err == cudaSuccess) err = cudaMemcpy(dev_a,&a,sizeof(int),cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dev_b,&b,sizeof(int),cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        calc<<<4,4>>>(dev_a,dev_b,dev_c);
        // Launch errors are only visible through cudaGetLastError().
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaMemcpy(res,dev_c,4*sizeof(int),cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        for(int i=0;i<4;i++)
        {
            printf("%d",res[i]);
        }
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Previously leaked; cudaFree is safe on NULL.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return err == cudaSuccess ? 0 : 1;
}