Skip to content

Instantly share code, notes, and snippets.

@D-Lite
Last active March 13, 2026 12:11
Show Gist options
  • Select an option

  • Save D-Lite/34834f3464a164a3290ac602f4ea2037 to your computer and use it in GitHub Desktop.

Select an option

Save D-Lite/34834f3464a164a3290ac602f4ea2037 to your computer and use it in GitHub Desktop.
CUDA vector add
%%cuda
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// Element count of each vector (2^20 = 1048576 doubles per array)
#define N (1 << 20)
// Kernel: element-wise vector sum, c[i] = a[i] + b[i] for every i in [0, N).
//
// Each thread handles at most one element, selected by its flat index built
// from the x components of blockDim, blockIdx and threadIdx. Only the x
// dimension is consulted, so a 1D launch with at least N total threads is
// presumably intended -- confirm against the launch site.
//
// a, b : input arrays, read at index i
// c    : output array, written at index i
// All three pointers are dereferenced in device code and must reference at
// least N doubles each.
__global__ void add_vectors(double *a, double *b, double *c)
{
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;

    // Threads past the end of the arrays (tail of the last block) do nothing.
    if (gid >= N) {
        return;
    }

    c[gid] = a[gid] + b[gid];
}
// Abort with a diagnostic on stderr when a CUDA runtime call reports failure.
// status : return value of the runtime call being checked
// what   : short label identifying the call, included in the message
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// main program
//
// Fills two host arrays of N doubles with 1.0 and 2.0, adds them on the GPU
// with add_vectors, copies the result back and verifies every element equals
// 3.0 within a small tolerance.
//
// Returns 0 on success, 1 when verification fails. Exits with EXIT_FAILURE
// if a host allocation or any CUDA runtime call fails.
int main()
{
    // number of bytes to allocate for N doubles
    size_t bytes = N * sizeof(double);

    // Allocate host arrays; malloc may fail, so check before writing to them
    double *A = (double*)malloc(bytes);
    double *B = (double*)malloc(bytes);
    double *C = (double*)malloc(bytes);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(A);  // free(NULL) is a no-op, so partial failure is handled too
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    // Allocate memory for arrays d_A, d_B, and d_C on device
    double *d_A = NULL, *d_B = NULL, *d_C = NULL;
    check_cuda(cudaMalloc(&d_A, bytes), "cudaMalloc(d_A)");
    check_cuda(cudaMalloc(&d_B, bytes), "cudaMalloc(d_B)");
    check_cuda(cudaMalloc(&d_C, bytes), "cudaMalloc(d_C)");

    // fill host arrays A and B
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
        B[i] = 2.0;
    }

    // copy data from host arrays A and B to device arrays d_A and d_B
    check_cuda(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(A -> d_A)");
    check_cuda(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(B -> d_B)");

    // set execution configuration parameters
    int thr_per_blk = 256;  // number of CUDA threads per grid block
    // Integer ceil-division: exact for any N, unlike float(N), which loses
    // precision once N exceeds 2^24.
    int blk_in_grid = (N + thr_per_blk - 1) / thr_per_blk;  // number of blocks in grid

    // launch kernel
    add_vectors<<<blk_in_grid, thr_per_blk>>>(d_A, d_B, d_C);
    // A launch returns no status itself; configuration errors are reported here.
    check_cuda(cudaGetLastError(), "add_vectors launch");

    // copy data from device array d_C to host array C
    // (this blocking copy also surfaces errors raised while the kernel ran)
    check_cuda(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_C -> C)");

    // verify results; remember the first bad index so cleanup still runs
    double tolerance = 1.0e-14;
    int first_bad = -1;
    for (int i = 0; i < N; i++) {
        if (fabs(C[i] - 3.0) > tolerance) {
            first_bad = i;
            break;
        }
    }
    if (first_bad >= 0) {
        // Report before C is freed below.
        printf("\nError: value of C[%d] = %f instead of 3.0\n\n", first_bad, C[first_bad]);
    }

    // free CPU memory
    free(A);
    free(B);
    free(C);

    // free GPU memory
    check_cuda(cudaFree(d_A), "cudaFree(d_A)");
    check_cuda(cudaFree(d_B), "cudaFree(d_B)");
    check_cuda(cudaFree(d_C), "cudaFree(d_C)");

    if (first_bad >= 0) {
        return 1;
    }

    printf("\n---------------------------\n");
    printf("__SUCCESS__\n");
    printf("---------------------------");
    printf("\nN = %d", N);
    printf("\nThreads Per Block = %d", thr_per_blk);
    printf("\nBlocks In Grid = %d", blk_in_grid);
    printf("\n---------------------------\n\n");
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment