Last active
March 13, 2026 12:11
-
-
Save D-Lite/34834f3464a164a3290ac602f4ea2037 to your computer and use it in GitHub Desktop.
CUDA vector add
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| %%cuda | |
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
| // size of array | |
| #define N 1048576 | |
| // Kernel | |
// Kernel: element-wise vector addition, c[i] = a[i] + b[i] for i in [0, N).
//
// Uses a grid-stride loop so the result is correct for ANY grid/block
// configuration (the previous bounds-check-only form silently skipped the
// tail whenever the grid did not cover all N elements). Inputs are marked
// const __restrict__ so the compiler may use the read-only data cache;
// the launch-site interface is unchanged.
__global__ void add_vectors(const double* __restrict__ a,
                            const double* __restrict__ b,
                            double* __restrict__ c)
{
    size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t id = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
         id < N; id += stride) {
        c[id] = a[id] + b[id];
    }
}
// Host-side error-checking helper. Every CUDA runtime call returns a
// cudaError_t; an unchecked failure (e.g. cudaMalloc out of memory) would
// otherwise surface later as a confusing error in an unrelated call.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

// main program: allocate N-element vectors, compute C = A + B on the GPU,
// and verify every element of C equals 3.0 (1.0 + 2.0).
int main()
{
    // number of bytes to allocate for N doubles
    size_t bytes = N * sizeof(double);

    // Host arrays; check malloc so a failed allocation does not segfault
    // later in the fill loop.
    double *A = (double*)malloc(bytes);
    double *B = (double*)malloc(bytes);
    double *C = (double*)malloc(bytes);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        exit(1);
    }

    // Allocate memory for arrays d_A, d_B, and d_C on device
    double *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, bytes));
    CUDA_CHECK(cudaMalloc(&d_B, bytes));
    CUDA_CHECK(cudaMalloc(&d_C, bytes));

    // fill host arrays A and B
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
        B[i] = 2.0;
    }

    // copy data from host arrays A and B to device arrays d_A and d_B
    CUDA_CHECK(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice));

    // Set execution configuration parameters. Integer ceil-division replaces
    // the float ceil() round-trip, which is both the idiom and exact.
    int thr_per_blk = 256;                                  // CUDA threads per block
    int blk_in_grid = (N + thr_per_blk - 1) / thr_per_blk;  // ceil(N / thr_per_blk)

    // Launch kernel. Launches are asynchronous and return no error directly:
    // cudaGetLastError() catches bad launch configurations, and the
    // synchronize surfaces any in-kernel fault before we read results.
    add_vectors<<<blk_in_grid, thr_per_blk>>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // copy data from device array d_C to host array C
    CUDA_CHECK(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost));

    // verify results: every element should be exactly 1.0 + 2.0 = 3.0,
    // checked with a tolerance as is proper for floating point
    double tolerance = 1.0e-14;
    for (int i = 0; i < N; i++) {
        if (fabs(C[i] - 3.0) > tolerance) {
            printf("\nError: value of C[%d] = %f instead of 3.0\n\n", i, C[i]);
            exit(1);
        }
    }

    // free CPU memory
    free(A);
    free(B);
    free(C);

    // free GPU memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    printf("\n---------------------------\n");
    printf("__SUCCESS__\n");
    printf("---------------------------");
    printf("\nN = %d", N);
    printf("\nThreads Per Block = %d", thr_per_blk);
    printf("\nBlocks In Grid = %d", blk_in_grid);
    printf("\n---------------------------\n\n");

    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment