Last active
March 13, 2026 12:11
-
-
Save D-Lite/34834f3464a164a3290ac602f4ea2037 to your computer and use it in GitHub Desktop.
CUDA vector add
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| %%cuda | |
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
| // size of array | |
| #define N 1048576 | |
| // Kernel | |
// Kernel: element-wise vector addition, c[i] = a[i] + b[i] for i in [0, N).
//
// Uses a grid-stride loop so the result is correct for ANY grid/block
// configuration (the previous bounds-check-only form silently skipped the
// tail whenever the grid did not cover all N elements). Inputs are marked
// const __restrict__ so the compiler may use the read-only data cache;
// the launch-site interface is unchanged.
__global__ void add_vectors(const double* __restrict__ a,
                            const double* __restrict__ b,
                            double* __restrict__ c)
{
    size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t id = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
         id < N; id += stride) {
        c[id] = a[id] + b[id];
    }
}
// Host-side error-checking helper. Every CUDA runtime call returns a
// cudaError_t; an unchecked failure (e.g. cudaMalloc out of memory) would
// otherwise surface later as a confusing error in an unrelated call.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

// main program: allocate N-element vectors, compute C = A + B on the GPU,
// and verify every element of C equals 3.0 (1.0 + 2.0).
int main()
{
    // number of bytes to allocate for N doubles
    size_t bytes = N * sizeof(double);

    // Host arrays; check malloc so a failed allocation does not segfault
    // later in the fill loop.
    double *A = (double*)malloc(bytes);
    double *B = (double*)malloc(bytes);
    double *C = (double*)malloc(bytes);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        exit(1);
    }

    // Allocate memory for arrays d_A, d_B, and d_C on device
    double *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, bytes));
    CUDA_CHECK(cudaMalloc(&d_B, bytes));
    CUDA_CHECK(cudaMalloc(&d_C, bytes));

    // fill host arrays A and B
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
        B[i] = 2.0;
    }

    // copy data from host arrays A and B to device arrays d_A and d_B
    CUDA_CHECK(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice));

    // Set execution configuration parameters. Integer ceil-division replaces
    // the float ceil() round-trip, which is both the idiom and exact.
    int thr_per_blk = 256;                                  // CUDA threads per block
    int blk_in_grid = (N + thr_per_blk - 1) / thr_per_blk;  // ceil(N / thr_per_blk)

    // Launch kernel. Launches are asynchronous and return no error directly:
    // cudaGetLastError() catches bad launch configurations, and the
    // synchronize surfaces any in-kernel fault before we read results.
    add_vectors<<<blk_in_grid, thr_per_blk>>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // copy data from device array d_C to host array C
    CUDA_CHECK(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost));

    // verify results: every element should be exactly 1.0 + 2.0 = 3.0,
    // checked with a tolerance as is proper for floating point
    double tolerance = 1.0e-14;
    for (int i = 0; i < N; i++) {
        if (fabs(C[i] - 3.0) > tolerance) {
            printf("\nError: value of C[%d] = %f instead of 3.0\n\n", i, C[i]);
            exit(1);
        }
    }

    // free CPU memory
    free(A);
    free(B);
    free(C);

    // free GPU memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    printf("\n---------------------------\n");
    printf("__SUCCESS__\n");
    printf("---------------------------");
    printf("\nN = %d", N);
    printf("\nThreads Per Block = %d", thr_per_blk);
    printf("\nBlocks In Grid = %d", blk_in_grid);
    printf("\n---------------------------\n\n");

    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment