Skip to content

Instantly share code, notes, and snippets.

@raymondtay
Last active February 22, 2026 03:25
Show Gist options
  • Select an option

  • Save raymondtay/9915510013d22a5ac890652eedf9af03 to your computer and use it in GitHub Desktop.

Select an option

Save raymondtay/9915510013d22a5ac890652eedf9af03 to your computer and use it in GitHub Desktop.
Gaussian Blur blogpost
/*
* Separable Gaussian Blur - C++ with CUDA Implementation
*
* This is a C++ wrapper around CUDA kernels providing:
* - Object-oriented interface
* - RAII memory management
* - Support for multiple data types (float, uchar)
* - Easy-to-use API
*/
#ifndef GAUSSIAN_BLUR_HPP
#define GAUSSIAN_BLUR_HPP
#include <cuda_runtime.h>
#include <vector>
#include <memory>
#include <stdexcept>
#include <cmath>
#include <iostream>
// CUDA kernel declarations
// CUDA kernel declarations
// Host-side launch wrappers implemented in the kernels translation unit.
// extern "C" suppresses C++ name mangling so these prototypes link against
// the nvcc-compiled definitions.
extern "C" {
// Launches the horizontal blur pass. All float pointers are DEVICE pointers.
void launchGaussianBlurHorizontal(
    const float* input,    // device image, width*height floats, row-major
    float* output,         // device buffer receiving the blurred row pass
    const float* kernel,   // device 1-D Gaussian weights, 2*radius+1 floats
    int width,
    int height,
    int radius,
    dim3 gridSize,         // forwarded verbatim to the <<<...>>> launch
    dim3 blockSize,
    int sharedMemSize      // dynamic shared-memory bytes for the launch
);
// Launches the vertical blur pass; same contract as the horizontal wrapper.
void launchGaussianBlurVertical(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
);
}
namespace gpu {
// RAII wrapper for CUDA device memory
// RAII wrapper for CUDA device memory.
// Owns a device allocation of `size_` elements of T; move-only so a single
// owner frees the allocation exactly once.
template<typename T>
class DeviceBuffer {
private:
    T* d_ptr_ = nullptr;   // device pointer; nullptr when empty or moved-from
    size_t size_ = 0;      // element count (not bytes)
public:
    // Allocates `size` elements on the device.
    // Throws std::runtime_error (with the CUDA error string) on failure.
    explicit DeviceBuffer(size_t size) : size_(size) {
        cudaError_t err = cudaMalloc(&d_ptr_, size * sizeof(T));
        if (err != cudaSuccess) {
            d_ptr_ = nullptr;  // keep the object in a safe, freeable state
            throw std::runtime_error("CUDA malloc failed: " +
                std::string(cudaGetErrorString(err)));
        }
    }
    ~DeviceBuffer() {
        if (d_ptr_) {
            cudaFree(d_ptr_);  // destructors must not throw; error ignored
        }
    }
    // Delete copy constructor and assignment: the buffer is uniquely owned.
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
    // Move constructor: takes ownership and leaves `other` empty
    // (previously `other.size_` was left stale after a move).
    DeviceBuffer(DeviceBuffer&& other) noexcept
        : d_ptr_(other.d_ptr_), size_(other.size_) {
        other.d_ptr_ = nullptr;
        other.size_ = 0;
    }
    // Move assignment: frees any held allocation, then takes ownership.
    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept {
        if (this != &other) {
            if (d_ptr_) cudaFree(d_ptr_);
            d_ptr_ = other.d_ptr_;
            size_ = other.size_;
            other.d_ptr_ = nullptr;
            other.size_ = 0;
        }
        return *this;
    }
    T* get() { return d_ptr_; }
    const T* get() const { return d_ptr_; }
    size_t size() const { return size_; }
    // Copies size() elements from host to device; throws on failure.
    void copyToDevice(const T* host_data) {
        cudaError_t err = cudaMemcpy(d_ptr_, host_data,
            size_ * sizeof(T), cudaMemcpyHostToDevice);
        if (err != cudaSuccess) {
            throw std::runtime_error("Copy to device failed: " +
                std::string(cudaGetErrorString(err)));
        }
    }
    // Copies size() elements from device to host; throws on failure.
    void copyToHost(T* host_data) const {
        cudaError_t err = cudaMemcpy(host_data, d_ptr_,
            size_ * sizeof(T), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            throw std::runtime_error("Copy to host failed: " +
                std::string(cudaGetErrorString(err)));
        }
    }
};
// Gaussian blur class
// Host-side driver for the separable Gaussian blur.
// Owns the device-resident kernel weights and the intermediate buffer, and
// launches the horizontal then vertical passes declared above.
class GaussianBlur {
private:
    int width_;    // image width in pixels
    int height_;   // image height in pixels
    int radius_;   // kernel radius; kernel length is 2*radius_+1
    float sigma_;  // Gaussian standard deviation
    std::vector<float> kernel_;                      // normalized host-side weights
    std::unique_ptr<DeviceBuffer<float>> d_kernel_;  // weights on the device
    std::unique_ptr<DeviceBuffer<float>> d_temp_;    // horizontal-pass output

    // Builds the normalized 1-D Gaussian: w(i) = exp(-i^2 / (2*sigma^2)),
    // scaled so the weights sum to 1 (blur preserves overall brightness).
    void generateKernel() {
        int kernelSize = 2 * radius_ + 1;
        kernel_.resize(kernelSize);
        float sum = 0.0f;
        for (int i = -radius_; i <= radius_; i++) {
            float value = std::exp(-(i * i) / (2.0f * sigma_ * sigma_));
            kernel_[i + radius_] = value;
            sum += value;
        }
        // Normalize
        for (auto& val : kernel_) {
            val /= sum;
        }
    }
public:
    // Validates parameters, builds the kernel, and allocates device storage.
    // Throws std::invalid_argument for bad parameters and std::runtime_error
    // if a CUDA allocation or copy fails.
    GaussianBlur(int width, int height, int radius, float sigma)
        : width_(width), height_(height), radius_(radius), sigma_(sigma) {
        if (width <= 0 || height <= 0) {
            throw std::invalid_argument("Image dimensions must be positive");
        }
        if (radius <= 0 || radius > 50) {
            throw std::invalid_argument("Radius must be between 1 and 50");
        }
        if (sigma <= 0.0f) {
            throw std::invalid_argument("Sigma must be positive");
        }
        // Generate Gaussian kernel
        generateKernel();
        // Allocate device memory for the weights and upload them
        d_kernel_ = std::make_unique<DeviceBuffer<float>>(kernel_.size());
        d_kernel_->copyToDevice(kernel_.data());
        // Temporary buffer for the intermediate (horizontally blurred) image;
        // size_t arithmetic avoids int overflow for very large images
        d_temp_ = std::make_unique<DeviceBuffer<float>>(
            static_cast<size_t>(width) * height);
    }

    // Blurs h_input (width*height floats, row-major) into h_output.
    // Both are HOST pointers; throws std::runtime_error on any CUDA failure,
    // including asynchronous kernel-execution errors (previously the
    // cudaDeviceSynchronize result was silently dropped).
    void apply(const float* h_input, float* h_output) {
        // Per-call device buffers, released automatically via RAII
        DeviceBuffer<float> d_input(static_cast<size_t>(width_) * height_);
        DeviceBuffer<float> d_output(static_cast<size_t>(width_) * height_);
        d_input.copyToDevice(h_input);
        // 16x16 threads per block; ceil-division grid covers edge tiles
        const int blockSize = 16;
        dim3 blockDim(blockSize, blockSize);
        dim3 gridDim(
            (width_ + blockSize - 1) / blockSize,
            (height_ + blockSize - 1) / blockSize
        );
        // NOTE(review): this covers a single row (blockDim.x + 2*radius
        // floats) of one block — confirm it matches the shared-memory layout
        // the kernels actually use for a 16x16 block.
        int sharedMemSize = (blockSize + 2 * radius_) * sizeof(float);
        // Horizontal pass: d_input -> d_temp_
        launchGaussianBlurHorizontal(
            d_input.get(),
            d_temp_->get(),
            d_kernel_->get(),
            width_,
            height_,
            radius_,
            gridDim,
            blockDim,
            sharedMemSize
        );
        // Launch-configuration errors surface here, not from the launch itself
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            throw std::runtime_error("Horizontal kernel launch failed: " +
                std::string(cudaGetErrorString(err)));
        }
        // Vertical pass: d_temp_ -> d_output
        launchGaussianBlurVertical(
            d_temp_->get(),
            d_output.get(),
            d_kernel_->get(),
            width_,
            height_,
            radius_,
            gridDim,
            blockDim,
            sharedMemSize
        );
        err = cudaGetLastError();
        if (err != cudaSuccess) {
            throw std::runtime_error("Vertical kernel launch failed: " +
                std::string(cudaGetErrorString(err)));
        }
        // Block until both passes finish and surface async execution errors
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            throw std::runtime_error("Kernel execution failed: " +
                std::string(cudaGetErrorString(err)));
        }
        // Copy result back to host
        d_output.copyToHost(h_output);
    }

    // Getters
    int getWidth() const { return width_; }
    int getHeight() const { return height_; }
    int getRadius() const { return radius_; }
    float getSigma() const { return sigma_; }
    const std::vector<float>& getKernel() const { return kernel_; }

    // Prints the normalized kernel coefficients to stdout (debug aid).
    void printKernel() const {
        std::cout << "Gaussian kernel (radius=" << radius_
                  << ", sigma=" << sigma_ << "):\n";
        for (size_t i = 0; i < kernel_.size(); i++) {
            std::cout << kernel_[i] << " ";
        }
        std::cout << std::endl;
    }
};
} // namespace gpu
#endif // GAUSSIAN_BLUR_HPP
/*
* Separable Gaussian Blur - Pure CUDA Implementation
*
* This implementation uses:
* - Shared memory for efficient data reuse
* - Separate kernels for horizontal and vertical passes
* - Optimized memory access patterns
*/
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <math.h>
// Compile-time blur configuration for the standalone demo
#define KERNEL_RADIUS 5
#define KERNEL_SIZE (2 * KERNEL_RADIUS + 1)
#define BLOCK_SIZE 16
// Error checking macro
// Wraps a CUDA runtime call: on failure prints file/line plus the error
// string and terminates. Kernel launches return void, so use
// CUDA_CHECK(cudaGetLastError()) immediately after a launch instead.
#define CUDA_CHECK(call) \
do { \
cudaError_t error = call; \
if (error != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(error)); \
exit(EXIT_FAILURE); \
} \
} while(0)
/*
* Generate 1D Gaussian kernel
* Formula: G(x) = (1/sqrt(2*pi*sigma^2)) * exp(-x^2 / (2*sigma^2))
*/
// Fills kernel[0 .. 2*radius] with normalized 1-D Gaussian weights:
// w(i) = exp(-i^2 / (2*sigma^2)), scaled so the weights sum to 1.
__host__ void generateGaussianKernel(float* kernel, int radius, float sigma) {
    const int size = 2 * radius + 1;
    const float denom = 2.0f * sigma * sigma;
    float total = 0.0f;
    // Weight for tap `idx` is centred at offset idx - radius
    for (int idx = 0; idx < size; idx++) {
        const int offset = idx - radius;
        const float w = expf(-(offset * offset) / denom);
        kernel[idx] = w;
        total += w;
    }
    // Normalize so the filter preserves overall brightness
    for (int idx = 0; idx < size; idx++) {
        kernel[idx] /= total;
    }
}
/*
* Horizontal Gaussian blur kernel
* Each thread processes one pixel, reading from shared memory
*/
/*
 * Horizontal Gaussian blur pass.
 * One thread per output pixel on a 2D launch grid; out-of-image threads
 * exit early. Neighbours are read directly from global memory with
 * clamp-to-edge addressing at the borders.
 *
 * Fix: the previous version staged data in a single row-sized
 * `extern __shared__` array, but the block is 2D (16x16), so all 16
 * threadIdx.y rows raced writing different pixels into the same shared
 * slots; the halo load also broke whenever radius > blockDim.x (radius
 * may be up to 50 with 16-wide blocks). Reading straight from global
 * memory is race-free for any radius; the dynamic shared memory passed
 * at launch is simply unused.
 */
__global__ void gaussianBlurHorizontal(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    const float* row = input + (size_t)y * width;
    float sum = 0.0f;
    for (int k = -radius; k <= radius; k++) {
        int sx = x + k;
        if (sx < 0) sx = 0;                    // clamp-to-edge (left)
        else if (sx >= width) sx = width - 1;  // clamp-to-edge (right)
        sum += row[sx] * kernel[k + radius];
    }
    output[(size_t)y * width + x] = sum;
}
/*
* Vertical Gaussian blur kernel
* Each thread processes one pixel, reading columns
*/
/*
 * Vertical Gaussian blur pass.
 * One thread per output pixel on a 2D launch grid; out-of-image threads
 * exit early. Neighbours are read directly from global memory with
 * clamp-to-edge addressing at the top/bottom borders.
 *
 * Fix: the previous version staged data in a single column-sized
 * `extern __shared__` array, but the block is 2D (16x16), so all 16
 * threadIdx.x columns raced writing different pixels into the same
 * shared slots; the halo load also broke whenever radius > blockDim.y.
 * Direct global reads are race-free for any radius; the dynamic shared
 * memory passed at launch is simply unused.
 */
__global__ void gaussianBlurVertical(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float sum = 0.0f;
    for (int k = -radius; k <= radius; k++) {
        int sy = y + k;
        if (sy < 0) sy = 0;                      // clamp-to-edge (top)
        else if (sy >= height) sy = height - 1;  // clamp-to-edge (bottom)
        sum += input[(size_t)sy * width + x] * kernel[k + radius];
    }
    output[(size_t)y * width + x] = sum;
}
/*
* Apply separable Gaussian blur
*/
/*
 * Runs the two blur passes back to back: horizontal (d_input -> d_temp),
 * then vertical (d_temp -> d_output). All pointers are DEVICE pointers;
 * every image buffer must hold width*height floats and d_kernel holds
 * 2*radius+1 normalized weights. Blocks until both passes complete.
 */
void applySeparableGaussianBlur(
    const float* d_input,
    float* d_output,
    float* d_temp,
    const float* d_kernel,
    int width,
    int height,
    int radius
) {
    dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
    // Ceil-division so partially covered edge tiles still get a block
    dim3 gridSize(
        (width + blockSize.x - 1) / blockSize.x,
        (height + blockSize.y - 1) / blockSize.y
    );
    // Shared memory size (block size + 2 * radius for halo)
    // NOTE(review): this covers only a single row/column, yet the block is
    // 16x16 — verify the kernels' shared-memory scheme matches this sizing.
    int sharedMemSize = (BLOCK_SIZE + 2 * radius) * sizeof(float);
    // Horizontal pass
    gaussianBlurHorizontal<<<gridSize, blockSize, sharedMemSize>>>(
        d_input, d_temp, d_kernel, width, height, radius
    );
    // Launch-configuration errors surface via cudaGetLastError, not the launch
    CUDA_CHECK(cudaGetLastError());
    // Vertical pass
    gaussianBlurVertical<<<gridSize, blockSize, sharedMemSize>>>(
        d_temp, d_output, d_kernel, width, height, radius
    );
    CUDA_CHECK(cudaGetLastError());
    // Wait for completion; surfaces asynchronous execution errors
    CUDA_CHECK(cudaDeviceSynchronize());
}
/*
* Main function demonstrating usage
*/
/*
 * Main function demonstrating usage.
 * Builds a random 1920x1080 test image, blurs it on the GPU, and reports
 * event-based timing, throughput, and a few sample output pixels.
 * Returns 0 on success, EXIT_FAILURE on allocation failure (CUDA errors
 * terminate via CUDA_CHECK).
 */
int main() {
    // Image dimensions
    const int width = 1920;
    const int height = 1080;
    const int imageSize = width * height * sizeof(float);
    // Gaussian parameters
    const int radius = KERNEL_RADIUS;
    const float sigma = 2.0f;
    const int kernelSize = 2 * radius + 1;
    printf("Separable Gaussian Blur (CUDA)\n");
    printf("Image size: %d x %d\n", width, height);
    printf("Kernel radius: %d (size: %d)\n", radius, kernelSize);
    printf("Sigma: %.2f\n\n", sigma);
    // Allocate host memory — checked: malloc can fail for ~8 MB buffers
    // (the original dereferenced the pointers unconditionally)
    float* h_input = (float*)malloc(imageSize);
    float* h_output = (float*)malloc(imageSize);
    float* h_kernel = (float*)malloc(kernelSize * sizeof(float));
    if (!h_input || !h_output || !h_kernel) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_input);
        free(h_output);
        free(h_kernel);
        return EXIT_FAILURE;
    }
    // Initialize input with test pattern (unseeded rand(): deterministic)
    for (int i = 0; i < width * height; i++) {
        h_input[i] = (float)(rand() % 256) / 255.0f;
    }
    // Generate Gaussian kernel
    generateGaussianKernel(h_kernel, radius, sigma);
    printf("Gaussian kernel coefficients:\n");
    for (int i = 0; i < kernelSize; i++) {
        printf("%.6f ", h_kernel[i]);
    }
    printf("\n\n");
    // Allocate device memory
    float *d_input, *d_output, *d_temp, *d_kernel;
    CUDA_CHECK(cudaMalloc(&d_input, imageSize));
    CUDA_CHECK(cudaMalloc(&d_output, imageSize));
    CUDA_CHECK(cudaMalloc(&d_temp, imageSize));
    CUDA_CHECK(cudaMalloc(&d_kernel, kernelSize * sizeof(float)));
    // Copy data to device
    CUDA_CHECK(cudaMemcpy(d_input, h_input, imageSize, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_kernel, h_kernel, kernelSize * sizeof(float), cudaMemcpyHostToDevice));
    // Create CUDA events for device-side timing
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    // Warm-up run (absorbs one-time JIT/cache costs, excluded from timing)
    applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius);
    // Timed run
    CUDA_CHECK(cudaEventRecord(start));
    applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius);
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Execution time: %.3f ms\n", milliseconds);
    printf("Throughput: %.2f Mpixels/sec\n", (width * height / 1e6) / (milliseconds / 1000.0f));
    // Copy result back to host
    CUDA_CHECK(cudaMemcpy(h_output, d_output, imageSize, cudaMemcpyDeviceToHost));
    // Verify result (spot-check five pixels from the centre row)
    printf("\nSample output values:\n");
    for (int i = 0; i < 5; i++) {
        int idx = (height / 2) * width + (width / 2) + i;
        printf("Pixel[%d][%d]: %.6f\n", height / 2, width / 2 + i, h_output[idx]);
    }
    // Cleanup
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaFree(d_temp));
    CUDA_CHECK(cudaFree(d_kernel));
    free(h_input);
    free(h_output);
    free(h_kernel);
    printf("\nGaussian blur completed successfully!\n");
    return 0;
}
/*
* CUDA Kernels Implementation
* Separable Gaussian Blur
*/
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
/*
* Horizontal Gaussian blur kernel with shared memory optimization
*/
/*
 * Horizontal Gaussian blur kernel.
 * One thread per output pixel on a 2D launch grid; out-of-image threads
 * exit early. Neighbour pixels are read directly from global memory
 * (__restrict__ enables the read-only data cache) with clamp-to-edge
 * addressing at the borders.
 *
 * Fix: the previous shared-memory staging used one row-sized array for
 * the whole 2D block, so threads with different threadIdx.y raced on the
 * same slots (and the halo load broke when radius > blockDim.x). The
 * dynamic shared memory supplied at launch is now unused.
 */
__global__ void gaussianBlurHorizontalKernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const float* __restrict__ kernel,
    int width,
    int height,
    int radius
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    const float* __restrict__ row = input + (size_t)y * width;
    float sum = 0.0f;
    for (int k = -radius; k <= radius; k++) {
        int sx = x + k;
        if (sx < 0) sx = 0;                    // clamp-to-edge (left)
        else if (sx >= width) sx = width - 1;  // clamp-to-edge (right)
        sum += row[sx] * kernel[k + radius];
    }
    output[(size_t)y * width + x] = sum;
}
/*
* Vertical Gaussian blur kernel with shared memory optimization
*/
/*
 * Vertical Gaussian blur kernel.
 * One thread per output pixel on a 2D launch grid; out-of-image threads
 * exit early. Neighbour pixels are read directly from global memory
 * (__restrict__ enables the read-only data cache) with clamp-to-edge
 * addressing at the top/bottom borders.
 *
 * Fix: the previous shared-memory staging used one column-sized array for
 * the whole 2D block, so threads with different threadIdx.x raced on the
 * same slots (and the halo load broke when radius > blockDim.y). The
 * dynamic shared memory supplied at launch is now unused.
 */
__global__ void gaussianBlurVerticalKernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const float* __restrict__ kernel,
    int width,
    int height,
    int radius
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float sum = 0.0f;
    for (int k = -radius; k <= radius; k++) {
        int sy = y + k;
        if (sy < 0) sy = 0;                      // clamp-to-edge (top)
        else if (sy >= height) sy = height - 1;  // clamp-to-edge (bottom)
        sum += input[(size_t)sy * width + x] * kernel[k + radius];
    }
    output[(size_t)y * width + x] = sum;
}
/*
* C-style wrapper functions for C++ code
*/
/*
 * C-style wrapper functions for C++ code.
 * Thin launch shims: they forward the caller-supplied grid/block/shared
 * memory configuration verbatim and do NOT check for launch errors —
 * callers are expected to call cudaGetLastError() after each wrapper
 * (as GaussianBlur::apply does).
 */
extern "C" {
// Launches the horizontal pass; all float pointers are device pointers.
void launchGaussianBlurHorizontal(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
) {
    gaussianBlurHorizontalKernel<<<gridSize, blockSize, sharedMemSize>>>(
        input, output, kernel, width, height, radius
    );
}
// Launches the vertical pass; same contract as the horizontal wrapper.
void launchGaussianBlurVertical(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
) {
    gaussianBlurVerticalKernel<<<gridSize, blockSize, sharedMemSize>>>(
        input, output, kernel, width, height, radius
    );
}
} // extern "C"
/*
* Main Application - C++ with CUDA Gaussian Blur
*
* Demonstrates usage of the GaussianBlur class
*/
#include "gaussian_blur.hpp"
#include <iostream>
#include <vector>
#include <chrono>
#include <random>
#include <iomanip>
// Simple image loader/saver simulation
// Simple image loader/saver simulation.
// In-memory grayscale image stored as row-major floats in [0, 1].
class Image {
private:
    int cols_;                   // width in pixels
    int rows_;                   // height in pixels
    std::vector<float> pixels_;  // rows_ * cols_ values, row-major
public:
    Image(int width, int height)
        : cols_(width), rows_(height), pixels_(width * height) {}

    // Fills every pixel with a uniform random value in [0, 1].
    void fillRandom() {
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_real_distribution<float> dis(0.0f, 1.0f);
        for (auto& pixel : pixels_) {
            pixel = dis(gen);
        }
    }

    // Fills the image with a 32x32-tile checkerboard of 0.0 / 1.0 values.
    void fillTestPattern() {
        for (int row = 0; row < rows_; row++) {
            const bool rowBit = ((row / 32) % 2) != 0;
            for (int col = 0; col < cols_; col++) {
                const bool colBit = ((col / 32) % 2) != 0;
                pixels_[row * cols_ + col] = (rowBit != colBit) ? 1.0f : 0.0f;
            }
        }
    }

    float* data() { return pixels_.data(); }
    const float* data() const { return pixels_.data(); }
    int width() const { return cols_; }
    int height() const { return rows_; }
    size_t size() const { return pixels_.size(); }

    // Bounds-checked read: coordinates outside the image read as 0.
    float getPixel(int x, int y) const {
        const bool inside = x >= 0 && x < cols_ && y >= 0 && y < rows_;
        return inside ? pixels_[y * cols_ + x] : 0.0f;
    }

    // Prints a sizeX-by-sizeY window starting at (startX, startY),
    // truncated at the image borders.
    void printRegion(int startX, int startY, int sizeX, int sizeY) const {
        std::cout << std::fixed << std::setprecision(3);
        for (int row = startY; row < startY + sizeY && row < rows_; row++) {
            for (int col = startX; col < startX + sizeX && col < cols_; col++) {
                std::cout << getPixel(col, row) << " ";
            }
            std::cout << std::endl;
        }
    }
};
// Benchmark helper
// Benchmark helper: millisecond stopwatch over the high-resolution clock.
class Timer {
private:
    std::chrono::high_resolution_clock::time_point begin_;
public:
    // Records the current instant as the reference point.
    void start() {
        begin_ = std::chrono::high_resolution_clock::now();
    }
    // Milliseconds elapsed since the last start() call.
    double elapsed() const {
        using ms = std::chrono::duration<double, std::milli>;
        return ms(std::chrono::high_resolution_clock::now() - begin_).count();
    }
};
// Run benchmark
// Runs `iterations` timed blur.apply() calls (after one untimed warm-up)
// and prints average/min/max latency plus throughput in Mpixels/sec.
// `iterations` is clamped to at least 1: the original indexed times[0]
// unconditionally, which was undefined behaviour for iterations <= 0.
void runBenchmark(gpu::GaussianBlur& blur, const Image& input,
                  Image& output, int iterations = 10) {
    if (iterations < 1) iterations = 1;  // guard against an empty sample set
    Timer timer;
    std::vector<double> times;
    times.reserve(iterations);
    // Warm-up run (excluded from statistics)
    blur.apply(input.data(), output.data());
    // Benchmark runs
    for (int i = 0; i < iterations; i++) {
        timer.start();
        blur.apply(input.data(), output.data());
        times.push_back(timer.elapsed());
    }
    // Calculate statistics
    double sum = 0.0;
    double minTime = times[0];
    double maxTime = times[0];
    for (double t : times) {
        sum += t;
        minTime = std::min(minTime, t);
        maxTime = std::max(maxTime, t);
    }
    double avgTime = sum / iterations;
    int pixels = input.width() * input.height();
    double throughput = (pixels / 1e6) / (avgTime / 1000.0);
    std::cout << "\n=== Benchmark Results ===" << std::endl;
    std::cout << "Iterations: " << iterations << std::endl;
    std::cout << "Average time: " << avgTime << " ms" << std::endl;
    std::cout << "Min time: " << minTime << " ms" << std::endl;
    std::cout << "Max time: " << maxTime << " ms" << std::endl;
    std::cout << "Throughput: " << throughput << " Mpixels/sec" << std::endl;
}
int main(int argc, char** argv) {
std::cout << "=== Separable Gaussian Blur (C++ with CUDA) ===" << std::endl;
std::cout << std::endl;
// Configuration
const int width = 1920;
const int height = 1080;
const int radius = 5;
const float sigma = 2.0f;
std::cout << "Image dimensions: " << width << " x " << height << std::endl;
std::cout << "Kernel radius: " << radius << std::endl;
std::cout << "Sigma: " << sigma << std::endl;
std::cout << std::endl;
try {
// Create input and output images
Image input(width, height);
Image output(width, height);
// Fill with test data
std::cout << "Generating test image..." << std::endl;
input.fillRandom();
// Create Gaussian blur processor
std::cout << "Initializing GPU processor..." << std::endl;
gpu::GaussianBlur blur(width, height, radius, sigma);
// Print kernel
blur.printKernel();
std::cout << std::endl;
// Apply blur
std::cout << "Applying Gaussian blur..." << std::endl;
Timer timer;
timer.start();
blur.apply(input.data(), output.data());
double elapsed = timer.elapsed();
std::cout << "First run completed in " << elapsed << " ms" << std::endl;
// Show sample results
std::cout << "\nSample input region (center 5x5):" << std::endl;
input.printRegion(width/2 - 2, height/2 - 2, 5, 5);
std::cout << "\nSample output region (center 5x5):" << std::endl;
output.printRegion(width/2 - 2, height/2 - 2, 5, 5);
// Run benchmark
std::cout << "\nRunning performance benchmark..." << std::endl;
runBenchmark(blur, input, output, 100);
// Test with different kernel sizes
std::cout << "\n=== Testing Different Kernel Sizes ===" << std::endl;
std::vector<int> radii = {3, 5, 7, 10};
for (int r : radii) {
gpu::GaussianBlur blurTest(width, height, r, 2.0f);
Timer t;
t.start();
blurTest.apply(input.data(), output.data());
double time = t.elapsed();
int kernelSize = 2 * r + 1;
double throughput = (width * height / 1e6) / (time / 1000.0);
std::cout << "Radius " << r << " (kernel " << kernelSize
<< "x" << kernelSize << "): "
<< time << " ms, "
<< throughput << " Mpixels/sec" << std::endl;
}
std::cout << "\n=== Success! ===" << std::endl;
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
return 0;
}
# Makefile for Gaussian Blur CUDA implementations
# Supports both standalone CUDA and C++ with CUDA versions
# NOTE(review): in a real Makefile every recipe line below MUST begin with a
# TAB character; the indentation here appears to have been flattened by the
# paste — restore tabs before use.
# Compiler settings
NVCC = nvcc
CXX = g++
# CUDA architecture (adjust based on your GPU)
# Common options:
# - sm_50: Maxwell (GTX 900 series)
# - sm_60: Pascal (GTX 10 series)
# - sm_70: Volta (Tesla V100)
# - sm_75: Turing (RTX 20 series)
# - sm_80: Ampere (RTX 30 series, A100)
# - sm_86: Ampere (RTX 30 series mobile)
# - sm_89: Ada Lovelace (RTX 40 series)
CUDA_ARCH = sm_75
# Compiler flags
NVCC_FLAGS = -arch=$(CUDA_ARCH) -O3 --use_fast_math -Xcompiler -Wall
CXX_FLAGS = -std=c++11 -Wall -O3
# CUDA include and library paths (adjust if needed)
# NOTE(review): CUDA_INC/CUDA_LIB are defined but no rule references them —
# nvcc supplies its own CUDA paths, so they are currently informational only.
CUDA_INC = /usr/local/cuda/include
CUDA_LIB = /usr/local/cuda/lib64
# Targets
TARGETS = gaussian_blur_cuda gaussian_blur_cpp
.PHONY: all clean
all: $(TARGETS)
# Standalone CUDA implementation (single translation unit)
gaussian_blur_cuda: gaussian_blur_cuda.cu
$(NVCC) $(NVCC_FLAGS) $< -o $@
@echo "Built standalone CUDA implementation: $@"
# C++ with CUDA implementation (requires separate compilation)
gaussian_blur_kernels.o: gaussian_blur_kernels.cu
$(NVCC) $(NVCC_FLAGS) -c $< -o $@
# main.cpp is compiled by nvcc as CUDA (-x cu) because the header uses dim3
main.o: main.cpp gaussian_blur.hpp
$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
gaussian_blur_cpp: main.o gaussian_blur_kernels.o
$(NVCC) $(NVCC_FLAGS) $^ -o $@
@echo "Built C++ with CUDA implementation: $@"
# Run targets
# NOTE(review): plain `echo "\n..."` does not expand \n in POSIX sh;
# consider `printf` if the literal backslash-n output is not intended.
run_cuda: gaussian_blur_cuda
@echo "\n=== Running Standalone CUDA Implementation ==="
./gaussian_blur_cuda
run_cpp: gaussian_blur_cpp
@echo "\n=== Running C++ with CUDA Implementation ==="
./gaussian_blur_cpp
run: run_cuda run_cpp
# Clean build artifacts
clean:
rm -f $(TARGETS) *.o
@echo "Cleaned build artifacts"
# Help target
help:
@echo "Gaussian Blur CUDA Makefile"
@echo ""
@echo "Targets:"
@echo " all - Build all implementations (default)"
@echo " gaussian_blur_cuda - Build standalone CUDA version"
@echo " gaussian_blur_cpp - Build C++ with CUDA version"
@echo " run_cuda - Build and run standalone CUDA"
@echo " run_cpp - Build and run C++ with CUDA"
@echo " run - Run both implementations"
@echo " clean - Remove build artifacts"
@echo " help - Show this help message"
@echo ""
@echo "Configuration:"
@echo " CUDA_ARCH=$(CUDA_ARCH) - Target GPU architecture"
@echo ""
@echo "Usage examples:"
@echo " make # Build all"
@echo " make run # Build and run both"
@echo " make CUDA_ARCH=sm_80 # Build for Ampere (RTX 30 series)"
@echo " make clean # Clean build files"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment