Last active
February 22, 2026 03:25
-
-
Save raymondtay/9915510013d22a5ac890652eedf9af03 to your computer and use it in GitHub Desktop.
Gaussian Blur blogpost
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * Separable Gaussian Blur - C++ with CUDA Implementation | |
| * | |
| * This is a C++ wrapper around CUDA kernels providing: | |
| * - Object-oriented interface | |
| * - RAII memory management | |
| * - Support for multiple data types (float, uchar) | |
| * - Easy-to-use API | |
| */ | |
| #ifndef GAUSSIAN_BLUR_HPP | |
| #define GAUSSIAN_BLUR_HPP | |
| #include <cuda_runtime.h> | |
| #include <vector> | |
| #include <memory> | |
| #include <stdexcept> | |
| #include <cmath> | |
| #include <iostream> | |
| // CUDA kernel declarations | |
| extern "C" { | |
| void launchGaussianBlurHorizontal( | |
| const float* input, | |
| float* output, | |
| const float* kernel, | |
| int width, | |
| int height, | |
| int radius, | |
| dim3 gridSize, | |
| dim3 blockSize, | |
| int sharedMemSize | |
| ); | |
| void launchGaussianBlurVertical( | |
| const float* input, | |
| float* output, | |
| const float* kernel, | |
| int width, | |
| int height, | |
| int radius, | |
| dim3 gridSize, | |
| dim3 blockSize, | |
| int sharedMemSize | |
| ); | |
| } | |
| namespace gpu { | |
// RAII wrapper for CUDA device memory.
//
// Owns a device allocation of `size` elements of T.  Move-only: copying a
// raw device pointer would lead to a double cudaFree on destruction.
template<typename T>
class DeviceBuffer {
private:
    T* d_ptr_ = nullptr;   // device pointer; nullptr when empty/moved-from
    size_t size_ = 0;      // element count (not bytes)
public:
    // Allocates size * sizeof(T) bytes on the current device.
    // Throws std::runtime_error (with the CUDA error string) on failure.
    explicit DeviceBuffer(size_t size) : size_(size) {
        cudaError_t err = cudaMalloc(&d_ptr_, size * sizeof(T));
        if (err != cudaSuccess) {
            // cudaMalloc leaves the pointer unspecified on failure; reset it
            // so the object is never left holding garbage.
            d_ptr_ = nullptr;
            throw std::runtime_error("CUDA malloc failed: " +
                std::string(cudaGetErrorString(err)));
        }
    }
    ~DeviceBuffer() {
        if (d_ptr_) {
            cudaFree(d_ptr_);
        }
    }
    // Non-copyable: two owners of one device pointer would double-free.
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
    // Movable: transfers ownership and leaves `other` empty.
    DeviceBuffer(DeviceBuffer&& other) noexcept
        : d_ptr_(other.d_ptr_), size_(other.size_) {
        other.d_ptr_ = nullptr;
        other.size_ = 0;
    }
    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept {
        if (this != &other) {
            if (d_ptr_) cudaFree(d_ptr_);
            d_ptr_ = other.d_ptr_;
            size_ = other.size_;
            other.d_ptr_ = nullptr;
            other.size_ = 0;
        }
        return *this;
    }
    T* get() { return d_ptr_; }
    const T* get() const { return d_ptr_; }
    size_t size() const { return size_; }
    // Synchronous copy of size() elements from host to device.
    // Throws std::runtime_error (with the CUDA error string) on failure.
    void copyToDevice(const T* host_data) {
        cudaError_t err = cudaMemcpy(d_ptr_, host_data,
            size_ * sizeof(T), cudaMemcpyHostToDevice);
        if (err != cudaSuccess) {
            throw std::runtime_error("Copy to device failed: " +
                std::string(cudaGetErrorString(err)));
        }
    }
    // Synchronous copy of size() elements from device to host.
    void copyToHost(T* host_data) const {
        cudaError_t err = cudaMemcpy(host_data, d_ptr_,
            size_ * sizeof(T), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            throw std::runtime_error("Copy to host failed: " +
                std::string(cudaGetErrorString(err)));
        }
    }
};
// Separable Gaussian blur executed on the GPU.
//
// Construction precomputes a normalized 1D Gaussian kernel, uploads it to
// the device once, and allocates a persistent intermediate buffer; apply()
// then runs a horizontal pass followed by a vertical pass.
class GaussianBlur {
private:
    int width_;
    int height_;
    int radius_;                 // kernel radius; full kernel has 2*radius_+1 taps
    float sigma_;
    std::vector<float> kernel_;  // host-side copy of the normalized 1D kernel
    std::unique_ptr<DeviceBuffer<float>> d_kernel_;  // device copy of kernel_
    std::unique_ptr<DeviceBuffer<float>> d_temp_;    // intermediate image between passes

    // Fills kernel_ with exp(-x^2 / (2*sigma^2)) samples normalized to sum
    // to 1 (the 1/(sqrt(2*pi)*sigma) prefactor cancels under normalization).
    void generateKernel() {
        int kernelSize = 2 * radius_ + 1;
        kernel_.resize(kernelSize);
        float sum = 0.0f;
        for (int i = -radius_; i <= radius_; i++) {
            float value = std::exp(-(i * i) / (2.0f * sigma_ * sigma_));
            kernel_[i + radius_] = value;
            sum += value;
        }
        // Normalize so the blur preserves overall image brightness.
        for (auto& val : kernel_) {
            val /= sum;
        }
    }

public:
    // width/height: image dimensions in pixels (must be positive).
    // radius: blur radius in [1, 50].  sigma: Gaussian std-dev (> 0).
    // Throws std::invalid_argument on bad parameters and std::runtime_error
    // on device allocation failure.
    GaussianBlur(int width, int height, int radius, float sigma)
        : width_(width), height_(height), radius_(radius), sigma_(sigma) {
        if (width <= 0 || height <= 0) {
            throw std::invalid_argument("Width and height must be positive");
        }
        if (radius <= 0 || radius > 50) {
            throw std::invalid_argument("Radius must be between 1 and 50");
        }
        if (sigma <= 0.0f) {
            throw std::invalid_argument("Sigma must be positive");
        }
        // Generate the Gaussian kernel and upload it once; apply() reuses it.
        generateKernel();
        d_kernel_ = std::make_unique<DeviceBuffer<float>>(kernel_.size());
        d_kernel_->copyToDevice(kernel_.data());
        // Persistent scratch buffer for the result of the horizontal pass.
        d_temp_ = std::make_unique<DeviceBuffer<float>>(
            static_cast<size_t>(width) * height);
    }

    // Blurs h_input (width*height floats, row-major) into h_output.
    // Synchronous: returns only after the result is back on the host.
    // Throws std::runtime_error if a copy, launch, or kernel execution fails.
    void apply(const float* h_input, float* h_output) {
        // size_t arithmetic avoids int overflow for very large images.
        const size_t pixels = static_cast<size_t>(width_) * height_;
        DeviceBuffer<float> d_input(pixels);
        DeviceBuffer<float> d_output(pixels);
        d_input.copyToDevice(h_input);

        // One thread per pixel, 16x16 blocks, ceil-div grid to cover edges.
        const int blockSize = 16;
        dim3 blockDim(blockSize, blockSize);
        dim3 gridDim(
            (width_ + blockSize - 1) / blockSize,
            (height_ + blockSize - 1) / blockSize
        );
        // NOTE(review): this sizes dynamic shared memory for a single padded
        // row/column, which matches the current kernels' indexing; a
        // block-wide tile would need blockSize * (blockSize + 2*radius)
        // floats -- confirm against the kernel implementations.
        int sharedMemSize = (blockSize + 2 * radius_) * sizeof(float);

        // Horizontal pass: d_input -> d_temp_.
        launchGaussianBlurHorizontal(
            d_input.get(),
            d_temp_->get(),
            d_kernel_->get(),
            width_,
            height_,
            radius_,
            gridDim,
            blockDim,
            sharedMemSize
        );
        // Catches launch-configuration errors (bad grid/block/smem).
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            throw std::runtime_error("Horizontal kernel launch failed: " +
                std::string(cudaGetErrorString(err)));
        }

        // Vertical pass: d_temp_ -> d_output.
        launchGaussianBlurVertical(
            d_temp_->get(),
            d_output.get(),
            d_kernel_->get(),
            width_,
            height_,
            radius_,
            gridDim,
            blockDim,
            sharedMemSize
        );
        err = cudaGetLastError();
        if (err != cudaSuccess) {
            throw std::runtime_error("Vertical kernel launch failed: " +
                std::string(cudaGetErrorString(err)));
        }

        // Wait for both passes and surface asynchronous execution errors
        // (the previous version ignored this return value).
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            throw std::runtime_error("Kernel execution failed: " +
                std::string(cudaGetErrorString(err)));
        }

        d_output.copyToHost(h_output);
    }

    // Getters
    int getWidth() const { return width_; }
    int getHeight() const { return height_; }
    int getRadius() const { return radius_; }
    float getSigma() const { return sigma_; }
    const std::vector<float>& getKernel() const { return kernel_; }

    // Prints the kernel coefficients to stdout.
    void printKernel() const {
        std::cout << "Gaussian kernel (radius=" << radius_
                  << ", sigma=" << sigma_ << "):\n";
        for (size_t i = 0; i < kernel_.size(); i++) {
            std::cout << kernel_[i] << " ";
        }
        std::cout << std::endl;
    }
};
| } // namespace gpu | |
| #endif // GAUSSIAN_BLUR_HPP |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * Separable Gaussian Blur - Pure CUDA Implementation | |
| * | |
| * This implementation uses: | |
| * - Shared memory for efficient data reuse | |
| * - Separate kernels for horizontal and vertical passes | |
| * - Optimized memory access patterns | |
| */ | |
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>   /* malloc/free, rand, exit, EXIT_FAILURE */
#include <math.h>
// Compile-time configuration for the standalone demo.
#define KERNEL_RADIUS 5
#define KERNEL_SIZE (2 * KERNEL_RADIUS + 1)  // full 1D kernel width: 11 taps
#define BLOCK_SIZE 16                        // threads per block dimension (16x16 blocks)
// Error checking macro: aborts the process with file/line context when a
// CUDA runtime call fails.  The do/while(0) wrapper makes the macro behave
// as a single statement (safe after an unbraced `if`).
#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(error)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)
/*
 * Build a normalized 1D Gaussian kernel of (2*radius + 1) taps.
 *
 * Each tap is exp(-x^2 / (2*sigma^2)); the 1/(sqrt(2*pi)*sigma) prefactor
 * is omitted because the subsequent normalization cancels it.  After the
 * call the taps sum to 1.  (Host function; the __host__ qualifier is the
 * default and has been dropped.)
 */
void generateGaussianKernel(float* kernel, int radius, float sigma) {
    const int taps = 2 * radius + 1;
    const float twoSigmaSq = 2.0f * sigma * sigma;
    float total = 0.0f;
    /* Unnormalized Gaussian samples, centered at index `radius`. */
    for (int offset = -radius; offset <= radius; offset++) {
        float w = expf(-(offset * offset) / twoSigmaSq);
        kernel[offset + radius] = w;
        total += w;
    }
    /* Scale so the coefficients sum to 1. */
    for (int t = 0; t < taps; t++) {
        kernel[t] /= total;
    }
}
/*
 * Horizontal Gaussian blur pass.
 *
 * One thread per output pixel; expects a 2D launch covering width x height.
 * Border pixels are handled by clamping sample coordinates to the image edge.
 *
 * FIX(review): the previous version staged data in a single shared-memory
 * row indexed only by threadIdx.x, so all 16 rows of a 16x16 thread block
 * raced on the same slots (nondeterministic output), it returned before
 * __syncthreads() for y >= height (divergent barrier, UB), and its halo
 * loads assumed radius <= blockDim.x.  This version reads global memory
 * directly (served by the L2/read-only cache), which is correct for any
 * radius and block shape and needs no dynamic shared memory.
 */
__global__ void gaussianBlurHorizontal(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Bounds guard: the ceil-div grid overshoots the image at the edges.
    if (x >= width || y >= height) return;

    const float* row = input + (size_t)y * width;
    float sum = 0.0f;
    for (int k = -radius; k <= radius; k++) {
        // Clamp-to-edge addressing in x.
        int sx = x + k;
        sx = sx < 0 ? 0 : (sx >= width ? width - 1 : sx);
        sum += row[sx] * kernel[k + radius];
    }
    output[(size_t)y * width + x] = sum;
}
/*
 * Vertical Gaussian blur pass.
 *
 * One thread per output pixel; expects a 2D launch covering width x height.
 * Border pixels are handled by clamping sample coordinates to the image edge.
 *
 * FIX(review): the previous version staged data in a single shared-memory
 * column indexed only by threadIdx.y, so all 16 columns of a 16x16 thread
 * block raced on the same slots, it returned before __syncthreads() for
 * x >= width (divergent barrier, UB), and its halo loads assumed
 * radius <= blockDim.y.  This version reads global memory directly, which
 * is correct for any radius and block shape.  Loads along x remain
 * coalesced across the warp.
 */
__global__ void gaussianBlurVertical(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Bounds guard: the ceil-div grid overshoots the image at the edges.
    if (x >= width || y >= height) return;

    float sum = 0.0f;
    for (int k = -radius; k <= radius; k++) {
        // Clamp-to-edge addressing in y.
        int sy = y + k;
        sy = sy < 0 ? 0 : (sy >= height ? height - 1 : sy);
        sum += input[(size_t)sy * width + x] * kernel[k + radius];
    }
    output[(size_t)y * width + x] = sum;
}
/*
 * Run the separable blur: horizontal pass into d_temp, then vertical pass
 * into d_output.  All pointers are device pointers; d_temp must hold
 * width*height floats.  Synchronizes the device before returning so callers
 * can immediately read d_output.  Aborts via CUDA_CHECK on any failure.
 */
void applySeparableGaussianBlur(
    const float* d_input,
    float* d_output,
    float* d_temp,
    const float* d_kernel,
    int width,
    int height,
    int radius
) {
    dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
    // Ceil-div so the grid covers images not divisible by BLOCK_SIZE.
    dim3 gridSize(
        (width + blockSize.x - 1) / blockSize.x,
        (height + blockSize.y - 1) / blockSize.y
    );
    // Shared-memory budget: one padded row/column (BLOCK_SIZE + 2*radius
    // floats) for EACH of the BLOCK_SIZE rows/columns of the 2D block.
    // FIX(review): the previous size covered only a single padded row even
    // though a 16x16 block was launched.  Kernels that do not declare
    // dynamic shared memory simply ignore this allocation.
    int sharedMemSize = BLOCK_SIZE * (BLOCK_SIZE + 2 * radius) * sizeof(float);

    // Horizontal pass: d_input -> d_temp.
    gaussianBlurHorizontal<<<gridSize, blockSize, sharedMemSize>>>(
        d_input, d_temp, d_kernel, width, height, radius
    );
    CUDA_CHECK(cudaGetLastError());   // catches launch-configuration errors

    // Vertical pass: d_temp -> d_output.
    gaussianBlurVertical<<<gridSize, blockSize, sharedMemSize>>>(
        d_temp, d_output, d_kernel, width, height, radius
    );
    CUDA_CHECK(cudaGetLastError());

    // Blocks until both passes finish; also surfaces async execution errors.
    CUDA_CHECK(cudaDeviceSynchronize());
}
/*
 * Demo entry point: blurs a random 1920x1080 float image, timing the
 * separable blur with CUDA events and printing a few output samples.
 *
 * Pipeline: host allocation -> kernel generation -> device upload ->
 * warm-up + timed blur -> download -> sample printout -> cleanup.
 */
int main() {
    // Image dimensions
    const int width = 1920;
    const int height = 1080;
    // size_t arithmetic avoids int overflow for larger configurations.
    const size_t imageSize = (size_t)width * height * sizeof(float);

    // Gaussian parameters
    const int radius = KERNEL_RADIUS;
    const float sigma = 2.0f;
    const int kernelSize = 2 * radius + 1;

    printf("Separable Gaussian Blur (CUDA)\n");
    printf("Image size: %d x %d\n", width, height);
    printf("Kernel radius: %d (size: %d)\n", radius, kernelSize);
    printf("Sigma: %.2f\n\n", sigma);

    // Allocate host memory; fail early if any allocation came back NULL
    // (the previous version dereferenced unchecked malloc results).
    float* h_input = (float*)malloc(imageSize);
    float* h_output = (float*)malloc(imageSize);
    float* h_kernel = (float*)malloc(kernelSize * sizeof(float));
    if (!h_input || !h_output || !h_kernel) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_input);
        free(h_output);
        free(h_kernel);
        return EXIT_FAILURE;
    }

    // Initialize input with a test pattern.  rand() is intentionally left
    // unseeded so the image (and thus the output) is identical every run.
    for (int i = 0; i < width * height; i++) {
        h_input[i] = (float)(rand() % 256) / 255.0f;
    }

    // Generate and print the Gaussian kernel
    generateGaussianKernel(h_kernel, radius, sigma);
    printf("Gaussian kernel coefficients:\n");
    for (int i = 0; i < kernelSize; i++) {
        printf("%.6f ", h_kernel[i]);
    }
    printf("\n\n");

    // Allocate device memory
    float *d_input, *d_output, *d_temp, *d_kernel;
    CUDA_CHECK(cudaMalloc(&d_input, imageSize));
    CUDA_CHECK(cudaMalloc(&d_output, imageSize));
    CUDA_CHECK(cudaMalloc(&d_temp, imageSize));
    CUDA_CHECK(cudaMalloc(&d_kernel, kernelSize * sizeof(float)));

    // Copy data to device
    CUDA_CHECK(cudaMemcpy(d_input, h_input, imageSize, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_kernel, h_kernel, kernelSize * sizeof(float), cudaMemcpyHostToDevice));

    // CUDA events give device-side timing independent of host scheduling.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Warm-up run (absorbs one-time context/JIT costs before timing)
    applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius);

    // Timed run
    CUDA_CHECK(cudaEventRecord(start));
    applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius);
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Execution time: %.3f ms\n", milliseconds);
    printf("Throughput: %.2f Mpixels/sec\n", (width * height / 1e6) / (milliseconds / 1000.0f));

    // Copy result back to host
    CUDA_CHECK(cudaMemcpy(h_output, d_output, imageSize, cudaMemcpyDeviceToHost));

    // Verify result (spot-check a few pixels near the image center)
    printf("\nSample output values:\n");
    for (int i = 0; i < 5; i++) {
        int idx = (height / 2) * width + (width / 2) + i;
        printf("Pixel[%d][%d]: %.6f\n", height / 2, width / 2 + i, h_output[idx]);
    }

    // Cleanup
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaFree(d_temp));
    CUDA_CHECK(cudaFree(d_kernel));
    free(h_input);
    free(h_output);
    free(h_kernel);

    printf("\nGaussian blur completed successfully!\n");
    return 0;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * CUDA Kernels Implementation | |
| * Separable Gaussian Blur | |
| */ | |
| #include <cuda_runtime.h> | |
| #include <device_launch_parameters.h> | |
/*
 * Horizontal Gaussian blur pass.
 *
 * One thread per output pixel; expects a 2D launch covering width x height.
 * Border pixels are handled by clamping sample coordinates to the image edge.
 *
 * FIX(review): the previous version staged data in a single shared-memory
 * row indexed only by threadIdx.x, so all rows of a 2D thread block raced
 * on the same slots (nondeterministic output), it returned before
 * __syncthreads() for y >= height (divergent barrier, UB), and its halo
 * loads assumed radius <= blockDim.x.  This version reads global memory
 * directly through the read-only data cache (__restrict__), which is
 * correct for any radius and block shape; any dynamic shared memory passed
 * at launch is simply unused.
 */
__global__ void gaussianBlurHorizontalKernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const float* __restrict__ kernel,
    int width,
    int height,
    int radius
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Bounds guard: the ceil-div grid overshoots the image at the edges.
    if (x >= width || y >= height) return;

    const float* row = input + (size_t)y * width;
    float sum = 0.0f;
    for (int k = -radius; k <= radius; k++) {
        // Clamp-to-edge addressing in x.
        int sx = x + k;
        sx = sx < 0 ? 0 : (sx >= width ? width - 1 : sx);
        sum += row[sx] * kernel[k + radius];
    }
    output[(size_t)y * width + x] = sum;
}
/*
 * Vertical Gaussian blur pass.
 *
 * One thread per output pixel; expects a 2D launch covering width x height.
 * Border pixels are handled by clamping sample coordinates to the image edge.
 *
 * FIX(review): the previous version staged data in a single shared-memory
 * column indexed only by threadIdx.y, so all columns of a 2D thread block
 * raced on the same slots, it returned before __syncthreads() for
 * x >= width (divergent barrier, UB), and its halo loads assumed
 * radius <= blockDim.y.  This version reads global memory directly, which
 * is correct for any radius and block shape; loads along x remain coalesced
 * across the warp.
 */
__global__ void gaussianBlurVerticalKernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const float* __restrict__ kernel,
    int width,
    int height,
    int radius
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Bounds guard: the ceil-div grid overshoots the image at the edges.
    if (x >= width || y >= height) return;

    float sum = 0.0f;
    for (int k = -radius; k <= radius; k++) {
        // Clamp-to-edge addressing in y.
        int sy = y + k;
        sy = sy < 0 ? 0 : (sy >= height ? height - 1 : sy);
        sum += input[(size_t)sy * width + x] * kernel[k + radius];
    }
    output[(size_t)y * width + x] = sum;
}
/*
 * C-style wrapper functions for C++ code
 *
 * Thin launch shims so the C++ translation unit (which is not compiled with
 * nvcc's <<<>>> syntax directly) can invoke the kernels through a plain,
 * unmangled C ABI.  Launch errors are NOT checked here; callers are
 * expected to call cudaGetLastError() after each launch (gaussian_blur.hpp
 * does exactly that).
 */
extern "C" {

// Launches the horizontal pass on the default stream.  gridSize, blockSize
// and sharedMemSize are chosen by the caller and forwarded verbatim.
void launchGaussianBlurHorizontal(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
) {
    gaussianBlurHorizontalKernel<<<gridSize, blockSize, sharedMemSize>>>(
        input, output, kernel, width, height, radius
    );
}

// Launches the vertical pass; same contract as the horizontal wrapper.
void launchGaussianBlurVertical(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
) {
    gaussianBlurVerticalKernel<<<gridSize, blockSize, sharedMemSize>>>(
        input, output, kernel, width, height, radius
    );
}

} // extern "C"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * Main Application - C++ with CUDA Gaussian Blur | |
| * | |
| * Demonstrates usage of the GaussianBlur class | |
| */ | |
| #include "gaussian_blur.hpp" | |
| #include <iostream> | |
| #include <vector> | |
| #include <chrono> | |
| #include <random> | |
| #include <iomanip> | |
// Minimal in-memory grayscale float image (row-major), standing in for a
// real image loader/saver in this demo.
class Image {
private:
    int width_;
    int height_;
    std::vector<float> data_;   // pixel values; data_[y * width_ + x]

public:
    Image(int width, int height)
        : width_(width), height_(height), data_(width * height) {}

    // Fills every pixel with a uniform random value in [0, 1).
    void fillRandom() {
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_real_distribution<float> dis(0.0f, 1.0f);
        for (size_t i = 0; i < data_.size(); ++i) {
            data_[i] = dis(gen);
        }
    }

    // Fills the image with a checkerboard of 32x32-pixel cells (0s and 1s).
    void fillTestPattern() {
        for (int y = 0; y < height_; ++y) {
            const int rowParity = (y / 32) % 2;
            for (int x = 0; x < width_; ++x) {
                const int colParity = (x / 32) % 2;
                data_[y * width_ + x] = (colParity ^ rowParity) ? 1.0f : 0.0f;
            }
        }
    }

    float* data() { return data_.data(); }
    const float* data() const { return data_.data(); }
    int width() const { return width_; }
    int height() const { return height_; }
    size_t size() const { return data_.size(); }

    // Bounds-checked read; out-of-range coordinates read as 0.
    float getPixel(int x, int y) const {
        const bool inside = x >= 0 && x < width_ && y >= 0 && y < height_;
        return inside ? data_[y * width_ + x] : 0.0f;
    }

    // Prints a sizeX-by-sizeY window starting at (startX, startY), clipped
    // to the image bounds, with three decimal places per pixel.
    void printRegion(int startX, int startY, int sizeX, int sizeY) const {
        std::cout << std::fixed << std::setprecision(3);
        const int endY = startY + sizeY;
        const int endX = startX + sizeX;
        for (int y = startY; y < endY && y < height_; ++y) {
            for (int x = startX; x < endX && x < width_; ++x) {
                std::cout << getPixel(x, y) << " ";
            }
            std::cout << std::endl;
        }
    }
};
// Wall-clock stopwatch for benchmarking, based on
// std::chrono::high_resolution_clock.
class Timer {
private:
    std::chrono::high_resolution_clock::time_point start_;   // last start() mark

public:
    // Records the current time as the measurement origin.
    void start() {
        start_ = std::chrono::high_resolution_clock::now();
    }

    // Milliseconds elapsed since the last start() call.
    double elapsed() const {
        using ms = std::chrono::duration<double, std::milli>;
        const auto now = std::chrono::high_resolution_clock::now();
        return ms(now - start_).count();
    }
};
| // Run benchmark | |
| void runBenchmark(gpu::GaussianBlur& blur, const Image& input, | |
| Image& output, int iterations = 10) { | |
| Timer timer; | |
| std::vector<double> times; | |
| // Warm-up run | |
| blur.apply(input.data(), output.data()); | |
| // Benchmark runs | |
| for (int i = 0; i < iterations; i++) { | |
| timer.start(); | |
| blur.apply(input.data(), output.data()); | |
| times.push_back(timer.elapsed()); | |
| } | |
| // Calculate statistics | |
| double sum = 0.0; | |
| double minTime = times[0]; | |
| double maxTime = times[0]; | |
| for (double t : times) { | |
| sum += t; | |
| minTime = std::min(minTime, t); | |
| maxTime = std::max(maxTime, t); | |
| } | |
| double avgTime = sum / iterations; | |
| int pixels = input.width() * input.height(); | |
| double throughput = (pixels / 1e6) / (avgTime / 1000.0); | |
| std::cout << "\n=== Benchmark Results ===" << std::endl; | |
| std::cout << "Iterations: " << iterations << std::endl; | |
| std::cout << "Average time: " << avgTime << " ms" << std::endl; | |
| std::cout << "Min time: " << minTime << " ms" << std::endl; | |
| std::cout << "Max time: " << maxTime << " ms" << std::endl; | |
| std::cout << "Throughput: " << throughput << " Mpixels/sec" << std::endl; | |
| } | |
| int main(int argc, char** argv) { | |
| std::cout << "=== Separable Gaussian Blur (C++ with CUDA) ===" << std::endl; | |
| std::cout << std::endl; | |
| // Configuration | |
| const int width = 1920; | |
| const int height = 1080; | |
| const int radius = 5; | |
| const float sigma = 2.0f; | |
| std::cout << "Image dimensions: " << width << " x " << height << std::endl; | |
| std::cout << "Kernel radius: " << radius << std::endl; | |
| std::cout << "Sigma: " << sigma << std::endl; | |
| std::cout << std::endl; | |
| try { | |
| // Create input and output images | |
| Image input(width, height); | |
| Image output(width, height); | |
| // Fill with test data | |
| std::cout << "Generating test image..." << std::endl; | |
| input.fillRandom(); | |
| // Create Gaussian blur processor | |
| std::cout << "Initializing GPU processor..." << std::endl; | |
| gpu::GaussianBlur blur(width, height, radius, sigma); | |
| // Print kernel | |
| blur.printKernel(); | |
| std::cout << std::endl; | |
| // Apply blur | |
| std::cout << "Applying Gaussian blur..." << std::endl; | |
| Timer timer; | |
| timer.start(); | |
| blur.apply(input.data(), output.data()); | |
| double elapsed = timer.elapsed(); | |
| std::cout << "First run completed in " << elapsed << " ms" << std::endl; | |
| // Show sample results | |
| std::cout << "\nSample input region (center 5x5):" << std::endl; | |
| input.printRegion(width/2 - 2, height/2 - 2, 5, 5); | |
| std::cout << "\nSample output region (center 5x5):" << std::endl; | |
| output.printRegion(width/2 - 2, height/2 - 2, 5, 5); | |
| // Run benchmark | |
| std::cout << "\nRunning performance benchmark..." << std::endl; | |
| runBenchmark(blur, input, output, 100); | |
| // Test with different kernel sizes | |
| std::cout << "\n=== Testing Different Kernel Sizes ===" << std::endl; | |
| std::vector<int> radii = {3, 5, 7, 10}; | |
| for (int r : radii) { | |
| gpu::GaussianBlur blurTest(width, height, r, 2.0f); | |
| Timer t; | |
| t.start(); | |
| blurTest.apply(input.data(), output.data()); | |
| double time = t.elapsed(); | |
| int kernelSize = 2 * r + 1; | |
| double throughput = (width * height / 1e6) / (time / 1000.0); | |
| std::cout << "Radius " << r << " (kernel " << kernelSize | |
| << "x" << kernelSize << "): " | |
| << time << " ms, " | |
| << throughput << " Mpixels/sec" << std::endl; | |
| } | |
| std::cout << "\n=== Success! ===" << std::endl; | |
| } catch (const std::exception& e) { | |
| std::cerr << "Error: " << e.what() << std::endl; | |
| return 1; | |
| } | |
| return 0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Makefile for Gaussian Blur CUDA implementations
# Supports both standalone CUDA and C++ with CUDA versions

# Compiler settings.  NVCC drives all compilation (including main.cpp via
# `-x cu`); CXX/CXX_FLAGS are kept for users who want a host-only build.
NVCC = nvcc
CXX = g++

# CUDA architecture (adjust based on your GPU)
# Common options:
#  - sm_50: Maxwell (GTX 900 series)
#  - sm_60: Pascal (GTX 10 series)
#  - sm_70: Volta (Tesla V100)
#  - sm_75: Turing (RTX 20 series)
#  - sm_80: Ampere (RTX 30 series, A100)
#  - sm_86: Ampere (RTX 30 series mobile)
#  - sm_89: Ada Lovelace (RTX 40 series)
CUDA_ARCH = sm_75

# Compiler flags
NVCC_FLAGS = -arch=$(CUDA_ARCH) -O3 --use_fast_math -Xcompiler -Wall
CXX_FLAGS = -std=c++11 -Wall -O3

# CUDA include and library paths (adjust if needed)
CUDA_INC = /usr/local/cuda/include
CUDA_LIB = /usr/local/cuda/lib64

# Targets
TARGETS = gaussian_blur_cuda gaussian_blur_cpp

# run/run_cuda/run_cpp/help produce no files, so they must be phony too.
.PHONY: all clean run run_cuda run_cpp help

all: $(TARGETS)

# Standalone CUDA implementation
gaussian_blur_cuda: gaussian_blur_cuda.cu
	$(NVCC) $(NVCC_FLAGS) $< -o $@
	@echo "Built standalone CUDA implementation: $@"

# C++ with CUDA implementation (requires separate compilation)
gaussian_blur_kernels.o: gaussian_blur_kernels.cu
	$(NVCC) $(NVCC_FLAGS) -c $< -o $@

main.o: main.cpp gaussian_blur.hpp
	$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@

gaussian_blur_cpp: main.o gaussian_blur_kernels.o
	$(NVCC) $(NVCC_FLAGS) $^ -o $@
	@echo "Built C++ with CUDA implementation: $@"

# Run targets.  printf is used instead of `echo "\n..."` because whether
# echo interprets backslash escapes depends on the recipe shell (dash does,
# bash does not without -e).
run_cuda: gaussian_blur_cuda
	@printf "\n=== Running Standalone CUDA Implementation ===\n"
	./gaussian_blur_cuda

run_cpp: gaussian_blur_cpp
	@printf "\n=== Running C++ with CUDA Implementation ===\n"
	./gaussian_blur_cpp

run: run_cuda run_cpp

# Clean build artifacts
clean:
	rm -f $(TARGETS) *.o
	@echo "Cleaned build artifacts"

# Help target
help:
	@echo "Gaussian Blur CUDA Makefile"
	@echo ""
	@echo "Targets:"
	@echo "  all                - Build all implementations (default)"
	@echo "  gaussian_blur_cuda - Build standalone CUDA version"
	@echo "  gaussian_blur_cpp  - Build C++ with CUDA version"
	@echo "  run_cuda           - Build and run standalone CUDA"
	@echo "  run_cpp            - Build and run C++ with CUDA"
	@echo "  run                - Run both implementations"
	@echo "  clean              - Remove build artifacts"
	@echo "  help               - Show this help message"
	@echo ""
	@echo "Configuration:"
	@echo "  CUDA_ARCH=$(CUDA_ARCH)  - Target GPU architecture"
	@echo ""
	@echo "Usage examples:"
	@echo "  make                  # Build all"
	@echo "  make run              # Build and run both"
	@echo "  make CUDA_ARCH=sm_80  # Build for Ampere (RTX 30 series)"
	@echo "  make clean            # Clean build files"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment