Skip to content

Instantly share code, notes, and snippets.

@raymondtay
Last active February 22, 2026 03:25
Show Gist options
  • Select an option

  • Save raymondtay/9915510013d22a5ac890652eedf9af03 to your computer and use it in GitHub Desktop.

Select an option

Save raymondtay/9915510013d22a5ac890652eedf9af03 to your computer and use it in GitHub Desktop.
Gaussian Blur blogpost
/*
* Separable Gaussian Blur - C++ with CUDA Implementation
*
* This is a C++ wrapper around CUDA kernels providing:
* - Object-oriented interface
* - RAII memory management
* - Support for multiple data types (float, uchar)
* - Easy-to-use API
*/
#ifndef GAUSSIAN_BLUR_HPP
#define GAUSSIAN_BLUR_HPP
#include <cuda_runtime.h>
#include <vector>
#include <memory>
#include <stdexcept>
#include <cmath>
#include <iostream>
// CUDA kernel declarations
// CUDA kernel declarations
// Host-side launch wrappers implemented in the kernels translation unit.
// extern "C" suppresses C++ name mangling so these prototypes link against
// the nvcc-compiled definitions.
extern "C" {
// Launches the horizontal blur pass. All float pointers are DEVICE pointers.
void launchGaussianBlurHorizontal(
    const float* input,    // device image, width*height floats, row-major
    float* output,         // device buffer receiving the blurred row pass
    const float* kernel,   // device 1-D Gaussian weights, 2*radius+1 floats
    int width,
    int height,
    int radius,
    dim3 gridSize,         // forwarded verbatim to the <<<...>>> launch
    dim3 blockSize,
    int sharedMemSize      // dynamic shared-memory bytes for the launch
);
// Launches the vertical blur pass; same contract as the horizontal wrapper.
void launchGaussianBlurVertical(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
);
}
namespace gpu {
// RAII wrapper for CUDA device memory
// RAII wrapper for CUDA device memory.
// Owns a device allocation of `size_` elements of T; move-only so a single
// owner frees the allocation exactly once.
template<typename T>
class DeviceBuffer {
private:
    T* d_ptr_ = nullptr;   // device pointer; nullptr when empty or moved-from
    size_t size_ = 0;      // element count (not bytes)
public:
    // Allocates `size` elements on the device.
    // Throws std::runtime_error (with the CUDA error string) on failure.
    explicit DeviceBuffer(size_t size) : size_(size) {
        cudaError_t err = cudaMalloc(&d_ptr_, size * sizeof(T));
        if (err != cudaSuccess) {
            d_ptr_ = nullptr;  // keep the object in a safe, freeable state
            throw std::runtime_error("CUDA malloc failed: " +
                std::string(cudaGetErrorString(err)));
        }
    }
    ~DeviceBuffer() {
        if (d_ptr_) {
            cudaFree(d_ptr_);  // destructors must not throw; error ignored
        }
    }
    // Delete copy constructor and assignment: the buffer is uniquely owned.
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
    // Move constructor: takes ownership and leaves `other` empty
    // (previously `other.size_` was left stale after a move).
    DeviceBuffer(DeviceBuffer&& other) noexcept
        : d_ptr_(other.d_ptr_), size_(other.size_) {
        other.d_ptr_ = nullptr;
        other.size_ = 0;
    }
    // Move assignment: frees any held allocation, then takes ownership.
    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept {
        if (this != &other) {
            if (d_ptr_) cudaFree(d_ptr_);
            d_ptr_ = other.d_ptr_;
            size_ = other.size_;
            other.d_ptr_ = nullptr;
            other.size_ = 0;
        }
        return *this;
    }
    T* get() { return d_ptr_; }
    const T* get() const { return d_ptr_; }
    size_t size() const { return size_; }
    // Copies size() elements from host to device; throws on failure.
    void copyToDevice(const T* host_data) {
        cudaError_t err = cudaMemcpy(d_ptr_, host_data,
            size_ * sizeof(T), cudaMemcpyHostToDevice);
        if (err != cudaSuccess) {
            throw std::runtime_error("Copy to device failed: " +
                std::string(cudaGetErrorString(err)));
        }
    }
    // Copies size() elements from device to host; throws on failure.
    void copyToHost(T* host_data) const {
        cudaError_t err = cudaMemcpy(host_data, d_ptr_,
            size_ * sizeof(T), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            throw std::runtime_error("Copy to host failed: " +
                std::string(cudaGetErrorString(err)));
        }
    }
};
// Gaussian blur class
// Host-side driver for the separable Gaussian blur.
// Owns the device-resident kernel weights and the intermediate buffer, and
// launches the horizontal then vertical passes declared above.
class GaussianBlur {
private:
    int width_;    // image width in pixels
    int height_;   // image height in pixels
    int radius_;   // kernel radius; kernel length is 2*radius_+1
    float sigma_;  // Gaussian standard deviation
    std::vector<float> kernel_;                      // normalized host-side weights
    std::unique_ptr<DeviceBuffer<float>> d_kernel_;  // weights on the device
    std::unique_ptr<DeviceBuffer<float>> d_temp_;    // horizontal-pass output

    // Builds the normalized 1-D Gaussian: w(i) = exp(-i^2 / (2*sigma^2)),
    // scaled so the weights sum to 1 (blur preserves overall brightness).
    void generateKernel() {
        int kernelSize = 2 * radius_ + 1;
        kernel_.resize(kernelSize);
        float sum = 0.0f;
        for (int i = -radius_; i <= radius_; i++) {
            float value = std::exp(-(i * i) / (2.0f * sigma_ * sigma_));
            kernel_[i + radius_] = value;
            sum += value;
        }
        // Normalize
        for (auto& val : kernel_) {
            val /= sum;
        }
    }
public:
    // Validates parameters, builds the kernel, and allocates device storage.
    // Throws std::invalid_argument for bad parameters and std::runtime_error
    // if a CUDA allocation or copy fails.
    GaussianBlur(int width, int height, int radius, float sigma)
        : width_(width), height_(height), radius_(radius), sigma_(sigma) {
        if (width <= 0 || height <= 0) {
            throw std::invalid_argument("Image dimensions must be positive");
        }
        if (radius <= 0 || radius > 50) {
            throw std::invalid_argument("Radius must be between 1 and 50");
        }
        if (sigma <= 0.0f) {
            throw std::invalid_argument("Sigma must be positive");
        }
        // Generate Gaussian kernel
        generateKernel();
        // Allocate device memory for the weights and upload them
        d_kernel_ = std::make_unique<DeviceBuffer<float>>(kernel_.size());
        d_kernel_->copyToDevice(kernel_.data());
        // Temporary buffer for the intermediate (horizontally blurred) image;
        // size_t arithmetic avoids int overflow for very large images
        d_temp_ = std::make_unique<DeviceBuffer<float>>(
            static_cast<size_t>(width) * height);
    }

    // Blurs h_input (width*height floats, row-major) into h_output.
    // Both are HOST pointers; throws std::runtime_error on any CUDA failure,
    // including asynchronous kernel-execution errors (previously the
    // cudaDeviceSynchronize result was silently dropped).
    void apply(const float* h_input, float* h_output) {
        // Per-call device buffers, released automatically via RAII
        DeviceBuffer<float> d_input(static_cast<size_t>(width_) * height_);
        DeviceBuffer<float> d_output(static_cast<size_t>(width_) * height_);
        d_input.copyToDevice(h_input);
        // 16x16 threads per block; ceil-division grid covers edge tiles
        const int blockSize = 16;
        dim3 blockDim(blockSize, blockSize);
        dim3 gridDim(
            (width_ + blockSize - 1) / blockSize,
            (height_ + blockSize - 1) / blockSize
        );
        // NOTE(review): this covers a single row (blockDim.x + 2*radius
        // floats) of one block — confirm it matches the shared-memory layout
        // the kernels actually use for a 16x16 block.
        int sharedMemSize = (blockSize + 2 * radius_) * sizeof(float);
        // Horizontal pass: d_input -> d_temp_
        launchGaussianBlurHorizontal(
            d_input.get(),
            d_temp_->get(),
            d_kernel_->get(),
            width_,
            height_,
            radius_,
            gridDim,
            blockDim,
            sharedMemSize
        );
        // Launch-configuration errors surface here, not from the launch itself
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            throw std::runtime_error("Horizontal kernel launch failed: " +
                std::string(cudaGetErrorString(err)));
        }
        // Vertical pass: d_temp_ -> d_output
        launchGaussianBlurVertical(
            d_temp_->get(),
            d_output.get(),
            d_kernel_->get(),
            width_,
            height_,
            radius_,
            gridDim,
            blockDim,
            sharedMemSize
        );
        err = cudaGetLastError();
        if (err != cudaSuccess) {
            throw std::runtime_error("Vertical kernel launch failed: " +
                std::string(cudaGetErrorString(err)));
        }
        // Block until both passes finish and surface async execution errors
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            throw std::runtime_error("Kernel execution failed: " +
                std::string(cudaGetErrorString(err)));
        }
        // Copy result back to host
        d_output.copyToHost(h_output);
    }

    // Getters
    int getWidth() const { return width_; }
    int getHeight() const { return height_; }
    int getRadius() const { return radius_; }
    float getSigma() const { return sigma_; }
    const std::vector<float>& getKernel() const { return kernel_; }

    // Prints the normalized kernel coefficients to stdout (debug aid).
    void printKernel() const {
        std::cout << "Gaussian kernel (radius=" << radius_
                  << ", sigma=" << sigma_ << "):\n";
        for (size_t i = 0; i < kernel_.size(); i++) {
            std::cout << kernel_[i] << " ";
        }
        std::cout << std::endl;
    }
};
} // namespace gpu
#endif // GAUSSIAN_BLUR_HPP
/*
* Separable Gaussian Blur - Pure CUDA Implementation
*
* This implementation uses:
* - Shared memory for efficient data reuse
* - Separate kernels for horizontal and vertical passes
* - Optimized memory access patterns
*/
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <math.h>
// Compile-time blur configuration for the standalone demo
#define KERNEL_RADIUS 5
#define KERNEL_SIZE (2 * KERNEL_RADIUS + 1)
#define BLOCK_SIZE 16
// Error checking macro
// Wraps a CUDA runtime call: on failure prints file/line plus the error
// string and terminates. Kernel launches return void, so use
// CUDA_CHECK(cudaGetLastError()) immediately after a launch instead.
#define CUDA_CHECK(call) \
do { \
cudaError_t error = call; \
if (error != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(error)); \
exit(EXIT_FAILURE); \
} \
} while(0)
/*
* Generate 1D Gaussian kernel
* Formula: G(x) = (1/sqrt(2*pi*sigma^2)) * exp(-x^2 / (2*sigma^2))
*/
// Fills kernel[0 .. 2*radius] with normalized 1-D Gaussian weights:
// w(i) = exp(-i^2 / (2*sigma^2)), scaled so the weights sum to 1.
__host__ void generateGaussianKernel(float* kernel, int radius, float sigma) {
    const int size = 2 * radius + 1;
    const float denom = 2.0f * sigma * sigma;
    float total = 0.0f;
    // Weight for tap `idx` is centred at offset idx - radius
    for (int idx = 0; idx < size; idx++) {
        const int offset = idx - radius;
        const float w = expf(-(offset * offset) / denom);
        kernel[idx] = w;
        total += w;
    }
    // Normalize so the filter preserves overall brightness
    for (int idx = 0; idx < size; idx++) {
        kernel[idx] /= total;
    }
}
/*
* Horizontal Gaussian blur kernel
* Each thread processes one pixel, reading from shared memory
*/
/*
 * Horizontal Gaussian blur pass.
 * One thread per output pixel on a 2D launch grid; out-of-image threads
 * exit early. Neighbours are read directly from global memory with
 * clamp-to-edge addressing at the borders.
 *
 * Fix: the previous version staged data in a single row-sized
 * `extern __shared__` array, but the block is 2D (16x16), so all 16
 * threadIdx.y rows raced writing different pixels into the same shared
 * slots; the halo load also broke whenever radius > blockDim.x (radius
 * may be up to 50 with 16-wide blocks). Reading straight from global
 * memory is race-free for any radius; the dynamic shared memory passed
 * at launch is simply unused.
 */
__global__ void gaussianBlurHorizontal(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    const float* row = input + (size_t)y * width;
    float sum = 0.0f;
    for (int k = -radius; k <= radius; k++) {
        int sx = x + k;
        if (sx < 0) sx = 0;                    // clamp-to-edge (left)
        else if (sx >= width) sx = width - 1;  // clamp-to-edge (right)
        sum += row[sx] * kernel[k + radius];
    }
    output[(size_t)y * width + x] = sum;
}
/*
* Vertical Gaussian blur kernel
* Each thread processes one pixel, reading columns
*/
/*
 * Vertical Gaussian blur pass.
 * One thread per output pixel on a 2D launch grid; out-of-image threads
 * exit early. Neighbours are read directly from global memory with
 * clamp-to-edge addressing at the top/bottom borders.
 *
 * Fix: the previous version staged data in a single column-sized
 * `extern __shared__` array, but the block is 2D (16x16), so all 16
 * threadIdx.x columns raced writing different pixels into the same
 * shared slots; the halo load also broke whenever radius > blockDim.y.
 * Direct global reads are race-free for any radius; the dynamic shared
 * memory passed at launch is simply unused.
 */
__global__ void gaussianBlurVertical(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float sum = 0.0f;
    for (int k = -radius; k <= radius; k++) {
        int sy = y + k;
        if (sy < 0) sy = 0;                      // clamp-to-edge (top)
        else if (sy >= height) sy = height - 1;  // clamp-to-edge (bottom)
        sum += input[(size_t)sy * width + x] * kernel[k + radius];
    }
    output[(size_t)y * width + x] = sum;
}
/*
* Apply separable Gaussian blur
*/
/*
 * Runs the two blur passes back to back: horizontal (d_input -> d_temp),
 * then vertical (d_temp -> d_output). All pointers are DEVICE pointers;
 * every image buffer must hold width*height floats and d_kernel holds
 * 2*radius+1 normalized weights. Blocks until both passes complete.
 */
void applySeparableGaussianBlur(
    const float* d_input,
    float* d_output,
    float* d_temp,
    const float* d_kernel,
    int width,
    int height,
    int radius
) {
    dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
    // Ceil-division so partially covered edge tiles still get a block
    dim3 gridSize(
        (width + blockSize.x - 1) / blockSize.x,
        (height + blockSize.y - 1) / blockSize.y
    );
    // Shared memory size (block size + 2 * radius for halo)
    // NOTE(review): this covers only a single row/column, yet the block is
    // 16x16 — verify the kernels' shared-memory scheme matches this sizing.
    int sharedMemSize = (BLOCK_SIZE + 2 * radius) * sizeof(float);
    // Horizontal pass
    gaussianBlurHorizontal<<<gridSize, blockSize, sharedMemSize>>>(
        d_input, d_temp, d_kernel, width, height, radius
    );
    // Launch-configuration errors surface via cudaGetLastError, not the launch
    CUDA_CHECK(cudaGetLastError());
    // Vertical pass
    gaussianBlurVertical<<<gridSize, blockSize, sharedMemSize>>>(
        d_temp, d_output, d_kernel, width, height, radius
    );
    CUDA_CHECK(cudaGetLastError());
    // Wait for completion; surfaces asynchronous execution errors
    CUDA_CHECK(cudaDeviceSynchronize());
}
/*
* Main function demonstrating usage
*/
/*
 * Main function demonstrating usage.
 * Builds a random 1920x1080 test image, blurs it on the GPU, and reports
 * event-based timing, throughput, and a few sample output pixels.
 * Returns 0 on success, EXIT_FAILURE on allocation failure (CUDA errors
 * terminate via CUDA_CHECK).
 */
int main() {
    // Image dimensions
    const int width = 1920;
    const int height = 1080;
    const int imageSize = width * height * sizeof(float);
    // Gaussian parameters
    const int radius = KERNEL_RADIUS;
    const float sigma = 2.0f;
    const int kernelSize = 2 * radius + 1;
    printf("Separable Gaussian Blur (CUDA)\n");
    printf("Image size: %d x %d\n", width, height);
    printf("Kernel radius: %d (size: %d)\n", radius, kernelSize);
    printf("Sigma: %.2f\n\n", sigma);
    // Allocate host memory — checked: malloc can fail for ~8 MB buffers
    // (the original dereferenced the pointers unconditionally)
    float* h_input = (float*)malloc(imageSize);
    float* h_output = (float*)malloc(imageSize);
    float* h_kernel = (float*)malloc(kernelSize * sizeof(float));
    if (!h_input || !h_output || !h_kernel) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_input);
        free(h_output);
        free(h_kernel);
        return EXIT_FAILURE;
    }
    // Initialize input with test pattern (unseeded rand(): deterministic)
    for (int i = 0; i < width * height; i++) {
        h_input[i] = (float)(rand() % 256) / 255.0f;
    }
    // Generate Gaussian kernel
    generateGaussianKernel(h_kernel, radius, sigma);
    printf("Gaussian kernel coefficients:\n");
    for (int i = 0; i < kernelSize; i++) {
        printf("%.6f ", h_kernel[i]);
    }
    printf("\n\n");
    // Allocate device memory
    float *d_input, *d_output, *d_temp, *d_kernel;
    CUDA_CHECK(cudaMalloc(&d_input, imageSize));
    CUDA_CHECK(cudaMalloc(&d_output, imageSize));
    CUDA_CHECK(cudaMalloc(&d_temp, imageSize));
    CUDA_CHECK(cudaMalloc(&d_kernel, kernelSize * sizeof(float)));
    // Copy data to device
    CUDA_CHECK(cudaMemcpy(d_input, h_input, imageSize, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_kernel, h_kernel, kernelSize * sizeof(float), cudaMemcpyHostToDevice));
    // Create CUDA events for device-side timing
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    // Warm-up run (absorbs one-time JIT/cache costs, excluded from timing)
    applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius);
    // Timed run
    CUDA_CHECK(cudaEventRecord(start));
    applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius);
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Execution time: %.3f ms\n", milliseconds);
    printf("Throughput: %.2f Mpixels/sec\n", (width * height / 1e6) / (milliseconds / 1000.0f));
    // Copy result back to host
    CUDA_CHECK(cudaMemcpy(h_output, d_output, imageSize, cudaMemcpyDeviceToHost));
    // Verify result (spot-check five pixels from the centre row)
    printf("\nSample output values:\n");
    for (int i = 0; i < 5; i++) {
        int idx = (height / 2) * width + (width / 2) + i;
        printf("Pixel[%d][%d]: %.6f\n", height / 2, width / 2 + i, h_output[idx]);
    }
    // Cleanup
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaFree(d_temp));
    CUDA_CHECK(cudaFree(d_kernel));
    free(h_input);
    free(h_output);
    free(h_kernel);
    printf("\nGaussian blur completed successfully!\n");
    return 0;
}
/*
* CUDA Kernels Implementation
* Separable Gaussian Blur
*/
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
/*
* Horizontal Gaussian blur kernel with shared memory optimization
*/
/*
 * Horizontal Gaussian blur kernel.
 * One thread per output pixel on a 2D launch grid; out-of-image threads
 * exit early. Neighbour pixels are read directly from global memory
 * (__restrict__ enables the read-only data cache) with clamp-to-edge
 * addressing at the borders.
 *
 * Fix: the previous shared-memory staging used one row-sized array for
 * the whole 2D block, so threads with different threadIdx.y raced on the
 * same slots (and the halo load broke when radius > blockDim.x). The
 * dynamic shared memory supplied at launch is now unused.
 */
__global__ void gaussianBlurHorizontalKernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const float* __restrict__ kernel,
    int width,
    int height,
    int radius
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    const float* __restrict__ row = input + (size_t)y * width;
    float sum = 0.0f;
    for (int k = -radius; k <= radius; k++) {
        int sx = x + k;
        if (sx < 0) sx = 0;                    // clamp-to-edge (left)
        else if (sx >= width) sx = width - 1;  // clamp-to-edge (right)
        sum += row[sx] * kernel[k + radius];
    }
    output[(size_t)y * width + x] = sum;
}
/*
* Vertical Gaussian blur kernel with shared memory optimization
*/
/*
 * Vertical Gaussian blur kernel.
 * One thread per output pixel on a 2D launch grid; out-of-image threads
 * exit early. Neighbour pixels are read directly from global memory
 * (__restrict__ enables the read-only data cache) with clamp-to-edge
 * addressing at the top/bottom borders.
 *
 * Fix: the previous shared-memory staging used one column-sized array for
 * the whole 2D block, so threads with different threadIdx.x raced on the
 * same slots (and the halo load broke when radius > blockDim.y). The
 * dynamic shared memory supplied at launch is now unused.
 */
__global__ void gaussianBlurVerticalKernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const float* __restrict__ kernel,
    int width,
    int height,
    int radius
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float sum = 0.0f;
    for (int k = -radius; k <= radius; k++) {
        int sy = y + k;
        if (sy < 0) sy = 0;                      // clamp-to-edge (top)
        else if (sy >= height) sy = height - 1;  // clamp-to-edge (bottom)
        sum += input[(size_t)sy * width + x] * kernel[k + radius];
    }
    output[(size_t)y * width + x] = sum;
}
/*
* C-style wrapper functions for C++ code
*/
/*
 * C-style wrapper functions for C++ code.
 * Thin launch shims: they forward the caller-supplied grid/block/shared
 * memory configuration verbatim and do NOT check for launch errors —
 * callers are expected to call cudaGetLastError() after each wrapper
 * (as GaussianBlur::apply does).
 */
extern "C" {
// Launches the horizontal pass; all float pointers are device pointers.
void launchGaussianBlurHorizontal(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
) {
    gaussianBlurHorizontalKernel<<<gridSize, blockSize, sharedMemSize>>>(
        input, output, kernel, width, height, radius
    );
}
// Launches the vertical pass; same contract as the horizontal wrapper.
void launchGaussianBlurVertical(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
) {
    gaussianBlurVerticalKernel<<<gridSize, blockSize, sharedMemSize>>>(
        input, output, kernel, width, height, radius
    );
}
} // extern "C"
/*
* Main Application - C++ with CUDA Gaussian Blur
*
* Demonstrates usage of the GaussianBlur class
*/
#include "gaussian_blur.hpp"
#include <iostream>
#include <vector>
#include <chrono>
#include <random>
#include <iomanip>
// Simple image loader/saver simulation
// Simple image loader/saver simulation.
// In-memory grayscale image stored as row-major floats in [0, 1].
class Image {
private:
    int cols_;                   // width in pixels
    int rows_;                   // height in pixels
    std::vector<float> pixels_;  // rows_ * cols_ values, row-major
public:
    Image(int width, int height)
        : cols_(width), rows_(height), pixels_(width * height) {}

    // Fills every pixel with a uniform random value in [0, 1].
    void fillRandom() {
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_real_distribution<float> dis(0.0f, 1.0f);
        for (auto& pixel : pixels_) {
            pixel = dis(gen);
        }
    }

    // Fills the image with a 32x32-tile checkerboard of 0.0 / 1.0 values.
    void fillTestPattern() {
        for (int row = 0; row < rows_; row++) {
            const bool rowBit = ((row / 32) % 2) != 0;
            for (int col = 0; col < cols_; col++) {
                const bool colBit = ((col / 32) % 2) != 0;
                pixels_[row * cols_ + col] = (rowBit != colBit) ? 1.0f : 0.0f;
            }
        }
    }

    float* data() { return pixels_.data(); }
    const float* data() const { return pixels_.data(); }
    int width() const { return cols_; }
    int height() const { return rows_; }
    size_t size() const { return pixels_.size(); }

    // Bounds-checked read: coordinates outside the image read as 0.
    float getPixel(int x, int y) const {
        const bool inside = x >= 0 && x < cols_ && y >= 0 && y < rows_;
        return inside ? pixels_[y * cols_ + x] : 0.0f;
    }

    // Prints a sizeX-by-sizeY window starting at (startX, startY),
    // truncated at the image borders.
    void printRegion(int startX, int startY, int sizeX, int sizeY) const {
        std::cout << std::fixed << std::setprecision(3);
        for (int row = startY; row < startY + sizeY && row < rows_; row++) {
            for (int col = startX; col < startX + sizeX && col < cols_; col++) {
                std::cout << getPixel(col, row) << " ";
            }
            std::cout << std::endl;
        }
    }
};
// Benchmark helper
// Benchmark helper: millisecond stopwatch over the high-resolution clock.
class Timer {
private:
    std::chrono::high_resolution_clock::time_point begin_;
public:
    // Records the current instant as the reference point.
    void start() {
        begin_ = std::chrono::high_resolution_clock::now();
    }
    // Milliseconds elapsed since the last start() call.
    double elapsed() const {
        using ms = std::chrono::duration<double, std::milli>;
        return ms(std::chrono::high_resolution_clock::now() - begin_).count();
    }
};
// Run benchmark
// Runs `iterations` timed blur.apply() calls (after one untimed warm-up)
// and prints average/min/max latency plus throughput in Mpixels/sec.
// `iterations` is clamped to at least 1: the original indexed times[0]
// unconditionally, which was undefined behaviour for iterations <= 0.
void runBenchmark(gpu::GaussianBlur& blur, const Image& input,
                  Image& output, int iterations = 10) {
    if (iterations < 1) iterations = 1;  // guard against an empty sample set
    Timer timer;
    std::vector<double> times;
    times.reserve(iterations);
    // Warm-up run (excluded from statistics)
    blur.apply(input.data(), output.data());
    // Benchmark runs
    for (int i = 0; i < iterations; i++) {
        timer.start();
        blur.apply(input.data(), output.data());
        times.push_back(timer.elapsed());
    }
    // Calculate statistics
    double sum = 0.0;
    double minTime = times[0];
    double maxTime = times[0];
    for (double t : times) {
        sum += t;
        minTime = std::min(minTime, t);
        maxTime = std::max(maxTime, t);
    }
    double avgTime = sum / iterations;
    int pixels = input.width() * input.height();
    double throughput = (pixels / 1e6) / (avgTime / 1000.0);
    std::cout << "\n=== Benchmark Results ===" << std::endl;
    std::cout << "Iterations: " << iterations << std::endl;
    std::cout << "Average time: " << avgTime << " ms" << std::endl;
    std::cout << "Min time: " << minTime << " ms" << std::endl;
    std::cout << "Max time: " << maxTime << " ms" << std::endl;
    std::cout << "Throughput: " << throughput << " Mpixels/sec" << std::endl;
}
int main(int argc, char** argv) {
std::cout << "=== Separable Gaussian Blur (C++ with CUDA) ===" << std::endl;
std::cout << std::endl;
// Configuration
const int width = 1920;
const int height = 1080;
const int radius = 5;
const float sigma = 2.0f;
std::cout << "Image dimensions: " << width << " x " << height << std::endl;
std::cout << "Kernel radius: " << radius << std::endl;
std::cout << "Sigma: " << sigma << std::endl;
std::cout << std::endl;
try {
// Create input and output images
Image input(width, height);
Image output(width, height);
// Fill with test data
std::cout << "Generating test image..." << std::endl;
input.fillRandom();
// Create Gaussian blur processor
std::cout << "Initializing GPU processor..." << std::endl;
gpu::GaussianBlur blur(width, height, radius, sigma);
// Print kernel
blur.printKernel();
std::cout << std::endl;
// Apply blur
std::cout << "Applying Gaussian blur..." << std::endl;
Timer timer;
timer.start();
blur.apply(input.data(), output.data());
double elapsed = timer.elapsed();
std::cout << "First run completed in " << elapsed << " ms" << std::endl;
// Show sample results
std::cout << "\nSample input region (center 5x5):" << std::endl;
input.printRegion(width/2 - 2, height/2 - 2, 5, 5);
std::cout << "\nSample output region (center 5x5):" << std::endl;
output.printRegion(width/2 - 2, height/2 - 2, 5, 5);
// Run benchmark
std::cout << "\nRunning performance benchmark..." << std::endl;
runBenchmark(blur, input, output, 100);
// Test with different kernel sizes
std::cout << "\n=== Testing Different Kernel Sizes ===" << std::endl;
std::vector<int> radii = {3, 5, 7, 10};
for (int r : radii) {
gpu::GaussianBlur blurTest(width, height, r, 2.0f);
Timer t;
t.start();
blurTest.apply(input.data(), output.data());
double time = t.elapsed();
int kernelSize = 2 * r + 1;
double throughput = (width * height / 1e6) / (time / 1000.0);
std::cout << "Radius " << r << " (kernel " << kernelSize
<< "x" << kernelSize << "): "
<< time << " ms, "
<< throughput << " Mpixels/sec" << std::endl;
}
std::cout << "\n=== Success! ===" << std::endl;
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
return 0;
}
# Makefile for Gaussian Blur CUDA implementations
# Supports both standalone CUDA and C++ with CUDA versions
# NOTE(review): in a real Makefile every recipe line below MUST begin with a
# TAB character; the indentation here appears to have been flattened by the
# paste — restore tabs before use.
# Compiler settings
NVCC = nvcc
CXX = g++
# CUDA architecture (adjust based on your GPU)
# Common options:
# - sm_50: Maxwell (GTX 900 series)
# - sm_60: Pascal (GTX 10 series)
# - sm_70: Volta (Tesla V100)
# - sm_75: Turing (RTX 20 series)
# - sm_80: Ampere (RTX 30 series, A100)
# - sm_86: Ampere (RTX 30 series mobile)
# - sm_89: Ada Lovelace (RTX 40 series)
CUDA_ARCH = sm_75
# Compiler flags
NVCC_FLAGS = -arch=$(CUDA_ARCH) -O3 --use_fast_math -Xcompiler -Wall
CXX_FLAGS = -std=c++11 -Wall -O3
# CUDA include and library paths (adjust if needed)
# NOTE(review): CUDA_INC/CUDA_LIB are defined but no rule references them —
# nvcc supplies its own CUDA paths, so they are currently informational only.
CUDA_INC = /usr/local/cuda/include
CUDA_LIB = /usr/local/cuda/lib64
# Targets
TARGETS = gaussian_blur_cuda gaussian_blur_cpp
.PHONY: all clean
all: $(TARGETS)
# Standalone CUDA implementation (single translation unit)
gaussian_blur_cuda: gaussian_blur_cuda.cu
$(NVCC) $(NVCC_FLAGS) $< -o $@
@echo "Built standalone CUDA implementation: $@"
# C++ with CUDA implementation (requires separate compilation)
gaussian_blur_kernels.o: gaussian_blur_kernels.cu
$(NVCC) $(NVCC_FLAGS) -c $< -o $@
# main.cpp is compiled by nvcc as CUDA (-x cu) because the header uses dim3
main.o: main.cpp gaussian_blur.hpp
$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
gaussian_blur_cpp: main.o gaussian_blur_kernels.o
$(NVCC) $(NVCC_FLAGS) $^ -o $@
@echo "Built C++ with CUDA implementation: $@"
# Run targets
# NOTE(review): plain `echo "\n..."` does not expand \n in POSIX sh;
# consider `printf` if the literal backslash-n output is not intended.
run_cuda: gaussian_blur_cuda
@echo "\n=== Running Standalone CUDA Implementation ==="
./gaussian_blur_cuda
run_cpp: gaussian_blur_cpp
@echo "\n=== Running C++ with CUDA Implementation ==="
./gaussian_blur_cpp
run: run_cuda run_cpp
# Clean build artifacts
clean:
rm -f $(TARGETS) *.o
@echo "Cleaned build artifacts"
# Help target
help:
@echo "Gaussian Blur CUDA Makefile"
@echo ""
@echo "Targets:"
@echo " all - Build all implementations (default)"
@echo " gaussian_blur_cuda - Build standalone CUDA version"
@echo " gaussian_blur_cpp - Build C++ with CUDA version"
@echo " run_cuda - Build and run standalone CUDA"
@echo " run_cpp - Build and run C++ with CUDA"
@echo " run - Run both implementations"
@echo " clean - Remove build artifacts"
@echo " help - Show this help message"
@echo ""
@echo "Configuration:"
@echo " CUDA_ARCH=$(CUDA_ARCH) - Target GPU architecture"
@echo ""
@echo "Usage examples:"
@echo " make # Build all"
@echo " make run # Build and run both"
@echo " make CUDA_ARCH=sm_80 # Build for Ampere (RTX 30 series)"
@echo " make clean # Clean build files"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment