Skip to content

Instantly share code, notes, and snippets.

@JoshuaManton
Last active September 5, 2025 14:55
Show Gist options
  • Select an option

  • Save JoshuaManton/9b1916b51e1a22dfcf44842c1bfbcebb to your computer and use it in GitHub Desktop.

Select an option

Save JoshuaManton/9b1916b51e1a22dfcf44842c1bfbcebb to your computer and use it in GitHub Desktop.
SPMD software rendering example
//
// Build:
// cl /O2 main.cpp
//
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <synchapi.h>
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <thread>
#include <assert.h>
#include <intrin.h>
#include <emmintrin.h>
#pragma comment(lib, "user32.lib")
#pragma comment(lib, "gdi32.lib")
#pragma comment(lib, "kernel32.lib")
#pragma comment(lib, "Synchronization.lib")
//
// Types, intrinsics
//
// Fixed-width integer shorthands used throughout the file.
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int8_t s8;
typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;
// Default signed integer type for sizes, counts and coordinates.
typedef int64_t sint;

// Returns true when v is a positive power of two (1, 2, 4, ...).
// Zero and every negative value return false. The v > 0 guard also keeps
// INT64_MIN away from the `v - 1` below, which would overflow for that input.
bool is_power_of_two(s64 v) {
    return v > 0 && ((v & (v - 1)) == 0);
}

// Rounds v up to the next multiple of `alignment`.
// `alignment` must be a power of two (asserted); a value that is already
// aligned is returned unchanged.
s64 align_forward(s64 v, s64 alignment) {
    assert(is_power_of_two(alignment));
    const s64 mask = alignment - 1;
    return (v + mask) & ~mask;
}
// Atomically reads *ptr and returns the value.
// Done as a compare-exchange that swaps 0 for 0: the stored value is never
// changed, and the intrinsic hands back whatever was in memory.
s64 atomic_load(volatile s64 *ptr) {
    const s64 observed = _InterlockedCompareExchange64(ptr, 0, 0);
    return observed;
}
// Atomically writes `value` into *ptr. The previous contents returned by the
// exchange intrinsic are intentionally discarded.
void atomic_store(volatile s64 *ptr, s64 value) {
    (void)_InterlockedExchange64(ptr, value);
}
// Atomically adds one to *ptr and returns the result of the increment
// intrinsic (the post-increment value).
s64 atomic_increment(volatile s64 *ptr) {
    const s64 incremented = _InterlockedIncrement64(ptr);
    return incremented;
}
//
// Futex
//
struct Futex {
volatile s64 value;
void set_and_signal(s64 v) {
atomic_store(&value, v);
WakeByAddressAll((void *)&value);
}
void wait_until_greater_than_or_equal(s64 expected) {
int spin_iterations = 50000;
while (true) {
s64 current = atomic_load(&value);
if (current >= expected) {
return;
}
if (spin_iterations --> 0) {
_mm_pause();
continue;
}
WaitOnAddress((void *)&value, &current, sizeof(current), INFINITE);
}
}
s64 increment_and_signal() {
s64 result = atomic_increment(&value);
WakeByAddressAll((void *)&value);
return result;
}
};
//
// SPMD
//
// Number of SPMD lanes. LANE_BARRIER releases once this many lanes have
// arrived, and distribute_work splits work into this many ranges.
enum {
SPMD_THREAD_COUNT = 8, // note(josh): includes the main thread
};
// Shared barrier word used by LANE_BARRIER: the high 32 bits hold the
// generation, the low 32 bits count arrivals in the current generation.
Futex g_lane_barrier;
// This thread's lane index; assigned by INIT_THREAD. Lane 0 is the leader.
thread_local u64 g_lane_id;
void INIT_THREAD(int lane_id) {
g_lane_id = lane_id;
}
// True only on the thread whose lane index is 0.
bool is_warp_leader() {
    const bool leader = (0 == g_lane_id);
    return leader;
}
void LANE_BARRIER() {
u64 arrived = g_lane_barrier.increment_and_signal();
u64 gen = arrived >> 32;
u64 count = arrived & 0xffffffff;
u64 wait_for = (gen + 1) << 32;
if (count == SPMD_THREAD_COUNT) {
g_lane_barrier.set_and_signal(wait_for);
}
else {
g_lane_barrier.wait_until_greater_than_or_equal(wait_for);
}
}
// Half-open range [start, one_past_last) of work-item indices, as returned
// by distribute_work.
struct Work_Range {
sint start;
sint one_past_last;
};
// Returns the calling lane's contiguous share of `total_work` items.
// Work is split as evenly as possible across SPMD_THREAD_COUNT lanes: the
// first (total_work % SPMD_THREAD_COUNT) lanes receive one extra item.
// A non-positive total_work yields the empty range [0, 0).
Work_Range distribute_work(sint total_work) {
    Work_Range range;
    if (total_work <= 0) {
        range.start = 0;
        range.one_past_last = 0;
        return range;
    }

    // Do all arithmetic in signed sint; comparing the unsigned lane id
    // directly against a signed remainder would silently convert the
    // remainder to unsigned.
    const sint lane = (sint)g_lane_id;
    const sint base = total_work / SPMD_THREAD_COUNT;
    const sint rem = total_work % SPMD_THREAD_COUNT;

    if (lane < rem) {
        // One of the lanes that absorbs a leftover item.
        const sint count = base + 1;
        range.start = lane * count;
        range.one_past_last = range.start + count;
    }
    else {
        // Skip past the `rem` lanes that each took one extra item.
        range.start = lane * base + rem;
        range.one_past_last = range.start + base;
    }
    return range;
}
//
// Renderer
//
// CPU-side render target that is blitted to the window each frame.
struct Framebuffer {
// Client-area size in pixels, taken from GetClientRect in WM_SIZE.
sint visible_width;
sint visible_height;
// Visible size rounded up to a multiple of SCREEN_TILE_DIM; these are the
// dimensions of the allocation, and aligned_width * 4 is the row stride in bytes.
sint aligned_width;
sint aligned_height;
// aligned_width * aligned_height pixels, 4 bytes each, allocated with VirtualAlloc.
u8 *bgra_pixels;
};
// The single framebuffer: resized by the window procedure, filled by every lane in run().
Framebuffer g_framebuffer;
// Set by the window procedure on Escape / WM_CLOSE / WM_DESTROY; run() checks it each frame.
bool g_quit;
// Edge length, in pixels, of the square screen tiles handed out to lanes.
// Passed to align_forward as the alignment, so it must stay a power of two.
enum {
SCREEN_TILE_DIM = 64,
};
// Window procedure for the main window.
//   WM_SIZE            - records the new client size and (re)allocates the
//                        tile-aligned framebuffer when its aligned size changes.
//   WM_KEYDOWN(Escape),
//   WM_CLOSE, WM_DESTROY - set g_quit and post a quit message.
//   WM_PAINT           - validates the window; actual drawing happens in run().
// Everything else is forwarded to DefWindowProc. Handled messages return 0.
LRESULT CALLBACK main_window_callback(HWND window, UINT message, WPARAM w_param, LPARAM l_param) {
    LRESULT result = 0;
    switch (message) {
        case WM_SIZE: {
            RECT client_rect;
            GetClientRect(window, &client_rect);
            sint width = client_rect.right - client_rect.left;
            sint height = client_rect.bottom - client_rect.top;

            // Always track the visible size, even when the aligned size (and
            // therefore the allocation) is unchanged; the blit uses it as the
            // source rectangle.
            g_framebuffer.visible_width = width;
            g_framebuffer.visible_height = height;

            sint new_aligned_width = align_forward(width, SCREEN_TILE_DIM);
            sint new_aligned_height = align_forward(height, SCREEN_TILE_DIM);
            if (g_framebuffer.aligned_width != new_aligned_width || g_framebuffer.aligned_height != new_aligned_height) {
                if (g_framebuffer.bgra_pixels) {
                    VirtualFree(g_framebuffer.bgra_pixels, 0, MEM_RELEASE);
                    g_framebuffer.bgra_pixels = 0; // never leave a dangling pointer behind
                }
                g_framebuffer.aligned_width = new_aligned_width;
                g_framebuffer.aligned_height = new_aligned_height;

                sint pixel_count = new_aligned_width * new_aligned_height;
                if (pixel_count > 0) {
                    g_framebuffer.bgra_pixels = (u8 *)VirtualAlloc(0, pixel_count * 4 * sizeof(u8), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
                }
                if (!g_framebuffer.bgra_pixels) {
                    // No backing memory (zero-sized client area or allocation
                    // failure): report an empty framebuffer so no tiles are
                    // generated for it. The next WM_SIZE will try again.
                    g_framebuffer.aligned_width = 0;
                    g_framebuffer.aligned_height = 0;
                }
            }
            break;
        }
        case WM_KEYDOWN: {
            if (w_param == VK_ESCAPE) {
                g_quit = true;
                PostQuitMessage(0);
            }
            break;
        }
        case WM_DESTROY: {
            g_quit = true;
            PostQuitMessage(0);
            break;
        }
        case WM_CLOSE: {
            g_quit = true;
            PostQuitMessage(0);
            break;
        }
        case WM_PAINT: {
            // Nothing is drawn here; Begin/EndPaint just validate the update region.
            PAINTSTRUCT paint;
            BeginPaint(window, &paint);
            EndPaint(window, &paint);
            break;
        }
        default: {
            result = DefWindowProc(window, message, w_param, l_param);
            break;
        }
    }
    return result;
}
// Per-lane entry point; every SPMD thread calls this with its own lane_id.
// Lane 0 (the warp leader) creates the window, pumps window messages and
// blits the framebuffer to the window. Every lane, leader included, fills its
// share of framebuffer tiles each frame. Returns once g_quit is observed
// after the message-pump barrier.
void run(u64 lane_id) {
INIT_THREAD(lane_id);
// Function-local static: one handle shared by all lanes. Only the leader
// writes it, and only before the first barrier below.
static HWND window_handle;
// create the window
if (is_warp_leader()) {
HINSTANCE instance = GetModuleHandle(0);
WNDCLASSA window_class = {0};
window_class.style = CS_HREDRAW|CS_VREDRAW;
window_class.lpfnWndProc = main_window_callback;
window_class.hInstance = instance;
window_class.lpszClassName = "windowclass";
if (RegisterClassA(&window_class)) {
window_handle = CreateWindowExA(
0,
window_class.lpszClassName,
"Software Renderer",
WS_OVERLAPPEDWINDOW|WS_VISIBLE,
CW_USEDEFAULT,
CW_USEDEFAULT,
CW_USEDEFAULT,
CW_USEDEFAULT,
0,
0,
instance,
0);
}
}
// wait for the window to be created
LANE_BARRIER();
assert(window_handle);
// main loop
// Shared frame counter; only the leader increments it, before the barrier
// that precedes drawing.
static sint frame_count = 0;
while (true) {
if (is_warp_leader()) {
frame_count++;
// Drain pending window messages. main_window_callback runs from
// DispatchMessage here, so framebuffer resizes and g_quit updates
// happen before the barrier below.
MSG message;
while (PeekMessage(&message, 0, 0, 0, PM_REMOVE)) {
TranslateMessage(&message);
DispatchMessage(&message);
}
}
// wait for windows message loop
LANE_BARRIER();
// Checked by every lane after the same barrier, so all lanes leave the
// loop on the same frame.
if (g_quit) {
break;
}
// Pixel rectangle of one tile; the *_hi bounds are inclusive.
struct Screen_Tile {
sint x_lo;
sint y_lo;
sint x_hi;
sint y_hi;
};
// Fixed-capacity, per-lane list of tiles for this frame (capacity is asserted below).
Screen_Tile my_tiles[4096];
sint my_tile_count = 0;
// set up this thread's tiles
{
sint tiles_needed_x = (g_framebuffer.aligned_width + SCREEN_TILE_DIM - 1) / SCREEN_TILE_DIM;
sint tiles_needed_y = (g_framebuffer.aligned_height + SCREEN_TILE_DIM - 1) / SCREEN_TILE_DIM;
sint total_tile_count = tiles_needed_x * tiles_needed_y;
Work_Range tile_work_range = distribute_work(total_tile_count);
// Tiles are numbered row-major; convert each linear index in this lane's
// range into a pixel rectangle.
for (sint tile_index = tile_work_range.start; tile_index < tile_work_range.one_past_last; tile_index++) {
assert(my_tile_count < ARRAYSIZE(my_tiles));
Screen_Tile *tile = &my_tiles[my_tile_count++];
tile->x_lo = (tile_index % tiles_needed_x) * SCREEN_TILE_DIM;
tile->y_lo = (tile_index / tiles_needed_x) * SCREEN_TILE_DIM;
tile->x_hi = tile->x_lo + SCREEN_TILE_DIM - 1;
tile->y_hi = tile->y_lo + SCREEN_TILE_DIM - 1;
}
}
// draw gradient, one tile at a time
for (sint tile_index = 0; tile_index < my_tile_count; tile_index++) {
Screen_Tile *tile = &my_tiles[tile_index];
for (sint y = tile->y_lo; y <= tile->y_hi; y++) {
// Start of this tile's span within row y (4 bytes per pixel, row stride aligned_width * 4).
u8 *row = g_framebuffer.bgra_pixels + y * g_framebuffer.aligned_width * 4 + tile->x_lo * 4;
for (sint x = tile->x_lo; x <= tile->x_hi; x++) {
// Bytes 0 and 1 scroll with frame_count along x and y; byte 2 is 0, byte 3 is 255.
// Presumably B, G, R, A order, going by the field name bgra_pixels.
row[0] = (frame_count*4 + x) % 255;
row[1] = (frame_count*4 + y) % 255;
row[2] = 0;
row[3] = 255;
row += 4;
}
}
}
// wait for all threads to finish drawing their tiles
LANE_BARRIER();
// draw to the screen
if (is_warp_leader()) {
HDC hdc = GetDC(window_handle);
RECT client_rect;
GetClientRect(window_handle, &client_rect);
sint window_width = client_rect.right - client_rect.left;
sint window_height = client_rect.bottom - client_rect.top;
// Describe the buffer as 32 bits per pixel, uncompressed, aligned_width pixels wide.
// The height is negated; per the Win32 BITMAPINFOHEADER documentation a negative
// biHeight denotes a top-down bitmap.
BITMAPINFO bitmap_info = {};
bitmap_info.bmiHeader.biSize = sizeof(bitmap_info.bmiHeader);
bitmap_info.bmiHeader.biWidth = g_framebuffer.aligned_width;
bitmap_info.bmiHeader.biHeight = -g_framebuffer.aligned_height;
bitmap_info.bmiHeader.biPlanes = 1;
bitmap_info.bmiHeader.biBitCount = 32;
bitmap_info.bmiHeader.biCompression = BI_RGB;
// Source rectangle is the visible part of the buffer; destination is the
// current client area.
StretchDIBits(hdc,
0, 0, window_width, window_height,
0, 0, g_framebuffer.visible_width, g_framebuffer.visible_height,
g_framebuffer.bgra_pixels,
&bitmap_info,
DIB_RGB_COLORS, SRCCOPY);
ReleaseDC(window_handle, hdc);
}
// Every lane sleeps 16 ms at the end of each frame.
Sleep(16);
}
}
// Worker thread handles. main() starts SPMD_THREAD_COUNT-1 of them (slots
// 0..SPMD_THREAD_COUNT-2); the remaining lane is the main thread itself.
std::thread threads[SPMD_THREAD_COUNT];
// Program entry point: starts one worker thread for each lane 1..N-1, runs
// lane 0 on the calling thread, then joins the workers once run(0) returns.
int main(int argc, char **argv) {
    SetProcessDPIAware();

    const int worker_count = SPMD_THREAD_COUNT - 1;

    // Worker in slot k runs lane k+1; lane 0 is reserved for this thread.
    for (int lane = 1; lane <= worker_count; lane++) {
        threads[lane - 1] = std::thread(run, lane);
    }

    run(0);

    for (int slot = 0; slot < worker_count; slot++) {
        threads[slot].join();
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment