Last active
September 5, 2025 14:55
-
-
Save JoshuaManton/9b1916b51e1a22dfcf44842c1bfbcebb to your computer and use it in GitHub Desktop.
SPMD software rendering example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//
// Build:
//   cl /O2 main.cpp
//
| #define WIN32_LEAN_AND_MEAN | |
| #define NOMINMAX | |
| #include <windows.h> | |
| #include <synchapi.h> | |
| #include <stdint.h> | |
| #include <stddef.h> | |
| #include <stdlib.h> | |
| #include <stdio.h> | |
| #include <thread> | |
| #include <assert.h> | |
| #include <intrin.h> | |
| #include <emmintrin.h> | |
| #pragma comment(lib, "user32.lib") | |
| #pragma comment(lib, "gdi32.lib") | |
| #pragma comment(lib, "kernel32.lib") | |
| #pragma comment(lib, "Synchronization.lib") | |
//
// Types, intrinsics
//
// Fixed-width integer aliases used throughout this file.
using u8  = uint8_t;
using u16 = uint16_t;
using u32 = uint32_t;
using u64 = uint64_t;
using s8  = int8_t;
using s16 = int16_t;
using s32 = int32_t;
using s64 = int64_t;
using sint = int64_t; // signed 64-bit integer used for sizes, counts and indices below

// Returns true when v is a positive power of two (1, 2, 4, ...).
// Zero and every negative value return false. The v > 0 guard also keeps
// INT64_MIN away from `v - 1`, which would otherwise overflow.
bool is_power_of_two(s64 v) {
    return v > 0 && (v & (v - 1)) == 0;
}

// Rounds v up to the nearest multiple of alignment.
// alignment must be a power of two (checked with assert); a v that is already
// a multiple of alignment is returned unchanged.
s64 align_forward(s64 v, s64 alignment) {
    assert(is_power_of_two(alignment));
    const s64 mask = alignment - 1;
    return (v + mask) & ~mask;
}
| s64 atomic_load(volatile s64 *ptr) { return _InterlockedCompareExchange64(ptr, 0, 0); } | |
| void atomic_store(volatile s64 *ptr, s64 value) { _InterlockedExchange64(ptr, value); } | |
| s64 atomic_increment(volatile s64 *ptr) { return _InterlockedIncrement64(ptr); } | |
//
// Futex
//
| struct Futex { | |
| volatile s64 value; | |
| void set_and_signal(s64 v) { | |
| atomic_store(&value, v); | |
| WakeByAddressAll((void *)&value); | |
| } | |
| void wait_until_greater_than_or_equal(s64 expected) { | |
| int spin_iterations = 50000; | |
| while (true) { | |
| s64 current = atomic_load(&value); | |
| if (current >= expected) { | |
| return; | |
| } | |
| if (spin_iterations --> 0) { | |
| _mm_pause(); | |
| continue; | |
| } | |
| WaitOnAddress((void *)&value, ¤t, sizeof(current), INFINITE); | |
| } | |
| } | |
| s64 increment_and_signal() { | |
| s64 result = atomic_increment(&value); | |
| WakeByAddressAll((void *)&value); | |
| return result; | |
| } | |
| }; | |
//
// SPMD
//
// Number of lanes (threads) that take part in the SPMD execution.
enum {
    SPMD_THREAD_COUNT = 8, // note(josh): includes the main thread
};
| Futex g_lane_barrier; | |
| thread_local u64 g_lane_id; | |
| void INIT_THREAD(int lane_id) { | |
| g_lane_id = lane_id; | |
| } | |
| bool is_warp_leader() { | |
| return g_lane_id == 0; | |
| } | |
| void LANE_BARRIER() { | |
| u64 arrived = g_lane_barrier.increment_and_signal(); | |
| u64 gen = arrived >> 32; | |
| u64 count = arrived & 0xffffffff; | |
| u64 wait_for = (gen + 1) << 32; | |
| if (count == SPMD_THREAD_COUNT) { | |
| g_lane_barrier.set_and_signal(wait_for); | |
| } | |
| else { | |
| g_lane_barrier.wait_until_greater_than_or_equal(wait_for); | |
| } | |
| } | |
| struct Work_Range { | |
| sint start; | |
| sint one_past_last; | |
| }; | |
| Work_Range distribute_work(sint total_work) { | |
| sint base = total_work / SPMD_THREAD_COUNT; | |
| sint rem = total_work % SPMD_THREAD_COUNT; | |
| Work_Range range; | |
| if (g_lane_id < rem) { | |
| sint count = base + 1; | |
| range.start = g_lane_id * count; | |
| range.one_past_last = range.start + count; | |
| } | |
| else { | |
| sint count = base; | |
| range.start = g_lane_id * base + rem; | |
| range.one_past_last = range.start + count; | |
| } | |
| return range; | |
| } | |
//
// Renderer
//
| struct Framebuffer { | |
| sint visible_width; | |
| sint visible_height; | |
| sint aligned_width; | |
| sint aligned_height; | |
| u8 *bgra_pixels; | |
| }; | |
| Framebuffer g_framebuffer; | |
| bool g_quit; | |
// Edge length, in pixels, of the square screen tiles handed out to lanes.
enum {
    SCREEN_TILE_DIM = 64,
};
| LRESULT CALLBACK main_window_callback(HWND window, UINT message, WPARAM w_param, LPARAM l_param) { | |
| LRESULT result = 0; | |
| switch (message) { | |
| case WM_SIZE: { | |
| RECT client_rect; | |
| GetClientRect(window, &client_rect); | |
| sint width = client_rect.right - client_rect.left; | |
| sint height = client_rect.bottom - client_rect.top; | |
| { | |
| sint new_aligned_width = align_forward(width, SCREEN_TILE_DIM); | |
| sint new_aligned_height = align_forward(height, SCREEN_TILE_DIM); | |
| if (g_framebuffer.aligned_width != new_aligned_width || g_framebuffer.aligned_height != new_aligned_height) { | |
| if (g_framebuffer.bgra_pixels) { | |
| VirtualFree(g_framebuffer.bgra_pixels, 0, MEM_RELEASE); | |
| } | |
| g_framebuffer.visible_width = width; | |
| g_framebuffer.visible_height = height; | |
| g_framebuffer.aligned_width = new_aligned_width; | |
| g_framebuffer.aligned_height = new_aligned_height; | |
| sint pixel_count = g_framebuffer.aligned_width * g_framebuffer.aligned_height; | |
| g_framebuffer.bgra_pixels = (u8 *)VirtualAlloc(0, pixel_count * 4 * sizeof(u8), MEM_COMMIT, PAGE_READWRITE); | |
| } | |
| } | |
| break; | |
| } | |
| case WM_KEYDOWN: { | |
| if (w_param == VK_ESCAPE) { | |
| g_quit = true; | |
| PostQuitMessage(0); | |
| } | |
| break; | |
| } | |
| case WM_DESTROY: { | |
| g_quit = true; | |
| PostQuitMessage(0); | |
| break; | |
| } | |
| case WM_CLOSE: { | |
| g_quit = true; | |
| PostQuitMessage(0); | |
| break; | |
| } | |
| case WM_PAINT: { | |
| PAINTSTRUCT paint; | |
| HDC device_context = BeginPaint(window, &paint); | |
| EndPaint(window, &paint); | |
| break; | |
| } | |
| default: { | |
| result = DefWindowProc(window, message, w_param, l_param); | |
| break; | |
| } | |
| } | |
| return result; | |
| } | |
| void run(u64 lane_id) { | |
| INIT_THREAD(lane_id); | |
| static HWND window_handle; | |
| // create the window | |
| if (is_warp_leader()) { | |
| HINSTANCE instance = GetModuleHandle(0); | |
| WNDCLASSA window_class = {0}; | |
| window_class.style = CS_HREDRAW|CS_VREDRAW; | |
| window_class.lpfnWndProc = main_window_callback; | |
| window_class.hInstance = instance; | |
| window_class.lpszClassName = "windowclass"; | |
| if (RegisterClassA(&window_class)) { | |
| window_handle = CreateWindowExA( | |
| 0, | |
| window_class.lpszClassName, | |
| "Software Renderer", | |
| WS_OVERLAPPEDWINDOW|WS_VISIBLE, | |
| CW_USEDEFAULT, | |
| CW_USEDEFAULT, | |
| CW_USEDEFAULT, | |
| CW_USEDEFAULT, | |
| 0, | |
| 0, | |
| instance, | |
| 0); | |
| } | |
| } | |
| // wait for the window to be created | |
| LANE_BARRIER(); | |
| assert(window_handle); | |
| // main loop | |
| static sint frame_count = 0; | |
| while (true) { | |
| if (is_warp_leader()) { | |
| frame_count++; | |
| MSG message; | |
| while (PeekMessage(&message, 0, 0, 0, PM_REMOVE)) { | |
| TranslateMessage(&message); | |
| DispatchMessage(&message); | |
| } | |
| } | |
| // wait for windows message loop | |
| LANE_BARRIER(); | |
| if (g_quit) { | |
| break; | |
| } | |
| struct Screen_Tile { | |
| sint x_lo; | |
| sint y_lo; | |
| sint x_hi; | |
| sint y_hi; | |
| }; | |
| Screen_Tile my_tiles[4096]; | |
| sint my_tile_count = 0; | |
| // set up this thread's tiles | |
| { | |
| sint tiles_needed_x = (g_framebuffer.aligned_width + SCREEN_TILE_DIM - 1) / SCREEN_TILE_DIM; | |
| sint tiles_needed_y = (g_framebuffer.aligned_height + SCREEN_TILE_DIM - 1) / SCREEN_TILE_DIM; | |
| sint total_tile_count = tiles_needed_x * tiles_needed_y; | |
| Work_Range tile_work_range = distribute_work(total_tile_count); | |
| for (sint tile_index = tile_work_range.start; tile_index < tile_work_range.one_past_last; tile_index++) { | |
| assert(my_tile_count < ARRAYSIZE(my_tiles)); | |
| Screen_Tile *tile = &my_tiles[my_tile_count++]; | |
| tile->x_lo = (tile_index % tiles_needed_x) * SCREEN_TILE_DIM; | |
| tile->y_lo = (tile_index / tiles_needed_x) * SCREEN_TILE_DIM; | |
| tile->x_hi = tile->x_lo + SCREEN_TILE_DIM - 1; | |
| tile->y_hi = tile->y_lo + SCREEN_TILE_DIM - 1; | |
| } | |
| } | |
| // draw gradient, one tile at a time | |
| for (sint tile_index = 0; tile_index < my_tile_count; tile_index++) { | |
| Screen_Tile *tile = &my_tiles[tile_index]; | |
| for (sint y = tile->y_lo; y <= tile->y_hi; y++) { | |
| u8 *row = g_framebuffer.bgra_pixels + y * g_framebuffer.aligned_width * 4 + tile->x_lo * 4; | |
| for (sint x = tile->x_lo; x <= tile->x_hi; x++) { | |
| row[0] = (frame_count*4 + x) % 255; | |
| row[1] = (frame_count*4 + y) % 255; | |
| row[2] = 0; | |
| row[3] = 255; | |
| row += 4; | |
| } | |
| } | |
| } | |
| // wait for all threads to finish drawing their tiles | |
| LANE_BARRIER(); | |
| // draw to the screen | |
| if (is_warp_leader()) { | |
| HDC hdc = GetDC(window_handle); | |
| RECT client_rect; | |
| GetClientRect(window_handle, &client_rect); | |
| sint window_width = client_rect.right - client_rect.left; | |
| sint window_height = client_rect.bottom - client_rect.top; | |
| BITMAPINFO bitmap_info = {}; | |
| bitmap_info.bmiHeader.biSize = sizeof(bitmap_info.bmiHeader); | |
| bitmap_info.bmiHeader.biWidth = g_framebuffer.aligned_width; | |
| bitmap_info.bmiHeader.biHeight = -g_framebuffer.aligned_height; | |
| bitmap_info.bmiHeader.biPlanes = 1; | |
| bitmap_info.bmiHeader.biBitCount = 32; | |
| bitmap_info.bmiHeader.biCompression = BI_RGB; | |
| StretchDIBits(hdc, | |
| 0, 0, window_width, window_height, | |
| 0, 0, g_framebuffer.visible_width, g_framebuffer.visible_height, | |
| g_framebuffer.bgra_pixels, | |
| &bitmap_info, | |
| DIB_RGB_COLORS, SRCCOPY); | |
| ReleaseDC(window_handle, hdc); | |
| } | |
| Sleep(16); | |
| } | |
| } | |
| std::thread threads[SPMD_THREAD_COUNT]; | |
| int main(int argc, char **argv) { | |
| SetProcessDPIAware(); | |
| for (int i = 0; i < SPMD_THREAD_COUNT-1; i++) { | |
| threads[i] = std::thread(run, i+1); | |
| } | |
| run(0); | |
| for (int i = 0; i < SPMD_THREAD_COUNT-1; i++) { | |
| threads[i].join(); | |
| } | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment