Skip to content

Instantly share code, notes, and snippets.

@BlobTheKat
Last active March 11, 2026 19:44
Show Gist options
  • Select an option

  • Save BlobTheKat/f8a6c4f0dcfb68d99ed031bcd5339a96 to your computer and use it in GitHub Desktop.

Select an option

Save BlobTheKat/f8a6c4f0dcfb68d99ed031bcd5339a96 to your computer and use it in GitHub Desktop.
a.h - C11 Atomics & threads complement library
/**
* Atomics & threads complement library
* Matthew Reiner, 2026
* Available under the GPL 3.0 license
* Targeting: Windows, Linux, *BSD, MacOS
* Implements the following:
* - Wait/Notify functionality for simple types up to 64 bit
* - Arbitrary condition wait
* - Thread management
* - Relax, yield & sleep
* - Getting the current time
*/
#pragma once
#include <stdint.h>
#include <stdlib.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdalign.h>
// Pool of 32 "generation counter" futex words used to emulate wait/wake for
// sizes and conditions the OS primitive can't handle directly.
// NOTE(review): `static` in a header gives every translation unit its own
// pool — waiters and wakers must live in the same TU for the pool-based
// paths to pair up; confirm this is intended for multi-TU programs.
static _Atomic uint32_t _atomic_waiter_pool[32];
// Counting lock; see lock_acquire/lock_release at the bottom of this header
typedef _Atomic uint32_t lock_t;
#if !defined(alignas) && !defined(__cplusplus)
#define alignas _Alignas
#endif
// Keyword-style conveniences
#define atomic _Atomic
#define thread_local _Thread_local
// Fixed-width atomic typedefs (C11 only mandates the int/long flavors)
typedef _Atomic(uint8_t) atomic_uint8_t;
typedef _Atomic(int8_t) atomic_int8_t;
typedef _Atomic(uint16_t) atomic_uint16_t;
typedef _Atomic(int16_t) atomic_int16_t;
typedef _Atomic(uint32_t) atomic_uint32_t;
typedef _Atomic(int32_t) atomic_int32_t;
typedef _Atomic(uint64_t) atomic_uint64_t;
typedef _Atomic(int64_t) atomic_int64_t;
// The library's clock functions (local_now/epoch_now/thread_now) return microseconds
#define SECOND_US 1000000ull
#define MILLISECOND_US 1000ull
// Scheduling classes accepted by thread_set_priority
typedef enum{
THREAD_PRIO_BACKGROUND,
THREAD_PRIO_NORMAL,
THREAD_PRIO_REALTIME
} thread_priority_t;
#if defined(_MSC_VER)
#include <intrin.h>
#endif
// thread_relax(): CPU spin-wait hint — cheap, does not yield to the scheduler
#if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
static inline void thread_relax(){
#if defined(_MSC_VER)
_mm_pause();
#else
__builtin_ia32_pause();
#endif
}
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM64) || defined(_M_ARM)
#if defined(__has_include) && __has_include(<arm_acle.h>)
#include <arm_acle.h>
#endif
static inline void thread_relax(){
#if defined(_MSC_VER)
// Fixed: MSVC has no GNU inline asm on ARM, and it does not define
// __ARM_ARCH, so the old test routed MSVC to the __asm__ branch (compile
// error). Use the intrinsic on MSVC and inline asm everywhere else.
__yield();
#else
__asm__ __volatile__("yield");
#endif
}
#elif defined(__riscv) && defined(__riscv_zihintpause)
static inline void thread_relax(){
__asm__ __volatile__("pause");
}
#else
// Fixed: added `static` — a plain `inline` definition in a header has no
// external definition under C99 inline semantics and breaks linking when
// included from multiple TUs. A no-op body is equivalent to the old "nop"
// (which had no observable effect) and avoids relying on GNU asm syntax.
static inline void thread_relax(){}
#endif
#ifndef _ATOMIC_DEFAULT_SPIN
// thread_relax() this many times while waiting on an atomic
#define _ATOMIC_DEFAULT_SPIN 48
#endif
#ifndef _ATOMIC_DEFAULT_YIELD
// thread_yield() this many times while waiting on an atomic
#define _ATOMIC_DEFAULT_YIELD 6
#endif
// After this many spin/yield, we use a thread parking loop: SYS_futex on Linux, _umtx_op on *BSD, SYS_ulock_* on MacOS, and WaitOnAddress/WakeByAddress* on Windows
// SYS_ulock_* is technically unstable on Mac, although unlikely to go away any time soon
static inline void thread_yield();
// Common spin -> yield -> park skeleton used by the _atomic_waitloop* helpers.
// Returns (from the enclosing function) as soon as *addr != val.
// `wait` is a parking statement; it must carry (or expand to something that
// carries) a `check:` label, which the trailing `goto check;` re-enters after
// every wakeup so the value is re-tested before returning to the caller.
#define _atomic_futex_loop(addr, val, wait, s, y) int count = s; \
while(count--) if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; else thread_relax(); \
count = y; while(count--) if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; else thread_yield(); \
wait; goto check;
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#undef WIN32_LEAN_AND_MEAN
#undef NOMINMAX
// Undo legacy 16-bit-era keyword macros that windows.h leaks
#undef near
#undef far
#undef pascal
#undef cdecl
// NOTE(review): re-typedefing size_t is only valid because SIZE_T shares its
// underlying type; C11 permits duplicate typedefs solely when identical — verify
typedef SSIZE_T ssize_t;
typedef SIZE_T size_t;
static inline void _atomic_wait8(void* addr, uint8_t val){ WaitOnAddress(addr, &val, 1, INFINITE); }
static inline void _atomic_wait16(_Atomic uint16_t* addr, uint16_t val){ WaitOnAddress(addr, &val, 2, INFINITE); }
static inline void _atomic_wait32(_Atomic uint32_t* addr, uint32_t val){ WaitOnAddress(addr, &val, 4, INFINITE); }
static inline void _atomic_wait64(_Atomic uint64_t* addr, uint64_t val){ WaitOnAddress(addr, &val, 8, INFINITE); }
static inline void _atomic_waitloop8(void* addr, uint8_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, check: WaitOnAddress(addr, &val, 1, INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); }
static inline void _atomic_waitloop16(_Atomic uint16_t* addr, uint16_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, check: WaitOnAddress(addr, &val, 2, INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); }
static inline void _atomic_waitloop32(_Atomic uint32_t* addr, uint32_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, check: WaitOnAddress(addr, &val, 4, INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); }
static inline void _atomic_waitloop64(_Atomic uint64_t* addr, uint64_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, check: WaitOnAddress(addr, &val, 8, INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); }
static inline void _atomic_wait_arch(void* a_, void* b_){ WaitOnAddress(a_, &b_, sizeof(void*), INFINITE); }
static inline void _atomic_waitloop_arch(void* a_, void* b_){ _Atomic uintptr_t* addr = (_Atomic uintptr_t*)a_; uintptr_t val = (uintptr_t)b_; _atomic_futex_loop(addr, val, check: WaitOnAddress(addr, &val, sizeof(uintptr_t), INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); }
#define atomic_wake(ptr, n) do{ if((int)(n)==1) WakeByAddressSingle(ptr); else WakeByAddressAll(ptr); }while(0)
#define _atomic_wake32_all(ptr) WakeByAddressAll(ptr)
static inline void _atomic_wake_condition(void* addr, int n){
_Atomic uint32_t *fut = &_atomic_waiter_pool[(((uintptr_t)addr)^((uintptr_t)addr>>5))&31];
atomic_fetch_add_explicit(fut, 1, memory_order_release);
atomic_wake(addr, n);
}
// Thread bookkeeping. `_handle` doubles as the detach/exit handoff token:
// whichever side (exiting thread vs. thread_detach) swaps it to 0 second is
// responsible for freeing the struct.
struct _thread_t{
void* (*fn)(void*);
void* arg;
_Atomic HANDLE _handle;
};
typedef struct _thread_t* thread_t;
// Logical processor count across all processor groups
static inline size_t available_concurrency(){ return GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); }
// Fixed: added `static` — a non-static _Thread_local definition in a header
// is redefined in every translation unit that includes it, producing
// duplicate-symbol link errors.
static _Thread_local thread_t _thread_self;
// Trampoline: runs the user fn, stores its result back into `arg` (read by
// thread_wait), then performs the handle/ownership handoff described above.
static inline DWORD WINAPI _thread_wrapper(void* a_){
struct _thread_t* a = _thread_self = (struct _thread_t*)a_;
a->arg = a->fn(a->arg);
HANDLE h = atomic_exchange_explicit(&a->_handle, 0, memory_order_release);
if(h) CloseHandle(h);
else free(a);
return 0;
}
static inline thread_t thread_create(void* (*fn)(void*), void* arg, size_t stack){
HANDLE h;
struct _thread_t* t = malloc(sizeof(struct _thread_t));
t->fn = fn;
t->arg = arg;
atomic_thread_fence(memory_order_release);
t->_handle = CreateThread(NULL, stack, _thread_wrapper, t, 0, NULL);
if(t->_handle == INVALID_HANDLE_VALUE){ free(t); return 0; }
return t;
}
// Join `t`: wait for completion, then return the value its fn returned.
// NOTE(review): if the thread exits between the load below and
// WaitForSingleObject, _thread_wrapper closes the handle concurrently and
// this function then waits on / double-closes a stale HANDLE — confirm the
// intended protocol (e.g. the wrapper could leave closing to the joiner).
static inline void* thread_wait(thread_t t){
HANDLE h = atomic_load_explicit(&t->_handle, memory_order_acquire);
if(h){
WaitForSingleObject(h, INFINITE);
CloseHandle(h);
}
void* ret = t->arg;
free(t);
return ret;
}
// Detach: whichever of this call and thread exit zeroes _handle second frees
// the struct (see _thread_wrapper)
static inline void thread_detach(thread_t t){
HANDLE h = atomic_exchange_explicit(&t->_handle, 0, memory_order_relaxed);
if(h){ CloseHandle(h); }
else free(t);
}
// Only meaningful inside threads started via thread_create (set by _thread_wrapper)
static inline thread_t thread_self(){ return _thread_self; }
static inline void thread_yield(){ SwitchToThread(); }
#define _SLEEP_MAX (uint64_t)(INFINITE-1)
static inline void thread_sleep(uint64_t nanoseconds){
nanoseconds = (nanoseconds+999999) / 1000000;
// nanoseconds becomes milliseconds
while(nanoseconds > _SLEEP_MAX){
nanoseconds -= _SLEEP_MAX;
Sleep(_SLEEP_MAX);
}
Sleep((DWORD) nanoseconds);
}
// Map the library's priority class onto the Win32 thread priority levels for
// the calling thread; returns true on success
static inline bool thread_set_priority(thread_priority_t p){
int level;
switch(p){
case THREAD_PRIO_BACKGROUND: level = THREAD_PRIORITY_LOWEST; break;
case THREAD_PRIO_REALTIME: level = THREAD_PRIORITY_TIME_CRITICAL; break;
default: level = THREAD_PRIORITY_NORMAL; break;
}
return SetThreadPriority(GetCurrentThread(), level);
}
static inline uint64_t local_now(){
static double pfreq = -1;
LARGE_INTEGER counter;
QueryPerformanceCounter(&counter);
if(pfreq < 0){
LARGE_INTEGER f;
QueryPerformanceFrequency(&f);
pfreq = (double)SECOND_NS / (double)f.QuadPart;
}
return (uint64_t)(counter.QuadPart * pfreq);
}
// Wall-clock time in microseconds since the Unix epoch (1970-01-01)
static inline uint64_t epoch_now(){
FILETIME ft;
GetSystemTimePreciseAsFileTime(&ft);
// FILETIME counts 100ns ticks since 1601-01-01; convert to us, rebase to 1970
uint64_t ticks = ((uint64_t)ft.dwHighDateTime << 32) | ft.dwLowDateTime;
return ticks/10 - 11644473600000000LL;
}
// CPU time (kernel + user) consumed by the calling thread, in microseconds.
// Returns 0 if the query fails.
static inline uint64_t thread_now(){
FILETIME creation, exited, kernel, user;
// Fixed: the creation/exit out-parameters of GetThreadTimes are not
// optional per its contract; passing NULL for them is invalid
if(!GetThreadTimes(GetCurrentThread(), &creation, &exited, &kernel, &user)) return 0;
uint64_t k =
((uint64_t)kernel.dwHighDateTime << 32) |
kernel.dwLowDateTime;
uint64_t u =
((uint64_t)user.dwHighDateTime << 32) |
user.dwLowDateTime;
// 100ns ticks -> microseconds
return (k+u)/10;
}
#else
#include <sched.h>
#include <unistd.h>
#ifdef __linux__
#include <linux/futex.h>
#elif !defined(__APPLE__)
#include <sys/umtx.h>
#endif
#include <sys/syscall.h>
#include <limits.h>
#include <pthread.h>
#include <errno.h>
#include <time.h>
#ifdef __linux__
// Note on _atomic_futex_small:
// Every 8 bit value is inside some aligned 32 bit value that doesn't cross a page boundary, same for 16 bit
// C standard might say loading this 32 bit value is UB and to that I say BITE ME
// We use volatile here to break the compiler's assumptions about memory
// This does not "fix" the UB, but does make the UB mostly non-actionable to the compiler in practice
// If you sacrifice performance for "idiomatic correctness" you are a sucker and PRs as such will not be accepted
// We use bitset to guarantee that wake ops will wakes the correct thread, even if multiple threads are waiting
// on the same 32 bit word but different 8/16 bit portions of that word
// _atomic_futex32 is just normal futexes
// _atomic_futex64 has to be emulated via a waiter pool: each "waiter" tracks the wake "generation" to avoid lost wakes between the check and the actual syscall
// (this only breaks if there have been exactly 2^32 wakes between the check and the syscall which is virtually impossible in practice)
// _atomic_futex64 also makes use of bitset to reduce unnecessary wakeups (from 1 in 32 to 1 in 1024)
// The exact same algorithm used by _atomic_futex64 can be used to implement wait for arbitrary sizes or conditions
// Wake up to n waiters of an 8/16-bit slot: wake on the containing aligned
// 32-bit word, with a bitset keyed by the byte offset so only waiters on the
// same sub-word portion are woken
#define _atomic_futex_wake_small(addr, n) int off = ((uintptr_t)addr)&3; \
syscall(SYS_futex, (char*)addr-off, FUTEX_WAKE_BITSET_PRIVATE, n, 0, 0, 1<<(off<<3))
// Plain 32-bit futex wake
#define _atomic_futex_wake32(addr, n) syscall(SYS_futex, addr, FUTEX_WAKE_PRIVATE, n, 0, 0)
// 64-bit wake: bump the hashed pool slot's generation counter, then wake the
// slot's waiters whose bitset bit matches this address
#define _atomic_futex_wake64(addr, n) _Atomic uint32_t *fut = &_atomic_waiter_pool[((uintptr_t)addr>>3)&31]; \
atomic_fetch_add_explicit(fut, 1, memory_order_release); \
syscall(SYS_futex, fut, FUTEX_WAKE_BITSET_PRIVATE, n, 0, 0, 1u<<(((uintptr_t)addr>>8)&31));
// Wait while the 8/16-bit operand equals val; `m` masks the bits of the
// containing 32-bit word that do NOT belong to this operand
#define _atomic_futex_small(addr, val, m) int off = ((uintptr_t)addr)&3; \
addr -= off; off <<= 3; \
check: {\
uint32_t v = atomic_load_explicit((volatile _Atomic uint32_t*) addr, memory_order_relaxed); \
uint32_t v2 = v&m | val<<off; \
if(v != v2) return; \
syscall(SYS_futex, addr, FUTEX_WAIT_BITSET_PRIVATE, v, 0, 0, 1<<off); }
// Plain 32-bit futex wait
#define _atomic_futex32(addr, val) syscall(SYS_futex, addr, FUTEX_WAIT_PRIVATE, val, 0, 0)
// Emulated 64-bit wait: park on the pool slot's generation counter (see notes above)
#define _atomic_futex64(addr, val) _Atomic uint32_t *fut = &_atomic_waiter_pool[((uintptr_t)addr>>3)&31]; \
uint32_t tok = atomic_load_explicit(fut, memory_order_acquire); \
if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; \
uint32_t m = 1u<<(((uintptr_t)addr>>8)&31); \
check: {\
syscall(SYS_futex, fut, FUTEX_WAIT_BITSET_PRIVATE, tok, 0, 0, m); \
if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; \
uint32_t tok2 = tok; if(tok2 == (tok=atomic_load_explicit(fut, memory_order_relaxed))) return; \
syscall(SYS_futex, fut, FUTEX_WAKE_BITSET_PRIVATE, INT_MAX, 0, 0, m); }
#else
// MacOS __ulock and BSD _umtx are pretty similar
// Generic 32 bit futex, because the rest is so similar
#if defined(UMTX_OP_WAIT)
#define _atomic_futex32(addr, val) _umtx_op(addr, UMTX_OP_WAIT_UINT_PRIVATE, val, 0, 0)
#define _atomic_futex_wake32(addr, n) _umtx_op(addr, UMTX_OP_WAKE_PRIVATE, n, 0, 0)
#elif defined(__APPLE__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
// NOTE(review): 0x1000001 is presumably UL_COMPARE_AND_WAIT | ULF_NO_ERRNO —
// verify against the xnu sys/ulock.h headers (this API is unstable)
#define _atomic_futex32(addr, val) syscall(SYS_ulock_wait, 0x1000001, addr, val)
#define _atomic_futex_wake32(addr, n) syscall(SYS_ulock_wake, 0x1000001, addr, n)
#endif
// Read the notes on the linux implementation
// Main difference: we don't have bitset, so _atomic_futex_small needs the same "wake the others" condition as _atomic_futex64
// Additionally, _atomic_futex64 can't use the mask for additional waiter pool separation, so we compensate by using a better hash instead
#define _atomic_futex_wake_small(addr, n) int off = ((uintptr_t)addr)&3; \
_atomic_futex_wake32((char*)addr-off, n)
// Wait while the 8/16-bit operand equals val; `m` masks the bits of the
// containing word that do NOT belong to this operand.
// NOTE(review): the v3 conditions below return on changes in other portions
// (a spurious return, which callers tolerate) and re-wake when the wake may
// have been meant for a different portion — confirm `v3&m` / `v3^~m` encode
// the intended protocol.
#define _atomic_futex_small(addr, val, m) int off = ((uintptr_t)addr)&3; \
volatile _Atomic uint32_t* addr2 = (volatile _Atomic uint32_t*)((char*)addr-off); off <<= 3; \
check: {\
uint32_t v = atomic_load_explicit(addr2, memory_order_relaxed); \
uint32_t v2 = v&m | val<<off; \
if(v != v2) return; \
_atomic_futex32(addr2, v); \
uint32_t v3 = atomic_load_explicit(addr2, memory_order_relaxed)^v; \
if(v3&m) return; \
if(v3^~m) _atomic_futex_wake32(addr2, INT_MAX);}
// On LP64 BSD, _umtx_op can wait on a full 64-bit word natively
#if ULONG_MAX == UINT64_MAX && defined(UMTX_OP_WAIT)
#define _atomic_futex_wake64(addr, n) _umtx_op(addr, UMTX_OP_WAKE_PRIVATE, n, 0, 0)
#define _atomic_futex64(addr, val) _umtx_op(addr, UMTX_OP_WAIT_PRIVATE, val, 0, 0)
#else
// Emulated 64-bit wait/wake through the hashed waiter pool (see linux notes)
#define _atomic_futex_wake64(addr, n) _Atomic uint32_t *fut = &_atomic_waiter_pool[(((uintptr_t)addr>>3)^((uintptr_t)addr>>8))&31]; \
atomic_fetch_add_explicit(fut, 1, memory_order_release); \
_atomic_futex_wake32(fut, n)
#define _atomic_futex64(addr, val) _Atomic uint32_t *fut = &_atomic_waiter_pool[(((uintptr_t)addr>>3)^((uintptr_t)addr>>8))&31]; \
uint32_t tok = atomic_load_explicit(fut, memory_order_acquire); \
if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; \
check: {\
_atomic_futex32(fut, tok); \
if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; \
uint32_t tok2 = tok; if(tok2 == (tok=atomic_load_explicit(fut, memory_order_relaxed))) return; \
_atomic_futex_wake32(fut, INT_MAX); }
#endif
#endif
// Sized wait entry points: park while *addr == val. The `off` identifier in
// the mask arguments is declared by the _atomic_futex_small expansion (the
// mask is evaluated after `off <<= 3`).
static inline void _atomic_wait8(void* addr, uint8_t val){ _atomic_futex_small(addr, val, ~(255<<off)) }
static inline void _atomic_waitloop8(void* addr, uint8_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, _atomic_futex_small(addr, val, ~(255<<off)), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) }
static inline void _atomic_wait16(_Atomic uint16_t* addr, uint16_t val){ _atomic_futex_small(addr, val, (0xFFFF0000>>off)) }
static inline void _atomic_waitloop16(_Atomic uint16_t* addr, uint16_t val){ _atomic_futex_loop(addr, val, _atomic_futex_small(addr, val, (0xFFFF0000>>off)), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) }
static inline void _atomic_wait32(_Atomic uint32_t* addr, uint32_t val){ _atomic_futex32(addr, val); }
// _atomic_futex32 carries no internal `check:` label, so the loop variant
// supplies one for _atomic_futex_loop's trailing `goto check;`
static inline void _atomic_waitloop32(_Atomic uint32_t* addr, uint32_t val){ _atomic_futex_loop(addr, val, check: _atomic_futex32(addr, val), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) }
static inline void _atomic_wait64(_Atomic uint64_t* addr, uint64_t val){ _atomic_futex64(addr, val) }
static inline void _atomic_waitloop64(_Atomic uint64_t* addr, uint64_t val){ _atomic_futex_loop(addr, val, _atomic_futex64(addr, val), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) }
// Wake up to n waiters parked on addr
static inline void _atomic_wake8_16(void* addr, int n){ _atomic_futex_wake_small(addr, n); }
static inline void _atomic_wake32(void* addr, int n){ _atomic_futex_wake32(addr, n); }
static inline void _atomic_wake64(void* addr, int n){ _atomic_futex_wake64(addr, n); }
// Pointer-sized fallback used by the _Generic dispatch below
#if UINTPTR_MAX == UINT64_MAX
static inline void _atomic_wait_arch(void* a_, void* b_){ _Atomic uint64_t* addr = (_Atomic uint64_t*)a_; uint64_t val = (uint64_t)b_; _atomic_futex64(addr, val) }
static inline void _atomic_waitloop_arch(void* a_, void* b_){ _Atomic uint64_t* addr = (_Atomic uint64_t*)a_; uint64_t val = (uint64_t)b_; _atomic_futex_loop(addr, val, _atomic_futex64(addr, val), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) }
#define _atomic_wake_arch _atomic_wake64
#else
static inline void _atomic_wait_arch(void* a_, void* b_){ _atomic_futex32(((_Atomic uint32_t*)a_), ((uint32_t)b_)) }
static inline void _atomic_waitloop_arch(void* a_, void* b_){ _Atomic uint32_t* addr = a_; uint32_t val = (uint32_t)b_; _atomic_futex_loop(addr, val, _atomic_futex32(addr, val), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) }
#define _atomic_wake_arch _atomic_wake32
#endif
// Wake up to n waiters of atomic_wait/atomic_wait_loop on ptr (type-dispatched)
#define atomic_wake(ptr, n) _Generic((ptr), \
_Atomic(_Bool)*: _atomic_wake8_16, _Atomic(uint8_t)*: _atomic_wake8_16, _Atomic(int8_t)*: _atomic_wake8_16, \
_Atomic(uint16_t)*: _atomic_wake8_16, _Atomic(int16_t)*: _atomic_wake8_16, \
_Atomic(uint32_t)*: _atomic_wake32, _Atomic(int32_t)*: _atomic_wake32, \
_Atomic(uint64_t)*: _atomic_wake64, _Atomic(int64_t)*: _atomic_wake64, \
default: _atomic_wake_arch \
)(ptr, n)
#define _atomic_wake32_all(ptr) _atomic_wake32(ptr, INT_MAX)
// Wake threads parked by atomic_wait_until(addr, ...): bump the generation
// counter of addr's hashed waiter-pool slot, then wake that slot
static inline void _atomic_wake_condition(void* addr, int n){
_Atomic uint32_t *fut = &_atomic_waiter_pool[(((uintptr_t)addr)^((uintptr_t)addr>>5))&31];
atomic_fetch_add_explicit(fut, 1, memory_order_release);
_atomic_futex_wake32(fut, n);
}
typedef pthread_t thread_t;
// Number of processors currently online
static inline size_t available_concurrency(){ return sysconf(_SC_NPROCESSORS_ONLN); }
static inline thread_t thread_create(void* (*fn)(void*), void* arg, size_t stack){
// This implementation assumes that (pthread_t)0 is never used. This is usually true in practice
thread_t t = {0};
if(!stack){ pthread_create(&t, 0, fn, arg); return t; }
pthread_attr_t a;
pthread_attr_init(&a);
pthread_attr_setstacksize(&a, stack);
pthread_create(&t, &a, fn, arg);
pthread_attr_destroy(&a);
return t;
}
// Detach: the thread's resources are reclaimed automatically when it exits
static inline void thread_detach(thread_t t){ pthread_detach(t); }
// Join `t` and return the value its entry function returned
static inline void* thread_wait(thread_t t){ void* res; pthread_join(t, &res); return res; }
static inline thread_t thread_self(){ return pthread_self(); }
static inline void thread_yield(){ sched_yield(); }
#ifdef __linux__
// RAW variant is immune to NTP slewing on Linux
#define _A_CLOCK CLOCK_MONOTONIC_RAW
#else
#define _A_CLOCK CLOCK_MONOTONIC
#endif
// Sleep for at least `nanoseconds`, resuming the remainder after signal interruptions
static inline void thread_sleep(uint64_t nanoseconds){
struct timespec ts;
ts.tv_sec = nanoseconds / 1000000000;
ts.tv_nsec = nanoseconds - ts.tv_sec*1000000000;
#ifndef __APPLE__
// Fixed: clock_nanosleep returns the error number directly and does NOT set
// errno, so the old `&& errno == EINTR` test read an indeterminate errno and
// could either spin forever or give up early
while(clock_nanosleep(_A_CLOCK, 0, &ts, &ts) == EINTR);
#else
// nanosleep does set errno, so the errno check is correct here
while(nanosleep(&ts, &ts) && errno == EINTR);
#endif
}
static inline uint64_t local_now(){
struct timespec ts;
clock_gettime(_A_CLOCK, &ts);
return (uint64_t)(ts.tv_nsec/1000) + SECOND_US*(uint64_t)ts.tv_sec;
}
static inline uint64_t epoch_now(){
struct timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
return (uint64_t)(ts.tv_nsec/1000) + SECOND_US*(uint64_t)ts.tv_sec;
}
// Map the library's priority class onto a scheduling policy for the calling
// thread; returns true on success (SCHED_RR typically needs privileges)
static inline bool thread_set_priority(thread_priority_t p){
int policy;
if(p == THREAD_PRIO_REALTIME) policy = SCHED_RR;
#ifdef SCHED_IDLE
else if(p == THREAD_PRIO_BACKGROUND) policy = SCHED_IDLE;
#endif
else policy = SCHED_OTHER;
struct sched_param param = {0};
// Fixed: SCHED_RR requires sched_priority >= sched_get_priority_min(SCHED_RR)
// (1 on Linux); a zero priority makes pthread_setschedparam fail with EINVAL
if(policy == SCHED_RR) param.sched_priority = sched_get_priority_min(SCHED_RR);
return !pthread_setschedparam(pthread_self(), policy, &param);
}
static inline uint64_t thread_now(){
struct timespec ts;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
return (uint64_t)(ts.tv_nsec/1000) + SECOND_US*(uint64_t)ts.tv_sec;
}
#endif
// Park the calling thread while *ptr == val (type-dispatched on the atomic's
// width). May return spuriously; callers should re-check in a loop.
#define atomic_wait(ptr, val) _Generic((ptr), \
_Atomic(_Bool)*: _atomic_wait8, _Atomic(uint8_t)*: _atomic_wait8, _Atomic(int8_t)*: _atomic_wait8, \
_Atomic(uint16_t)*: _atomic_wait16, _Atomic(int16_t)*: _atomic_wait16, \
_Atomic(uint32_t)*: _atomic_wait32, _Atomic(int32_t)*: _atomic_wait32, \
_Atomic(uint64_t)*: _atomic_wait64, _Atomic(int64_t)*: _atomic_wait64, \
default: _atomic_wait_arch \
)(ptr, val)
// Like atomic_wait but spins/yields before parking (type-dispatched).
// Fixed: the default case dispatched to _atomic_wait_arch, which parks
// immediately with no spin/yield phase; the pointer-sized loop variant
// _atomic_waitloop_arch exists for exactly this slot.
#define atomic_wait_loop(ptr, val) _Generic((ptr), \
_Atomic(_Bool)*: _atomic_waitloop8, _Atomic(uint8_t)*: _atomic_waitloop8, _Atomic(int8_t)*: _atomic_waitloop8, \
_Atomic(uint16_t)*: _atomic_waitloop16, _Atomic(int16_t)*: _atomic_waitloop16, \
_Atomic(uint32_t)*: _atomic_waitloop32, _Atomic(int32_t)*: _atomic_waitloop32, \
_Atomic(uint64_t)*: _atomic_waitloop64, _Atomic(int64_t)*: _atomic_waitloop64, \
default: _atomic_waitloop_arch \
)(ptr, val)
// Wait until `cond` evaluates true, keyed by the pointer value `key`:
// spin s times, yield y times, then park on the generation counter of the
// hashed waiter-pool slot for `key`. atomic_wake_condition(key, n) bumps the
// counter and wakes the slot. Because slots are shared (hash of key), wakes
// for other keys cause spurious re-checks; a changed token after waking also
// re-wakes the slot so co-hashed waiters aren't lost.
// NOTE(review): wakers must pass the exact same key value — the slot is
// chosen by (k ^ k>>5) & 31 on both sides.
#define _atomic_wait_until(key, cond, s, y) do{ \
int _atomic_count = s; \
while(_atomic_count--) if(cond) break; else thread_relax(); \
_atomic_count = y; while(_atomic_count--) if(cond) break; else thread_yield(); \
void* _atomic_addr = (void*)(key); \
_Atomic uint32_t *_atomic_fut = &_atomic_waiter_pool[(((uintptr_t)_atomic_addr)^((uintptr_t)_atomic_addr>>5))&31]; \
uint32_t _atomic_tok = atomic_load_explicit(_atomic_fut, memory_order_acquire); \
if(cond) break; \
check: { \
_atomic_wait32(_atomic_fut, _atomic_tok); \
if(cond) break; \
uint32_t _atomic_tok2 = _atomic_tok; if(_atomic_tok2 != (_atomic_tok=atomic_load_explicit(_atomic_fut, memory_order_relaxed))) _atomic_wake32(_atomic_fut, INT_MAX); } \
goto check; }while(0)
// Public wrappers with the default spin/yield budgets
#define atomic_wait_until(key, cond) _atomic_wait_until(key, cond, _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD)
#define atomic_wake_condition(key, n) _atomic_wake_condition((void*)(key), n)
// Acquire `n` units from a counting lock (semaphore-like).
// Bit 31 of *lock appears to act as a "waiters/contended" flag and bits 0..30
// as the available unit count. Blocks (spin -> yield -> park on the lock word)
// until the request is satisfied.
// NOTE(review): on the contended path, available units are drained and `n` is
// rebased to the remaining deficit (-v2) before parking — verify the intended
// fairness/starvation behavior; also note the successful-acquire CAS sets bit
// 31 (`v2|0x80000000`), which keeps lock_release on its wake path.
static inline void lock_acquire(lock_t* lock, int n){
loop0: {}
uint32_t v = atomic_load_explicit(lock, memory_order_relaxed);
loop: {}
int32_t v2 = (v&0x7FFFFFFF)-n;
if(v2<0){
// Not enough units available: try to take what's there and mark contended
if(atomic_compare_exchange_weak_explicit(lock, &v, 0x80000000, memory_order_relaxed, memory_order_relaxed)){
n = -v2;
// block
int count = _ATOMIC_DEFAULT_SPIN;
while(count--) if((v = atomic_load_explicit(lock, memory_order_relaxed)) & 0x7FFFFFFF) goto loop; else thread_relax();
count = _ATOMIC_DEFAULT_YIELD;
while(count--) if((v = atomic_load_explicit(lock, memory_order_relaxed)) & 0x7FFFFFFF) goto loop; else thread_yield();
// Re-assert the flag before parking so releasers know to wake someone
if(!(v&0x80000000)) atomic_fetch_or_explicit(lock, 0x80000000, memory_order_relaxed), v |= 0x80000000;
_atomic_wait32(lock, v);
goto loop0;
}
}else if(atomic_compare_exchange_weak_explicit(lock, &v, v2|0x80000000, memory_order_acquire, memory_order_relaxed)) return; // acquired
goto loop;
}
// Return `n` units to the lock; if the contended flag (bit 31) was set,
// clear it and wake parked acquirers.
static inline void lock_release(lock_t* lock, int n){
uint32_t wk = atomic_fetch_add_explicit(lock, n, memory_order_release);
if(wk&0x80000000){
atomic_fetch_and_explicit(lock, 0x7FFFFFFF, memory_order_relaxed);
// We need to wake at least this many waiters. By waking one more waiter we guarantee that the waiting flag is set again if there are indeed more waiters
wk &= 0x7FFFFFFF;
_atomic_wake32(lock, wk+(wk<0x7FFFFFFF));
}
}
// Maximum number of units a lock_t can hold (bits 0..30)
#define LOCK_MAX 2147483647
#ifdef __APPLE__
// Matches the push guarding the deprecated SYS_ulock_* syscall() uses above
#pragma clang diagnostic pop
#endif
typedef _Atomic(ssize_t) atomic_ssize_t;
// L1 cache line is 64 bytes almost everywhere
#ifndef CACHE_LINE
#define CACHE_LINE 64
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment