Skip to content

Instantly share code, notes, and snippets.

@BlobTheKat
Last active March 11, 2026 19:44
Show Gist options
  • Select an option

  • Save BlobTheKat/f8a6c4f0dcfb68d99ed031bcd5339a96 to your computer and use it in GitHub Desktop.

Select an option

Save BlobTheKat/f8a6c4f0dcfb68d99ed031bcd5339a96 to your computer and use it in GitHub Desktop.
a.h - C11 Atomics & threads complement library
/**
* Atomics & threads complement library
* Matthew Reiner, 2026
* Available under the GPL 3.0 license
* Targeting: Windows, Linux, *BSD, MacOS
* Implements the following:
* - Wait/Notify functionality for simple types up to 64 bit
* - Arbitrary condition wait
* - Thread management
* - Relax, yield & sleep
* - Getting the current time
*/
#pragma once
#include <stdint.h>
#include <stdlib.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdalign.h>
// Pool of 32 "generation counter" futex words used to emulate wait/wake for
// sizes and conditions the OS primitive can't handle directly.
// NOTE(review): `static` in a header gives every translation unit its own
// pool — waiters and wakers must live in the same TU for the pool-based
// paths to pair up; confirm this is intended for multi-TU programs.
static _Atomic uint32_t _atomic_waiter_pool[32];
// Counting lock; see lock_acquire/lock_release at the bottom of this header
typedef _Atomic uint32_t lock_t;
#if !defined(alignas) && !defined(__cplusplus)
#define alignas _Alignas
#endif
// Keyword-style conveniences
#define atomic _Atomic
#define thread_local _Thread_local
// Fixed-width atomic typedefs (C11 only mandates the int/long flavors)
typedef _Atomic(uint8_t) atomic_uint8_t;
typedef _Atomic(int8_t) atomic_int8_t;
typedef _Atomic(uint16_t) atomic_uint16_t;
typedef _Atomic(int16_t) atomic_int16_t;
typedef _Atomic(uint32_t) atomic_uint32_t;
typedef _Atomic(int32_t) atomic_int32_t;
typedef _Atomic(uint64_t) atomic_uint64_t;
typedef _Atomic(int64_t) atomic_int64_t;
// The library's clock functions (local_now/epoch_now/thread_now) return microseconds
#define SECOND_US 1000000ull
#define MILLISECOND_US 1000ull
// Scheduling classes accepted by thread_set_priority
typedef enum{
THREAD_PRIO_BACKGROUND,
THREAD_PRIO_NORMAL,
THREAD_PRIO_REALTIME
} thread_priority_t;
#if defined(_MSC_VER)
#include <intrin.h>
#endif
// thread_relax(): CPU spin-wait hint — cheap, does not yield to the scheduler
#if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
static inline void thread_relax(){
#if defined(_MSC_VER)
_mm_pause();
#else
__builtin_ia32_pause();
#endif
}
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM64) || defined(_M_ARM)
#if defined(__has_include) && __has_include(<arm_acle.h>)
#include <arm_acle.h>
#endif
static inline void thread_relax(){
#if defined(_MSC_VER)
// Fixed: MSVC has no GNU inline asm on ARM, and it does not define
// __ARM_ARCH, so the old test routed MSVC to the __asm__ branch (compile
// error). Use the intrinsic on MSVC and inline asm everywhere else.
__yield();
#else
__asm__ __volatile__("yield");
#endif
}
#elif defined(__riscv) && defined(__riscv_zihintpause)
static inline void thread_relax(){
__asm__ __volatile__("pause");
}
#else
// Fixed: added `static` — a plain `inline` definition in a header has no
// external definition under C99 inline semantics and breaks linking when
// included from multiple TUs. A no-op body is equivalent to the old "nop"
// (which had no observable effect) and avoids relying on GNU asm syntax.
static inline void thread_relax(){}
#endif
#ifndef _ATOMIC_DEFAULT_SPIN
// thread_relax() this many times while waiting on an atomic
#define _ATOMIC_DEFAULT_SPIN 48
#endif
#ifndef _ATOMIC_DEFAULT_YIELD
// thread_yield() this many times while waiting on an atomic
#define _ATOMIC_DEFAULT_YIELD 6
#endif
// After this many spin/yield, we use a thread parking loop: SYS_futex on Linux, _umtx_op on *BSD, SYS_ulock_* on MacOS, and WaitOnAddress/WakeByAddress* on Windows
// SYS_ulock_* is technically unstable on Mac, although unlikely to go away any time soon
static inline void thread_yield();
// Common spin -> yield -> park skeleton used by the _atomic_waitloop* helpers.
// Returns (from the enclosing function) as soon as *addr != val.
// `wait` is a parking statement; it must carry (or expand to something that
// carries) a `check:` label, which the trailing `goto check;` re-enters after
// every wakeup so the value is re-tested before returning to the caller.
#define _atomic_futex_loop(addr, val, wait, s, y) int count = s; \
while(count--) if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; else thread_relax(); \
count = y; while(count--) if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; else thread_yield(); \
wait; goto check;
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#undef WIN32_LEAN_AND_MEAN
#undef NOMINMAX
// Undo legacy 16-bit-era keyword macros that windows.h leaks
#undef near
#undef far
#undef pascal
#undef cdecl
// NOTE(review): re-typedefing size_t is only valid because SIZE_T shares its
// underlying type; C11 permits duplicate typedefs solely when identical — verify
typedef SSIZE_T ssize_t;
typedef SIZE_T size_t;
static inline void _atomic_wait8(void* addr, uint8_t val){ WaitOnAddress(addr, &val, 1, INFINITE); }
static inline void _atomic_wait16(_Atomic uint16_t* addr, uint16_t val){ WaitOnAddress(addr, &val, 2, INFINITE); }
static inline void _atomic_wait32(_Atomic uint32_t* addr, uint32_t val){ WaitOnAddress(addr, &val, 4, INFINITE); }
static inline void _atomic_wait64(_Atomic uint64_t* addr, uint64_t val){ WaitOnAddress(addr, &val, 8, INFINITE); }
static inline void _atomic_waitloop8(void* addr, uint8_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, check: WaitOnAddress(addr, &val, 1, INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); }
static inline void _atomic_waitloop16(_Atomic uint16_t* addr, uint16_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, check: WaitOnAddress(addr, &val, 2, INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); }
static inline void _atomic_waitloop32(_Atomic uint32_t* addr, uint32_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, check: WaitOnAddress(addr, &val, 4, INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); }
static inline void _atomic_waitloop64(_Atomic uint64_t* addr, uint64_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, check: WaitOnAddress(addr, &val, 8, INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); }
static inline void _atomic_wait_arch(void* a_, void* b_){ WaitOnAddress(a_, &b_, sizeof(void*), INFINITE); }
static inline void _atomic_waitloop_arch(void* a_, void* b_){ _Atomic uintptr_t* addr = (_Atomic uintptr_t*)a_; uintptr_t val = (uintptr_t)b_; _atomic_futex_loop(addr, val, check: WaitOnAddress(addr, &val, sizeof(uintptr_t), INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); }
#define atomic_wake(ptr, n) do{ if((int)(n)==1) WakeByAddressSingle(ptr); else WakeByAddressAll(ptr); }while(0)
#define _atomic_wake32_all(ptr) WakeByAddressAll(ptr)
static inline void _atomic_wake_condition(void* addr, int n){
_Atomic uint32_t *fut = &_atomic_waiter_pool[(((uintptr_t)addr)^((uintptr_t)addr>>5))&31];
atomic_fetch_add_explicit(fut, 1, memory_order_release);
atomic_wake(addr, n);
}
// Thread bookkeeping. `_handle` doubles as the detach/exit handoff token:
// whichever side (exiting thread vs. thread_detach) swaps it to 0 second is
// responsible for freeing the struct.
struct _thread_t{
void* (*fn)(void*);
void* arg;
_Atomic HANDLE _handle;
};
typedef struct _thread_t* thread_t;
// Logical processor count across all processor groups
static inline size_t available_concurrency(){ return GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); }
// Fixed: added `static` — a non-static _Thread_local definition in a header
// is redefined in every translation unit that includes it, producing
// duplicate-symbol link errors.
static _Thread_local thread_t _thread_self;
// Trampoline: runs the user fn, stores its result back into `arg` (read by
// thread_wait), then performs the handle/ownership handoff described above.
static inline DWORD WINAPI _thread_wrapper(void* a_){
struct _thread_t* a = _thread_self = (struct _thread_t*)a_;
a->arg = a->fn(a->arg);
HANDLE h = atomic_exchange_explicit(&a->_handle, 0, memory_order_release);
if(h) CloseHandle(h);
else free(a);
return 0;
}
static inline thread_t thread_create(void* (*fn)(void*), void* arg, size_t stack){
HANDLE h;
struct _thread_t* t = malloc(sizeof(struct _thread_t));
t->fn = fn;
t->arg = arg;
atomic_thread_fence(memory_order_release);
t->_handle = CreateThread(NULL, stack, _thread_wrapper, t, 0, NULL);
if(t->_handle == INVALID_HANDLE_VALUE){ free(t); return 0; }
return t;
}
// Join `t`: wait for completion, then return the value its fn returned.
// NOTE(review): if the thread exits between the load below and
// WaitForSingleObject, _thread_wrapper closes the handle concurrently and
// this function then waits on / double-closes a stale HANDLE — confirm the
// intended protocol (e.g. the wrapper could leave closing to the joiner).
static inline void* thread_wait(thread_t t){
HANDLE h = atomic_load_explicit(&t->_handle, memory_order_acquire);
if(h){
WaitForSingleObject(h, INFINITE);
CloseHandle(h);
}
void* ret = t->arg;
free(t);
return ret;
}
// Detach: whichever of this call and thread exit zeroes _handle second frees
// the struct (see _thread_wrapper)
static inline void thread_detach(thread_t t){
HANDLE h = atomic_exchange_explicit(&t->_handle, 0, memory_order_relaxed);
if(h){ CloseHandle(h); }
else free(t);
}
// Only meaningful inside threads started via thread_create (set by _thread_wrapper)
static inline thread_t thread_self(){ return _thread_self; }
static inline void thread_yield(){ SwitchToThread(); }
#define _SLEEP_MAX (uint64_t)(INFINITE-1)
static inline void thread_sleep(uint64_t nanoseconds){
nanoseconds = (nanoseconds+999999) / 1000000;
// nanoseconds becomes milliseconds
while(nanoseconds > _SLEEP_MAX){
nanoseconds -= _SLEEP_MAX;
Sleep(_SLEEP_MAX);
}
Sleep((DWORD) nanoseconds);
}
// Map the library's priority class onto the Win32 thread priority levels for
// the calling thread; returns true on success
static inline bool thread_set_priority(thread_priority_t p){
int level;
switch(p){
case THREAD_PRIO_BACKGROUND: level = THREAD_PRIORITY_LOWEST; break;
case THREAD_PRIO_REALTIME: level = THREAD_PRIORITY_TIME_CRITICAL; break;
default: level = THREAD_PRIORITY_NORMAL; break;
}
return SetThreadPriority(GetCurrentThread(), level);
}
static inline uint64_t local_now(){
static double pfreq = -1;
LARGE_INTEGER counter;
QueryPerformanceCounter(&counter);
if(pfreq < 0){
LARGE_INTEGER f;
QueryPerformanceFrequency(&f);
pfreq = (double)SECOND_NS / (double)f.QuadPart;
}
return (uint64_t)(counter.QuadPart * pfreq);
}
// Wall-clock time in microseconds since the Unix epoch (1970-01-01)
static inline uint64_t epoch_now(){
FILETIME ft;
GetSystemTimePreciseAsFileTime(&ft);
// FILETIME counts 100ns ticks since 1601-01-01; convert to us, rebase to 1970
uint64_t ticks = ((uint64_t)ft.dwHighDateTime << 32) | ft.dwLowDateTime;
return ticks/10 - 11644473600000000LL;
}
// CPU time (kernel + user) consumed by the calling thread, in microseconds.
// Returns 0 if the query fails.
static inline uint64_t thread_now(){
FILETIME creation, exited, kernel, user;
// Fixed: the creation/exit out-parameters of GetThreadTimes are not
// optional per its contract; passing NULL for them is invalid
if(!GetThreadTimes(GetCurrentThread(), &creation, &exited, &kernel, &user)) return 0;
uint64_t k =
((uint64_t)kernel.dwHighDateTime << 32) |
kernel.dwLowDateTime;
uint64_t u =
((uint64_t)user.dwHighDateTime << 32) |
user.dwLowDateTime;
// 100ns ticks -> microseconds
return (k+u)/10;
}
#else
#include <sched.h>
#include <unistd.h>
#ifdef __linux__
#include <linux/futex.h>
#elif !defined(__APPLE__)
#include <sys/umtx.h>
#endif
#include <sys/syscall.h>
#include <limits.h>
#include <pthread.h>
#include <errno.h>
#include <time.h>
#ifdef __linux__
// Note on _atomic_futex_small:
// Every 8 bit value is inside some aligned 32 bit value that doesn't cross a page boundary, same for 16 bit
// C standard might say loading this 32 bit value is UB and to that I say BITE ME
// We use volatile here to break the compiler's assumptions about memory
// This does not "fix" the UB, but does make the UB mostly non-actionable to the compiler in practice
// If you sacrifice performance for "idiomatic correctness" you are a sucker and PRs as such will not be accepted
// We use bitset to guarantee that wake ops will wakes the correct thread, even if multiple threads are waiting
// on the same 32 bit word but different 8/16 bit portions of that word
// _atomic_futex32 is just normal futexes
// _atomic_futex64 has to be emulated via a waiter pool: each "waiter" tracks the wake "generation" to avoid lost wakes between the check and the actual syscall
// (this only breaks if there have been exactly 2^32 wakes between the check and the syscall which is virtually impossible in practice)
// _atomic_futex64 also makes use of bitset to reduce unnecessary wakeups (from 1 in 32 to 1 in 1024)
// The exact same algorithm used by _atomic_futex64 can be used to implement wait for arbitrary sizes or conditions
// Wake up to n waiters of an 8/16-bit slot: wake on the containing aligned
// 32-bit word, with a bitset keyed by the byte offset so only waiters on the
// same sub-word portion are woken
#define _atomic_futex_wake_small(addr, n) int off = ((uintptr_t)addr)&3; \
syscall(SYS_futex, (char*)addr-off, FUTEX_WAKE_BITSET_PRIVATE, n, 0, 0, 1<<(off<<3))
// Plain 32-bit futex wake
#define _atomic_futex_wake32(addr, n) syscall(SYS_futex, addr, FUTEX_WAKE_PRIVATE, n, 0, 0)
// 64-bit wake: bump the hashed pool slot's generation counter, then wake the
// slot's waiters whose bitset bit matches this address
#define _atomic_futex_wake64(addr, n) _Atomic uint32_t *fut = &_atomic_waiter_pool[((uintptr_t)addr>>3)&31]; \
atomic_fetch_add_explicit(fut, 1, memory_order_release); \
syscall(SYS_futex, fut, FUTEX_WAKE_BITSET_PRIVATE, n, 0, 0, 1u<<(((uintptr_t)addr>>8)&31));
// Wait while the 8/16-bit operand equals val; `m` masks the bits of the
// containing 32-bit word that do NOT belong to this operand
#define _atomic_futex_small(addr, val, m) int off = ((uintptr_t)addr)&3; \
addr -= off; off <<= 3; \
check: {\
uint32_t v = atomic_load_explicit((volatile _Atomic uint32_t*) addr, memory_order_relaxed); \
uint32_t v2 = v&m | val<<off; \
if(v != v2) return; \
syscall(SYS_futex, addr, FUTEX_WAIT_BITSET_PRIVATE, v, 0, 0, 1<<off); }
// Plain 32-bit futex wait
#define _atomic_futex32(addr, val) syscall(SYS_futex, addr, FUTEX_WAIT_PRIVATE, val, 0, 0)
// Emulated 64-bit wait: park on the pool slot's generation counter (see notes above)
#define _atomic_futex64(addr, val) _Atomic uint32_t *fut = &_atomic_waiter_pool[((uintptr_t)addr>>3)&31]; \
uint32_t tok = atomic_load_explicit(fut, memory_order_acquire); \
if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; \
uint32_t m = 1u<<(((uintptr_t)addr>>8)&31); \
check: {\
syscall(SYS_futex, fut, FUTEX_WAIT_BITSET_PRIVATE, tok, 0, 0, m); \
if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; \
uint32_t tok2 = tok; if(tok2 == (tok=atomic_load_explicit(fut, memory_order_relaxed))) return; \
syscall(SYS_futex, fut, FUTEX_WAKE_BITSET_PRIVATE, INT_MAX, 0, 0, m); }
#else
// MacOS __ulock and BSD _umtx are pretty similar
// Generic 32 bit futex, because the rest is so similar
#if defined(UMTX_OP_WAIT)
#define _atomic_futex32(addr, val) _umtx_op(addr, UMTX_OP_WAIT_UINT_PRIVATE, val, 0, 0)
#define _atomic_futex_wake32(addr, n) _umtx_op(addr, UMTX_OP_WAKE_PRIVATE, n, 0, 0)
#elif defined(__APPLE__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
// NOTE(review): 0x1000001 is presumably UL_COMPARE_AND_WAIT | ULF_NO_ERRNO —
// verify against the xnu sys/ulock.h headers (this API is unstable)
#define _atomic_futex32(addr, val) syscall(SYS_ulock_wait, 0x1000001, addr, val)
#define _atomic_futex_wake32(addr, n) syscall(SYS_ulock_wake, 0x1000001, addr, n)
#endif
// Read the notes on the linux implementation
// Main difference: we don't have bitset, so _atomic_futex_small needs the same "wake the others" condition as _atomic_futex64
// Additionally, _atomic_futex64 can't use the mask for additional waiter pool separation, so we compensate by using a better hash instead
#define _atomic_futex_wake_small(addr, n) int off = ((uintptr_t)addr)&3; \
_atomic_futex_wake32((char*)addr-off, n)
// Wait while the 8/16-bit operand equals val; `m` masks the bits of the
// containing word that do NOT belong to this operand.
// NOTE(review): the v3 conditions below return on changes in other portions
// (a spurious return, which callers tolerate) and re-wake when the wake may
// have been meant for a different portion — confirm `v3&m` / `v3^~m` encode
// the intended protocol.
#define _atomic_futex_small(addr, val, m) int off = ((uintptr_t)addr)&3; \
volatile _Atomic uint32_t* addr2 = (volatile _Atomic uint32_t*)((char*)addr-off); off <<= 3; \
check: {\
uint32_t v = atomic_load_explicit(addr2, memory_order_relaxed); \
uint32_t v2 = v&m | val<<off; \
if(v != v2) return; \
_atomic_futex32(addr2, v); \
uint32_t v3 = atomic_load_explicit(addr2, memory_order_relaxed)^v; \
if(v3&m) return; \
if(v3^~m) _atomic_futex_wake32(addr2, INT_MAX);}
// On LP64 BSD, _umtx_op can wait on a full 64-bit word natively
#if ULONG_MAX == UINT64_MAX && defined(UMTX_OP_WAIT)
#define _atomic_futex_wake64(addr, n) _umtx_op(addr, UMTX_OP_WAKE_PRIVATE, n, 0, 0)
#define _atomic_futex64(addr, val) _umtx_op(addr, UMTX_OP_WAIT_PRIVATE, val, 0, 0)
#else
// Emulated 64-bit wait/wake through the hashed waiter pool (see linux notes)
#define _atomic_futex_wake64(addr, n) _Atomic uint32_t *fut = &_atomic_waiter_pool[(((uintptr_t)addr>>3)^((uintptr_t)addr>>8))&31]; \
atomic_fetch_add_explicit(fut, 1, memory_order_release); \
_atomic_futex_wake32(fut, n)
#define _atomic_futex64(addr, val) _Atomic uint32_t *fut = &_atomic_waiter_pool[(((uintptr_t)addr>>3)^((uintptr_t)addr>>8))&31]; \
uint32_t tok = atomic_load_explicit(fut, memory_order_acquire); \
if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; \
check: {\
_atomic_futex32(fut, tok); \
if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; \
uint32_t tok2 = tok; if(tok2 == (tok=atomic_load_explicit(fut, memory_order_relaxed))) return; \
_atomic_futex_wake32(fut, INT_MAX); }
#endif
#endif
// Sized wait entry points: park while *addr == val. The `off` identifier in
// the mask arguments is declared by the _atomic_futex_small expansion (the
// mask is evaluated after `off <<= 3`).
static inline void _atomic_wait8(void* addr, uint8_t val){ _atomic_futex_small(addr, val, ~(255<<off)) }
static inline void _atomic_waitloop8(void* addr, uint8_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, _atomic_futex_small(addr, val, ~(255<<off)), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) }
static inline void _atomic_wait16(_Atomic uint16_t* addr, uint16_t val){ _atomic_futex_small(addr, val, (0xFFFF0000>>off)) }
static inline void _atomic_waitloop16(_Atomic uint16_t* addr, uint16_t val){ _atomic_futex_loop(addr, val, _atomic_futex_small(addr, val, (0xFFFF0000>>off)), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) }
static inline void _atomic_wait32(_Atomic uint32_t* addr, uint32_t val){ _atomic_futex32(addr, val); }
// _atomic_futex32 carries no internal `check:` label, so the loop variant
// supplies one for _atomic_futex_loop's trailing `goto check;`
static inline void _atomic_waitloop32(_Atomic uint32_t* addr, uint32_t val){ _atomic_futex_loop(addr, val, check: _atomic_futex32(addr, val), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) }
static inline void _atomic_wait64(_Atomic uint64_t* addr, uint64_t val){ _atomic_futex64(addr, val) }
static inline void _atomic_waitloop64(_Atomic uint64_t* addr, uint64_t val){ _atomic_futex_loop(addr, val, _atomic_futex64(addr, val), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) }
// Wake up to n waiters parked on addr
static inline void _atomic_wake8_16(void* addr, int n){ _atomic_futex_wake_small(addr, n); }
static inline void _atomic_wake32(void* addr, int n){ _atomic_futex_wake32(addr, n); }
static inline void _atomic_wake64(void* addr, int n){ _atomic_futex_wake64(addr, n); }
// Pointer-sized fallback used by the _Generic dispatch below
#if UINTPTR_MAX == UINT64_MAX
static inline void _atomic_wait_arch(void* a_, void* b_){ _Atomic uint64_t* addr = (_Atomic uint64_t*)a_; uint64_t val = (uint64_t)b_; _atomic_futex64(addr, val) }
static inline void _atomic_waitloop_arch(void* a_, void* b_){ _Atomic uint64_t* addr = (_Atomic uint64_t*)a_; uint64_t val = (uint64_t)b_; _atomic_futex_loop(addr, val, _atomic_futex64(addr, val), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) }
#define _atomic_wake_arch _atomic_wake64
#else
static inline void _atomic_wait_arch(void* a_, void* b_){ _atomic_futex32(((_Atomic uint32_t*)a_), ((uint32_t)b_)) }
static inline void _atomic_waitloop_arch(void* a_, void* b_){ _Atomic uint32_t* addr = a_; uint32_t val = (uint32_t)b_; _atomic_futex_loop(addr, val, _atomic_futex32(addr, val), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) }
#define _atomic_wake_arch _atomic_wake32
#endif
// Wake up to n waiters of atomic_wait/atomic_wait_loop on ptr (type-dispatched)
#define atomic_wake(ptr, n) _Generic((ptr), \
_Atomic(_Bool)*: _atomic_wake8_16, _Atomic(uint8_t)*: _atomic_wake8_16, _Atomic(int8_t)*: _atomic_wake8_16, \
_Atomic(uint16_t)*: _atomic_wake8_16, _Atomic(int16_t)*: _atomic_wake8_16, \
_Atomic(uint32_t)*: _atomic_wake32, _Atomic(int32_t)*: _atomic_wake32, \
_Atomic(uint64_t)*: _atomic_wake64, _Atomic(int64_t)*: _atomic_wake64, \
default: _atomic_wake_arch \
)(ptr, n)
#define _atomic_wake32_all(ptr) _atomic_wake32(ptr, INT_MAX)
// Wake threads parked by atomic_wait_until(addr, ...): bump the generation
// counter of addr's hashed waiter-pool slot, then wake that slot
static inline void _atomic_wake_condition(void* addr, int n){
_Atomic uint32_t *fut = &_atomic_waiter_pool[(((uintptr_t)addr)^((uintptr_t)addr>>5))&31];
atomic_fetch_add_explicit(fut, 1, memory_order_release);
_atomic_futex_wake32(fut, n);
}
typedef pthread_t thread_t;
// Number of processors currently online
static inline size_t available_concurrency(){ return sysconf(_SC_NPROCESSORS_ONLN); }
static inline thread_t thread_create(void* (*fn)(void*), void* arg, size_t stack){
// This implementation assumes that (pthread_t)0 is never used. This is usually true in practice
thread_t t = {0};
if(!stack){ pthread_create(&t, 0, fn, arg); return t; }
pthread_attr_t a;
pthread_attr_init(&a);
pthread_attr_setstacksize(&a, stack);
pthread_create(&t, &a, fn, arg);
pthread_attr_destroy(&a);
return t;
}
// Detach: the thread's resources are reclaimed automatically when it exits
static inline void thread_detach(thread_t t){ pthread_detach(t); }
// Join `t` and return the value its entry function returned
static inline void* thread_wait(thread_t t){ void* res; pthread_join(t, &res); return res; }
static inline thread_t thread_self(){ return pthread_self(); }
static inline void thread_yield(){ sched_yield(); }
#ifdef __linux__
// RAW variant is immune to NTP slewing on Linux
#define _A_CLOCK CLOCK_MONOTONIC_RAW
#else
#define _A_CLOCK CLOCK_MONOTONIC
#endif
// Sleep for at least `nanoseconds`, resuming the remainder after signal interruptions
static inline void thread_sleep(uint64_t nanoseconds){
struct timespec ts;
ts.tv_sec = nanoseconds / 1000000000;
ts.tv_nsec = nanoseconds - ts.tv_sec*1000000000;
#ifndef __APPLE__
// Fixed: clock_nanosleep returns the error number directly and does NOT set
// errno, so the old `&& errno == EINTR` test read an indeterminate errno and
// could either spin forever or give up early
while(clock_nanosleep(_A_CLOCK, 0, &ts, &ts) == EINTR);
#else
// nanosleep does set errno, so the errno check is correct here
while(nanosleep(&ts, &ts) && errno == EINTR);
#endif
}
static inline uint64_t local_now(){
struct timespec ts;
clock_gettime(_A_CLOCK, &ts);
return (uint64_t)(ts.tv_nsec/1000) + SECOND_US*(uint64_t)ts.tv_sec;
}
static inline uint64_t epoch_now(){
struct timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
return (uint64_t)(ts.tv_nsec/1000) + SECOND_US*(uint64_t)ts.tv_sec;
}
// Map the library's priority class onto a scheduling policy for the calling
// thread; returns true on success (SCHED_RR typically needs privileges)
static inline bool thread_set_priority(thread_priority_t p){
int policy;
if(p == THREAD_PRIO_REALTIME) policy = SCHED_RR;
#ifdef SCHED_IDLE
else if(p == THREAD_PRIO_BACKGROUND) policy = SCHED_IDLE;
#endif
else policy = SCHED_OTHER;
struct sched_param param = {0};
// Fixed: SCHED_RR requires sched_priority >= sched_get_priority_min(SCHED_RR)
// (1 on Linux); a zero priority makes pthread_setschedparam fail with EINVAL
if(policy == SCHED_RR) param.sched_priority = sched_get_priority_min(SCHED_RR);
return !pthread_setschedparam(pthread_self(), policy, &param);
}
static inline uint64_t thread_now(){
struct timespec ts;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
return (uint64_t)(ts.tv_nsec/1000) + SECOND_US*(uint64_t)ts.tv_sec;
}
#endif
// Park the calling thread while *ptr == val (type-dispatched on the atomic's
// width). May return spuriously; callers should re-check in a loop.
#define atomic_wait(ptr, val) _Generic((ptr), \
_Atomic(_Bool)*: _atomic_wait8, _Atomic(uint8_t)*: _atomic_wait8, _Atomic(int8_t)*: _atomic_wait8, \
_Atomic(uint16_t)*: _atomic_wait16, _Atomic(int16_t)*: _atomic_wait16, \
_Atomic(uint32_t)*: _atomic_wait32, _Atomic(int32_t)*: _atomic_wait32, \
_Atomic(uint64_t)*: _atomic_wait64, _Atomic(int64_t)*: _atomic_wait64, \
default: _atomic_wait_arch \
)(ptr, val)
// Like atomic_wait but spins/yields before parking (type-dispatched).
// Fixed: the default case dispatched to _atomic_wait_arch, which parks
// immediately with no spin/yield phase; the pointer-sized loop variant
// _atomic_waitloop_arch exists for exactly this slot.
#define atomic_wait_loop(ptr, val) _Generic((ptr), \
_Atomic(_Bool)*: _atomic_waitloop8, _Atomic(uint8_t)*: _atomic_waitloop8, _Atomic(int8_t)*: _atomic_waitloop8, \
_Atomic(uint16_t)*: _atomic_waitloop16, _Atomic(int16_t)*: _atomic_waitloop16, \
_Atomic(uint32_t)*: _atomic_waitloop32, _Atomic(int32_t)*: _atomic_waitloop32, \
_Atomic(uint64_t)*: _atomic_waitloop64, _Atomic(int64_t)*: _atomic_waitloop64, \
default: _atomic_waitloop_arch \
)(ptr, val)
// Wait until `cond` evaluates true, keyed by the pointer value `key`:
// spin s times, yield y times, then park on the generation counter of the
// hashed waiter-pool slot for `key`. atomic_wake_condition(key, n) bumps the
// counter and wakes the slot. Because slots are shared (hash of key), wakes
// for other keys cause spurious re-checks; a changed token after waking also
// re-wakes the slot so co-hashed waiters aren't lost.
// NOTE(review): wakers must pass the exact same key value — the slot is
// chosen by (k ^ k>>5) & 31 on both sides.
#define _atomic_wait_until(key, cond, s, y) do{ \
int _atomic_count = s; \
while(_atomic_count--) if(cond) break; else thread_relax(); \
_atomic_count = y; while(_atomic_count--) if(cond) break; else thread_yield(); \
void* _atomic_addr = (void*)(key); \
_Atomic uint32_t *_atomic_fut = &_atomic_waiter_pool[(((uintptr_t)_atomic_addr)^((uintptr_t)_atomic_addr>>5))&31]; \
uint32_t _atomic_tok = atomic_load_explicit(_atomic_fut, memory_order_acquire); \
if(cond) break; \
check: { \
_atomic_wait32(_atomic_fut, _atomic_tok); \
if(cond) break; \
uint32_t _atomic_tok2 = _atomic_tok; if(_atomic_tok2 != (_atomic_tok=atomic_load_explicit(_atomic_fut, memory_order_relaxed))) _atomic_wake32(_atomic_fut, INT_MAX); } \
goto check; }while(0)
// Public wrappers with the default spin/yield budgets
#define atomic_wait_until(key, cond) _atomic_wait_until(key, cond, _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD)
#define atomic_wake_condition(key, n) _atomic_wake_condition((void*)(key), n)
// Acquire `n` units from a counting lock (semaphore-like).
// Bit 31 of *lock appears to act as a "waiters/contended" flag and bits 0..30
// as the available unit count. Blocks (spin -> yield -> park on the lock word)
// until the request is satisfied.
// NOTE(review): on the contended path, available units are drained and `n` is
// rebased to the remaining deficit (-v2) before parking — verify the intended
// fairness/starvation behavior; also note the successful-acquire CAS sets bit
// 31 (`v2|0x80000000`), which keeps lock_release on its wake path.
static inline void lock_acquire(lock_t* lock, int n){
loop0: {}
uint32_t v = atomic_load_explicit(lock, memory_order_relaxed);
loop: {}
int32_t v2 = (v&0x7FFFFFFF)-n;
if(v2<0){
// Not enough units available: try to take what's there and mark contended
if(atomic_compare_exchange_weak_explicit(lock, &v, 0x80000000, memory_order_relaxed, memory_order_relaxed)){
n = -v2;
// block
int count = _ATOMIC_DEFAULT_SPIN;
while(count--) if((v = atomic_load_explicit(lock, memory_order_relaxed)) & 0x7FFFFFFF) goto loop; else thread_relax();
count = _ATOMIC_DEFAULT_YIELD;
while(count--) if((v = atomic_load_explicit(lock, memory_order_relaxed)) & 0x7FFFFFFF) goto loop; else thread_yield();
// Re-assert the flag before parking so releasers know to wake someone
if(!(v&0x80000000)) atomic_fetch_or_explicit(lock, 0x80000000, memory_order_relaxed), v |= 0x80000000;
_atomic_wait32(lock, v);
goto loop0;
}
}else if(atomic_compare_exchange_weak_explicit(lock, &v, v2|0x80000000, memory_order_acquire, memory_order_relaxed)) return; // acquired
goto loop;
}
// Return `n` units to the lock; if the contended flag (bit 31) was set,
// clear it and wake parked acquirers.
static inline void lock_release(lock_t* lock, int n){
uint32_t wk = atomic_fetch_add_explicit(lock, n, memory_order_release);
if(wk&0x80000000){
atomic_fetch_and_explicit(lock, 0x7FFFFFFF, memory_order_relaxed);
// We need to wake at least this many waiters. By waking one more waiter we guarantee that the waiting flag is set again if there are indeed more waiters
wk &= 0x7FFFFFFF;
_atomic_wake32(lock, wk+(wk<0x7FFFFFFF));
}
}
// Maximum number of units a lock_t can hold (bits 0..30)
#define LOCK_MAX 2147483647
#ifdef __APPLE__
// Matches the push guarding the deprecated SYS_ulock_* syscall() uses above
#pragma clang diagnostic pop
#endif
typedef _Atomic(ssize_t) atomic_ssize_t;
// L1 cache line is 64 bytes almost everywhere
#ifndef CACHE_LINE
#define CACHE_LINE 64
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment