-
-
Save BlobTheKat/f8a6c4f0dcfb68d99ed031bcd5339a96 to your computer and use it in GitHub Desktop.
a.h - C11 Atomics & threads complement library
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * Atomics & threads complement library | |
| * Matthew Reiner, 2026 | |
| * Available under the GPL 3.0 license | |
| * Targeting: Windows, Linux, *BSD, MacOS | |
| * Implements the following: | |
| * - Wait/Notify functionality for simple types up to 64 bit | |
| * - Arbitrary condition wait | |
| * - Thread management | |
| * - Relax, yield & sleep | |
| * - Getting the current time | |
| */ | |
| #pragma once | |
| #include <stdint.h> | |
| #include <stdatomic.h> | |
| #include <stdbool.h> | |
| #include <stdalign.h> | |
| static _Atomic uint32_t _atomic_waiter_pool[32]; | |
| typedef _Atomic uint32_t lock_t; | |
| #if !defined(alignas) && !defined(__cplusplus) | |
| #define alignas _Alignas | |
| #endif | |
| #define atomic _Atomic | |
| #define thread_local _Thread_local | |
| typedef _Atomic(uint8_t) atomic_uint8_t; | |
| typedef _Atomic(int8_t) atomic_int8_t; | |
| typedef _Atomic(uint16_t) atomic_uint16_t; | |
| typedef _Atomic(int16_t) atomic_int16_t; | |
| typedef _Atomic(uint32_t) atomic_uint32_t; | |
| typedef _Atomic(int32_t) atomic_int32_t; | |
| typedef _Atomic(uint64_t) atomic_uint64_t; | |
| typedef _Atomic(int64_t) atomic_int64_t; | |
| #define SECOND_US 1000000ull | |
| #define MILLISECOND_US 1000ull | |
| typedef enum{ | |
| THREAD_PRIO_BACKGROUND, | |
| THREAD_PRIO_NORMAL, | |
| THREAD_PRIO_REALTIME | |
| } thread_priority_t; | |
| #if defined(_MSC_VER) | |
| #include <intrin.h> | |
| #endif | |
| #if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86) | |
| static inline void thread_relax(){ | |
| #if defined(_MSC_VER) | |
| _mm_pause(); | |
| #else | |
| __builtin_ia32_pause(); | |
| #endif | |
| } | |
| #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM64) || defined(_M_ARM) | |
| #if defined(__has_include) && __has_include(<arm_acle.h>) | |
| #include <arm_acle.h> | |
| #endif | |
| static inline void thread_relax(){ | |
| #if defined(__ARM_ARCH) | |
| __yield(); | |
| #else | |
| __asm__ __volatile__("yield"); | |
| #endif | |
| } | |
| #elif defined(__riscv) && defined(__riscv_zihintpause) | |
| static inline void thread_relax(){ | |
| __asm__ __volatile__("pause"); | |
| } | |
| #else | |
| inline void thread_relax(){ __asm__ __volatile__("nop"); } | |
| #endif | |
| #ifndef _ATOMIC_DEFAULT_SPIN | |
| // thread_relax() this many times while waiting on an atomic | |
| #define _ATOMIC_DEFAULT_SPIN 48 | |
| #endif | |
| #ifndef _ATOMIC_DEFAULT_YIELD | |
| // thread_yield() this many times while waiting on an atomic | |
| #define _ATOMIC_DEFAULT_YIELD 6 | |
| #endif | |
| // After this many spin/yield, we use a thread parking loop: SYS_futex on Linux, _umtx_op on *BSD, SYS_ulock_* on MacOS, and WaitOnAddress/WakeByAddress* on Windows | |
| // SYS_ulock_* is technically unstable on Mac, although unlikely to go away any time soon | |
| static inline void thread_yield(); | |
| #define _atomic_futex_loop(addr, val, wait, s, y) int count = s; \ | |
| while(count--) if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; else thread_relax(); \ | |
| count = y; while(count--) if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; else thread_yield(); \ | |
| wait; goto check; | |
| #ifdef _WIN32 | |
| #define WIN32_LEAN_AND_MEAN | |
| #define NOMINMAX | |
| #include <windows.h> | |
| #undef WIN32_LEAN_AND_MEAN | |
| #undef NOMINMAX | |
| // Top tier windows trolling | |
| #undef near | |
| #undef far | |
| #undef pascal | |
| #undef cdecl | |
| typedef SSIZE_T ssize_t; | |
| typedef SIZE_T size_t; | |
| static inline void _atomic_wait8(void* addr, uint8_t val){ WaitOnAddress(addr, &val, 1, INFINITE); } | |
| static inline void _atomic_wait16(_Atomic uint16_t* addr, uint16_t val){ WaitOnAddress(addr, &val, 2, INFINITE); } | |
| static inline void _atomic_wait32(_Atomic uint32_t* addr, uint32_t val){ WaitOnAddress(addr, &val, 4, INFINITE); } | |
| static inline void _atomic_wait64(_Atomic uint64_t* addr, uint64_t val){ WaitOnAddress(addr, &val, 8, INFINITE); } | |
| static inline void _atomic_waitloop8(void* addr, uint8_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, check: WaitOnAddress(addr, &val, 1, INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); } | |
| static inline void _atomic_waitloop16(_Atomic uint16_t* addr, uint16_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, check: WaitOnAddress(addr, &val, 2, INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); } | |
| static inline void _atomic_waitloop32(_Atomic uint32_t* addr, uint32_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, check: WaitOnAddress(addr, &val, 4, INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); } | |
| static inline void _atomic_waitloop64(_Atomic uint64_t* addr, uint64_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, check: WaitOnAddress(addr, &val, 8, INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); } | |
| static inline void _atomic_wait_arch(void* a_, void* b_){ WaitOnAddress(a_, &b_, sizeof(void*), INFINITE); } | |
| static inline void _atomic_waitloop_arch(void* a_, void* b_){ _Atomic uintptr_t* addr = (_Atomic uintptr_t*)a_; uintptr_t val = (uintptr_t)b_; _atomic_futex_loop(addr, val, check: WaitOnAddress(addr, &val, sizeof(uintptr_t), INFINITE), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD); } | |
| #define atomic_wake(ptr, n) do{ if((int)(n)==1) WakeByAddressSingle(ptr); else WakeByAddressAll(ptr); }while(0) | |
| #define _atomic_wake32_all(ptr) WakeByAddressAll(ptr) | |
| static inline void _atomic_wake_condition(void* addr, int n){ | |
| _Atomic uint32_t *fut = &_atomic_waiter_pool[(((uintptr_t)addr)^((uintptr_t)addr>>5))&31]; | |
| atomic_fetch_add_explicit(fut, 1, memory_order_release); | |
| atomic_wake(addr, n); | |
| } | |
| struct _thread_t{ | |
| void* (*fn)(void*); | |
| void* arg; | |
| _Atomic HANDLE _handle; | |
| }; | |
| typedef struct _thread_t* thread_t; | |
| static inline size_t available_concurrency(){ return GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); } | |
| _Thread_local thread_t _thread_self; | |
| static inline DWORD WINAPI _thread_wrapper(void* a_){ | |
| struct _thread_t* a = _thread_self = (struct _thread_t*)a_; | |
| a->arg = a->fn(a->arg); | |
| HANDLE h = atomic_exchange_explicit(&a->_handle, 0, memory_order_release); | |
| if(h) CloseHandle(h); | |
| else free(a); | |
| return 0; | |
| } | |
| static inline thread_t thread_create(void* (*fn)(void*), void* arg, size_t stack){ | |
| HANDLE h; | |
| struct _thread_t* t = malloc(sizeof(struct _thread_t)); | |
| t->fn = fn; | |
| t->arg = arg; | |
| atomic_thread_fence(memory_order_release); | |
| t->_handle = CreateThread(NULL, stack, _thread_wrapper, t, 0, NULL); | |
| if(t->_handle == INVALID_HANDLE_VALUE){ free(t); return 0; } | |
| return t; | |
| } | |
| static inline void* thread_wait(thread_t t){ | |
| HANDLE h = atomic_load_explicit(&t->_handle, memory_order_acquire); | |
| if(h){ | |
| WaitForSingleObject(h, INFINITE); | |
| CloseHandle(h); | |
| } | |
| void* ret = t->arg; | |
| free(t); | |
| return ret; | |
| } | |
| static inline void thread_detach(thread_t t){ | |
| HANDLE h = atomic_exchange_explicit(&t->_handle, 0, memory_order_relaxed); | |
| if(h){ CloseHandle(h); } | |
| else free(t); | |
| } | |
| static inline thread_t thread_self(){ return _thread_self; } | |
| static inline void thread_yield(){ SwitchToThread(); } | |
| #define _SLEEP_MAX (uint64_t)(INFINITE-1) | |
| static inline void thread_sleep(uint64_t nanoseconds){ | |
| nanoseconds = (nanoseconds+999999) / 1000000; | |
| // nanoseconds becomes milliseconds | |
| while(nanoseconds > _SLEEP_MAX){ | |
| nanoseconds -= _SLEEP_MAX; | |
| Sleep(_SLEEP_MAX); | |
| } | |
| Sleep((DWORD) nanoseconds); | |
| } | |
| static inline bool thread_set_priority(thread_priority_t p){ | |
| return SetThreadPriority(GetCurrentThread(), p == THREAD_PRIO_BACKGROUND ? THREAD_PRIORITY_LOWEST : p == THREAD_PRIO_REALTIME ? THREAD_PRIORITY_TIME_CRITICAL : THREAD_PRIORITY_NORMAL); | |
| } | |
| static inline uint64_t local_now(){ | |
| static double pfreq = -1; | |
| LARGE_INTEGER counter; | |
| QueryPerformanceCounter(&counter); | |
| if(pfreq < 0){ | |
| LARGE_INTEGER f; | |
| QueryPerformanceFrequency(&f); | |
| pfreq = (double)SECOND_NS / (double)f.QuadPart; | |
| } | |
| return (uint64_t)(counter.QuadPart * pfreq); | |
| } | |
| static inline uint64_t epoch_now(){ | |
| FILETIME ft; | |
| ULARGE_INTEGER uli; | |
| GetSystemTimePreciseAsFileTime(&ft); | |
| uli.LowPart = ft.dwLowDateTime; | |
| uli.HighPart = ft.dwHighDateTime; | |
| return uli.QuadPart/10 - 11644473600000000LL; | |
| } | |
| static inline uint64_t thread_now(){ | |
| FILETIME kernel, user; | |
| if(!GetThreadTimes(GetCurrentThread(), 0, 0, &kernel, &user)) return 0; | |
| uint64_t k = | |
| ((uint64_t)kernel.dwHighDateTime << 32) | | |
| kernel.dwLowDateTime; | |
| uint64_t u = | |
| ((uint64_t)user.dwHighDateTime << 32) | | |
| user.dwLowDateTime; | |
| return (k+u)/10; | |
| } | |
| #else | |
| #include <sched.h> | |
| #include <unistd.h> | |
| #ifdef __linux__ | |
| #include <linux/futex.h> | |
| #elif !defined(__APPLE__) | |
| #include <sys/umtx.h> | |
| #endif | |
| #include <sys/syscall.h> | |
| #include <limits.h> | |
| #include <pthread.h> | |
| #include <errno.h> | |
| #include <time.h> | |
| #ifdef __linux__ | |
| // Note on _atomic_futex_small: | |
| // Every 8 bit value is inside some aligned 32 bit value that doesn't cross a page boundary, same for 16 bit | |
| // C standard might say loading this 32 bit value is UB and to that I say BITE ME | |
| // We use volatile here to break the compiler's assumptions about memory | |
| // This does not "fix" the UB, but does make the UB mostly non-actionable to the compiler in practice | |
| // If you sacrifice performance for "idiomatic correctness" you are a sucker and PRs as such will not be accepted | |
| // We use bitset to guarantee that wake ops will wakes the correct thread, even if multiple threads are waiting | |
| // on the same 32 bit word but different 8/16 bit portions of that word | |
| // _atomic_futex32 is just normal futexes | |
| // _atomic_futex64 has to be emulated via a waiter pool: each "waiter" tracks the wake "generation" to avoid lost wakes between the check and the actual syscall | |
| // (this only breaks if there have been exactly 2^32 wakes between the check and the syscall which is virtually impossible in practice) | |
| // _atomic_futex64 also makes use of bitset to reduce unnecessary wakeups (from 1 in 32 to 1 in 1024) | |
| // The exact same algorithm used by _atomic_futex64 can be used to implement wait for arbitrary sizes or conditions | |
| #define _atomic_futex_wake_small(addr, n) int off = ((uintptr_t)addr)&3; \ | |
| syscall(SYS_futex, (char*)addr-off, FUTEX_WAKE_BITSET_PRIVATE, n, 0, 0, 1<<(off<<3)) | |
| #define _atomic_futex_wake32(addr, n) syscall(SYS_futex, addr, FUTEX_WAKE_PRIVATE, n, 0, 0) | |
| #define _atomic_futex_wake64(addr, n) _Atomic uint32_t *fut = &_atomic_waiter_pool[((uintptr_t)addr>>3)&31]; \ | |
| atomic_fetch_add_explicit(fut, 1, memory_order_release); \ | |
| syscall(SYS_futex, fut, FUTEX_WAKE_BITSET_PRIVATE, n, 0, 0, 1u<<(((uintptr_t)addr>>8)&31)); | |
| #define _atomic_futex_small(addr, val, m) int off = ((uintptr_t)addr)&3; \ | |
| addr -= off; off <<= 3; \ | |
| check: {\ | |
| uint32_t v = atomic_load_explicit((volatile _Atomic uint32_t*) addr, memory_order_relaxed); \ | |
| uint32_t v2 = v&m | val<<off; \ | |
| if(v != v2) return; \ | |
| syscall(SYS_futex, addr, FUTEX_WAIT_BITSET_PRIVATE, v, 0, 0, 1<<off); } | |
| #define _atomic_futex32(addr, val) syscall(SYS_futex, addr, FUTEX_WAIT_PRIVATE, val, 0, 0) | |
| #define _atomic_futex64(addr, val) _Atomic uint32_t *fut = &_atomic_waiter_pool[((uintptr_t)addr>>3)&31]; \ | |
| uint32_t tok = atomic_load_explicit(fut, memory_order_acquire); \ | |
| if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; \ | |
| uint32_t m = 1u<<(((uintptr_t)addr>>8)&31); \ | |
| check: {\ | |
| syscall(SYS_futex, fut, FUTEX_WAIT_BITSET_PRIVATE, tok, 0, 0, m); \ | |
| if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; \ | |
| uint32_t tok2 = tok; if(tok2 == (tok=atomic_load_explicit(fut, memory_order_relaxed))) return; \ | |
| syscall(SYS_futex, fut, FUTEX_WAKE_BITSET_PRIVATE, INT_MAX, 0, 0, m); } | |
| #else | |
| // MacOS __ulock and BSD _umtx are pretty similar | |
| // Generic 32 bit futex, because the rest is so similar | |
| #if defined(UMTX_OP_WAIT) | |
| #define _atomic_futex32(addr, val) _umtx_op(addr, UMTX_OP_WAIT_UINT_PRIVATE, val, 0, 0) | |
| #define _atomic_futex_wake32(addr, n) _umtx_op(addr, UMTX_OP_WAKE_PRIVATE, n, 0, 0) | |
| #elif defined(__APPLE__) | |
| #pragma clang diagnostic push | |
| #pragma clang diagnostic ignored "-Wdeprecated-declarations" | |
| #define _atomic_futex32(addr, val) syscall(SYS_ulock_wait, 0x1000001, addr, val) | |
| #define _atomic_futex_wake32(addr, n) syscall(SYS_ulock_wake, 0x1000001, addr, n) | |
| #endif | |
| // Read the notes on the linux implementation | |
| // Main difference: we don't have bitset, so _atomic_futex_small needs the same "wake the others" condition as _atomic_futex64 | |
| // Additionally, _atomic_futex64 can't use the mask for additional waiter pool separation, so we compensate by using a better hash instead | |
| #define _atomic_futex_wake_small(addr, n) int off = ((uintptr_t)addr)&3; \ | |
| _atomic_futex_wake32((char*)addr-off, n) | |
| #define _atomic_futex_small(addr, val, m) int off = ((uintptr_t)addr)&3; \ | |
| volatile _Atomic uint32_t* addr2 = (volatile _Atomic uint32_t*)((char*)addr-off); off <<= 3; \ | |
| check: {\ | |
| uint32_t v = atomic_load_explicit(addr2, memory_order_relaxed); \ | |
| uint32_t v2 = v&m | val<<off; \ | |
| if(v != v2) return; \ | |
| _atomic_futex32(addr2, v); \ | |
| uint32_t v3 = atomic_load_explicit(addr2, memory_order_relaxed)^v; \ | |
| if(v3&m) return; \ | |
| if(v3^~m) _atomic_futex_wake32(addr2, INT_MAX);} | |
| #if ULONG_MAX == UINT64_MAX && defined(UMTX_OP_WAIT) | |
| #define _atomic_futex_wake64(addr, n) _umtx_op(addr, UMTX_OP_WAKE_PRIVATE, n, 0, 0) | |
| #define _atomic_futex64(addr, val) _umtx_op(addr, UMTX_OP_WAIT_PRIVATE, val, 0, 0) | |
| #else | |
| #define _atomic_futex_wake64(addr, n) _Atomic uint32_t *fut = &_atomic_waiter_pool[(((uintptr_t)addr>>3)^((uintptr_t)addr>>8))&31]; \ | |
| atomic_fetch_add_explicit(fut, 1, memory_order_release); \ | |
| _atomic_futex_wake32(fut, n) | |
| #define _atomic_futex64(addr, val) _Atomic uint32_t *fut = &_atomic_waiter_pool[(((uintptr_t)addr>>3)^((uintptr_t)addr>>8))&31]; \ | |
| uint32_t tok = atomic_load_explicit(fut, memory_order_acquire); \ | |
| if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; \ | |
| check: {\ | |
| _atomic_futex32(fut, tok); \ | |
| if(atomic_load_explicit(addr, memory_order_relaxed) != val) return; \ | |
| uint32_t tok2 = tok; if(tok2 == (tok=atomic_load_explicit(fut, memory_order_relaxed))) return; \ | |
| _atomic_futex_wake32(fut, INT_MAX); } | |
| #endif | |
| #endif | |
| static inline void _atomic_wait8(void* addr, uint8_t val){ _atomic_futex_small(addr, val, ~(255<<off)) } | |
| static inline void _atomic_waitloop8(void* addr, uint8_t val){ _atomic_futex_loop((_Atomic uint8_t*)addr, val, _atomic_futex_small(addr, val, ~(255<<off)), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) } | |
| static inline void _atomic_wait16(_Atomic uint16_t* addr, uint16_t val){ _atomic_futex_small(addr, val, (0xFFFF0000>>off)) } | |
| static inline void _atomic_waitloop16(_Atomic uint16_t* addr, uint16_t val){ _atomic_futex_loop(addr, val, _atomic_futex_small(addr, val, (0xFFFF0000>>off)), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) } | |
| static inline void _atomic_wait32(_Atomic uint32_t* addr, uint32_t val){ _atomic_futex32(addr, val); } | |
| static inline void _atomic_waitloop32(_Atomic uint32_t* addr, uint32_t val){ _atomic_futex_loop(addr, val, check: _atomic_futex32(addr, val), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) } | |
| static inline void _atomic_wait64(_Atomic uint64_t* addr, uint64_t val){ _atomic_futex64(addr, val) } | |
| static inline void _atomic_waitloop64(_Atomic uint64_t* addr, uint64_t val){ _atomic_futex_loop(addr, val, _atomic_futex64(addr, val), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) } | |
| static inline void _atomic_wake8_16(void* addr, int n){ _atomic_futex_wake_small(addr, n); } | |
| static inline void _atomic_wake32(void* addr, int n){ _atomic_futex_wake32(addr, n); } | |
| static inline void _atomic_wake64(void* addr, int n){ _atomic_futex_wake64(addr, n); } | |
| #if UINTPTR_MAX == UINT64_MAX | |
| static inline void _atomic_wait_arch(void* a_, void* b_){ _Atomic uint64_t* addr = (_Atomic uint64_t*)a_; uint64_t val = (uint64_t)b_; _atomic_futex64(addr, val) } | |
| static inline void _atomic_waitloop_arch(void* a_, void* b_){ _Atomic uint64_t* addr = (_Atomic uint64_t*)a_; uint64_t val = (uint64_t)b_; _atomic_futex_loop(addr, val, _atomic_futex64(addr, val), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) } | |
| #define _atomic_wake_arch _atomic_wake64 | |
| #else | |
| static inline void _atomic_wait_arch(void* a_, void* b_){ _atomic_futex32(((_Atomic uint32_t*)a_), ((uint32_t)b_)) } | |
| static inline void _atomic_waitloop_arch(void* a_, void* b_){ _Atomic uint32_t* addr = a_; uint32_t val = (uint32_t)b_; _atomic_futex_loop(addr, val, _atomic_futex32(addr, val), _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) } | |
| #define _atomic_wake_arch _atomic_wake32 | |
| #endif | |
| #define atomic_wake(ptr, n) _Generic((ptr), \ | |
| _Atomic(_Bool)*: _atomic_wake8_16, _Atomic(uint8_t)*: _atomic_wake8_16, _Atomic(int8_t)*: _atomic_wake8_16, \ | |
| _Atomic(uint16_t)*: _atomic_wake8_16, _Atomic(int16_t)*: _atomic_wake8_16, \ | |
| _Atomic(uint32_t)*: _atomic_wake32, _Atomic(int32_t)*: _atomic_wake32, \ | |
| _Atomic(uint64_t)*: _atomic_wake64, _Atomic(int64_t)*: _atomic_wake64, \ | |
| default: _atomic_wake_arch \ | |
| )(ptr, n) | |
| #define _atomic_wake32_all(ptr) _atomic_wake32(ptr, INT_MAX) | |
| static inline void _atomic_wake_condition(void* addr, int n){ | |
| _Atomic uint32_t *fut = &_atomic_waiter_pool[(((uintptr_t)addr)^((uintptr_t)addr>>5))&31]; | |
| atomic_fetch_add_explicit(fut, 1, memory_order_release); | |
| _atomic_futex_wake32(fut, n); | |
| } | |
| typedef pthread_t thread_t; | |
| static inline size_t available_concurrency(){ return sysconf(_SC_NPROCESSORS_ONLN); } | |
| static inline thread_t thread_create(void* (*fn)(void*), void* arg, size_t stack){ | |
| // This implementation assumes that (pthread_t)0 is never used. This is usually true in practice | |
| thread_t t = {0}; | |
| if(!stack){ pthread_create(&t, 0, fn, arg); return t; } | |
| pthread_attr_t a; | |
| pthread_attr_init(&a); | |
| pthread_attr_setstacksize(&a, stack); | |
| pthread_create(&t, &a, fn, arg); | |
| pthread_attr_destroy(&a); | |
| return t; | |
| } | |
| static inline void thread_detach(thread_t t){ pthread_detach(t); } | |
| static inline void* thread_wait(thread_t t){ void* res; pthread_join(t, &res); return res; } | |
| static inline thread_t thread_self(){ return pthread_self(); } | |
| static inline void thread_yield(){ sched_yield(); } | |
| #ifdef __linux__ | |
| #define _A_CLOCK CLOCK_MONOTONIC_RAW | |
| #else | |
| #define _A_CLOCK CLOCK_MONOTONIC | |
| #endif | |
| static inline void thread_sleep(uint64_t nanoseconds){ | |
| struct timespec ts; | |
| ts.tv_sec = nanoseconds / 1000000000; | |
| ts.tv_nsec = nanoseconds - ts.tv_sec*1000000000; | |
| #ifndef __APPLE__ | |
| while(clock_nanosleep(_A_CLOCK, 0, &ts, &ts) && errno == EINTR); | |
| #else | |
| while(nanosleep(&ts, &ts) && errno == EINTR); | |
| #endif | |
| } | |
| static inline uint64_t local_now(){ | |
| struct timespec ts; | |
| clock_gettime(_A_CLOCK, &ts); | |
| return (uint64_t)(ts.tv_nsec/1000) + SECOND_US*(uint64_t)ts.tv_sec; | |
| } | |
| static inline uint64_t epoch_now(){ | |
| struct timespec ts; | |
| clock_gettime(CLOCK_REALTIME, &ts); | |
| return (uint64_t)(ts.tv_nsec/1000) + SECOND_US*(uint64_t)ts.tv_sec; | |
| } | |
| static inline bool thread_set_priority(thread_priority_t p){ | |
| struct sched_param param = {0}; | |
| #ifdef SCHED_IDLE | |
| return !pthread_setschedparam(pthread_self(), p == THREAD_PRIO_REALTIME ? SCHED_RR : p == THREAD_PRIO_NORMAL ? SCHED_OTHER : SCHED_IDLE, ¶m); | |
| #else | |
| return !pthread_setschedparam(pthread_self(), p == THREAD_PRIO_REALTIME ? SCHED_RR : SCHED_OTHER, ¶m); | |
| #endif | |
| } | |
| static inline uint64_t thread_now(){ | |
| struct timespec ts; | |
| clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); | |
| return (uint64_t)(ts.tv_nsec/1000) + SECOND_US*(uint64_t)ts.tv_sec; | |
| } | |
| #endif | |
| #define atomic_wait(ptr, val) _Generic((ptr), \ | |
| _Atomic(_Bool)*: _atomic_wait8, _Atomic(uint8_t)*: _atomic_wait8, _Atomic(int8_t)*: _atomic_wait8, \ | |
| _Atomic(uint16_t)*: _atomic_wait16, _Atomic(int16_t)*: _atomic_wait16, \ | |
| _Atomic(uint32_t)*: _atomic_wait32, _Atomic(int32_t)*: _atomic_wait32, \ | |
| _Atomic(uint64_t)*: _atomic_wait64, _Atomic(int64_t)*: _atomic_wait64, \ | |
| default: _atomic_wait_arch \ | |
| )(ptr, val) | |
| #define atomic_wait_loop(ptr, val) _Generic((ptr), \ | |
| _Atomic(_Bool)*: _atomic_waitloop8, _Atomic(uint8_t)*: _atomic_waitloop8, _Atomic(int8_t)*: _atomic_waitloop8, \ | |
| _Atomic(uint16_t)*: _atomic_waitloop16, _Atomic(int16_t)*: _atomic_waitloop16, \ | |
| _Atomic(uint32_t)*: _atomic_waitloop32, _Atomic(int32_t)*: _atomic_waitloop32, \ | |
| _Atomic(uint64_t)*: _atomic_waitloop64, _Atomic(int64_t)*: _atomic_waitloop64, \ | |
| default: _atomic_wait_arch \ | |
| )(ptr, val) | |
| #define _atomic_wait_until(key, cond, s, y) do{ \ | |
| int _atomic_count = s; \ | |
| while(_atomic_count--) if(cond) break; else thread_relax(); \ | |
| _atomic_count = y; while(_atomic_count--) if(cond) break; else thread_yield(); \ | |
| void* _atomic_addr = (void*)(key); \ | |
| _Atomic uint32_t *_atomic_fut = &_atomic_waiter_pool[(((uintptr_t)_atomic_addr)^((uintptr_t)_atomic_addr>>5))&31]; \ | |
| uint32_t _atomic_tok = atomic_load_explicit(_atomic_fut, memory_order_acquire); \ | |
| if(cond) break; \ | |
| check: { \ | |
| _atomic_wait32(_atomic_fut, _atomic_tok); \ | |
| if(cond) break; \ | |
| uint32_t _atomic_tok2 = _atomic_tok; if(_atomic_tok2 != (_atomic_tok=atomic_load_explicit(_atomic_fut, memory_order_relaxed))) _atomic_wake32(_atomic_fut, INT_MAX); } \ | |
| goto check; }while(0) | |
| #define atomic_wait_until(key, cond) _atomic_wait_until(key, cond, _ATOMIC_DEFAULT_SPIN, _ATOMIC_DEFAULT_YIELD) | |
| #define atomic_wake_condition(key, n) _atomic_wake_condition((void*)(key), n) | |
| static inline void lock_acquire(lock_t* lock, int n){ | |
| loop0: {} | |
| uint32_t v = atomic_load_explicit(lock, memory_order_relaxed); | |
| loop: {} | |
| int32_t v2 = (v&0x7FFFFFFF)-n; | |
| if(v2<0){ | |
| if(atomic_compare_exchange_weak_explicit(lock, &v, 0x80000000, memory_order_relaxed, memory_order_relaxed)){ | |
| n = -v2; | |
| // block | |
| int count = _ATOMIC_DEFAULT_SPIN; | |
| while(count--) if((v = atomic_load_explicit(lock, memory_order_relaxed)) & 0x7FFFFFFF) goto loop; else thread_relax(); | |
| count = _ATOMIC_DEFAULT_YIELD; | |
| while(count--) if((v = atomic_load_explicit(lock, memory_order_relaxed)) & 0x7FFFFFFF) goto loop; else thread_yield(); | |
| if(!(v&0x80000000)) atomic_fetch_or_explicit(lock, 0x80000000, memory_order_relaxed), v |= 0x80000000; | |
| _atomic_wait32(lock, v); | |
| goto loop0; | |
| } | |
| }else if(atomic_compare_exchange_weak_explicit(lock, &v, v2|0x80000000, memory_order_acquire, memory_order_relaxed)) return; // acquired | |
| goto loop; | |
| } | |
| static inline void lock_release(lock_t* lock, int n){ | |
| uint32_t wk = atomic_fetch_add_explicit(lock, n, memory_order_release); | |
| if(wk&0x80000000){ | |
| atomic_fetch_and_explicit(lock, 0x7FFFFFFF, memory_order_relaxed); | |
| // We need to wake at least this many waiters. By waking one more waiter we guarantee that the waiting flag is set again if there are indeed more waiters | |
| wk &= 0x7FFFFFFF; | |
| _atomic_wake32(lock, wk+(wk<0x7FFFFFFF)); | |
| } | |
| } | |
| #define LOCK_MAX 2147483647 | |
| #ifdef __APPLE__ | |
| #pragma clang diagnostic pop | |
| #endif | |
| typedef _Atomic(ssize_t) atomic_ssize_t; | |
| // L1 cache line is 64 bytes almost everywhere | |
| #ifndef CACHE_LINE | |
| #define CACHE_LINE 64 | |
| #endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment