|
// Copyright 2025 Igor Burago. Released under the ISC License. |
|
|
|
#include <limits.h> // CHAR_BIT |
|
#include <stddef.h> // ptrdiff_t |
|
#include <stdint.h> |
|
#include <string.h> // memcpy(), memmove(), memset() |
|
|
|
// Built-in u128 support is not strictly required by the algorithms, but this
// implementation assumes it: it makes it easy to guarantee the absence of
// overflows during sampling in rank_partition() for all possible input array
// lengths, and it also simplifies the implementation of the mcg64 RNG.
|
typedef unsigned __int128 uint128_t; |
|
|
|
#if !defined(__has_attribute) |
|
#define __has_attribute(x) (0) |
|
#endif |
|
#if !defined(__has_builtin) |
|
#define __has_builtin(x) (0) |
|
#endif |
|
|
|
#if __has_attribute(always_inline) |
|
#define must_inline inline __attribute__((always_inline)) |
|
#else |
|
#define must_inline inline |
|
#endif |
|
#if __has_attribute(noinline) |
|
#define must_not_inline __attribute__((noinline)) |
|
#else |
|
#define must_not_inline |
|
#endif |
|
#if __has_attribute(flatten) |
|
#define inline_callees __attribute__((flatten)) |
|
#else |
|
#define inline_callees |
|
#endif |
|
|
|
#if __has_builtin(__builtin_unpredictable) |
|
#define fickle(x) (__builtin_unpredictable((x))) |
|
#else |
|
#define fickle(x) (x) |
|
#endif |
|
#if __has_builtin(__builtin_expect) |
|
#define usually(x, c) (__builtin_expect((x), (c))) |
|
#else |
|
#define usually(x, c) (x) |
|
#endif |
|
#define likely(x) (usually(!!(x), 1)) |
|
#define unlikely(x) (usually(!!(x), 0)) |
|
|
|
#define glue_raw(x, y) x##y |
|
#define glue(x, y) glue_raw(x, y) |
|
#define dub(name) glue(name, glue(glue(__dub_L,__LINE__),__)) |
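// For example, dub(tmp) appearing on line 57 expands to the identifier
// tmp__dub_L57__, so macro-local temporaries get names that are unique to the
// line of the expansion, and nested macro uses on different lines cannot collide.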
|
|
|
#define apriori(x) void *(*compile_time_check_dummy__(void))[ \ |
|
sizeof(struct { unsigned compile_time_check : (x) ? 1 : -1; })] |
|
|
|
// #define DEBUG_BUILD |
|
#if defined(DEBUG_BUILD) |
|
#include <stdio.h> |
|
#if __has_builtin(__builtin_trap) |
|
#define terminate_abnormally() (__builtin_trap()) |
|
#else |
|
#define terminate_abnormally() ((void)(*(volatile int *)0 = 0)) |
|
#endif |
|
#define surely(x) (likely(x) ? (void)0 : \ |
|
(fprintf(stderr, "FATAL: %s:%d: surely(%s) does not hold!\n", __FILE__, __LINE__, #x), \ |
|
fflush(NULL), \ |
|
terminate_abnormally())) |
|
#define unreachable() surely(0 && "This code path must never be reached.") |
|
#else |
|
#if __has_builtin(__builtin_unreachable) |
|
#define unreachable() (__builtin_unreachable()) |
|
#else |
|
static must_inline void unreachable(void) { for (;;) { *(volatile int *)0 = 0; } } |
|
#endif |
|
#define surely(x) (likely(x) ? (void)0 : unreachable()) |
|
#endif |
|
|
|
#if __STDC_VERSION__ >= 202311L |
|
#define aligned_to(n) alignas(+(n)) |
|
#elif __STDC_VERSION__ >= 201112L |
|
#define aligned_to(n) _Alignas(+(n)) |
|
#elif __has_attribute(aligned) |
|
#define aligned_to(n) __attribute__((aligned(n))) |
|
#else |
|
#error "aligned_to(n) specifier cannot be defined." |
|
#endif |
|
|
|
// For aligning hot buffers in partition() and rank_partitioned_heapsort(): |
|
enum { CACHE_LINE_SIZE = 64 }; |
|
|
|
#define countof(arr) (sizeof(arr) / sizeof((arr)[0])) |
|
|
|
#define is_pow2(x) ((x) > 0 && ((x) & ((x)-1)) == 0) |
|
#define int_type_is_signed(type) ((type)-1 < (type)0) |
|
#define int_type_max(type) (!int_type_is_signed(type) ? (type)-1 : \ |
|
(type)((((type)1 << (CHAR_BIT*(int)sizeof(type) - 2)) - 1)*2 + 1)) |
|
|
|
#define leading_zeros_u32(x) (__builtin_clzg((uint32_t)(x), 32)) |
|
#define leading_zeros_u64(x) (__builtin_clzg((uint64_t)(x), 64)) |
|
#if defined(__SIZEOF_INT128__) |
|
#define leading_zeros_u128(x) (__builtin_clzg((uint128_t)(x), 128)) |
|
#define bit_width(x) ( \ |
|
sizeof(x) <= sizeof(uint32_t) ? 32 - leading_zeros_u32(x) : \ |
|
sizeof(x) <= sizeof(uint64_t) ? 64 - leading_zeros_u64(x) : \ |
|
128 - leading_zeros_u128(x)) |
|
#else |
|
#define bit_width(x) (sizeof(x) <= sizeof(uint32_t) \ |
|
? 32 - leading_zeros_u32(x) : 64 - leading_zeros_u64(x)) |
|
#endif |
|
|
|
#define swap(x, y) do { \ |
|
void *dub(xp) = &(x), *dub(yp) = &(y); \ |
|
apriori(sizeof(x) == sizeof(y)); \ |
|
unsigned char dub(tmp)[sizeof(x)]; \ |
|
memcpy(dub(tmp), dub(xp), sizeof(dub(tmp))); \ |
|
memmove(dub(xp), dub(yp), sizeof(dub(tmp))); \ |
|
memcpy(dub(yp), dub(tmp), sizeof(dub(tmp))); \ |
|
} while (0) |
|
|
|
static must_inline void * |
|
align_ptr_down(void *p, uintptr_t align) { |
|
surely(is_pow2(align)); |
|
return (void *)((uintptr_t)p & -align); |
|
} |
|
static must_inline void * |
|
align_ptr_up(void *p, uintptr_t align) { |
|
surely(is_pow2(align)); |
|
return (void *)(((uintptr_t)p + align-1) & -align); |
|
} |
|
|
|
static must_inline uint64_t |
|
load_u64(void *p) { |
|
uint64_t x = 0; |
|
memcpy(&x, p, sizeof(x)); |
|
return x; |
|
} |
|
static must_inline uint64_t |
|
seed_mix_u64(uint64_t seed, uint64_t x) { |
|
x += seed * 1122334455667788991u; |
|
return x ^ (x >> 32); |
|
} |
|
|
|
static must_inline uint64_t |
|
splitmix64_next(uint64_t *s) { |
|
uint64_t x = *s += 0x9E3779B97F4A7C15u; |
|
x = (x ^ (x>>30)) * 0xBF58476D1CE4E5B9u; |
|
x = (x ^ (x>>27)) * 0x94D049BB133111EBu; |
|
return x ^ (x>>31); |
|
} |
|
|
|
typedef struct mcg64 mcg64_t; |
|
struct mcg64 { uint128_t s; }; |
|
static must_inline mcg64_t |
|
mcg64(uint64_t seed) { |
|
uint64_t a0 = splitmix64_next(&seed); |
|
uint64_t a1 = splitmix64_next(&seed); |
|
mcg64_t r = {0}; |
|
r.s = ((uint128_t)a1 << 64) | a0; |
|
return r; |
|
} |
|
static must_inline uint64_t |
|
mcg64_next(mcg64_t *r) { |
|
return (r->s *= 0xDA942042E4DD58B5u) >> 64; |
|
} |
|
|
|
typedef struct pcg32 pcg32_t; |
|
struct pcg32 { uint64_t s, t; }; |
|
static must_inline pcg32_t |
|
pcg32(uint64_t seed) { |
|
pcg32_t r = {0}; |
|
r.s = splitmix64_next(&seed); |
|
r.t = splitmix64_next(&seed) | 1; |
|
return r; |
|
} |
|
static must_inline uint32_t |
|
pcg32_next(pcg32_t *r) { |
|
uint64_t s = r->s; |
|
r->s = r->s*0x5851F42D4C957F2Du + r->t; |
|
uint32_t x = (s ^ (s>>18)) >> 27, k = s>>59; |
|
return (x >> k) | (x << (-k & 31)); |
|
} |
|
|
|
// Fast multiply-shift mapping onto [beg,end[. Only slightly biased; the bias
// grows as (end-beg) approaches the u32/u64 maximum.
|
static must_inline uint32_t |
|
onto_range_u32(uint32_t x, uint32_t beg, uint32_t end) { |
|
surely(beg < end); |
|
x = ((uint64_t)x * (end - beg)) >> 32; |
|
return beg + x; |
|
} |
|
static must_inline uint64_t |
|
onto_range_u64(uint64_t x, uint64_t beg, uint64_t end) { |
|
surely(beg < end); |
|
x = ((uint128_t)x * (end - beg)) >> 64; |
|
return beg + x; |
|
} |
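// Illustrative usage (a sketch, not part of the benchmark): drawing a roughly
// uniform value in [beg,end[ takes one RNG step plus one multiply-shift, since
// onto_range_u32(x, beg, end) == beg + floor(x*(end-beg) / 2^32):
//
//     pcg32_t rng = pcg32(42);                                 // any seed
//     uint32_t die = onto_range_u32(pcg32_next(&rng), 1, 7);   // uniform-ish 1..6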
|
|
|
|
|
//-----------------------------------------------------------------------------* |
|
// ARRAY VALUE AND INDEX TYPES |
|
//-----------------------------------------------------------------------------* |
|
|
|
typedef int item_t; |
|
#define item_less(x, y) ((x) < (y)) |
|
|
|
// Both signed and unsigned types are supported: |
|
typedef uint32_t index_t; |
|
|
|
// #define INDEX_RNG_USING_MCG |
|
#if defined(INDEX_RNG_USING_MCG) |
|
// MCG is faster than LCG or PCG, and is usually good enough for quicksort. |
|
typedef mcg64_t index_rng_t; |
|
#define index_rng(seed) (mcg64(seed)) |
|
#define index_rand(rng, len) (onto_range_u64(mcg64_next(rng), 0, (len))) |
|
#else |
|
typedef pcg32_t index_rng_t; |
|
#define index_rng(seed) (pcg32(seed)) |
|
#define index_rand(rng, len) (onto_range_u32(pcg32_next(rng), 0, (len))) |
|
#endif |
|
|
|
// An index_rng() wrapper for seeding a random index generator from an array. |
|
enum { INDEX_RNG_SEED_SAMPLE_COUNT = 128 }; |
|
static index_rng_t |
|
index_rng_seeded_from(item_t *arr, index_t len) { |
|
// Take advantage of the randomness added to addresses on modern systems |
|
// due to position-independent code, address space layout randomization, |
|
// and random stack gap insertion: |
|
uint64_t seed = -1; |
|
seed = seed_mix_u64(seed, (uintptr_t)&memset); |
|
seed = seed_mix_u64(seed, (uintptr_t)&index_rng_seeded_from); |
|
seed = seed_mix_u64(seed, (uintptr_t)&seed); |
|
seed = seed_mix_u64(seed, (uintptr_t)arr); |
|
index_rng_t rng = index_rng(seed); |
|
|
|
// Then, mix in some bits from the array itself by breaking it into 64-bit |
|
// aligned chunks and randomly choosing one u64 value out of each using the |
|
// seed obtained so far. Since chunks are sampled with index_rand(), which
// returns index_t values, index_t must be wide enough to hold any offset up
// to the maximum possible chunk length:
|
enum { N_CHUNKS = INDEX_RNG_SEED_SAMPLE_COUNT }; |
|
apriori(int_type_max(index_t) >= |
|
(int_type_max(index_t) * sizeof(item_t)) / (N_CHUNKS * sizeof(uint64_t))); |
|
|
|
uint64_t *beg = align_ptr_up(arr, sizeof(uint64_t)); |
|
uint64_t *end = align_ptr_down(arr+len, sizeof(uint64_t)); |
|
if unlikely(beg >= end) { return rng; } |
|
|
|
size_t span = end - beg; |
|
if unlikely(span < N_CHUNKS) { |
|
uint64_t *p = beg + index_rand(&rng, span); |
|
seed = seed_mix_u64(seed, load_u64(p)); |
|
return index_rng(seed); |
|
} |
|
|
|
size_t chunk_len = span/N_CHUNKS; |
|
beg += index_rand(&rng, span%N_CHUNKS + 1); |
|
end = beg + chunk_len*N_CHUNKS; |
|
for (uint64_t *chunk = beg; chunk != end; chunk += chunk_len) { |
|
uint64_t *p = chunk + index_rand(&rng, chunk_len); |
|
seed = seed_mix_u64(seed, load_u64(p)); |
|
} |
|
return index_rng(seed); |
|
} |
|
|
|
static must_inline void |
|
index_rand_3(index_t i[static 3], index_rng_t *rng, index_t len) { |
|
surely(len >= 3); |
|
i[0] = index_rand(rng, len-0); |
|
i[1] = index_rand(rng, len-1); |
|
i[2] = index_rand(rng, len-2); |
|
unsigned i1_past_i0 = (i[1] >= i[0]); |
|
i[1] += i1_past_i0; // Count i[1]-th item, jumping over i[0]-th. |
|
i[2] += (i[2] >= i[!i1_past_i0]); // Count i[2]-th, skipping min(i[0],i[1])-th... |
|
i[2] += (i[2] >= i[i1_past_i0]); // ...and then max(i[0],i[1])-th. |
|
} |
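// A worked trace of the skip-adjustment above (illustrative only): with len = 5
// and raw draws {2, 2, 2}, i[0] stays 2; i[1] is bumped past it to 3; i[2] is
// bumped past 2 and then past 3, ending at 4. The result {2, 3, 4} is three
// distinct indices in [0,5[, as required.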
|
static must_inline index_t |
|
median_of_3_idx(item_t *arr, index_t i[static 3]) { |
|
unsigned gt01 = !!item_less(arr[i[1]], arr[i[0]]); |
|
unsigned lt02 = !!item_less(arr[i[0]], arr[i[2]]); |
|
unsigned lt12 = !!item_less(arr[i[1]], arr[i[2]]); |
|
return i[(gt01 ^ lt02) + (lt02 ^ lt12)]; |
|
} |
|
|
|
static void |
|
insertion_sort(item_t *arr, index_t len) { |
|
surely(len >= 0); |
|
for (index_t i = (len != 0); i != len; i++) { |
|
if fickle(!item_less(arr[i], arr[i-1])) { continue; } |
|
item_t x = arr[i]; |
|
index_t j = i; |
|
do { arr[j] = arr[j-1]; } while (--j != 0 && fickle(item_less(x, arr[j-1]))); |
|
arr[j] = x; |
|
} |
|
} |
|
|
|
|
|
//-----------------------------------------------------------------------------* |
|
// KEY PREREQUISITE: Fast array partitioning |
|
//-----------------------------------------------------------------------------* |
|
|
|
enum { PARTITION_BATCH_LEN = 256 }; |
|
apriori(PARTITION_BATCH_LEN - 1 <= int_type_max(unsigned char)); // For storing offsets in uchar. |
|
apriori(PARTITION_BATCH_LEN % 8 == 0); // For loop unrolling. |
|
|
|
#define REPEAT_1(s) {s;} |
|
#define REPEAT_8(s) {{s;}{s;}{s;}{s;}{s;}{s;}{s;}{s;}} |
|
|
|
#define BATCH_COLLECT_ITEMS(offsets_buf, n_offsets_ptr, \ |
|
idx_var, batch_len, match_expr, unroll_macro) do { \ |
|
unsigned char *dub(buf) = (offsets_buf); \ |
|
unsigned dub(n) = 0, (idx_var) = 0, dub(end) = (batch_len); \ |
|
while ((idx_var) != dub(end)) unroll_macro({ \ |
|
dub(buf)[dub(n)] = (idx_var); \ |
|
dub(n) += !!(match_expr); \ |
|
(idx_var)++; \ |
|
}) \ |
|
*(n_offsets_ptr) = dub(n); \ |
|
} while (0) |
|
|
|
#define BATCH_EXCHANGE_ITEMS(count, next_l_ptr_expr, next_r_ptr_expr) do { \ |
|
unsigned dub(n) = (count); \ |
|
if (dub(n) == 0) { break; } \ |
|
item_t *dub(l) = (next_l_ptr_expr), dub(x) = *dub(l); \ |
|
item_t *dub(r) = (next_r_ptr_expr); *dub(l) = *dub(r); \ |
|
while (--dub(n) != 0) { \ |
|
dub(l) = (next_l_ptr_expr); *dub(r) = *dub(l); \ |
|
dub(r) = (next_r_ptr_expr); *dub(l) = *dub(r); \ |
|
} \ |
|
*dub(r) = dub(x); \ |
|
} while (0) |
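// A short trace of the cyclic exchange (illustrative only): for count == 2 with
// left pointers l0, l1 and right pointers r0, r1, the macro above performs
//
//     x = *l0;  *l0 = *r0;  *r0 = *l1;  *l1 = *r1;  *r1 = x;
//
// i.e., 2n+1 moves for n pairs instead of the 3n that pairwise swaps would take.
// The result is a single rotation rather than the swaps l0<->r0 and l1<->r1,
// which is equivalent for partitioning purposes: any left-side offender may end
// up in any right-side slot and vice versa.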
|
|
|
// partition() partitions array elements in [0,len[ using the item currently |
|
// stored at pivot_idx for the pivot and returns the index it ends up at after |
|
// partitioning is done. |
|
// |
|
// This is the normal Hoare partitioning, except that item swaps are done in |
|
// batches instead of one by one in order to eliminate branch mispredictions. |
|
// Items from each end of the current interval are compared with the pivot in |
|
// chunks of PARTITION_BATCH_LEN items, and the offsets of those that need to be
// exchanged are buffered in a fixed-iteration loop (see BATCH_COLLECT_ITEMS()).
|
// Pairs of items with indices taken from the two buffers are then swapped until |
|
// one or both of the buffers are emptied, at which point the corresponding end |
|
// of the interval is advanced, and the emptied buffer is refilled by scanning |
|
// the next batch of items. |
|
// |
|
// To reduce the number of item moves induced by the swapping loop (from 3 to |
|
// 2 per iteration), a cyclic permutation is used instead of pairwise swaps,
|
// i.e., instead of straightforward exchanges l(i) <-> r(i) for i = 0,...,n-1, |
|
// we effectively left rotate the following interleaved sequence by one: |
|
// x <- l(0) <- r(0) <- l(1) <- r(1) <- l(2) <- r(2) <- ... |
|
// ... <- l(n-2) <- r(n-2) <- l(n-1) <- r(n-1) <- x |
|
// (see BATCH_EXCHANGE_ITEMS()). |
|
static index_t |
|
partition(item_t *arr, index_t len, index_t pivot_idx) { |
|
surely(len >= 0), surely(0 <= pivot_idx), surely(pivot_idx < len); |
|
if unlikely(len <= 1) { return 0; } |
|
|
|
item_t pivot = arr[pivot_idx]; |
|
arr[pivot_idx] = arr[0]; |
|
item_t *l = arr + 1; |
|
item_t *r = arr + len - 1; // Inclusive! |
|
|
|
// The dl and dr arrays (backed by dl_buf and dr_buf) store deltas |
|
// from l and r of those items in the current left-hand side batch |
|
// [l,l+BATCH_LEN[ and right-hand side batch ]r-BATCH_LEN,r] that |
|
// are currently on the wrong side of the pivot. |
|
enum { BATCH_LEN = PARTITION_BATCH_LEN }; |
|
aligned_to(CACHE_LINE_SIZE) unsigned char dl_buf[BATCH_LEN]; |
|
aligned_to(CACHE_LINE_SIZE) unsigned char dr_buf[BATCH_LEN]; |
|
unsigned char *dl = NULL, *dr = NULL; |
|
unsigned n_dl = 0, n_dr = 0; |
|
|
|
while (r+1 - l >= 2*BATCH_LEN) { |
|
if (n_dl == 0) { |
|
dl = dl_buf; |
|
BATCH_COLLECT_ITEMS(dl, &n_dl, (i), BATCH_LEN, |
|
(!item_less(*(l + i), pivot)), REPEAT_8); |
|
} |
|
if (n_dr == 0) { |
|
dr = dr_buf; |
|
BATCH_COLLECT_ITEMS(dr, &n_dr, (i), BATCH_LEN, |
|
(!item_less(pivot, *(r - i))), REPEAT_8); |
|
} |
|
unsigned n = n_dl < n_dr ? n_dl : n_dr; |
|
if (n != 0) { |
|
n_dl -= n, n_dr -= n; |
|
BATCH_EXCHANGE_ITEMS(n, (l + *dl++), (r - *dr++)); |
|
} |
|
l += BATCH_LEN * (n_dl == 0); |
|
r -= BATCH_LEN * (n_dr == 0); |
|
} |
|
|
|
// Same for up to 2*BATCH_LEN-1 remaining items. If both offset buffers |
|
// are empty, scan half of the items for each. Otherwise, one of them is |
|
// not exhausted, meaning the BATCH_LEN items at the corresponding end |
|
// were already looked at, so scan only the remainder at the opposite end, |
|
// filling the other buffer. |
|
{ |
|
unsigned span = r+1 - l; |
|
unsigned span_l = |
|
n_dl != 0 ? BATCH_LEN : |
|
n_dr != 0 ? span - BATCH_LEN : |
|
span/2; |
|
unsigned span_r = span - span_l; |
|
if (n_dl == 0) { |
|
dl = dl_buf; |
|
BATCH_COLLECT_ITEMS(dl, &n_dl, (i), span_l, |
|
(!item_less(*(l + i), pivot)), REPEAT_1); |
|
} |
|
if (n_dr == 0) { |
|
dr = dr_buf; |
|
BATCH_COLLECT_ITEMS(dr, &n_dr, (i), span_r, |
|
(!item_less(pivot, *(r - i))), REPEAT_1); |
|
} |
|
unsigned n = n_dl < n_dr ? n_dl : n_dr; |
|
if (n != 0) { |
|
n_dl -= n, n_dr -= n; |
|
BATCH_EXCHANGE_ITEMS(n, (l + *dl++), (r - *dr++)); |
|
} |
|
l += span_l * (n_dl == 0); |
|
r -= span_r * (n_dr == 0); |
|
|
|
// Move any leftovers to the other end, going backwards through the buffer |
|
// to ensure we do not write over any of those items before we get to them. |
|
if (n_dl != 0) { |
|
dl += n_dl; |
|
BATCH_EXCHANGE_ITEMS(n_dl, (l + *--dl), (r--)); |
|
l = r+1; |
|
} else if (n_dr != 0) { |
|
dr += n_dr; |
|
BATCH_EXCHANGE_ITEMS(n_dr, (r - *--dr), (l++)); |
|
r = l-1; |
|
} |
|
} |
|
surely(arr <= r), surely(r < arr+len); |
|
arr[0] = *r; |
|
*r = pivot; |
|
return r - arr; |
|
} |
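// Example usage (a sketch assuming distinct items, in which case the pivot lands
// exactly at its sorted rank):
//
//     item_t a[] = {5, 2, 8, 1, 9, 3};
//     index_t p = partition(a, countof(a), 2);   // pivot value is a[2] == 8
//     // Now p == 4 and a[p] == 8; the four smaller items precede it in some
//     // order, and 9 is the only item after it.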
|
|
|
|
|
//-----------------------------------------------------------------------------* |
|
// APPROACH 1: Floyd-Rivest rank partitioning + N largest binary heap scan |
|
//-----------------------------------------------------------------------------* |
|
|
|
// approx_floor_pow_2over3_u64() approximates floor(x^(2/3)) for integer bases. |
|
// The 99th percentile of the absolute value of the relative error is less than
|
// 0.05% over the entire u64 range. |
|
// |
|
// Based on a continued fraction representation (see ch. 2, para. 2, eq. 2.6 in |
|
// Khovanskii, A. N. (1963). The Application of Continued Fractions and Their |
|
// Generalizations to Problems in Approximation Theory. Groningen: Noordhoff): |
|
// |
|
// 2/3 (x-1) |
|
// x^(2/3) - 1 = ───────────────────────────────────────────────────── |
|
// ∞ ⎡(k-2/3)/(4k-2) (x-1) (k+2/3)/(4k+2) (x-1)⎤ |
|
// 1 + K ⎢──────────────────── ────────────────────⎥ |
|
// 1 ⎣ 1 + 1 ⎦ |
|
// |
|
// 2/3 (x-1) 1/6 (x-1) 5/18 (x-1) 2/9 (x-1) 4/15 (x-1) |
|
// = ───────── ───────── ────────── ───────── ────────── |
|
// 1 + 1 + 1 + 1 + 1 + ... |
|
// |
|
// Truncating it to the first four terms and substituting 3/2 for x in the |
|
// fifth one (and 1 in all the rest), we obtain the following approximation: |
|
// |
|
// 55x^2 + 100x + 7 912x + 708 |
|
// x^(2/3) ≈ ──────────────── = 11 - ───────────────, |
|
// 5x^2 + 92x + 65 5x^2 + 92x + 65 |
|
// |
|
// which is quite accurate for 1 <= x < 2 (it underestimates slightly for
// x between 1 and ≈1.57, and overestimates increasingly as x approaches 2).
// To use it for any x, we normalize by the highest 2^k <= x:
|
// |
|
// (2^k * (x/2^k))^(2/3) = (2^(2/3))^k * (x/2^k)^(2/3), 1 <= x/2^k < 2; |
|
// |
|
// where the values of (2^(2/3))^k for all k = 0,...,63 are precomputed. |
|
// |
|
// We implement this using fixed-point arithmetic with 16 fractional bits. |
|
// |
|
// For the final multiplication by (2^(2/3))^k, we have no fewer than |
|
// 64 - P - ⌈log2((2^64-1)^(2/3))⌉ = 64 - P - 43 = 5 (for P = 16) |
|
// bits to spare until overflow. We precompute the exponents (2^(2/3))^k to |
|
// 4 fractional bits, which is comfortably enough, as their fractional parts |
|
// only matter for small powers k, yet when k is small, truncation of the |
|
// final product to an integer eliminates the need for high precision. |
|
static inline uint64_t |
|
approx_floor_pow_2over3_u64(uint64_t x) { |
|
enum { PREC = 16 }; // Fixed-point precision. |
|
#define I(n) ((uint64_t)(n) << (1*(PREC))) |
|
#define II(n) ((uint64_t)(n) << (2*(PREC))) |
|
#define III(n) ((uint64_t)(n) << (3*(PREC))) |
|
|
|
// Precomputed (2^(2/3))^(63-i) for i = 0,...,63. |
|
enum { PREC_POW2_2OVER3_TIMES_63MINUS = 4 }; // Table precision. |
|
static const uint64_t pow2_2over3_times_63minus[] = { |
|
#define X(v) (uint64_t)((v)*(1ull<<PREC_POW2_2OVER3_TIMES_63MINUS) + 0.5), |
|
X(4398046511104.) X(2770595688878.) X(1745365914583.) X(1099511627776.) |
|
X(692648922219.6) X(436341478645.7) X(274877906944.0) X(173162230554.9) |
|
X(109085369661.4) X(68719476736.00) X(43290557638.72) X(27271342415.36) |
|
X(17179869184.00) X(10822639409.68) X(6817835603.839) X(4294967296.000) |
|
X(2705659852.420) X(1704458900.960) X(1073741824.000) X(676414963.1051) |
|
X(426114725.2400) X(268435456.0000) X(169103740.7763) X(106528681.3100) |
|
X(67108864.00000) X(42275935.19407) X(26632170.32750) X(16777216.00000) |
|
X(10568983.79852) X(6658042.581874) X(4194304.000000) X(2642245.949629) |
|
X(1664510.645469) X(1048576.000000) X(660561.4874073) X(416127.6613672) |
|
X(262144.0000000) X(165140.3718518) X(104031.9153418) X(65536.00000000) |
|
X(41285.09296296) X(26007.97883545) X(16384.00000000) X(10321.27324074) |
|
X(6501.994708862) X(4096.000000000) X(2580.318310185) X(1625.498677215) |
|
X(1024.000000000) X(645.0795775462) X(406.3746693039) X(256.0000000000) |
|
X(161.2698943865) X(101.5936673260) X(64.00000000000) X(40.31747359664) |
|
X(25.39841683149) X(16.00000000000) X(10.07936839916) X(6.349604207873) |
|
X(4.000000000000) X(2.519842099790) X(1.587401051968) X(1.000000000000) |
|
#undef X |
|
}; apriori(countof(pow2_2over3_times_63minus) == 64); |
|
|
|
if unlikely(x == 0) { return 0; } |
|
unsigned l = leading_zeros_u64(x); |
|
x = (x << l) >> (63 - PREC); |
|
surely(I(1) <= x && x < I(2)); |
|
|
|
uint64_t y = I(11) - ((II(912*x) + III(708)) / (5*x*x + I(92*x) + II(65))); |
|
return (y * pow2_2over3_times_63minus[l]) >> (PREC + PREC_POW2_2OVER3_TIMES_63MINUS); |
|
|
|
#undef I |
|
#undef II |
|
#undef III |
|
} |
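// Sanity-check example: for x = 2^30 the normalized fraction is exactly 1, where
// the rational approximation is exact, and the precomputed factor (2^(2/3))^30 is
// the integer 2^20, so approx_floor_pow_2over3_u64(1ull << 30) returns exactly
// 1048576 (= 2^20). Inputs that are not powers of eight are only approximate,
// subject to the error characterization stated above.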
|
// approx_log_fixp16_u64() approximates ln(x) for integer arguments. It's |
|
// accurate to at least 3 digits after the decimal point over the entire u64 |
|
// range. The return value is in the 16.16 fixed-point format. |
|
// |
|
// Based on a continued fraction representation (see ch. 2, para. 4, eq. 4.5 in |
|
// Khovanskii, A. N. (1963). The Application of Continued Fractions and Their |
|
// Generalizations to Problems in Approximation Theory. Groningen: Noordhoff): |
|
// |
|
// 2(x-1) (x-1)^2 4(x-1)^2 9(x-1)^2 k^2 (x-1)^2 |
|
// ln(x) = ────── ─────── ──────── ──────── ─────────── |
|
// (x+1) + 3(x+1) + 5(x+1) + 7(x+1) + ... + (2k+1)(x+1) + ... |
|
// |
|
// Truncating it to the first two terms (by substituting 1 for x in the third |
|
// one onward), we obtain the following approximation: |
|
// |
|
// 3(x^2 - 1) 3(x^2 + 1) - 6 |
|
// ln(x) ≈ ──────────── = ──────────────, |
|
// x^2 + 4x + 1 (x^2 + 1) + 4x |
|
// |
|
// which is decently accurate for 1 <= x < 2 (the farther from 1, the more it |
|
// underestimates). To use it for any x, we normalize by the highest 2^k <= x: |
|
// |
|
// ln(2^k * (x/2^k)) = k*ln(2) + ln(x/2^k), 1 <= x/2^k < 2. |
|
// |
|
// We implement this using fixed-point arithmetic with 16 fractional bits. |
|
static inline uint32_t |
|
approx_log_fixp16_u64(uint64_t x) { |
|
enum { PREC = 16 }; // Fixed-point precision. |
|
#define I(n) ((uint64_t)(n) << (1*(PREC))) |
|
#define II(n) ((uint64_t)(n) << (2*(PREC))) |
|
|
|
// Precomputed ln(2)*(63-i) for i = 0,...,63. |
|
static const uint32_t ln2_times_63minus[] = { |
|
#define X(i) (uint32_t)(I(63-(i))*0.69314718056 + 0.5), |
|
X( 0)X( 1)X( 2)X( 3)X( 4)X( 5)X( 6)X( 7)X( 8)X( 9)X(10)X(11)X(12) |
|
X(13)X(14)X(15)X(16)X(17)X(18)X(19)X(20)X(21)X(22)X(23)X(24)X(25) |
|
X(26)X(27)X(28)X(29)X(30)X(31)X(32)X(33)X(34)X(35)X(36)X(37)X(38) |
|
X(39)X(40)X(41)X(42)X(43)X(44)X(45)X(46)X(47)X(48)X(49)X(50)X(51) |
|
X(52)X(53)X(54)X(55)X(56)X(57)X(58)X(59)X(60)X(61)X(62)X(63) |
|
#undef X |
|
}; apriori(countof(ln2_times_63minus) == 64); |
|
|
|
if unlikely(x == 0) { return 0; } |
|
unsigned l = leading_zeros_u64(x); |
|
x = (x << l) >> (63 - PREC); |
|
surely(I(1) <= x && x < I(2)); |
|
|
|
uint64_t y = x*x + II(1); |
|
y = I(3*y - II(6)) / (y + I(4*x)); |
|
return (uint32_t)y + ln2_times_63minus[l]; |
|
|
|
#undef I |
|
#undef II |
|
} |
|
// approx_floor_sqrt_u64() approximates floor(sqrt(x)) for integer arguments. |
|
// The 99th percentile of the absolute value of the relative error is less than
|
// 0.05% over the entire u64 range. |
|
// |
|
// Based on a continued fraction representation (see ch. 2, para. 2, eq. 2.6 in |
|
// Khovanskii, A. N. (1963). The Application of Continued Fractions and Their |
|
// Generalizations to Problems in Approximation Theory. Groningen: Noordhoff): |
|
// |
|
// 1/2 (x-1) |
|
// sqrt(x) - 1 = ───────────────────────────────────────────────────── |
|
// ∞ ⎡(k-1/2)/(4k-2) (x-1) (k+1/2)/(4k+2) (x-1)⎤ |
|
// 1 + K ⎢──────────────────── ────────────────────⎥ |
|
// 1 ⎣ 1 + 1 ⎦ |
|
// |
|
// 1/2 (x-1) (x-1)/2 (x-1)/4 (x-1)/4 (x-1)/4 |
|
// = ─────────────────── = ─────── ─────── ─────── ─────── |
|
// ∞ ⎡1/4 (x-1)⎤ 1 + 1 + 1 + 1 + ... |
|
// 1 + K ⎢─────────⎥ |
|
// 1 ⎣ 1 ⎦ |
|
// |
|
// Truncating it to the first four terms (i.e., substituting 1 for x in the |
|
// fifth and all the subsequent ones), we obtain the following (lower-bound) |
|
// approximation: |
|
// |
|
// 5x^2 + 10x + 1 4(x^2 - 1) |
|
// sqrt(x) ≈ ────────────── = 1 + ───────────────────, |
|
// x^2 + 10x + 5 (x^2 - 1) + 10x + 6 |
|
// |
|
// which is decently accurate for 1 <= x < 2 (it underestimates by less than |
|
// 1e-4 for x between 1 and ≈1.7, and by up to ≈4.2e-4 for larger values of |
|
// x approaching 2). To use it for any x, we normalize by the highest 2^k <= x: |
|
// |
|
// sqrt(2^k * (x/2^k)) = sqrt(2)^k * sqrt(x/2^k), 1 <= x/2^k < 2; |
|
// |
|
// where, trivially, sqrt(2)^(2l) = 2^l and sqrt(2)^(2l+1) = 2^l * sqrt(2). |
|
// |
|
// We implement this using fixed-point arithmetic with 16 fractional bits. |
|
static inline uint64_t |
|
approx_floor_sqrt_u64(uint64_t x) { |
|
enum { PREC = 16 }; // Fixed-point precision. |
|
#define I(n) ((uint64_t)(n) << (1*(PREC))) |
|
#define II(n) ((uint64_t)(n) << (2*(PREC))) |
|
|
|
// Precomputed sqrt(2-i)/2 for i = 0, 1. |
|
const uint32_t half_sqrt_2minus[] = {(I(1)/2)*1.4142135624 + 0.5, I(1)/2}; |
|
apriori(countof(half_sqrt_2minus) == 2); |
|
|
|
if unlikely(x == 0) { return 0; } |
|
unsigned l = leading_zeros_u64(x); |
|
x = (x << l) >> (63 - PREC); |
|
surely(I(1) <= x && x < I(2)); |
|
|
|
uint64_t y = x*x - II(1); |
|
y = I(1) + (I(4*y) / (y + I(10*x) + II(6))); |
|
if (PREC == 16) { |
|
return (y * half_sqrt_2minus[l&1]) >> (l>>1); |
|
} else { |
|
return ((y << (31^(l>>1))) * half_sqrt_2minus[l&1]) >> (2*PREC-1); |
|
} |
|
|
|
#undef I |
|
#undef II |
|
} |
|
|
|
// rank_partition() uses the Floyd-Rivest Select algorithm to partition array |
|
// elements in [0,len[ around the pivot_rank-th smallest value in it (counting |
|
// from zero), which hence ends up at the index pivot_rank afterwards. |
|
// |
|
// The RANK_PARTITION_SAMPLING_CUTOFF_LEN for recursive sampling should not |
|
// be too small in order to limit the sampling costs and keep the recursion |
|
// tree shallow. Let s(n,0) = n and s(n,k) = s(n,k-1)^(2/3) * c for some |
|
// constant 0 < c <= 1. When s(n,k) <= CUTOFF_LEN < s(n,k-1), there will |
|
// be at most k levels of recursion for all arrays of n elements or fewer. |
|
// The guiding milestones for 32-bit and 48-bit lengths are as follows: |
|
// |
|
// s(2^32, 1) = s(2^48, 2) / c^0.667 < 2,642,246 * c, |
|
// s(2^32, 2) = s(2^48, 3) / c^0.444 < 19,113 * c^1.667, |
|
// s(2^32, 3) = s(2^48, 4) / c^0.296 < 715 * c^2.111, |
|
// s(2^32, 4) = s(2^48, 5) / c^0.198 < 80 * c^2.407, |
|
// s(2^32, 5) = s(2^48, 6) / c^0.132 < 19 * c^2.605. |
|
// |
|
// In our implementation here, the scaling constant c is defined by the ratio |
|
// RANK_PARTITION_SAMPLE_LEN_MUL/RANK_PARTITION_SAMPLE_LEN_DIV which can thus |
|
// be tweaked via the constants section below. |
|
// |
|
// For the underlying sampling technique and its asymptotic properties, see: |
|
// - Floyd, R. W., Rivest, R. L. (1975). Expected time bounds for selection. |
|
// Communications of the ACM, 18(3), 165-172. |
|
// - Kiwiel, K. C. (2005). On Floyd and Rivest's SELECT algorithm. |
|
// Theoretical Computer Science, 347(1-2), 214-238. |
|
// |
|
// Computing sample bounds using our own fixed-point approximations of ln(n) and |
|
// floor(n^(2/3)) is roughly 3 to 4 times faster than any equivalent computation |
|
// relying on the standard floating-point log() and exp() (or cbrt()) functions |
|
// (even with 32-bit floats). Of course, it's not a big contributor towards the |
|
// total time here, but it's a minor improvement nonetheless. We also use our |
|
// own approximation of floor(sqrt(n)) even though the floating-point CPU |
|
// instruction the standard sqrt() call boils down to is still a bit faster, |
|
// because it allows us to completely relieve rank_partition() of the libm
|
// linking dependency (or compiler-dependent __builtin_sqrt() shenanigans) and |
|
// keep all the math within the function integer-only. |
|
// |
|
// We rely on the presence of the uint128_t type for fractional scaling in the |
|
// 112.16 fixed-point format during the sample_part_rank computation in order |
|
// to guarantee that no overflows can happen for all possible array lengths. |
|
// The risk is present, depending on the constants, for lengths starting from |
|
// 2^26 or so. Although this can be avoided for shorter arrays, it's generally |
|
// not worth it even for those, as the extra cost of a couple of u128*u64 |
|
// multiplications and a u128/u64 division per recursive call (as opposed |
|
// to the u64-only ones) is practically negligible here. |
|
enum { |
|
RANK_PARTITION_CUTOFF_LEN = 4, |
|
RANK_PARTITION_MEDIAN3_CUTOFF_LEN = 120, |
|
RANK_PARTITION_SAMPLING_CUTOFF_LEN = 3000, |
|
RANK_PARTITION_SAMPLE_LEN_MUL = 7, |
|
RANK_PARTITION_SAMPLE_LEN_DIV = 32, |
|
RANK_PARTITION_GAP_LEN_MUL = RANK_PARTITION_SAMPLE_LEN_MUL, |
|
RANK_PARTITION_GAP_LEN_DIV = RANK_PARTITION_SAMPLE_LEN_DIV, |
|
}; |
|
apriori(0 < RANK_PARTITION_CUTOFF_LEN); |
|
apriori(RANK_PARTITION_CUTOFF_LEN < RANK_PARTITION_MEDIAN3_CUTOFF_LEN); |
|
apriori(RANK_PARTITION_MEDIAN3_CUTOFF_LEN < RANK_PARTITION_SAMPLING_CUTOFF_LEN); |
|
static inline_callees void |
|
rank_partition_rec(index_rng_t *rng, item_t *arr, index_t len, index_t part_rank) { |
|
#define scale_sample_len(n) \ |
|
(((n) * RANK_PARTITION_SAMPLE_LEN_MUL) / RANK_PARTITION_SAMPLE_LEN_DIV) |
|
#define scale_gap_len(n) \ |
|
(((n) * RANK_PARTITION_GAP_LEN_MUL) / RANK_PARTITION_GAP_LEN_DIV) |
|
|
|
index_t l = 0, r = len; |
|
while (l + RANK_PARTITION_CUTOFF_LEN < r) { |
|
index_t n = r - l, pivot_idx = -1; |
|
// Choose and, via pivot_idx, point at the item to be used as the pivot |
|
// for partitioning the current subarray indexed by the [l,r[ interval. |
|
if (n > RANK_PARTITION_SAMPLING_CUTOFF_LEN) { |
|
index_t sample_len = scale_sample_len(approx_floor_pow_2over3_u64(n)); |
|
// Safety for invalid or poorly chosen scaling constants. |
|
if unlikely(sample_len > n/2) { sample_len = n/2; } |
|
if unlikely(sample_len < 1) { sample_len = 1; } |
|
|
|
// Randomly sample items from [l,r[ and move them to [l,l+sample_len[ |
|
// at the front (muscling the rest out to [l+sample_len,r[). |
|
{ |
|
item_t *front = arr + l; |
|
index_t chunk_len = n/sample_len; |
|
item_t *p, *chunk = front + index_rand(rng, n%sample_len + 1); |
|
BATCH_EXCHANGE_ITEMS(sample_len, (front++), |
|
(p = chunk + index_rand(rng, chunk_len), chunk += chunk_len, p)); |
|
} |
|
|
|
// Pick the rank of the item in the sample that will serve as the pivot. |
|
index_t sample_part_rank = -1; |
|
{ |
|
uint32_t log_n = approx_log_fixp16_u64(n); |
|
uint64_t gap_sq = ((uint128_t)log_n * (n - sample_len) * sample_len / n) >> 16; |
|
index_t gap = scale_gap_len(approx_floor_sqrt_u64(gap_sq)); |
|
// Safety for invalid or poorly chosen scaling constants. |
|
if unlikely(gap < 0) { gap = 0; } |
|
if unlikely(gap > sample_len/2) { gap = sample_len/2; } |
|
|
|
sample_part_rank = (uint64_t)((uint128_t)(part_rank - l) * sample_len / n); |
|
if fickle(sample_part_rank < sample_len/2) { |
|
sample_part_rank += gap; |
|
} else { |
|
sample_part_rank -= gap; |
|
} |
|
} |
|
|
|
// Partition the sample with the sole purpose of finding the item whose |
|
// rank in the sample is sample_part_rank, so that we can use it as the |
|
// pivot to then efficaciously partition all items in [l,r[ altogether. |
|
rank_partition_rec(rng, arr+l, sample_len, sample_part_rank); |
|
pivot_idx = sample_part_rank; |
|
} else if (n > RANK_PARTITION_MEDIAN3_CUTOFF_LEN) { |
|
index_t i[3]; |
|
index_rand_3(i, rng, n); |
|
pivot_idx = median_of_3_idx(arr+l, i); |
|
} else { |
|
pivot_idx = index_rand(rng, n); |
|
} |
|
|
|
pivot_idx = l + partition(arr+l, n, pivot_idx); |
|
surely(l <= pivot_idx), surely(pivot_idx < r); |
|
if fickle(pivot_idx <= part_rank) { l = pivot_idx + 1; } |
|
if fickle(part_rank <= pivot_idx) { r = pivot_idx; } |
|
} |
|
if (l < r) { insertion_sort(arr+l, r-l); } |
|
|
|
#undef scale_sample_len |
|
#undef scale_gap_len |
|
} |
|
static inline void |
|
rank_partition(item_t *arr, index_t len, index_t pivot_rank) { |
|
surely(len >= 0), surely(0 <= pivot_rank), surely(pivot_rank < len); |
|
if unlikely(len <= 1) { return; } |
|
|
|
index_rng_t rng = index_rng_seeded_from(arr, len); |
|
rank_partition_rec(&rng, arr, len, pivot_rank); |
|
} |
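// Example usage (a sketch, not part of the benchmark): selecting the median.
//
//     rank_partition(a, len, len/2);
//     // a[len/2] now holds the element of rank len/2 (the upper median for even
//     // len); no earlier item compares greater, and no later item compares less.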
|
|
|
// Binary heap implementation. Supports both min- and max-heaps, as well as |
|
// both left-to-right and right-to-left storage layout in memory. The unifying |
|
// moniker of "lighter" is used for the heap property predicate: If item X is |
|
// lighter than item Y, it means that X cannot be a child of Y in the heap |
|
// (i.e., the former metaphorically wants to bubble up above the latter). |
|
#define heap_lighter(x, y) ((min_heap) ? item_less((x), (y)) : item_less((y), (x))) |
|
#define heap_at(i) (*((stored_backwards) ? (root)-(i) : (root)+(i))) |
|
static inline void |
|
heap_fix_down(int min_heap, int stored_backwards, item_t *root, index_t node, index_t len) { |
|
surely(len > 0), surely(0 <= node), surely(node < len); |
|
item_t x = heap_at(node); |
|
for (index_t child; child = (node<<1)|1, child < len; node = child) { |
|
child += (likely(child+1 != len) && heap_lighter(heap_at(child+1), heap_at(child))); |
|
if fickle(!heap_lighter(heap_at(child), x)) { break; } |
|
heap_at(node) = heap_at(child); |
|
} |
|
heap_at(node) = x; |
|
} |
|
#undef heap_lighter |
|
#undef heap_at |
|
static inline_callees void |
|
heap_build(int min_heap, int stored_backwards, item_t *root, index_t len) { |
|
surely(len >= 0); |
|
for (index_t i = len/2; i != 0; i--) { |
|
heap_fix_down(min_heap, stored_backwards, root, i-1, len); |
|
} |
|
} |
|
#define lr_max_heap_fix_down(root, node, len) (heap_fix_down(0,0, (root), (node), (len))) |
|
#define rl_min_heap_fix_down(root, node, len) (heap_fix_down(1,1, (root), (node), (len))) |
|
#define lr_max_heap_build(root, len) (heap_build(0,0, (root), (len))) |
|
#define rl_min_heap_build(root, len) (heap_build(1,1, (root), (len))) |
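// For the "rl" (stored_backwards) variants, heap node i lives at root[-i]: a
// right-to-left min-heap built over arr[lo..hi-1] with root == arr+hi-1 keeps
// its minimum at arr[hi-1], with the children of node i at root[-(2*i+1)] and
// root[-(2*i+2)]. The "lr" variants use the usual root[+i] layout.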
|
|
|
// rank_partitioned_heapsort() partially heap-sorts array elements in [0,len[ |
|
// so that those whose ranks in the overall sorted sequence belong to [lo,hi[ |
|
// take their correct positions, in order; those displaced into indices [0,lo[ |
|
// and [hi,len[ are unordered. |
|
// |
|
// The implementation bifurcates based on whether the requested sorting interval |
|
// is closer to the beginning or the end of the array. In the former case, we: |
|
// |
|
// L1. Partition the array around the element of rank hi-1 (i.e., the
// (hi-1)-th smallest, counting from zero) as the pivot.
|
// L2. Build a right-to-left min-heap out of elements in [lo,hi[ so that |
|
// the root node holding the minimum value is stored at index hi-1. |
|
// L3. For each element in [0,lo[, if it exceeds the current heap minimum,
// exchange it with that minimum and fix the heap.
|
// L4. Fill [lo,hi[ going left to right with successively removed heap |
|
// minimums, simultaneously shortening (and maintaining) the heap. |
|
// |
|
// Otherwise, if the interval is closer to the end of the array, we: |
|
// |
|
// R1. Partition the array around the element of rank lo (i.e., the lo-th
// smallest, counting from zero) as the pivot.
|
// R2. Build a left-to-right max-heap out of elements in [lo,hi[ so that |
|
// the root node holding the maximum value is stored at index lo. |
|
// R3. For each element in [hi,len[, if it is smaller than the current heap
// maximum, exchange it with that maximum and fix the heap.
|
// R4. Fill [lo,hi[ going right to left with successively removed heap |
|
// maximums, simultaneously shortening (and maintaining) the heap. |
|
// |
|
// In order to reduce the number of branch mispredictions that are bound to |
|
// happen in step L3/R3, similarly to partition(), we scan elements in chunks |
|
// of HEAPSORT_BATCH_LEN at a time, and in each of them buffer those that are |
|
// heavier than the current heap root (i.e., larger in case of L3 and smaller |
|
// in case of R3) in a branchless way first, and then perform the necessary |
|
// unpredictable comparisons (and the associated heap root displacements) only |
|
// with the buffered elements. For nondegenerate input sequences, this avoids
// a significant portion of unpredictable branches, improving performance.
|
enum { HEAPSORT_BATCH_LEN = 32 }; |
|
apriori(HEAPSORT_BATCH_LEN - 1 <= int_type_max(unsigned char)); |
|
apriori(HEAPSORT_BATCH_LEN % 8 == 0); |
|
static void |
|
rank_partitioned_heapsort(item_t *arr, index_t len, index_t lo, index_t hi) { |
|
surely(len >= 0), surely(0 <= lo), surely(lo <= hi), surely(hi <= len); |
|
if unlikely(len <= 1 || lo >= hi) { return; } |
|
|
|
#define BATCH_DISPLACE_HEAP_ROOT_BY_HEAVIER_ITEMS(heap_root, heap_len, \ |
|
count, next_ptr_expr, heap_lighter, heap_fix_down) do { \ |
|
item_t *dub(root) = (heap_root); \ |
|
index_t dub(len) = (heap_len); \ |
|
for (unsigned dub(n) = (count); dub(n) != 0; dub(n)--) { \ |
|
item_t *dub(x) = (next_ptr_expr); \ |
|
if fickle(!heap_lighter(*dub(root), *dub(x))) { continue; } \ |
|
swap(*dub(root), *dub(x)); \ |
|
heap_fix_down(dub(root), 0, dub(len)); \ |
|
} \ |
|
} while (0) |
|
#define item_greater(x, y) (item_less((y), (x))) |
|
|
|
enum { BATCH_LEN = HEAPSORT_BATCH_LEN }; |
|
aligned_to(CACHE_LINE_SIZE) unsigned char dl_buf[BATCH_LEN]; |
|
|
|
if (lo < len-hi) { |
|
rank_partition(arr, len, hi-1); |
|
|
|
index_t heap_len = hi - lo; |
|
item_t *heap_root = arr + hi - 1; |
|
rl_min_heap_build(heap_root, heap_len); |
|
|
|
item_t *l = arr + 0, *end = arr + lo; |
|
BATCH_DISPLACE_HEAP_ROOT_BY_HEAVIER_ITEMS(heap_root, heap_len, |
|
(end - l) % BATCH_LEN, (l++), item_less, rl_min_heap_fix_down); |
|
for (; l != end; l += BATCH_LEN) { |
|
unsigned char *dl = dl_buf; |
|
unsigned n_dl = 0; |
|
BATCH_COLLECT_ITEMS(dl, &n_dl, (i), BATCH_LEN, |
|
(item_less(*heap_root, l[i])), REPEAT_8); |
|
BATCH_DISPLACE_HEAP_ROOT_BY_HEAVIER_ITEMS(heap_root, heap_len, |
|
n_dl, (l + *dl++), item_less, rl_min_heap_fix_down); |
|
} |
|
|
|
for (index_t i = lo; i != hi-1; i++) { |
|
swap(arr[i], *heap_root); |
|
rl_min_heap_fix_down(heap_root, 0, --heap_len); |
|
} |
|
} else { |
|
if likely(lo != 0) { rank_partition(arr, len, lo); } |
|
|
|
index_t heap_len = hi - lo; |
|
item_t *heap_root = arr + lo; |
|
lr_max_heap_build(heap_root, heap_len); |
|
|
|
item_t *l = arr + hi, *end = arr + len; |
|
BATCH_DISPLACE_HEAP_ROOT_BY_HEAVIER_ITEMS(heap_root, heap_len, |
|
(end - l) % BATCH_LEN, (l++), item_greater, lr_max_heap_fix_down); |
|
for (; l != end; l += BATCH_LEN) { |
|
unsigned char *dl = dl_buf; |
|
unsigned n_dl = 0; |
|
BATCH_COLLECT_ITEMS(dl, &n_dl, (i), BATCH_LEN, |
|
(item_less(l[i], *heap_root)), REPEAT_8); |
|
BATCH_DISPLACE_HEAP_ROOT_BY_HEAVIER_ITEMS(heap_root, heap_len, |
|
n_dl, (l + *dl++), item_greater, lr_max_heap_fix_down); |
|
} |
|
|
|
for (index_t i = hi-1; i != lo; i--) { |
|
swap(arr[i], *heap_root); |
|
lr_max_heap_fix_down(heap_root, 0, --heap_len); |
|
} |
|
} |
|
|
|
#undef BATCH_DISPLACE_HEAP_ROOT_BY_HEAVIER_ITEMS |
|
#undef item_greater |
|
} |
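// Example usage (a sketch, not part of the benchmark): the ten smallest items in
// ascending order at the front, or the ten largest in ascending order at the back.
//
//     rank_partitioned_heapsort(a, len, 0, 10);         // a[0..9] sorted
//     rank_partitioned_heapsort(a, len, len-10, len);   // a[len-10..len-1] sorted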
|
|
|
#undef REPEAT_1 |
|
#undef REPEAT_8 |
|
#undef BATCH_COLLECT_ITEMS |
|
#undef BATCH_EXCHANGE_ITEMS |
|
|
|
|
|
//-----------------------------------------------------------------------------* |
|
// APPROACH 2: One-off partial quicksort (no state shared between calls) |
|
//-----------------------------------------------------------------------------* |
|
|
|
enum { |
|
QUICKSORT_HARD_CUTOFF_LEN = 4, |
|
QUICKSORT_SOFT_CUTOFF_LEN = 16, |
|
QUICKSORT_PSEUDO_MEDIAN_DEPTH = 4, |
|
QUICKSORT_PSEUDO_MEDIAN_CUTOFF_LEN = 120, |
|
}; |
|
apriori(QUICKSORT_HARD_CUTOFF_LEN >= 3); // For easy median-of-three selection. |
|
apriori(QUICKSORT_PSEUDO_MEDIAN_CUTOFF_LEN + 1 >= 3*3); // Same, but in thirds. |
|
|
|
// pow3_pseudo_median_idx() implements the power-of-three pseudo-median |
|
// selection. It recursively descends to divide an interval into (at most) |
|
// 3^depth chunks, finds the median of three randomly selected items from each, |
|
// and reduces those to a single value by taking the median of every triple
|
// of medians (of medians...) while ascending. |
|
static inline_callees index_t |
|
pow3_pseudo_median_idx_rec(index_rng_t *rng, item_t *arr, index_t len, unsigned rec_lim) { |
|
surely(len >= 3); |
|
index_t i[3]; |
|
if (rec_lim-- == 0 || len <= QUICKSORT_PSEUDO_MEDIAN_CUTOFF_LEN) { |
|
index_rand_3(i, rng, len); |
|
} else { |
|
index_t n = len/3; |
|
i[0] = 0*n + pow3_pseudo_median_idx_rec(rng, arr+0*n, n, rec_lim); |
|
i[1] = 1*n + pow3_pseudo_median_idx_rec(rng, arr+1*n, n, rec_lim); |
|
i[2] = 2*n + pow3_pseudo_median_idx_rec(rng, arr+2*n, len-2*n, rec_lim); |
|
} |
|
return median_of_3_idx(arr, i); |
|
} |
|
static inline index_t |
|
pow3_pseudo_median_idx(index_rng_t *rng, item_t *arr, index_t len) { |
|
index_t i = pow3_pseudo_median_idx_rec(rng, arr, len, QUICKSORT_PSEUDO_MEDIAN_DEPTH); |
|
surely(0 <= i), surely(i < len); |
|
return i; |
|
} |
|
static must_not_inline void |
|
partial_quicksort_rec_bottom(item_t *arr, index_t lo, index_t hi, index_t l, index_t r) { |
|
if (lo < l) { lo = l; } |
|
if (hi > r) { hi = r; } |
|
// Although rank partitioning performed by the partial heapsort routine is |
|
// also (stochastically) recursive, it is guaranteed to have a fixed upper |
|
// bound on its stack usage, so it's fine to use it as a failsafe here. |
|
rank_partitioned_heapsort(arr+l, r-l, lo-l, hi-l); |
|
} |
|
// partial_quicksort_rand() partially quicksorts array elements in [0,len[ so |
|
// that those whose ranks in the overall sorted sequence belong to [lo,hi[ take |
|
// their correct positions in the array. |
|
static inline_callees void |
|
partial_quicksort_rand_rec(index_rng_t *rng, item_t *arr, index_t lo, index_t hi, |
|
index_t l, index_t r, unsigned rec_lim) { |
|
if unlikely(rec_lim-- == 0) { |
|
partial_quicksort_rec_bottom(arr, lo, hi, l, r); |
|
return; |
|
} |
|
for (;;) { |
|
surely(l+1 < r); |
|
if (r-l <= QUICKSORT_HARD_CUTOFF_LEN || |
|
(r-l <= QUICKSORT_SOFT_CUTOFF_LEN && lo <= l && r <= hi)) { |
|
insertion_sort(arr+l, r-l); |
|
return; |
|
} |
|
index_t pivot_idx = pow3_pseudo_median_idx(rng, arr+l, r-l); |
|
index_t r_left = l + partition(arr+l, r-l, pivot_idx); |
|
surely(l <= r_left), surely(r_left < r); |
|
index_t l_right = r_left + 1; |
|
unsigned go_left = (lo < r_left && r_left-l > 1); |
|
unsigned go_right = (l_right < hi && r-l_right > 1); |
|
|
|
switch fickle((go_left<<1) | go_right) { |
|
case 0: { return; } break; |
|
case 1: dive_right: { l = l_right; } break; |
|
case 2: dive_left: { r = r_left; } break; |
|
case 3: { |
|
if (r_left-l <= r-l_right) { |
|
partial_quicksort_rand_rec(rng, arr, lo, hi, l, r_left, rec_lim); |
|
goto dive_right; |
|
} else { |
|
partial_quicksort_rand_rec(rng, arr, lo, hi, l_right, r, rec_lim); |
|
goto dive_left; |
|
} |
|
} break; |
|
default: { unreachable(); } break; |
|
} |
|
} |
|
} |
|
static inline void |
|
partial_quicksort_rand(item_t *arr, index_t len, index_t lo, index_t hi) { |
|
surely(len >= 0), surely(0 <= lo), surely(lo <= hi), surely(hi <= len); |
|
if unlikely(len <= 1 || lo >= hi) { return; } |
|
|
|
index_rng_t rng = index_rng_seeded_from(arr, len); |
|
partial_quicksort_rand_rec(&rng, arr, lo, hi, 0, len, 5*(bit_width(len)-1)/4); |
|
} |
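// Example usage (a sketch, not part of the benchmark): placing the items of ranks
// 100..199 at indices 100..199, in order, without fully sorting the rest.
//
//     partial_quicksort_rand(a, len, 100, 200);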
|
|
|
|
|
//-----------------------------------------------------------------------------* |
|
// APPROACH 3: Incremental quicksort (pivot indices are shared between calls) |
|
//-----------------------------------------------------------------------------* |
|
|
|
#define INC_QUICKSORT_CHAOS_MARK ((index_t)-1) |
|
#define INC_QUICKSORT_SORTED_MARK ((index_t)-2) |
|
static must_not_inline void |
|
inc_quicksort_rec_bottom(item_t *arr, index_t lo, index_t hi, |
|
index_t l, index_t r, index_t *part_idx) { |
|
if (lo < l) { lo = l; } |
|
if (hi > r) { hi = r; } |
|
if (lo == l && r == hi) { |
|
*part_idx = INC_QUICKSORT_SORTED_MARK; |
|
} else { |
|
// Once the items with ranks [lo,hi[ are put in place, any of them can serve |
|
// as the partition point of the [l,r[ interval. Pick the one closest to the |
|
// middle, unless the sorted subinterval is far off to either side, in which
// case mark the interval as unpartitioned until better luck comes around.
|
index_t half = (r - l)/2, mid = l + half; |
|
*part_idx = |
|
mid < lo ? (lo < r - half/4 ? lo : INC_QUICKSORT_CHAOS_MARK) : |
|
hi <= mid ? (l + half/4 < hi ? hi-1 : INC_QUICKSORT_CHAOS_MARK) : |
|
mid; |
|
} |
|
// Items in [l,r[ will be reordered now; reset all descendant partitions: |
|
{apriori(INC_QUICKSORT_CHAOS_MARK == (index_t)-1); |
|
memset(part_idx+1, 0xFF, (r-l-1)*sizeof(*part_idx));} |
|
|
|
// Although rank partitioning performed by the partial heapsort routine is |
|
// also (stochastically) recursive, it is guaranteed to have a fixed upper |
|
// bound on its stack usage, so it's fine to use it as a failsafe here. |
|
rank_partitioned_heapsort(arr+l, r-l, lo-l, hi-l); |
|
} |
|
// inc_quicksort_rand() partially quicksorts array elements in [0,len[ so that |
|
// those whose ranks in the overall sorted sequence belong to [lo,hi[ end up at |
|
// their correct positions, taking into account the history of partitions from |
|
// past invocations and making new ones only in the previously unpartitioned |
|
// intervals when necessary. The function either leaves the array as it is or |
|
// brings it monotonically closer to its fully sorted state with each call |
|
// (hence the "incremental" moniker). |
|
// |
|
// The pivot indices of past partitions are stored in the part_idx array in |
|
// the order of preorder traversal of the quicksort tree. The special index |
|
// values INC_QUICKSORT_SORTED_MARK and INC_QUICKSORT_CHAOS_MARK signify that |
|
// an interval is fully sorted or has not been partitioned yet, respectively. |
|
// Each part_idx must initially be set to INC_QUICKSORT_CHAOS_MARK. |
|
static inline_callees void |
|
inc_quicksort_rand_rec(index_rng_t *rng, item_t *arr, index_t lo, index_t hi, |
|
index_t l, index_t r, index_t *part_idx, unsigned rec_lim) { |
|
if unlikely(rec_lim-- == 0) { |
|
inc_quicksort_rec_bottom(arr, lo, hi, l, r, part_idx); |
|
return; |
|
} |
|
for (;;) { |
|
surely(l+1 < r); |
|
int will_be_sorted = (lo <= l && r <= hi); |
|
if (r-l <= QUICKSORT_HARD_CUTOFF_LEN || |
|
(r-l <= QUICKSORT_SOFT_CUTOFF_LEN && will_be_sorted)) { |
|
*part_idx = INC_QUICKSORT_SORTED_MARK; |
|
insertion_sort(arr+l, r-l); |
|
return; |
|
} |
|
index_t r_left = *part_idx; |
|
if (r_left == INC_QUICKSORT_CHAOS_MARK) { |
|
index_t pivot_idx = pow3_pseudo_median_idx(rng, arr+l, r-l); |
|
r_left = l + partition(arr+l, r-l, pivot_idx); |
|
} |
|
surely(l <= r_left), surely(r_left < r); |
|
*part_idx = will_be_sorted ? INC_QUICKSORT_SORTED_MARK : r_left; |
|
index_t l_right = r_left + 1; |
|
|
|
unsigned go_left = (lo < r_left && r_left-l > 1); |
|
unsigned go_right = (l_right < hi && r-l_right > 1); |
|
index_t *part_idx_left = part_idx + 1; |
|
index_t *part_idx_right = part_idx_left + (r_left - l); |
|
go_left = (go_left && *part_idx_left != INC_QUICKSORT_SORTED_MARK); |
|
go_right = (go_right && *part_idx_right != INC_QUICKSORT_SORTED_MARK); |
|
|
|
switch fickle((go_left<<1) | go_right) { |
|
case 0: { return; } break; |
|
case 1: dive_right: { l = l_right, part_idx = part_idx_right; } break; |
|
case 2: dive_left: { r = r_left, part_idx = part_idx_left; } break; |
|
case 3: { |
|
if (r_left-l <= r-l_right) { |
|
inc_quicksort_rand_rec(rng, arr, lo, hi, l, r_left, part_idx_left, rec_lim); |
|
goto dive_right; |
|
} else { |
|
inc_quicksort_rand_rec(rng, arr, lo, hi, l_right, r, part_idx_right, rec_lim); |
|
goto dive_left; |
|
} |
|
} break; |
|
default: { unreachable(); } break; |
|
} |
|
} |
|
} |
|
static inline void |
|
inc_quicksort_rand(item_t *arr, index_t *part_idxs, index_t len, index_t lo, index_t hi) { |
|
surely(len >= 0), surely(0 <= lo), surely(lo <= hi), surely(hi <= len); |
|
if unlikely(len <= 1 || lo >= hi || *part_idxs == INC_QUICKSORT_SORTED_MARK) { return; } |
|
|
|
index_rng_t rng = index_rng_seeded_from(arr, len); |
|
inc_quicksort_rand_rec(&rng, arr, lo, hi, 0, len, part_idxs, 5*(bit_width(len)-1)/4); |
|
} |
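// Example usage (a sketch, not part of the benchmark; "marks" is a hypothetical
// caller-owned buffer of len entries, initialized to INC_QUICKSORT_CHAOS_MARK the
// same way main() below does it):
//
//     index_t *marks = malloc(len * sizeof(marks[0]));
//     memset(marks, 0xFF, len * sizeof(marks[0]));   // all INC_QUICKSORT_CHAOS_MARK
//     inc_quicksort_rand(a, marks, len, 0, 10);      // partitions [0,len[ as needed
//     inc_quicksort_rand(a, marks, len, 500, 600);   // reuses pivots found earlier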
|
|
|
|
|
//-----------------------------------------------------------------------------* |
|
// BENCHMARKS |
|
//-----------------------------------------------------------------------------* |
|
|
|
#include <stdio.h> // printf() |
|
#include <stdlib.h> // calloc(), free() |
|
#include <time.h> // clock_gettime() |
|
|
|
static must_inline uint64_t |
|
nanosecs(void) { |
|
struct timespec t; |
|
clock_gettime(CLOCK_MONOTONIC_RAW, &t); |
|
return (uint64_t)t.tv_sec*1000*1000*1000 + (uint64_t)t.tv_nsec; |
|
} |
|
|
|
// Integral type for unique item identifiers. Unnecessary when the item_t type |
|
// itself is integral; left up to customization for composite item_t types. |
|
typedef item_t item_key_t; |
|
#define item_key(item) ((item_key_t)(item)) |
|
#define item_with_key(key) ((item_t)(key)) |
|
|
|
static inline void |
|
shuffle_items(pcg32_t *r, item_t *arr, uint32_t n) { |
|
for (uint32_t i = 0; i < n; i++) { |
|
swap(arr[i], arr[onto_range_u32(pcg32_next(r), i, n)]); |
|
} |
|
} |
|
|
|
static inline_callees void |
|
call_partial_quicksort_rand(item_t *arr, index_t *part_idxs, |
|
index_t len, index_t lo, index_t hi) { |
|
(void)part_idxs; |
|
partial_quicksort_rand(arr, len, lo, hi); |
|
} |
|
static inline_callees void |
|
call_rank_partitioned_heapsort(item_t *arr, index_t *part_idxs, |
|
index_t len, index_t lo, index_t hi) { |
|
(void)part_idxs; |
|
rank_partitioned_heapsort(arr, len, lo, hi); |
|
} |
|
|
|
extern int |
|
main(void) { |
|
enum { |
|
RANDOMIZE = 0, |
|
VERBOSE = 0, |
|
}; |
|
typedef void partial_sort_test_func_t(item_t *, index_t *, index_t, index_t, index_t); |
|
static const struct test_func { |
|
partial_sort_test_func_t *func; |
|
const char *name; |
|
} funcs[] = { |
|
#define X(p,f) {glue(p,f), #f} |
|
X(,inc_quicksort_rand), |
|
X(call_,partial_quicksort_rand), |
|
X(call_,rank_partitioned_heapsort), |
|
#undef X |
|
}; enum { N_FUNCS = countof(funcs) }; |
|
typedef struct test_params test_params_t; |
|
static const struct test_params { |
|
int n_runs; |
|
int n_sorts; |
|
index_t arr_len; |
|
index_t min_sort_len; |
|
index_t max_sort_len; |
|
} tests[] = { |
|
{30000, 1, 2000, 1, 2000}, |
|
{ 8000, 1, 100*1000, 50, 500}, |
|
{ 300, 100, 100*1000, 50, 1000}, |
|
{ 5000, 1, 1000*1000, 50, 500}, |
|
{ 5000, 1, 1000*1000, 100, 2500}, |
|
{ 40, 100, 1000*1000, 50, 500}, |
|
{ 40, 100, 1000*1000, 100, 2500}, |
|
{ 25, 300, 1000*1000, 50, 500}, |
|
{ 25, 300, 1000*1000, 100, 2500}, |
|
{ 25, 50, 10*1000*1000, 500, 5000}, |
|
{ 5, 500, 10*1000*1000, 500, 5000}, |
|
{ 5, 100, 100*1000*1000, 1000, 10000}, |
|
}; enum { N_TESTS = countof(tests) }; |
|
|
|
index_t max_arr_len = 0; |
|
for (int i = 0; i < N_TESTS; i++) { |
|
if (max_arr_len < tests[i].arr_len) { |
|
max_arr_len = tests[i].arr_len; |
|
} |
|
} |
|
item_t *arr = calloc(max_arr_len, sizeof(arr[0])); |
|
index_t *part_idxs = calloc(max_arr_len, sizeof(part_idxs[0])); |
|
surely(arr), surely(part_idxs); |
|
if (!arr || !part_idxs) { return 1; } |
|
|
|
const uint64_t rng_seed = RANDOMIZE ? nanosecs() : 809; |
|
|
|
for (int i_func = 0; i_func < N_FUNCS; i_func++) { |
|
partial_sort_test_func_t *const partial_sort = funcs[i_func].func; |
|
printf("\nFunction %s\n", funcs[i_func].name); |
|
|
|
pcg32_t rng = pcg32(rng_seed); |
|
|
|
uint64_t all_tests_dt = 0; |
|
for (int i_test = 0; i_test < N_TESTS; i_test++) { |
|
const test_params_t T = tests[i_test]; |
|
if (VERBOSE) { |
|
printf("Test %3d. Parameters: %d x %d, %td, %td..%td\n", |
|
i_test+1, T.n_runs, T.n_sorts, (ptrdiff_t)T.arr_len, |
|
(ptrdiff_t)T.min_sort_len, (ptrdiff_t)T.max_sort_len); |
|
} |
|
|
|
uint64_t test_dt = 0; |
|
for (int i_run = 0; i_run < T.n_runs; i_run++) { |
|
// Initialize the array to a random permutation (of its indices). |
|
for (index_t i = 0; i < T.arr_len; i++) { |
|
arr[i] = item_with_key((item_key_t)i); |
|
} |
|
shuffle_items(&rng, arr, T.arr_len); |
|
// Set pivot indices to INC_QUICKSORT_CHAOS_MARK for inc_quicksort_rand(): |
|
{apriori(INC_QUICKSORT_CHAOS_MARK == (index_t)-1); |
|
memset(part_idxs, 0xFF, T.arr_len*sizeof(part_idxs[0]));} |
|
|
|
uint64_t run_dt = 0; |
|
for (int i_sort = 0; i_sort < T.n_sorts; i_sort++) { |
|
index_t span = onto_range_u32(pcg32_next(&rng), T.min_sort_len, T.max_sort_len+1); |
|
index_t lo = onto_range_u32(pcg32_next(&rng), 0, T.arr_len-span+1); |
|
index_t hi = lo + span; |
|
|
|
uint64_t t0 = nanosecs(); |
|
(*partial_sort)(arr, part_idxs, T.arr_len, lo, hi); |
|
uint64_t dt = nanosecs() - t0; |
|
run_dt += dt; |
|
|
|
int ok = (1); |
|
index_t first_diff = -1; |
|
for (index_t i = lo; i < hi; i++) { |
|
if (item_key(arr[i]) != (item_key_t)i) { |
|
ok = (0), first_diff = i; |
|
break; |
|
} |
|
} |
|
if (VERBOSE || !ok) { |
|
printf("Test %3d, run %3d, sort %3d. [%12td %12td] (%6td): %4s %.9f\n", |
|
i_test+1, i_run+1, i_sort+1, (ptrdiff_t)lo, (ptrdiff_t)hi-1, (ptrdiff_t)span, |
|
ok ? "OK" : "FAIL", dt*1e-9/span); |
|
} |
|
if (!ok) { |
|
enum { BEFORE = 5, AFTER = 10 }; |
|
index_t l = first_diff-lo >= BEFORE ? first_diff-BEFORE : lo; |
|
index_t r = hi-first_diff >= AFTER+1 ? first_diff+AFTER+1 : hi; |
|
printf(" [%td..%td]:", (ptrdiff_t)l, (ptrdiff_t)r-1); |
|
for (index_t i = l; i < r; i++) { |
|
if (int_type_is_signed(item_key_t)) { |
|
printf(" %jd", (intmax_t)item_key(arr[i])); |
|
} else { |
|
printf(" %ju", (uintmax_t)item_key(arr[i])); |
|
} |
|
} |
|
printf("\n"); |
|
} |
|
} |
|
test_dt += run_dt; |
|
|
|
if (VERBOSE) { |
|
printf("Test %3d, run %3d. Elapsed: %9.6f\n", |
|
i_test+1, i_run+1, run_dt*1e-9); |
|
} |
|
} |
|
all_tests_dt += test_dt; |
|
|
|
printf("Test %3d. Elapsed: %7.3f total, %9.6f per run\n", |
|
i_test+1, test_dt*1e-9, test_dt*1e-9/T.n_runs); |
|
fflush(stdout); |
|
} |
|
|
|
printf("Total test time: %7.3f\n", all_tests_dt*1e-9); |
|
fflush(stdout); |
|
} |
|
|
|
free(arr), arr = NULL; |
|
free(part_idxs), part_idxs = NULL; |
|
|
|
return 0; |
|
} |