@igorburago · Last active November 12, 2025
Fast partial sorting of an array on a rank subinterval
// Copyright 2025 Igor Burago. Released under the ISC License.
#include <limits.h> // CHAR_BIT
#include <stddef.h> // ptrdiff_t
#include <stdint.h>
#include <string.h> // memcpy(), memmove(), memset()
// Built-in u128 is not strictly required, but it makes it easier to guarantee
// the absence of overflows during sampling in rank_partition() for all possible
// input array lengths. It also simplifies the implementation of the mcg64 RNG.
typedef unsigned __int128 uint128_t;
#if !defined(__has_attribute)
#define __has_attribute(x) (0)
#endif
#if !defined(__has_builtin)
#define __has_builtin(x) (0)
#endif
#if __has_attribute(always_inline)
#define must_inline inline __attribute__((always_inline))
#else
#define must_inline inline
#endif
#if __has_attribute(noinline)
#define must_not_inline __attribute__((noinline))
#else
#define must_not_inline
#endif
#if __has_attribute(flatten)
#define inline_callees __attribute__((flatten))
#else
#define inline_callees
#endif
#if __has_builtin(__builtin_unpredictable)
#define fickle(x) (__builtin_unpredictable((x)))
#else
#define fickle(x) (x)
#endif
#if __has_builtin(__builtin_expect)
#define usually(x, c) (__builtin_expect((x), (c)))
#else
#define usually(x, c) (x)
#endif
#define likely(x) (usually(!!(x), 1))
#define unlikely(x) (usually(!!(x), 0))
#define glue_raw(x, y) x##y
#define glue(x, y) glue_raw(x, y)
#define dub(name) glue(name, glue(glue(__dub_L,__LINE__),__))
#define apriori(x) void *(*compile_time_check_dummy__(void))[ \
sizeof(struct { unsigned compile_time_check : (x) ? 1 : -1; })]
// #define DEBUG_BUILD
#if defined(DEBUG_BUILD)
#include <stdio.h>
#if __has_builtin(__builtin_trap)
#define terminate_abnormally() (__builtin_trap())
#else
#define terminate_abnormally() ((void)(*(volatile int *)0 = 0))
#endif
#define surely(x) (likely(x) ? (void)0 : \
(fprintf(stderr, "FATAL: %s:%d: surely(%s) does not hold!\n", __FILE__, __LINE__, #x), \
fflush(NULL), \
terminate_abnormally()))
#define unreachable() surely(0 && "This code path must never be reached.")
#else
#if __has_builtin(__builtin_unreachable)
#define unreachable() (__builtin_unreachable())
#else
static must_inline void unreachable(void) { for (;;) { *(volatile int *)0 = 0; } }
#endif
#define surely(x) (likely(x) ? (void)0 : unreachable())
#endif
#if __STDC_VERSION__ >= 202311L
#define aligned_to(n) alignas(+(n))
#elif __STDC_VERSION__ >= 201112L
#define aligned_to(n) _Alignas(+(n))
#elif __has_attribute(aligned)
#define aligned_to(n) __attribute__((aligned(n)))
#else
#error "aligned_to(n) specifier cannot be defined."
#endif
// For aligning hot buffers in partition() and rank_partitioned_heapsort():
enum { CACHE_LINE_SIZE = 64 };
#define countof(arr) (sizeof(arr) / sizeof((arr)[0]))
#define is_pow2(x) ((x) > 0 && ((x) & ((x)-1)) == 0)
#define int_type_is_signed(type) ((type)-1 < (type)0)
#define int_type_max(type) (!int_type_is_signed(type) ? (type)-1 : \
(type)((((type)1 << (CHAR_BIT*(int)sizeof(type) - 2)) - 1)*2 + 1))
#define leading_zeros_u32(x) (__builtin_clzg((uint32_t)(x), 32))
#define leading_zeros_u64(x) (__builtin_clzg((uint64_t)(x), 64))
#if defined(__SIZEOF_INT128__)
#define leading_zeros_u128(x) (__builtin_clzg((uint128_t)(x), 128))
#define bit_width(x) ( \
sizeof(x) <= sizeof(uint32_t) ? 32 - leading_zeros_u32(x) : \
sizeof(x) <= sizeof(uint64_t) ? 64 - leading_zeros_u64(x) : \
128 - leading_zeros_u128(x))
#else
#define bit_width(x) (sizeof(x) <= sizeof(uint32_t) \
? 32 - leading_zeros_u32(x) : 64 - leading_zeros_u64(x))
#endif
#define swap(x, y) do { \
void *dub(xp) = &(x), *dub(yp) = &(y); \
apriori(sizeof(x) == sizeof(y)); \
unsigned char dub(tmp)[sizeof(x)]; \
memcpy(dub(tmp), dub(xp), sizeof(dub(tmp))); \
memmove(dub(xp), dub(yp), sizeof(dub(tmp))); \
memcpy(dub(yp), dub(tmp), sizeof(dub(tmp))); \
} while (0)
static must_inline void *
align_ptr_down(void *p, uintptr_t align) {
surely(is_pow2(align));
return (void *)((uintptr_t)p & -align);
}
static must_inline void *
align_ptr_up(void *p, uintptr_t align) {
surely(is_pow2(align));
return (void *)(((uintptr_t)p + align-1) & -align);
}
static must_inline uint64_t
load_u64(void *p) {
uint64_t x = 0;
memcpy(&x, p, sizeof(x));
return x;
}
static must_inline uint64_t
seed_mix_u64(uint64_t seed, uint64_t x) {
x += seed * 1122334455667788991u;
return x ^ (x >> 32);
}
static must_inline uint64_t
splitmix64_next(uint64_t *s) {
uint64_t x = *s += 0x9E3779B97F4A7C15u;
x = (x ^ (x>>30)) * 0xBF58476D1CE4E5B9u;
x = (x ^ (x>>27)) * 0x94D049BB133111EBu;
return x ^ (x>>31);
}
typedef struct mcg64 mcg64_t;
struct mcg64 { uint128_t s; };
static must_inline mcg64_t
mcg64(uint64_t seed) {
uint64_t a0 = splitmix64_next(&seed);
uint64_t a1 = splitmix64_next(&seed);
mcg64_t r = {0};
r.s = ((uint128_t)a1 << 64) | a0;
return r;
}
static must_inline uint64_t
mcg64_next(mcg64_t *r) {
return (r->s *= 0xDA942042E4DD58B5u) >> 64;
}
typedef struct pcg32 pcg32_t;
struct pcg32 { uint64_t s, t; };
static must_inline pcg32_t
pcg32(uint64_t seed) {
pcg32_t r = {0};
r.s = splitmix64_next(&seed);
r.t = splitmix64_next(&seed) | 1;
return r;
}
static must_inline uint32_t
pcg32_next(pcg32_t *r) {
uint64_t s = r->s;
r->s = r->s*0x5851F42D4C957F2Du + r->t;
uint32_t x = (s ^ (s>>18)) >> 27, k = s>>59;
return (x >> k) | (x << (-k & 31));
}
// Fast and only slightly biased for (end-beg) approaching u32/u64 max.
static must_inline uint32_t
onto_range_u32(uint32_t x, uint32_t beg, uint32_t end) {
surely(beg < end);
x = ((uint64_t)x * (end - beg)) >> 32;
return beg + x;
}
static must_inline uint64_t
onto_range_u64(uint64_t x, uint64_t beg, uint64_t end) {
surely(beg < end);
x = ((uint128_t)x * (end - beg)) >> 64;
return beg + x;
}
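// Worked example (illustrative): onto_range_u32(x, 3, 13) evaluates
// 3 + (((uint64_t)x * 10) >> 32), i.e., it scales the 32-bit fraction x/2^32
// into [0,10) and shifts it by 3. The bias comes from 2^32 not being a
// multiple of the range length; it only becomes noticeable when (end-beg)
// approaches the u32 maximum.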
//-----------------------------------------------------------------------------*
// ARRAY VALUE AND INDEX TYPES
//-----------------------------------------------------------------------------*
typedef int item_t;
#define item_less(x, y) ((x) < (y))
// Both signed and unsigned types are supported:
typedef uint32_t index_t;
// #define INDEX_RNG_USING_MCG
#if defined(INDEX_RNG_USING_MCG)
// MCG is faster than LCG or PCG, and is usually good enough for quicksort.
typedef mcg64_t index_rng_t;
#define index_rng(seed) (mcg64(seed))
#define index_rand(rng, len) (onto_range_u64(mcg64_next(rng), 0, (len)))
#else
typedef pcg32_t index_rng_t;
#define index_rng(seed) (pcg32(seed))
#define index_rand(rng, len) (onto_range_u32(pcg32_next(rng), 0, (len)))
#endif
// An index_rng() wrapper for seeding a random index generator from an array.
enum { INDEX_RNG_SEED_SAMPLE_COUNT = 128 };
static index_rng_t
index_rng_seeded_from(item_t *arr, index_t len) {
// Take advantage of the randomness added to addresses on modern systems
// due to position-independent code, address space layout randomization,
// and random stack gap insertion:
uint64_t seed = -1;
seed = seed_mix_u64(seed, (uintptr_t)&memset);
seed = seed_mix_u64(seed, (uintptr_t)&index_rng_seeded_from);
seed = seed_mix_u64(seed, (uintptr_t)&seed);
seed = seed_mix_u64(seed, (uintptr_t)arr);
index_rng_t rng = index_rng(seed);
// Then, mix in some bits from the array itself by breaking it into 64-bit
// aligned chunks and randomly choosing one u64 value out of each using the
// seed obtained so far. Since we use index_rand() returning index_t values
// to sample from chunks, index_t must be wide enough to hold any index up
// to the maximum possible chunk length:
enum { N_CHUNKS = INDEX_RNG_SEED_SAMPLE_COUNT };
apriori(int_type_max(index_t) >=
(int_type_max(index_t) * sizeof(item_t)) / (N_CHUNKS * sizeof(uint64_t)));
uint64_t *beg = align_ptr_up(arr, sizeof(uint64_t));
uint64_t *end = align_ptr_down(arr+len, sizeof(uint64_t));
if unlikely(beg >= end) { return rng; }
size_t span = end - beg;
if unlikely(span < N_CHUNKS) {
uint64_t *p = beg + index_rand(&rng, span);
seed = seed_mix_u64(seed, load_u64(p));
return index_rng(seed);
}
size_t chunk_len = span/N_CHUNKS;
beg += index_rand(&rng, span%N_CHUNKS + 1);
end = beg + chunk_len*N_CHUNKS;
for (uint64_t *chunk = beg; chunk != end; chunk += chunk_len) {
uint64_t *p = chunk + index_rand(&rng, chunk_len);
seed = seed_mix_u64(seed, load_u64(p));
}
return index_rng(seed);
}
static must_inline void
index_rand_3(index_t i[static 3], index_rng_t *rng, index_t len) {
surely(len >= 3);
i[0] = index_rand(rng, len-0);
i[1] = index_rand(rng, len-1);
i[2] = index_rand(rng, len-2);
unsigned i1_past_i0 = (i[1] >= i[0]);
i[1] += i1_past_i0; // Count i[1]-th item, jumping over i[0]-th.
i[2] += (i[2] >= i[!i1_past_i0]); // Count i[2]-th, skipping min(i[0],i[1])-th...
i[2] += (i[2] >= i[i1_past_i0]); // ...and then max(i[0],i[1])-th.
}
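// Example trace (illustrative): for len = 10, suppose the raw draws are
// i[0] = 4 from [0,10), i[1] = 4 from [0,9), and i[2] = 7 from [0,8). Since
// i[1] >= i[0], i[1] becomes 5; then i[2] is bumped past min(i[0],i[1]) = 4
// and past max(i[0],i[1]) = 5, going 7 -> 8 -> 9, which yields the distinct
// triple {4, 5, 9}.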
static must_inline index_t
median_of_3_idx(item_t *arr, index_t i[static 3]) {
unsigned gt01 = !!item_less(arr[i[1]], arr[i[0]]);
unsigned lt02 = !!item_less(arr[i[0]], arr[i[2]]);
unsigned lt12 = !!item_less(arr[i[1]], arr[i[2]]);
return i[(gt01 ^ lt02) + (lt02 ^ lt12)];
}
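// Why the index arithmetic above picks the median (illustrative check): with
// (a, b, c) = (arr[i[0]], arr[i[1]], arr[i[2]]), we have gt01 = (b < a),
// lt02 = (a < c), lt12 = (b < c). For a < b < c this gives (0^1) + (1^1) = 1,
// selecting i[1]; for c < b < a it gives (1^0) + (0^0) = 1 as well; the
// remaining orderings similarly land on the index of the middle value.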
static void
insertion_sort(item_t *arr, index_t len) {
surely(len >= 0);
for (index_t i = (len != 0); i != len; i++) {
if fickle(!item_less(arr[i], arr[i-1])) { continue; }
item_t x = arr[i];
index_t j = i;
do { arr[j] = arr[j-1]; } while (--j != 0 && fickle(item_less(x, arr[j-1])));
arr[j] = x;
}
}
//-----------------------------------------------------------------------------*
// KEY PREREQUISITE: Fast array partitioning
//-----------------------------------------------------------------------------*
enum { PARTITION_BATCH_LEN = 256 };
apriori(PARTITION_BATCH_LEN - 1 <= int_type_max(unsigned char)); // For storing offsets in uchar.
apriori(PARTITION_BATCH_LEN % 8 == 0); // For loop unrolling.
#define REPEAT_1(s) {s;}
#define REPEAT_8(s) {{s;}{s;}{s;}{s;}{s;}{s;}{s;}{s;}}
#define BATCH_COLLECT_ITEMS(offsets_buf, n_offsets_ptr, \
idx_var, batch_len, match_expr, unroll_macro) do { \
unsigned char *dub(buf) = (offsets_buf); \
unsigned dub(n) = 0, (idx_var) = 0, dub(end) = (batch_len); \
while ((idx_var) != dub(end)) unroll_macro({ \
dub(buf)[dub(n)] = (idx_var); \
dub(n) += !!(match_expr); \
(idx_var)++; \
}) \
*(n_offsets_ptr) = dub(n); \
} while (0)
#define BATCH_EXCHANGE_ITEMS(count, next_l_ptr_expr, next_r_ptr_expr) do { \
unsigned dub(n) = (count); \
if (dub(n) == 0) { break; } \
item_t *dub(l) = (next_l_ptr_expr), dub(x) = *dub(l); \
item_t *dub(r) = (next_r_ptr_expr); *dub(l) = *dub(r); \
while (--dub(n) != 0) { \
dub(l) = (next_l_ptr_expr); *dub(r) = *dub(l); \
dub(r) = (next_r_ptr_expr); *dub(l) = *dub(r); \
} \
*dub(r) = dub(x); \
} while (0)
// partition() partitions array elements in [0,len[ using the item currently
// stored at pivot_idx for the pivot and returns the index it ends up at after
// partitioning is done.
//
// This is the normal Hoare partitioning, except that item swaps are done in
// batches instead of one by one in order to eliminate branch mispredictions.
// Items from each end of the current interval are compared with the pivot in
// chunks of PARTITION_BATCH_LEN items, and the offsets of those to be exchanged
// are first buffered in a loop with a fixed iteration count (see BATCH_COLLECT_ITEMS()).
// Pairs of items with indices taken from the two buffers are then swapped until
// one or both of the buffers are emptied, at which point the corresponding end
// of the interval is advanced, and the emptied buffer is refilled by scanning
// the next batch of items.
//
// To reduce the number of item moves induced by the swapping loop (from 3 to
// 2 per iteration), a cyclic permutation is used in place of pairwise swaps,
// i.e., instead of straightforward exchanges l(i) <-> r(i) for i = 0,...,n-1,
// we effectively left rotate the following interleaved sequence by one:
// x <- l(0) <- r(0) <- l(1) <- r(1) <- l(2) <- r(2) <- ...
// ... <- l(n-2) <- r(n-2) <- l(n-1) <- r(n-1) <- x
// (see BATCH_EXCHANGE_ITEMS()).
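// Worked example (illustrative): for n = 2 misplaced pairs at offsets
// dl = {dl0, dl1} and dr = {dr0, dr1}, BATCH_EXCHANGE_ITEMS() performs
//     x <- *(l+dl0); *(l+dl0) <- *(r-dr0); *(r-dr0) <- *(l+dl1);
//     *(l+dl1) <- *(r-dr1); *(r-dr1) <- x;
// that is, 2n+1 = 5 moves, whereas two pairwise swaps would take 3n = 6.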
static index_t
partition(item_t *arr, index_t len, index_t pivot_idx) {
surely(len >= 0), surely(0 <= pivot_idx), surely(pivot_idx < len);
if unlikely(len <= 1) { return 0; }
item_t pivot = arr[pivot_idx];
arr[pivot_idx] = arr[0];
item_t *l = arr + 1;
item_t *r = arr + len - 1; // Inclusive!
// The dl and dr arrays (backed by dl_buf and dr_buf) store deltas
// from l and r of those items in the current left-hand side batch
// [l,l+BATCH_LEN[ and right-hand side batch ]r-BATCH_LEN,r] that
// are currently on the wrong side of the pivot.
enum { BATCH_LEN = PARTITION_BATCH_LEN };
aligned_to(CACHE_LINE_SIZE) unsigned char dl_buf[BATCH_LEN];
aligned_to(CACHE_LINE_SIZE) unsigned char dr_buf[BATCH_LEN];
unsigned char *dl = NULL, *dr = NULL;
unsigned n_dl = 0, n_dr = 0;
while (r+1 - l >= 2*BATCH_LEN) {
if (n_dl == 0) {
dl = dl_buf;
BATCH_COLLECT_ITEMS(dl, &n_dl, (i), BATCH_LEN,
(!item_less(*(l + i), pivot)), REPEAT_8);
}
if (n_dr == 0) {
dr = dr_buf;
BATCH_COLLECT_ITEMS(dr, &n_dr, (i), BATCH_LEN,
(!item_less(pivot, *(r - i))), REPEAT_8);
}
unsigned n = n_dl < n_dr ? n_dl : n_dr;
if (n != 0) {
n_dl -= n, n_dr -= n;
BATCH_EXCHANGE_ITEMS(n, (l + *dl++), (r - *dr++));
}
l += BATCH_LEN * (n_dl == 0);
r -= BATCH_LEN * (n_dr == 0);
}
// Same for up to 2*BATCH_LEN-1 remaining items. If both offset buffers
// are empty, scan half of the items for each. Otherwise, one of them is
// not exhausted, meaning the BATCH_LEN items at the corresponding end
// were already looked at, so scan only the remainder at the opposite end,
// filling the other buffer.
{
unsigned span = r+1 - l;
unsigned span_l =
n_dl != 0 ? BATCH_LEN :
n_dr != 0 ? span - BATCH_LEN :
span/2;
unsigned span_r = span - span_l;
if (n_dl == 0) {
dl = dl_buf;
BATCH_COLLECT_ITEMS(dl, &n_dl, (i), span_l,
(!item_less(*(l + i), pivot)), REPEAT_1);
}
if (n_dr == 0) {
dr = dr_buf;
BATCH_COLLECT_ITEMS(dr, &n_dr, (i), span_r,
(!item_less(pivot, *(r - i))), REPEAT_1);
}
unsigned n = n_dl < n_dr ? n_dl : n_dr;
if (n != 0) {
n_dl -= n, n_dr -= n;
BATCH_EXCHANGE_ITEMS(n, (l + *dl++), (r - *dr++));
}
l += span_l * (n_dl == 0);
r -= span_r * (n_dr == 0);
// Move any leftovers to the other end, going backwards through the buffer
// to ensure we do not write over any of those items before we get to them.
if (n_dl != 0) {
dl += n_dl;
BATCH_EXCHANGE_ITEMS(n_dl, (l + *--dl), (r--));
l = r+1;
} else if (n_dr != 0) {
dr += n_dr;
BATCH_EXCHANGE_ITEMS(n_dr, (r - *--dr), (l++));
r = l-1;
}
}
surely(arr <= r), surely(r < arr+len);
arr[0] = *r;
*r = pivot;
return r - arr;
}
//-----------------------------------------------------------------------------*
// APPROACH 1: Floyd-Rivest rank partitioning + N largest binary heap scan
//-----------------------------------------------------------------------------*
// approx_floor_pow_2over3_u64() approximates floor(x^(2/3)) for integer bases.
// The 99th percentile of the absolute value of the relative error is less than
// 0.05% over the entire u64 range.
//
// Based on a continued fraction representation (see ch. 2, para. 2, eq. 2.6 in
// Khovanskii, A. N. (1963). The Application of Continued Fractions and Their
// Generalizations to Problems in Approximation Theory. Groningen: Noordhoff):
//
// 2/3 (x-1)
// x^(2/3) - 1 = ─────────────────────────────────────────────────────
// ∞ ⎡(k-2/3)/(4k-2) (x-1) (k+2/3)/(4k+2) (x-1)⎤
// 1 + K ⎢──────────────────── ────────────────────⎥
// 1 ⎣ 1 + 1 ⎦
//
// 2/3 (x-1) 1/6 (x-1) 5/18 (x-1) 2/9 (x-1) 4/15 (x-1)
// = ───────── ───────── ────────── ───────── ──────────
// 1 + 1 + 1 + 1 + 1 + ...
//
// Truncating it to the first four terms and substituting 3/2 for x in the
// fifth one (and 1 in all the rest), we obtain the following approximation:
//
// 55x^2 + 100x + 7 912x + 708
// x^(2/3) ≈ ──────────────── = 11 - ───────────────,
// 5x^2 + 92x + 65 5x^2 + 92x + 65
//
// which is quite accurate for 1 <= x < 2 (it underestimates slightly for
// x from 1 to ≈1.57, and the closer x gets to 2, the more it overestimates
// after that). To use it for any x, we normalize by the highest 2^k <= x:
//
// (2^k * (x/2^k))^(2/3) = (2^(2/3))^k * (x/2^k)^(2/3), 1 <= x/2^k < 2;
//
// where the values of (2^(2/3))^k for all k = 0,...,63 are precomputed.
//
// We implement this using fixed-point arithmetic with 16 fractional bits.
//
// For the final multiplication by (2^(2/3))^k, we have no fewer than
// 64 - P - ⌈log2((2^64-1)^(2/3))⌉ = 64 - P - 43 = 5 (for P = 16)
// bits to spare until overflow. We precompute the exponents (2^(2/3))^k to
// 4 fractional bits, which is comfortably enough, as their fractional parts
// only matter for small powers k, yet when k is small, truncation of the
// final product to an integer eliminates the need for high precision.
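// Worked example (illustrative): for x = 1,000,000, the highest power of two
// not exceeding x is 2^19, so l = 44 and the normalized argument is
// x/2^19 ≈ 1.907 in 16.16 fixed point. The rational approximation evaluates
// to ≈ 1.538 ≈ 1.907^(2/3), and multiplying by the precomputed
// (2^(2/3))^19 ≈ 6502.0 gives ≈ 10,000 = (10^6)^(2/3), well within the
// stated error bound after truncation.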
static inline uint64_t
approx_floor_pow_2over3_u64(uint64_t x) {
enum { PREC = 16 }; // Fixed-point precision.
#define I(n) ((uint64_t)(n) << (1*(PREC)))
#define II(n) ((uint64_t)(n) << (2*(PREC)))
#define III(n) ((uint64_t)(n) << (3*(PREC)))
// Precomputed (2^(2/3))^(63-i) for i = 0,...,63.
enum { PREC_POW2_2OVER3_TIMES_63MINUS = 4 }; // Table precision.
static const uint64_t pow2_2over3_times_63minus[] = {
#define X(v) (uint64_t)((v)*(1ull<<PREC_POW2_2OVER3_TIMES_63MINUS) + 0.5),
X(4398046511104.) X(2770595688878.) X(1745365914583.) X(1099511627776.)
X(692648922219.6) X(436341478645.7) X(274877906944.0) X(173162230554.9)
X(109085369661.4) X(68719476736.00) X(43290557638.72) X(27271342415.36)
X(17179869184.00) X(10822639409.68) X(6817835603.839) X(4294967296.000)
X(2705659852.420) X(1704458900.960) X(1073741824.000) X(676414963.1051)
X(426114725.2400) X(268435456.0000) X(169103740.7763) X(106528681.3100)
X(67108864.00000) X(42275935.19407) X(26632170.32750) X(16777216.00000)
X(10568983.79852) X(6658042.581874) X(4194304.000000) X(2642245.949629)
X(1664510.645469) X(1048576.000000) X(660561.4874073) X(416127.6613672)
X(262144.0000000) X(165140.3718518) X(104031.9153418) X(65536.00000000)
X(41285.09296296) X(26007.97883545) X(16384.00000000) X(10321.27324074)
X(6501.994708862) X(4096.000000000) X(2580.318310185) X(1625.498677215)
X(1024.000000000) X(645.0795775462) X(406.3746693039) X(256.0000000000)
X(161.2698943865) X(101.5936673260) X(64.00000000000) X(40.31747359664)
X(25.39841683149) X(16.00000000000) X(10.07936839916) X(6.349604207873)
X(4.000000000000) X(2.519842099790) X(1.587401051968) X(1.000000000000)
#undef X
}; apriori(countof(pow2_2over3_times_63minus) == 64);
if unlikely(x == 0) { return 0; }
unsigned l = leading_zeros_u64(x);
x = (x << l) >> (63 - PREC);
surely(I(1) <= x && x < I(2));
uint64_t y = I(11) - ((II(912*x) + III(708)) / (5*x*x + I(92*x) + II(65)));
return (y * pow2_2over3_times_63minus[l]) >> (PREC + PREC_POW2_2OVER3_TIMES_63MINUS);
#undef I
#undef II
#undef III
}
// approx_log_fixp16_u64() approximates ln(x) for integer arguments. It's
// accurate to at least 3 digits after the decimal point over the entire u64
// range. The return value is in the 16.16 fixed-point format.
//
// Based on a continued fraction representation (see ch. 2, para. 4, eq. 4.5 in
// Khovanskii, A. N. (1963). The Application of Continued Fractions and Their
// Generalizations to Problems in Approximation Theory. Groningen: Noordhoff):
//
// 2(x-1) (x-1)^2 4(x-1)^2 9(x-1)^2 k^2 (x-1)^2
// ln(x) = ────── ─────── ──────── ──────── ───────────
// (x+1) + 3(x+1) + 5(x+1) + 7(x+1) + ... + (2k+1)(x+1) + ...
//
// Truncating it to the first two terms (by substituting 1 for x in the third
// one onward), we obtain the following approximation:
//
// 3(x^2 - 1) 3(x^2 + 1) - 6
// ln(x) ≈ ──────────── = ──────────────,
// x^2 + 4x + 1 (x^2 + 1) + 4x
//
// which is decently accurate for 1 <= x < 2 (the farther from 1, the more it
// underestimates). To use it for any x, we normalize by the highest 2^k <= x:
//
// ln(2^k * (x/2^k)) = k*ln(2) + ln(x/2^k), 1 <= x/2^k < 2.
//
// We implement this using fixed-point arithmetic with 16 fractional bits.
static inline uint32_t
approx_log_fixp16_u64(uint64_t x) {
enum { PREC = 16 }; // Fixed-point precision.
#define I(n) ((uint64_t)(n) << (1*(PREC)))
#define II(n) ((uint64_t)(n) << (2*(PREC)))
// Precomputed ln(2)*(63-i) for i = 0,...,63.
static const uint32_t ln2_times_63minus[] = {
#define X(i) (uint32_t)(I(63-(i))*0.69314718056 + 0.5),
X( 0)X( 1)X( 2)X( 3)X( 4)X( 5)X( 6)X( 7)X( 8)X( 9)X(10)X(11)X(12)
X(13)X(14)X(15)X(16)X(17)X(18)X(19)X(20)X(21)X(22)X(23)X(24)X(25)
X(26)X(27)X(28)X(29)X(30)X(31)X(32)X(33)X(34)X(35)X(36)X(37)X(38)
X(39)X(40)X(41)X(42)X(43)X(44)X(45)X(46)X(47)X(48)X(49)X(50)X(51)
X(52)X(53)X(54)X(55)X(56)X(57)X(58)X(59)X(60)X(61)X(62)X(63)
#undef X
}; apriori(countof(ln2_times_63minus) == 64);
if unlikely(x == 0) { return 0; }
unsigned l = leading_zeros_u64(x);
x = (x << l) >> (63 - PREC);
surely(I(1) <= x && x < I(2));
uint64_t y = x*x + II(1);
y = I(3*y - II(6)) / (y + I(4*x));
return (uint32_t)y + ln2_times_63minus[l];
#undef I
#undef II
}
// approx_floor_sqrt_u64() approximates floor(sqrt(x)) for integer arguments.
// The 99th percentile of the absolute value of the relative error is less than
// 0.05% over the entire u64 range.
//
// Based on a continued fraction representation (see ch. 2, para. 2, eq. 2.6 in
// Khovanskii, A. N. (1963). The Application of Continued Fractions and Their
// Generalizations to Problems in Approximation Theory. Groningen: Noordhoff):
//
// 1/2 (x-1)
// sqrt(x) - 1 = ─────────────────────────────────────────────────────
// ∞ ⎡(k-1/2)/(4k-2) (x-1) (k+1/2)/(4k+2) (x-1)⎤
// 1 + K ⎢──────────────────── ────────────────────⎥
// 1 ⎣ 1 + 1 ⎦
//
// 1/2 (x-1) (x-1)/2 (x-1)/4 (x-1)/4 (x-1)/4
// = ─────────────────── = ─────── ─────── ─────── ───────
// ∞ ⎡1/4 (x-1)⎤ 1 + 1 + 1 + 1 + ...
// 1 + K ⎢─────────⎥
// 1 ⎣ 1 ⎦
//
// Truncating it to the first four terms (i.e., substituting 1 for x in the
// fifth and all the subsequent ones), we obtain the following (lower-bound)
// approximation:
//
// 5x^2 + 10x + 1 4(x^2 - 1)
// sqrt(x) ≈ ────────────── = 1 + ───────────────────,
// x^2 + 10x + 5 (x^2 - 1) + 10x + 6
//
// which is decently accurate for 1 <= x < 2 (it underestimates by less than
// 1e-4 for x between 1 and ≈1.7, and by up to ≈4.2e-4 for larger values of
// x approaching 2). To use it for any x, we normalize by the highest 2^k <= x:
//
// sqrt(2^k * (x/2^k)) = sqrt(2)^k * sqrt(x/2^k), 1 <= x/2^k < 2;
//
// where, trivially, sqrt(2)^(2l) = 2^l and sqrt(2)^(2l+1) = 2^l * sqrt(2).
//
// We implement this using fixed-point arithmetic with 16 fractional bits.
static inline uint64_t
approx_floor_sqrt_u64(uint64_t x) {
enum { PREC = 16 }; // Fixed-point precision.
#define I(n) ((uint64_t)(n) << (1*(PREC)))
#define II(n) ((uint64_t)(n) << (2*(PREC)))
// Precomputed sqrt(2-i)/2 for i = 0, 1.
const uint32_t half_sqrt_2minus[] = {(I(1)/2)*1.4142135624 + 0.5, I(1)/2};
apriori(countof(half_sqrt_2minus) == 2);
if unlikely(x == 0) { return 0; }
unsigned l = leading_zeros_u64(x);
x = (x << l) >> (63 - PREC);
surely(I(1) <= x && x < I(2));
uint64_t y = x*x - II(1);
y = I(1) + (I(4*y) / (y + I(10*x) + II(6)));
if (PREC == 16) {
return (y * half_sqrt_2minus[l&1]) >> (l>>1);
} else {
return ((y << (31^(l>>1))) * half_sqrt_2minus[l&1]) >> (2*PREC-1);
}
#undef I
#undef II
}
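// A minimal self-check sketch for the three fixed-point approximations above.
// It is illustrative only and kept out of the build, since it compares them
// against libm references, a dependency the code above deliberately avoids:
#if 0
#include <math.h>
#include <stdio.h>
static void
check_fixed_point_approximations(void) {
	for (uint64_t x = 1; x < ((uint64_t)1 << 40); x = x*3 + 1) {
		double pow23_ratio = (double)approx_floor_pow_2over3_u64(x) / pow((double)x, 2.0/3.0);
		double log_diff = (double)approx_log_fixp16_u64(x) / 65536.0 - log((double)x);
		double sqrt_ratio = (double)approx_floor_sqrt_u64(x) / sqrt((double)x);
		printf("%20llu: pow2/3 ratio %.6f, ln diff %+.6f, sqrt ratio %.6f\n",
			(unsigned long long)x, pow23_ratio, log_diff, sqrt_ratio);
	}
}
#endif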
// rank_partition() uses the Floyd-Rivest Select algorithm to partition array
// elements in [0,len[ around the pivot_rank-th smallest value in it (counting
// from zero), which hence ends up at the index pivot_rank afterwards.
//
// The RANK_PARTITION_SAMPLING_CUTOFF_LEN for recursive sampling should not
// be too small in order to limit the sampling costs and keep the recursion
// tree shallow. Let s(n,0) = n and s(n,k) = s(n,k-1)^(2/3) * c for some
// constant 0 < c <= 1. When s(n,k) <= CUTOFF_LEN < s(n,k-1), there will
// be at most k levels of recursion for all arrays of n elements or fewer.
// The guiding milestones for 32-bit and 48-bit lengths are as follows:
//
// s(2^32, 1) = s(2^48, 2) / c^0.667 < 2,642,246 * c,
// s(2^32, 2) = s(2^48, 3) / c^0.444 < 19,113 * c^1.667,
// s(2^32, 3) = s(2^48, 4) / c^0.296 < 715 * c^2.111,
// s(2^32, 4) = s(2^48, 5) / c^0.198 < 80 * c^2.407,
// s(2^32, 5) = s(2^48, 6) / c^0.132 < 19 * c^2.605.
//
// In our implementation here, the scaling constant c is defined by the ratio
// RANK_PARTITION_SAMPLE_LEN_MUL/RANK_PARTITION_SAMPLE_LEN_DIV which can thus
// be tweaked via the constants section below.
//
// For the underlying sampling technique and its asymptotic properties, see:
// - Floyd, R. W., Rivest, R. L. (1975). Expected time bounds for selection.
// Communications of the ACM, 18(3), 165-172.
// - Kiwiel, K. C. (2005). On Floyd and Rivest's SELECT algorithm.
// Theoretical Computer Science, 347(1-2), 214-238.
//
// Computing sample bounds using our own fixed-point approximations of ln(n) and
// floor(n^(2/3)) is roughly 3 to 4 times faster than any equivalent computation
// relying on the standard floating-point log() and exp() (or cbrt()) functions
// (even with 32-bit floats). Of course, it's not a big contributor towards the
// total time here, but it's a minor improvement nonetheless. We also use our
// own approximation of floor(sqrt(n)) even though the floating-point CPU
// instruction the standard sqrt() call boils down to is still a bit faster,
// because it allows us to completely relieve rank_partition() from the libm
// linking dependency (or compiler-dependent __builtin_sqrt() shenanigans) and
// keep all the math within the function integer-only.
//
// We rely on the presence of the uint128_t type for fractional scaling in the
// 112.16 fixed-point format during the sample_part_rank computation in order
// to guarantee that no overflows can happen for all possible array lengths.
// The risk is present, depending on the constants, for lengths starting from
// 2^26 or so. Although this can be avoided for shorter arrays, it's generally
// not worth it even for those, as the extra cost of a couple of u128*u64
// multiplications and a u128/u64 division per recursive call (as opposed
// to the u64-only ones) is practically negligible here.
enum {
RANK_PARTITION_CUTOFF_LEN = 4,
RANK_PARTITION_MEDIAN3_CUTOFF_LEN = 120,
RANK_PARTITION_SAMPLING_CUTOFF_LEN = 3000,
RANK_PARTITION_SAMPLE_LEN_MUL = 7,
RANK_PARTITION_SAMPLE_LEN_DIV = 32,
RANK_PARTITION_GAP_LEN_MUL = RANK_PARTITION_SAMPLE_LEN_MUL,
RANK_PARTITION_GAP_LEN_DIV = RANK_PARTITION_SAMPLE_LEN_DIV,
};
apriori(0 < RANK_PARTITION_CUTOFF_LEN);
apriori(RANK_PARTITION_CUTOFF_LEN < RANK_PARTITION_MEDIAN3_CUTOFF_LEN);
apriori(RANK_PARTITION_MEDIAN3_CUTOFF_LEN < RANK_PARTITION_SAMPLING_CUTOFF_LEN);
static inline_callees void
rank_partition_rec(index_rng_t *rng, item_t *arr, index_t len, index_t part_rank) {
#define scale_sample_len(n) \
(((n) * RANK_PARTITION_SAMPLE_LEN_MUL) / RANK_PARTITION_SAMPLE_LEN_DIV)
#define scale_gap_len(n) \
(((n) * RANK_PARTITION_GAP_LEN_MUL) / RANK_PARTITION_GAP_LEN_DIV)
index_t l = 0, r = len;
while (l + RANK_PARTITION_CUTOFF_LEN < r) {
index_t n = r - l, pivot_idx = -1;
// Choose and, via pivot_idx, point at the item to be used as the pivot
// for partitioning the current subarray indexed by the [l,r[ interval.
if (n > RANK_PARTITION_SAMPLING_CUTOFF_LEN) {
index_t sample_len = scale_sample_len(approx_floor_pow_2over3_u64(n));
// Safety for invalid or poorly chosen scaling constants.
if unlikely(sample_len > n/2) { sample_len = n/2; }
if unlikely(sample_len < 1) { sample_len = 1; }
// Randomly sample items from [l,r[ and move them to [l,l+sample_len[
// at the front (muscling the rest out to [l+sample_len,r[).
{
item_t *front = arr + l;
index_t chunk_len = n/sample_len;
item_t *p, *chunk = front + index_rand(rng, n%sample_len + 1);
BATCH_EXCHANGE_ITEMS(sample_len, (front++),
(p = chunk + index_rand(rng, chunk_len), chunk += chunk_len, p));
}
// Pick the rank of the item in the sample that will serve as the pivot.
index_t sample_part_rank = -1;
{
uint32_t log_n = approx_log_fixp16_u64(n);
uint64_t gap_sq = ((uint128_t)log_n * (n - sample_len) * sample_len / n) >> 16;
index_t gap = scale_gap_len(approx_floor_sqrt_u64(gap_sq));
// Safety for invalid or poorly chosen scaling constants.
if unlikely(gap < 0) { gap = 0; }
if unlikely(gap > sample_len/2) { gap = sample_len/2; }
sample_part_rank = (uint64_t)((uint128_t)(part_rank - l) * sample_len / n);
if fickle(sample_part_rank < sample_len/2) {
sample_part_rank += gap;
} else {
sample_part_rank -= gap;
}
}
// Partition the sample with the sole purpose of finding the item whose
// rank in the sample is sample_part_rank, so that we can use it as the
// pivot to then efficaciously partition all items in [l,r[ altogether.
rank_partition_rec(rng, arr+l, sample_len, sample_part_rank);
pivot_idx = sample_part_rank;
} else if (n > RANK_PARTITION_MEDIAN3_CUTOFF_LEN) {
index_t i[3];
index_rand_3(i, rng, n);
pivot_idx = median_of_3_idx(arr+l, i);
} else {
pivot_idx = index_rand(rng, n);
}
pivot_idx = l + partition(arr+l, n, pivot_idx);
surely(l <= pivot_idx), surely(pivot_idx < r);
if fickle(pivot_idx <= part_rank) { l = pivot_idx + 1; }
if fickle(part_rank <= pivot_idx) { r = pivot_idx; }
}
if (l < r) { insertion_sort(arr+l, r-l); }
#undef scale_sample_len
#undef scale_gap_len
}
static inline void
rank_partition(item_t *arr, index_t len, index_t pivot_rank) {
surely(len >= 0), surely(0 <= pivot_rank), surely(pivot_rank < len);
if unlikely(len <= 1) { return; }
index_rng_t rng = index_rng_seeded_from(arr, len);
rank_partition_rec(&rng, arr, len, pivot_rank);
}
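// Usage sketch (illustrative; this hypothetical helper is not used anywhere
// below): on its own, rank_partition() is a selection routine in the spirit
// of C++'s std::nth_element. Assuming 0 <= k < len, the call below returns
// the k-th smallest item (counting from zero), with every item before index
// k being no greater than it and every item after being no smaller.
static inline item_t
example_kth_smallest(item_t *arr, index_t len, index_t k) {
	rank_partition(arr, len, k);
	return arr[k];
}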
// Binary heap implementation. Supports both min- and max-heaps, as well as
// both left-to-right and right-to-left storage layout in memory. The unifying
// moniker of "lighter" is used for the heap property predicate: If item X is
// lighter than item Y, it means that X cannot be a child of Y in the heap
// (i.e., the former metaphorically wants to bubble up above the latter).
#define heap_lighter(x, y) ((min_heap) ? item_less((x), (y)) : item_less((y), (x)))
#define heap_at(i) (*((stored_backwards) ? (root)-(i) : (root)+(i)))
static inline void
heap_fix_down(int min_heap, int stored_backwards, item_t *root, index_t node, index_t len) {
surely(len > 0), surely(0 <= node), surely(node < len);
item_t x = heap_at(node);
for (index_t child; child = (node<<1)|1, child < len; node = child) {
child += (likely(child+1 != len) && heap_lighter(heap_at(child+1), heap_at(child)));
if fickle(!heap_lighter(heap_at(child), x)) { break; }
heap_at(node) = heap_at(child);
}
heap_at(node) = x;
}
#undef heap_lighter
#undef heap_at
static inline_callees void
heap_build(int min_heap, int stored_backwards, item_t *root, index_t len) {
surely(len >= 0);
for (index_t i = len/2; i != 0; i--) {
heap_fix_down(min_heap, stored_backwards, root, i-1, len);
}
}
#define lr_max_heap_fix_down(root, node, len) (heap_fix_down(0,0, (root), (node), (len)))
#define rl_min_heap_fix_down(root, node, len) (heap_fix_down(1,1, (root), (node), (len)))
#define lr_max_heap_build(root, len) (heap_build(0,0, (root), (len)))
#define rl_min_heap_build(root, len) (heap_build(1,1, (root), (len)))
// rank_partitioned_heapsort() partially heap-sorts array elements in [0,len[
// so that those whose ranks in the overall sorted sequence belong to [lo,hi[
// take their correct positions, in order; those displaced into indices [0,lo[
// and [hi,len[ are unordered.
//
// The implementation bifurcates based on whether the requested sorting interval
// is closer to the beginning or the end of the array. In the former case, we:
//
// L1. Partition the array around the (hi-1)-st smallest element as a pivot.
// L2. Build a right-to-left min-heap out of elements in [lo,hi[ so that
// the root node holding the minimum value is stored at index hi-1.
// L3. For each element in [0,lo[, if it exceeds the current heap
// minimum, exchange the former with the latter and fix the heap.
// L4. Fill [lo,hi[ going left to right with successively removed heap
// minimums, simultaneously shortening (and maintaining) the heap.
//
// Otherwise, if the interval is closer to the end of the array, we:
//
// R1. Partition the array around the lo-th smallest element as a pivot.
// R2. Build a left-to-right max-heap out of elements in [lo,hi[ so that
// the root node holding the maximum value is stored at index lo.
// R3. For each element in [hi,len[, if it is smaller than the current heap
// maximum, exchange the former with the latter and fix the heap.
// R4. Fill [lo,hi[ going right to left with successively removed heap
// maximums, simultaneously shortening (and maintaining) the heap.
//
// In order to reduce the number of branch mispredictions that are bound to
// happen in step L3/R3, similarly to partition(), we scan elements in chunks
// of HEAPSORT_BATCH_LEN at a time, and in each of them buffer those that are
// heavier than the current heap root (i.e., larger in case of L3 and smaller
// in case of R3) in a branchless way first, and then perform the necessary
// unpredictable comparisons (and the associated heap root displacements) only
// with the buffered elements. For nondegenerate input sequences, this avoids
// a significant portion of the unpredictable branches, improving performance.
enum { HEAPSORT_BATCH_LEN = 32 };
apriori(HEAPSORT_BATCH_LEN - 1 <= int_type_max(unsigned char));
apriori(HEAPSORT_BATCH_LEN % 8 == 0);
static void
rank_partitioned_heapsort(item_t *arr, index_t len, index_t lo, index_t hi) {
surely(len >= 0), surely(0 <= lo), surely(lo <= hi), surely(hi <= len);
if unlikely(len <= 1 || lo >= hi) { return; }
#define BATCH_DISPLACE_HEAP_ROOT_BY_HEAVIER_ITEMS(heap_root, heap_len, \
count, next_ptr_expr, heap_lighter, heap_fix_down) do { \
item_t *dub(root) = (heap_root); \
index_t dub(len) = (heap_len); \
for (unsigned dub(n) = (count); dub(n) != 0; dub(n)--) { \
item_t *dub(x) = (next_ptr_expr); \
if fickle(!heap_lighter(*dub(root), *dub(x))) { continue; } \
swap(*dub(root), *dub(x)); \
heap_fix_down(dub(root), 0, dub(len)); \
} \
} while (0)
#define item_greater(x, y) (item_less((y), (x)))
enum { BATCH_LEN = HEAPSORT_BATCH_LEN };
aligned_to(CACHE_LINE_SIZE) unsigned char dl_buf[BATCH_LEN];
if (lo < len-hi) {
rank_partition(arr, len, hi-1);
index_t heap_len = hi - lo;
item_t *heap_root = arr + hi - 1;
rl_min_heap_build(heap_root, heap_len);
item_t *l = arr + 0, *end = arr + lo;
BATCH_DISPLACE_HEAP_ROOT_BY_HEAVIER_ITEMS(heap_root, heap_len,
(end - l) % BATCH_LEN, (l++), item_less, rl_min_heap_fix_down);
for (; l != end; l += BATCH_LEN) {
unsigned char *dl = dl_buf;
unsigned n_dl = 0;
BATCH_COLLECT_ITEMS(dl, &n_dl, (i), BATCH_LEN,
(item_less(*heap_root, l[i])), REPEAT_8);
BATCH_DISPLACE_HEAP_ROOT_BY_HEAVIER_ITEMS(heap_root, heap_len,
n_dl, (l + *dl++), item_less, rl_min_heap_fix_down);
}
for (index_t i = lo; i != hi-1; i++) {
swap(arr[i], *heap_root);
rl_min_heap_fix_down(heap_root, 0, --heap_len);
}
} else {
if likely(lo != 0) { rank_partition(arr, len, lo); }
index_t heap_len = hi - lo;
item_t *heap_root = arr + lo;
lr_max_heap_build(heap_root, heap_len);
item_t *l = arr + hi, *end = arr + len;
BATCH_DISPLACE_HEAP_ROOT_BY_HEAVIER_ITEMS(heap_root, heap_len,
(end - l) % BATCH_LEN, (l++), item_greater, lr_max_heap_fix_down);
for (; l != end; l += BATCH_LEN) {
unsigned char *dl = dl_buf;
unsigned n_dl = 0;
BATCH_COLLECT_ITEMS(dl, &n_dl, (i), BATCH_LEN,
(item_less(l[i], *heap_root)), REPEAT_8);
BATCH_DISPLACE_HEAP_ROOT_BY_HEAVIER_ITEMS(heap_root, heap_len,
n_dl, (l + *dl++), item_greater, lr_max_heap_fix_down);
}
for (index_t i = hi-1; i != lo; i--) {
swap(arr[i], *heap_root);
lr_max_heap_fix_down(heap_root, 0, --heap_len);
}
}
#undef BATCH_DISPLACE_HEAP_ROOT_BY_HEAVIER_ITEMS
#undef item_greater
}
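// Usage sketch (illustrative; this hypothetical helper is not called by the
// benchmark below): sorting just the k smallest items of arr into ascending
// order at indices [0,k[, while the remaining items end up unordered in
// [k,len[.
static inline void
example_sort_k_smallest(item_t *arr, index_t len, index_t k) {
	if (k > len) { k = len; }
	rank_partitioned_heapsort(arr, len, 0, k);
}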
#undef REPEAT_1
#undef REPEAT_8
#undef BATCH_COLLECT_ITEMS
#undef BATCH_EXCHANGE_ITEMS
//-----------------------------------------------------------------------------*
// APPROACH 2: One-off partial quicksort (no state shared between calls)
//-----------------------------------------------------------------------------*
enum {
QUICKSORT_HARD_CUTOFF_LEN = 4,
QUICKSORT_SOFT_CUTOFF_LEN = 16,
QUICKSORT_PSEUDO_MEDIAN_DEPTH = 4,
QUICKSORT_PSEUDO_MEDIAN_CUTOFF_LEN = 120,
};
apriori(QUICKSORT_HARD_CUTOFF_LEN >= 3); // For easy median-of-three selection.
apriori(QUICKSORT_PSEUDO_MEDIAN_CUTOFF_LEN + 1 >= 3*3); // Same, but in thirds.
// pow3_pseudo_median_idx() implements the power-of-three pseudo-median
// selection. It recursively descends to divide an interval into (at most)
// 3^depth chunks, finds the median of three randomly selected items from each,
// and reduces those to a single value by keeping the median out of every triple
// of medians (of medians...) while ascending.
static inline_callees index_t
pow3_pseudo_median_idx_rec(index_rng_t *rng, item_t *arr, index_t len, unsigned rec_lim) {
surely(len >= 3);
index_t i[3];
if (rec_lim-- == 0 || len <= QUICKSORT_PSEUDO_MEDIAN_CUTOFF_LEN) {
index_rand_3(i, rng, len);
} else {
index_t n = len/3;
i[0] = 0*n + pow3_pseudo_median_idx_rec(rng, arr+0*n, n, rec_lim);
i[1] = 1*n + pow3_pseudo_median_idx_rec(rng, arr+1*n, n, rec_lim);
i[2] = 2*n + pow3_pseudo_median_idx_rec(rng, arr+2*n, len-2*n, rec_lim);
}
return median_of_3_idx(arr, i);
}
static inline index_t
pow3_pseudo_median_idx(index_rng_t *rng, item_t *arr, index_t len) {
index_t i = pow3_pseudo_median_idx_rec(rng, arr, len, QUICKSORT_PSEUDO_MEDIAN_DEPTH);
surely(0 <= i), surely(i < len);
return i;
}
static must_not_inline void
partial_quicksort_rec_bottom(item_t *arr, index_t lo, index_t hi, index_t l, index_t r) {
if (lo < l) { lo = l; }
if (hi > r) { hi = r; }
// Although rank partitioning performed by the partial heapsort routine is
// also (stochastically) recursive, it is guaranteed to have a fixed upper
// bound on its stack usage, so it's fine to use it as a failsafe here.
rank_partitioned_heapsort(arr+l, r-l, lo-l, hi-l);
}
// partial_quicksort_rand() partially quicksorts array elements in [0,len[ so
// that those whose ranks in the overall sorted sequence belong to [lo,hi[ take
// their correct positions in the array.
static inline_callees void
partial_quicksort_rand_rec(index_rng_t *rng, item_t *arr, index_t lo, index_t hi,
index_t l, index_t r, unsigned rec_lim) {
if unlikely(rec_lim-- == 0) {
partial_quicksort_rec_bottom(arr, lo, hi, l, r);
return;
}
for (;;) {
surely(l+1 < r);
if (r-l <= QUICKSORT_HARD_CUTOFF_LEN ||
(r-l <= QUICKSORT_SOFT_CUTOFF_LEN && lo <= l && r <= hi)) {
insertion_sort(arr+l, r-l);
return;
}
index_t pivot_idx = pow3_pseudo_median_idx(rng, arr+l, r-l);
index_t r_left = l + partition(arr+l, r-l, pivot_idx);
surely(l <= r_left), surely(r_left < r);
index_t l_right = r_left + 1;
unsigned go_left = (lo < r_left && r_left-l > 1);
unsigned go_right = (l_right < hi && r-l_right > 1);
switch fickle((go_left<<1) | go_right) {
case 0: { return; } break;
case 1: dive_right: { l = l_right; } break;
case 2: dive_left: { r = r_left; } break;
case 3: {
if (r_left-l <= r-l_right) {
partial_quicksort_rand_rec(rng, arr, lo, hi, l, r_left, rec_lim);
goto dive_right;
} else {
partial_quicksort_rand_rec(rng, arr, lo, hi, l_right, r, rec_lim);
goto dive_left;
}
} break;
default: { unreachable(); } break;
}
}
}
static inline void
partial_quicksort_rand(item_t *arr, index_t len, index_t lo, index_t hi) {
surely(len >= 0), surely(0 <= lo), surely(lo <= hi), surely(hi <= len);
if unlikely(len <= 1 || lo >= hi) { return; }
index_rng_t rng = index_rng_seeded_from(arr, len);
partial_quicksort_rand_rec(&rng, arr, lo, hi, 0, len, 5*(bit_width(len)-1)/4);
}
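// Usage sketch (illustrative; this hypothetical helper is not used anywhere
// below): selecting the median through a single-element rank interval.
// Assumes len >= 1; for even len, this yields the upper of the two middle
// items.
static inline item_t
example_median_via_partial_quicksort(item_t *arr, index_t len) {
	index_t mid = len/2;
	partial_quicksort_rand(arr, len, mid, mid+1);
	return arr[mid];
}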
//-----------------------------------------------------------------------------*
// APPROACH 3: Incremental quicksort (pivot indices are shared between calls)
//-----------------------------------------------------------------------------*
#define INC_QUICKSORT_CHAOS_MARK ((index_t)-1)
#define INC_QUICKSORT_SORTED_MARK ((index_t)-2)
static must_not_inline void
inc_quicksort_rec_bottom(item_t *arr, index_t lo, index_t hi,
index_t l, index_t r, index_t *part_idx) {
if (lo < l) { lo = l; }
if (hi > r) { hi = r; }
if (lo == l && r == hi) {
*part_idx = INC_QUICKSORT_SORTED_MARK;
} else {
// Once the items with ranks [lo,hi[ are put in place, any of them can serve
// as the partition point of the [l,r[ interval. Pick the one closest to the
// middle, unless the sorted subinterval is far away to either side, in which
// case the interval is marked as unpartitioned until better luck comes around.
index_t half = (r - l)/2, mid = l + half;
*part_idx =
mid < lo ? (lo < r - half/4 ? lo : INC_QUICKSORT_CHAOS_MARK) :
hi <= mid ? (l + half/4 < hi ? hi-1 : INC_QUICKSORT_CHAOS_MARK) :
mid;
}
// Items in [l,r[ will be reordered now; reset all descendant partitions:
{apriori(INC_QUICKSORT_CHAOS_MARK == (index_t)-1);
memset(part_idx+1, 0xFF, (r-l-1)*sizeof(*part_idx));}
// Although rank partitioning performed by the partial heapsort routine is
// also (stochastically) recursive, it is guaranteed to have a fixed upper
// bound on its stack usage, so it's fine to use it as a failsafe here.
rank_partitioned_heapsort(arr+l, r-l, lo-l, hi-l);
}
// inc_quicksort_rand() partially quicksorts array elements in [0,len[ so that
// those whose ranks in the overall sorted sequence belong to [lo,hi[ end up at
// their correct positions, taking into account the history of partitions from
// past invocations and making new ones only in the previously unpartitioned
// intervals when necessary. The function either leaves the array as it is or
// brings it monotonically closer to its fully sorted state with each call
// (hence the "incremental" moniker).
//
// The pivot indices of past partitions are stored in the part_idx array in
// the order of preorder traversal of the quicksort tree. The special index
// values INC_QUICKSORT_SORTED_MARK and INC_QUICKSORT_CHAOS_MARK signify that
// an interval is fully sorted or has not been partitioned yet, respectively.
// Each part_idx must initially be set to INC_QUICKSORT_CHAOS_MARK.
static inline_callees void
inc_quicksort_rand_rec(index_rng_t *rng, item_t *arr, index_t lo, index_t hi,
index_t l, index_t r, index_t *part_idx, unsigned rec_lim) {
if unlikely(rec_lim-- == 0) {
inc_quicksort_rec_bottom(arr, lo, hi, l, r, part_idx);
return;
}
for (;;) {
surely(l+1 < r);
int will_be_sorted = (lo <= l && r <= hi);
if (r-l <= QUICKSORT_HARD_CUTOFF_LEN ||
(r-l <= QUICKSORT_SOFT_CUTOFF_LEN && will_be_sorted)) {
*part_idx = INC_QUICKSORT_SORTED_MARK;
insertion_sort(arr+l, r-l);
return;
}
index_t r_left = *part_idx;
if (r_left == INC_QUICKSORT_CHAOS_MARK) {
index_t pivot_idx = pow3_pseudo_median_idx(rng, arr+l, r-l);
r_left = l + partition(arr+l, r-l, pivot_idx);
}
surely(l <= r_left), surely(r_left < r);
*part_idx = will_be_sorted ? INC_QUICKSORT_SORTED_MARK : r_left;
index_t l_right = r_left + 1;
unsigned go_left = (lo < r_left && r_left-l > 1);
unsigned go_right = (l_right < hi && r-l_right > 1);
index_t *part_idx_left = part_idx + 1;
index_t *part_idx_right = part_idx_left + (r_left - l);
go_left = (go_left && *part_idx_left != INC_QUICKSORT_SORTED_MARK);
go_right = (go_right && *part_idx_right != INC_QUICKSORT_SORTED_MARK);
switch fickle((go_left<<1) | go_right) {
case 0: { return; } break;
case 1: dive_right: { l = l_right, part_idx = part_idx_right; } break;
case 2: dive_left: { r = r_left, part_idx = part_idx_left; } break;
case 3: {
if (r_left-l <= r-l_right) {
inc_quicksort_rand_rec(rng, arr, lo, hi, l, r_left, part_idx_left, rec_lim);
goto dive_right;
} else {
inc_quicksort_rand_rec(rng, arr, lo, hi, l_right, r, part_idx_right, rec_lim);
goto dive_left;
}
} break;
default: { unreachable(); } break;
}
}
}
static inline void
inc_quicksort_rand(item_t *arr, index_t *part_idxs, index_t len, index_t lo, index_t hi) {
surely(len >= 0), surely(0 <= lo), surely(lo <= hi), surely(hi <= len);
if unlikely(len <= 1 || lo >= hi || *part_idxs == INC_QUICKSORT_SORTED_MARK) { return; }
index_rng_t rng = index_rng_seeded_from(arr, len);
inc_quicksort_rand_rec(&rng, arr, lo, hi, 0, len, part_idxs, 5*(bit_width(len)-1)/4);
}
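// Usage sketch (illustrative; this hypothetical helper is not used by the
// benchmark below): the caller owns a part_idxs array of len entries, sets
// every entry to INC_QUICKSORT_CHAOS_MARK once up front, and then reuses it
// across calls, so that later queries can skip re-partitioning intervals
// already split (or fully sorted) by earlier ones. Assumes len >= 2.
static inline void
example_inc_quicksort_two_windows(item_t *arr, index_t *part_idxs, index_t len) {
	{apriori(INC_QUICKSORT_CHAOS_MARK == (index_t)-1);
	memset(part_idxs, 0xFF, len*sizeof(part_idxs[0]));}
	inc_quicksort_rand(arr, part_idxs, len, 0, len/4);       // First query.
	inc_quicksort_rand(arr, part_idxs, len, len/2, len/2+1); // Reuses earlier splits.
}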
//-----------------------------------------------------------------------------*
// BENCHMARKS
//-----------------------------------------------------------------------------*
#include <stdio.h> // printf()
#include <stdlib.h> // calloc(), free()
#include <time.h> // clock_gettime()
static must_inline uint64_t
nanosecs(void) {
struct timespec t;
clock_gettime(CLOCK_MONOTONIC_RAW, &t);
return (uint64_t)t.tv_sec*1000*1000*1000 + (uint64_t)t.tv_nsec;
}
// Integral type for unique item identifiers. Unnecessary when the item_t type
// itself is integral; left up to customization for composite item_t types.
typedef item_t item_key_t;
#define item_key(item) ((item_key_t)(item))
#define item_with_key(key) ((item_t)(key))
static inline void
shuffle_items(pcg32_t *r, item_t *arr, uint32_t n) {
for (uint32_t i = 0; i < n; i++) {
swap(arr[i], arr[onto_range_u32(pcg32_next(r), i, n)]);
}
}
static inline_callees void
call_partial_quicksort_rand(item_t *arr, index_t *part_idxs,
index_t len, index_t lo, index_t hi) {
(void)part_idxs;
partial_quicksort_rand(arr, len, lo, hi);
}
static inline_callees void
call_rank_partitioned_heapsort(item_t *arr, index_t *part_idxs,
index_t len, index_t lo, index_t hi) {
(void)part_idxs;
rank_partitioned_heapsort(arr, len, lo, hi);
}
extern int
main(void) {
enum {
RANDOMIZE = 0,
VERBOSE = 0,
};
typedef void partial_sort_test_func_t(item_t *, index_t *, index_t, index_t, index_t);
static const struct test_func {
partial_sort_test_func_t *func;
const char *name;
} funcs[] = {
#define X(p,f) {glue(p,f), #f}
X(,inc_quicksort_rand),
X(call_,partial_quicksort_rand),
X(call_,rank_partitioned_heapsort),
#undef X
}; enum { N_FUNCS = countof(funcs) };
typedef struct test_params test_params_t;
static const struct test_params {
int n_runs;
int n_sorts;
index_t arr_len;
index_t min_sort_len;
index_t max_sort_len;
} tests[] = {
{30000, 1, 2000, 1, 2000},
{ 8000, 1, 100*1000, 50, 500},
{ 300, 100, 100*1000, 50, 1000},
{ 5000, 1, 1000*1000, 50, 500},
{ 5000, 1, 1000*1000, 100, 2500},
{ 40, 100, 1000*1000, 50, 500},
{ 40, 100, 1000*1000, 100, 2500},
{ 25, 300, 1000*1000, 50, 500},
{ 25, 300, 1000*1000, 100, 2500},
{ 25, 50, 10*1000*1000, 500, 5000},
{ 5, 500, 10*1000*1000, 500, 5000},
{ 5, 100, 100*1000*1000, 1000, 10000},
}; enum { N_TESTS = countof(tests) };
index_t max_arr_len = 0;
for (int i = 0; i < N_TESTS; i++) {
if (max_arr_len < tests[i].arr_len) {
max_arr_len = tests[i].arr_len;
}
}
item_t *arr = calloc(max_arr_len, sizeof(arr[0]));
index_t *part_idxs = calloc(max_arr_len, sizeof(part_idxs[0]));
surely(arr), surely(part_idxs);
if (!arr || !part_idxs) { return 1; }
const uint64_t rng_seed = RANDOMIZE ? nanosecs() : 809;
for (int i_func = 0; i_func < N_FUNCS; i_func++) {
partial_sort_test_func_t *const partial_sort = funcs[i_func].func;
printf("\nFunction %s\n", funcs[i_func].name);
pcg32_t rng = pcg32(rng_seed);
uint64_t all_tests_dt = 0;
for (int i_test = 0; i_test < N_TESTS; i_test++) {
const test_params_t T = tests[i_test];
if (VERBOSE) {
printf("Test %3d. Parameters: %d x %d, %td, %td..%td\n",
i_test+1, T.n_runs, T.n_sorts, (ptrdiff_t)T.arr_len,
(ptrdiff_t)T.min_sort_len, (ptrdiff_t)T.max_sort_len);
}
uint64_t test_dt = 0;
for (int i_run = 0; i_run < T.n_runs; i_run++) {
// Initialize the array to a random permutation (of its indices).
for (index_t i = 0; i < T.arr_len; i++) {
arr[i] = item_with_key((item_key_t)i);
}
shuffle_items(&rng, arr, T.arr_len);
// Set pivot indices to INC_QUICKSORT_CHAOS_MARK for inc_quicksort_rand():
{apriori(INC_QUICKSORT_CHAOS_MARK == (index_t)-1);
memset(part_idxs, 0xFF, T.arr_len*sizeof(part_idxs[0]));}
uint64_t run_dt = 0;
for (int i_sort = 0; i_sort < T.n_sorts; i_sort++) {
index_t span = onto_range_u32(pcg32_next(&rng), T.min_sort_len, T.max_sort_len+1);
index_t lo = onto_range_u32(pcg32_next(&rng), 0, T.arr_len-span+1);
index_t hi = lo + span;
uint64_t t0 = nanosecs();
(*partial_sort)(arr, part_idxs, T.arr_len, lo, hi);
uint64_t dt = nanosecs() - t0;
run_dt += dt;
int ok = (1);
index_t first_diff = -1;
for (index_t i = lo; i < hi; i++) {
if (item_key(arr[i]) != (item_key_t)i) {
ok = (0), first_diff = i;
break;
}
}
if (VERBOSE || !ok) {
printf("Test %3d, run %3d, sort %3d. [%12td %12td] (%6td): %4s %.9f\n",
i_test+1, i_run+1, i_sort+1, (ptrdiff_t)lo, (ptrdiff_t)hi-1, (ptrdiff_t)span,
ok ? "OK" : "FAIL", dt*1e-9/span);
}
if (!ok) {
enum { BEFORE = 5, AFTER = 10 };
index_t l = first_diff-lo >= BEFORE ? first_diff-BEFORE : lo;
index_t r = hi-first_diff >= AFTER+1 ? first_diff+AFTER+1 : hi;
printf(" [%td..%td]:", (ptrdiff_t)l, (ptrdiff_t)r-1);
for (index_t i = l; i < r; i++) {
if (int_type_is_signed(item_key_t)) {
printf(" %jd", (intmax_t)item_key(arr[i]));
} else {
printf(" %ju", (uintmax_t)item_key(arr[i]));
}
}
printf("\n");
}
}
test_dt += run_dt;
if (VERBOSE) {
printf("Test %3d, run %3d. Elapsed: %9.6f\n",
i_test+1, i_run+1, run_dt*1e-9);
}
}
all_tests_dt += test_dt;
printf("Test %3d. Elapsed: %7.3f total, %9.6f per run\n",
i_test+1, test_dt*1e-9, test_dt*1e-9/T.n_runs);
fflush(stdout);
}
printf("Total test time: %7.3f\n", all_tests_dt*1e-9);
fflush(stdout);
}
free(arr), arr = NULL;
free(part_idxs), part_idxs = NULL;
return 0;
}

Machine: MacBook Air (M1, 2020). All times are in seconds.

$ clang -std=c11 -Wall -Wextra -Wno-unused-function -O3 -o partsort partsort.c && ./partsort

Function inc_quicksort_rand
Test   1. Elapsed:   0.554 total,  0.000018 per run
Test   2. Elapsed:   0.823 total,  0.000103 per run
Test   3. Elapsed:   0.320 total,  0.001068 per run
Test   4. Elapsed:   4.696 total,  0.000939 per run
Test   5. Elapsed:   4.787 total,  0.000957 per run
Test   6. Elapsed:   0.177 total,  0.004422 per run
Test   7. Elapsed:   0.236 total,  0.005906 per run
Test   8. Elapsed:   0.156 total,  0.006231 per run
Test   9. Elapsed:   0.247 total,  0.009878 per run
Test  10. Elapsed:   0.882 total,  0.035271 per run
Test  11. Elapsed:   0.361 total,  0.072207 per run
Test  12. Elapsed:   1.950 total,  0.390098 per run
Total test time:    15.190

Function partial_quicksort_rand
Test   1. Elapsed:   0.523 total,  0.000017 per run
Test   2. Elapsed:   0.826 total,  0.000103 per run
Test   3. Elapsed:   2.298 total,  0.007660 per run
Test   4. Elapsed:   4.782 total,  0.000956 per run
Test   5. Elapsed:   4.868 total,  0.000974 per run
Test   6. Elapsed:   2.616 total,  0.065397 per run
Test   7. Elapsed:   2.687 total,  0.067177 per run
Test   8. Elapsed:   4.834 total,  0.193368 per run
Test   9. Elapsed:   4.968 total,  0.198731 per run
Test  10. Elapsed:   8.305 total,  0.332191 per run
Test  11. Elapsed:  16.040 total,  3.207993 per run
Test  12. Elapsed:  32.350 total,  6.469953 per run
Total test time:    85.097

Function rank_partitioned_heapsort
Test   1. Elapsed:   0.939 total,  0.000031 per run
Test   2. Elapsed:   0.654 total,  0.000082 per run
Test   3. Elapsed:   2.269 total,  0.007563 per run
Test   4. Elapsed:   3.187 total,  0.000637 per run
Test   5. Elapsed:   3.540 total,  0.000708 per run
Test   6. Elapsed:   2.183 total,  0.054583 per run
Test   7. Elapsed:   2.386 total,  0.059650 per run
Test   8. Elapsed:   4.057 total,  0.162285 per run
Test   9. Elapsed:   4.389 total,  0.175555 per run
Test  10. Elapsed:   6.853 total,  0.274112 per run
Test  11. Elapsed:  13.739 total,  2.747785 per run
Test  12. Elapsed:  26.267 total,  5.253393 per run
Total test time:    70.464
ISC License
Copyright 2025 Igor Burago
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.