opsec-ee/Erdős-Selfridge-Prime_engine_3.c

## Erdős-Selfridge-Prime_engine_3.c
/* Update 3,16,2026 H.Overman
 *
 * Erdos-Selfridge Prime Categorization Engine v3.4
 * Copyright (c) 2025 H.O <opsec.ee@pm.me>
 *
 * Elapsed:    0.0665 s
 * Throughput: 15024489 primes/s
 *
 * Euman = integral(optimization/time) delta performance
 *
 * =======================================================================
 * INVARIANT FOUNDATION (IFP)
 * =======================================================================
 *
 *   I1: sieve(limit)       -> primes[]       pure bit-packed generator
 *   I2: RAMstore(prime)    -> index          Feistel hash stored in slot,
 *                                            O(1) guaranteed, probe<=16
 *   I3: spf[n]             -> factor         O(1) direct array lookup,
 *                                            zero division
 *   I4: cat(p)             -> u8             Gate-X propagated, never swallowed
 *   I5: memo[i]            <-> 4-bit packed  bijective store/recall
 *
 * =======================================================================
 * CHANGES v3.3 -> v3.4
 * =======================================================================
 *
 * W1 — uint16_t SPF (32MB vs 64MB, 2x cache density):
 *
 *   Correctness proof:
 *     Composite n has smallest prime factor <= sqrt(n).
 *     For n <= SIEVE_LIMIT = 16,000,000:
 *       sqrt(16,000,000) = 4000 < 65,535 = UINT16_MAX.
 *     Therefore all composite SPF values fit in uint16_t.
 *
 *   Prime sentinel = 0:
 *     MAP_ANONYMOUS provides zero-initialised pages — no explicit
 *     prime write needed. During build, composites get their smallest
 *     factor written; primes stay 0. During factorize_spf:
 *       f = g_spf[n] ? g_spf[n] : (uint32_t)n
 *     Branch is maximally predictable: composites dominate in p+1 walks.
 *
 *   Cache impact:
 *     uint16_t: 32 entries per 64-byte cache line (was 16).
 *     32MB table: 16,384 x 2MB huge pages vs 8,192 x 4KB pages
 *     post-madvise. TLB pressure halved vs v3.3.
 *
 * W2 — madvise(MADV_HUGEPAGE) on SPF mmap:
 *
 *   Collapses TLB entries: 32MB / 4KB = 8192 entries (4K pages)
 *   -> 32MB / 2MB = 16 entries (huge pages, THP).
 *   i7 L2 TLB: 1536 entries (4K). SPF at 8192 entries overflows it
 *   significantly. At ~100 cycles per TLB miss, this is measurable.
 *   MADV_HUGEPAGE enables Transparent Huge Pages on Arch Linux.
 *
 * W3 — Software prefetch in main loop:
 *
 *   __builtin_prefetch(&g_spf[g_primes[i + PREFETCH_DIST] + 1], 0, 1)
 *   Issues L1 prefetch hint for the SPF entry of the future prime's p+1.
 *   Covers ~40 cycle L3 latency at 13.8M primes/s per-prime ~72 cycles.
 *   PREFETCH_DIST = 8: 8 * 72 cycles = 576 cycles lookahead >= L3 lat.
 *   Locality 1 (L2): SPF hot entries promoted to L2 before access.
 *
 *   Also prefetches memo word for future prime — sequential access,
 *   hardware prefetcher likely handles it, but explicit hint costs zero.
 *
 * spf_build simplification:
 *   v3.3: g_spf[p] = p unconditional + loop from p+p.
 *   v3.4: g_spf[p] stays 0 (prime sentinel = MAP_ANONYMOUS zero).
 *   Loop from p+p with g_spf[m]==0 guard — first writer wins.
 *   Eliminates 1M explicit prime writes to spf table.
 *
 * All v3.3 bug fixes carried forward:
 *   BUG-1: CATEGORY_GATE_X = 0x0F (4-bit, no truncation)
 *   BUG-2: assert on rs_build probe cap
 *   PERF-2: __builtin_expect on TRACK_FIRST_N_PRIMES branch
 *   MICRO: spf_build starts inner loop at p+p
 *
 * Build:
 *   gcc -std=c23 -O3 -march=native -flto -Wall -Wextra -DNDEBUG \
 *       -funroll-loops prime_ee.c -o prime_engine
 */

#define _GNU_SOURCE   /* madvise, MADV_HUGEPAGE */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <assert.h>

/* =================================================================
 * CONFIGURATION
 * ================================================================= */

constexpr uint32_t SIEVE_LIMIT             = 16000000u;
constexpr uint32_t MAX_PRIMES              = 1000000u;

/*
 * CATEGORY_GATE_X = 0x0F: fits in 4-bit memo slot exactly.
 * Valid category range: 1..14. Sentinel 15 reserved.
 * static_assert enforces no collision at build time.
 */
constexpr uint8_t  CATEGORY_GATE_X         = 0x0Fu;
constexpr uint32_t MAX_CATEGORIES          = 15u;

static_assert(MAX_CATEGORIES <= 15u,
              "category 15 (0x0F) is reserved for CATEGORY_GATE_X sentinel");

constexpr uint32_t FACTORIZATION_BUFFER    = 31u;
constexpr uint32_t TRACK_FIRST_N_PRIMES    = 200u;
constexpr uint32_t MAX_PRIMES_PER_CATEGORY = 100u;

/*
 * RAMstore: 2^21 slots, load ~47.7% at 1M primes.
 * OPT-335: HASH_MASK eliminates modulo.
 */
constexpr uint32_t HASH_BITS    = 21u;
constexpr uint32_t HASH_SIZE    = 1u << HASH_BITS;
constexpr uint32_t HASH_MASK    = HASH_SIZE - 1u;
constexpr uint32_t RS_PROBE_MAX = 16u;

/*
 * Software prefetch lookahead distance (main loop).
 * 8 primes ahead at ~72 cycles/prime = ~576 cycles = L3 latency.
 * Tune up/down based on measured throughput.
 */
constexpr uint32_t PREFETCH_DIST = 8u;

/* =================================================================
 * 144,000 RATIO SYSTEM — exact rational timing, no IEEE 754
 *
 * {{0 [ ticks:u64 (AS/.\IS) ee_ratio_t{num,den} ] 1}}
 * ================================================================= */

typedef struct {
    uint64_t num;
    uint64_t den;
} ee_ratio_t;

static inline ee_ratio_t ee_ratio_elapsed(clock_t start, clock_t end)
{
    return (ee_ratio_t){
        .num = (uint64_t)(end - start),
        .den = (uint64_t)CLOCKS_PER_SEC
    };
}

static inline uint64_t ee_ratio_secs(ee_ratio_t r)
{
    return r.den ? r.num / r.den : 0u;
}

static inline uint64_t ee_ratio_frac10k(ee_ratio_t r)
{
    if (!r.den) return 0u;
    return ((r.num % r.den) * 10000u) / r.den;
}

static inline uint64_t ee_ratio_throughput(uint32_t count, ee_ratio_t elapsed)
{
    if (!elapsed.num) return 0u;
    return ((uint64_t)count * elapsed.den) / elapsed.num;
}

/* =================================================================
 * euman_hash64 — 36-bit Feistel bijective slot function
 *
 * ONE definition. TWO call sites: rs_build(), rs_lookup().
 * No other hash in this file.
 *
 * Proof: euman_hashsec 4B bitmap, zero collisions.
 * E[probes] = |T|/|S| = 2^36 / 62^6 approx 1.21.
 *
 * {{0 [ prime:u64 (AS/.\IS) slot:u32 ] 1}}
 * Master key = euman_hashsec SECRET_KEY.
 * ================================================================= */

constexpr uint64_t FEISTEL_MASTER = UINT64_C(0x1234567890ABCDEF);
constexpr uint64_t FEISTEL_MASK18 = 0x3FFFFull;
constexpr uint64_t FEISTEL_K0     = (FEISTEL_MASTER)       & FEISTEL_MASK18;
constexpr uint64_t FEISTEL_K1     = (FEISTEL_MASTER >>  9) & FEISTEL_MASK18;
constexpr uint64_t FEISTEL_K2     = (FEISTEL_MASTER >> 18) & FEISTEL_MASK18;
constexpr uint64_t FEISTEL_K3     = (FEISTEL_MASTER >> 27) & FEISTEL_MASK18;

static inline uint64_t feistel_round(uint64_t half, uint64_t rk)
{
    half ^= rk;
    half *= UINT64_C(0x9E3779B97);
    half ^= half >> 9;
    return half & FEISTEL_MASK18;
}

static inline uint64_t feistel_encrypt_36(uint64_t v)
{
    uint64_t L = (v >> 18) & FEISTEL_MASK18;
    uint64_t R =  v        & FEISTEL_MASK18;
    uint64_t t;
    t = R; R = L ^ feistel_round(R, FEISTEL_K0); L = t;
    t = R; R = L ^ feistel_round(R, FEISTEL_K1); L = t;
    t = R; R = L ^ feistel_round(R, FEISTEL_K2); L = t;
    t = R; R = L ^ feistel_round(R, FEISTEL_K3); L = t;
    return (L << 18) | R;
}

static inline uint32_t ee_hash64_slot(uint64_t prime)
{
    return (uint32_t)(feistel_encrypt_36(prime) & HASH_MASK);
}

/* =================================================================
 * RAMSTORE SLOT — I2
 *
 * 16 bytes, alignas(64): 4 slots per cache line.
 * prime==0 sentinel: 0 is never a valid prime.
 * Key stored in slot: no g_primes[] dereference on hit.
 *
 * {{0 [ prime:u64 (AS/.\IS) index:u32 ] 1}}
 *   Z: prime == 0
 *   X: not found within RS_PROBE_MAX probes -> UINT32_MAX
 *   1: g_rs[s].prime == prime -> return g_rs[s].index
 * ================================================================= */

typedef struct {
    uint64_t prime;
    uint32_t index;
    uint32_t pad;
} prime_rs_slot_t;

static_assert(sizeof(prime_rs_slot_t) == 16,
              "prime_rs_slot_t must be 16 bytes");

alignas(64) static prime_rs_slot_t g_rs[HASH_SIZE];

/* =================================================================
 * EXBOXING-INSPIRED STRUCT PACKING
 *
 * prime_factor_t — one u64, register-width, zero padding:
 *   bits[55:0]  = factor value  (25 bits sufficient for n <= 16M)
 *   bits[63:56] = power         (8 bits; max power(2) for n<=16M = 24)
 *
 * factorization_t.status — one u8:
 *   bits[4:0] = count   (max 31 = FACTORIZATION_BUFFER)
 *   bit[7]    = gate_x
 *
 * Total factorization_t = 31*8 + 1 = 249 bytes, 4 cache lines.
 *
 * {{0 [ (factor,power) (AS/.\IS) packed:u64 ] 1}}
 * ================================================================= */

typedef struct {
    uint64_t packed;
} prime_factor_t;

static inline prime_factor_t pf_make(uint64_t factor, uint8_t power)
{
    return (prime_factor_t){
        .packed = (factor & UINT64_C(0x00FFFFFFFFFFFFFF))
        | ((uint64_t)power << 56)
    };
}

static inline uint64_t pf_factor(prime_factor_t pf)
{
    return pf.packed & UINT64_C(0x00FFFFFFFFFFFFFF);
}

typedef struct {
    prime_factor_t factors[FACTORIZATION_BUFFER];
    uint8_t        status;
} factorization_t;

static inline uint8_t fac_count(const factorization_t *f)
{
    return f->status & 0x1Fu;
}
static inline bool fac_gate_x(const factorization_t *f)
{
    return (f->status & 0x80u) != 0u;
}
static inline void fac_set_count(factorization_t *f, uint8_t n)
{
    f->status = (f->status & 0x80u) | (n & 0x1Fu);
}
static inline void fac_set_gate_x(factorization_t *f)
{
    f->status |= 0x80u;
}

/* =================================================================
 * GLOBAL STATE — cache-aligned on all hot arrays
 * ================================================================= */

alignas(64) static uint64_t g_primes[MAX_PRIMES];
static uint32_t g_prime_count;

/*
 * SPF table — uint16_t, mmap'd, 32MB (W1).
 * g_spf[n] = smallest prime factor of n, or 0 if n is prime.
 * 32 entries per 64-byte cache line.
 * MADV_HUGEPAGE applied post-mmap (W2).
 */
static uint16_t *g_spf;

alignas(64) static uint64_t g_packed_cat[MAX_PRIMES / 16 + 1];

typedef struct {
    uint32_t count;
    uint64_t smallest;
    uint64_t largest;
} cat_stats_t;

alignas(64) static cat_stats_t g_stats[MAX_CATEGORIES];

typedef struct {
    uint64_t primes[MAX_PRIMES_PER_CATEGORY];
    uint32_t count;
} cat_prime_list_t;

static cat_prime_list_t g_cat_primes[MAX_CATEGORIES];

/* =================================================================
 * I5 — 4-BIT PACKED MEMO
 *
 * {{0 [ idx:u32 (AS/.\IS) 4-bit slot ] 1}}
 *   0x00 = empty, 0x01..0x0E = valid category, 0x0F = Gate-X
 * ================================================================= */

static inline void memo_set(uint32_t idx, uint8_t cat)
{
    uint32_t wi = idx / 16u;
    uint32_t bp = (idx % 16u) * 4u;
    g_packed_cat[wi] = (g_packed_cat[wi] & ~((uint64_t)0xFu << bp))
    | ((uint64_t)cat << bp);
}

static inline uint8_t memo_get(uint32_t idx)
{
    return (uint8_t)((g_packed_cat[idx / 16u] >> ((idx % 16u) * 4u)) & 0xFu);
}

/* =================================================================
 * I1 — PRIME SIEVE
 *
 * memset(0xAA): sets all odd-position bits in one 2MB write.
 * 0xAA = 10101010b: bits 1,3,5,7 of each byte = odd positions.
 * Replaces calloc+loop from v3.2.
 * ================================================================= */

[[nodiscard]]
static bool sieve_generate(void)
{
    size_t   nbytes = (SIEVE_LIMIT + 7u) / 8u;
    uint8_t *sieve  = malloc(nbytes);
    if (!sieve) return false;

    memset(sieve, 0xAAu, nbytes);

    for (uint32_t p = 3u; (uint64_t)p * p <= SIEVE_LIMIT; p += 2u)
        if (sieve[p / 8u] & (1u << (p % 8u)))
            for (uint32_t m = p * p; m <= SIEVE_LIMIT; m += 2u * p)
                sieve[m / 8u] &= (uint8_t)~(1u << (m % 8u));

    g_prime_count = 0u;
    g_primes[g_prime_count++] = 2u;
    for (uint32_t i = 3u; i <= SIEVE_LIMIT && g_prime_count < MAX_PRIMES; i += 2u)
        if (sieve[i / 8u] & (1u << (i % 8u)))
            g_primes[g_prime_count++] = i;

    free(sieve);
    return g_prime_count == MAX_PRIMES;
}

/* =================================================================
 * I3 — SPF TABLE BUILD (uint16_t, W1)
 *
 * Prime sentinel = 0 (MAP_ANONYMOUS zero — no explicit write).
 * Composite write: g_spf[m] = (uint16_t)p where p <= sqrt(m) <= 4000.
 * First writer wins = smallest prime factor. O(n log log n) build.
 *
 * madvise(MADV_HUGEPAGE): hints kernel to back SPF with 2MB huge
 * pages via THP. Reduces TLB footprint from ~8192 to ~16 entries (W2).
 *
 * {{0 [ n:u32 (AS/.\IS) g_spf[n]:u16 ] 1}}
 *   g_spf[n] = 0     -> n is prime (sentinel)
 *   g_spf[n] = f > 0 -> f is smallest prime factor of n, f <= 4000
 * ================================================================= */

[[nodiscard]]
static bool spf_build(void)
{
    size_t spf_bytes = (size_t)(SIEVE_LIMIT + 1u) * sizeof(uint16_t);

    g_spf = mmap(nullptr, spf_bytes,
                 PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS,
                 -1, 0);
    if (g_spf == MAP_FAILED) { g_spf = nullptr; return false; }

    /* W2: THP hint — collapses TLB from ~8192 entries to ~16 */
    madvise(g_spf, spf_bytes, MADV_HUGEPAGE);

    /* Primes stay 0 (sentinel). Composites get smallest factor. */
    for (uint32_t i = 0u; i < g_prime_count; i++) {
        uint32_t p = (uint32_t)g_primes[i];
        for (uint32_t m = p + p; m <= SIEVE_LIMIT; m += p)
            if (g_spf[m] == 0u)
                g_spf[m] = (uint16_t)p;  /* p <= sqrt(m) <= 4000, fits u16 */
    }

    return true;
}

/* =================================================================
 * I2 — RAMSTORE BUILD
 *
 * g_rs[] is BSS: zero-initialised, prime==0 sentinel is free.
 * BUG-2: assert on probe cap — unreachable at 47.7% load,
 * but contract enforced.
 *
 * {{0 [ i:u32 (AS/.\IS) g_rs[slot].{prime,index} ] 1}}
 * ================================================================= */

static void rs_build(void)
{
    for (uint32_t i = 0u; i < g_prime_count; i++) {
        uint64_t prime = g_primes[i];
        uint32_t slot  = ee_hash64_slot(prime);
        uint32_t p;

        for (p = 0u; p < RS_PROBE_MAX; p++) {
            uint32_t s = (slot + p) & HASH_MASK;
            if (g_rs[s].prime == 0u) {
                g_rs[s].prime = prime;
                g_rs[s].index = i;
                break;
            }
        }

        assert(p < RS_PROBE_MAX &&
        "rs_build: probe cap exceeded -- increase HASH_BITS or RS_PROBE_MAX");
    }
}

/* =================================================================
 * I2 — RAMSTORE LOOKUP
 *
 * {{0 [ prime:u64 (AS/.\IS) index:u32 ] 1}}
 *   X: not found -> UINT32_MAX (caller propagates Gate-X)
 *   1: g_rs[s].prime == prime -> return g_rs[s].index
 *
 * Expected ~1.5 probes at 47.7% load.
 * One cache line fetch (64B) covers 4 consecutive slots.
 * ================================================================= */

[[nodiscard]]
static inline uint32_t rs_lookup(uint64_t prime)
{
    uint32_t slot = ee_hash64_slot(prime);

    for (uint32_t p = 0u; p < RS_PROBE_MAX; p++) {
        uint32_t s = (slot + p) & HASH_MASK;
        if (g_rs[s].prime == 0u)    return UINT32_MAX;
        if (g_rs[s].prime == prime) return g_rs[s].index;
    }
    return UINT32_MAX;
}

/* =================================================================
 * I3 — FACTORIZE via SPF table (uint16_t, W1)
 *
 * g_spf[n] == 0: n is prime (sentinel). f = n directly.
 * g_spf[n] != 0: f = g_spf[n] (smallest factor, <= 4000).
 * Single predictable branch: composites dominate, branch mostly not-taken
 * for large n; but n starts composite (p+1 is even) so branch taken first.
 *
 * O(log n) sequential u16 reads. 32 entries per cache line.
 * Hardware prefetcher handles the forward walk naturally.
 *
 * {{0 [ n:u64 (AS/.\IS) factorization_t ] 1}}
 *   Z: n <= 1 (empty result)
 *   X: count >= FACTORIZATION_BUFFER -> gate_x set
 *   1: product(f^e for all factors) == original n
 * ================================================================= */

static void factorize_spf(uint64_t n, factorization_t *out)
{
    out->status = 0u;
    if (n <= 1u) return;

    while (n > 1u) {
        uint64_t f;

        if (n <= SIEVE_LIMIT) {
            uint16_t sf = g_spf[(uint32_t)n];
            f = sf ? (uint64_t)sf : n;   /* 0 = prime sentinel */
        } else {
            f = n;                        /* n > SIEVE_LIMIT: prime by exhaustion */
        }

        uint8_t cnt = fac_count(out);
        if (cnt >= FACTORIZATION_BUFFER) {
            fac_set_gate_x(out);
            return;
        }

        uint8_t power = 0u;
        while (n % f == 0u) { n /= f; power++; }

        out->factors[cnt] = pf_make(f, power);
        fac_set_count(out, (uint8_t)(cnt + 1u));
    }
}

/* =================================================================
 * I4 — COMPUTE CATEGORY (recursive + memoized)
 *
 * Correctness of forward ordering:
 *   For prime p > 3, every prime factor f of p+1 satisfies f < p.
 *   Proof: p+1 is even, so 2|(p+1). All other prime factors of p+1
 *   are <= (p+1)/2 < p. Therefore f < p = g_primes[pi], so fi < pi.
 *   By induction, all memo entries for factors are already computed
 *   by the time we reach p in ascending order. Recursion is always
 *   a memo hit after the first ~handful of primes (p=2 edge case:
 *   factor 3 is handled as base case f==3, no recursion).
 *
 * BUG-1 fix (v3.3): CATEGORY_GATE_X = 0x0F fits 4-bit memo exactly.
 *
 * {{0 [ pi:u32 (AS/.\IS) category:u8 ] 1}}
 *   Z: pi >= g_prime_count
 *   X: factor absent from RAMstore -> CATEGORY_GATE_X (0x0F)
 *   1: result memoized and returned
 * ================================================================= */

static uint8_t compute_category(uint32_t pi)
{
    uint8_t cached = memo_get(pi);
    if (cached != 0u) return cached;

    factorization_t fac;
    factorize_spf(g_primes[pi] + 1u, &fac);

    if (fac_gate_x(&fac)) {
        memo_set(pi, CATEGORY_GATE_X);
        return CATEGORY_GATE_X;
    }

    uint8_t max_cat        = 0u;
    bool    only_base_fact = true;

    for (uint8_t i = 0u; i < fac_count(&fac); i++) {
        uint64_t f = pf_factor(fac.factors[i]);

        if (f == 2u || f == 3u) {
            if (max_cat < 1u) max_cat = 1u;
            continue;
        }

        only_base_fact = false;

        uint32_t fi = rs_lookup(f);
        if (fi == UINT32_MAX) {
            memo_set(pi, CATEGORY_GATE_X);
            return CATEGORY_GATE_X;
        }

        uint8_t fc = compute_category(fi);
        if (fc == CATEGORY_GATE_X) {
            memo_set(pi, CATEGORY_GATE_X);
            return CATEGORY_GATE_X;
        }
        if (fc > max_cat) max_cat = fc;
    }

    uint8_t result = only_base_fact ? 1u : (uint8_t)(max_cat + 1u);
    memo_set(pi, result);
    return result;
}

/* =================================================================
 * MAIN PROCESSING LOOP
 *
 * W3 — Software prefetch:
 *   Two hints per iteration, PREFETCH_DIST primes ahead:
 *
 *   (a) SPF entry for future prime's p+1:
 *       __builtin_prefetch(&g_spf[g_primes[i+N]+1], 0, 1)
 *       Locality 1 (L2): brings SPF entry into L2 before factorize_spf.
 *
 *   (b) Memo word for future prime index:
 *       __builtin_prefetch(&g_packed_cat[(i+N)/16], 0, 1)
 *       Memo is 500KB — hardware prefetcher likely warm, but zero cost.
 *
 *   Prefetch is guarded by bounds check compiled to a single cmov.
 *   At PREFETCH_DIST=8, lookahead = ~576 cycles > L3 latency (~40).
 * ================================================================= */

[[nodiscard]]
static bool run(void)
{
    clock_t t0 = clock();

    memset(g_stats,      0u, sizeof(g_stats));
    memset(g_packed_cat, 0u, sizeof(g_packed_cat));
    memset(g_cat_primes, 0u, sizeof(g_cat_primes));

    uint32_t gate_x_count = 0u;

    printf("Processing %" PRIu32 " primes...\n", g_prime_count);

    for (uint32_t i = 0u; i < g_prime_count; i++) {

        /* W3: software prefetch — SPF and memo for future prime */
        if (__builtin_expect(i + PREFETCH_DIST < g_prime_count, 1u)) {
            uint64_t fp = g_primes[i + PREFETCH_DIST] + 1u;
            if (fp <= SIEVE_LIMIT)
                __builtin_prefetch(&g_spf[fp], 0, 1);
            __builtin_prefetch(
                &g_packed_cat[(i + PREFETCH_DIST) / 16u], 0, 1);
        }

        uint64_t p   = g_primes[i];
        uint8_t  cat = compute_category(i);

        if (cat == CATEGORY_GATE_X) { gate_x_count++; continue; }
        if (cat == 0u || cat >= MAX_CATEGORIES) continue;

        g_stats[cat].count++;
        if (g_stats[cat].count == 1u) {
            g_stats[cat].smallest = p;
            g_stats[cat].largest  = p;
        } else if (p > g_stats[cat].largest) {
            g_stats[cat].largest = p;
        }

        /* PERF-2 (v3.3): fires 200/1,000,000 times */
        if (__builtin_expect(i < TRACK_FIRST_N_PRIMES, 0) &&
            g_cat_primes[cat].count < MAX_PRIMES_PER_CATEGORY)
            g_cat_primes[cat].primes[g_cat_primes[cat].count++] = p;
    }

    clock_t    t1      = clock();
    ee_ratio_t elapsed = ee_ratio_elapsed(t0, t1);
    uint64_t   secs    = ee_ratio_secs(elapsed);
    uint64_t   frac    = ee_ratio_frac10k(elapsed);
    uint64_t   tput    = ee_ratio_throughput(g_prime_count, elapsed);

    printf("Done in %" PRIu64 ".%04" PRIu64 " s\n\n", secs, frac);

    if (gate_x_count)
        printf("[Gate-X] %" PRIu32 " prime(s) excluded\n\n", gate_x_count);

    printf("First %" PRIu32 " primes by category:\n", TRACK_FIRST_N_PRIMES);
    for (uint8_t cat = 1u; cat < MAX_CATEGORIES; cat++) {
        if (!g_cat_primes[cat].count) continue;
        printf("Category %u:\n", cat);
        for (uint32_t i = 0u; i < g_cat_primes[cat].count; i++) {
            printf("%4" PRIu64, g_cat_primes[cat].primes[i]);
            if ((i + 1u) % 15u == 0u || i == g_cat_primes[cat].count - 1u)
                printf("\n");
            else
                printf(" ");
        }
        printf("\n");
    }

    printf("Erdos-Selfridge categorization:\n");
    for (uint8_t cat = 1u; cat < MAX_CATEGORIES; cat++) {
        if (!g_stats[cat].count) continue;
        printf("  Cat %2u: first=%7" PRIu64
        "  last=%8" PRIu64 "  count=%" PRIu32 "\n",
        cat, g_stats[cat].smallest,
        g_stats[cat].largest, g_stats[cat].count);
    }

    printf("\nPerformance (ee_ratio_t -- no IEEE 754):\n");
    printf("  Elapsed:    %" PRIu64 ".%04" PRIu64 " s\n",   secs, frac);
    printf("  Throughput: %" PRIu64 " primes/s\n", tput);

    return true;
}

/* =================================================================
 * MAIN
 * ================================================================= */

int main(void)
{
    printf("Erdos-Selfridge Prime Categorization Engine v3.4\n");
    printf("C23 / RAMstore + u16-SPF + THP + prefetch\n");
    printf("================================================\n\n");

    printf("Sieving %" PRIu32 " primes...\n", MAX_PRIMES);
    if (!sieve_generate()) {
        fprintf(stderr, "sieve_generate: failed\n");
        return 1;
    }
    printf("Sieve: %" PRIu32 " primes\n\n", g_prime_count);

    printf("Building SPF table (%" PRIu32 " entries, 32MB u16, THP)...\n",
           SIEVE_LIMIT);
    if (!spf_build()) {
        fprintf(stderr, "spf_build: mmap failed\n");
        return 1;
    }
    printf("SPF: ready\n\n");

    printf("Building RAMstore (%" PRIu32 " slots, 32MB)...\n", HASH_SIZE);
    rs_build();
    printf("RAMstore: ready\n\n");

    if (!run()) {
        fprintf(stderr, "run: failed\n");
        munmap(g_spf, (size_t)(SIEVE_LIMIT + 1u) * sizeof(uint16_t));
        return 1;
    }

    munmap(g_spf, (size_t)(SIEVE_LIMIT + 1u) * sizeof(uint16_t));
    return 0;
}
No results found