// Compile with:
// clang -O3 -march=znver4 -fopenmp -ffast-math -mavx512f -mavx512dq amd_zen_matrix.c -o amd_zen_matrix
// Note: Use -march=znver5 if available in your compiler (GCC 14+ / Clang 18+).
// If not, -march=znver4 is safe and highly effective for Ryzen 9000.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <time.h>
#include <immintrin.h> // Essential for AVX-512
#include <omp.h> // Essential for multicore scaling
// --- Architecture Constants ---
#define BATCH_SIZE 8 // AVX-512 holds 8 doubles
#define ALIGNMENT 64 // Cache line size (Zen 5)
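// Compile-time sanity check (added note, grounded in the definitions above):
// one __m512d must hold exactly BATCH_SIZE doubles.
_Static_assert(sizeof(__m512d) == BATCH_SIZE * sizeof(double),
               "BATCH_SIZE must match the AVX-512 register width");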
// --- Helper: Aligned Memory Allocation ---
void* aligned_alloc_wrapper(size_t alignment, size_t size) {
#if defined(_MSC_VER)
    return _aligned_malloc(size, alignment);
#else
    // C11 aligned_alloc requires size to be a multiple of alignment; round up.
    size_t rounded = (size + alignment - 1) & ~(alignment - 1);
    return aligned_alloc(alignment, rounded);
#endif
}

void aligned_free_wrapper(void* ptr) {
#if defined(_MSC_VER)
    _aligned_free(ptr);
#else
    free(ptr);
#endif
}
// --- Original Scalar Implementation (Optimized Attributes) ---
// Using 'static inline' to encourage inlining into tight loops
static inline void mat33_mul_rank23_scalar(double *restrict C, const double *restrict A, const double *restrict B) {
    const double a11 = A[0], a12 = A[1], a13 = A[2];
    const double a21 = A[3], a22 = A[4], a23 = A[5];
    const double a31 = A[6], a32 = A[7], a33 = A[8];
    const double b11 = B[0], b12 = B[1], b13 = B[2];
    const double b21 = B[3], b22 = B[4], b23 = B[5];
    const double b31 = B[6], b32 = B[7], b33 = B[8];
    const double u1 = a31 + a33;
    const double u2 = a21 + a22;
    const double u3 = a13 + u1;
    const double u4 = a32 - u2;
    const double v1 = b22 + b32;
    const double v2 = b31 - v1;
    const double v3 = b12 + v2;
    const double v4 = b11 - v3;
    const double v5 = b33 + v1;
    const double v6 = b21 - b23;
    const double v7 = b12 - v5;
    const double v8 = v4 - v6;
    const double m1 = u1 * v5;
    const double m2 = u2 * (v2 + v6);
    const double m3 = a32 * b23;
    const double m4 = a31 * (b13 + v7);
    const double m5 = u3 * v7;
    const double m6 = (a32 - a33) * b22;
    const double m7 = a23 * b33;
    const double m8 = (u1 - u2) * v2;
    const double m9 = (a12 - a13) * b22;
    const double m10 = (u3 - a21) * v3;
    const double m11 = (a13 + a33) * (v1 - b12);
    const double m12 = a13 * b33;
    const double m13 = (a31 + u4) * v4;
    const double m14 = a11 * b13;
    const double m15 = (a11 + u3) * b12;
    const double m16 = (a13 + a23) * b31;
    const double m17 = (a22 - a32) * (v4 - b21);
    const double m18 = (a11 + a21) * b11;
    const double m19 = a12 * b23;
    const double m20 = (a12 + a22) * b21;
    const double m21 = a21 * (b13 - v8);
    const double m22 = (a22 - a23) * (b31 - b32);
    const double m23 = u4 * v8;
    const double w1 = m5 + m12;
    const double w2 = m1 + w1;
    const double w3 = m8 + w2;
    const double w4 = m3 - m23;
    const double w5 = m2 + w4;
    const double w6 = w3 + w5;
    const double w7 = m10 - w6;
    const double w8 = m17 + w7;
    C[0] = m18 + m20 + w8;
    C[1] = m9 + m15 - w2;
    C[2] = m12 + m14 + m19;
    C[3] = m16 - w8;
    C[4] = m16 + m22 + w3 - m10;
    C[5] = m7 + m21 + w4 - m17;
    C[6] = m11 + m13 + w6;
    C[7] = m6 + m11 + w2;
    C[8] = m3 + m4 - m11 - w1;
}
// --- Zen 5 Optimized: AVX-512 Batched Perminov ---
// Processes 8 independent 3x3 matrix multiplications simultaneously.
//
// Input layout: "Structure of Arrays" (SoA). Each batch of 8 matrices is
// stored as 9 __m512d vectors, where vector k holds element k from all 8
// matrices. For example:
//   vA[0] = [a11_mat0, a11_mat1, ..., a11_mat7]
//   vA[1] = [a12_mat0, a12_mat1, ..., a12_mat7]
//
// Standard array-of-structs data could instead be loaded with AVX-512
// gathers (slower, but drop-in compatible). To demonstrate maximum Zen 5
// throughput, this benchmark pre-transposes the data once in main() so the
// hot loop runs on fast contiguous loads.
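//
// Worked example of the SoA indexing (this matches the AoS->SoA transpose
// performed in main): within block b of 8 matrices, element k of matrix j
// lives at soa[b*72 + k*8 + j]. For instance, a11 (k = 0) of matrix 5 in
// block 2 sits at soa[2*72 + 0*8 + 5] = soa[149].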
static inline void mat33_mul_rank23_avx512_soa(
    __m512d *restrict vC,
    const __m512d *restrict vA,
    const __m512d *restrict vB
) {
    // Inputs use the SoA layout described above:
    // vA[0] = [a11_0, a11_1, ..., a11_7], vA[1] = [a12_0, ..., a12_7], etc.
    // U terms
    __m512d u1 = _mm512_add_pd(vA[6], vA[8]); // a31 + a33
    __m512d u2 = _mm512_add_pd(vA[3], vA[4]); // a21 + a22
    __m512d u3 = _mm512_add_pd(vA[2], u1);    // a13 + u1
    __m512d u4 = _mm512_sub_pd(vA[7], u2);    // a32 - u2
    // V terms
    __m512d v1 = _mm512_add_pd(vB[4], vB[7]); // b22 + b32
    __m512d v2 = _mm512_sub_pd(vB[6], v1);    // b31 - v1
    __m512d v3 = _mm512_add_pd(vB[1], v2);    // b12 + v2
    __m512d v4 = _mm512_sub_pd(vB[0], v3);    // b11 - v3
    __m512d v5 = _mm512_add_pd(vB[8], v1);    // b33 + v1
    __m512d v6 = _mm512_sub_pd(vB[3], vB[5]); // b21 - b23
    __m512d v7 = _mm512_sub_pd(vB[1], v5);    // b12 - v5
    __m512d v8 = _mm512_sub_pd(v4, v6);       // v4 - v6
    // M terms (multiplications)
    __m512d m1 = _mm512_mul_pd(u1, v5);
    __m512d m2 = _mm512_mul_pd(u2, _mm512_add_pd(v2, v6));
    __m512d m3 = _mm512_mul_pd(vA[7], vB[5]); // a32 * b23
    __m512d m4 = _mm512_mul_pd(vA[6], _mm512_add_pd(vB[2], v7));
    __m512d m5 = _mm512_mul_pd(u3, v7);
    __m512d m6 = _mm512_mul_pd(_mm512_sub_pd(vA[7], vA[8]), vB[4]);
    __m512d m7 = _mm512_mul_pd(vA[5], vB[8]);
    __m512d m8 = _mm512_mul_pd(_mm512_sub_pd(u1, u2), v2);
    __m512d m9 = _mm512_mul_pd(_mm512_sub_pd(vA[1], vA[2]), vB[4]);
    __m512d m10 = _mm512_mul_pd(_mm512_sub_pd(u3, vA[3]), v3);
    __m512d m11 = _mm512_mul_pd(_mm512_add_pd(vA[2], vA[8]), _mm512_sub_pd(v1, vB[1]));
    __m512d m12 = _mm512_mul_pd(vA[2], vB[8]);
    __m512d m13 = _mm512_mul_pd(_mm512_add_pd(vA[6], u4), v4);
    __m512d m14 = _mm512_mul_pd(vA[0], vB[2]);
    __m512d m15 = _mm512_mul_pd(_mm512_add_pd(vA[0], u3), vB[1]);
    __m512d m16 = _mm512_mul_pd(_mm512_add_pd(vA[2], vA[5]), vB[6]);
    __m512d m17 = _mm512_mul_pd(_mm512_sub_pd(vA[4], vA[7]), _mm512_sub_pd(v4, vB[3]));
    __m512d m18 = _mm512_mul_pd(_mm512_add_pd(vA[0], vA[3]), vB[0]);
    __m512d m19 = _mm512_mul_pd(vA[1], vB[5]);
    __m512d m20 = _mm512_mul_pd(_mm512_add_pd(vA[1], vA[4]), vB[3]);
    __m512d m21 = _mm512_mul_pd(vA[3], _mm512_sub_pd(vB[2], v8));
    __m512d m22 = _mm512_mul_pd(_mm512_sub_pd(vA[4], vA[5]), _mm512_sub_pd(vB[6], vB[7]));
    __m512d m23 = _mm512_mul_pd(u4, v8);
    // W terms (post-computations)
    __m512d w1 = _mm512_add_pd(m5, m12);
    __m512d w2 = _mm512_add_pd(m1, w1);
    __m512d w3 = _mm512_add_pd(m8, w2);
    __m512d w4 = _mm512_sub_pd(m3, m23);
    __m512d w5 = _mm512_add_pd(m2, w4);
    __m512d w6 = _mm512_add_pd(w3, w5);
    __m512d w7 = _mm512_sub_pd(m10, w6);
    __m512d w8 = _mm512_add_pd(m17, w7);
    // C terms (result construction)
    vC[0] = _mm512_add_pd(_mm512_add_pd(m18, m20), w8);
    vC[1] = _mm512_sub_pd(_mm512_add_pd(m9, m15), w2);
    vC[2] = _mm512_add_pd(_mm512_add_pd(m12, m14), m19);
    vC[3] = _mm512_sub_pd(m16, w8);
    vC[4] = _mm512_sub_pd(_mm512_add_pd(_mm512_add_pd(m16, m22), w3), m10);
    vC[5] = _mm512_sub_pd(_mm512_add_pd(_mm512_add_pd(m7, m21), w4), m17);
    vC[6] = _mm512_add_pd(_mm512_add_pd(m11, m13), w6);
    vC[7] = _mm512_add_pd(_mm512_add_pd(m6, m11), w2);
    vC[8] = _mm512_sub_pd(_mm512_sub_pd(_mm512_add_pd(m3, m4), m11), w1);
}
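
// A minimal sketch (hypothetical helper, not used by the benchmark below) of
// how the same kernel could consume standard array-of-structs data via
// AVX-512 gathers, as mentioned above. Gathers are microcoded on Zen 4/5,
// so this path is expected to be considerably slower than the SoA loads.
static inline void load_batch_aos_gather(__m512d *restrict v, const double *restrict aos) {
    // Lane i reads from matrix i, whose elements start at aos[i * 9].
    const __m512i lane_offsets = _mm512_setr_epi64(0, 9, 18, 27, 36, 45, 54, 63);
    for (int k = 0; k < 9; k++) {
        // Gather element k of matrices 0..7: lane i loads aos[i*9 + k].
        v[k] = _mm512_i64gather_pd(lane_offsets, aos + k, 8); // scale 8 = sizeof(double)
    }
}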
// --- Utils ---
static inline double now_seconds(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}

void fill_random(double *M, size_t count) {
    for (size_t i = 0; i < count; i++) {
        M[i] = ((double)rand() / (double)RAND_MAX) * 2.0 - 1.0;
    }
}
// --- Driver ---
int main(int argc, char **argv) {
    // Process a large number of matrices to test throughput. The 9900X L3
    // cache is large, but this working set streams through memory, so we
    // measure sustained throughput rather than cache residency.
    const int N_MATRICES = 1024 * 1024 * 10; // ~10.5 million matrices
    const size_t elements = (size_t)N_MATRICES * 9;
    const size_t bytes = elements * sizeof(double);
    printf("Allocating %.2f MB for matrices (AoS + SoA copies)...\n",
           (double)bytes * 6 / (1024 * 1024));
    double *A_store = (double*)aligned_alloc_wrapper(ALIGNMENT, bytes);
    double *B_store = (double*)aligned_alloc_wrapper(ALIGNMENT, bytes);
    double *C_store = (double*)aligned_alloc_wrapper(ALIGNMENT, bytes);
    // For the AVX-512 SoA test, we need transposed storage.
    double *A_soa = (double*)aligned_alloc_wrapper(ALIGNMENT, bytes);
    double *B_soa = (double*)aligned_alloc_wrapper(ALIGNMENT, bytes);
    double *C_soa = (double*)aligned_alloc_wrapper(ALIGNMENT, bytes);
    if (!A_store || !B_store || !C_store || !A_soa || !B_soa || !C_soa) {
        printf("Memory allocation failed.\n");
        return 1;
    }
    fill_random(A_store, elements);
    fill_random(B_store, elements);

    // --- Prepare SoA Data ---
    // Transform Array of Structs (standard 3x3 layout) into Structure of
    // Arrays (batches of 8 matrices where the a11 elements are contiguous).
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < N_MATRICES; i += BATCH_SIZE) {
        for (int k = 0; k < 9; k++) {
            for (int j = 0; j < BATCH_SIZE; j++) {
                if (i + j < N_MATRICES) {
                    A_soa[(i * 9) + (k * BATCH_SIZE) + j] = A_store[(i + j) * 9 + k];
                    B_soa[(i * 9) + (k * BATCH_SIZE) + j] = B_store[(i + j) * 9 + k];
                }
            }
        }
    }
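    // Note: the AoS->SoA transpose above is excluded from the timed sections
    // below. A real workload would either keep its data in SoA form or have
    // to amortize this one-time repacking cost against many multiplications.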
printf("Starting Benchmarks on Ryzen 9 9900X (Zen 5)...\n");
// --- Benchmark 1: Standard Scalar (Single Thread) ---
double t0 = now_seconds();
for (int i = 0; i < N_MATRICES; i++) {
mat33_mul_rank23_scalar(&C_store[i*9], &A_store[i*9], &B_store[i*9]);
}
double t1 = now_seconds();
printf("Scalar (1 Core): %.4f s | %.2f M Matrices/s\n",
t1 - t0, N_MATRICES / (t1 - t0) / 1e6);
// --- Benchmark 2: Standard Scalar (OpenMP Multi-Core) ---
// Zen 5 scales well with SMT, but for pure FPU throughput, physical cores matter most.
t0 = now_seconds();
#pragma omp parallel for schedule(static)
for (int i = 0; i < N_MATRICES; i++) {
mat33_mul_rank23_scalar(&C_store[i*9], &A_store[i*9], &B_store[i*9]);
}
t1 = now_seconds();
printf("Scalar (OpenMP): %.4f s | %.2f M Matrices/s\n",
t1 - t0, N_MATRICES / (t1 - t0) / 1e6);
// --- Benchmark 3: AVX-512 SoA (Single Thread) ---
t0 = now_seconds();
// Pointers to __m512d types for easy indexing
__m512d *vA_ptr = (__m512d*)A_soa;
__m512d *vB_ptr = (__m512d*)B_soa;
__m512d *vC_ptr = (__m512d*)C_soa;
// 9 vectors make up one "block" of 8 matrices
int num_blocks = N_MATRICES / BATCH_SIZE;
for (int i = 0; i < num_blocks; i++) {
mat33_mul_rank23_avx512_soa(&vC_ptr[i*9], &vA_ptr[i*9], &vB_ptr[i*9]);
}
t1 = now_seconds();
printf("AVX-512 (1 Core): %.4f s | %.2f M Matrices/s\n",
t1 - t0, N_MATRICES / (t1 - t0) / 1e6);
// --- Benchmark 4: AVX-512 SoA (OpenMP Multi-Core) ---
// This combines 512-bit width with 12 cores.
t0 = now_seconds();
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_blocks; i++) {
mat33_mul_rank23_avx512_soa(&vC_ptr[i*9], &vA_ptr[i*9], &vB_ptr[i*9]);
}
t1 = now_seconds();
printf("AVX-512 (OpenMP): %.4f s | %.2f M Matrices/s\n",
t1 - t0, N_MATRICES / (t1 - t0) / 1e6);
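
    // Sanity check (added sketch, not part of the original benchmark):
    // de-interleave the first SoA block and compare it against the scalar
    // results still held in C_store. The difference should be ~0; tiny
    // nonzero values can appear under -ffast-math reassociation.
    double max_err = 0.0;
    for (int j = 0; j < BATCH_SIZE; j++) {
        for (int k = 0; k < 9; k++) {
            double d = C_soa[k * BATCH_SIZE + j] - C_store[j * 9 + k];
            if (d < 0.0) d = -d;
            if (d > max_err) max_err = d;
        }
    }
    printf("Max |scalar - AVX-512| over first block: %.3e\n", max_err);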
    // --- Cleanup ---
    aligned_free_wrapper(A_store);
    aligned_free_wrapper(B_store);
    aligned_free_wrapper(C_store);
    aligned_free_wrapper(A_soa);
    aligned_free_wrapper(B_soa);
    aligned_free_wrapper(C_soa);
    return 0;
}
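
// ============================================================
// Second file in this gist: fast_matrix_mult.c
// ============================================================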
// clang -O0 -march=native fast_matrix_mult.c -o fast_matrix_mult
// clang -O3 -ffast-math -march=native fast_matrix_mult.c -o fast_matrix_mult
// clang -O3 -ffast-math -march=native -funroll-loops -fomit-frame-pointer -DNDEBUG fast_matrix_mult.c -o fast_matrix_mult
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <time.h>
/**
 * Perminov's Rank-23, 58-Addition Scheme for 3x3 Matrix Multiplication.
 *
 * Target: general non-commutative rings (implemented here for double).
 *
 * Optimization notes:
 *  - 'restrict' for pointer-aliasing optimization.
 *  - 'const' for SSA-friendly intermediate variable generation.
 *  - Stack-allocated intermediates to avoid heap overhead.
 *
 * Addition count: 12 (pre-computations) + 18 (inside products)
 *                 + 8 (post-computations) + 20 (result assembly) = 58.
 *
 * Matrix layout: row-major.
 * Indices: 0:11, 1:12, 2:13, 3:21, 4:22, 5:23, 6:31, 7:32, 8:33
 */
void mat33_mul_rank23(double *restrict C, const double *restrict A, const double *restrict B) {
    // --- Input Aliases for Readability (the compiler optimizes these out) ---
    const double a11 = A[0], a12 = A[1], a13 = A[2];
    const double a21 = A[3], a22 = A[4], a23 = A[5];
    const double a31 = A[6], a32 = A[7], a33 = A[8];
    const double b11 = B[0], b12 = B[1], b13 = B[2];
    const double b21 = B[3], b22 = B[4], b23 = B[5];
    const double b31 = B[6], b32 = B[7], b33 = B[8];
    // --- Phase 1: Pre-computations (U and V terms) ---
    // Total additions: 4 (U) + 8 (V) = 12
    // U terms (linear combinations of A)
    const double u1 = a31 + a33;
    const double u2 = a21 + a22;
    const double u3 = a13 + u1; // depends on u1
    const double u4 = a32 - u2; // depends on u2
    // V terms (linear combinations of B)
    const double v1 = b22 + b32;
    const double v2 = b31 - v1; // depends on v1
    const double v3 = b12 + v2; // depends on v2
    const double v4 = b11 - v3; // depends on v3
    const double v5 = b33 + v1; // depends on v1
    const double v6 = b21 - b23;
    const double v7 = b12 - v5; // depends on v5
    const double v8 = v4 - v6;  // depends on v4, v6
    // --- Phase 2: Multiplications (M terms) ---
    // 23 multiplications, 18 additions inside the products
    const double m1 = u1 * v5;
    const double m2 = u2 * (v2 + v6);
    const double m3 = a32 * b23;
    const double m4 = a31 * (b13 + v7);
    const double m5 = u3 * v7;
    const double m6 = (a32 - a33) * b22;
    const double m7 = a23 * b33;
    const double m8 = (u1 - u2) * v2;
    const double m9 = (a12 - a13) * b22;
    const double m10 = (u3 - a21) * v3;
    const double m11 = (a13 + a33) * (v1 - b12);
    const double m12 = a13 * b33;
    const double m13 = (a31 + u4) * v4;
    const double m14 = a11 * b13;
    const double m15 = (a11 + u3) * b12;
    const double m16 = (a13 + a23) * b31;
    const double m17 = (a22 - a32) * (v4 - b21);
    const double m18 = (a11 + a21) * b11;
    const double m19 = a12 * b23;
    const double m20 = (a12 + a22) * b21;
    const double m21 = a21 * (b13 - v8);
    const double m22 = (a22 - a23) * (b31 - b32);
    const double m23 = u4 * v8;
    // --- Phase 3: Post-computations (W terms) ---
    // Total additions: 8
    const double w1 = m5 + m12;
    const double w2 = m1 + w1;
    const double w3 = m8 + w2;
    const double w4 = m3 - m23;
    const double w5 = m2 + w4;
    const double w6 = w3 + w5;
    const double w7 = m10 - w6;
    const double w8 = m17 + w7;
    // --- Phase 4: Construct Result (C terms) ---
    // Total additions: 20
    // Row 1
    C[0] = m18 + m20 + w8;       // c11
    C[1] = m9 + m15 - w2;        // c12
    C[2] = m12 + m14 + m19;      // c13
    // Row 2
    C[3] = m16 - w8;             // c21
    C[4] = m16 + m22 + w3 - m10; // c22
    C[5] = m7 + m21 + w4 - m17;  // c23
    // Row 3
    C[6] = m11 + m13 + w6;       // c31
    C[7] = m6 + m11 + w2;        // c32
    C[8] = m3 + m4 - m11 - w1;   // c33
}
// --- Verification/Usage Driver ---
void print_mat33(const char *name, const double *M) {
    printf("%s:\n", name);
    for (int i = 0; i < 3; i++) {
        printf(" %8.2f %8.2f %8.2f\n", M[i*3+0], M[i*3+1], M[i*3+2]);
    }
    printf("\n");
}

static inline double now_seconds(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}
void format_uint64_commas(uint64_t value, char *out, size_t out_size) {
    char buf[32];
    int i = 0;
    int group = 0;
    // Emit digits least-significant first, inserting a comma every 3 digits.
    do {
        if (group == 3) {
            buf[i++] = ',';
            group = 0;
        }
        buf[i++] = (char)('0' + (value % 10));
        value /= 10;
        group++;
    } while (value > 0 && i < (int)sizeof(buf) - 1);
    if (i >= (int)sizeof(buf)) {
        if (out_size > 0) {
            out[0] = '\0';
        }
        return;
    }
    // Reverse into the output buffer.
    int j = 0;
    while (i > 0 && j < (int)out_size - 1) {
        out[j++] = buf[--i];
    }
    out[j] = '\0';
}
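// Example: format_uint64_commas(10000000, buf, sizeof(buf)) yields "10,000,000".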
void fill_mat33_random(double *M) {
    for (int i = 0; i < 9; i++) {
        double r = (double)rand() / (double)RAND_MAX;
        M[i] = r * 2.0 - 1.0; // uniform in [-1, 1]
    }
}
void mat33_mul_naive(double *restrict C, const double *restrict A, const double *restrict B) {
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            double acc = 0.0;
            for (int k = 0; k < 3; k++) {
                acc += A[i*3+k] * B[k*3+j];
            }
            C[i*3+j] = acc;
        }
    }
}
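
// Why the naive kernel can still win for scalar doubles: it needs 27
// multiplies and 18 additions, which modern FMA units execute as roughly 27
// fused ops, while the rank-23 scheme trades 4 multiplies for 40 extra
// additions. The rank-23 scheme pays off when multiplication is much more
// expensive than addition, e.g. block matrices over non-commutative rings.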
static inline double sum_mat33(const double *M) {
    return M[0] + M[1] + M[2] +
           M[3] + M[4] + M[5] +
           M[6] + M[7] + M[8];
}
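
// A minimal sketch (hypothetical helper, not in the original): elementwise
// comparison is a stricter verification than the summed checksums used in
// main(), since cancellation in a sum can mask per-entry errors.
static inline double max_abs_diff_mat33(const double *X, const double *Y) {
    double m = 0.0;
    for (int i = 0; i < 9; i++) {
        double d = X[i] - Y[i];
        if (d < 0.0) d = -d;
        if (d > m) m = d;
    }
    return m;
}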
int main(int argc, char **argv) {
    // Example matrices
    double A[9] = {0};
    double B[9] = {0};
    double C[9] = {0}; // Result
    double C_check[9] = {0};
    srand((unsigned)time(NULL));
    fill_mat33_random(A);
    fill_mat33_random(B);
    // Execute Perminov's algorithm
    mat33_mul_rank23(C, A, B);
    // Naive check (for verification logic only)
    mat33_mul_naive(C_check, A, B);
    print_mat33("Matrix A", A);
    print_mat33("Matrix B", B);
    print_mat33("Result C (Perminov Rank-23)", C);
    print_mat33("Result Check (Naive)", C_check);
    // --- Timing comparison ---
    uint64_t iterations = 10000000ULL;
    if (argc > 1) {
        char *end = NULL;
        unsigned long long parsed = strtoull(argv[1], &end, 10);
        if (end != argv[1] && parsed > 0) {
            iterations = (uint64_t)parsed;
        }
    }
    const uint64_t warmup = 100000ULL;
    volatile double sink = 0.0; // defeats dead-code elimination of the loops
    double checksum_rank23 = 0.0;
    double checksum_naive = 0.0;
    double t0, t1;
    char iters_str[32];
    format_uint64_commas(iterations, iters_str, sizeof(iters_str));
    printf("Iterations: %s (warmup: %llu)\n",
           iters_str[0] ? iters_str : "N/A",
           (unsigned long long)warmup);

    for (uint64_t it = 0; it < warmup; it++) {
        mat33_mul_rank23(C, A, B);
        sink += C[it % 9];
    }
    t0 = now_seconds();
    for (uint64_t it = 0; it < iterations; it++) {
        mat33_mul_rank23(C, A, B);
        sink += C[it % 9];
        checksum_rank23 += sum_mat33(C);
    }
    t1 = now_seconds();
    printf("Perminov Rank-23: %.6f s (sink=%.2f)\n", t1 - t0, sink);
    double time_rank23 = t1 - t0;

    for (uint64_t it = 0; it < warmup; it++) {
        mat33_mul_naive(C, A, B);
        sink += C[(it + 3) % 9];
    }
    t0 = now_seconds();
    for (uint64_t it = 0; it < iterations; it++) {
        mat33_mul_naive(C, A, B);
        sink += C[(it + 3) % 9];
        checksum_naive += sum_mat33(C);
    }
    t1 = now_seconds();
    printf("Naive O(N^3): %.6f s (sink=%.2f)\n", t1 - t0, sink);
    double time_naive = t1 - t0;

    printf("Checksums: rank23=%.6f naive=%.6f diff=%.6f\n",
           checksum_rank23, checksum_naive, checksum_rank23 - checksum_naive);
    if (time_rank23 > 0.0 && time_naive > 0.0) {
        double speedup = time_naive / time_rank23;
        printf("Speedup: %.2fx (naive / rank23)\n", speedup);
    }
    if (time_rank23 < time_naive) {
        printf("Winner: Perminov Rank-23\n");
    } else if (time_naive < time_rank23) {
        printf("Winner: Naive O(N^3)\n");
    } else {
        printf("Winner: Tie\n");
    }
    return 0;
}