// Compile with:
//   clang -O3 -march=znver4 -fopenmp -ffast-math -mavx512f -mavx512dq amd_zen_matrix.c -o amd_zen_matrix
// Note: Use -march=znver5 if available in your compiler (GCC 14+ / Clang 18+).
// If not, -march=znver4 is safe and highly effective for Ryzen 9000.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <time.h>
#include <immintrin.h>  // Essential for AVX-512
#include <omp.h>        // Essential for multicore scaling

// --- Architecture Constants ---
#define BATCH_SIZE 8   // AVX-512 holds 8 doubles
#define ALIGNMENT  64  // Cache line size (Zen 5)
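// Optional compile-time sanity check (illustrative, not required by the build):
// the SoA batching below relies on one 512-bit register holding exactly
// BATCH_SIZE doubles.
_Static_assert(sizeof(__m512d) == BATCH_SIZE * sizeof(double),
               "one AVX-512 register holds BATCH_SIZE doubles");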
// --- Helper: Aligned Memory Allocation ---
// Note: C11 aligned_alloc() requires 'size' to be a multiple of 'alignment';
// the allocations below satisfy this (9 doubles per matrix, matrix count a multiple of 8).
void* aligned_alloc_wrapper(size_t alignment, size_t size) {
#if defined(_MSC_VER)
    return _aligned_malloc(size, alignment);
#else
    return aligned_alloc(alignment, size);
#endif
}

void aligned_free_wrapper(void* ptr) {
#if defined(_MSC_VER)
    _aligned_free(ptr);
#else
    free(ptr);
#endif
}
// --- Original Scalar Implementation (Optimized Attributes) ---
// Using 'static inline' to encourage inlining into tight loops
static inline void mat33_mul_rank23_scalar(double *restrict C, const double *restrict A, const double *restrict B) {
    const double a11 = A[0], a12 = A[1], a13 = A[2];
    const double a21 = A[3], a22 = A[4], a23 = A[5];
    const double a31 = A[6], a32 = A[7], a33 = A[8];

    const double b11 = B[0], b12 = B[1], b13 = B[2];
    const double b21 = B[3], b22 = B[4], b23 = B[5];
    const double b31 = B[6], b32 = B[7], b33 = B[8];

    const double u1 = a31 + a33;
    const double u2 = a21 + a22;
    const double u3 = a13 + u1;
    const double u4 = a32 - u2;

    const double v1 = b22 + b32;
    const double v2 = b31 - v1;
    const double v3 = b12 + v2;
    const double v4 = b11 - v3;
    const double v5 = b33 + v1;
    const double v6 = b21 - b23;
    const double v7 = b12 - v5;
    const double v8 = v4 - v6;

    const double m1  = u1 * v5;
    const double m2  = u2 * (v2 + v6);
    const double m3  = a32 * b23;
    const double m4  = a31 * (b13 + v7);
    const double m5  = u3 * v7;
    const double m6  = (a32 - a33) * b22;
    const double m7  = a23 * b33;
    const double m8  = (u1 - u2) * v2;
    const double m9  = (a12 - a13) * b22;
    const double m10 = (u3 - a21) * v3;
    const double m11 = (a13 + a33) * (v1 - b12);
    const double m12 = a13 * b33;
    const double m13 = (a31 + u4) * v4;
    const double m14 = a11 * b13;
    const double m15 = (a11 + u3) * b12;
    const double m16 = (a13 + a23) * b31;
    const double m17 = (a22 - a32) * (v4 - b21);
    const double m18 = (a11 + a21) * b11;
    const double m19 = a12 * b23;
    const double m20 = (a12 + a22) * b21;
    const double m21 = a21 * (b13 - v8);
    const double m22 = (a22 - a23) * (b31 - b32);
    const double m23 = u4 * v8;

    const double w1 = m5 + m12;
    const double w2 = m1 + w1;
    const double w3 = m8 + w2;
    const double w4 = m3 - m23;
    const double w5 = m2 + w4;
    const double w6 = w3 + w5;
    const double w7 = m10 - w6;
    const double w8 = m17 + w7;

    C[0] = m18 + m20 + w8;
    C[1] = m9 + m15 - w2;
    C[2] = m12 + m14 + m19;
    C[3] = m16 - w8;
    C[4] = m16 + m22 + w3 - m10;
    C[5] = m7 + m21 + w4 - m17;
    C[6] = m11 + m13 + w6;
    C[7] = m6 + m11 + w2;
    C[8] = m3 + m4 - m11 - w1;
}
// --- Zen 5 Optimized: AVX-512 Batched Perminov ---
// This processes 8 independent 3x3 matrix multiplications simultaneously.
//
// Input layout: this kernel assumes a "Structure of Arrays" (SoA) layout for
// each batch of 8 matrices. vA[0] holds element a11 of matrices 0..7,
// vA[1] holds a12 of matrices 0..7, and so on; the same applies to vB and vC.
//
// Standard array-of-structs data must therefore be repacked (or gathered)
// before calling this kernel. main() below performs that repacking once up
// front, and a stand-alone packing sketch follows this function.
static inline void mat33_mul_rank23_avx512_soa(
    __m512d *restrict vC,
    const __m512d *restrict vA,
    const __m512d *restrict vB
) {
    // --- Input Loading (Assumes SoA Layout) ---
    // vA[0] = [a11_0, a11_1, ..., a11_7]
    // vA[1] = [a12_0, a12_1, ..., a12_7]
    // ...

    // U terms
    __m512d u1 = _mm512_add_pd(vA[6], vA[8]);  // a31 + a33
    __m512d u2 = _mm512_add_pd(vA[3], vA[4]);  // a21 + a22
    __m512d u3 = _mm512_add_pd(vA[2], u1);     // a13 + u1
    __m512d u4 = _mm512_sub_pd(vA[7], u2);     // a32 - u2

    // V terms
    __m512d v1 = _mm512_add_pd(vB[4], vB[7]);  // b22 + b32
    __m512d v2 = _mm512_sub_pd(vB[6], v1);     // b31 - v1
    __m512d v3 = _mm512_add_pd(vB[1], v2);     // b12 + v2
    __m512d v4 = _mm512_sub_pd(vB[0], v3);     // b11 - v3
    __m512d v5 = _mm512_add_pd(vB[8], v1);     // b33 + v1
    __m512d v6 = _mm512_sub_pd(vB[3], vB[5]);  // b21 - b23
    __m512d v7 = _mm512_sub_pd(vB[1], v5);     // b12 - v5
    __m512d v8 = _mm512_sub_pd(v4, v6);        // v4 - v6

    // M terms (Multiplications)
    __m512d m1  = _mm512_mul_pd(u1, v5);
    __m512d m2  = _mm512_mul_pd(u2, _mm512_add_pd(v2, v6));
    __m512d m3  = _mm512_mul_pd(vA[7], vB[5]);  // a32 * b23
    __m512d m4  = _mm512_mul_pd(vA[6], _mm512_add_pd(vB[2], v7));
    __m512d m5  = _mm512_mul_pd(u3, v7);
    __m512d m6  = _mm512_mul_pd(_mm512_sub_pd(vA[7], vA[8]), vB[4]);
    __m512d m7  = _mm512_mul_pd(vA[5], vB[8]);
    __m512d m8  = _mm512_mul_pd(_mm512_sub_pd(u1, u2), v2);
    __m512d m9  = _mm512_mul_pd(_mm512_sub_pd(vA[1], vA[2]), vB[4]);
    __m512d m10 = _mm512_mul_pd(_mm512_sub_pd(u3, vA[3]), v3);
    __m512d m11 = _mm512_mul_pd(_mm512_add_pd(vA[2], vA[8]), _mm512_sub_pd(v1, vB[1]));
    __m512d m12 = _mm512_mul_pd(vA[2], vB[8]);
    __m512d m13 = _mm512_mul_pd(_mm512_add_pd(vA[6], u4), v4);
    __m512d m14 = _mm512_mul_pd(vA[0], vB[2]);
    __m512d m15 = _mm512_mul_pd(_mm512_add_pd(vA[0], u3), vB[1]);
    __m512d m16 = _mm512_mul_pd(_mm512_add_pd(vA[2], vA[5]), vB[6]);
    __m512d m17 = _mm512_mul_pd(_mm512_sub_pd(vA[4], vA[7]), _mm512_sub_pd(v4, vB[3]));
    __m512d m18 = _mm512_mul_pd(_mm512_add_pd(vA[0], vA[3]), vB[0]);
    __m512d m19 = _mm512_mul_pd(vA[1], vB[5]);
    __m512d m20 = _mm512_mul_pd(_mm512_add_pd(vA[1], vA[4]), vB[3]);
    __m512d m21 = _mm512_mul_pd(vA[3], _mm512_sub_pd(vB[2], v8));
    __m512d m22 = _mm512_mul_pd(_mm512_sub_pd(vA[4], vA[5]), _mm512_sub_pd(vB[6], vB[7]));
    __m512d m23 = _mm512_mul_pd(u4, v8);

    // W terms (Post-computations)
    __m512d w1 = _mm512_add_pd(m5, m12);
    __m512d w2 = _mm512_add_pd(m1, w1);
    __m512d w3 = _mm512_add_pd(m8, w2);
    __m512d w4 = _mm512_sub_pd(m3, m23);
    __m512d w5 = _mm512_add_pd(m2, w4);
    __m512d w6 = _mm512_add_pd(w3, w5);
    __m512d w7 = _mm512_sub_pd(m10, w6);
    __m512d w8 = _mm512_add_pd(m17, w7);

    // C terms (Result Construction)
    vC[0] = _mm512_add_pd(_mm512_add_pd(m18, m20), w8);
    vC[1] = _mm512_sub_pd(_mm512_add_pd(m9, m15), w2);
    vC[2] = _mm512_add_pd(_mm512_add_pd(m12, m14), m19);
    vC[3] = _mm512_sub_pd(m16, w8);
    vC[4] = _mm512_sub_pd(_mm512_add_pd(_mm512_add_pd(m16, m22), w3), m10);
    vC[5] = _mm512_sub_pd(_mm512_add_pd(_mm512_add_pd(m7, m21), w4), m17);
    vC[6] = _mm512_add_pd(_mm512_add_pd(m11, m13), w6);
    vC[7] = _mm512_add_pd(_mm512_add_pd(m6, m11), w2);
    vC[8] = _mm512_sub_pd(_mm512_sub_pd(_mm512_add_pd(m3, m4), m11), w1);
}
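// A minimal packing sketch, assuming the SoA layout described above: it copies
// one batch of 8 row-major 3x3 matrices (array-of-structs) into 9 groups of 8
// doubles, so element k of every matrix in the batch lands in vector k.
// The helper name 'pack_batch_aos_to_soa' is illustrative; main() below does
// the same transform inline across all batches.
static inline void pack_batch_aos_to_soa(double *restrict soa,
                                         const double *restrict aos) {
    for (int k = 0; k < 9; k++) {               // element index within a 3x3 matrix
        for (int j = 0; j < BATCH_SIZE; j++) {  // matrix index within the batch
            soa[k * BATCH_SIZE + j] = aos[j * 9 + k];
        }
    }
}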
// --- Utils ---
static inline double now_seconds(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}

void fill_random(double *M, size_t count) {
    for (size_t i = 0; i < count; i++) {
        M[i] = ((double)rand() / (double)RAND_MAX) * 2.0 - 1.0;
    }
}

// --- Driver ---
int main(int argc, char **argv) {
    // Process a large number of matrices to test throughput.
    // Note: the working set (six ~720 MB buffers) far exceeds the 9900X's L3,
    // so these numbers include memory traffic as well as core math.
    const int N_MATRICES = 1024 * 1024 * 10;  // ~10 million matrices
    const size_t elements = (size_t)N_MATRICES * 9;
    const size_t bytes = elements * sizeof(double);

    printf("Allocating %.2f MB for matrices...\n", (double)bytes * 6 / (1024 * 1024));

    double *A_store = (double*)aligned_alloc_wrapper(ALIGNMENT, bytes);
    double *B_store = (double*)aligned_alloc_wrapper(ALIGNMENT, bytes);
    double *C_store = (double*)aligned_alloc_wrapper(ALIGNMENT, bytes);
    // For the AVX-512 SoA test we need repacked (SoA) storage
    double *A_soa = (double*)aligned_alloc_wrapper(ALIGNMENT, bytes);
    double *B_soa = (double*)aligned_alloc_wrapper(ALIGNMENT, bytes);
    double *C_soa = (double*)aligned_alloc_wrapper(ALIGNMENT, bytes);

    if (!A_store || !B_store || !C_store || !A_soa || !B_soa || !C_soa) {
        printf("Memory allocation failed.\n");
        return 1;
    }
    fill_random(A_store, elements);
    fill_random(B_store, elements);

    // --- Prepare SoA Data ---
    // Transform Array of Structs (standard 3x3 layout) to Structure of Arrays
    // (batches of 8 matrices where the a11s are contiguous)
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < N_MATRICES; i += BATCH_SIZE) {
        for (int k = 0; k < 9; k++) {
            for (int j = 0; j < BATCH_SIZE; j++) {
                if (i + j < N_MATRICES) {
                    A_soa[(i * 9) + (k * BATCH_SIZE) + j] = A_store[(i + j) * 9 + k];
                    B_soa[(i * 9) + (k * BATCH_SIZE) + j] = B_store[(i + j) * 9 + k];
                }
            }
        }
    }

    printf("Starting Benchmarks on Ryzen 9 9900X (Zen 5)...\n");

    // --- Benchmark 1: Standard Scalar (Single Thread) ---
    double t0 = now_seconds();
    for (int i = 0; i < N_MATRICES; i++) {
        mat33_mul_rank23_scalar(&C_store[i*9], &A_store[i*9], &B_store[i*9]);
    }
    double t1 = now_seconds();
    printf("Scalar (1 Core): %.4f s | %.2f M Matrices/s\n",
           t1 - t0, N_MATRICES / (t1 - t0) / 1e6);

    // --- Benchmark 2: Standard Scalar (OpenMP Multi-Core) ---
    // Zen 5 scales well with SMT, but for pure FPU throughput, physical cores matter most.
    t0 = now_seconds();
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < N_MATRICES; i++) {
        mat33_mul_rank23_scalar(&C_store[i*9], &A_store[i*9], &B_store[i*9]);
    }
    t1 = now_seconds();
    printf("Scalar (OpenMP): %.4f s | %.2f M Matrices/s\n",
           t1 - t0, N_MATRICES / (t1 - t0) / 1e6);

    // --- Benchmark 3: AVX-512 SoA (Single Thread) ---
    t0 = now_seconds();
    // Pointers to __m512d types for easy indexing
    __m512d *vA_ptr = (__m512d*)A_soa;
    __m512d *vB_ptr = (__m512d*)B_soa;
    __m512d *vC_ptr = (__m512d*)C_soa;
    // 9 vectors make up one "block" of 8 matrices
    int num_blocks = N_MATRICES / BATCH_SIZE;
    for (int i = 0; i < num_blocks; i++) {
        mat33_mul_rank23_avx512_soa(&vC_ptr[i*9], &vA_ptr[i*9], &vB_ptr[i*9]);
    }
    t1 = now_seconds();
    printf("AVX-512 (1 Core): %.4f s | %.2f M Matrices/s\n",
           t1 - t0, N_MATRICES / (t1 - t0) / 1e6);

    // --- Benchmark 4: AVX-512 SoA (OpenMP Multi-Core) ---
    // This combines 512-bit width with 12 cores.
    t0 = now_seconds();
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < num_blocks; i++) {
        mat33_mul_rank23_avx512_soa(&vC_ptr[i*9], &vA_ptr[i*9], &vB_ptr[i*9]);
    }
    t1 = now_seconds();
    printf("AVX-512 (OpenMP): %.4f s | %.2f M Matrices/s\n",
           t1 - t0, N_MATRICES / (t1 - t0) / 1e6);

    // --- Cleanup ---
    aligned_free_wrapper(A_store);
    aligned_free_wrapper(B_store);
    aligned_free_wrapper(C_store);
    aligned_free_wrapper(A_soa);
    aligned_free_wrapper(B_soa);
    aligned_free_wrapper(C_soa);

    return 0;
}
// clang -O0 -march=native fast_matrix_mult.c -o fast_matrix_mult
// clang -O3 -ffast-math -march=native fast_matrix_mult.c -o fast_matrix_mult
// clang -O3 -ffast-math -march=native -funroll-loops -fomit-frame-pointer -DNDEBUG fast_matrix_mult.c -o fast_matrix_mult

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <time.h>

/**
 * Perminov's Rank-23, 58-Addition Scheme for 3x3 Matrix Multiplication.
 *
 * Target: General non-commutative rings (implemented here for double).
 * Optimization:
 *   - 'restrict' for pointer aliasing optimization.
 *   - 'const' for SSA-friendly intermediate variable generation.
 *   - Stack-allocated intermediates to avoid heap overhead.
 *
 * Matrix Layout: Row-major
 * Indices: 0:11, 1:12, 2:13, 3:21, 4:22, 5:23, 6:31, 7:32, 8:33
 */
void mat33_mul_rank23(double *restrict C, const double *restrict A, const double *restrict B) {
    // --- Input Aliases for Readability (Compiler will optimize these out) ---
    const double a11 = A[0], a12 = A[1], a13 = A[2];
    const double a21 = A[3], a22 = A[4], a23 = A[5];
    const double a31 = A[6], a32 = A[7], a33 = A[8];

    const double b11 = B[0], b12 = B[1], b13 = B[2];
    const double b21 = B[3], b22 = B[4], b23 = B[5];
    const double b31 = B[6], b32 = B[7], b33 = B[8];

    // --- Phase 1: Pre-computations (U and V terms) ---
    // Total Additions: 4 (U) + 8 (V) = 12

    // U terms (Linear combinations of A)
    const double u1 = a31 + a33;
    const double u2 = a21 + a22;
    const double u3 = a13 + u1;   // Dependency on u1
    const double u4 = a32 - u2;   // Dependency on u2

    // V terms (Linear combinations of B)
    const double v1 = b22 + b32;
    const double v2 = b31 - v1;   // Dependency on v1
    const double v3 = b12 + v2;   // Dependency on v2
    const double v4 = b11 - v3;   // Dependency on v3
    const double v5 = b33 + v1;   // Dependency on v1
    const double v6 = b21 - b23;
    const double v7 = b12 - v5;   // Dependency on v5
    const double v8 = v4 - v6;    // Dependency on v4, v6

    // --- Phase 2: Multiplications (M terms) ---
    // 23 Multiplications
    // 18 Additions inside the products
    const double m1  = u1 * v5;
    const double m2  = u2 * (v2 + v6);
    const double m3  = a32 * b23;
    const double m4  = a31 * (b13 + v7);
    const double m5  = u3 * v7;
    const double m6  = (a32 - a33) * b22;
    const double m7  = a23 * b33;
    const double m8  = (u1 - u2) * v2;
    const double m9  = (a12 - a13) * b22;
    const double m10 = (u3 - a21) * v3;
    const double m11 = (a13 + a33) * (v1 - b12);
    const double m12 = a13 * b33;
    const double m13 = (a31 + u4) * v4;
    const double m14 = a11 * b13;
    const double m15 = (a11 + u3) * b12;
    const double m16 = (a13 + a23) * b31;
    const double m17 = (a22 - a32) * (v4 - b21);
    const double m18 = (a11 + a21) * b11;
    const double m19 = a12 * b23;
    const double m20 = (a12 + a22) * b21;
    const double m21 = a21 * (b13 - v8);
    const double m22 = (a22 - a23) * (b31 - b32);
    const double m23 = u4 * v8;

    // --- Phase 3: Post-computations (W terms) ---
    // Total Additions: 8
    const double w1 = m5 + m12;
    const double w2 = m1 + w1;
    const double w3 = m8 + w2;
    const double w4 = m3 - m23;
    const double w5 = m2 + w4;
    const double w6 = w3 + w5;
    const double w7 = m10 - w6;
    const double w8 = m17 + w7;

    // --- Phase 4: Construct Result (C terms) ---
    // Total Additions: 20

    // Row 1
    C[0] = m18 + m20 + w8;        // c11
    C[1] = m9 + m15 - w2;         // c12
    C[2] = m12 + m14 + m19;       // c13

    // Row 2
    C[3] = m16 - w8;              // c21
    C[4] = m16 + m22 + w3 - m10;  // c22
    C[5] = m7 + m21 + w4 - m17;   // c23

    // Row 3
    C[6] = m11 + m13 + w6;        // c31
    C[7] = m6 + m11 + w2;         // c32
    C[8] = m3 + m4 - m11 - w1;    // c33
}
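// Worked operation count (taken from the phase comments above): the rank-23
// scheme uses 23 multiplications and 12 + 18 + 8 + 20 = 58 additions per 3x3
// product, versus 27 multiplications and 18 additions for the naive triple
// loop (9 entries, each needing 3 multiplications and 2 additions). Trading
// multiplications for additions pays off when multiplications are expensive,
// e.g. when the entries are themselves blocks or other non-commutative ring
// elements; for plain doubles the total flop count is higher (81 vs. 45).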
// --- Verification/Usage Driver ---
void print_mat33(const char *name, const double *M) {
    printf("%s:\n", name);
    for (int i = 0; i < 3; i++) {
        printf(" %8.2f %8.2f %8.2f\n", M[i*3+0], M[i*3+1], M[i*3+2]);
    }
    printf("\n");
}

static inline double now_seconds(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}

void format_uint64_commas(uint64_t value, char *out, size_t out_size) {
    char buf[32];
    int i = 0;
    int group = 0;
    do {
        if (group == 3) {
            buf[i++] = ',';
            group = 0;
        }
        buf[i++] = (char)('0' + (value % 10));
        value /= 10;
        group++;
    } while (value > 0 && i < (int)sizeof(buf) - 1);
    if (i >= (int)sizeof(buf)) {
        if (out_size > 0) {
            out[0] = '\0';
        }
        return;
    }
    int j = 0;
    while (i > 0 && j < (int)out_size - 1) {
        out[j++] = buf[--i];
    }
    out[j] = '\0';
}

void fill_mat33_random(double *M) {
    for (int i = 0; i < 9; i++) {
        double r = (double)rand() / (double)RAND_MAX;
        M[i] = r * 2.0 - 1.0;
    }
}

void mat33_mul_naive(double *restrict C, const double *restrict A, const double *restrict B) {
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            double acc = 0.0;
            for (int k = 0; k < 3; k++) {
                acc += A[i*3+k] * B[k*3+j];
            }
            C[i*3+j] = acc;
        }
    }
}

static inline double sum_mat33(const double *M) {
    return M[0] + M[1] + M[2] +
           M[3] + M[4] + M[5] +
           M[6] + M[7] + M[8];
}
int main(int argc, char **argv) {
    // Example matrices
    double A[9] = {0};
    double B[9] = {0};
    double C[9] = {0};        // Result
    double C_check[9] = {0};

    srand((unsigned)time(NULL));
    fill_mat33_random(A);
    fill_mat33_random(B);

    // Execute Perminov's Algorithm
    mat33_mul_rank23(C, A, B);

    // Naive Check (for verification logic only)
    mat33_mul_naive(C_check, A, B);

    print_mat33("Matrix A", A);
    print_mat33("Matrix B", B);
    print_mat33("Result C (Perminov Rank-23)", C);
    print_mat33("Result Check (Naive)", C_check);
    // --- Timing comparison ---
    uint64_t iterations = 10000000ULL;
    if (argc > 1) {
        char *end = NULL;
        unsigned long long parsed = strtoull(argv[1], &end, 10);
        if (end != argv[1] && parsed > 0) {
            iterations = (uint64_t)parsed;
        }
    }

    const uint64_t warmup = 100000ULL;
    volatile double sink = 0.0;
    double checksum_rank23 = 0.0;
    double checksum_naive = 0.0;
    double t0, t1;

    char iters_str[32];
    format_uint64_commas(iterations, iters_str, sizeof(iters_str));
    printf("Iterations: %s (warmup: %llu)\n",
           iters_str[0] ? iters_str : "N/A",
           (unsigned long long)warmup);

    for (uint64_t it = 0; it < warmup; it++) {
        mat33_mul_rank23(C, A, B);
        sink += C[it % 9];
    }
    t0 = now_seconds();
    for (uint64_t it = 0; it < iterations; it++) {
        mat33_mul_rank23(C, A, B);
        sink += C[it % 9];
        checksum_rank23 += sum_mat33(C);
    }
    t1 = now_seconds();
    printf("Perminov Rank-23: %.6f s (sink=%.2f)\n", t1 - t0, sink);
    double time_rank23 = t1 - t0;

    for (uint64_t it = 0; it < warmup; it++) {
        mat33_mul_naive(C, A, B);
        sink += C[(it + 3) % 9];
    }
    t0 = now_seconds();
    for (uint64_t it = 0; it < iterations; it++) {
        mat33_mul_naive(C, A, B);
        sink += C[(it + 3) % 9];
        checksum_naive += sum_mat33(C);
    }
    t1 = now_seconds();
    printf("Naive O(N^3): %.6f s (sink=%.2f)\n", t1 - t0, sink);
    double time_naive = t1 - t0;

    printf("Checksums: rank23=%.6f naive=%.6f diff=%.6f\n",
           checksum_rank23, checksum_naive, checksum_rank23 - checksum_naive);

    if (time_rank23 > 0.0 && time_naive > 0.0) {
        double speedup = time_naive / time_rank23;
        printf("Speedup: %.2fx (naive / rank23)\n", speedup);
    }

    if (time_rank23 < time_naive) {
        printf("Winner: Perminov Rank-23\n");
    } else if (time_naive < time_rank23) {
        printf("Winner: Naive O(N^3)\n");
    } else {
        printf("Winner: Tie\n");
    }

    return 0;
}