Skip to content

Instantly share code, notes, and snippets.

@nazarii-piontko
Created January 15, 2026 21:31
Show Gist options
  • Select an option

  • Save nazarii-piontko/fafb2294c2b7b3a8f45057b15afc3b75 to your computer and use it in GitHub Desktop.

Select an option

Save nazarii-piontko/fafb2294c2b7b3a8f45057b15afc3b75 to your computer and use it in GitHub Desktop.
FFOR Benchmark

Benchmark code using nanobench (https://github.com/martinus/nanobench)

// FFOR code here ....
// Include nanobench

// Benchmark buffers: three arrays of 2048 UInt64 values, each 64-byte aligned.
//   input  - source values, filled once by init_data()
//   packed - written by bitPack and read back by bitUnpack
//   output - written by bitUnpack
alignas(64) UInt64 input[2048];
alignas(64) UInt64 packed[2048];
alignas(64) UInt64 output[2048];

// Added to every generated input value and passed as the last argument of
// every bitPack/bitUnpack call (presumably the frame-of-reference base --
// confirm against the FFOR implementation).
constexpr UInt64 BENCH_BASE = 1000;

void init_data()
{
    std::mt19937_64 rng(42);
    UInt64 mask = (UInt64{1} << 63) - 1;

    for (size_t i = 0; i < 2048; ++i)
    {
        input[i] = BENCH_BASE + (rng() & mask);
    }
}

/// Benchmark "4x512": one measured iteration packs and then unpacks the
/// 2048-element arrays as four separate 512-element blocks, located at
/// offsets 0, 512, 1024 and 1536 of `input`, `packed` and `output`.
///
/// @param bench  nanobench instance that runs and reports the measurement.
/// @param bits   bit width forwarded to every bitPack/bitUnpack call.
void bench_4x512(ankerl::nanobench::Bench& bench, UInt16 bits)
{
    auto pack_then_unpack = [&] {
        DB::Compression::FFOR::bitPack<512>(input, packed, bits, BENCH_BASE);
        DB::Compression::FFOR::bitPack<512>(input + 512, packed + 512, bits, BENCH_BASE);
        DB::Compression::FFOR::bitPack<512>(input + 1024, packed + 1024, bits, BENCH_BASE);
        DB::Compression::FFOR::bitPack<512>(input + 1536, packed + 1536, bits, BENCH_BASE);

        // Keep the packed buffer observable so the packing work is not elided.
        ankerl::nanobench::doNotOptimizeAway(packed);

        DB::Compression::FFOR::bitUnpack<512>(packed, output, bits, BENCH_BASE);
        DB::Compression::FFOR::bitUnpack<512>(packed + 512, output + 512, bits, BENCH_BASE);
        DB::Compression::FFOR::bitUnpack<512>(packed + 1024, output + 1024, bits, BENCH_BASE);
        DB::Compression::FFOR::bitUnpack<512>(packed + 1536, output + 1536, bits, BENCH_BASE);

        // Same for the unpacked result.
        ankerl::nanobench::doNotOptimizeAway(output);
    };

    bench.run("4x512", pack_then_unpack);
}

/// Benchmark "2x1024": one measured iteration packs and then unpacks the
/// 2048-element arrays as two separate 1024-element blocks, located at
/// offsets 0 and 1024 of `input`, `packed` and `output`.
///
/// @param bench  nanobench instance that runs and reports the measurement.
/// @param bits   bit width forwarded to every bitPack/bitUnpack call.
void bench_2x1024(ankerl::nanobench::Bench& bench, UInt16 bits)
{
    auto pack_then_unpack = [&] {
        DB::Compression::FFOR::bitPack<1024>(input, packed, bits, BENCH_BASE);
        DB::Compression::FFOR::bitPack<1024>(input + 1024, packed + 1024, bits, BENCH_BASE);

        // Keep the packed buffer observable so the packing work is not elided.
        ankerl::nanobench::doNotOptimizeAway(packed);

        DB::Compression::FFOR::bitUnpack<1024>(packed, output, bits, BENCH_BASE);
        DB::Compression::FFOR::bitUnpack<1024>(packed + 1024, output + 1024, bits, BENCH_BASE);

        // Same for the unpacked result.
        ankerl::nanobench::doNotOptimizeAway(output);
    };

    bench.run("2x1024", pack_then_unpack);
}

/// Benchmark "1x2048": one measured iteration packs and then unpacks the
/// whole 2048-element array with a single bitPack/bitUnpack call pair.
///
/// @param bench  nanobench instance that runs and reports the measurement.
/// @param bits   bit width forwarded to the bitPack/bitUnpack calls.
void bench_1x2048(ankerl::nanobench::Bench& bench, UInt16 bits)
{
    auto pack_then_unpack = [&] {
        DB::Compression::FFOR::bitPack<2048>(input, packed, bits, BENCH_BASE);

        // Keep the packed buffer observable so the packing work is not elided.
        ankerl::nanobench::doNotOptimizeAway(packed);

        DB::Compression::FFOR::bitUnpack<2048>(packed, output, bits, BENCH_BASE);

        // Same for the unpacked result.
        ankerl::nanobench::doNotOptimizeAway(output);
    };

    bench.run("1x2048", pack_then_unpack);
}

/// Entry point. Generates the input data once, then for every bit width from
/// 0 through 64 (inclusive) runs the three benchmark variants inside a fresh
/// nanobench Bench titled "<bits> bits".
///
/// Relative mode is enabled, so within each Bench the first run (4x512) is
/// the 100% baseline for the other two.
int main()
{
    init_data();

    UInt16 bits = 0;
    while (bits <= 64)
    {
        ankerl::nanobench::Bench bench;
        bench.title(std::to_string(bits) + " bits");
        bench.relative(true);
        bench.warmup(100);

        bench_4x512(bench, bits);
        bench_2x1024(bench, bits);
        bench_1x2048(bench, bits);

        ++bits;
    }

    return 0;
}

Compilation flags: -O3 -mavx512f

Results

relative ns/op op/s err% total 0 bits
100.0% 91.57 10,921,075.84 0.4% 0.01 4x512
108.7% 84.22 11,873,947.95 0.4% 0.01 2x1024
109.6% 83.58 11,964,614.48 0.4% 0.01 1x2048
relative ns/op op/s err% total 1 bits
100.0% 282.73 3,537,002.39 0.8% 0.01 4x512
104.0% 271.91 3,677,717.49 1.0% 0.01 2x1024
98.3% 287.74 3,475,314.01 0.5% 0.01 1x2048
relative ns/op op/s err% total 2 bits
100.0% 268.11 3,729,795.32 0.2% 0.01 4x512
101.2% 265.03 3,773,107.88 0.4% 0.01 2x1024
94.7% 283.22 3,530,835.23 0.5% 0.01 1x2048
relative ns/op op/s err% total 3 bits
100.0% 296.26 3,375,370.61 0.7% 0.01 4x512
102.1% 290.22 3,445,668.72 0.4% 0.01 2x1024
91.2% 324.76 3,079,224.95 0.4% 0.01 1x2048
relative ns/op op/s err% total 4 bits
100.0% 271.62 3,681,564.84 0.9% 0.01 4x512
100.6% 269.92 3,704,745.66 1.4% 0.01 2x1024
88.0% 308.64 3,240,071.55 0.2% 0.01 1x2048
relative ns/op op/s err% total 5 bits
100.0% 311.43 3,211,031.28 0.3% 0.01 4x512
98.9% 314.86 3,175,983.47 0.8% 0.01 2x1024
89.4% 348.18 2,872,099.57 0.4% 0.01 1x2048
relative ns/op op/s err% total 6 bits
100.0% 298.58 3,349,138.42 0.4% 0.01 4x512
98.5% 303.05 3,299,793.22 1.0% 0.01 2x1024
88.1% 339.04 2,949,482.10 0.3% 0.01 1x2048
relative ns/op op/s err% total 7 bits
100.0% 336.67 2,970,259.12 0.8% 0.01 4x512
99.9% 336.90 2,968,274.36 0.3% 0.01 2x1024
90.0% 374.04 2,673,479.81 0.4% 0.01 1x2048
relative ns/op op/s err% total 8 bits
100.0% 263.70 3,792,202.99 0.6% 0.01 4x512
94.5% 279.02 3,583,953.02 0.4% 0.01 2x1024
81.4% 323.89 3,087,427.86 0.4% 0.01 1x2048
relative ns/op op/s err% total 9 bits
100.0% 342.90 2,916,297.66 0.7% 0.01 4x512
96.3% 356.09 2,808,315.02 0.2% 0.01 2x1024
87.6% 391.38 2,555,090.80 0.4% 0.01 1x2048
relative ns/op op/s err% total 10 bits
100.0% 332.14 3,010,757.93 0.5% 0.01 4x512
94.8% 350.23 2,855,267.31 0.2% 0.01 2x1024
86.5% 384.02 2,604,008.74 0.4% 0.01 1x2048
relative ns/op op/s err% total 11 bits
100.0% 365.53 2,735,767.35 0.6% 0.01 4x512
93.1% 392.62 2,546,978.72 0.4% 0.01 2x1024
87.8% 416.40 2,401,545.66 0.3% 0.01 1x2048
relative ns/op op/s err% total 12 bits
100.0% 309.63 3,229,644.30 0.6% 0.01 4x512
93.6% 330.79 3,023,085.20 0.7% 0.01 2x1024
81.8% 378.45 2,642,380.69 0.1% 0.01 1x2048
relative ns/op op/s err% total 13 bits
100.0% 368.02 2,717,245.07 0.2% 0.01 4x512
89.4% 411.43 2,430,554.21 0.4% 0.01 2x1024
84.2% 436.95 2,288,583.63 0.4% 0.01 1x2048
relative ns/op op/s err% total 14 bits
100.0% 363.76 2,749,081.46 0.3% 0.01 4x512
91.0% 399.54 2,502,856.19 0.4% 0.01 2x1024
84.8% 429.18 2,330,006.57 0.5% 0.01 1x2048
relative ns/op op/s err% total 15 bits
100.0% 382.35 2,615,410.52 0.4% 0.01 4x512
87.0% 439.58 2,274,913.78 0.3% 0.01 2x1024
84.6% 451.68 2,213,936.85 0.4% 0.01 1x2048
relative ns/op op/s err% total 16 bits
100.0% 291.39 3,431,771.21 0.6% 0.01 4x512
83.2% 350.38 2,854,068.34 0.7% 0.01 2x1024
69.8% 417.41 2,395,723.16 0.3% 0.01 1x2048
relative ns/op op/s err% total 17 bits
100.0% 393.75 2,539,683.10 0.4% 0.01 4x512
87.6% 449.32 2,225,599.46 0.3% 0.01 2x1024
82.6% 476.77 2,097,457.99 0.5% 0.01 1x2048
relative ns/op op/s err% total 18 bits
100.0% 393.68 2,540,126.83 0.2% 0.01 4x512
89.1% 442.07 2,262,060.53 0.9% 0.01 2x1024
84.8% 464.42 2,153,216.81 0.3% 0.01 1x2048
relative ns/op op/s err% total 19 bits
100.0% 414.09 2,414,945.28 0.3% 0.01 4x512
87.6% 472.79 2,115,103.73 0.2% 0.01 2x1024
83.2% 497.96 2,008,210.74 0.5% 0.01 1x2048
relative ns/op op/s err% total 20 bits
100.0% 353.71 2,827,213.05 0.3% 0.01 4x512
84.5% 418.80 2,387,761.49 0.6% 0.01 2x1024
74.7% 473.82 2,110,520.37 0.4% 0.01 1x2048
relative ns/op op/s err% total 21 bits
100.0% 427.26 2,340,510.70 0.3% 0.01 4x512
87.7% 487.19 2,052,607.01 0.6% 0.01 2x1024
84.0% 508.49 1,966,608.02 0.4% 0.01 1x2048
relative ns/op op/s err% total 22 bits
100.0% 436.34 2,291,778.09 0.4% 0.01 4x512
91.8% 475.47 2,103,177.74 0.2% 0.01 2x1024
87.1% 500.96 1,996,156.45 0.3% 0.01 1x2048
relative ns/op op/s err% total 23 bits
100.0% 455.77 2,194,109.21 0.2% 0.01 4x512
88.5% 515.11 1,941,340.92 0.4% 0.01 2x1024
85.8% 531.24 1,882,378.54 0.3% 0.01 1x2048
relative ns/op op/s err% total 24 bits
100.0% 374.94 2,667,071.86 0.5% 0.01 4x512
82.5% 454.53 2,200,071.72 0.9% 0.01 2x1024
75.2% 498.80 2,004,825.56 0.3% 0.01 1x2048
relative ns/op op/s err% total 25 bits
100.0% 476.75 2,097,555.69 0.5% 0.01 4x512
89.3% 533.71 1,873,659.70 0.4% 0.01 2x1024
85.2% 559.53 1,787,207.45 0.3% 0.01 1x2048
relative ns/op op/s err% total 26 bits
100.0% 459.38 2,176,828.20 0.4% 0.01 4x512
87.1% 527.64 1,895,232.45 0.3% 0.01 2x1024
84.5% 543.54 1,839,782.29 0.3% 0.01 1x2048
relative ns/op op/s err% total 27 bits
100.0% 484.78 2,062,795.77 0.4% 0.01 4x512
88.1% 550.36 1,816,976.81 0.3% 0.01 2x1024
84.2% 575.69 1,737,054.56 0.4% 0.01 1x2048
relative ns/op op/s err% total 28 bits
100.0% 425.20 2,351,820.68 0.5% 0.01 4x512
83.7% 507.89 1,968,940.44 0.9% 0.01 2x1024
77.8% 546.30 1,830,509.04 0.3% 0.01 1x2048
relative ns/op op/s err% total 29 bits
100.0% 513.47 1,947,535.73 0.2% 0.01 4x512
89.0% 577.18 1,732,575.01 0.4% 0.01 2x1024
85.6% 600.05 1,666,532.45 0.5% 0.01 1x2048
relative ns/op op/s err% total 30 bits
100.0% 492.22 2,031,599.51 0.2% 0.01 4x512
87.9% 560.10 1,785,395.78 0.2% 0.01 2x1024
85.0% 578.87 1,727,505.13 0.4% 0.01 1x2048
relative ns/op op/s err% total 31 bits
100.0% 520.88 1,919,821.00 0.4% 0.01 4x512
85.6% 608.37 1,643,730.03 0.6% 0.01 2x1024
83.6% 622.96 1,605,243.86 0.3% 0.01 1x2048
relative ns/op op/s err% total 32 bits
100.0% 446.54 2,239,450.93 0.3% 0.01 4x512
85.1% 524.60 1,906,204.18 0.3% 0.01 2x1024
75.4% 592.19 1,688,655.19 0.4% 0.01 1x2048
relative ns/op op/s err% total 33 bits
100.0% 538.51 1,856,967.98 0.4% 0.01 4x512
86.1% 625.09 1,599,776.81 0.2% 0.01 2x1024
82.7% 651.10 1,535,856.60 0.2% 0.01 1x2048
relative ns/op op/s err% total 34 bits
100.0% 541.89 1,845,405.56 0.2% 0.01 4x512
88.4% 612.98 1,631,362.29 0.3% 0.01 2x1024
84.4% 641.82 1,558,065.01 0.3% 0.01 1x2048
relative ns/op op/s err% total 35 bits
100.0% 556.32 1,797,529.81 0.6% 0.01 4x512
86.0% 646.53 1,546,727.05 0.3% 0.01 2x1024
82.5% 674.68 1,482,189.65 0.4% 0.01 1x2048
relative ns/op op/s err% total 36 bits
100.0% 503.78 1,984,974.06 0.5% 0.01 4x512
88.3% 570.52 1,752,792.85 0.3% 0.01 2x1024
80.8% 623.61 1,603,568.01 0.3% 0.01 1x2048
relative ns/op op/s err% total 37 bits
100.0% 572.71 1,746,071.58 0.4% 0.01 4x512
86.3% 663.88 1,506,307.18 0.1% 0.01 2x1024
80.3% 713.08 1,402,361.31 0.5% 0.01 1x2048
relative ns/op op/s err% total 38 bits
100.0% 601.26 1,663,176.89 0.4% 0.01 4x512
90.1% 667.62 1,497,864.27 0.3% 0.01 2x1024
84.7% 709.68 1,409,075.99 0.2% 0.01 1x2048
relative ns/op op/s err% total 39 bits
100.0% 596.30 1,677,000.80 0.3% 0.01 4x512
85.6% 696.51 1,435,735.14 0.3% 0.01 2x1024
80.2% 743.06 1,345,781.00 0.6% 0.01 1x2048
relative ns/op op/s err% total 40 bits
100.0% 526.66 1,898,773.64 0.3% 0.01 4x512
88.6% 594.38 1,682,413.22 0.4% 0.01 2x1024
81.2% 648.22 1,542,694.66 0.3% 0.01 1x2048
relative ns/op op/s err% total 41 bits
100.0% 631.67 1,583,117.58 0.3% 0.01 4x512
87.1% 725.50 1,378,365.15 0.3% 0.01 2x1024
80.3% 786.73 1,271,080.84 0.8% 0.01 1x2048
relative ns/op op/s err% total 42 bits
100.0% 668.25 1,496,450.42 0.2% 0.01 4x512
91.2% 732.48 1,365,224.35 0.2% 0.01 2x1024
85.8% 778.73 1,284,145.00 0.4% 0.01 1x2048
relative ns/op op/s err% total 43 bits
100.0% 651.01 1,536,082.32 0.4% 0.01 4x512
87.2% 746.36 1,339,829.90 0.4% 0.01 2x1024
81.5% 798.40 1,252,506.32 0.3% 0.01 1x2048
relative ns/op op/s err% total 44 bits
100.0% 599.51 1,668,026.63 0.3% 0.01 4x512
95.5% 627.84 1,592,759.99 0.3% 0.01 2x1024
86.0% 697.05 1,434,613.20 0.6% 0.01 1x2048
relative ns/op op/s err% total 45 bits
100.0% 660.87 1,513,158.98 0.5% 0.01 4x512
95.1% 695.14 1,438,554.63 0.3% 0.01 2x1024
89.2% 741.12 1,349,316.17 0.4% 0.01 1x2048
relative ns/op op/s err% total 46 bits
100.0% 700.00 1,428,565.22 0.3% 0.01 4x512
105.9% 661.19 1,512,426.78 0.4% 0.01 2x1024
97.6% 717.25 1,394,214.97 0.3% 0.01 1x2048
relative ns/op op/s err% total 47 bits
100.0% 673.96 1,483,774.93 0.3% 0.01 4x512
97.6% 690.54 1,448,135.65 0.5% 0.01 2x1024
89.9% 749.99 1,333,344.36 0.4% 0.01 1x2048
relative ns/op op/s err% total 48 bits
100.0% 616.97 1,620,835.79 0.5% 0.01 4x512
96.0% 642.52 1,556,369.50 0.3% 0.01 2x1024
86.0% 717.53 1,393,667.83 0.6% 0.01 1x2048
relative ns/op op/s err% total 49 bits
100.0% 699.06 1,430,496.68 0.3% 0.01 4x512
98.6% 708.80 1,410,830.28 0.3% 0.01 2x1024
91.1% 767.12 1,303,585.05 0.5% 0.01 1x2048
relative ns/op op/s err% total 50 bits
100.0% 734.04 1,362,314.65 0.4% 0.01 4x512
106.1% 691.75 1,445,609.27 0.4% 0.01 2x1024
98.2% 747.55 1,337,694.82 0.3% 0.01 1x2048
relative ns/op op/s err% total 51 bits
100.0% 719.53 1,389,792.48 0.4% 0.01 4x512
99.5% 722.89 1,383,343.54 0.3% 0.01 2x1024
92.2% 779.98 1,282,082.48 0.2% 0.01 1x2048
relative ns/op op/s err% total 52 bits
100.0% 655.89 1,524,648.16 0.4% 0.01 4x512
95.5% 686.88 1,455,866.44 0.2% 0.01 2x1024
87.0% 753.81 1,326,594.33 0.4% 0.01 1x2048
relative ns/op op/s err% total 53 bits
100.0% 745.44 1,341,495.07 0.5% 0.01 4x512
101.8% 731.94 1,366,238.62 0.3% 0.01 2x1024
94.5% 788.58 1,268,097.33 0.4% 0.01 1x2048
relative ns/op op/s err% total 54 bits
100.0% 793.02 1,260,996.71 0.3% 0.01 4x512
108.1% 733.76 1,362,838.78 0.8% 0.01 2x1024
101.9% 778.04 1,285,275.71 0.3% 0.01 1x2048
relative ns/op op/s err% total 55 bits
100.0% 779.21 1,283,356.54 0.2% 0.01 4x512
102.3% 761.46 1,313,266.17 0.2% 0.01 2x1024
96.1% 810.83 1,233,311.15 0.2% 0.01 1x2048
relative ns/op op/s err% total 56 bits
100.0% 727.60 1,374,381.53 0.1% 0.01 4x512
102.2% 711.90 1,404,684.25 0.4% 0.01 2x1024
93.8% 775.89 1,288,847.19 0.2% 0.01 1x2048
relative ns/op op/s err% total 57 bits
100.0% 787.01 1,270,634.74 0.4% 0.01 4x512
101.1% 778.32 1,284,813.87 0.2% 0.01 2x1024
95.2% 826.45 1,209,988.44 0.5% 0.01 1x2048
relative ns/op op/s err% total 58 bits
100.0% 824.86 1,212,330.23 0.1% 0.01 4x512
107.5% 767.30 1,303,273.37 0.1% 0.01 2x1024
102.4% 805.26 1,241,840.99 0.2% 0.01 1x2048
relative ns/op op/s err% total 59 bits
100.0% 789.28 1,266,982.78 0.2% 0.01 4x512
98.8% 798.67 1,252,087.86 0.4% 0.01 2x1024
94.2% 837.48 1,194,064.93 0.4% 0.01 1x2048
relative ns/op op/s err% total 60 bits
100.0% 761.76 1,312,752.74 0.4% 0.01 4x512
99.6% 764.82 1,307,490.33 0.3% 0.01 2x1024
93.4% 815.24 1,226,634.76 0.4% 0.01 1x2048
relative ns/op op/s err% total 61 bits
100.0% 801.34 1,247,902.94 0.3% 0.01 4x512
97.7% 820.18 1,219,250.11 0.3% 0.01 2x1024
93.5% 856.65 1,167,333.74 0.4% 0.01 1x2048
relative ns/op op/s err% total 62 bits
100.0% 873.90 1,144,299.89 0.5% 0.01 4x512
108.6% 804.61 1,242,838.27 0.3% 0.01 2x1024
104.2% 838.92 1,192,008.87 0.2% 0.01 1x2048
relative ns/op op/s err% total 63 bits
100.0% 810.58 1,233,678.49 0.8% 0.01 4x512
99.5% 814.77 1,227,346.32 0.2% 0.01 2x1024
95.3% 850.96 1,175,144.19 0.4% 0.01 1x2048
relative ns/op op/s err% total 64 bits
100.0% 688.22 1,453,017.08 0.4% 0.01 4x512
86.0% 800.13 1,249,789.24 0.3% 0.01 2x1024
81.9% 839.91 1,190,598.54 0.4% 0.01 1x2048
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment