Skip to content

Instantly share code, notes, and snippets.

@alekswn
Last active January 5, 2026 20:39
Show Gist options
  • Select an option

  • Save alekswn/24fb2d1892bb0914b34ca9bcf66145f6 to your computer and use it in GitHub Desktop.

Select an option

Save alekswn/24fb2d1892bb0914b34ca9bcf66145f6 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>
#include <pthread.h>
#include <time.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <sys/time.h>
/*
 * GET_CYCLES(): read a hardware counter; the benchmark threads call it before
 * and after one counter operation and store the difference.
 *   - x86_64:  expands to the __rdtsc() intrinsic from <x86intrin.h>.
 *   - aarch64: reads the cntvct_el0 system register with an `mrs` instruction.
 *     NOTE(review): this is presumably the generic-timer tick count rather
 *     than CPU core cycles, so values may not be comparable with the x86_64
 *     numbers -- confirm.
 *   - any other architecture: expands to the constant 0, so every measured
 *     difference is 0.
 */
#ifdef __x86_64__
#include <x86intrin.h>
#define GET_CYCLES() __rdtsc()
#elif defined(__aarch64__)
static inline uint64_t GET_CYCLES(void) {
uint64_t cycles;
__asm__ volatile("mrs %0, cntvct_el0" : "=r" (cycles));
return cycles;
}
#else
#define GET_CYCLES() 0 // Fallback for unsupported architectures
#endif
/* A counter paired with the mutex that the mutex benchmark holds while incrementing it. */
typedef struct {
size_t value; /* running count; incremented between pthread_mutex_lock/unlock on `mutex` */
pthread_mutex_t mutex; /* initialised and destroyed by the mutex benchmark runner */
} mutex_counter_t;
/*
 * Argument block handed to each benchmark thread (passed as the `void *arg`
 * of the *_counter_thread functions). The whole struct is aligned to 64 bytes
 * by the attribute at the end -- presumably to keep each thread's block on its
 * own cache line; confirm the target's line size.
 */
typedef struct {
/* Counter storage; only the member matching the running benchmark is meaningful. */
union {
mutex_counter_t *mutex_counters;
atomic_size_t *atomic_counters;
size_t *plain_counters;
}; // one pointer: 8 bytes on 64-bit targets
uint32_t *cycles; // output slots, one measured delta per iteration; a pointer, so 8 bytes on 64-bit targets (not 4)
uint32_t num_iterations; // 4 bytes: number of increments this thread performs
uint16_t num_counters; // 2 bytes: number of counters the thread rotates through
uint16_t thread_id; // 2 bytes: thread index; the plain benchmark uses it to select the thread's own slice
memory_order mem_order; // ordering handed to the atomic operations; NOTE(review): an enum, so likely int-sized rather than 1 byte -- confirm
} __attribute__((aligned(64))) thread_data_t;
/*
 * Atomically add one to *counter with a compare-and-swap retry loop.
 *
 * counter   - atomic counter to increment.
 * mem_order - memory ordering applied to the successful exchange.
 *
 * The initial load and the failure path of the exchange perform no store, and
 * C11 does not allow memory_order_release or memory_order_acq_rel for them.
 * For those two requests the load/failure ordering is reduced to the
 * strongest permitted load ordering that is not stronger than the request
 * (relaxed and acquire respectively); every other ordering is used as given.
 */
static inline
void increment_cas_counter(atomic_size_t *counter, memory_order mem_order) {
    memory_order load_order = mem_order;
    if (mem_order == memory_order_release) {
        load_order = memory_order_relaxed;
    } else if (mem_order == memory_order_acq_rel) {
        load_order = memory_order_acquire;
    }

    size_t expected = atomic_load_explicit(counter, load_order);
    /*
     * A failed exchange writes the value it observed back into `expected`,
     * so the loop retries with fresh data without issuing another load.
     */
    while (!atomic_compare_exchange_weak_explicit(counter,
                                                  &expected, expected + 1,
                                                  mem_order, load_order)) {
        /* retry */
    }
}
/*
 * DECLARE_COUNTER_THREAD(name, impl) defines the pthread entry point
 * `name##_counter_thread(void *arg)`, where `arg` is a thread_data_t *.
 *
 * The generated function runs data->num_iterations iterations. In each one it
 * samples GET_CYCLES(), executes `impl` (wrapped in do { } while(0)), samples
 * GET_CYCLES() again and stores the difference in data->cycles[i]. The slot
 * is a uint32_t, so the 64-bit difference is truncated to its low 32 bits.
 *
 * `impl` may refer to `data` and to `j`, a counter index that advances
 * round-robin modulo data->num_counters (so num_counters must be non-zero).
 * The function always returns NULL.
 */
#define DECLARE_COUNTER_THREAD(name, impl) \
void* name##_counter_thread(void* arg) { \
thread_data_t *data = (thread_data_t*)arg; \
for (size_t i = 0, j = 0; i < data->num_iterations; i++, j = (j+1) % data->num_counters) { \
uint64_t start_cycles = GET_CYCLES(); \
do { \
impl \
} while(0); \
uint64_t end_cycles = GET_CYCLES(); \
data->cycles[i] = end_cycles - start_cycles; \
} \
return NULL; \
}
/* mutex_counter_thread: lock counter j's mutex, bump its value, unlock. */
DECLARE_COUNTER_THREAD(mutex,
pthread_mutex_lock(&data->mutex_counters[j].mutex);
data->mutex_counters[j].value++;
pthread_mutex_unlock(&data->mutex_counters[j].mutex);
)
/* atomic_fetch_add_counter_thread: one atomic fetch-add of 1 with data->mem_order. */
DECLARE_COUNTER_THREAD(atomic_fetch_add,
atomic_fetch_add_explicit(&data->atomic_counters[j], 1, data->mem_order);
)
/* atomic_cas_counter_thread: increment through the compare-and-swap loop helper. */
DECLARE_COUNTER_THREAD(atomic_cas,
increment_cas_counter(&data->atomic_counters[j], data->mem_order);
)
/*
 * plain_counter_thread: non-atomic increment of a slot indexed by
 * thread_id * num_counters + j, i.e. each thread writes only its own slice.
 */
DECLARE_COUNTER_THREAD(plain,
data->plain_counters[(size_t)data->thread_id*(size_t)data->num_counters + j]++;
)
/* noop_counter_thread: empty body; measures the loop and the two GET_CYCLES() calls alone. */
DECLARE_COUNTER_THREAD(noop,
(void)0;
)
/*
 * Return the time elapsed from *start to *end in seconds, combining the
 * whole-second difference with the microsecond difference scaled by 1e-6.
 */
double get_time_diff(struct timeval *start, struct timeval *end) {
    const double whole_seconds = (double)(end->tv_sec - start->tv_sec);
    const double micro_delta = (double)(end->tv_usec - start->tv_usec);
    return whole_seconds + micro_delta / 1000000.0;
}
/*
 * qsort() comparator that orders uint32_t values ascending.
 * Returns a negative, zero or positive value (-1, 0 or 1) without the
 * wrap-around a plain subtraction could produce.
 */
int compare_uint32(const void *a, const void *b) {
    const uint32_t ua = *(const uint32_t *)a;
    const uint32_t ub = *(const uint32_t *)b;
    return (ua > ub) - (ua < ub);
}

/*
 * Sort `array` in place (ascending) and compute nearest-rank percentiles.
 *
 * array           - samples; reordered by this call.
 * sz              - number of samples in `array`.
 * denominator     - scale of the percentile fractions (e.g. 100 or 100000).
 * numerators      - percentile i is numerators[i] / denominator.
 * num_percentiles - number of entries in `numerators` and `out_percentiles`.
 * out_percentiles - receives one value per requested percentile.
 *
 * Percentile p is the sample at 1-based rank ceil(p * sz), clamped to
 * [1, sz], so every result is an actual sample and no index can leave the
 * array. When p * sz is a whole number this is array[p * sz - 1]; otherwise
 * it is the next sample up. If `sz` or `denominator` is zero there is nothing
 * meaningful to report and every output is set to 0.
 *
 * NOTE: sz * numerators[i] is computed in size_t and is assumed not to
 * overflow.
 */
void calculate_percentiles(uint32_t array[], size_t sz,
                           size_t denominator, const size_t numerators[],
                           size_t num_percentiles, uint32_t out_percentiles[]) {
    if (sz == 0 || denominator == 0) {
        for (size_t i = 0; i < num_percentiles; i++) {
            out_percentiles[i] = 0;
        }
        return;
    }

    qsort(array, sz, sizeof array[0], compare_uint32);

    for (size_t i = 0; i < num_percentiles; i++) {
        const size_t scaled = sz * numerators[i];
        const size_t remainder = scaled % denominator;
        size_t rank = scaled / denominator;
        if (remainder != 0) {
            rank++; /* round the fractional rank up */
        }
        if (rank < 1) {
            rank = 1; /* 0th percentile maps to the smallest sample */
        }
        if (rank > sz) {
            rank = sz; /* fractions above 1 map to the largest sample */
        }
        out_percentiles[i] = array[rank - 1];
    }
}
/*
 * DECLARE_RUN_BENCHMARK defines
 *   double run_<name>_counter_benchmark(num_threads, num_counters,
 *                                       total_iterations, cycles)
 * which splits `total_iterations` increments across `num_threads` threads,
 * runs them, and returns the wall-clock seconds between just before the first
 * pthread_create and just after the last pthread_join.
 *
 * Macro parameters:
 *   name          - suffix used in the generated function name.
 *   counter_decl  - declaration of the counter storage (a local of the function).
 *   counter_name  - identifier of that storage; also the thread_data_t union
 *                   member it is assigned to.
 *   thread_name   - selects the worker: <thread_name>_counter_thread.
 *   init_impl     - statements run once per counter index `i` before the run.
 *   cleanup_impl  - statements run once per counter index `i` after the run;
 *                   expected to accumulate into `total_increments`.
 *   m_order       - value stored in each thread's mem_order field.
 *
 * `cycles` must have room for total_iterations entries; each thread receives
 * a contiguous slice of it. The first (total_iterations % num_threads)
 * threads perform one extra iteration so the slices add up exactly.
 *
 * The function aborts when num_threads or num_counters is zero, when a thread
 * cannot be created or joined, or when the counted increments differ from
 * total_iterations.
 */
#define DECLARE_RUN_BENCHMARK(name, counter_decl, counter_name, thread_name, init_impl, cleanup_impl, m_order) \
double run_##name##_counter_benchmark(uint16_t num_threads, uint16_t num_counters, \
                                      uint32_t total_iterations, uint32_t cycles[]) { \
    /* Zero counts would give zero-length VLAs and a division/modulo by zero. */ \
    if (num_threads == 0 || num_counters == 0) abort(); \
    pthread_t threads[num_threads]; \
    thread_data_t thread_data[num_threads]; \
    counter_decl; \
    struct timeval start, end; \
    for (size_t i = 0; i < num_counters; i++) { \
        init_impl \
    } \
    const uint32_t thread_count = (uint32_t)num_threads; \
    const uint32_t base_iterations = total_iterations / thread_count; \
    const uint32_t remainder = total_iterations % thread_count; \
    for (size_t t = 0, offset = 0; t < num_threads; t++) { \
        const uint32_t num_iterations = base_iterations + (t < remainder ? 1 : 0); \
        thread_data[t].counter_name = counter_name; \
        thread_data[t].cycles = &cycles[offset]; \
        thread_data[t].num_counters = num_counters; \
        thread_data[t].num_iterations = num_iterations; \
        thread_data[t].thread_id = (uint16_t)t; \
        thread_data[t].mem_order = m_order; \
        offset += num_iterations; \
    } \
    gettimeofday(&start, NULL); \
    for (size_t t = 0; t < num_threads; t++) { \
        /* A failed create would leave threads[t] uninitialised for the join below. */ \
        if (pthread_create(&threads[t], NULL, thread_name##_counter_thread, &thread_data[t]) != 0) \
            abort(); \
    } \
    for (size_t t = 0; t < num_threads; t++) { \
        if (pthread_join(threads[t], NULL) != 0) \
            abort(); \
    } \
    gettimeofday(&end, NULL); \
    uint32_t total_increments = 0; \
    for (size_t i = 0; i < num_counters; i++) { \
        cleanup_impl \
    } \
    if (total_increments != total_iterations) abort(); \
    return get_time_diff(&start, &end); \
}
/*
 * run_mutex_counter_benchmark: one mutex_counter_t per counter, driven by
 * mutex_counter_thread. The mem_order argument is 0 because that worker never
 * reads it. Cleanup sums the values and destroys the mutexes.
 */
DECLARE_RUN_BENCHMARK(mutex, mutex_counter_t mutex_counters[num_counters], mutex_counters, mutex, {
if (pthread_mutex_init(&mutex_counters[i].mutex, NULL))
abort();
mutex_counters[i].value = 0;
},{
total_increments += mutex_counters[i].value;
pthread_mutex_destroy(&mutex_counters[i].mutex);
}, 0)
/* run_cas_relaxed_counter_benchmark: atomic counters incremented by the CAS-loop worker with memory_order_relaxed. */
DECLARE_RUN_BENCHMARK(cas_relaxed, atomic_size_t atomic_counters[num_counters], atomic_counters, atomic_cas, {
atomic_init(&atomic_counters[i], 0);
},{
total_increments += atomic_load(&atomic_counters[i]);
}, memory_order_relaxed)
/* run_fetch_add_seq_cst_counter_benchmark: atomic fetch-add worker with memory_order_seq_cst. */
DECLARE_RUN_BENCHMARK(fetch_add_seq_cst, atomic_size_t atomic_counters[num_counters], atomic_counters, atomic_fetch_add, {
atomic_init(&atomic_counters[i], 0);
},{
total_increments += atomic_load(&atomic_counters[i]);
}, memory_order_seq_cst)
/* run_fetch_add_acq_rel_counter_benchmark: atomic fetch-add worker with memory_order_acq_rel. */
DECLARE_RUN_BENCHMARK(fetch_add_acq_rel, atomic_size_t atomic_counters[num_counters], atomic_counters, atomic_fetch_add, {
atomic_init(&atomic_counters[i], 0);
},{
total_increments += atomic_load(&atomic_counters[i]);
}, memory_order_acq_rel)
/* run_fetch_add_relaxed_counter_benchmark: atomic fetch-add worker with memory_order_relaxed. */
DECLARE_RUN_BENCHMARK(fetch_add_relaxed, atomic_size_t atomic_counters[num_counters], atomic_counters, atomic_fetch_add, {
atomic_init(&atomic_counters[i], 0);
},{
total_increments += atomic_load(&atomic_counters[i]);
}, memory_order_relaxed)
/*
 * run_plain_counter_benchmark: num_threads * num_counters non-atomic slots,
 * one slice per thread. The slots are not initialised and the result is not
 * verified: cleanup just sets total_increments to total_iterations so the
 * final check passes.
 */
DECLARE_RUN_BENCHMARK(plain, size_t plain_counters[(size_t)num_threads*(size_t)num_counters], plain_counters, plain, {
(void)0;
},{
total_increments = total_iterations;
}, 0)
/*
 * run_noop_counter_benchmark: baseline with no counter storage (NULL pointer,
 * never dereferenced by the noop worker); the final check is bypassed the
 * same way as in the plain benchmark.
 */
DECLARE_RUN_BENCHMARK(noop, size_t *plain_counters = NULL, plain_counters, noop, {
(void)0;
},{
total_increments = total_iterations;
}, 0)
/*
 * Benchmark driver: runs every benchmark for every (thread count, counter
 * count) combination and prints one table row per run with the elapsed time,
 * throughput and latency percentiles of the per-operation counter deltas.
 *
 * Command-line arguments are not used. Returns 0 on success and EXIT_FAILURE
 * if the sample buffer cannot be allocated; inconsistent internal tables
 * abort the process.
 */
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    // Test configurations
    const uint16_t thread_counts[] = {1, 10, 100, 1000};
    const uint16_t counter_counts[] = {1, 2, 8, 32, 128};
    double (*benchmarks[])(uint16_t, uint16_t, uint32_t, uint32_t[]) = {
        &run_mutex_counter_benchmark,
        &run_cas_relaxed_counter_benchmark,
        &run_fetch_add_seq_cst_counter_benchmark,
        &run_fetch_add_acq_rel_counter_benchmark,
        &run_fetch_add_relaxed_counter_benchmark,
        &run_plain_counter_benchmark,
        &run_noop_counter_benchmark,
    };
    const char* benchmark_names[] = { "Mutex", "CAS-Relaxed", "Atomic-SeqCst", "Atomic-AcqRel", "Atomic-Relaxed", "Plain", "NO-OP" };
    const size_t num_thread_configs = sizeof(thread_counts) / sizeof(thread_counts[0]);
    const size_t num_counter_configs = sizeof(counter_counts) / sizeof(counter_counts[0]);
    const size_t num_benchmarks = sizeof(benchmarks) / sizeof(benchmarks[0]);
    const size_t num_benchmark_names = sizeof(benchmark_names) / sizeof(benchmark_names[0]);
    if (num_benchmarks != num_benchmark_names) abort();

    // Largest thread count times largest counter count; widened before the
    // multiplication so bigger configurations cannot overflow int.
    const uint32_t total_iterations = (uint32_t)thread_counts[num_thread_configs - 1]
                                    * (uint32_t)counter_counts[num_counter_configs - 1];

    const size_t percentile_denominator = 100000;
    const size_t percentile_numerators[] = {50000, 90000, 99000, 99990, 99999};
    const size_t num_percentiles = sizeof(percentile_numerators) / sizeof(percentile_numerators[0]);
    uint32_t percentiles[sizeof(percentile_numerators) / sizeof(percentile_numerators[0])];
    // The row format below prints exactly five percentile columns.
    if (num_percentiles != 5) abort();

    // One sample per iteration: kept on the heap rather than as a large
    // variable-length array on main's stack.
    uint32_t *cycles = malloc((size_t)total_iterations * sizeof *cycles);
    if (cycles == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    printf("Total iterations: %u\n", total_iterations);
    printf("%-20s %4s %4s %8s %8s %12s %8s %8s %8s %8s %8s\n",
           "Type", "Thrd", "Cntr", "Total", "Time(s)", "Ops/sec", "P50", "P90", "P99", "P99.99", "P99.999");
    printf("===================================================================================================================\n");

    // Run benchmarks
    for (size_t i = 0; i < num_thread_configs; i++) {
        for (size_t j = 0; j < num_counter_configs; j++) {
            for (size_t k = 0; k < num_benchmarks; k++) {
                const double elapsed = benchmarks[k](thread_counts[i], counter_counts[j], total_iterations, cycles);
                // gettimeofday has microsecond resolution; report 0 rather
                // than dividing by a zero interval.
                const double ops_per_sec = elapsed > 0.0 ? total_iterations / elapsed : 0.0;
                calculate_percentiles(cycles, total_iterations,
                                      percentile_denominator, percentile_numerators,
                                      num_percentiles, percentiles);
                printf("%-20s %4u %4u %8u %8.3f %12.0f %8u %8u %8u %8u %8u\n",
                       benchmark_names[k],
                       (unsigned)thread_counts[i], (unsigned)counter_counts[j], total_iterations, elapsed, ops_per_sec,
                       percentiles[0], percentiles[1], percentiles[2], percentiles[3], percentiles[4]);
            }
            printf("\n");
        }
    }

    free(cycles);
    return 0;
}
@alekswn
Copy link
Author

alekswn commented Jan 5, 2026

c7i.48xlarge

Total iterations: 128000
Type                 Thrd Cntr    Total  Time(s)      Ops/sec      P50      P90      P99   P99.99  P99.999
===================================================================================================================
Mutex                   1    1   128000    0.007     19190405       76       88       94      929    27093
CAS-Relaxed             1    1   128000    0.005     25884732       58       60       70      490    28663
Atomic-SeqCst           1    1   128000    0.005     25723473       54       56       64      866    23339
Atomic-AcqRel           1    1   128000    0.005     26688907       54       62       66      387    23506
Atomic-Relaxed          1    1   128000    0.005     26370004       54       56       66      245    18338
Plain                   1    1   128000    0.004     33298647       32       38       40      175    20615
NO-OP                   1    1   128000    0.003     36613272       30       30       36      114    12999

Mutex                   1    2   128000    0.006     21052632       76       78       92      259    20308
CAS-Relaxed             1    2   128000    0.005     24502297       58       66       70      410    36700
Atomic-SeqCst           1    2   128000    0.005     26186579       54       56       66      470    42319
Atomic-AcqRel           1    2   128000    0.005     25281454       54       62       66      244     4619
Atomic-Relaxed          1    2   128000    0.005     26451746       54       56       66      243    10861
Plain                   1    2   128000    0.004     33083484       34       38       40      192    33256
NO-OP                   1    2   128000    0.004     36127576       30       34       36      192    16970

Mutex                   1    8   128000    0.006     20529270       76       90       94      772    41085
CAS-Relaxed             1    8   128000    0.005     24839899       58       66       70      510    35648
Atomic-SeqCst           1    8   128000    0.005     26042726       54       56       66      251    25173
Atomic-AcqRel           1    8   128000    0.005     26337449       54       56       66      229    30345
Atomic-Relaxed          1    8   128000    0.005     26528497       54       56       66      248    16663
Plain                   1    8   128000    0.004     33281331       34       38       40      192    16094
NO-OP                   1    8   128000    0.004     34867883       30       30       32       91    27887

Mutex                   1   32   128000    0.006     21056095       76       78       92      275    45185
CAS-Relaxed             1   32   128000    0.005     24568138       58       68       72      328    31830
Atomic-SeqCst           1   32   128000    0.005     26117119       54       62       66      248    45827
Atomic-AcqRel           1   32   128000    0.005     25661588       54       58       66      257    41060
Atomic-Relaxed          1   32   128000    0.005     25432148       54       56       66      251    39065
Plain                   1   32   128000    0.004     34858388       32       34       40      157    29890
NO-OP                   1   32   128000    0.004     35884497       30       34       36      192    27079

Mutex                   1  128   128000    0.006     20618557       76       78       92      954    32904
CAS-Relaxed             1  128   128000    0.005     24413504       58       66       70      281   118160
Atomic-SeqCst           1  128   128000    0.005     25702811       54       62       66      438    24564
Atomic-AcqRel           1  128   128000    0.005     26229508       54       62       66      278    30978
Atomic-Relaxed          1  128   128000    0.005     26294166       54       58       66      249    29847
Plain                   1  128   128000    0.004     32703117       32       38       40      192    16511
NO-OP                   1  128   128000    0.004     36281179       30       30       36      112    12147

Mutex                  10    1   128000    0.018      6947083      520     8466    46018   128787   161812
CAS-Relaxed            10    1   128000    0.010     12342108     1236     3490     7216    46709    86757
Atomic-SeqCst          10    1   128000    0.009     13500686     1122     3070     6456    38333    73604
Atomic-AcqRel          10    1   128000    0.010     13418597     1106     3042     6556    33399    63412
Atomic-Relaxed         10    1   128000    0.010     13223140     1186     3158     6458    32148    73457
Plain                  10    1   128000    0.002     63586687       32       34       42     4275    56476
NO-OP                  10    1   128000    0.001    134878820       30       34       36      648    19016

Mutex                  10    2   128000    0.029      4373526       90    15420    77274   214273   463800
CAS-Relaxed            10    2   128000    0.011     11958146     1222     3672     7888    36440    57286
Atomic-SeqCst          10    2   128000    0.010     12327844     1238     3432     7308    57211    93246
Atomic-AcqRel          10    2   128000    0.010     12602146     1218     3358     7258    32903    69196
Atomic-Relaxed         10    2   128000    0.010     12643224     1220     3386     7260    40118    92638
Plain                  10    2   128000    0.003     42077581       32       34       40    30274    89488
NO-OP                  10    2   128000    0.001    146620848       30       34       36      716    39562

Mutex                  10    8   128000    0.017      7702955      306     6262    47734   129176   321762
CAS-Relaxed            10    8   128000    0.008     15936255      904     2956     6496    37724    70223
Atomic-SeqCst          10    8   128000    0.008     16978379      690     2518     6896    35699    57451
Atomic-AcqRel          10    8   128000    0.007     17114588      774     2540     6178    32751   115373
Atomic-Relaxed         10    8   128000    0.008     16946909      796     2616     6068    53538    80883
Plain                  10    8   128000    0.001     94885100       32       34       40     1191    40594
NO-OP                  10    8   128000    0.001    155151515       30       36       36      737     4162

Mutex                  10   32   128000    0.008     15892724      270     2336    23324    74672   260960
CAS-Relaxed            10   32   128000    0.004     31651830      318     1400     4168    37482    47125
Atomic-SeqCst          10   32   128000    0.004     36168409      316     1090     3168    37237    74021
Atomic-AcqRel          10   32   128000    0.004     34848897      308     1016     2992    32915    51717
Atomic-Relaxed         10   32   128000    0.004     35995501      320     1036     3152    31359    58442
Plain                  10   32   128000    0.001    143982002       32       34       40      614    28550
NO-OP                  10   32   128000    0.001    161209068       30       30       36      772    23920

Mutex                  10  128   128000    0.005     27485506      110     1118    13900    61637    99586
CAS-Relaxed            10  128   128000    0.002     55291577      220      692     1832    31424    45492
Atomic-SeqCst          10  128   128000    0.002     61805891      224      610     1238    31930    67079
Atomic-AcqRel          10  128   128000    0.002     63713290      226      586     1176    28096    63936
Atomic-Relaxed         10  128   128000    0.002     60663507      230      616     1296    14518    38024
Plain                  10  128   128000    0.001    131416838       32       34       40      859    32167
NO-OP                  10  128   128000    0.001    166233766       30       34       36      817    35611

Mutex                 100    1   128000    0.019      6780738       90      650   807386  4834457  6794460
CAS-Relaxed           100    1   128000    0.011     12109745     5298    21276    51508   155459   289425
Atomic-SeqCst         100    1   128000    0.010     12255841     5066    20448    50732   209100   327963
Atomic-AcqRel         100    1   128000    0.010     12829508     3866    18924    48924   136071   298818
Atomic-Relaxed        100    1   128000    0.011     11759302     4724    20304    52618   159233   261687
Plain                 100    1   128000    0.005     26970080       36       38       42     8198    53671
NO-OP                 100    1   128000    0.004     29110757       32       34       36      827    20504

Mutex                 100    2   128000    0.016      7782101       96     1334   675568  4897769  7736744
CAS-Relaxed           100    2   128000    0.011     12115476     5312    21210    52660   207331   447459
Atomic-SeqCst         100    2   128000    0.010     12479282     3992    18534    49228   117267   156784
Atomic-AcqRel         100    2   128000    0.010     12462272     4760    19710    50536   137459   234919
Atomic-Relaxed        100    2   128000    0.011     12091442     5092    21262    54110   145610   183528
Plain                 100    2   128000    0.006     23017443       36       38       40     7307    93053
NO-OP                 100    2   128000    0.005     27777778       32       34       36      890    21656

Mutex                 100    8   128000    0.014      8889506      192     1414   440190  5450065  7055383
CAS-Relaxed           100    8   128000    0.010     12209081     5036    21056    57280   230848   311123
Atomic-SeqCst         100    8   128000    0.010     12679544     4420    19946    48820   221404   527319
Atomic-AcqRel         100    8   128000    0.010     12685828     5180    20428    48706   191758   258429
Atomic-Relaxed        100    8   128000    0.011     12174244     3386    18630    50778   217544   504424
Plain                 100    8   128000    0.004     28783450       36       38       40      957   196347
NO-OP                 100    8   128000    0.005     26310380       30       32       34      842    21127

Mutex                 100   32   128000    0.007     19101627      366     3932    88480   963953  1295006
CAS-Relaxed           100   32   128000    0.005     27765727      396     1576     4814    48727    67100
Atomic-SeqCst         100   32   128000    0.005     25979298      266      752     1798    33856    73468
Atomic-AcqRel         100   32   128000    0.005     27503223      258      788     2158    30726    56968
Atomic-Relaxed        100   32   128000    0.006     22800143      254      750     1914    29025    72471
Plain                 100   32   128000    0.004     28913485       34       36       40      985    28447
NO-OP                 100   32   128000    0.005     27084215       30       32       34      811    27828

Mutex                 100  128   128000    0.006     20075282      208      904    13166    83822   688412
CAS-Relaxed           100  128   128000    0.006     21680217       64      558     1248    46932   290142
Atomic-SeqCst         100  128   128000    0.004     30865686       56      462      772     5060    56685
Atomic-AcqRel         100  128   128000    0.005     24591739       56      430      740    23280    42370
Atomic-Relaxed        100  128   128000    0.004     29767442       58      474      774     4047    49157
Plain                 100  128   128000    0.004     31022782       34       38       40      858    28269
NO-OP                 100  128   128000    0.005     27009918       30       34       36      730   188831

Mutex                1000    1   128000    0.041      3088952       80       92     1700    74179   100791
CAS-Relaxed          1000    1   128000    0.043      2985562       62      260      828     4750    30915
Atomic-SeqCst        1000    1   128000    0.040      3236901       62       76      656     2540   192160
Atomic-AcqRel        1000    1   128000    0.043      3003990       60      268      744     2833    43712
Atomic-Relaxed       1000    1   128000    0.043      3003355       62      140      744     2907    40960
Plain                1000    1   128000    0.040      3190906       36       40      102     1032     4140
NO-OP                1000    1   128000    0.042      3069103       32       34      102      944     8606

Mutex                1000    2   128000    0.044      2942326       82      136     1268    53526    76698
CAS-Relaxed          1000    2   128000    0.042      3032457       66      128      738     3537    38346
Atomic-SeqCst        1000    2   128000    0.043      2948969       62      254      752     2936    43995
Atomic-AcqRel        1000    2   128000    0.044      2941852       62      264      752     2977    36149
Atomic-Relaxed       1000    2   128000    0.043      3008155       58      234      828     4830    47923
Plain                1000    2   128000    0.042      3064400       36       40      102     1027    43904
NO-OP                1000    2   128000    0.042      3039658       34       36      102      997   132405

Mutex                1000    8   128000    0.043      2961797       82      292     1254    43203   122786
CAS-Relaxed          1000    8   128000    0.044      2925848       64      136      746     4945   153891
Atomic-SeqCst        1000    8   128000    0.043      2976675       62       96      646     1769    47251
Atomic-AcqRel        1000    8   128000    0.044      2922575       62      102      676     1764    42471
Atomic-Relaxed       1000    8   128000    0.046      2784970       60      202      712     2010    29823
Plain                1000    8   128000    0.043      2950669       38       40      102     1026    24536
NO-OP                1000    8   128000    0.040      3165026       32       36       94     1005    28606

Mutex                1000   32   128000    0.043      2979446       88      172      894    37253    49850
CAS-Relaxed          1000   32   128000    0.042      3061908       66       88      700     2139    47612
Atomic-SeqCst        1000   32   128000    0.045      2846848       60       98      672     1525    40810
Atomic-AcqRel        1000   32   128000    0.046      2792321       60       96      684     1691    29755
Atomic-Relaxed       1000   32   128000    0.043      2990025       62       88      652     1655    23391
Plain                1000   32   128000    0.043      2957486       36       40       96     1056   314588
NO-OP                1000   32   128000    0.043      2983544       32       34      102      961    45061

Mutex                1000  128   128000    0.044      2888021       88      254      916    28456    94522
CAS-Relaxed          1000  128   128000    0.044      2907373       66       84      768     1829    42714
Atomic-SeqCst        1000  128   128000    0.043      2943002       60       74      726     1657    31082
Atomic-AcqRel        1000  128   128000    0.045      2859441       60       86      734     2497    63193
Atomic-Relaxed       1000  128   128000    0.043      3011765       62       74      744     1594    34283
Plain                1000  128   128000    0.042      3022789       38       40       56     1258    43201
NO-OP                1000  128   128000    0.040      3192975       32       36      104      938     1116

ubuntu@ip-172-31-40-71:~/24fb2d1892bb0914b34ca9bcf66145f6$ head /proc/cpuinfo
processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 143
model name      : Intel(R) Xeon(R) Platinum 8488C
stepping        : 8
microcode       : 0x2b000643
cpu MHz         : 3200.948
cache size      : 107520 KB
physical id     : 0

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment