Last active
January 5, 2026 20:39
-
-
Save alekswn/24fb2d1892bb0914b34ca9bcf66145f6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <stdatomic.h> | |
| #include <pthread.h> | |
| #include <time.h> | |
| #include <stdint.h> | |
| #include <unistd.h> | |
| #include <string.h> | |
| #include <sys/time.h> | |
| #ifdef __x86_64__ | |
| #include <x86intrin.h> | |
| #define GET_CYCLES() __rdtsc() | |
| #elif defined(__aarch64__) | |
| static inline uint64_t GET_CYCLES(void) { | |
| uint64_t cycles; | |
| __asm__ volatile("mrs %0, cntvct_el0" : "=r" (cycles)); | |
| return cycles; | |
| } | |
| #else | |
| #define GET_CYCLES() 0 // Fallback for unsupported architectures | |
| #endif | |
| typedef struct { | |
| size_t value; | |
| pthread_mutex_t mutex; | |
| } mutex_counter_t; | |
| typedef struct { | |
| union { | |
| mutex_counter_t *mutex_counters; | |
| atomic_size_t *atomic_counters; | |
| size_t *plain_counters; | |
| }; // 8 bytes | |
| uint32_t *cycles; // 4 bytes | |
| uint32_t num_iterations; // 4 bytes | |
| uint16_t num_counters; // 2 bytes | |
| uint16_t thread_id; // 2 bytes | |
| memory_order mem_order; //1 byte? | |
| } __attribute__((aligned(64))) thread_data_t; | |
| static inline | |
| void increment_cas_counter(atomic_size_t *counter, memory_order mem_order) { | |
| size_t expected, required; | |
| do { | |
| expected = atomic_load_explicit(counter, mem_order); | |
| required = expected + 1; | |
| } while (!atomic_compare_exchange_weak_explicit(counter, | |
| &expected, required, | |
| mem_order, mem_order)); | |
| } | |
| #define DECLARE_COUNTER_THREAD(name, impl) \ | |
| void* name##_counter_thread(void* arg) { \ | |
| thread_data_t *data = (thread_data_t*)arg; \ | |
| for (size_t i = 0, j = 0; i < data->num_iterations; i++, j = (j+1) % data->num_counters) { \ | |
| uint64_t start_cycles = GET_CYCLES(); \ | |
| do { \ | |
| impl \ | |
| } while(0); \ | |
| uint64_t end_cycles = GET_CYCLES(); \ | |
| data->cycles[i] = end_cycles - start_cycles; \ | |
| } \ | |
| return NULL; \ | |
| } | |
| DECLARE_COUNTER_THREAD(mutex, | |
| pthread_mutex_lock(&data->mutex_counters[j].mutex); | |
| data->mutex_counters[j].value++; | |
| pthread_mutex_unlock(&data->mutex_counters[j].mutex); | |
| ) | |
| DECLARE_COUNTER_THREAD(atomic_fetch_add, | |
| atomic_fetch_add_explicit(&data->atomic_counters[j], 1, data->mem_order); | |
| ) | |
| DECLARE_COUNTER_THREAD(atomic_cas, | |
| increment_cas_counter(&data->atomic_counters[j], data->mem_order); | |
| ) | |
| DECLARE_COUNTER_THREAD(plain, | |
| data->plain_counters[(size_t)data->thread_id*(size_t)data->num_counters + j]++; | |
| ) | |
| DECLARE_COUNTER_THREAD(noop, | |
| (void)0; | |
| ) | |
| double get_time_diff(struct timeval *start, struct timeval *end) { | |
| return (end->tv_sec - start->tv_sec) + (end->tv_usec - start->tv_usec) / 1000000.0; | |
| } | |
| int compare_uint32(const void *a, const void *b) { | |
| uint32_t ua = *(const uint32_t*)a; | |
| uint32_t ub = *(const uint32_t*)b; | |
| return (ua > ub) - (ua < ub); | |
| } | |
| void calculate_percentiles(uint32_t array[], size_t sz, | |
| size_t denominator, const size_t numerators[], | |
| size_t num_percentiles, uint32_t out_percentiles[]) { | |
| qsort(array, sz, sizeof(uint32_t), compare_uint32); | |
| for (size_t i = 0; i < num_percentiles; i++) { | |
| const size_t sz_mult_numerator = sz * numerators[i]; | |
| const size_t index = sz_mult_numerator / denominator; | |
| const size_t reminder = sz_mult_numerator % denominator; | |
| if (index == 0) { | |
| out_percentiles[i] = array[0]; | |
| continue; | |
| } | |
| out_percentiles[i] = array[index - 1]; | |
| if (reminder && index) { | |
| out_percentiles[i] += array[index+1]; | |
| out_percentiles[i] /= 2; | |
| } | |
| } | |
| } | |
| #define DECLARE_RUN_BENCHMARK(name, counter_decl, counter_name, thread_name, init_impl, cleanup_impl, m_order) \ | |
| double run_##name##_counter_benchmark(uint16_t num_threads, uint16_t num_counters, \ | |
| uint32_t total_iterations, uint32_t cycles[]) { \ | |
| pthread_t threads[num_threads]; \ | |
| thread_data_t thread_data[num_threads]; \ | |
| counter_decl; \ | |
| struct timeval start, end; \ | |
| for (size_t i = 0; i < num_counters; i++) { \ | |
| init_impl \ | |
| } \ | |
| const uint32_t increments_per_iteration = (uint32_t)num_threads; \ | |
| const uint32_t base_iterations = total_iterations / increments_per_iteration; \ | |
| const uint32_t remainder = total_iterations % increments_per_iteration; \ | |
| for (size_t t = 0, i = 0; t < num_threads; t++) { \ | |
| const uint32_t num_iterations = base_iterations + (t < remainder ? 1 : 0); \ | |
| thread_data[t].counter_name = counter_name; \ | |
| thread_data[t].cycles = &cycles[i]; \ | |
| thread_data[t].num_counters = num_counters; \ | |
| thread_data[t].num_iterations = num_iterations; \ | |
| thread_data[t].thread_id = t; \ | |
| thread_data[t].mem_order = m_order; \ | |
| i+=num_iterations; \ | |
| } \ | |
| gettimeofday(&start, NULL); \ | |
| for (size_t t = 0; t < num_threads; t++) { \ | |
| pthread_create(&threads[t], NULL, thread_name##_counter_thread, &thread_data[t]); \ | |
| } \ | |
| for (int t = 0; t < num_threads; t++) { \ | |
| pthread_join(threads[t], NULL); \ | |
| } \ | |
| gettimeofday(&end, NULL); \ | |
| uint32_t total_increments = 0; \ | |
| for (int i = 0; i < num_counters; i++) { \ | |
| cleanup_impl \ | |
| } \ | |
| if (total_increments != total_iterations) abort(); \ | |
| return get_time_diff(&start, &end); \ | |
| } | |
| DECLARE_RUN_BENCHMARK(mutex, mutex_counter_t mutex_counters[num_counters], mutex_counters, mutex, { | |
| if (pthread_mutex_init(&mutex_counters[i].mutex, NULL)) | |
| abort(); | |
| mutex_counters[i].value = 0; | |
| },{ | |
| total_increments += mutex_counters[i].value; | |
| pthread_mutex_destroy(&mutex_counters[i].mutex); | |
| }, 0) | |
| DECLARE_RUN_BENCHMARK(cas_relaxed, atomic_size_t atomic_counters[num_counters], atomic_counters, atomic_cas, { | |
| atomic_init(&atomic_counters[i], 0); | |
| },{ | |
| total_increments += atomic_load(&atomic_counters[i]); | |
| }, memory_order_relaxed) | |
| DECLARE_RUN_BENCHMARK(fetch_add_seq_cst, atomic_size_t atomic_counters[num_counters], atomic_counters, atomic_fetch_add, { | |
| atomic_init(&atomic_counters[i], 0); | |
| },{ | |
| total_increments += atomic_load(&atomic_counters[i]); | |
| }, memory_order_seq_cst) | |
| DECLARE_RUN_BENCHMARK(fetch_add_acq_rel, atomic_size_t atomic_counters[num_counters], atomic_counters, atomic_fetch_add, { | |
| atomic_init(&atomic_counters[i], 0); | |
| },{ | |
| total_increments += atomic_load(&atomic_counters[i]); | |
| }, memory_order_acq_rel) | |
| DECLARE_RUN_BENCHMARK(fetch_add_relaxed, atomic_size_t atomic_counters[num_counters], atomic_counters, atomic_fetch_add, { | |
| atomic_init(&atomic_counters[i], 0); | |
| },{ | |
| total_increments += atomic_load(&atomic_counters[i]); | |
| }, memory_order_relaxed) | |
| DECLARE_RUN_BENCHMARK(plain, size_t plain_counters[(size_t)num_threads*(size_t)num_counters], plain_counters, plain, { | |
| (void)0; | |
| },{ | |
| total_increments = total_iterations; | |
| }, 0) | |
| DECLARE_RUN_BENCHMARK(noop, size_t *plain_counters = NULL, plain_counters, noop, { | |
| (void)0; | |
| },{ | |
| total_increments = total_iterations; | |
| }, 0) | |
| int main(int argc, char *argv[]) { | |
| // Test configurations | |
| const uint16_t thread_counts[] = {1, 10, 100, 1000}; | |
| const uint16_t counter_counts[] = {1, 2, 8, 32, 128}; | |
| double (*benchmarks[])(uint16_t, uint16_t, uint32_t, uint32_t[]) = { | |
| &run_mutex_counter_benchmark, | |
| &run_cas_relaxed_counter_benchmark, | |
| &run_fetch_add_seq_cst_counter_benchmark, | |
| &run_fetch_add_acq_rel_counter_benchmark, | |
| &run_fetch_add_relaxed_counter_benchmark, | |
| &run_plain_counter_benchmark, | |
| &run_noop_counter_benchmark, | |
| }; | |
| const char* benchmark_names[] = { "Mutex", "CAS-Relaxed", "Atomic-SeqCst", "Atomic-AcqRel", "Atomic-Relaxed", "Plain", "NO-OP" }; | |
| const size_t num_thread_configs = sizeof(thread_counts) / sizeof(thread_counts[0]); | |
| const size_t num_counter_configs = sizeof(counter_counts) / sizeof(counter_counts[0]); | |
| const size_t num_benchmarks = sizeof(benchmarks) / sizeof(benchmarks[0]); | |
| const size_t num_benchmark_names = sizeof(benchmark_names) / sizeof(benchmark_names[0]); | |
| if (num_benchmarks != num_benchmark_names) abort(); | |
| const uint32_t total_iterations = thread_counts[num_thread_configs - 1] * counter_counts[num_counter_configs - 1]; | |
| const size_t percentile_denominator = 100000; | |
| const size_t percentile_numerators[] = {50000, 90000, 99000, 99990, 99999}; | |
| const size_t num_percentiles = sizeof(percentile_numerators) / sizeof(percentile_numerators[0]); | |
| uint32_t percentiles[num_percentiles]; | |
| uint32_t cycles[total_iterations]; | |
| printf("Total iterations: %u\n", total_iterations); | |
| printf("%-20s %4s %4s %8s %8s %12s %8s %8s %8s %8s %8s\n", | |
| "Type", "Thrd", "Cntr", "Total", "Time(s)", "Ops/sec", "P50", "P90", "P99", "P99.99", "P99.999"); | |
| printf("===================================================================================================================\n"); | |
| // Run benchmarks | |
| for (size_t i = 0; i < num_thread_configs; i++) { | |
| for (size_t j = 0; j < num_counter_configs; j++) { | |
| for (size_t k = 0; k < num_benchmarks; k++) { | |
| const double elapsed = benchmarks[k](thread_counts[i], counter_counts[j], total_iterations, cycles); | |
| const double ops_per_sec = total_iterations / elapsed; | |
| calculate_percentiles(cycles, total_iterations, | |
| percentile_denominator, percentile_numerators, | |
| num_percentiles, percentiles); | |
| printf("%-20s %4u %4u %8u %8.3f %12.0f %8u %8u %8u %8u %8u\n", | |
| benchmark_names[k], | |
| thread_counts[i], counter_counts[j], total_iterations, elapsed, ops_per_sec, | |
| percentiles[0], percentiles[1], percentiles[2], percentiles[3], percentiles[4]); | |
| } | |
| printf("\n"); | |
| } | |
| } | |
| return 0; | |
| } |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
c7i.48xlarge