Created
February 18, 2026 20:55
-
-
Save Pikachuxxxx/1e5490570efe6c94c3ff10a5f857ad91 to your computer and use it in GitHub Desktop.
simple radix sort benchmarking with serial vs SIMD NEON, speedup = ~1.5x
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <stdint.h> | |
| #include <string.h> | |
| #include <time.h> | |
| #ifdef __aarch64__ | |
| #include <arm_neon.h> | |
| #endif | |
| // ================= BUILD INFO ================= | |
| #ifndef __OPTIMIZE__ | |
| #define BUILD_OPT_LEVEL "O0" | |
| #elif defined(__OPTIMIZE_SIZE__) | |
| #define BUILD_OPT_LEVEL "Os" | |
| #else | |
| #define BUILD_OPT_LEVEL "O2/O3" | |
| #endif | |
| #if defined(__clang__) | |
| #define BUILD_COMPILER "clang" | |
| #elif defined(__GNUC__) | |
| #define BUILD_COMPILER "gcc" | |
| #else | |
| #define BUILD_COMPILER "unknown" | |
| #endif | |
| #if defined(__aarch64__) | |
| #define BUILD_ARCH "arm64" | |
| #elif defined(__x86_64__) | |
| #define BUILD_ARCH "x86_64" | |
| #else | |
| #define BUILD_ARCH "unknown" | |
| #endif | |
| #define PRINT_BUILD_INFO() \ | |
| printf("Compiler: %s | Opt: %s | Arch: %s\n", \ | |
| BUILD_COMPILER, BUILD_OPT_LEVEL, BUILD_ARCH) | |
| // ================= CONFIG ================= | |
| #define RADIX_BUCKETS 256 | |
| #define RADIX_MASK 0xFF | |
| #define RADIX_SHIFT 8 | |
| #define SIMD_LANES 4 | |
| #define TEST_SIZE 1000000 | |
| // ================= TIMING ================= | |
| #define TIMER_DECL(name) \ | |
| struct timespec name##_start, name##_end | |
| #define TIMER_START(name) \ | |
| clock_gettime(CLOCK_MONOTONIC, &name##_start) | |
| #define TIMER_END(name) \ | |
| clock_gettime(CLOCK_MONOTONIC, &name##_end) | |
| #define TIMER_MS(name) \ | |
| ((name##_end.tv_sec - name##_start.tv_sec) * 1e3 + \ | |
| (name##_end.tv_nsec - name##_start.tv_nsec) * 1e-6) | |
| // ================= UTIL ================= | |
| #define VERIFY_SORT(arr, size) \ | |
| do { \ | |
| int ok = 1; \ | |
| for (uint32_t i = 1; i < size; i++) \ | |
| if (arr[i] < arr[i-1]) ok = 0; \ | |
| printf("Verification: %s\n", ok ? "OK" : "FAIL"); \ | |
| } while (0) | |
| // ================= SERIAL PASS ================= | |
| void radix_pass_serial(const uint32_t* input, uint32_t* output, uint32_t size, uint32_t shift) | |
| { | |
| uint32_t histogram[RADIX_BUCKETS] = {0}; | |
| for (uint32_t i = 0; i < size; i++) | |
| histogram[(input[i] >> shift) & RADIX_MASK]++; | |
| uint32_t offsets[RADIX_BUCKETS]; | |
| uint32_t sum = 0; | |
| for (uint32_t i = 0; i < RADIX_BUCKETS; i++) | |
| { | |
| offsets[i] = sum; | |
| sum += histogram[i]; | |
| } | |
| for (uint32_t i = 0; i < size; i++) | |
| { | |
| uint32_t v = input[i]; | |
| uint32_t d = (v >> shift) & RADIX_MASK; | |
| output[offsets[d]++] = v; | |
| } | |
| } | |
| // ================= NEON PASSES ================= | |
| #ifdef __aarch64__ | |
| #define DEFINE_RADIX_PASS_NEON_0() \ | |
| void radix_pass_neon_0(const uint32_t* input, uint32_t* output, uint32_t size) \ | |
| { \ | |
| uint32_t hist0[RADIX_BUCKETS] = {0}; \ | |
| uint32_t hist1[RADIX_BUCKETS] = {0}; \ | |
| uint32_t hist2[RADIX_BUCKETS] = {0}; \ | |
| uint32_t hist3[RADIX_BUCKETS] = {0}; \ | |
| \ | |
| uint32x4_t mask = vdupq_n_u32(RADIX_MASK); \ | |
| \ | |
| uint32_t i = 0; \ | |
| for (; i + 16 <= size; i += 16)\ | |
| {\ | |
| uint32x4_t v0 = vld1q_u32(input + i + 0);\ | |
| uint32x4_t v1 = vld1q_u32(input + i + 4);\ | |
| uint32x4_t v2 = vld1q_u32(input + i + 8);\ | |
| uint32x4_t v3 = vld1q_u32(input + i + 12);\ | |
| \ | |
| uint32x4_t d0 = vandq_u32(v0, mask);\ | |
| uint32x4_t d1 = vandq_u32(v1, mask);\ | |
| uint32x4_t d2 = vandq_u32(v2, mask);\ | |
| uint32x4_t d3 = vandq_u32(v3, mask);\ | |
| \ | |
| uint32_t tmp0[4], tmp1[4], tmp2[4], tmp3[4];\ | |
| \ | |
| vst1q_u32(tmp0, d0);\ | |
| vst1q_u32(tmp1, d1);\ | |
| vst1q_u32(tmp2, d2);\ | |
| vst1q_u32(tmp3, d3);\ | |
| \ | |
| hist0[tmp0[0]]++; hist1[tmp0[1]]++; hist2[tmp0[2]]++; hist3[tmp0[3]]++;\ | |
| hist0[tmp1[0]]++; hist1[tmp1[1]]++; hist2[tmp1[2]]++; hist3[tmp1[3]]++;\ | |
| hist0[tmp2[0]]++; hist1[tmp2[1]]++; hist2[tmp2[2]]++; hist3[tmp2[3]]++;\ | |
| hist0[tmp3[0]]++; hist1[tmp3[1]]++; hist2[tmp3[2]]++; hist3[tmp3[3]]++;\ | |
| }\ | |
| \ | |
| for (; i < size; i++) \ | |
| hist0[input[i] & RADIX_MASK]++; \ | |
| \ | |
| uint32_t offsets[RADIX_BUCKETS]; \ | |
| uint32_t sum = 0; \ | |
| \ | |
| for (uint32_t j = 0; j < RADIX_BUCKETS; j++) \ | |
| { \ | |
| hist0[j] += hist1[j] + hist2[j] + hist3[j]; \ | |
| offsets[j] = sum; \ | |
| sum += hist0[j]; \ | |
| } \ | |
| \ | |
| for (uint32_t j = 0; j < size; j++) \ | |
| { \ | |
| uint32_t v = input[j]; \ | |
| uint32_t d = v & RADIX_MASK; \ | |
| output[offsets[d]++] = v; \ | |
| } \ | |
| } | |
| #define DEFINE_RADIX_PASS_NEON(SHIFT) \ | |
| void radix_pass_neon_##SHIFT(const uint32_t* input, uint32_t* output, uint32_t size) \ | |
| { \ | |
| uint32_t hist0[RADIX_BUCKETS] = {0}; \ | |
| uint32_t hist1[RADIX_BUCKETS] = {0}; \ | |
| uint32_t hist2[RADIX_BUCKETS] = {0}; \ | |
| uint32_t hist3[RADIX_BUCKETS] = {0}; \ | |
| \ | |
| uint32x4_t mask = vdupq_n_u32(RADIX_MASK); \ | |
| \ | |
| uint32_t i = 0; \ | |
| for (; i + 16 <= size; i += 16)\ | |
| {\ | |
| uint32x4_t v0 = vld1q_u32(input + i + 0);\ | |
| uint32x4_t v1 = vld1q_u32(input + i + 4);\ | |
| uint32x4_t v2 = vld1q_u32(input + i + 8);\ | |
| uint32x4_t v3 = vld1q_u32(input + i + 12);\ | |
| \ | |
| uint32x4_t d0 = vandq_u32(vshrq_n_u32(v0, SHIFT), mask);\ | |
| uint32x4_t d1 = vandq_u32(vshrq_n_u32(v1, SHIFT), mask);\ | |
| uint32x4_t d2 = vandq_u32(vshrq_n_u32(v2, SHIFT), mask);\ | |
| uint32x4_t d3 = vandq_u32(vshrq_n_u32(v3, SHIFT), mask);\ | |
| \ | |
| uint32_t tmp0[4], tmp1[4], tmp2[4], tmp3[4];\ | |
| \ | |
| vst1q_u32(tmp0, d0);\ | |
| vst1q_u32(tmp1, d1);\ | |
| vst1q_u32(tmp2, d2);\ | |
| vst1q_u32(tmp3, d3);\ | |
| \ | |
| hist0[tmp0[0]]++; hist1[tmp0[1]]++; hist2[tmp0[2]]++; hist3[tmp0[3]]++;\ | |
| hist0[tmp1[0]]++; hist1[tmp1[1]]++; hist2[tmp1[2]]++; hist3[tmp1[3]]++;\ | |
| hist0[tmp2[0]]++; hist1[tmp2[1]]++; hist2[tmp2[2]]++; hist3[tmp2[3]]++;\ | |
| hist0[tmp3[0]]++; hist1[tmp3[1]]++; hist2[tmp3[2]]++; hist3[tmp3[3]]++;\ | |
| }\ | |
| \ | |
| for (; i < size; i++) \ | |
| hist0[(input[i] >> SHIFT) & RADIX_MASK]++; \ | |
| \ | |
| uint32_t offsets[RADIX_BUCKETS]; \ | |
| uint32_t sum = 0; \ | |
| \ | |
| for (uint32_t j = 0; j < RADIX_BUCKETS; j++) \ | |
| { \ | |
| hist0[j] += hist1[j] + hist2[j] + hist3[j]; \ | |
| offsets[j] = sum; \ | |
| sum += hist0[j]; \ | |
| } \ | |
| \ | |
| for (uint32_t j = 0; j < size; j++) \ | |
| { \ | |
| uint32_t v = input[j]; \ | |
| uint32_t d = (v >> SHIFT) & RADIX_MASK; \ | |
| output[offsets[d]++] = v; \ | |
| } \ | |
| } | |
| DEFINE_RADIX_PASS_NEON_0() | |
| DEFINE_RADIX_PASS_NEON(8) | |
| DEFINE_RADIX_PASS_NEON(16) | |
| DEFINE_RADIX_PASS_NEON(24) | |
| #endif | |
| // ================= SORT ================= | |
| void radix_sort_serial(uint32_t* data, uint32_t size) | |
| { | |
| uint32_t* temp = malloc(size * sizeof(uint32_t)); | |
| uint32_t* src = data; | |
| uint32_t* dst = temp; | |
| uint32_t* t; | |
| radix_pass_serial(src, dst, size, 0); t = src; src = dst; dst = t; | |
| radix_pass_serial(src, dst, size, 8); t = src; src = dst; dst = t; | |
| radix_pass_serial(src, dst, size, 16); t = src; src = dst; dst = t; | |
| radix_pass_serial(src, dst, size, 24); t = src; src = dst; dst = t; | |
| if (src != data) | |
| memcpy(data, src, size * sizeof(uint32_t)); | |
| free(temp); | |
| } | |
| #ifdef __aarch64__ | |
| void radix_sort_neon(uint32_t* data, uint32_t size) | |
| { | |
| uint32_t* temp = malloc(size * sizeof(uint32_t)); | |
| uint32_t* src = data; | |
| uint32_t* dst = temp; | |
| uint32_t* t; | |
| radix_pass_neon_0(src, dst, size); t = src; src = dst; dst = t; | |
| radix_pass_neon_8(src, dst, size); t = src; src = dst; dst = t; | |
| radix_pass_neon_16(src, dst, size); t = src; src = dst; dst = t; | |
| radix_pass_neon_24(src, dst, size); t = src; src = dst; dst = t; | |
| if (src != data) | |
| memcpy(data, src, size * sizeof(uint32_t)); | |
| free(temp); | |
| } | |
| #endif | |
| // ================= TEST ================= | |
| void fill_random(uint32_t* data, uint32_t size) | |
| { | |
| for (uint32_t i = 0; i < size; i++) | |
| data[i] = rand(); | |
| } | |
| // ================= MAIN ================= | |
| int main() | |
| { | |
| PRINT_BUILD_INFO(); | |
| uint32_t* data = malloc(TEST_SIZE * sizeof(uint32_t)); | |
| fill_random(data, TEST_SIZE); | |
| TIMER_DECL(sort_serial); | |
| TIMER_START(sort_serial); | |
| radix_sort_serial(data, TEST_SIZE); | |
| TIMER_END(sort_serial); | |
| { | |
| double time_ms = TIMER_MS(sort_serial); | |
| VERIFY_SORT(data, TEST_SIZE); | |
| double elems_per_sec = (TEST_SIZE / time_ms) * 1000.0; | |
| double mb_per_sec = | |
| (TEST_SIZE * sizeof(uint32_t) / (1024.0 * 1024.0)) / | |
| (time_ms / 1000.0); | |
| printf("===== Serial Stats =====\n"); | |
| printf("Elements: %u\n", TEST_SIZE); | |
| printf("Data size: %.2f MB\n", | |
| TEST_SIZE * sizeof(uint32_t) / (1024.0 * 1024.0)); | |
| printf("Sort time: %.6f ms\n", time_ms); | |
| printf("Throughput: %.2f million elems/sec\n", | |
| elems_per_sec / 1e6); | |
| printf("Bandwidth: %.2f MB/sec\n", mb_per_sec); | |
| } | |
| #ifdef __aarch64__ | |
| fill_random(data, TEST_SIZE); | |
| TIMER_DECL(sort_neon); | |
| TIMER_START(sort_neon); | |
| radix_sort_neon(data, TEST_SIZE); | |
| TIMER_END(sort_neon); | |
| { | |
| double time_ms = TIMER_MS(sort_neon); | |
| VERIFY_SORT(data, TEST_SIZE); | |
| double elems_per_sec = (TEST_SIZE / time_ms) * 1000.0; | |
| double mb_per_sec = | |
| (TEST_SIZE * sizeof(uint32_t) / (1024.0 * 1024.0)) / | |
| (time_ms / 1000.0); | |
| printf("===== NEON Stats =====\n"); | |
| printf("Elements: %u\n", TEST_SIZE); | |
| printf("Data size: %.2f MB\n", | |
| TEST_SIZE * sizeof(uint32_t) / (1024.0 * 1024.0)); | |
| printf("Sort time: %.6f ms\n", time_ms); | |
| printf("Throughput: %.2f million elems/sec\n", | |
| elems_per_sec / 1e6); | |
| printf("Bandwidth: %.2f MB/sec\n", mb_per_sec); | |
| } | |
| #endif | |
| free(data); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment