Skip to content

Instantly share code, notes, and snippets.

@Pikachuxxxx
Created February 18, 2026 20:55
Show Gist options
  • Select an option

  • Save Pikachuxxxx/1e5490570efe6c94c3ff10a5f857ad91 to your computer and use it in GitHub Desktop.

Select an option

Save Pikachuxxxx/1e5490570efe6c94c3ff10a5f857ad91 to your computer and use it in GitHub Desktop.
simple radix sort benchmarking with serial vs SIMD NEON, speedup = ~1.5x
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#ifdef __aarch64__
#include <arm_neon.h>
#endif
// ================= BUILD INFO =================
#ifndef __OPTIMIZE__
#define BUILD_OPT_LEVEL "O0"
#elif defined(__OPTIMIZE_SIZE__)
#define BUILD_OPT_LEVEL "Os"
#else
#define BUILD_OPT_LEVEL "O2/O3"
#endif
#if defined(__clang__)
#define BUILD_COMPILER "clang"
#elif defined(__GNUC__)
#define BUILD_COMPILER "gcc"
#else
#define BUILD_COMPILER "unknown"
#endif
#if defined(__aarch64__)
#define BUILD_ARCH "arm64"
#elif defined(__x86_64__)
#define BUILD_ARCH "x86_64"
#else
#define BUILD_ARCH "unknown"
#endif
#define PRINT_BUILD_INFO() \
printf("Compiler: %s | Opt: %s | Arch: %s\n", \
BUILD_COMPILER, BUILD_OPT_LEVEL, BUILD_ARCH)
// ================= CONFIG =================
#define RADIX_BUCKETS 256
#define RADIX_MASK 0xFF
#define RADIX_SHIFT 8
#define SIMD_LANES 4
#define TEST_SIZE 1000000
// ================= TIMING =================
#define TIMER_DECL(name) \
struct timespec name##_start, name##_end
#define TIMER_START(name) \
clock_gettime(CLOCK_MONOTONIC, &name##_start)
#define TIMER_END(name) \
clock_gettime(CLOCK_MONOTONIC, &name##_end)
#define TIMER_MS(name) \
((name##_end.tv_sec - name##_start.tv_sec) * 1e3 + \
(name##_end.tv_nsec - name##_start.tv_nsec) * 1e-6)
// ================= UTIL =================
#define VERIFY_SORT(arr, size) \
do { \
int ok = 1; \
for (uint32_t i = 1; i < size; i++) \
if (arr[i] < arr[i-1]) ok = 0; \
printf("Verification: %s\n", ok ? "OK" : "FAIL"); \
} while (0)
// ================= SERIAL PASS =================
void radix_pass_serial(const uint32_t* input, uint32_t* output, uint32_t size, uint32_t shift)
{
uint32_t histogram[RADIX_BUCKETS] = {0};
for (uint32_t i = 0; i < size; i++)
histogram[(input[i] >> shift) & RADIX_MASK]++;
uint32_t offsets[RADIX_BUCKETS];
uint32_t sum = 0;
for (uint32_t i = 0; i < RADIX_BUCKETS; i++)
{
offsets[i] = sum;
sum += histogram[i];
}
for (uint32_t i = 0; i < size; i++)
{
uint32_t v = input[i];
uint32_t d = (v >> shift) & RADIX_MASK;
output[offsets[d]++] = v;
}
}
// ================= NEON PASSES =================
#ifdef __aarch64__
#define DEFINE_RADIX_PASS_NEON_0() \
void radix_pass_neon_0(const uint32_t* input, uint32_t* output, uint32_t size) \
{ \
uint32_t hist0[RADIX_BUCKETS] = {0}; \
uint32_t hist1[RADIX_BUCKETS] = {0}; \
uint32_t hist2[RADIX_BUCKETS] = {0}; \
uint32_t hist3[RADIX_BUCKETS] = {0}; \
\
uint32x4_t mask = vdupq_n_u32(RADIX_MASK); \
\
uint32_t i = 0; \
for (; i + 16 <= size; i += 16)\
{\
uint32x4_t v0 = vld1q_u32(input + i + 0);\
uint32x4_t v1 = vld1q_u32(input + i + 4);\
uint32x4_t v2 = vld1q_u32(input + i + 8);\
uint32x4_t v3 = vld1q_u32(input + i + 12);\
\
uint32x4_t d0 = vandq_u32(v0, mask);\
uint32x4_t d1 = vandq_u32(v1, mask);\
uint32x4_t d2 = vandq_u32(v2, mask);\
uint32x4_t d3 = vandq_u32(v3, mask);\
\
uint32_t tmp0[4], tmp1[4], tmp2[4], tmp3[4];\
\
vst1q_u32(tmp0, d0);\
vst1q_u32(tmp1, d1);\
vst1q_u32(tmp2, d2);\
vst1q_u32(tmp3, d3);\
\
hist0[tmp0[0]]++; hist1[tmp0[1]]++; hist2[tmp0[2]]++; hist3[tmp0[3]]++;\
hist0[tmp1[0]]++; hist1[tmp1[1]]++; hist2[tmp1[2]]++; hist3[tmp1[3]]++;\
hist0[tmp2[0]]++; hist1[tmp2[1]]++; hist2[tmp2[2]]++; hist3[tmp2[3]]++;\
hist0[tmp3[0]]++; hist1[tmp3[1]]++; hist2[tmp3[2]]++; hist3[tmp3[3]]++;\
}\
\
for (; i < size; i++) \
hist0[input[i] & RADIX_MASK]++; \
\
uint32_t offsets[RADIX_BUCKETS]; \
uint32_t sum = 0; \
\
for (uint32_t j = 0; j < RADIX_BUCKETS; j++) \
{ \
hist0[j] += hist1[j] + hist2[j] + hist3[j]; \
offsets[j] = sum; \
sum += hist0[j]; \
} \
\
for (uint32_t j = 0; j < size; j++) \
{ \
uint32_t v = input[j]; \
uint32_t d = v & RADIX_MASK; \
output[offsets[d]++] = v; \
} \
}
#define DEFINE_RADIX_PASS_NEON(SHIFT) \
void radix_pass_neon_##SHIFT(const uint32_t* input, uint32_t* output, uint32_t size) \
{ \
uint32_t hist0[RADIX_BUCKETS] = {0}; \
uint32_t hist1[RADIX_BUCKETS] = {0}; \
uint32_t hist2[RADIX_BUCKETS] = {0}; \
uint32_t hist3[RADIX_BUCKETS] = {0}; \
\
uint32x4_t mask = vdupq_n_u32(RADIX_MASK); \
\
uint32_t i = 0; \
for (; i + 16 <= size; i += 16)\
{\
uint32x4_t v0 = vld1q_u32(input + i + 0);\
uint32x4_t v1 = vld1q_u32(input + i + 4);\
uint32x4_t v2 = vld1q_u32(input + i + 8);\
uint32x4_t v3 = vld1q_u32(input + i + 12);\
\
uint32x4_t d0 = vandq_u32(vshrq_n_u32(v0, SHIFT), mask);\
uint32x4_t d1 = vandq_u32(vshrq_n_u32(v1, SHIFT), mask);\
uint32x4_t d2 = vandq_u32(vshrq_n_u32(v2, SHIFT), mask);\
uint32x4_t d3 = vandq_u32(vshrq_n_u32(v3, SHIFT), mask);\
\
uint32_t tmp0[4], tmp1[4], tmp2[4], tmp3[4];\
\
vst1q_u32(tmp0, d0);\
vst1q_u32(tmp1, d1);\
vst1q_u32(tmp2, d2);\
vst1q_u32(tmp3, d3);\
\
hist0[tmp0[0]]++; hist1[tmp0[1]]++; hist2[tmp0[2]]++; hist3[tmp0[3]]++;\
hist0[tmp1[0]]++; hist1[tmp1[1]]++; hist2[tmp1[2]]++; hist3[tmp1[3]]++;\
hist0[tmp2[0]]++; hist1[tmp2[1]]++; hist2[tmp2[2]]++; hist3[tmp2[3]]++;\
hist0[tmp3[0]]++; hist1[tmp3[1]]++; hist2[tmp3[2]]++; hist3[tmp3[3]]++;\
}\
\
for (; i < size; i++) \
hist0[(input[i] >> SHIFT) & RADIX_MASK]++; \
\
uint32_t offsets[RADIX_BUCKETS]; \
uint32_t sum = 0; \
\
for (uint32_t j = 0; j < RADIX_BUCKETS; j++) \
{ \
hist0[j] += hist1[j] + hist2[j] + hist3[j]; \
offsets[j] = sum; \
sum += hist0[j]; \
} \
\
for (uint32_t j = 0; j < size; j++) \
{ \
uint32_t v = input[j]; \
uint32_t d = (v >> SHIFT) & RADIX_MASK; \
output[offsets[d]++] = v; \
} \
}
DEFINE_RADIX_PASS_NEON_0()
DEFINE_RADIX_PASS_NEON(8)
DEFINE_RADIX_PASS_NEON(16)
DEFINE_RADIX_PASS_NEON(24)
#endif
// ================= SORT =================
void radix_sort_serial(uint32_t* data, uint32_t size)
{
uint32_t* temp = malloc(size * sizeof(uint32_t));
uint32_t* src = data;
uint32_t* dst = temp;
uint32_t* t;
radix_pass_serial(src, dst, size, 0); t = src; src = dst; dst = t;
radix_pass_serial(src, dst, size, 8); t = src; src = dst; dst = t;
radix_pass_serial(src, dst, size, 16); t = src; src = dst; dst = t;
radix_pass_serial(src, dst, size, 24); t = src; src = dst; dst = t;
if (src != data)
memcpy(data, src, size * sizeof(uint32_t));
free(temp);
}
#ifdef __aarch64__
void radix_sort_neon(uint32_t* data, uint32_t size)
{
uint32_t* temp = malloc(size * sizeof(uint32_t));
uint32_t* src = data;
uint32_t* dst = temp;
uint32_t* t;
radix_pass_neon_0(src, dst, size); t = src; src = dst; dst = t;
radix_pass_neon_8(src, dst, size); t = src; src = dst; dst = t;
radix_pass_neon_16(src, dst, size); t = src; src = dst; dst = t;
radix_pass_neon_24(src, dst, size); t = src; src = dst; dst = t;
if (src != data)
memcpy(data, src, size * sizeof(uint32_t));
free(temp);
}
#endif
// ================= TEST =================
void fill_random(uint32_t* data, uint32_t size)
{
for (uint32_t i = 0; i < size; i++)
data[i] = rand();
}
// ================= MAIN =================
int main()
{
PRINT_BUILD_INFO();
uint32_t* data = malloc(TEST_SIZE * sizeof(uint32_t));
fill_random(data, TEST_SIZE);
TIMER_DECL(sort_serial);
TIMER_START(sort_serial);
radix_sort_serial(data, TEST_SIZE);
TIMER_END(sort_serial);
{
double time_ms = TIMER_MS(sort_serial);
VERIFY_SORT(data, TEST_SIZE);
double elems_per_sec = (TEST_SIZE / time_ms) * 1000.0;
double mb_per_sec =
(TEST_SIZE * sizeof(uint32_t) / (1024.0 * 1024.0)) /
(time_ms / 1000.0);
printf("===== Serial Stats =====\n");
printf("Elements: %u\n", TEST_SIZE);
printf("Data size: %.2f MB\n",
TEST_SIZE * sizeof(uint32_t) / (1024.0 * 1024.0));
printf("Sort time: %.6f ms\n", time_ms);
printf("Throughput: %.2f million elems/sec\n",
elems_per_sec / 1e6);
printf("Bandwidth: %.2f MB/sec\n", mb_per_sec);
}
#ifdef __aarch64__
fill_random(data, TEST_SIZE);
TIMER_DECL(sort_neon);
TIMER_START(sort_neon);
radix_sort_neon(data, TEST_SIZE);
TIMER_END(sort_neon);
{
double time_ms = TIMER_MS(sort_neon);
VERIFY_SORT(data, TEST_SIZE);
double elems_per_sec = (TEST_SIZE / time_ms) * 1000.0;
double mb_per_sec =
(TEST_SIZE * sizeof(uint32_t) / (1024.0 * 1024.0)) /
(time_ms / 1000.0);
printf("===== NEON Stats =====\n");
printf("Elements: %u\n", TEST_SIZE);
printf("Data size: %.2f MB\n",
TEST_SIZE * sizeof(uint32_t) / (1024.0 * 1024.0));
printf("Sort time: %.6f ms\n", time_ms);
printf("Throughput: %.2f million elems/sec\n",
elems_per_sec / 1e6);
printf("Bandwidth: %.2f MB/sec\n", mb_per_sec);
}
#endif
free(data);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment