Skip to content

Instantly share code, notes, and snippets.

@scottchiefbaker
Last active January 16, 2026 20:04
Show Gist options
  • Select an option

  • Save scottchiefbaker/e4c87de8d6c1ad4a33b1ffe5a3ce707f to your computer and use it in GitHub Desktop.

Select an option

Save scottchiefbaker/e4c87de8d6c1ad4a33b1ffe5a3ce707f to your computer and use it in GitHub Desktop.
Quick benchmark to compare 32-bit and 64-bit PRNGs on an ESP32
#include <stdint.h>
// State for xoshiro512plusplus(): eight 64-bit words.
static uint64_t s[8];
// State for xoroshiro64starstar(): two 32-bit words.
static uint32_t r[2];
// Running counter advanced by splitmix64() on every call.
static uint64_t sm;
// State for xoshiro256plus(): four 64-bit words.
static uint64_t fs[4];
// For Biski: the three state words read and updated by biski64()
uint64_t mix;
uint64_t loopMix;
uint64_t fast_loop;
// PCG uses a structure; the single instance `rng` is shared by pcg32(),
// pcg64_32x2() (through pcg32()) and pcg64().
typedef struct { uint64_t state; uint64_t inc; } pcg32_random_t;
pcg32_random_t rng;
// millis() value at which the current benchmark window in loop() ends.
uint64_t next_out = 0;
// Number of PRNG calls completed in the current benchmark window; loop()
// resets it to 0 after printing each row.
uint32_t count = 0;
// Arduino entry point, run once: open the serial port, then fill every
// generator's global state with values from rdtsc_rand64().
void setup() {
	Serial.begin(115200);
	delay(1000);

	// Seed the array-based generators word by word, in declaration order.
	for (uint64_t *word = s; word != s + 8; ++word) {
		*word = rdtsc_rand64();
	}

	// r[] holds 32-bit words, so only the low half of each seed is kept.
	for (uint32_t *word = r; word != r + 2; ++word) {
		*word = (uint32_t)rdtsc_rand64();
	}

	for (uint64_t *word = fs; word != fs + 4; ++word) {
		*word = rdtsc_rand64();
	}

	// Scalar state: splitmix64, PCG, then the three biski64 words.
	sm        = rdtsc_rand64();
	rng.state = rdtsc_rand64();
	rng.inc   = rdtsc_rand64();
	mix       = rdtsc_rand64();
	loopMix   = rdtsc_rand64();
	fast_loop = rdtsc_rand64();

	// Initial deadline; loop() sets its own before each benchmark window.
	next_out = millis() + 1000;
}
// Print one row of the results table.
//   label      - generator name shown in the first column
//   iterations - PRNG calls completed during the window
//   bits       - output width of one call (32 or 64)
//   seconds    - length of the benchmark window in seconds
// Throughput is iterations * (bits / 8) bytes divided by the window length.
static void print_result_row(const char *label, uint32_t iterations, int bits, double seconds) {
	const double bytes_per_call = bits / 8.0;
	Serial.printf("| %-15s | %10u | %11d | %16.1f |\r\n", label, iterations, bits, (iterations * bytes_per_call / seconds));
}

// Arduino main loop: wait 5 s, print a table header, then run each PRNG in a
// tight loop for a fixed 1500 ms window and print how many calls completed.
// Uses the globals `count` (calls so far) and `next_out` (window deadline).
void loop() {
	const uint32_t window_ms = 1500;
	const double   window_s  = window_ms / 1000.0;

	// Sinks for the generator outputs; the values themselves are never read.
	uint32_t out32 = 11;
	uint64_t out64 = 11;

	delay(5000);

	// NOTE: the first window is opened before the header is printed, so the
	// xoroshiro64** window also covers the time spent printing the header.
	next_out = millis() + window_ms;

	Serial.printf("\r\n");
	Serial.printf("| %-15s | %10s | %11s | %12s |\r\n", "PRNG", "Iterations", "Output bits", "Bytes per second");
	Serial.printf("| --------------- | ---------- | ----------- | ---------------- |\r\n");

	// ---- xoroshiro64** (32-bit output) ----
	while (millis() < next_out) {
		out32 = xoroshiro64starstar();
		++count;
	}
	print_result_row("xoroshiro64**", count, 32, window_s);
	count = 0;
	next_out = millis() + window_ms;

	// ---- xoshiro256+ (64-bit output) ----
	while (millis() < next_out) {
		out64 = xoshiro256plus();
		++count;
	}
	print_result_row("xoshiro256+", count, 64, window_s);
	count = 0;
	next_out = millis() + window_ms;

	// ---- xoshiro512++ (64-bit output) ----
	while (millis() < next_out) {
		out64 = xoshiro512plusplus();
		++count;
	}
	print_result_row("xoshiro512++", count, 64, window_s);
	count = 0;
	next_out = millis() + window_ms;

	// ---- splitmix64 (64-bit output) ----
	while (millis() < next_out) {
		out64 = splitmix64();
		++count;
	}
	print_result_row("splitmix64", count, 64, window_s);
	count = 0;
	next_out = millis() + window_ms;

	// ---- pcg32 (32-bit output) ----
	while (millis() < next_out) {
		out64 = pcg32();
		++count;
	}
	print_result_row("pcg32", count, 32, window_s);
	count = 0;
	next_out = millis() + window_ms;

	// ---- pcg64_32x2: two pcg32 calls glued into 64 bits ----
	while (millis() < next_out) {
		out64 = pcg64_32x2();
		++count;
	}
	print_result_row("pcg64_32x2", count, 64, window_s);
	count = 0;
	next_out = millis() + window_ms;

	// ---- pcg64 (64-bit output) ----
	while (millis() < next_out) {
		out64 = pcg64();
		++count;
	}
	print_result_row("pcg64", count, 64, window_s);
	count = 0;
	next_out = millis() + window_ms;

	// ---- biski64 (64-bit output) ----
	while (millis() < next_out) {
		out64 = biski64();
		++count;
	}
	print_result_row("biski64", count, 64, window_s);

	// Leave the counter cleared for the next pass through loop().
	count = 0;
	next_out = millis() + window_ms;
}
///////////////////////////////////////////////////////////
// rdtsc_rand
///////////////////////////////////////////////////////////
// Return a fast-changing 64-bit counter value for the current platform.
// The source is picked at compile time by the first matching branch below;
// a build that matches none of them stops with an #error.
uint64_t get_rdtsc() {
#if defined(_WIN32) || defined(_WIN64)
// Windows: use the __rdtsc() intrinsic
return __rdtsc();
#elif defined(__aarch64__)
// AArch64: read the cntvct_el0 system register with an mrs instruction.
// The local `count` shadows the file-level `count` inside this branch only.
uint64_t count;
__asm__ volatile ("mrs %0, cntvct_el0" : "=r" (count));
return count;
#elif defined(ARDUINO)
// Arduino builds: fall back to micros() rather than a CPU counter
return micros();
#elif defined(__GNUC__) || defined(__clang__)
// Other GCC/Clang targets: execute rdtsc and join the two 32-bit halves
// it leaves in the "a" (low) and "d" (high) registers
uint32_t low, high;
__asm__ volatile ("rdtsc" : "=a"(low), "=d"(high));
return ((uint64_t)(high) << 32) | low;
#else
#error "Unsupported platform"
#endif
}
// Multiply-shift hash: three xor-shift steps (30, 27, 31 bits) with a
// multiplication after each of the first two. An input of 0 hashes to 0.
// The original author notes that it passes SmallCrush and PractRand up to 128GB.
static uint64_t hash_msh(uint64_t x) {
	// Odd 64-bit multiplier, also the increment used by splitmix64().
	// It is not a prime: its decimal value is divisible by 5.
	const uint64_t multiplier = 0x9e3779b97f4a7c15;

	x = (x ^ (x >> 30)) * multiplier;
	x = (x ^ (x >> 27)) * multiplier;
	return x ^ (x >> 31);
}
// Return an unsigned 64-bit seed value: the current get_rdtsc() counter
// reading passed through hash_msh().
static uint64_t rdtsc_rand64() {
	return hash_msh(get_rdtsc());
}
///////////////////////////////////////////////////////////
// PRNGs
///////////////////////////////////////////////////////////
// Rotate the 32-bit value x left by k bits.
// Both shift counts are masked to 0..31, so k == 0 (or any multiple of 32)
// returns x unchanged instead of executing an undefined 32-bit shift by 32.
// For k in 1..31 the result is identical to (x << k) | (x >> (32 - k)).
static inline uint32_t rotl(const uint32_t x, int k) {
	const unsigned int left = (unsigned int)k & 31u;
	const unsigned int right = (32u - left) & 31u;
	return (x << left) | (x >> right);
}
// Rotate the 64-bit value x left by k bits.
// Both shift counts are masked to 0..63, so k == 0 (or any multiple of 64)
// returns x unchanged instead of executing an undefined 64-bit shift by 64.
// For k in 1..63 the result is identical to (x << k) | (x >> (64 - k)).
static inline uint64_t rotl(const uint64_t x, int k) {
	const unsigned int left = (unsigned int)k & 63u;
	const unsigned int right = (64u - left) & 63u;
	return (x << left) | (x >> right);
}
//////////////////////////////////////////////////////////////////
// xoroshiro64**: returns a 32-bit value and advances the two-word state r[].
uint32_t xoroshiro64starstar(void) {
	const uint32_t lo = r[0];
	const uint32_t folded = r[1] ^ lo;

	// Output is derived from the pre-update low word only.
	const uint32_t scrambled = rotl(lo * 0x9E3779BB, 5) * 5;

	// State update with rotate/shift/rotate constants 26, 9, 13.
	r[0] = rotl(lo, 26) ^ folded ^ (folded << 9);
	r[1] = rotl(folded, 13);

	return scrambled;
}
//////////////////////////////////////////////////////////////////
// xoshiro256+: returns a 64-bit value and advances the four-word state fs[].
uint64_t xoshiro256plus(void) {
	// Work on local copies and write the new state back at the end.
	uint64_t w0 = fs[0];
	uint64_t w1 = fs[1];
	uint64_t w2 = fs[2];
	uint64_t w3 = fs[3];

	// Output is the sum of the first and last words before the update.
	const uint64_t sum = w0 + w3;
	const uint64_t shifted = w1 << 17;

	w2 ^= w0;
	w3 ^= w1;
	w1 ^= w2;
	w0 ^= w3;
	w2 ^= shifted;
	w3 = rotl(w3, 45);

	fs[0] = w0;
	fs[1] = w1;
	fs[2] = w2;
	fs[3] = w3;
	return sum;
}
//////////////////////////////////////////////////////////////////
// xoshiro512++: returns a 64-bit value and advances the eight-word state s[].
uint64_t xoshiro512plusplus(void) {
	// Output uses words 0 and 2 as they are before the state update.
	const uint64_t out = rotl(s[0] + s[2], 17) + s[2];
	const uint64_t shifted = s[1] << 11;

	// The order of these xors matters: each line reads words that earlier
	// lines have already modified.
	s[2] ^= s[0];
	s[5] ^= s[1];
	s[1] ^= s[2];
	s[7] ^= s[3];
	s[3] ^= s[4];
	s[4] ^= s[5];
	s[0] ^= s[6];
	s[6] ^= s[7] ^ shifted;

	s[7] = rotl(s[7], 21);
	return out;
}
//////////////////////////////////////////////////////////////////
// splitmix64: bump the global counter `sm` by a fixed odd constant, then
// scramble the new counter value with two multiply/xor-shift rounds and a
// final xor-shift. Returns the scrambled 64-bit value.
uint64_t splitmix64() {
	sm += 0x9e3779b97f4a7c15;

	uint64_t z = sm;
	z ^= z >> 30;
	z *= 0xbf58476d1ce4e5b9;
	z ^= z >> 27;
	z *= 0x94d049bb133111eb;
	z ^= z >> 31;
	return z;
}
//////////////////////////////////////////////////////////////////
// pcg32: advance the 64-bit LCG in `rng` and return a 32-bit value computed
// from the state as it was before this call.
uint32_t pcg32() {
	const uint64_t prev = rng.state;

	// LCG step; the increment is forced odd with |1.
	rng.state = prev * 6364136223846793005ULL + (rng.inc | 1);

	// Output: xor-shift the old state down to 32 bits, then rotate right by
	// the amount held in the old state's top five bits.
	const uint32_t folded = (uint32_t)(((prev >> 18u) ^ prev) >> 27u);
	const uint32_t turn = (uint32_t)(prev >> 59u);
	return (folded >> turn) | (folded << ((-turn) & 31));
}
//////////////////////////////////////////////////////////////////
// Build a 64-bit value from two consecutive pcg32() outputs: the first call
// supplies the upper 32 bits, the second call the lower 32 bits.
uint64_t pcg64_32x2() {
	const uint64_t upper = (uint64_t)pcg32() << 32;
	const uint64_t lower = pcg32();
	return upper | lower;
}
// 64-bit PCG variant sharing the global `rng` with pcg32(): the output is a
// state-dependent xor-shift of the current state, a multiply, and a final
// xor-shift; the state then advances with the same LCG multiplier as pcg32().
static inline uint64_t pcg64() {
	const uint64_t word = ((rng.state >> ((rng.state >> 59) + 5)) ^ rng.state) * 12605985483714917081ull;

	// Force the increment odd, exactly as pcg32() does. setup() seeds
	// rng.inc with an arbitrary value, and an even increment would keep the
	// LCG from reaching its full 2^64 period.
	rng.state = rng.state * 6364136223846793005ull + (rng.inc | 1);

	return (word >> 43) ^ word;
}
// biski64: returns the sum of the `mix` and `loopMix` words as they were on
// entry, then updates all three global state words.
uint64_t biski64() {
	const uint64_t prev_mix = mix;
	const uint64_t prev_loop_mix = loopMix;

	loopMix = fast_loop ^ prev_mix;
	mix = rotl(prev_mix, 16) + rotl(prev_loop_mix, 40);

	// fast_loop is a plain counter stepped by a fixed constant.
	fast_loop += 0x9999999999999999;

	return prev_mix + prev_loop_mix;
}
@scottchiefbaker
Copy link
Author

On my 32bit ESP32-C3 I'm seeing:

PRNG Iterations per second Output Bits Bytes per second
pcg32 487802 32 1951266.7 b/s
xoroshiro64** 516023 32 2050966.7 b/s
xoshiro256+ 487808 64 3878726.7 b/s
xoshiro512++ 441735 64 3514373.3 b/s
splitmix64 462290 64 3677033.3 b/s
pcg64 416297 64 3313060.0 b/s

There is very little difference in iteration rate between PRNGs that use 64-bit operations and those that use 32-bit operations. Even on limited hardware like this, it makes sense to use a 64-bit PRNG because each call yields more output bytes.

@scottchiefbaker
Copy link
Author

scottchiefbaker commented Jan 16, 2026

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment