Skip to content

Instantly share code, notes, and snippets.

@BlobTheKat
Last active March 11, 2026 14:02
Show Gist options
  • Select an option

  • Save BlobTheKat/cadbd8edcef58869c4b7be13486774ed to your computer and use it in GitHub Desktop.

Select an option

Save BlobTheKat/cadbd8edcef58869c4b7be13486774ed to your computer and use it in GitHub Desktop.
Lean Chacha20 + Poly1305 + Blake2s implementation
/**
* Relatively fast and low-level ChaCha20, Poly1305 and Blake2s implementation
* Matthew Reiner, 2026
* Available under the GPL 3.0 license
* Fuzz-tested on over 2.5 billion unique inputs for each algorithm, to match libsodium's implementation 1:1
* Tests performed with clang -O3 and included UB-sanitizer and address-sanitizer
*/
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#ifdef __APPLE__
#include <libkern/OSByteOrder.h>
#include <machine/endian.h>
#define bswap_16(x) OSSwapInt16(x)
#define bswap_32(x) OSSwapInt32(x)
#define bswap_64(x) OSSwapInt64(x)
#define htole16(x) OSSwapHostToLittleInt16(x)
#define htole32(x) OSSwapHostToLittleInt32(x)
#define htole64(x) OSSwapHostToLittleInt64(x)
#define le16toh(x) OSSwapLittleToHostInt16(x)
#define le32toh(x) OSSwapLittleToHostInt32(x)
#define le64toh(x) OSSwapLittleToHostInt64(x)
#else
#include <byteswap.h>
#ifndef _DEFAULT_SOURCE
#define _DEFAULT_SOURCE
#endif
#include <endian.h>
#endif
#if defined(__cplusplus) && !defined(_Alignas)
#define _Alignas alignas
#endif
#define CHACHA20_QROUND(a, b, c, d) \
a += b; d ^= a; d = d << 16 | d >> 16; \
c += d; b ^= c; b = b << 12 | b >> 20; \
a += b; d ^= a; d = d << 8 | d >> 24; \
c += d; b ^= c; b = b << 7 | b >> 25
/**
 * One ChaCha20 block function (RFC 8439): permute the 16-word state through
 * 20 rounds (10 column/diagonal double rounds), add the input state, and XOR
 * the 64-byte little-endian serialization of the result into `out`.
 * @param in Host-endian initial state (4 constants, 8 key words, block counter, 3 nonce words)
 * @param out Content that will be XOR'd with chacha20 result. Zero this array before calling in order to get raw chacha20 output. No alignment requirement.
 */
void ChaCha20_block(const uint32_t in[16], uint8_t out[64]){
uint32_t state[16];
memcpy(state, in, 64);
for(int i = 0; i < 10; i++){ /* one iteration = one double round */
CHACHA20_QROUND(state[0], state[4], state[ 8], state[12]); // Column 0
CHACHA20_QROUND(state[1], state[5], state[ 9], state[13]); // Column 1
CHACHA20_QROUND(state[2], state[6], state[10], state[14]); // Column 2
CHACHA20_QROUND(state[3], state[7], state[11], state[15]); // Column 3
CHACHA20_QROUND(state[0], state[5], state[10], state[15]); // Diagonal 1 (main diagonal)
CHACHA20_QROUND(state[1], state[6], state[11], state[12]); // Diagonal 2
CHACHA20_QROUND(state[2], state[7], state[ 8], state[13]); // Diagonal 3
CHACHA20_QROUND(state[3], state[4], state[ 9], state[14]); // Diagonal 4
}
/* Byte-wise little-endian serialization. Produces the exact same bytes as the
 * former ((uint32_t*)out)[i] ^= htole32(...) store, but is portable ISO C:
 * no htole32 (nonstandard header / feature-macro dependent), no strict-aliasing
 * violation, and no 4-byte alignment requirement on `out`. */
for(int i = 0; i < 16; i++){
uint32_t w = state[i] + in[i];
out[i*4    ] ^= (uint8_t)w;
out[i*4 + 1] ^= (uint8_t)(w >> 8);
out[i*4 + 2] ^= (uint8_t)(w >> 16);
out[i*4 + 3] ^= (uint8_t)(w >> 24);
}
}
#undef CHACHA20_QROUND
#define BLAKE2S_QROUND(a,b,c,d,x,y) \
a = a + b + x; d ^= a; d = d<<16|d>>16; \
c = c + d; b ^= c; b = b>>12|b<<20; \
a = a + b + y; d ^= a; d = d>>8|d<<24; \
c = c + d; b ^= c; b = b>>7|b<<25;
/**
 * BLAKE2s-256 compression of exactly one 64-byte message block, treated as
 * both the first and the final block of the message.
 * The hard-wired constants are the BLAKE2s IV with the fixed parameters baked
 * in: h[0] = IV0 ^ 0x01010020 (digest_length = 32, fanout = 1, depth = 1),
 * v[12] = IV4 ^ 64 (byte counter t0 = 64) and v[14] = ~IV6 (final-block flag).
 * @param in 16 host-endian message words; on little-endian machines this is
 *           the raw 64 message bytes. NOTE(review): big-endian callers appear
 *           to need pre-swapped words since `in` is used without le32toh —
 *           confirm intended.
 * @param out 8 words receiving the 32-byte digest, serialized little-endian.
 */
void blake2s_block(uint32_t in[16], uint32_t out[8]){
/* Message word permutation schedule for the 10 rounds (RFC 7693 SIGMA) */
static const uint8_t sigma[10][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
{14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 },
{11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8 },
{ 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13 },
{ 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9 },
{12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11 },
{13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10 },
{ 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5 },
{10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0 }
};
uint32_t h[8] = { 0x6B08E647, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19}, v[16] = { 0x6B08E647, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E523F, 0x9B05688C, 0xE07C2654, 0x5BE0CD19};
for(int r = 0; r < 10; r++){
BLAKE2S_QROUND(v[0],v[4],v[8], v[12], in[sigma[r][0]], in[sigma[r][1]]);
BLAKE2S_QROUND(v[1],v[5],v[9], v[13], in[sigma[r][2]], in[sigma[r][3]]);
BLAKE2S_QROUND(v[2],v[6],v[10],v[14], in[sigma[r][4]], in[sigma[r][5]]);
BLAKE2S_QROUND(v[3],v[7],v[11],v[15], in[sigma[r][6]], in[sigma[r][7]]);
BLAKE2S_QROUND(v[0],v[5],v[10],v[15], in[sigma[r][8]], in[sigma[r][9]]);
BLAKE2S_QROUND(v[1],v[6],v[11],v[12], in[sigma[r][10]],in[sigma[r][11]]);
BLAKE2S_QROUND(v[2],v[7],v[8], v[13], in[sigma[r][12]],in[sigma[r][13]]);
BLAKE2S_QROUND(v[3],v[4],v[9], v[14], in[sigma[r][14]],in[sigma[r][15]]);
}
/* Finalization: h' = h ^ v_low ^ v_high, stored little-endian. The byte-wise
 * store produces the same bytes as the former htole32() store, but without
 * the nonstandard header / feature-macro dependency. */
for(int i = 0; i < 8; i++){
uint32_t w = h[i] ^ v[i] ^ v[i+8];
uint8_t b[4] = { (uint8_t)w, (uint8_t)(w >> 8), (uint8_t)(w >> 16), (uint8_t)(w >> 24) };
memcpy(&out[i], b, 4);
}
}
#undef BLAKE2S_QROUND
/* Portable little-endian 32-bit load/store. Byte-wise access imposes no
 * alignment requirement and no strict-aliasing hazard, unlike the former
 * (uint32_t*) casts + le32toh/htole32 (which also need nonstandard headers). */
static uint32_t poly1305_load32(const uint8_t *p){
return (uint32_t)p[0] | (uint32_t)p[1] << 8 | (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}
static void poly1305_store32(uint8_t *p, uint32_t w){
p[0] = (uint8_t)w; p[1] = (uint8_t)(w >> 8); p[2] = (uint8_t)(w >> 16); p[3] = (uint8_t)(w >> 24);
}
/* Split a 16-byte little-endian value into five 26-bit limbs m0..m4.
 * m4 must be pre-set by the caller: 0, or 0x1000000 for the 2^128 pad bit of
 * a full 16-byte message block. */
#define POLY1305_LOAD(m0,m1,m2,m3,m4,buf) m0 = poly1305_load32((const uint8_t*)(buf)); \
m1 = poly1305_load32((const uint8_t*)(buf) + 4); \
m2 = poly1305_load32((const uint8_t*)(buf) + 8); \
m3 = poly1305_load32((const uint8_t*)(buf) + 12); \
m4 |= m3 >> 8; \
m3 = (m2 >> 14 | m3 << 18) & 0x3ffffff; \
m2 = (m1 >> 20 | m2 << 12) & 0x3ffffff; \
m1 = (m0 >> 26 | m1 << 6) & 0x3ffffff; \
m0 &= 0x3ffffff;
/**
 * Poly1305 one-time authenticator (RFC 8439), radix-2^26 implementation:
 * h = ((m1*r^n + m2*r^(n-1) + ... + mn*r) mod 2^130-5) + s mod 2^128.
 * @param in Contents to compute checksum over. No alignment requirement
 * @param inlen Length of `in` in bytes
 * @param key Input key: r (first 16 bytes, clamped) || s (last 16 bytes). No alignment requirement.
 * @param out Content that will be populated with poly1305 result. No alignment requirement
 */
void poly1305(const uint8_t *in, size_t inlen, const uint8_t key[32], uint8_t out[16]){
uint32_t r0, r1, r2, r3, r4 = 0;
uint32_t s1, s2, s3, s4;
uint32_t h0 = 0, h1 = 0, h2 = 0, h3 = 0, h4 = 0;
POLY1305_LOAD(r0,r1,r2,r3,r4,key)
/* Clamp r per the spec (limb-wise equivalent of r &= 0x0ffffffc0ffffffc0ffffffc0fffffff) */
r1 &= 0x3ffff03;
r2 &= 0x3ffc0ff;
r3 &= 0x3f03fff;
r4 &= 0x00fffff;
/* Precompute 5*r limbs for the mod 2^130-5 reduction during multiplication */
s1 = r1 * 5; s2 = r2 * 5; s3 = r3 * 5; s4 = r4 * 5;
while (inlen) {
uint32_t m0, m1, m2, m3, m4 = 0;
uint32_t c;
int block = 16;
uint8_t buf[16] = {0};
if(inlen < 16){
/* Partial final block: append the 0x01 pad byte, rest stays zero */
buf[block = inlen] = 1;
}else m4 = 0x1000000; /* full block: pad bit 2^128 lands in limb 4 */
memcpy(buf, in, block);
POLY1305_LOAD(m0,m1,m2,m3,m4,buf)
h0 += m0; h1 += m1; h2 += m2; h3 += m3; h4 += m4;
/* multiply (h *= r) */
uint64_t d0 = (uint64_t)h0*r0 + (uint64_t)h1*s4 + (uint64_t)h2*s3 + (uint64_t)h3*s2 + (uint64_t)h4*s1;
uint64_t d1 = (uint64_t)h0*r1 + (uint64_t)h1*r0 + (uint64_t)h2*s4 + (uint64_t)h3*s3 + (uint64_t)h4*s2;
uint64_t d2 = (uint64_t)h0*r2 + (uint64_t)h1*r1 + (uint64_t)h2*r0 + (uint64_t)h3*s4 + (uint64_t)h4*s3;
uint64_t d3 = (uint64_t)h0*r3 + (uint64_t)h1*r2 + (uint64_t)h2*r1 + (uint64_t)h3*r0 + (uint64_t)h4*s4;
uint64_t d4 = (uint64_t)h0*r4 + (uint64_t)h1*r3 + (uint64_t)h2*r2 + (uint64_t)h3*r1 + (uint64_t)h4*r0;
/* carry propagation */
c = (d0 >> 26); h0 = d0 & 0x3ffffff; d1 += c;
c = (d1 >> 26); h1 = d1 & 0x3ffffff; d2 += c;
c = (d2 >> 26); h2 = d2 & 0x3ffffff; d3 += c;
c = (d3 >> 26); h3 = d3 & 0x3ffffff; d4 += c;
c = (d4 >> 26); h4 = d4 & 0x3ffffff; h0 += c * 5;
c = (h0 >> 26); h0 &= 0x3ffffff; h1 += c;
in += block;
inlen -= block;
}
/* final reduction */
uint32_t c;
c = h1 >> 26; h1 &= 0x3ffffff; h2 += c;
c = h2 >> 26; h2 &= 0x3ffffff; h3 += c;
c = h3 >> 26; h3 &= 0x3ffffff; h4 += c;
c = h4 >> 26; h4 &= 0x3ffffff; h0 += c * 5;
c = h0 >> 26; h0 &= 0x3ffffff; h1 += c;
/* compute h + -p to check if reduction needed */
uint32_t g0 = h0 + 5;
c = g0 >> 26; g0 &= 0x3ffffff;
uint32_t g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
uint32_t g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
uint32_t g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
uint32_t g4 = h4 + c - (1ULL << 26);
/* mask is all-ones iff h >= p (no borrow out of g4): constant-time select */
uint32_t mask = (g4 >> 31) - 1;
h0 = (h0 & ~mask) | (g0 & mask);
h1 = (h1 & ~mask) | (g1 & mask);
h2 = (h2 & ~mask) | (g2 & mask);
h3 = (h3 & ~mask) | (g3 & mask);
h4 = (h4 & ~mask) | (g4 & mask);
/* serialize h, adding s = key[16..31] with carry across the four words */
uint64_t f0 = (uint64_t)((h0      ) | (h1 << 26)) + poly1305_load32(key + 16);
uint64_t f1 = (uint64_t)((h1 >> 6 ) | (h2 << 20)) + poly1305_load32(key + 20) + (f0 >> 32);
uint64_t f2 = (uint64_t)((h2 >> 12) | (h3 << 14)) + poly1305_load32(key + 24) + (f1 >> 32);
uint64_t f3 = (uint64_t)((h3 >> 18) | (h4 << 8 )) + poly1305_load32(key + 28) + (f2 >> 32);
poly1305_store32(out,      (uint32_t)f0);
poly1305_store32(out + 4,  (uint32_t)f1);
poly1305_store32(out + 8,  (uint32_t)f2);
poly1305_store32(out + 12, (uint32_t)f3);
}
#undef POLY1305_LOAD
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment