Last active
March 11, 2026 14:02
-
-
Save BlobTheKat/cadbd8edcef58869c4b7be13486774ed to your computer and use it in GitHub Desktop.
Lean Chacha20 + Poly1305 + Blake2s implementation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * Relatively fast and low-level ChaCha20, Poly1305 and Blake2s implementation | |
| * Matthew Reiner, 2026 | |
| * Available under the GPL 3.0 license | |
| * Fuzz-tested on over 2.5 billion unique inputs for each algorithm, to match libsodium's implementation 1:1 | |
| * Tests performed with clang -O3 and included UB-sanitizer and address-sanitizer | |
| */ | |
| #include <stdint.h> | |
| #include <stddef.h> | |
| #include <string.h> | |
| #ifdef __APPLE__ | |
| #include <libkern/OSByteOrder.h> | |
| #include <machine/endian.h> | |
| #define bswap_16(x) OSSwapInt16(x) | |
| #define bswap_32(x) OSSwapInt32(x) | |
| #define bswap_64(x) OSSwapInt64(x) | |
| #define htole16(x) OSSwapHostToLittleInt16(x) | |
| #define htole32(x) OSSwapHostToLittleInt32(x) | |
| #define htole64(x) OSSwapHostToLittleInt64(x) | |
| #define le16toh(x) OSSwapLittleToHostInt16(x) | |
| #define le32toh(x) OSSwapLittleToHostInt32(x) | |
| #define le64toh(x) OSSwapLittleToHostInt64(x) | |
| #else | |
| #include <byteswap.h> | |
| #ifndef _DEFAULT_SOURCE | |
| #define _DEFAULT_SOURCE | |
| #endif | |
| #include <endian.h> | |
| #endif | |
| #if defined(__cplusplus) && !defined(_Alignas) | |
| #define _Alignas alignas | |
| #endif | |
| #define CHACHA20_QROUND(a, b, c, d) \ | |
| a += b; d ^= a; d = d << 16 | d >> 16; \ | |
| c += d; b ^= c; b = b << 12 | b >> 20; \ | |
| a += b; d ^= a; d = d << 8 | d >> 24; \ | |
| c += d; b ^= c; b = b << 7 | b >> 25 | |
| /** | |
| * @param in Host-endian initial state | |
| * @param out Content that will be XOR'd with chacha20 result. Zero this array before calling in order to get raw chacha20 output. Must be 4-byte aligned. | |
| */ | |
| void ChaCha20_block(const uint32_t in[16], uint8_t out[64]){ | |
| uint32_t state[16]; | |
| memcpy(state, in, 64); | |
| for(int i = 0; i < 10; i++){ | |
| CHACHA20_QROUND(state[0], state[4], state[ 8], state[12]); // Column 0 | |
| CHACHA20_QROUND(state[1], state[5], state[ 9], state[13]); // Column 1 | |
| CHACHA20_QROUND(state[2], state[6], state[10], state[14]); // Column 2 | |
| CHACHA20_QROUND(state[3], state[7], state[11], state[15]); // Column 3 | |
| CHACHA20_QROUND(state[0], state[5], state[10], state[15]); // Diagonal 1 (main diagonal) | |
| CHACHA20_QROUND(state[1], state[6], state[11], state[12]); // Diagonal 2 | |
| CHACHA20_QROUND(state[2], state[7], state[ 8], state[13]); // Diagonal 3 | |
| CHACHA20_QROUND(state[3], state[4], state[ 9], state[14]); // Diagonal 4 | |
| } | |
| for(int i = 0; i < 16; i++) | |
| ((uint32_t*)out)[i] ^= htole32(state[i] + in[i]); | |
| } | |
| #undef CHACHA20_QROUND | |
| #define BLAKE2S_QROUND(a,b,c,d,x,y) \ | |
| a = a + b + x; d ^= a; d = d<<16|d>>16; \ | |
| c = c + d; b ^= c; b = b>>12|b<<20; \ | |
| a = a + b + y; d ^= a; d = d>>8|d<<24; \ | |
| c = c + d; b ^= c; b = b>>7|b<<25; | |
| void blake2s_block(uint32_t in[16], uint32_t out[8]){ | |
| static const uint8_t sigma[10][16] = { | |
| { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 }, | |
| {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 }, | |
| {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4 }, | |
| { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8 }, | |
| { 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13 }, | |
| { 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9 }, | |
| {12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11 }, | |
| {13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10 }, | |
| { 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5 }, | |
| {10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0 } | |
| }; | |
| uint32_t h[8] = { 0x6B08E647, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19}, v[16] = { 0x6B08E647, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E523F, 0x9B05688C, 0xE07C2654, 0x5BE0CD19}; | |
| for(int r = 0; r < 10; r++){ | |
| BLAKE2S_QROUND(v[0],v[4],v[8], v[12], in[sigma[r][0]], in[sigma[r][1]]); | |
| BLAKE2S_QROUND(v[1],v[5],v[9], v[13], in[sigma[r][2]], in[sigma[r][3]]); | |
| BLAKE2S_QROUND(v[2],v[6],v[10],v[14], in[sigma[r][4]], in[sigma[r][5]]); | |
| BLAKE2S_QROUND(v[3],v[7],v[11],v[15], in[sigma[r][6]], in[sigma[r][7]]); | |
| BLAKE2S_QROUND(v[0],v[5],v[10],v[15], in[sigma[r][8]], in[sigma[r][9]]); | |
| BLAKE2S_QROUND(v[1],v[6],v[11],v[12], in[sigma[r][10]],in[sigma[r][11]]); | |
| BLAKE2S_QROUND(v[2],v[7],v[8], v[13], in[sigma[r][12]],in[sigma[r][13]]); | |
| BLAKE2S_QROUND(v[3],v[4],v[9], v[14], in[sigma[r][14]],in[sigma[r][15]]); | |
| } | |
| for(int i = 0; i < 8; i++) | |
| out[i] = htole32(h[i]^v[i]^v[i+8]); | |
| } | |
| #undef BLAKE2S_QROUND | |
| #define POLY1305_LOAD(m0,m1,m2,m3,m4,buf) m0 = le32toh(((uint32_t*)buf)[0]); \ | |
| m1 = le32toh(((uint32_t*)buf)[1]); \ | |
| m2 = le32toh(((uint32_t*)buf)[2]); \ | |
| m3 = le32toh(((uint32_t*)buf)[3]); \ | |
| m4 |= m3 >> 8; \ | |
| m3 = (m2 >> 14 | m3 << 18) & 0x3ffffff; \ | |
| m2 = (m1 >> 20 | m2 << 12) & 0x3ffffff; \ | |
| m1 = (m0 >> 26 | m1 << 6) & 0x3ffffff; \ | |
| m0 &= 0x3ffffff; | |
| /** | |
| * @param in Contents to compute checksum over. No alignment requirement | |
| * @param inlen Length of `in` in bytes | |
| * @param key Input key. Must be 4-byte aligned. | |
| * @param out Content that will be populated with poly1305 result. Must be 4-byte aligned | |
| */ | |
| void poly1305(const uint8_t *in, size_t inlen, const uint8_t key[32], uint8_t out[16]){ | |
| uint32_t r0, r1, r2, r3, r4 = 0; | |
| uint32_t s1, s2, s3, s4; | |
| uint32_t h0 = 0, h1 = 0, h2 = 0, h3 = 0, h4 = 0; | |
| POLY1305_LOAD(r0,r1,r2,r3,r4,key) | |
| r1 &= 0x3ffff03; | |
| r2 &= 0x3ffc0ff; | |
| r3 &= 0x3f03fff; | |
| r4 &= 0x00fffff; | |
| s1 = r1 * 5; s2 = r2 * 5; s3 = r3 * 5; s4 = r4 * 5; | |
| while (inlen) { | |
| uint32_t m0, m1, m2, m3, m4 = 0; | |
| uint32_t c; | |
| int block = 16; | |
| _Alignas(4) char buf[16] = {0}; | |
| if(inlen < 16){ | |
| buf[block = inlen] = 1; | |
| }else m4 = 0x1000000; | |
| memcpy(buf, in, block); | |
| POLY1305_LOAD(m0,m1,m2,m3,m4,buf) | |
| h0 += m0; h1 += m1; h2 += m2; h3 += m3; h4 += m4; | |
| /* multiply (h *= r) */ | |
| uint64_t d0 = (uint64_t)h0*r0 + (uint64_t)h1*s4 + (uint64_t)h2*s3 + (uint64_t)h3*s2 + (uint64_t)h4*s1; | |
| uint64_t d1 = (uint64_t)h0*r1 + (uint64_t)h1*r0 + (uint64_t)h2*s4 + (uint64_t)h3*s3 + (uint64_t)h4*s2; | |
| uint64_t d2 = (uint64_t)h0*r2 + (uint64_t)h1*r1 + (uint64_t)h2*r0 + (uint64_t)h3*s4 + (uint64_t)h4*s3; | |
| uint64_t d3 = (uint64_t)h0*r3 + (uint64_t)h1*r2 + (uint64_t)h2*r1 + (uint64_t)h3*r0 + (uint64_t)h4*s4; | |
| uint64_t d4 = (uint64_t)h0*r4 + (uint64_t)h1*r3 + (uint64_t)h2*r2 + (uint64_t)h3*r1 + (uint64_t)h4*r0; | |
| /* carry propagation */ | |
| c = (d0 >> 26); h0 = d0 & 0x3ffffff; d1 += c; | |
| c = (d1 >> 26); h1 = d1 & 0x3ffffff; d2 += c; | |
| c = (d2 >> 26); h2 = d2 & 0x3ffffff; d3 += c; | |
| c = (d3 >> 26); h3 = d3 & 0x3ffffff; d4 += c; | |
| c = (d4 >> 26); h4 = d4 & 0x3ffffff; h0 += c * 5; | |
| c = (h0 >> 26); h0 &= 0x3ffffff; h1 += c; | |
| in += block; | |
| inlen -= block; | |
| } | |
| /* final reduction */ | |
| uint32_t c; | |
| c = h1 >> 26; h1 &= 0x3ffffff; h2 += c; | |
| c = h2 >> 26; h2 &= 0x3ffffff; h3 += c; | |
| c = h3 >> 26; h3 &= 0x3ffffff; h4 += c; | |
| c = h4 >> 26; h4 &= 0x3ffffff; h0 += c * 5; | |
| c = h0 >> 26; h0 &= 0x3ffffff; h1 += c; | |
| /* compute h + -p to check if reduction needed */ | |
| uint32_t g0 = h0 + 5; | |
| c = g0 >> 26; g0 &= 0x3ffffff; | |
| uint32_t g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff; | |
| uint32_t g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff; | |
| uint32_t g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff; | |
| uint32_t g4 = h4 + c - (1ULL << 26); | |
| uint32_t mask = (g4 >> 31) - 1; | |
| h0 = (h0 & ~mask) | (g0 & mask); | |
| h1 = (h1 & ~mask) | (g1 & mask); | |
| h2 = (h2 & ~mask) | (g2 & mask); | |
| h3 = (h3 & ~mask) | (g3 & mask); | |
| h4 = (h4 & ~mask) | (g4 & mask); | |
| /* serialize h */ | |
| uint64_t f0 = (uint64_t)((h0 ) | (h1 << 26)) + le32toh(((uint32_t*)key)[4]); | |
| uint64_t f1 = (uint64_t)((h1 >> 6 ) | (h2 << 20)) + le32toh(((uint32_t*)key)[5]) + (f0 >> 32); | |
| uint64_t f2 = (uint64_t)((h2 >> 12) | (h3 << 14)) + le32toh(((uint32_t*)key)[6]) + (f1 >> 32); | |
| uint64_t f3 = (uint64_t)((h3 >> 18) | (h4 << 8 )) + le32toh(((uint32_t*)key)[7]) + (f2 >> 32); | |
| ((uint32_t*)out)[0] = htole32(f0); | |
| ((uint32_t*)out)[1] = htole32(f1); | |
| ((uint32_t*)out)[2] = htole32(f2); | |
| ((uint32_t*)out)[3] = htole32(f3); | |
| } | |
| #undef POLY1305_LOAD |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment