Skip to content

Instantly share code, notes, and snippets.

@BlobTheKat
Last active March 11, 2026 14:02
Show Gist options
  • Select an option

  • Save BlobTheKat/cadbd8edcef58869c4b7be13486774ed to your computer and use it in GitHub Desktop.

Select an option

Save BlobTheKat/cadbd8edcef58869c4b7be13486774ed to your computer and use it in GitHub Desktop.
Lean Chacha20 + Poly1305 + Blake2s implementation
/**
* Relatively fast and low-level ChaCha20, Poly1305 and Blake2s implementation
* Matthew Reiner, 2026
* Available under the GPL 3.0 license
* Fuzz-tested on over 2.5 billion unique inputs for each algorithm, to match libsodium's implementation 1:1
* Tests performed with clang -O3 and included UB-sanitizer and address-sanitizer
*/
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#ifdef __APPLE__
#include <libkern/OSByteOrder.h>
#include <machine/endian.h>
#define bswap_16(x) OSSwapInt16(x)
#define bswap_32(x) OSSwapInt32(x)
#define bswap_64(x) OSSwapInt64(x)
#define htole16(x) OSSwapHostToLittleInt16(x)
#define htole32(x) OSSwapHostToLittleInt32(x)
#define htole64(x) OSSwapHostToLittleInt64(x)
#define le16toh(x) OSSwapLittleToHostInt16(x)
#define le32toh(x) OSSwapLittleToHostInt32(x)
#define le64toh(x) OSSwapLittleToHostInt64(x)
#else
#include <byteswap.h>
#ifndef _DEFAULT_SOURCE
#define _DEFAULT_SOURCE
#endif
#include <endian.h>
#endif
#if defined(__cplusplus) && !defined(_Alignas)
#define _Alignas alignas
#endif
#define CHACHA20_QROUND(a, b, c, d) \
a += b; d ^= a; d = d << 16 | d >> 16; \
c += d; b ^= c; b = b << 12 | b >> 20; \
a += b; d ^= a; d = d << 8 | d >> 24; \
c += d; b ^= c; b = b << 7 | b >> 25
/**
 * One ChaCha20 block function (RFC 8439): permute the 16-word state through
 * 20 rounds (10 column/diagonal double rounds), add the input state, and XOR
 * the 64-byte little-endian serialization of the result into `out`.
 * @param in Host-endian initial state (4 constants, 8 key words, block counter, 3 nonce words)
 * @param out Content that will be XOR'd with chacha20 result. Zero this array before calling in order to get raw chacha20 output. No alignment requirement.
 */
void ChaCha20_block(const uint32_t in[16], uint8_t out[64]){
uint32_t state[16];
memcpy(state, in, 64);
for(int i = 0; i < 10; i++){ /* one iteration = one double round */
CHACHA20_QROUND(state[0], state[4], state[ 8], state[12]); // Column 0
CHACHA20_QROUND(state[1], state[5], state[ 9], state[13]); // Column 1
CHACHA20_QROUND(state[2], state[6], state[10], state[14]); // Column 2
CHACHA20_QROUND(state[3], state[7], state[11], state[15]); // Column 3
CHACHA20_QROUND(state[0], state[5], state[10], state[15]); // Diagonal 1 (main diagonal)
CHACHA20_QROUND(state[1], state[6], state[11], state[12]); // Diagonal 2
CHACHA20_QROUND(state[2], state[7], state[ 8], state[13]); // Diagonal 3
CHACHA20_QROUND(state[3], state[4], state[ 9], state[14]); // Diagonal 4
}
/* Byte-wise little-endian serialization. Produces the exact same bytes as the
 * former ((uint32_t*)out)[i] ^= htole32(...) store, but is portable ISO C:
 * no htole32 (nonstandard header / feature-macro dependent), no strict-aliasing
 * violation, and no 4-byte alignment requirement on `out`. */
for(int i = 0; i < 16; i++){
uint32_t w = state[i] + in[i];
out[i*4    ] ^= (uint8_t)w;
out[i*4 + 1] ^= (uint8_t)(w >> 8);
out[i*4 + 2] ^= (uint8_t)(w >> 16);
out[i*4 + 3] ^= (uint8_t)(w >> 24);
}
}
#undef CHACHA20_QROUND
#define BLAKE2S_QROUND(a,b,c,d,x,y) \
a = a + b + x; d ^= a; d = d<<16|d>>16; \
c = c + d; b ^= c; b = b>>12|b<<20; \
a = a + b + y; d ^= a; d = d>>8|d<<24; \
c = c + d; b ^= c; b = b>>7|b<<25;
/**
 * BLAKE2s-256 compression of exactly one 64-byte message block, treated as
 * both the first and the final block of the message.
 * The hard-wired constants are the BLAKE2s IV with the fixed parameters baked
 * in: h[0] = IV0 ^ 0x01010020 (digest_length = 32, fanout = 1, depth = 1),
 * v[12] = IV4 ^ 64 (byte counter t0 = 64) and v[14] = ~IV6 (final-block flag).
 * @param in 16 host-endian message words; on little-endian machines this is
 *           the raw 64 message bytes. NOTE(review): big-endian callers appear
 *           to need pre-swapped words since `in` is used without le32toh —
 *           confirm intended.
 * @param out 8 words receiving the 32-byte digest, serialized little-endian.
 */
void blake2s_block(uint32_t in[16], uint32_t out[8]){
/* Message word permutation schedule for the 10 rounds (RFC 7693 SIGMA) */
static const uint8_t sigma[10][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
{14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 },
{11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8 },
{ 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13 },
{ 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9 },
{12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11 },
{13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10 },
{ 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5 },
{10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0 }
};
uint32_t h[8] = { 0x6B08E647, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19}, v[16] = { 0x6B08E647, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E523F, 0x9B05688C, 0xE07C2654, 0x5BE0CD19};
for(int r = 0; r < 10; r++){
BLAKE2S_QROUND(v[0],v[4],v[8], v[12], in[sigma[r][0]], in[sigma[r][1]]);
BLAKE2S_QROUND(v[1],v[5],v[9], v[13], in[sigma[r][2]], in[sigma[r][3]]);
BLAKE2S_QROUND(v[2],v[6],v[10],v[14], in[sigma[r][4]], in[sigma[r][5]]);
BLAKE2S_QROUND(v[3],v[7],v[11],v[15], in[sigma[r][6]], in[sigma[r][7]]);
BLAKE2S_QROUND(v[0],v[5],v[10],v[15], in[sigma[r][8]], in[sigma[r][9]]);
BLAKE2S_QROUND(v[1],v[6],v[11],v[12], in[sigma[r][10]],in[sigma[r][11]]);
BLAKE2S_QROUND(v[2],v[7],v[8], v[13], in[sigma[r][12]],in[sigma[r][13]]);
BLAKE2S_QROUND(v[3],v[4],v[9], v[14], in[sigma[r][14]],in[sigma[r][15]]);
}
/* Finalization: h' = h ^ v_low ^ v_high, stored little-endian. The byte-wise
 * store produces the same bytes as the former htole32() store, but without
 * the nonstandard header / feature-macro dependency. */
for(int i = 0; i < 8; i++){
uint32_t w = h[i] ^ v[i] ^ v[i+8];
uint8_t b[4] = { (uint8_t)w, (uint8_t)(w >> 8), (uint8_t)(w >> 16), (uint8_t)(w >> 24) };
memcpy(&out[i], b, 4);
}
}
#undef BLAKE2S_QROUND
/* Portable little-endian 32-bit load/store. Byte-wise access imposes no
 * alignment requirement and no strict-aliasing hazard, unlike the former
 * (uint32_t*) casts + le32toh/htole32 (which also need nonstandard headers). */
static uint32_t poly1305_load32(const uint8_t *p){
return (uint32_t)p[0] | (uint32_t)p[1] << 8 | (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}
static void poly1305_store32(uint8_t *p, uint32_t w){
p[0] = (uint8_t)w; p[1] = (uint8_t)(w >> 8); p[2] = (uint8_t)(w >> 16); p[3] = (uint8_t)(w >> 24);
}
/* Split a 16-byte little-endian value into five 26-bit limbs m0..m4.
 * m4 must be pre-set by the caller: 0, or 0x1000000 for the 2^128 pad bit of
 * a full 16-byte message block. */
#define POLY1305_LOAD(m0,m1,m2,m3,m4,buf) m0 = poly1305_load32((const uint8_t*)(buf)); \
m1 = poly1305_load32((const uint8_t*)(buf) + 4); \
m2 = poly1305_load32((const uint8_t*)(buf) + 8); \
m3 = poly1305_load32((const uint8_t*)(buf) + 12); \
m4 |= m3 >> 8; \
m3 = (m2 >> 14 | m3 << 18) & 0x3ffffff; \
m2 = (m1 >> 20 | m2 << 12) & 0x3ffffff; \
m1 = (m0 >> 26 | m1 << 6) & 0x3ffffff; \
m0 &= 0x3ffffff;
/**
 * Poly1305 one-time authenticator (RFC 8439), radix-2^26 implementation:
 * h = ((m1*r^n + m2*r^(n-1) + ... + mn*r) mod 2^130-5) + s mod 2^128.
 * @param in Contents to compute checksum over. No alignment requirement
 * @param inlen Length of `in` in bytes
 * @param key Input key: r (first 16 bytes, clamped) || s (last 16 bytes). No alignment requirement.
 * @param out Content that will be populated with poly1305 result. No alignment requirement
 */
void poly1305(const uint8_t *in, size_t inlen, const uint8_t key[32], uint8_t out[16]){
uint32_t r0, r1, r2, r3, r4 = 0;
uint32_t s1, s2, s3, s4;
uint32_t h0 = 0, h1 = 0, h2 = 0, h3 = 0, h4 = 0;
POLY1305_LOAD(r0,r1,r2,r3,r4,key)
/* Clamp r per the spec (limb-wise equivalent of r &= 0x0ffffffc0ffffffc0ffffffc0fffffff) */
r1 &= 0x3ffff03;
r2 &= 0x3ffc0ff;
r3 &= 0x3f03fff;
r4 &= 0x00fffff;
/* Precompute 5*r limbs for the mod 2^130-5 reduction during multiplication */
s1 = r1 * 5; s2 = r2 * 5; s3 = r3 * 5; s4 = r4 * 5;
while (inlen) {
uint32_t m0, m1, m2, m3, m4 = 0;
uint32_t c;
int block = 16;
uint8_t buf[16] = {0};
if(inlen < 16){
/* Partial final block: append the 0x01 pad byte, rest stays zero */
buf[block = inlen] = 1;
}else m4 = 0x1000000; /* full block: pad bit 2^128 lands in limb 4 */
memcpy(buf, in, block);
POLY1305_LOAD(m0,m1,m2,m3,m4,buf)
h0 += m0; h1 += m1; h2 += m2; h3 += m3; h4 += m4;
/* multiply (h *= r) */
uint64_t d0 = (uint64_t)h0*r0 + (uint64_t)h1*s4 + (uint64_t)h2*s3 + (uint64_t)h3*s2 + (uint64_t)h4*s1;
uint64_t d1 = (uint64_t)h0*r1 + (uint64_t)h1*r0 + (uint64_t)h2*s4 + (uint64_t)h3*s3 + (uint64_t)h4*s2;
uint64_t d2 = (uint64_t)h0*r2 + (uint64_t)h1*r1 + (uint64_t)h2*r0 + (uint64_t)h3*s4 + (uint64_t)h4*s3;
uint64_t d3 = (uint64_t)h0*r3 + (uint64_t)h1*r2 + (uint64_t)h2*r1 + (uint64_t)h3*r0 + (uint64_t)h4*s4;
uint64_t d4 = (uint64_t)h0*r4 + (uint64_t)h1*r3 + (uint64_t)h2*r2 + (uint64_t)h3*r1 + (uint64_t)h4*r0;
/* carry propagation */
c = (d0 >> 26); h0 = d0 & 0x3ffffff; d1 += c;
c = (d1 >> 26); h1 = d1 & 0x3ffffff; d2 += c;
c = (d2 >> 26); h2 = d2 & 0x3ffffff; d3 += c;
c = (d3 >> 26); h3 = d3 & 0x3ffffff; d4 += c;
c = (d4 >> 26); h4 = d4 & 0x3ffffff; h0 += c * 5;
c = (h0 >> 26); h0 &= 0x3ffffff; h1 += c;
in += block;
inlen -= block;
}
/* final reduction */
uint32_t c;
c = h1 >> 26; h1 &= 0x3ffffff; h2 += c;
c = h2 >> 26; h2 &= 0x3ffffff; h3 += c;
c = h3 >> 26; h3 &= 0x3ffffff; h4 += c;
c = h4 >> 26; h4 &= 0x3ffffff; h0 += c * 5;
c = h0 >> 26; h0 &= 0x3ffffff; h1 += c;
/* compute h + -p to check if reduction needed */
uint32_t g0 = h0 + 5;
c = g0 >> 26; g0 &= 0x3ffffff;
uint32_t g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
uint32_t g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
uint32_t g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
uint32_t g4 = h4 + c - (1ULL << 26);
/* mask is all-ones iff h >= p (no borrow out of g4): constant-time select */
uint32_t mask = (g4 >> 31) - 1;
h0 = (h0 & ~mask) | (g0 & mask);
h1 = (h1 & ~mask) | (g1 & mask);
h2 = (h2 & ~mask) | (g2 & mask);
h3 = (h3 & ~mask) | (g3 & mask);
h4 = (h4 & ~mask) | (g4 & mask);
/* serialize h, adding s = key[16..31] with carry across the four words */
uint64_t f0 = (uint64_t)((h0      ) | (h1 << 26)) + poly1305_load32(key + 16);
uint64_t f1 = (uint64_t)((h1 >> 6 ) | (h2 << 20)) + poly1305_load32(key + 20) + (f0 >> 32);
uint64_t f2 = (uint64_t)((h2 >> 12) | (h3 << 14)) + poly1305_load32(key + 24) + (f1 >> 32);
uint64_t f3 = (uint64_t)((h3 >> 18) | (h4 << 8 )) + poly1305_load32(key + 28) + (f2 >> 32);
poly1305_store32(out,      (uint32_t)f0);
poly1305_store32(out + 4,  (uint32_t)f1);
poly1305_store32(out + 8,  (uint32_t)f2);
poly1305_store32(out + 12, (uint32_t)f3);
}
#undef POLY1305_LOAD
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment