Skip to content

Instantly share code, notes, and snippets.

@jweinst1
Last active February 20, 2026 10:02
Show Gist options
  • Select an option

  • Save jweinst1/9dd64e8a8f206d81b14d8c9117803ea3 to your computer and use it in GitHub Desktop.

Select an option

Save jweinst1/9dd64e8a8f206d81b14d8c9117803ea3 to your computer and use it in GitHub Desktop.
region specific pattern of quantization for floats
#include <cstdint>
#include <cstring>
#include <cstdio>
#include <random>
#include <limits>
#include <algorithm>
#include <array>
#include <vector>
#include <cmath>
#include <map>
#include <bitset>
#include <iostream>
static constexpr size_t dimensionLength = 64;
static constexpr size_t bitDimLengt = dimensionLength / 64;
struct FloatVec {
std::array<float, dimensionLength> data{};
bool operator==(const FloatVec& other) const {
return data == other.data;
}
void print() const {
putc('\n', stdout);
for (int i = 0; i < dimensionLength; ++i)
{
printf("| %f | ", data[i]);
}
putc('\n', stdout);
}
static std::vector<FloatVec> makeRandVec(size_t amount) {
std::random_device rd;
std::mt19937 gen(rd());
float min = -1.0f;
float max = 1.0f;
std::uniform_real_distribution<float> dis(
std::nextafter(min, max), // Just above -1.0
max // Just below 1.0 (distribution is naturally exclusive of max)
);
std::vector<FloatVec> fvs;
for (int i = 0; i < amount; ++i)
{
FloatVec fv;
for (int j = 0; j < dimensionLength; ++j)
{
fv.data[j] = dis(gen);
}
fvs.push_back(fv);
}
return fvs;
}
uint64_t quantize() const {
uint64_t result = 0;
for (int i = 0; i < dimensionLength; ++i)
{
uint64_t posBit = !std::signbit(data[i]) ? 1 : 0;
result |= posBit << i;
}
return result;
}
};
static void printRegions(uint64_t qn) {
for (int i = 0; i < 64; i += 4)
{
if (((qn >> i) & 0xf) == 0xf) {
printf("1 ");
} else {
printf("0 ");
}
}
putc('\n', stdout);
}
static uint16_t quantize16(uint64_t qn) {
uint16_t key = 0;
int j = 0;
for (int i = 0; i < 64; i += 4)
{
if (((qn >> i) & 0xf) == 0xf) {
key |= uint16_t{1} << j;
}
++j;
}
return key;
}
static uint32_t quantize32(uint64_t qn) {
uint32_t key = 0;
int j = 0;
for (int i = 0; i < 64; i += 2)
{
if (((qn >> i) & 0b11) == 0b11) {
key |= uint16_t{1} << j;
}
++j;
}
return key;
}
float calculateEuclideanDistance(const FloatVec& a, const FloatVec& b) {
float sum = 0.0f;
for (int i = 0; i < dimensionLength; ++i) {
float diff = a.data[i] - b.data[i];
sum += diff * diff;
}
return std::sqrt(sum);
}
static FloatVec calcMeanWithVec(const std::vector<FloatVec>& vecs) {
FloatVec fv;
for (const auto& vec: vecs) {
for (int i = 0; i < dimensionLength; ++i)
{
fv.data[i] += vec.data[i];
}
}
for (int i = 0; i < dimensionLength; ++i)
{
fv.data[i] = fv.data[i] / (float)vecs.size();
}
return fv;
}
static FloatVec findNearestNeighbor(const std::vector<FloatVec>& vecs, const FloatVec& query) {
float closestDist = std::numeric_limits<float>::max();
size_t closestVec = (size_t)-1;
for (size_t i = 0; i < vecs.size(); ++i)
{
float dist = calculateEuclideanDistance(query, vecs[i]);
if (closestDist > dist) {
closestDist = dist;
closestVec = i;
}
}
return vecs[closestVec];
}
static constexpr size_t testSampleSize = 1024 * 100;
int main(int argc, char const *argv[])
{
std::array<std::vector<FloatVec>, 16> popCounted16{};
std::array<std::vector<FloatVec>, 32> popCounted32{};
std::array<std::vector<FloatVec>, 64> popCounted64{};
std::map<uint16_t, std::vector<FloatVec>> mappedKeys;
std::map<uint32_t, std::vector<FloatVec>> mappedKeys32;
auto vecs = FloatVec::makeRandVec(50000);
for (const auto& vec : vecs) {
uint64_t compressed = vec.quantize();
uint16_t superComp = quantize16(compressed);
uint32_t superComp32 = quantize32(compressed);
mappedKeys[superComp].push_back(vec);
mappedKeys32[superComp32].push_back(vec);
popCounted16[ __builtin_popcount(superComp) ].push_back(vec);
popCounted32[__builtin_popcount(superComp32)].push_back(vec);
popCounted64[__builtin_popcountll(compressed)].push_back(vec);
}
for (int i = 0; i < 16; ++i)
{
printf("16{%d} - size %zu\n", i, popCounted16[i].size());
}
for (int i = 0; i < 32; ++i)
{
printf("32{%d} - size %zu\n",i, popCounted32[i].size());
}
for (int i = 0; i < 64; ++i)
{
printf("64{%d} - size %zu\n",i, popCounted64[i].size());
}
auto vecQuery = FloatVec::makeRandVec(1000);
puts("NEAREST TEST START__________");
for (const auto& q : vecQuery)
{
FloatVec baseNearest = findNearestNeighbor(vecs, q);
uint64_t compressed = q.quantize();
uint16_t superComp = quantize16(compressed);
int index16 = __builtin_popcount(superComp);
FloatVec quantNearest = findNearestNeighbor(popCounted16[index16], q);
printf("Nearest Test index:%d result:%s\n", index16, baseNearest == quantNearest ? "PASS" : "FAIL");
}
puts("NEAREST TEST END__________");
FloatVec fv1 = calcMeanWithVec(popCounted16[0]);
FloatVec fvClose = calcMeanWithVec(popCounted16[1]);
FloatVec fv2 = calcMeanWithVec(popCounted16[5]);
printf("Distance16 0 5 is %f\n", calculateEuclideanDistance(fv1, fv2));
printf("Distance inner 5 is %f\n", calculateEuclideanDistance(popCounted16[5][0], popCounted16[5][1]));
printf("Distance16 0 1 is %f\n", calculateEuclideanDistance(fv1, fvClose));
size_t first32WithV = 0;
size_t first32WithVNext = 0;
size_t last32WithV = 0;
for (int i = 0; i < 32; ++i)
{
if (popCounted32[i].size() > 0) {
first32WithV = i;
first32WithVNext = i + 1;
break;
}
}
for (int i = 31; i >= 0; --i)
{
if (popCounted32[i].size() > 0) {
last32WithV = i;
break;
}
}
/*
Distance16 0 5 is 1.931163
Distance16 0 1 is 0.300208
Distance16 inner 00 6.095141
first last 32 dist mean 1 and 18 is 6.006310
NEXT first 32 dist mean 1 and 2 is 2.812121
*/
FloatVec v32First = calcMeanWithVec(popCounted32[first32WithV]);
FloatVec v32FirstNext = calcMeanWithVec(popCounted32[first32WithVNext]);
FloatVec v32Last = calcMeanWithVec(popCounted32[last32WithV]);
printf("first last 32 dist mean %zu and %zu is %f\n", first32WithV, last32WithV, calculateEuclideanDistance(v32First, v32Last));
printf("NEXT first 32 dist mean %zu and %zu is %f\n", first32WithV, first32WithVNext, calculateEuclideanDistance(v32First, v32FirstNext));
// 64 start
size_t first64WithV = 0;
size_t first64WithVNext = 0;
size_t last64WithV = 0;
for (int i = 0; i < 64; ++i)
{
if (popCounted64[i].size() > 0) {
first64WithV = i;
first64WithVNext = i + 1;
break;
}
}
for (int i = 63; i >= 0; --i)
{
if (popCounted64[i].size() > 0) {
last64WithV = i;
break;
}
}
FloatVec v64First = calcMeanWithVec(popCounted64[first64WithV]);
FloatVec v64FirstNext = calcMeanWithVec(popCounted64[first64WithVNext]);
FloatVec v64Last = calcMeanWithVec(popCounted64[last64WithV]);
printf("first last 64 dist mean %zu and %zu is %f\n", first64WithV, last64WithV, calculateEuclideanDistance(v64First, v64Last));
printf("NEXT first 64 dist mean %zu and %zu is %f\n", first64WithV, first64WithVNext, calculateEuclideanDistance(v64First, v64FirstNext));
printf("inner 64 %f\n", calculateEuclideanDistance(popCounted64[23][0], popCounted64[23][1]));
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment