Skip to content

Instantly share code, notes, and snippets.

@jweinst1
Last active September 1, 2025 07:53
Show Gist options
  • Select an option

  • Save jweinst1/4df56d34daa41c546f69928274280fbd to your computer and use it in GitHub Desktop.

Select an option

Save jweinst1/4df56d34daa41c546f69928274280fbd to your computer and use it in GitHub Desktop.
quantizer into a hash bucket test
#include <array>
#include <vector>
#include <unordered_map>
#include <random>
#include <iostream>
#include <limits>
#include <cstdint>
#include <chrono>
constexpr size_t DIM = 128;
constexpr size_t QUANT_BYTES = 4; // 32 bits total (128 dims / 4 dims per bit)
using Vec = std::array<float, DIM>;
using QuantVec = std::array<uint8_t, QUANT_BYTES>;
// Quantize 128 floats into 32 bits (4 dims → 1 bit)
QuantVec quantize_vector(const Vec& v, float threshold = 0.0f) {
QuantVec q{};
for (size_t byte = 0; byte < QUANT_BYTES; ++byte) {
uint8_t byte_val = 0;
for (size_t bit = 0; bit < 8; ++bit) {
size_t dim_base = (byte * 32) + (bit * 4);
uint8_t b = 0;
for (size_t d = 0; d < 4; ++d)
if (v[dim_base + d] > threshold) b |= (1 << d);
byte_val |= ((b & 0xF) << bit);
}
q[byte] = byte_val;
}
return q;
}
// Hash function for QuantVec
struct QuantVecHash {
size_t operator()(const QuantVec& q) const {
size_t h = 0;
for (auto b : q)
h ^= std::hash<uint8_t>{}(b) + 0x9e3779b9 + (h << 6) + (h >> 2);
return h;
}
};
// Popcount64 Hamming distance
inline int popcount64(uint64_t x) {
#if defined(__clang__) || defined(__GNUC__)
return __builtin_popcountll(x);
#else
int cnt = 0;
while (x) { x &= (x-1); ++cnt; }
return cnt;
#endif
}
// Encode vector as 2-bit-per-dim to allow Hamming calc
uint64_t encode_vector(const Vec& v) {
uint64_t enc = 0;
for (size_t i = 0; i < 64; ++i) {
if (v[i] > 0.0f) enc |= (1ULL << i);
}
return enc;
}
int hamming_distance(const Vec& a, const Vec& b) {
uint64_t x = encode_vector(a);
uint64_t y = encode_vector(b);
return popcount64(x ^ y);
}
int main() {
constexpr size_t N = 1000000; // dataset size
constexpr size_t NQ = 10000000; // 10 million queries
std::vector<Vec> collection(N);
std::mt19937 rng(42);
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& v : collection)
for (auto& x : v) x = dist(rng);
// Build hash index
std::unordered_map<QuantVec, std::vector<size_t>, QuantVecHash> index;
index.reserve(N * 2);
for (size_t id = 0; id < N; ++id) {
QuantVec q = quantize_vector(collection[id]);
index[q].push_back(id);
}
std::cout << "Index built with " << index.size()
<< " non-empty buckets\n";
// Benchmark queries
auto start = std::chrono::high_resolution_clock::now();
size_t total_candidates = 0;
size_t total_queries_with_hits = 0;
for (size_t qi = 0; qi < NQ; ++qi) {
Vec query;
for (auto& x : query) x = dist(rng);
QuantVec qkey = quantize_vector(query);
auto it = index.find(qkey);
if (it == index.end()) continue;
const auto& candidates = it->second;
total_candidates += candidates.size();
++total_queries_with_hits;
// Find nearest among candidates
int min_dist = std::numeric_limits<int>::max();
for (auto id : candidates) {
int d = hamming_distance(query, collection[id]);
if (d < min_dist) min_dist = d;
}
}
auto end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration<double>(end - start).count();
double qps = NQ / elapsed;
std::cout << "Completed " << NQ << " queries in "
<< elapsed << " sec, QPS=" << qps << "\n";
std::cout << "Queries with hits: " << total_queries_with_hits << "\n";
std::cout << "Avg candidates per query: "
<< (double)total_candidates / std::max<size_t>(1,total_queries_with_hits) << "\n";
}
#include <array>
#include <vector>
#include <unordered_map>
#include <random>
#include <iostream>
#include <limits>
#include <cstdint>
constexpr size_t DIM = 128;
constexpr size_t QUANT_BYTES = 4; // 32 bits total (128 dims / 4 dims per bit)
using Vec = std::array<float, DIM>;
using QuantVec = std::array<uint8_t, QUANT_BYTES>;
// Simple quantization: pack 4 dimensions per bit into 32-bit result
QuantVec quantize_vector(const Vec& v, float threshold = 0.0f) {
QuantVec q{};
for (size_t byte = 0; byte < QUANT_BYTES; ++byte) {
uint8_t byte_val = 0;
for (size_t bit = 0; bit < 8; ++bit) {
size_t dim_base = (byte * 32) + (bit * 4);
uint8_t b = 0;
for (size_t d = 0; d < 4; ++d)
if (v[dim_base + d] > threshold) b |= (1 << d);
byte_val |= ((b & 0xF) << bit);
}
q[byte] = byte_val;
}
return q;
}
// Hash function for std::array
struct QuantVecHash {
size_t operator()(const QuantVec& q) const {
size_t h = 0;
for (auto b : q)
h ^= std::hash<uint8_t>{}(b) + 0x9e3779b9 + (h << 6) + (h >> 2);
return h;
}
};
// Popcount64 Hamming distance
inline int popcount64(uint64_t x) {
#if defined(__clang__) || defined(__GNUC__)
return __builtin_popcountll(x);
#else
int cnt = 0;
while (x) { x &= (x-1); ++cnt; }
return cnt;
#endif
}
int hamming_distance(const Vec& a, const Vec& b) {
int total = 0;
for (size_t i = 0; i < DIM; i += 8) {
uint64_t x = 0;
for (size_t j = 0; j < 8; ++j) {
if (a[i+j] > 0.0f) x |= (1ULL << j);
}
uint64_t y = 0;
for (size_t j = 0; j < 8; ++j) {
if (b[i+j] > 0.0f) y |= (1ULL << j);
}
total += popcount64(x ^ y);
}
return total;
}
int main() {
// Generate random vectors
constexpr size_t N = 100000; // number of vectors
std::vector<Vec> collection(N);
std::mt19937 rng(42);
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& v : collection)
for (auto& x : v) x = dist(rng);
// Build index
std::unordered_map<QuantVec, std::vector<size_t>, QuantVecHash> index;
for (size_t id = 0; id < collection.size(); ++id) {
QuantVec q = quantize_vector(collection[id]);
index[q].push_back(id);
}
// Query test
size_t hits = 0;
size_t total_candidates = 0;
for (size_t i = 0; i < 10; ++i) {
Vec query;
for (auto& x : query) x = dist(rng);
QuantVec q_key = quantize_vector(query);
auto it = index.find(q_key);
std::vector<size_t> candidates;
if (it != index.end())
candidates = it->second;
total_candidates += candidates.size();
// Find nearest neighbor among candidates
int min_dist = std::numeric_limits<int>::max();
size_t nn_id = 0;
for (auto id : candidates) {
int d = hamming_distance(query, collection[id]);
if (d < min_dist) { min_dist = d; nn_id = id; }
}
if (!candidates.empty()) ++hits;
std::cout << "Query " << i << ", candidates=" << candidates.size()
<< ", nn_id=" << nn_id << ", dist=" << min_dist << "\n";
}
std::cout << "Total queries with candidates: " << hits
<< ", Total candidates scanned: " << total_candidates << "\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment