Last active
September 1, 2025 07:53
-
-
Save jweinst1/4df56d34daa41c546f69928274280fbd to your computer and use it in GitHub Desktop.
quantizer into a hash bucket test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <array> | |
| #include <vector> | |
| #include <unordered_map> | |
| #include <random> | |
| #include <iostream> | |
| #include <limits> | |
| #include <cstdint> | |
| #include <chrono> | |
| constexpr size_t DIM = 128; | |
| constexpr size_t QUANT_BYTES = 4; // 32 bits total (128 dims / 4 dims per bit) | |
| using Vec = std::array<float, DIM>; | |
| using QuantVec = std::array<uint8_t, QUANT_BYTES>; | |
| // Quantize 128 floats into 32 bits (4 dims → 1 bit) | |
| QuantVec quantize_vector(const Vec& v, float threshold = 0.0f) { | |
| QuantVec q{}; | |
| for (size_t byte = 0; byte < QUANT_BYTES; ++byte) { | |
| uint8_t byte_val = 0; | |
| for (size_t bit = 0; bit < 8; ++bit) { | |
| size_t dim_base = (byte * 32) + (bit * 4); | |
| uint8_t b = 0; | |
| for (size_t d = 0; d < 4; ++d) | |
| if (v[dim_base + d] > threshold) b |= (1 << d); | |
| byte_val |= ((b & 0xF) << bit); | |
| } | |
| q[byte] = byte_val; | |
| } | |
| return q; | |
| } | |
| // Hash function for QuantVec | |
| struct QuantVecHash { | |
| size_t operator()(const QuantVec& q) const { | |
| size_t h = 0; | |
| for (auto b : q) | |
| h ^= std::hash<uint8_t>{}(b) + 0x9e3779b9 + (h << 6) + (h >> 2); | |
| return h; | |
| } | |
| }; | |
| // Popcount64 Hamming distance | |
| inline int popcount64(uint64_t x) { | |
| #if defined(__clang__) || defined(__GNUC__) | |
| return __builtin_popcountll(x); | |
| #else | |
| int cnt = 0; | |
| while (x) { x &= (x-1); ++cnt; } | |
| return cnt; | |
| #endif | |
| } | |
| // Encode vector as 2-bit-per-dim to allow Hamming calc | |
| uint64_t encode_vector(const Vec& v) { | |
| uint64_t enc = 0; | |
| for (size_t i = 0; i < 64; ++i) { | |
| if (v[i] > 0.0f) enc |= (1ULL << i); | |
| } | |
| return enc; | |
| } | |
| int hamming_distance(const Vec& a, const Vec& b) { | |
| uint64_t x = encode_vector(a); | |
| uint64_t y = encode_vector(b); | |
| return popcount64(x ^ y); | |
| } | |
| int main() { | |
| constexpr size_t N = 1000000; // dataset size | |
| constexpr size_t NQ = 10000000; // 10 million queries | |
| std::vector<Vec> collection(N); | |
| std::mt19937 rng(42); | |
| std::uniform_real_distribution<float> dist(-1.0f, 1.0f); | |
| for (auto& v : collection) | |
| for (auto& x : v) x = dist(rng); | |
| // Build hash index | |
| std::unordered_map<QuantVec, std::vector<size_t>, QuantVecHash> index; | |
| index.reserve(N * 2); | |
| for (size_t id = 0; id < N; ++id) { | |
| QuantVec q = quantize_vector(collection[id]); | |
| index[q].push_back(id); | |
| } | |
| std::cout << "Index built with " << index.size() | |
| << " non-empty buckets\n"; | |
| // Benchmark queries | |
| auto start = std::chrono::high_resolution_clock::now(); | |
| size_t total_candidates = 0; | |
| size_t total_queries_with_hits = 0; | |
| for (size_t qi = 0; qi < NQ; ++qi) { | |
| Vec query; | |
| for (auto& x : query) x = dist(rng); | |
| QuantVec qkey = quantize_vector(query); | |
| auto it = index.find(qkey); | |
| if (it == index.end()) continue; | |
| const auto& candidates = it->second; | |
| total_candidates += candidates.size(); | |
| ++total_queries_with_hits; | |
| // Find nearest among candidates | |
| int min_dist = std::numeric_limits<int>::max(); | |
| for (auto id : candidates) { | |
| int d = hamming_distance(query, collection[id]); | |
| if (d < min_dist) min_dist = d; | |
| } | |
| } | |
| auto end = std::chrono::high_resolution_clock::now(); | |
| double elapsed = std::chrono::duration<double>(end - start).count(); | |
| double qps = NQ / elapsed; | |
| std::cout << "Completed " << NQ << " queries in " | |
| << elapsed << " sec, QPS=" << qps << "\n"; | |
| std::cout << "Queries with hits: " << total_queries_with_hits << "\n"; | |
| std::cout << "Avg candidates per query: " | |
| << (double)total_candidates / std::max<size_t>(1,total_queries_with_hits) << "\n"; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <array> | |
| #include <vector> | |
| #include <unordered_map> | |
| #include <random> | |
| #include <iostream> | |
| #include <limits> | |
| #include <cstdint> | |
| constexpr size_t DIM = 128; | |
| constexpr size_t QUANT_BYTES = 4; // 32 bits total (128 dims / 4 dims per bit) | |
| using Vec = std::array<float, DIM>; | |
| using QuantVec = std::array<uint8_t, QUANT_BYTES>; | |
| // Simple quantization: pack 4 dimensions per bit into 32-bit result | |
| QuantVec quantize_vector(const Vec& v, float threshold = 0.0f) { | |
| QuantVec q{}; | |
| for (size_t byte = 0; byte < QUANT_BYTES; ++byte) { | |
| uint8_t byte_val = 0; | |
| for (size_t bit = 0; bit < 8; ++bit) { | |
| size_t dim_base = (byte * 32) + (bit * 4); | |
| uint8_t b = 0; | |
| for (size_t d = 0; d < 4; ++d) | |
| if (v[dim_base + d] > threshold) b |= (1 << d); | |
| byte_val |= ((b & 0xF) << bit); | |
| } | |
| q[byte] = byte_val; | |
| } | |
| return q; | |
| } | |
| // Hash function for std::array | |
| struct QuantVecHash { | |
| size_t operator()(const QuantVec& q) const { | |
| size_t h = 0; | |
| for (auto b : q) | |
| h ^= std::hash<uint8_t>{}(b) + 0x9e3779b9 + (h << 6) + (h >> 2); | |
| return h; | |
| } | |
| }; | |
| // Popcount64 Hamming distance | |
| inline int popcount64(uint64_t x) { | |
| #if defined(__clang__) || defined(__GNUC__) | |
| return __builtin_popcountll(x); | |
| #else | |
| int cnt = 0; | |
| while (x) { x &= (x-1); ++cnt; } | |
| return cnt; | |
| #endif | |
| } | |
| int hamming_distance(const Vec& a, const Vec& b) { | |
| int total = 0; | |
| for (size_t i = 0; i < DIM; i += 8) { | |
| uint64_t x = 0; | |
| for (size_t j = 0; j < 8; ++j) { | |
| if (a[i+j] > 0.0f) x |= (1ULL << j); | |
| } | |
| uint64_t y = 0; | |
| for (size_t j = 0; j < 8; ++j) { | |
| if (b[i+j] > 0.0f) y |= (1ULL << j); | |
| } | |
| total += popcount64(x ^ y); | |
| } | |
| return total; | |
| } | |
| int main() { | |
| // Generate random vectors | |
| constexpr size_t N = 100000; // number of vectors | |
| std::vector<Vec> collection(N); | |
| std::mt19937 rng(42); | |
| std::uniform_real_distribution<float> dist(-1.0f, 1.0f); | |
| for (auto& v : collection) | |
| for (auto& x : v) x = dist(rng); | |
| // Build index | |
| std::unordered_map<QuantVec, std::vector<size_t>, QuantVecHash> index; | |
| for (size_t id = 0; id < collection.size(); ++id) { | |
| QuantVec q = quantize_vector(collection[id]); | |
| index[q].push_back(id); | |
| } | |
| // Query test | |
| size_t hits = 0; | |
| size_t total_candidates = 0; | |
| for (size_t i = 0; i < 10; ++i) { | |
| Vec query; | |
| for (auto& x : query) x = dist(rng); | |
| QuantVec q_key = quantize_vector(query); | |
| auto it = index.find(q_key); | |
| std::vector<size_t> candidates; | |
| if (it != index.end()) | |
| candidates = it->second; | |
| total_candidates += candidates.size(); | |
| // Find nearest neighbor among candidates | |
| int min_dist = std::numeric_limits<int>::max(); | |
| size_t nn_id = 0; | |
| for (auto id : candidates) { | |
| int d = hamming_distance(query, collection[id]); | |
| if (d < min_dist) { min_dist = d; nn_id = id; } | |
| } | |
| if (!candidates.empty()) ++hits; | |
| std::cout << "Query " << i << ", candidates=" << candidates.size() | |
| << ", nn_id=" << nn_id << ", dist=" << min_dist << "\n"; | |
| } | |
| std::cout << "Total queries with candidates: " << hits | |
| << ", Total candidates scanned: " << total_candidates << "\n"; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment