Last active
February 22, 2026 10:35
-
-
Save jweinst1/edfcc8483688ba5ebb684af011cd9d26 to your computer and use it in GitHub Desktop.
design of multi map hash map for graph db
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <iostream> | |
| #include <vector> | |
| #include <unordered_map> | |
| #include <random> | |
| #include <chrono> | |
| #include <cstdint> | |
| #include <limits> | |
| #include <cstring> | |
| #include <cstdlib> | |
| #include <cstdio> | |
| #include <array> | |
| struct IdGen { | |
| std::random_device rd; | |
| std::mt19937 gen; | |
| std::uniform_int_distribution<uint64_t> distrib; | |
| IdGen(): rd(), gen(rd()), distrib(std::numeric_limits<uint64_t>::min(), | |
| std::numeric_limits<uint64_t>::max()) {} | |
| uint64_t next() { | |
| return static_cast<uint64_t>(distrib(gen)); | |
| } | |
| }; | |
| static inline uint32_t rotl32(uint32_t x, int8_t r) { | |
| return (x << r) | (x >> (32 - r)); | |
| } | |
| static inline uint32_t fmix32(uint32_t h) { | |
| h ^= h >> 16; | |
| h *= 0x85ebca6b; | |
| h ^= h >> 13; | |
| h *= 0xc2b2ae35; | |
| h ^= h >> 16; | |
| return h; | |
| } | |
| static uint32_t murmur3_32(const void* key, size_t len, uint32_t seed = 0) { | |
| const uint8_t* data = static_cast<const uint8_t*>(key); | |
| const size_t nblocks = len / 4; | |
| uint32_t h1 = seed; | |
| constexpr uint32_t c1 = 0xcc9e2d51; | |
| constexpr uint32_t c2 = 0x1b873593; | |
| // body | |
| const uint32_t* blocks = reinterpret_cast<const uint32_t*>(data); | |
| for (size_t i = 0; i < nblocks; i++) { | |
| uint32_t k1 = blocks[i]; | |
| k1 *= c1; | |
| k1 = rotl32(k1, 15); | |
| k1 *= c2; | |
| h1 ^= k1; | |
| h1 = rotl32(h1, 13); | |
| h1 = h1 * 5 + 0xe6546b64; | |
| } | |
| // tail | |
| const uint8_t* tail = data + nblocks * 4; | |
| uint32_t k1 = 0; | |
| switch (len & 3) { | |
| case 3: k1 ^= uint32_t(tail[2]) << 16; | |
| case 2: k1 ^= uint32_t(tail[1]) << 8; | |
| case 1: k1 ^= uint32_t(tail[0]); | |
| k1 *= c1; | |
| k1 = rotl32(k1, 15); | |
| k1 *= c2; | |
| h1 ^= k1; | |
| } | |
| // finalization | |
| h1 ^= static_cast<uint32_t>(len); | |
| h1 = fmix32(h1); | |
| return h1; | |
| } | |
| // Finalization mix - force all bits of a hash block to avalanche | |
| static inline uint64_t fmix64(uint64_t k) { | |
| k ^= k >> 33; | |
| k *= 0xff51afd7ed558ccdULL; | |
| k ^= k >> 33; | |
| k *= 0xc4ceb9fe1a85ec53ULL; | |
| k ^= k >> 33; | |
| return k; | |
| } | |
| static uint64_t murmur3_64(const void *key, size_t len, uint32_t seed = 0) { | |
| const uint8_t *data = (const uint8_t *)key; | |
| const int nblocks = len / 16; | |
| uint64_t h1 = seed; | |
| uint64_t h2 = seed; | |
| constexpr uint64_t c1 = 0x87c37b91114253d5ULL; | |
| constexpr uint64_t c2 = 0x4cf5ad432745937fULL; | |
| //---------- | |
| // body | |
| const uint64_t *blocks = (const uint64_t *)(data); | |
| for (int i = 0; i < nblocks; i++) { | |
| uint64_t k1 = blocks[i * 2 + 0]; | |
| uint64_t k2 = blocks[i * 2 + 1]; | |
| k1 *= c1; k1 = (k1 << 31) | (k1 >> (64 - 31)); k1 *= c2; h1 ^= k1; | |
| h1 = (h1 << 27) | (h1 >> (64 - 27)); h1 += h2; h1 = h1 * 5 + 0x52dce729; | |
| k2 *= c2; k2 = (k2 << 33) | (k2 >> (64 - 33)); k2 *= c1; h2 ^= k2; | |
| h2 = (h2 << 31) | (h2 >> (64 - 31)); h2 += h1; h2 = h2 * 5 + 0x38495ab5; | |
| } | |
| //---------- | |
| // tail | |
| const uint8_t *tail = (const uint8_t *)(data + nblocks * 16); | |
| uint64_t k1 = 0; | |
| uint64_t k2 = 0; | |
| switch (len & 15) { | |
| case 15: k2 ^= ((uint64_t)tail[14]) << 48; | |
| case 14: k2 ^= ((uint64_t)tail[13]) << 40; | |
| case 13: k2 ^= ((uint64_t)tail[12]) << 32; | |
| case 12: k2 ^= ((uint64_t)tail[11]) << 24; | |
| case 11: k2 ^= ((uint64_t)tail[10]) << 16; | |
| case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; | |
| case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; | |
| k2 *= c2; k2 = (k2 << 33) | (k2 >> (64 - 33)); k2 *= c1; h2 ^= k2; | |
| case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; | |
| case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; | |
| case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; | |
| case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; | |
| case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; | |
| case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; | |
| case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; | |
| case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; | |
| k1 *= c1; k1 = (k1 << 31) | (k1 >> (64 - 31)); k1 *= c2; h1 ^= k1; | |
| } | |
| //---------- | |
| // finalization | |
| h1 ^= len; | |
| h2 ^= len; | |
| h1 += h2; | |
| h2 += h1; | |
| h1 = fmix64(h1); | |
| h2 = fmix64(h2); | |
| h1 += h2; | |
| // Return 64-bit hash (can return h2 or h1 depending on needs) | |
| return h1; | |
| } | |
| struct Relation { | |
| uint32_t key; | |
| uint32_t val; | |
| void setKey(const void* input, size_t size) { | |
| key = murmur3_32(input, size); | |
| } | |
| void setVal(const void* input, size_t size) { | |
| val = murmur3_32(input, size); | |
| } | |
| }; | |
| /** | |
| class HashOnlyLinearMap { | |
| struct KVPair { | |
| uint32_t _hash = 0; | |
| uint32_t _value; | |
| inline bool add(uint32_t hash, uint32_t val) { | |
| if (_hash == 0) { | |
| _hash = hash; | |
| _value = val; | |
| return true; | |
| } else if (_hash == hash) { | |
| _value = val; | |
| return true; | |
| } | |
| return false; | |
| } | |
| inline HashQueryRes find(uint32_t hash, uint32_t* val) const { | |
| if (_hash == 0) { | |
| return HashQueryRes::Empty; | |
| } else if (_hash == hash) { | |
| *val = _value; | |
| return HashQueryRes::Found; | |
| } | |
| return HashQueryRes::Occupied; | |
| } | |
| }; | |
| void insert(uint32_t hash, uint32_t value) { | |
| size_t idx = hash & mask; | |
| while (true) { | |
| Slot& s = table[idx]; | |
| if (s.add(hash, value)) { | |
| return; | |
| } | |
| idx = (idx + 1) & mask; // linear probe | |
| } | |
| } | |
| uint32_t find(uint32_t hash) const { | |
| size_t idx = hash & mask; | |
| uint32_t myNum; | |
| while (true) { | |
| const Slot& s = table[idx]; | |
| const HashQueryRes res = s.find(hash, &myNum); | |
| if (res == HashQueryRes::Found) { | |
| return myNum; | |
| } else if (res == HashQueryRes::Empty) { | |
| return 0; | |
| } | |
| idx = (idx + 1) & mask; | |
| } | |
| } | |
| * */ | |
| template<size_t tableSize> | |
| struct RelMap { | |
| static_assert(__builtin_popcountll(tableSize) == 1, | |
| "RelMap must be sized by a power of two"); | |
| static constexpr size_t tableSizeMask = tableSize - 1; | |
| std::array<Relation, tableSize> data{}; | |
| /** | |
| * Allows multiple of the same keys | |
| * Strategy 1 : have each val be the next index of the same key | |
| * Strategy 2 : store keys without regard to read effeciency | |
| * Strategy 3 : store keys with a count of how many other keys exist | |
| * 3 would involve incrementing the value as an insert passes through and the read | |
| * simply tracks it as it traverses. | |
| * */ | |
| void insert(const Relation& rel) { | |
| size_t idx = rel.key & tableSizeMask; | |
| while (true) { | |
| Relation& s = data[idx]; | |
| if (s.key == 0) { | |
| s = rel; | |
| return; | |
| } | |
| idx = (idx + 1) & tableSizeMask; | |
| } | |
| } | |
| /** | |
| * Todo, different find modes | |
| * breath vs depth | |
| * */ | |
| template<size_t resultSize> | |
| void find(uint32_t key, std::array<uint32_t, resultSize>& results) const { | |
| size_t idx = key & tableSizeMask; | |
| size_t curRes = 0; | |
| // todo think about full table and less than results present? | |
| while (curRes < resultSize) { | |
| const Relation& s = data[idx]; | |
| if (s.key == 0) { | |
| return; | |
| } else if (s.key == key) { | |
| results[curRes++] = s.val; | |
| } | |
| idx = (idx + 1) & tableSizeMask; | |
| } | |
| } | |
| }; | |
| static void multiPerfTest() { | |
| IdGen gens; | |
| std::vector<Relation> nums; | |
| RelMap<1 << 23>* myMap = new RelMap<1 << 23>(); | |
| for (int i = 0; i < 1000000; ++i) | |
| { | |
| uint32_t chosen = gens.next() & std::numeric_limits<uint32_t>::max(); | |
| Relation r; | |
| r.key = chosen; | |
| r.val = chosen; | |
| nums.push_back(r); | |
| } | |
| auto startIns = std::chrono::high_resolution_clock::now(); | |
| for (const auto& rel : nums) | |
| { | |
| myMap->insert(rel); | |
| myMap->insert(rel); | |
| myMap->insert(rel); | |
| myMap->insert(rel); | |
| } | |
| auto endIns = std::chrono::high_resolution_clock::now(); | |
| std::cout << "hash ins " << std::chrono::duration_cast<std::chrono::microseconds>(endIns - startIns).count() << "us\n"; | |
| auto startFind = std::chrono::high_resolution_clock::now(); | |
| size_t total = 0; | |
| for (const auto& rel : nums) | |
| { | |
| std::array<uint32_t, 4> results{}; | |
| myMap->find(rel.key, results); | |
| total += results[2]; | |
| } | |
| auto endFind = std::chrono::high_resolution_clock::now(); | |
| std::cout << total << " hash find " << std::chrono::duration_cast<std::chrono::microseconds>(endFind - startFind).count() << "us\n"; | |
| /* | |
| hash ins 35968us | |
| 2147674861900849 hash find 28633us | |
| */ | |
| } | |
| int main(int argc, char const *argv[]) | |
| { | |
| printf("%zu\n", sizeof(Relation)); | |
| multiPerfTest(); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment