Created
January 7, 2026 14:35
-
-
Save egorsmkv/69e5b456fc687709344cc4eaf56d9aca to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdint.h> | |
| #include <stdbool.h> | |
| #include <stdlib.h> | |
| #include <stdio.h> | |
| #include <math.h> | |
| #include <string.h> | |
| #include <time.h> | |
| /** | |
| * SPANN High-Performance Memory-Disk Hybrid Search | |
| * Adhering to ISO C11 and Systems Programming Best Practices. | |
| */ | |
| typedef struct { | |
| float *restrict coordinates; | |
| uint32_t dimension; | |
| } vector_t; | |
| typedef struct { | |
| vector_t centroid; | |
| uint64_t disk_offset; // Byte offset in the posting file | |
| uint32_t list_size; // Number of vectors in this list | |
| } posting_meta_t; | |
| typedef struct { | |
| uint32_t dimension; | |
| uint32_t num_centroids; | |
| posting_meta_t *centroids; | |
| FILE *posting_file; | |
| } spann_index_t; | |
| typedef struct { | |
| uint32_t id; | |
| float distance; | |
| } neighbor_t; | |
| /** | |
| * Squared L2 Distance - Core Hot Loop. | |
| * Uses restrict to inform compiler of no aliasing for SIMD optimization. | |
| */ | |
| static inline float l2_sq_dist(const uint32_t dim, | |
| const float *restrict a, | |
| const float *restrict b) { | |
| float sum = 0.0f; | |
| for (uint32_t i = 0; i < dim; ++i) { | |
| float diff = a[i] - b[i]; | |
| sum += diff * diff; | |
| } | |
| return sum; | |
| } | |
| /** | |
| * Query-Aware Dynamic Pruning Logic. | |
| * Identifies which posting lists to fetch from disk based on epsilon threshold. | |
| */ | |
| size_t select_posting_lists(const spann_index_t *restrict index, | |
| const vector_t *restrict query, | |
| const float epsilon_2, | |
| uint32_t *restrict active_indices, | |
| const size_t max_candidates) { | |
| if (index->num_centroids == 0) return 0; | |
| // Phase 1: Find the absolute nearest centroid (c_i1) | |
| float min_dist = INFINITY; | |
| for (uint32_t i = 0; i < index->num_centroids; ++i) { | |
| float d = l2_sq_dist(index->dimension, query->coordinates, | |
| index->centroids[i].centroid.coordinates); | |
| if (d < min_dist) { | |
| min_dist = d; | |
| } | |
| } | |
| // Phase 2: Apply threshold (1 + epsilon_2) * min_dist | |
| // Squared threshold logic | |
| float threshold = (1.0f + epsilon_2) * (1.0f + epsilon_2) * min_dist; | |
| size_t active_count = 0; | |
| for (uint32_t i = 0; i < index->num_centroids && active_count < max_candidates; ++i) { | |
| float d = l2_sq_dist(index->dimension, query->coordinates, | |
| index->centroids[i].centroid.coordinates); | |
| if (d <= threshold) { | |
| active_indices[active_count++] = i; | |
| } | |
| } | |
| return active_count; | |
| } | |
| /** | |
| * Executes the SPANN search. | |
| * Mimics SSD I/O by seeking and reading contiguous posting blocks. | |
| */ | |
| neighbor_t search_spann(const spann_index_t *restrict index, | |
| const vector_t *restrict query, | |
| const float epsilon_2) { | |
| uint32_t *active_lists = malloc(sizeof(uint32_t) * index->num_centroids); | |
| if (!active_lists) return (neighbor_t) { | |
| 0, INFINITY | |
| }; | |
| // We limit to 16 posting lists per query for latency control | |
| size_t num_lists = select_posting_lists(index, query, epsilon_2, active_lists, 16); | |
| neighbor_t best_neighbor = {0, INFINITY}; | |
| float *buffer = malloc(index->dimension * sizeof(float)); | |
| if (!buffer) { | |
| free(active_lists); | |
| return (neighbor_t) { | |
| 0, INFINITY | |
| }; | |
| } | |
| for (size_t i = 0; i < num_lists; ++i) { | |
| posting_meta_t *meta = &index->centroids[active_lists[i]]; | |
| // Seek to disk-resident posting list | |
| fseek(index->posting_file, (long)meta->disk_offset, SEEK_SET); | |
| for (uint32_t j = 0; j < meta->list_size; ++j) { | |
| if (fread(buffer, sizeof(float), index->dimension, index->posting_file) != index->dimension) { | |
| continue; | |
| } | |
| float d = l2_sq_dist(index->dimension, query->coordinates, buffer); | |
| if (d < best_neighbor.distance) { | |
| best_neighbor.distance = d; | |
| best_neighbor.id = (uint32_t)(meta->disk_offset / (index->dimension * sizeof(float))) + j; | |
| } | |
| } | |
| } | |
| free(buffer); | |
| free(active_lists); | |
| return best_neighbor; | |
| } | |
| // Memory Lifecycle Management | |
| void destroy_index(spann_index_t *index) { | |
| if (!index) return; | |
| for (uint32_t i = 0; i < index->num_centroids; ++i) { | |
| free(index->centroids[i].centroid.coordinates); | |
| } | |
| free(index->centroids); | |
| if (index->posting_file) fclose(index->posting_file); | |
| free(index); | |
| } | |
| /** | |
| * Demo Entry Point | |
| */ | |
| int main(void) { | |
| const uint32_t dim = 64; | |
| const uint32_t num_centroids = 10; | |
| const uint32_t vectors_per_list = 100; | |
| const char *posting_filename = "postings.bin"; | |
| printf("--- SPANN Algorithm Demo ---\n"); | |
| printf("Initializing index: Dim=%u, Centroids=%u, Vectors/List=%u\n", dim, num_centroids, vectors_per_list); | |
| // 1. Setup Mock Index and Posting File | |
| spann_index_t *index = calloc(1, sizeof(spann_index_t)); | |
| if (!index) return 1; | |
| index->dimension = dim; | |
| index->num_centroids = num_centroids; | |
| index->centroids = calloc(num_centroids, sizeof(posting_meta_t)); | |
| index->posting_file = fopen(posting_filename, "wb+"); | |
| if (!index->centroids || !index->posting_file) { | |
| fprintf(stderr, "Initialization failed.\n"); | |
| destroy_index(index); | |
| return 1; | |
| } | |
| // 2. Generate synthetic data | |
| srand((unsigned int)time(NULL)); | |
| for (uint32_t i = 0; i < num_centroids; ++i) { | |
| index->centroids[i].centroid.coordinates = malloc(dim * sizeof(float)); | |
| index->centroids[i].centroid.dimension = dim; | |
| index->centroids[i].list_size = vectors_per_list; | |
| index->centroids[i].disk_offset = ftell(index->posting_file); | |
| // Random Centroid | |
| for (uint32_t d = 0; d < dim; d++) { | |
| index->centroids[i].centroid.coordinates[d] = (float)rand() / (float)RAND_MAX; | |
| } | |
| // Write "Vectors" near this centroid to the posting file | |
| for (uint32_t v = 0; v < vectors_per_list; v++) { | |
| float vec[64]; | |
| for (uint32_t d = 0; d < dim; d++) { | |
| // Jittering around the centroid | |
| vec[d] = index->centroids[i].centroid.coordinates[d] + ((float)rand() / (float)RAND_MAX * 0.1f); | |
| } | |
| fwrite(vec, sizeof(float), dim, index->posting_file); | |
| } | |
| } | |
| fflush(index->posting_file); | |
| // 3. Define Query | |
| vector_t query; | |
| query.dimension = dim; | |
| query.coordinates = malloc(dim * sizeof(float)); | |
| for (uint32_t d = 0; d < dim; d++) { | |
| query.coordinates[d] = (float)rand() / (float)RAND_MAX; | |
| } | |
| // 4. Perform Search | |
| float epsilon_2 = 0.5f; // Threshold parameter from SPANN paper | |
| printf("Executing search with epsilon_2 = %.2f...\n", epsilon_2); | |
| clock_t start = clock(); | |
| neighbor_t result = search_spann(index, &query, epsilon_2); | |
| clock_t end = clock(); | |
| double time_spent = (double)(end - start) / CLOCKS_PER_SEC; | |
| // 5. Display Results | |
| if (result.distance == INFINITY) { | |
| printf("Search failed or no neighbors found.\n"); | |
| } else { | |
| printf("Nearest Neighbor ID: %u\n", result.id); | |
| printf("Squared L2 Distance: %f\n", result.distance); | |
| printf("Search Latency: %f ms\n", time_spent * 1000.0); | |
| } | |
| // Cleanup | |
| free(query.coordinates); | |
| destroy_index(index); | |
| remove(posting_filename); // Remove mock disk file | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment