Skip to content

Instantly share code, notes, and snippets.

@egorsmkv
Created January 7, 2026 14:35
Show Gist options
  • Select an option

  • Save egorsmkv/69e5b456fc687709344cc4eaf56d9aca to your computer and use it in GitHub Desktop.

Select an option

Save egorsmkv/69e5b456fc687709344cc4eaf56d9aca to your computer and use it in GitHub Desktop.
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <time.h>
/**
* SPANN High-Performance Memory-Disk Hybrid Search
* Adhering to ISO C11 and Systems Programming Best Practices.
*/
typedef struct {
float *restrict coordinates;
uint32_t dimension;
} vector_t;
typedef struct {
vector_t centroid;
uint64_t disk_offset; // Byte offset in the posting file
uint32_t list_size; // Number of vectors in this list
} posting_meta_t;
typedef struct {
uint32_t dimension;
uint32_t num_centroids;
posting_meta_t *centroids;
FILE *posting_file;
} spann_index_t;
typedef struct {
uint32_t id;
float distance;
} neighbor_t;
/**
* Squared L2 Distance - Core Hot Loop.
* Uses restrict to inform compiler of no aliasing for SIMD optimization.
*/
static inline float l2_sq_dist(const uint32_t dim,
const float *restrict a,
const float *restrict b) {
float sum = 0.0f;
for (uint32_t i = 0; i < dim; ++i) {
float diff = a[i] - b[i];
sum += diff * diff;
}
return sum;
}
/**
* Query-Aware Dynamic Pruning Logic.
* Identifies which posting lists to fetch from disk based on epsilon threshold.
*/
size_t select_posting_lists(const spann_index_t *restrict index,
const vector_t *restrict query,
const float epsilon_2,
uint32_t *restrict active_indices,
const size_t max_candidates) {
if (index->num_centroids == 0) return 0;
// Phase 1: Find the absolute nearest centroid (c_i1)
float min_dist = INFINITY;
for (uint32_t i = 0; i < index->num_centroids; ++i) {
float d = l2_sq_dist(index->dimension, query->coordinates,
index->centroids[i].centroid.coordinates);
if (d < min_dist) {
min_dist = d;
}
}
// Phase 2: Apply threshold (1 + epsilon_2) * min_dist
// Squared threshold logic
float threshold = (1.0f + epsilon_2) * (1.0f + epsilon_2) * min_dist;
size_t active_count = 0;
for (uint32_t i = 0; i < index->num_centroids && active_count < max_candidates; ++i) {
float d = l2_sq_dist(index->dimension, query->coordinates,
index->centroids[i].centroid.coordinates);
if (d <= threshold) {
active_indices[active_count++] = i;
}
}
return active_count;
}
/**
* Executes the SPANN search.
* Mimics SSD I/O by seeking and reading contiguous posting blocks.
*/
neighbor_t search_spann(const spann_index_t *restrict index,
const vector_t *restrict query,
const float epsilon_2) {
uint32_t *active_lists = malloc(sizeof(uint32_t) * index->num_centroids);
if (!active_lists) return (neighbor_t) {
0, INFINITY
};
// We limit to 16 posting lists per query for latency control
size_t num_lists = select_posting_lists(index, query, epsilon_2, active_lists, 16);
neighbor_t best_neighbor = {0, INFINITY};
float *buffer = malloc(index->dimension * sizeof(float));
if (!buffer) {
free(active_lists);
return (neighbor_t) {
0, INFINITY
};
}
for (size_t i = 0; i < num_lists; ++i) {
posting_meta_t *meta = &index->centroids[active_lists[i]];
// Seek to disk-resident posting list
fseek(index->posting_file, (long)meta->disk_offset, SEEK_SET);
for (uint32_t j = 0; j < meta->list_size; ++j) {
if (fread(buffer, sizeof(float), index->dimension, index->posting_file) != index->dimension) {
continue;
}
float d = l2_sq_dist(index->dimension, query->coordinates, buffer);
if (d < best_neighbor.distance) {
best_neighbor.distance = d;
best_neighbor.id = (uint32_t)(meta->disk_offset / (index->dimension * sizeof(float))) + j;
}
}
}
free(buffer);
free(active_lists);
return best_neighbor;
}
// Memory Lifecycle Management
void destroy_index(spann_index_t *index) {
if (!index) return;
for (uint32_t i = 0; i < index->num_centroids; ++i) {
free(index->centroids[i].centroid.coordinates);
}
free(index->centroids);
if (index->posting_file) fclose(index->posting_file);
free(index);
}
/**
* Demo Entry Point
*/
int main(void) {
const uint32_t dim = 64;
const uint32_t num_centroids = 10;
const uint32_t vectors_per_list = 100;
const char *posting_filename = "postings.bin";
printf("--- SPANN Algorithm Demo ---\n");
printf("Initializing index: Dim=%u, Centroids=%u, Vectors/List=%u\n", dim, num_centroids, vectors_per_list);
// 1. Setup Mock Index and Posting File
spann_index_t *index = calloc(1, sizeof(spann_index_t));
if (!index) return 1;
index->dimension = dim;
index->num_centroids = num_centroids;
index->centroids = calloc(num_centroids, sizeof(posting_meta_t));
index->posting_file = fopen(posting_filename, "wb+");
if (!index->centroids || !index->posting_file) {
fprintf(stderr, "Initialization failed.\n");
destroy_index(index);
return 1;
}
// 2. Generate synthetic data
srand((unsigned int)time(NULL));
for (uint32_t i = 0; i < num_centroids; ++i) {
index->centroids[i].centroid.coordinates = malloc(dim * sizeof(float));
index->centroids[i].centroid.dimension = dim;
index->centroids[i].list_size = vectors_per_list;
index->centroids[i].disk_offset = ftell(index->posting_file);
// Random Centroid
for (uint32_t d = 0; d < dim; d++) {
index->centroids[i].centroid.coordinates[d] = (float)rand() / (float)RAND_MAX;
}
// Write "Vectors" near this centroid to the posting file
for (uint32_t v = 0; v < vectors_per_list; v++) {
float vec[64];
for (uint32_t d = 0; d < dim; d++) {
// Jittering around the centroid
vec[d] = index->centroids[i].centroid.coordinates[d] + ((float)rand() / (float)RAND_MAX * 0.1f);
}
fwrite(vec, sizeof(float), dim, index->posting_file);
}
}
fflush(index->posting_file);
// 3. Define Query
vector_t query;
query.dimension = dim;
query.coordinates = malloc(dim * sizeof(float));
for (uint32_t d = 0; d < dim; d++) {
query.coordinates[d] = (float)rand() / (float)RAND_MAX;
}
// 4. Perform Search
float epsilon_2 = 0.5f; // Threshold parameter from SPANN paper
printf("Executing search with epsilon_2 = %.2f...\n", epsilon_2);
clock_t start = clock();
neighbor_t result = search_spann(index, &query, epsilon_2);
clock_t end = clock();
double time_spent = (double)(end - start) / CLOCKS_PER_SEC;
// 5. Display Results
if (result.distance == INFINITY) {
printf("Search failed or no neighbors found.\n");
} else {
printf("Nearest Neighbor ID: %u\n", result.id);
printf("Squared L2 Distance: %f\n", result.distance);
printf("Search Latency: %f ms\n", time_spent * 1000.0);
}
// Cleanup
free(query.coordinates);
destroy_index(index);
remove(posting_filename); // Remove mock disk file
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment