A binary format for storing vectors. Each record holds a 4-byte little-endian dimension followed by the vector's elements:
| dim (4 bytes) | vector (4 * dim bytes) |
| dim (4 bytes) | vector (4 * dim bytes) |
...
| dim (4 bytes) | vector (4 * dim bytes) |
| from struct import unpack, pack | |
| import numpy as np | |
def read_vec(filepath: str, vec_type: np.dtype = np.float32):
    """Read vectors from a file. Support `fvecs`, `ivecs` and `bvecs` format.

    Each record is a 4-byte little-endian signed dimension followed by
    `dim` elements of `vec_type`.

    Args:
        filepath: The path of the file.
        vec_type: The type of the vectors.

    Returns:
        The vectors stacked with `np.array`. A file with no records yields
        an empty 1-D array of `vec_type`.

    Raises:
        ValueError: If the file ends in the middle of a record or a
            dimension header is negative.
    """
    dtype = np.dtype(vec_type)
    # Elements are decoded as little-endian, matching the "<i" header.
    le_dtype = dtype.newbyteorder("<")
    vecs = []
    with open(filepath, "rb") as f:
        while True:
            header = f.read(4)
            if not header:
                # Clean end of file: no further record starts here.
                break
            if len(header) != 4:
                raise ValueError(
                    f"{filepath}: truncated dimension header "
                    f"({len(header)} of 4 bytes)"
                )
            dim = unpack("<i", header)[0]
            if dim < 0:
                # A negative size would make f.read() consume the whole file.
                raise ValueError(f"{filepath}: negative dimension {dim}")
            nbytes = dim * dtype.itemsize
            payload = f.read(nbytes)
            if len(payload) != nbytes:
                raise ValueError(
                    f"{filepath}: truncated vector "
                    f"({len(payload)} of {nbytes} bytes)"
                )
            vecs.append(np.frombuffer(payload, dtype=le_dtype))
    if not vecs:
        return np.empty((0,), dtype=dtype)
    return np.array(vecs)
def write_vec(filepath: str, vecs: np.ndarray, vec_type: np.dtype = np.float32):
    """Write vectors to a file. Support `fvecs`, `ivecs` and `bvecs` format.

    Each vector is written as a 4-byte little-endian signed length followed
    by its elements in little-endian byte order.

    Args:
        filepath: The path of the file; it is created or overwritten.
        vecs: The vectors to write, one record per item.
        vec_type: Element type used to convert rows that are not already
            NumPy arrays (for example plain Python lists). Rows that are
            NumPy arrays are written with their own dtype, so pass arrays
            that already have the on-disk element type.
    """
    with open(filepath, "wb") as f:
        for vec in vecs:
            if isinstance(vec, np.ndarray):
                # Keep the array's dtype so existing callers get the same bytes.
                row = vec
            else:
                row = np.asarray(vec, dtype=vec_type)
            # The header is explicitly little-endian; make the payload agree
            # (a no-op on little-endian hosts and for single-byte types).
            row = row.astype(row.dtype.newbyteorder("<"), copy=False)
            f.write(pack("<i", len(row)))
            f.write(row.tobytes())
| use std::fs::File; | |
| use std::io::{BufReader, BufWriter, Read, Write}; | |
| use std::path::Path; | |
| use num_traits::{FromBytes, ToBytes}; | |
| /// Read the fvces/ivces file. | |
| pub fn read_vecs<T>(path: &Path) -> std::io::Result<Vec<Vec<T>>> | |
| where | |
| T: Sized + FromBytes<Bytes = [u8; 4]>, | |
| { | |
| let file = File::open(path)?; | |
| let mut reader = BufReader::new(file); | |
| let mut buf = [0u8; 4]; | |
| let mut count: usize; | |
| let mut vecs = Vec::new(); | |
| loop { | |
| count = reader.read(&mut buf)?; | |
| if count == 0 { | |
| break; | |
| } | |
| let dim = u32::from_le_bytes(buf) as usize; | |
| let mut vec = Vec::with_capacity(dim); | |
| for _ in 0..dim { | |
| reader.read_exact(&mut buf)?; | |
| vec.push(T::from_le_bytes(&buf)); | |
| } | |
| vecs.push(vec); | |
| } | |
| Ok(vecs) | |
| } | |
| /// Write the fvecs/ivecs file. | |
| pub fn write_vecs<T>(path: &Path, vecs: &[impl AsRef<[T]>]) -> std::io::Result<()> | |
| where | |
| T: Sized + ToBytes, | |
| { | |
| let file = File::create(path)?; | |
| let mut writer = BufWriter::new(file); | |
| for vec in vecs.iter() { | |
| writer.write_all(&(vec.as_ref().len() as u32).to_le_bytes())?; | |
| for v in vec.as_ref().iter() { | |
| writer.write_all(T::to_le_bytes(v).as_ref())?; | |
| } | |
| } | |
| writer.flush()?; | |
| Ok(()) | |
| } |