Last active
December 11, 2023 04:32
-
-
Save kacperlukawski/2d3a3225f15a4cc5772cd1c81866340d to your computer and use it in GitHub Desktop.
Qdrant tips&tricks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T09:01:38.772705Z", | |
| "start_time": "2023-03-13T09:01:38.627212Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import config\n", | |
| "import func" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T09:01:39.116822Z", | |
| "start_time": "2023-03-13T09:01:38.774713Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from tqdm import tqdm\n", | |
| "from qdrant_client import QdrantClient\n", | |
| "from qdrant_client.http import models as rest" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Basic connection" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T09:01:39.205014Z", | |
| "start_time": "2023-03-13T09:01:39.118666Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "client = QdrantClient(\n", | |
| " url=\"http://localhost\",\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T09:01:39.952500Z", | |
| "start_time": "2023-03-13T09:01:39.207647Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "client.recreate_collection(\n", | |
| " collection_name=config.COLLECTION_NAME,\n", | |
| " vectors_config=rest.VectorParams(\n", | |
| " size=config.VECTOR_SIZE,\n", | |
| " distance=rest.Distance.COSINE,\n", | |
| " )\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T09:14:40.339461Z", | |
| "start_time": "2023-03-13T09:01:39.953855Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "101it [13:00, 7.73s/it] " | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 12min 7s, sys: 5.23 s, total: 12min 12s\n", | |
| "Wall time: 13min\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "\n", | |
| "max_num = 50_000\n", | |
| "\n", | |
| "batch_size = config.BATCH_SIZE\n", | |
| "objects = func.iterate_objects(max_num=max_num)\n", | |
| "batched_objects = func.batchify_objects(objects, n=batch_size)\n", | |
| "for batch in tqdm(batched_objects, total=max_num // batch_size):\n", | |
| " ids, vectors, payloads = batch\n", | |
| " client.upsert(\n", | |
| " collection_name=config.COLLECTION_NAME,\n", | |
| " points=rest.Batch(\n", | |
| " ids=ids,\n", | |
| " vectors=vectors,\n", | |
| " payloads=payloads,\n", | |
| " )\n", | |
| " )" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T09:14:40.348596Z", | |
| "start_time": "2023-03-13T09:14:40.342240Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=50001, indexed_vectors_count=47500, points_count=50001, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=2048, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0)), payload_schema={})" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "client.get_collection(config.COLLECTION_NAME)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# gRPC protocol" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T09:14:40.600739Z", | |
| "start_time": "2023-03-13T09:14:40.349853Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "client = QdrantClient(\n", | |
| " url=\"http://localhost\",\n", | |
| " prefer_grpc=True,\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T09:14:41.047721Z", | |
| "start_time": "2023-03-13T09:14:40.602615Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "client.recreate_collection(\n", | |
| " collection_name=config.COLLECTION_NAME,\n", | |
| " vectors_config=rest.VectorParams(\n", | |
| " size=config.VECTOR_SIZE,\n", | |
| " distance=rest.Distance.COSINE,\n", | |
| " )\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T09:21:29.766200Z", | |
| "start_time": "2023-03-13T09:14:41.049769Z" | |
| }, | |
| "pycharm": { | |
| "is_executing": true | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "51it [06:48, 8.01s/it] " | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 6min 10s, sys: 3.34 s, total: 6min 13s\n", | |
| "Wall time: 6min 48s\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "\n", | |
| "batch_size = batch_size * 2\n", | |
| "objects = func.iterate_objects(max_num=max_num)\n", | |
| "batched_objects = func.batchify_objects(objects, n=batch_size)\n", | |
| "for batch in tqdm(batched_objects, total=max_num // batch_size):\n", | |
| " ids, vectors, payloads = batch\n", | |
| " client.upsert(\n", | |
| " collection_name=config.COLLECTION_NAME,\n", | |
| " points=rest.Batch(\n", | |
| " ids=ids,\n", | |
| " vectors=vectors,\n", | |
| " payloads=payloads,\n", | |
| " )\n", | |
| " )" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T09:21:29.777569Z", | |
| "start_time": "2023-03-13T09:21:29.770752Z" | |
| }, | |
| "pycharm": { | |
| "is_executing": true | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "CollectionInfo(status=<CollectionStatus.YELLOW: 'yellow'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=50001, indexed_vectors_count=45000, points_count=50001, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=2048, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0)), payload_schema={})" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "client.get_collection(config.COLLECTION_NAME)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Indexing threshold" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T11:47:30.851036Z", | |
| "start_time": "2023-03-13T11:47:30.382652Z" | |
| }, | |
| "pycharm": { | |
| "is_executing": true | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "client.recreate_collection(\n", | |
| " collection_name=config.COLLECTION_NAME,\n", | |
| " vectors_config=rest.VectorParams(\n", | |
| " size=config.VECTOR_SIZE,\n", | |
| " distance=rest.Distance.COSINE,\n", | |
| " ),\n", | |
| " optimizers_config=rest.OptimizersConfigDiff(\n", | |
| " indexing_threshold=1_000_000_000, # 1B KBs (threshold is in kilobytes), so indexing is postponed during the upload\n", | |
| " )\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T11:52:00.538505Z", | |
| "start_time": "2023-03-13T11:47:30.852851Z" | |
| }, | |
| "pycharm": { | |
| "is_executing": true | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "51it [04:29, 5.29s/it] " | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 4min 19s, sys: 1.8 s, total: 4min 20s\n", | |
| "Wall time: 4min 29s\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "\n", | |
| "objects = func.iterate_objects(max_num=max_num)\n", | |
| "batched_objects = func.batchify_objects(objects, n=batch_size)\n", | |
| "for batch in tqdm(batched_objects, total=max_num // batch_size):\n", | |
| " ids, vectors, payloads = batch\n", | |
| " client.upsert(\n", | |
| " collection_name=config.COLLECTION_NAME,\n", | |
| " points=rest.Batch(\n", | |
| " ids=ids,\n", | |
| " vectors=vectors,\n", | |
| " payloads=payloads,\n", | |
| " )\n", | |
| " )" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T11:52:00.546753Z", | |
| "start_time": "2023-03-13T11:52:00.541890Z" | |
| }, | |
| "pycharm": { | |
| "is_executing": true | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=50001, indexed_vectors_count=0, points_count=50001, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=2048, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=1000000000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0)), payload_schema={})" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "client.get_collection(config.COLLECTION_NAME)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T11:52:00.634095Z", | |
| "start_time": "2023-03-13T11:52:00.548790Z" | |
| }, | |
| "pycharm": { | |
| "is_executing": true | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "client.update_collection(\n", | |
| " collection_name=config.COLLECTION_NAME,\n", | |
| " optimizer_config=rest.OptimizersConfigDiff(\n", | |
| " indexing_threshold=10_000, # 10K KBs (threshold is in kilobytes)\n", | |
| " )\n", | |
| ")\n", | |
| "\n", | |
| "while True:\n", | |
| " collection_info = client.get_collection(collection_name=config.COLLECTION_NAME)\n", | |
| " if collection_info.status == rest.CollectionStatus.GREEN:\n", | |
| " break" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2023-03-13T11:52:00.641505Z", | |
| "start_time": "2023-03-13T11:52:00.636482Z" | |
| }, | |
| "pycharm": { | |
| "is_executing": true | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=50001, indexed_vectors_count=0, points_count=50001, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=2048, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=10000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0)), payload_schema={})" | |
| ] | |
| }, | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "client.get_collection(config.COLLECTION_NAME)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.10.6" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 1 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment