Theseus Semantic Search Example
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a2090cf9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populated lookup in 0.98 seconds\n"
]
}
],
"source": [
"import os\n",
"import pyarrow.parquet as pq\n",
"from pathlib import Path\n",
"from voltrondata_sdk.client import ViceClient\n",
"\n",
"vice_client = ViceClient.create_insecure('vd-control-plane-user-api:3001')\n",
"vice_client.admin_login(os.environ[\"ADMIN_EMAIL\"], os.environ[\"ADMIN_PASSWORD\"])"
]
},
{
"cell_type": "markdown",
"id": "105f211f",
"metadata": {},
"source": [
"# Wikipedia Dataset Downloader\n",
"\n",
"This notebook provides a clean interface for downloading Wikipedia dataset files from Hugging Face, which is split into 41 parquet files (from 00000 to 00040).\n",
"\n",
"The implementation details are in the `wikipedia_helpers.py` file for better organization."
]
},
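{
"cell_type": "markdown",
"id": "wikipedia_helpers_sketch_md",
"metadata": {},
"source": [
"The `wikipedia_helpers.py` module itself is not included in this gist. The cell below is a minimal, hypothetical sketch of what `create_output_directory` and `download_wikipedia_dataset` could look like, assuming the shards are fetched over HTTPS from Hugging Face into a local directory; the URL pattern and error handling are assumptions, and the real helpers may differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "wikipedia_helpers_sketch_code",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of wikipedia_helpers.py -- the real implementation may differ.\n",
"import time\n",
"import urllib.request\n",
"from pathlib import Path\n",
"\n",
"# Assumed URL pattern for the 41 English Wikipedia parquet shards on Hugging Face.\n",
"BASE_URL = (\n",
"    \"https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/\"\n",
"    \"20231101.en/train-{idx:05d}-of-00041.parquet\"\n",
")\n",
"\n",
"def create_output_directory(path):\n",
"    # Create the output directory (and any parents) if it does not already exist.\n",
"    out = Path(path)\n",
"    out.mkdir(parents=True, exist_ok=True)\n",
"    return str(out)\n",
"\n",
"def download_wikipedia_dataset(output_dir, start_index=0, end_index=40, delay=1):\n",
"    # Download shards start_index..end_index, skipping files that already exist.\n",
"    successful, failed = [], []\n",
"    for idx in range(start_index, end_index + 1):\n",
"        name = f\"train-{idx:05d}-of-00041.parquet\"\n",
"        dest = Path(output_dir) / name\n",
"        if dest.exists():\n",
"            successful.append(name)\n",
"            continue\n",
"        try:\n",
"            urllib.request.urlretrieve(BASE_URL.format(idx=idx), dest)\n",
"            successful.append(name)\n",
"        except Exception:\n",
"            failed.append(name)\n",
"        time.sleep(delay)\n",
"    return successful, failed"
]
},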
{
"cell_type": "code",
"execution_count": 2,
"id": "e47755a8",
"metadata": {},
"outputs": [],
"source": [
"# Import our helper functions\n",
"from wikipedia_helpers import (\n",
" create_output_directory,\n",
" download_wikipedia_dataset,\n",
" )\n",
"\n",
"# Additional imports for advanced usage\n",
"import os\n",
"import pandas as pd\n",
"\n",
"model_name = \"sentence-transformers/all-MiniLM-L6-v2\"\n"
]
},
{
"cell_type": "markdown",
"id": "1b336fa5",
"metadata": {},
"source": [
"## 1. Create Output Directory\n",
"\n",
"First, let's create a directory to store our downloaded files."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8f7316c2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Files will be saved to: /home/jupyter/data/wikipedia/text\n"
]
}
],
"source": [
"# Create the directory for storing files\n",
"output_dir = create_output_directory('/home/jupyter/data/wikipedia/text')\n",
"print(f\"Files will be saved to: {output_dir}\")"
]
},
{
"cell_type": "markdown",
"id": "5b6da54d",
"metadata": {},
"source": [
"## 2. Download Wikipedia Dataset\n",
"\n",
"Now let's download all 41 files from the Wikipedia dataset. You can customize the download by specifying:\n",
"\n",
"- `start_index`: The first file index to download (default: 0)\n",
"- `end_index`: The last file index to download (default: 40)\n",
"- `delay`: Time to wait between downloads in seconds (default: 1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c902a4c4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading file 1/1: train-00000-of-00041.parquet\n",
"File already exists: /home/jupyter/data/gs:/voltrondata-demo-tests/ln-udf-rag/wiki/train-00000-of-00041.parquet\n",
"\n",
"Download summary:\n",
" - Successfully downloaded: 1/1 files\n",
" - Failed downloads: 0\n"
]
}
],
"source": [
"# Download all files (from 00000 to 00040)\n",
"# You can modify parameters to download only a subset of files\n",
"successful, failed = download_wikipedia_dataset(output_dir=\"gs://voltrondata-demo-tests/ln-udf-rag/wiki\", end_index=0)\n",
"\n",
"# If you want to download just a few files for testing, uncomment the following line:\n",
"# successful, failed = download_wikipedia_dataset(start_index=0, end_index=2)"
]
},
{
"cell_type": "markdown",
"id": "5a02eacb",
"metadata": {},
"source": [
"## 3. Starting up the engine"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b76ae435",
"metadata": {},
"outputs": [],
"source": [
"engine_template = vice_client.get_engine_template(name=\"small_engine_rag\")\n",
"# engine_template = vice_client.get_engine_template(name=\"small_engine\")\n",
"\n",
"role = vice_client.get_role(name=\"default\")\n",
"\n",
"engine = vice_client.list_engines()[0]\n",
"# Create the engine\n",
"#engine = vice_client.create_engine(engine_template_id=engine_template, role=role)\n",
"#engine"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2ca462e1-2f75-4642-b7b6-93b39fffe088",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<ibis_theseus.backend.Backend at 0x7c7c5dcffbe0>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Wait until the engine is ready for use\n",
"vice_client.wait_for_engine(engine)# Create an Ibis connection\n",
"ibis_con = engine.ibis_connect()\n",
"con = engine.connect()\n",
"ibis_con"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "71d85466",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"156289\n"
]
}
],
"source": [
"# Create a table from the parquet files we just downloaded using Ibis\n",
"# Define the path to our downloaded Wikipedia dataset\n",
"import ibis\n",
"wikipedia_dir = os.path.join(output_dir, '*.parquet')\n",
"parquet_files = [\n",
" os.path.join(output_dir, f)\n",
" for f in os.listdir(output_dir)\n",
" if f.endswith(\".parquet\")\n",
"]\n",
"ibis_con.raw_sql(\"\"\"CREATE FILESYSTEM \"voltrondata-demo-tests\" (\n",
" TYPE GCS,\n",
" PROJECT_ID 'voltrondata-demo'\n",
")\"\"\").execute()\n",
"print(datasource.num_rows)\n",
"#datasource = pq.read_table(os.path.join(os.getcwd(), \"data\", \"customer.parquet\"))\n",
"#print(datasource)\n",
"#datasource = pq.read_table(parquet_files)\n",
"#wikipedia_table = ibis_con.create_table(\n",
"# name=\"wikipedia\", \n",
"# obj=datasource[0:100],\n",
"# overwrite=True\n",
"#)\n",
"\n",
"\n",
"wikipedia_table = ibis_con.read_parquet(\"gs://voltrondata-demo-tests/ln-udf-rag/wiki/train-00000-of-00041.parquet\",table_name='wikipedia')\n",
"\n",
"# Show the table schema\n",
"print(\"Schema of the Wikipedia dataset:\")\n",
"print(wikipedia_table.schema())\n",
"\n",
"# Display a sample of the data\n",
"print(\"\\nSample of the Wikipedia data (5 rows):\")\n",
"wikipedia_sample = wikipedia_table.limit(5).execute()\n",
"print(wikipedia_sample)\n",
"\n",
"# Count the total number of rows\n",
"total_count = wikipedia_table.count().execute()\n",
"print(f\"\\nTotal number of Wikipedia articles: {total_count:,}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "148fad41",
"metadata": {},
"outputs": [],
"source": [
"#path = \"data/wikipedia_data_small/\"\n",
"#from theseus_gateway_client import WriteOptions, ParquetWriteOptions, CompressionType, StatsLevel\n",
"#!mkdir -p {path}\n",
"#result = ibis_con.con.sql(\"select * from wikipedia limit 100\",\n",
"# output_type=\"parquet\", \n",
"# output_path=str(path),\n",
"# write_options=WriteOptions(\n",
"# file_format_options=ParquetWriteOptions(\n",
"# row_group_size_rows=131072,\n",
"# compression_type=CompressionType.ZSTD,\n",
"# statistics_frequency=StatsLevel('row_group'),\n",
"# )\n",
"# )\n",
"#)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e93e58ee",
"metadata": {},
"outputs": [],
"source": [
"#path = \"/home/felipe/llm_experiments/wikipedia_data_small/\"\n",
"#wikipedia_table = con.read_parquet(path,table_name='wikipedia')\n",
"#con.sql(\"select * from wikipedia\").to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "175d2b62",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"embeddings_path = \"gs://voltrondata-demo-tests/ln-udf-rag/embeds\"\n",
"\n",
"# Ensure the embeddings directory exists\n",
"#Clean up old embeddings\n",
"#!rm -rf {embeddings_path}*"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c707b6e7",
"metadata": {},
"outputs": [],
"source": [
"#Lets make the parquet row groups more evenly sized\n",
"#from theseus_gateway_client import StatsLevel\n",
"#from theseus_gateway_client import WriteOptions, ParquetWriteOptions, CompressionType\n",
"\n",
"#path = \"wikipedia/text_resized/\"\n",
"\n",
"#clean up old files\n",
"#!rm -rf {path}*.parquet\n",
"\n",
"#result = con.con.sql(\"select * from wikipedia\",\n",
"# output_type=\"parquet\", d\n",
"# output_path=str(path),\n",
"# write_options=WriteOptions(\n",
"# file_format_options=ParquetWriteOptions(\n",
"# row_group_size_bytes=2**24, # 16MB\n",
"# compression_type=CompressionType.ZSTD,\n",
"# statistics_frequency=StatsLevel('row_group'),\n",
"# )\n",
"# )\n",
"#)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d3b87838",
"metadata": {},
"outputs": [],
"source": [
"#path = \"/data1/scratch/felipe/wikipedia/text_resized/\"\n",
"\n",
"#wikipedia_table = con.read_parquet(path,table_name='wikipedia')"
]
},
{
"cell_type": "markdown",
"id": "f76684fc",
"metadata": {},
"source": [
"# 5. Embed the data"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "867792a9",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "95b6cd52c490436691d0d0e1f9fe042d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Tasks Completed: 0task [00:00, ?task/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2d7c0b0e9c0c4604822c67532f69adec",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Bytes Scanned: 0.00B [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">DatabaseTable: wikipedia_embed\n",
" embedding_id int64\n",
" batch_id string\n",
" source_url string\n",
" embedding string\n",
" texts string\n",
"</pre>\n"
],
"text/plain": [
"DatabaseTable: wikipedia_embed\n",
" embedding_id int64\n",
" batch_id string\n",
" source_url string\n",
" embedding string\n",
" texts string"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibis_con.raw_sql(\"\"\"CREATE FILESYSTEM \"voltrondata-demo-tests\" (\n",
" TYPE GCS,\n",
" PROJECT_ID 'voltrondata-demo'\n",
")\"\"\").execute()\n",
"embedded_df = ibis_con.sql(f\"\"\"\n",
" SELECT \n",
" rag.embed_text_series(url, text, '{embeddings_path}', '{model_name}') as result,\n",
" url AS url\n",
" FROM wikipedia where\n",
" text <> ''\n",
"\"\"\").to_pandas()\n",
"\n",
"ibis_con.read_parquet(embeddings_path,table_name='wikipedia_embed')"
]
},
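{
"cell_type": "markdown",
"id": "local_embedding_check_md",
"metadata": {},
"source": [
"As a point of reference, the next cell sketches how the same `all-MiniLM-L6-v2` embeddings can be computed locally with the `sentence-transformers` package. It is only a sanity check on a couple of strings run on the notebook host; the `rag.embed_text_series` UDF above is what performs the embedding at scale on the engine."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "local_embedding_check_code",
"metadata": {},
"outputs": [],
"source": [
"# Local sanity check: embed a few strings with the same model the UDF uses.\n",
"# Assumes the sentence-transformers package is installed on the notebook host.\n",
"from sentence_transformers import SentenceTransformer\n",
"\n",
"local_model = SentenceTransformer(model_name)  # sentence-transformers/all-MiniLM-L6-v2\n",
"sample_texts = [\n",
"    \"Cuneiform was used to write the Babylonian language.\",\n",
"    \"ASCII is a character encoding standard.\",\n",
"]\n",
"sample_embeddings = local_model.encode(sample_texts)\n",
"print(sample_embeddings.shape)  # (2, 384): MiniLM-L6-v2 produces 384-dimensional vectors"
]
},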
{
"cell_type": "markdown",
"id": "70eb1fb1",
"metadata": {},
"source": [
"# 6. Build index on embeddings using CAGRA"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b78c50ae",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "881f286d0aa441d4a8d598b50a1536fc",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Tasks Completed: 0task [00:00, ?task/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "00771f6ee79d42c8ae74499f858611c4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Bytes Scanned: 0.00B [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"ibis_con.read_parquet(embeddings_path,table_name='wikipedia_embed')\n",
"index_dir = \"/data/index\"\n",
"output_dir = \"gs://voltrondata-demo-tests/ln-udf-rag/index\"\n",
"\n",
"#Returns the path to where the indeces were written\n",
"index_df = ibis_con.sql(f\"\"\"\n",
" SELECT \n",
" rag.build_vector_index(embedding, texts, source_url, '{output_dir}', '{index_dir}')\n",
" FROM wikipedia_embed\n",
"\n",
"\"\"\").to_pandas()\n"
]
},
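{
"cell_type": "markdown",
"id": "brute_force_baseline_md",
"metadata": {},
"source": [
"The CAGRA index built above is a GPU-accelerated approximate nearest-neighbor structure. The next cell is purely illustrative and not part of the pipeline: it runs an exact, brute-force L2 search over a small set of random vectors with NumPy, which is the ranking an approximate index like CAGRA trades a full scan for."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "brute_force_baseline_code",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: exact (brute-force) nearest-neighbor search with NumPy.\n",
"# An approximate index such as CAGRA avoids scanning every vector like this.\n",
"import numpy as np\n",
"\n",
"rng = np.random.default_rng(0)\n",
"corpus = rng.normal(size=(1000, 384)).astype(np.float32)  # stand-in for 384-dim embeddings\n",
"query_vec = rng.normal(size=(384,)).astype(np.float32)\n",
"\n",
"l2 = np.linalg.norm(corpus - query_vec, axis=1)  # L2 distance to every corpus vector\n",
"top_k = np.argsort(l2)[:5]                       # indices of the 5 closest vectors\n",
"print(top_k, l2[top_k])"
]
},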
{
"cell_type": "markdown",
"id": "0236a082",
"metadata": {},
"source": [
"# 7. Query the Index"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "ed5410b6",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9da8e92757a3427f82b17e580e3be7c5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Tasks Completed: 0task [00:00, ?task/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ee58f912adec4d31b5de7dd463eb5ea1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Bytes Scanned: 0.00B [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0\n",
"0 /data/index/vector_index_c5b76e61-9e5a-42d0-b5...\n",
"1 /data/index/vector_index_2da29572-8568-4844-87...\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4e37020da6364e7599530d40f38c0ba6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Tasks Completed: 0task [00:00, ?task/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b615c114b6d2478b9e13834a137fa63e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Bytes Scanned: 0.00B [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "afb5e6ffefc445d6aa83a9aa6489ab97",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Tasks Completed: 0task [00:00, ?task/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "71e8c9350ea54d7c8ed687b2ea50686b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Bytes Scanned: 0.00B [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>url</th>\n",
" <th>distance</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>History\\n\\nAncient Near Eastern alphabets\\nThe...</td>\n",
" <td>https://en.wikipedia.org/wiki/Alphabet</td>\n",
" <td>43.276222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>History\\n\\nAncient Near Eastern alphabets\\nThe...</td>\n",
" <td>https://en.wikipedia.org/wiki/Alphabet</td>\n",
" <td>43.276222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>History\\n\\nAncient Near Eastern alphabets\\nThe...</td>\n",
" <td>https://en.wikipedia.org/wiki/Alphabet</td>\n",
" <td>43.276222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>All three types may be augmented with syllabic...</td>\n",
" <td>https://en.wikipedia.org/wiki/Alphabet</td>\n",
" <td>44.606064</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>All three types may be augmented with syllabic...</td>\n",
" <td>https://en.wikipedia.org/wiki/Alphabet</td>\n",
" <td>44.606064</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>An intermediate order converts uppercase lette...</td>\n",
" <td>https://en.wikipedia.org/wiki/ASCII</td>\n",
" <td>49.615658</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>An intermediate order converts uppercase lette...</td>\n",
" <td>https://en.wikipedia.org/wiki/ASCII</td>\n",
" <td>49.615658</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>ITA2 was in turn based on the 5-bit telegraph ...</td>\n",
" <td>https://en.wikipedia.org/wiki/ASCII</td>\n",
" <td>49.625916</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94</th>\n",
" <td>ITA2 was in turn based on the 5-bit telegraph ...</td>\n",
" <td>https://en.wikipedia.org/wiki/ASCII</td>\n",
" <td>49.625916</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>ITA2 was in turn based on the 5-bit telegraph ...</td>\n",
" <td>https://en.wikipedia.org/wiki/ASCII</td>\n",
" <td>49.625916</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" text \\\n",
"0 History\\n\\nAncient Near Eastern alphabets\\nThe... \n",
"1 History\\n\\nAncient Near Eastern alphabets\\nThe... \n",
"2 History\\n\\nAncient Near Eastern alphabets\\nThe... \n",
"3 All three types may be augmented with syllabic... \n",
"4 All three types may be augmented with syllabic... \n",
".. ... \n",
"91 An intermediate order converts uppercase lette... \n",
"92 An intermediate order converts uppercase lette... \n",
"93 ITA2 was in turn based on the 5-bit telegraph ... \n",
"94 ITA2 was in turn based on the 5-bit telegraph ... \n",
"95 ITA2 was in turn based on the 5-bit telegraph ... \n",
"\n",
" url distance \n",
"0 https://en.wikipedia.org/wiki/Alphabet 43.276222 \n",
"1 https://en.wikipedia.org/wiki/Alphabet 43.276222 \n",
"2 https://en.wikipedia.org/wiki/Alphabet 43.276222 \n",
"3 https://en.wikipedia.org/wiki/Alphabet 44.606064 \n",
"4 https://en.wikipedia.org/wiki/Alphabet 44.606064 \n",
".. ... ... \n",
"91 https://en.wikipedia.org/wiki/ASCII 49.615658 \n",
"92 https://en.wikipedia.org/wiki/ASCII 49.615658 \n",
"93 https://en.wikipedia.org/wiki/ASCII 49.625916 \n",
"94 https://en.wikipedia.org/wiki/ASCII 49.625916 \n",
"95 https://en.wikipedia.org/wiki/ASCII 49.625916 \n",
"\n",
"[96 rows x 3 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"k = 32\n",
"query = \"tell me about babylonian writing system\"\n",
"index_dir = \"gs://voltrondata-demo-tests/ln-udf-rag/index\"\n",
"ibis_con.read_csv(index_dir,table_name='index_files',header=False)\n",
"results_dir = \"gs://voltrondata-demo-tests/ln-udf-rag/results\"\n",
"vector_index_dir = \"gs://voltrondata-demo-tests/ln-udf-rag/index/vector\"\n",
"result_files = ibis_con.sql(f\"\"\"\n",
" SELECT \n",
" *\n",
" FROM index_files\n",
"\"\"\").to_pandas()\n",
"\n",
"print(result_files)\n",
"\n",
"result_files = ibis_con.sql(f\"\"\"\n",
" SELECT \n",
" rag.search_vector_index(index_files.\"0\", '{query}' , '{vector_index_dir}', '{results_dir}' , {k}, '{model_name}')\n",
" FROM index_files\n",
"\"\"\").to_pandas()\n",
"\n",
"\n",
"#Gather and print results\n",
"results_table = ibis_con.read_parquet(results_dir,table_name='results')\n",
"results = con.sql(f\"\"\"\n",
" SELECT \n",
"* FROM results\n",
" order by distance asc\n",
" limit 128\n",
" \"\"\").to_pandas()\n",
"results\n"
]
},
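{
"cell_type": "markdown",
"id": "dedup_results_md",
"metadata": {},
"source": [
"The result set above contains repeated chunks (the same text appears several times). The next cell is an optional, minimal clean-up step: it drops duplicate texts with pandas before the rows are handed to the agent."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dedup_results_code",
"metadata": {},
"outputs": [],
"source": [
"# Optional: keep one copy of each retrieved chunk before building the prompt.\n",
"unique_results = results.drop_duplicates(subset=['text']).reset_index(drop=True)\n",
"print(f\"{len(results)} rows -> {len(unique_results)} unique chunks\")\n",
"unique_results.head()"
]
},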
{
"cell_type": "code",
"execution_count": null,
"id": "vodaagent_example",
"metadata": {},
"outputs": [],
"source": [
"from build_rag_helper import VodaAgent\n",
"import pyarrow\n",
"\n",
"agent = VodaAgent()\n",
"question = 'Tell me about Babylonian writing system'\n",
"context = pyarrow.array(results['text'].tolist())\n",
"answer = agent.ask(question, context)\n",
"print(answer.to_pylist()[0])\n"
]
},
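{
"cell_type": "markdown",
"id": "rag_prompt_sketch_md",
"metadata": {},
"source": [
"The `VodaAgent` helper lives in `build_rag_helper.py`, which is not shown here. The next cell is a rough, hypothetical sketch of the prompt assembly a retrieval-augmented agent typically performs with the retrieved rows; the actual `VodaAgent.ask` implementation may format the context and call the model differently."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "rag_prompt_sketch_code",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of RAG prompt assembly from the retrieved results.\n",
"# The real VodaAgent.ask may build and send its prompt differently.\n",
"def build_rag_prompt(question, contexts, max_chunks=5):\n",
"    # Keep the top-ranked chunks and join them into one context block.\n",
"    context_block = \"\\n\\n\".join(contexts[:max_chunks])\n",
"    return (\n",
"        \"Answer the question using only the context below.\\n\\n\"\n",
"        f\"Context:\\n{context_block}\\n\\n\"\n",
"        f\"Question: {question}\\n\"\n",
"        \"Answer:\"\n",
"    )\n",
"\n",
"prompt = build_rag_prompt(question, results['text'].tolist())\n",
"print(prompt[:500])"
]
},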
{
"cell_type": "code",
"execution_count": null,
"id": "628d526c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}