Last active
September 8, 2025 01:02
-
-
Save chriscarrollsmith/42c426f38595c78f3bd8e4bcda76acdc to your computer and use it in GitHub Desktop.
Quick script for inspecting the file_search tool results returned from an OpenAI embeddings vector store
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import sys | |
| from typing import Any, Dict, List | |
| from openai import OpenAI | |
def ensure_api_key() -> None:
    """Terminate the process unless OPENAI_API_KEY is set to a non-empty value.

    An unset variable and an empty string are treated the same way: an error
    message is printed and the interpreter exits with status 1. When the key
    is present the function simply returns.
    """
    if os.environ.get("OPENAI_API_KEY"):
        return
    print("ERROR: Please set OPENAI_API_KEY in your environment.")
    sys.exit(1)
def to_primitive(value: Any) -> Any:
    """Best-effort conversion of *value* into plain dicts, lists and scalars.

    Conversion strategies are tried in this order:

    1. A callable ``model_dump`` attribute is invoked with
       ``exclude_none=True`` and its result returned as-is.
    2. Dicts and lists are rebuilt with every contained value converted
       recursively (dict keys are kept untouched).
    3. Any other object exposing ``__dict__`` becomes a dict of its
       attributes whose names do not start with an underscore.

    If a strategy raises, the next one is tried; when none applies the value
    is returned unchanged.
    """
    dumper = getattr(value, "model_dump", None)
    if callable(dumper):
        try:
            return dumper(exclude_none=True)
        except Exception:
            # Fall through to the structural strategies below.
            pass

    if isinstance(value, dict):
        converted = {}
        for key, item in value.items():
            converted[key] = to_primitive(item)
        return converted

    if isinstance(value, list):
        return list(map(to_primitive, value))

    if not isinstance(value, (str, bytes)) and hasattr(value, "__dict__"):
        try:
            public_attrs = {}
            for name, attr in value.__dict__.items():
                if name.startswith("_"):
                    continue
                public_attrs[name] = to_primitive(attr)
            return public_attrs
        except Exception:
            # Give up on introspection and hand the object back untouched.
            pass

    return value
def get_assistant_vector_store_ids(client: OpenAI, assistant_id: str) -> List[str]:
    """Return the vector store IDs attached to an assistant's file_search tool.

    The lookup is best-effort and never raises: any failure yields an empty
    list, with the reason reported on stderr so it is not silently lost.

    Args:
        client: OpenAI client; only ``client.beta.assistants.retrieve`` is
            called on it.
        assistant_id: ID of the assistant to inspect. A missing or empty ID
            returns ``[]`` without contacting the API.

    Returns:
        The IDs from ``assistant.tool_resources.file_search.vector_store_ids``
        converted to ``str``, or ``[]`` when the assistant cannot be
        retrieved or that attribute chain does not end in a list.
    """
    if not assistant_id:
        # Callers may pass the raw result of os.getenv(); a request without
        # an ID could only fail.
        return []

    try:
        assistant = client.beta.assistants.retrieve(assistant_id)
        # getattr with a default tolerates None at any link of the chain.
        tool_resources = getattr(assistant, "tool_resources", None)
        file_search = getattr(tool_resources, "file_search", None)
        vector_store_ids = getattr(file_search, "vector_store_ids", None)
        if not isinstance(vector_store_ids, list):
            return []
        return [str(v) for v in vector_store_ids]
    except Exception as exc:
        # Keep the "never raises" contract, but surface why nothing was found.
        print(
            f"WARNING: could not read vector stores for assistant "
            f"{assistant_id!r}: {exc}",
            file=sys.stderr,
        )
        return []
def _first_present(mapping: Dict[str, Any], *keys: str) -> Any:
    """Return the value of the first key in *keys* that is not None."""
    for key in keys:
        value = mapping.get(key)
        # Compare against None explicitly so that falsy-but-valid values
        # such as a score of 0.0 are not discarded.
        if value is not None:
            return value
    return None


def _extract_snippet(content: Any, limit: int = 400) -> Any:
    """Return display text taken from a result's ``content`` field, or None.

    Handles a plain string, or a non-empty list whose first element is a
    dict carrying ``text`` either as a string or as a dict with a ``value``
    key. Text longer than *limit* characters is cut and suffixed with "...".
    """
    text = None
    if isinstance(content, str):
        text = content
    elif isinstance(content, list) and content and isinstance(content[0], dict):
        maybe_text = content[0].get("text")
        if isinstance(maybe_text, dict):
            maybe_text = maybe_text.get("value")
        if isinstance(maybe_text, str):
            text = maybe_text
    if not text:
        return None
    return text if len(text) <= limit else text[:limit] + "..."


def main() -> None:
    """Run one vector store search and print the raw and summarized results.

    Inputs:
        * ``OPENAI_API_KEY`` must be set (checked by ``ensure_api_key``).
        * ``VECTOR_STORE_ID`` selects the store; when unset, the first store
          attached to the assistant named by ``ASSISTANT_ID`` is used.
        * ``sys.argv[1]`` is the query; defaults to "clean cooking".

    Exits with status 1 when no vector store can be determined and with
    status 2 when the installed SDK has no ``client.vector_stores.search``.
    """
    import json  # local import: the module header does not import json

    ensure_api_key()
    client = OpenAI()

    assistant_id = os.getenv("ASSISTANT_ID")
    vector_store_id = os.getenv("VECTOR_STORE_ID")
    if not vector_store_id:
        if not assistant_id:
            # Without either variable there is nothing to look up; say so
            # rather than blaming the (non-existent) assistant.
            print("Neither VECTOR_STORE_ID nor ASSISTANT_ID is set; set one of them.")
            sys.exit(1)
        vs_ids = get_assistant_vector_store_ids(client, assistant_id)
        if not vs_ids:
            print("No vector stores found on assistant and VECTOR_STORE_ID not set.")
            sys.exit(1)
        vector_store_id = vs_ids[0]

    query = sys.argv[1] if len(sys.argv) > 1 else "clean cooking"
    print(f"Using vector_store_id: {vector_store_id}")
    print(f"Query: {query}")

    # Perform direct vector store search (requires SDK that supports this
    # method). The client may lack `vector_stores` altogether on old SDKs,
    # so guard both attribute lookups.
    search = getattr(getattr(client, "vector_stores", None), "search", None)
    if search is None:
        print("This SDK version does not support client.vector_stores.search. Try upgrading openai.")
        sys.exit(2)

    try:
        # Note: Some SDK versions use `limit` instead of `max_num_results`.
        results = search(
            vector_store_id=vector_store_id,
            query=query,
            max_num_results=5,
        )
    except TypeError:
        # Try an alternative signature with `limit`.
        results = search(
            vector_store_id=vector_store_id,
            query=query,
            limit=5,
        )

    # Print raw shapes.
    prim = to_primitive(results)
    print("\nRaw search response shape:")
    try:
        print(json.dumps(prim, indent=2))
    except (TypeError, ValueError):
        # Not JSON-serializable (or circular): fall back to the plain repr.
        print(str(prim))

    # If there is a `data` field with chunk-like entries, pretty-print a
    # compact view.
    data = getattr(results, "data", None) or (
        prim.get("data") if isinstance(prim, dict) else None
    )
    if not (data and isinstance(data, list)):
        return

    print("\nTop chunks (compact view):")
    for idx, item in enumerate(data, start=1):
        item_prim = to_primitive(item)
        if not isinstance(item_prim, dict):
            # Unknown item shape: show it verbatim instead of crashing on .get().
            print(f"{idx}. (unrecognized item) {item_prim!r}")
            continue
        file_id = _first_present(item_prim, "file_id", "fileId")
        score = _first_present(item_prim, "score", "similarity")
        snippet = _extract_snippet(item_prim.get("content"))
        print(f"{idx}. file_id={file_id} score={score}\n text={snippet}")
# Run the search only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment