Skip to content

Instantly share code, notes, and snippets.

@SharathHebbar
Created September 30, 2024 09:15
Show Gist options
  • Select an option

  • Save SharathHebbar/500b4921feef8cd14bafb3cab6a5f673 to your computer and use it in GitHub Desktop.

Select an option

Save SharathHebbar/500b4921feef8cd14bafb3cab6a5f673 to your computer and use it in GitHub Desktop.
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.vectorstores import Chroma
# Step 1: Load PDF Document
def load_pdf(pdf_path):
    """Load the PDF at ``pdf_path`` and return the documents the loader yields.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(pdf_path).load()`` returns.
    """
    return PyPDFLoader(pdf_path).load()
# Step 2: Chunk the Document
def chunk_documents(documents, chunk_size=1000, chunk_overlap=50):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        documents: Documents to split (e.g. the output of ``load_pdf``).
        chunk_size: Value passed to the splitter as ``chunk_size``.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 50, the value previously hard-coded here.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
# Step 3: Generate Embeddings in Batches
def embed_chunks_in_batches(chunks, batch_size=1):
    """Embed the text of every chunk, issuing one embedding call per batch.

    Args:
        chunks: Sequence of objects exposing a ``page_content`` attribute.
        batch_size: Number of chunks passed to each ``embed_documents``
            call. Must be at least 1.

    Returns:
        A list with one embedding per chunk, in the same order as ``chunks``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A zero step makes range() raise and a negative step silently yields
    # nothing, so reject both up front with a clear message.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    # Nothing to embed: skip building the client altogether.
    if not chunks:
        return []

    embeddings_model = AzureOpenAIEmbeddings(
        azure_deployment="cds_text_embedding_2",
        openai_api_version=os.getenv("AZURE_API_Version"),
    )

    embeddings = []
    for start in range(0, len(chunks), batch_size):
        batch_texts = [
            chunk.page_content for chunk in chunks[start:start + batch_size]
        ]
        # One call for the whole batch, so batch_size actually controls how
        # many texts are sent together instead of one call per chunk.
        embeddings.extend(embeddings_model.embed_documents(batch_texts))
    return embeddings
# Step 4: Store in Chroma
def store_in_chroma(chunks, embeddings, collection_name="pdf_chunk_collection"):
    """Add the chunk texts to a Chroma collection and return the store.

    Args:
        chunks: Sequence of objects exposing a ``page_content`` attribute.
        embeddings: Pre-computed embeddings, one per chunk, or ``None``.
        collection_name: Name of the Chroma collection to write to.

    Returns:
        The ``Chroma`` vector store, persisted under ``./chroma_storage``.

    Raises:
        ValueError: If ``embeddings`` is given but its length differs from
            the number of chunks.
    """
    # Catch misaligned inputs before anything is written to the store.
    if embeddings is not None and len(embeddings) != len(chunks):
        raise ValueError(
            f"got {len(chunks)} chunks but {len(embeddings)} embeddings"
        )

    texts = [chunk.page_content for chunk in chunks]
    ids = [f"chunk_{i}" for i in range(len(texts))]

    chroma_client = Chroma(
        collection_name=collection_name,
        persist_directory="./chroma_storage",  # Optional: directory to persist the data
    )

    # NOTE(review): LangChain's Chroma.add_texts appears to accept only
    # texts/metadatas/ids and to ignore an ``embeddings`` keyword, computing
    # vectors with the store's own embedding function instead. If so, the
    # pre-computed vectors are never stored; confirm against the installed
    # version and consider constructing Chroma with an ``embedding_function``.
    if texts:
        # Skip the insert entirely when there is nothing to add.
        chroma_client.add_texts(
            texts=texts,
            ids=ids,
            embeddings=embeddings,
        )
    return chroma_client
# Step 5: Query the Chroma Vector Store
def query_chroma(query_text, vector_store, k=5):
    """Return the stored chunks most similar to ``query_text``.

    Args:
        query_text: Natural-language query to search for.
        vector_store: Object exposing ``similarity_search(query, k=...)``,
            such as the store returned by ``store_in_chroma``.
        k: Maximum number of results to request. Defaults to 5.

    Returns:
        The list returned by ``vector_store.similarity_search``.
    """
    # LangChain vector stores take the result count as ``k``; ``n_results``
    # is the underlying chromadb name and is not accepted at this level.
    return vector_store.similarity_search(query_text, k=k)
# Step 6: Main Function to Execute the Workflow
def main(pdf_path, query):
    """Run the full pipeline on one PDF and print the matches for ``query``.

    The steps are: load the PDF, split it into chunks, embed the chunks,
    store them in Chroma, query the store, and print each hit.

    Args:
        pdf_path: Path of the PDF file to index.
        query: Question to search the indexed chunks for.
    """
    # Ingest: PDF -> documents -> chunks.
    chunks = chunk_documents(load_pdf(pdf_path))

    # Index: embed every chunk, then persist texts and vectors.
    chunk_embeddings = embed_chunks_in_batches(chunks, batch_size=1)
    vector_store = store_in_chroma(chunks, chunk_embeddings)

    # Retrieve and report, numbering hits from 1.
    hits = query_chroma(query, vector_store)
    for position, hit in enumerate(hits, start=1):
        print(f"Result {position}:")
        print(hit.page_content)
        print("\n---\n")
# Example Usage
if __name__ == "__main__":
    # Placeholder inputs: point these at a real PDF and question before running.
    pdf_file_path = "your_pdf_file.pdf"
    user_query = "What is the document about?"

    # Kick off the load -> chunk -> embed -> store -> query pipeline.
    main(pdf_path=pdf_file_path, query=user_query)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment