# SharathHebbar/lang.py

## lang.py
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.vectorstores import Chroma

# Step 1: Load PDF Document
def load_pdf(pdf_path):
    """Read the PDF at ``pdf_path`` with ``PyPDFLoader`` and return its documents.

    Args:
        pdf_path: Location of the PDF file handed to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(pdf_path).load()`` produces for that file.
    """
    return PyPDFLoader(pdf_path).load()

# Step 2: Chunk the Document
def chunk_documents(documents):
    """Split the loaded documents into smaller, slightly overlapping chunks.

    The splitter is a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=1000`` and ``chunk_overlap=50``.

    Args:
        documents: Documents to split, passed straight to ``split_documents``.

    Returns:
        The chunks returned by the splitter's ``split_documents`` call.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=1000)
    return text_splitter.split_documents(documents)

# Step 3: Generate Embeddings in Batches
def embed_chunks_in_batches(chunks, batch_size=1):
    """Embed every chunk's text with Azure OpenAI, ``batch_size`` texts per request.

    Args:
        chunks: Sequence of objects exposing a ``page_content`` string.
        batch_size: Number of chunk texts sent in each ``embed_documents``
            call. Must be at least 1.

    Returns:
        A list of embeddings in the same order as ``chunks``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Fail fast: a zero step makes range() raise an opaque error and a
    # negative step would silently produce no embeddings at all.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embeddings_model = AzureOpenAIEmbeddings(
        azure_deployment="cds_text_embedding_2",
        openai_api_version=os.getenv("AZURE_API_Version"),
    )

    embeddings = []
    for start in range(0, len(chunks), batch_size):
        batch_texts = [chunk.page_content for chunk in chunks[start:start + batch_size]]
        # One request per batch. Previously each chunk was sent in its own
        # request, so batch_size had no effect on the number of API calls.
        embeddings.extend(embeddings_model.embed_documents(batch_texts))

    return embeddings

# Step 4: Store in Chroma
def store_in_chroma(chunks, embeddings, collection_name="pdf_chunk_collection"):
    """Add the chunk texts to a Chroma collection persisted under ``./chroma_storage``.

    Args:
        chunks: Sequence of objects exposing ``page_content``; one entry is
            stored per chunk under the id ``chunk_<index>``.
        embeddings: Precomputed embeddings, forwarded to ``add_texts`` as the
            ``embeddings`` keyword argument.
        collection_name: Name of the Chroma collection to write to.

    Returns:
        The ``Chroma`` instance the texts were added to.
    """
    # Gather the payload before touching the store.
    page_texts = [piece.page_content for piece in chunks]
    chunk_ids = [f"chunk_{position}" for position in range(len(chunks))]

    store = Chroma(
        persist_directory="./chroma_storage",  # on-disk location of the collection
        collection_name=collection_name,
    )

    # NOTE(review): the LangChain Chroma wrapper's add_texts() may ignore an
    # `embeddings` keyword and embed via its own embedding function instead --
    # confirm that the precomputed vectors are what actually gets stored.
    store.add_texts(texts=page_texts, ids=chunk_ids, embeddings=embeddings)
    return store

# Step 5: Query the Chroma Vector Store
def query_chroma(query_text, vector_store, k=5):
    """Return the stored documents most similar to ``query_text``.

    Args:
        query_text: Natural-language query to search for.
        vector_store: Object exposing ``similarity_search(query, k=...)``,
            such as the store returned by ``store_in_chroma``.
        k: Maximum number of results to request. Defaults to 5, the count
            this function has always asked for.

    Returns:
        Whatever ``vector_store.similarity_search`` returns for the query.
    """
    # LangChain vector stores take the result count as `k`. The previous
    # `n_results=5` keyword is Chroma's native name and is not part of the
    # wrapper's similarity_search signature.
    return vector_store.similarity_search(query_text, k=k)

# Step 6: Main Function to Execute the Workflow
def main(pdf_path, query):
    """Run the full pipeline for one PDF and print the passages matching ``query``.

    The PDF is loaded, chunked, embedded and stored in Chroma; the store is
    then queried and every hit is printed to standard output.

    Args:
        pdf_path: Path of the PDF file to index.
        query: Question to run against the indexed chunks.
    """
    # Load the PDF and cut it into chunks.
    chunks = chunk_documents(load_pdf(pdf_path))

    # Embed the chunks, then index texts and vectors in Chroma.
    chunk_vectors = embed_chunks_in_batches(chunks, batch_size=1)
    vector_store = store_in_chroma(chunks, chunk_vectors)

    # Look up the query and print each hit, numbered from 1.
    for position, hit in enumerate(query_chroma(query, vector_store), start=1):
        print(f"Result {position}:")
        print(hit.page_content)
        print("\n---\n")

# Example Usage
if __name__ == "__main__":
    # Placeholder inputs: replace with a real PDF path and question.
    user_query = "What is the document about?"
    pdf_file_path = "your_pdf_file.pdf"

    # Kick off the load -> chunk -> embed -> store -> query pipeline.
    main(pdf_path=pdf_file_path, query=user_query)
	import os
	from langchain.document_loaders import PyPDFLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.embeddings import AzureOpenAIEmbeddings
	from langchain.vectorstores import Chroma

	# Step 1: Load PDF Document
	def load_pdf(pdf_path):
		"""Read the PDF at ``pdf_path`` with ``PyPDFLoader`` and return its documents.

		Args:
			pdf_path: Location of the PDF file handed to ``PyPDFLoader``.

		Returns:
			Whatever ``PyPDFLoader(pdf_path).load()`` produces for that file.
		"""
		loader = PyPDFLoader(pdf_path)
		documents = loader.load()
		return documents

	# Step 2: Chunk the Document
	def chunk_documents(documents):
		"""Split the loaded documents into smaller, slightly overlapping chunks.

		The splitter is a ``RecursiveCharacterTextSplitter`` configured with
		``chunk_size=1000`` and ``chunk_overlap=50``.

		Args:
			documents: Documents to split, passed straight to ``split_documents``.

		Returns:
			The chunks returned by the splitter's ``split_documents`` call.
		"""
		splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
		chunks = splitter.split_documents(documents)
		return chunks

	# Step 3: Generate Embeddings in Batches
	def embed_chunks_in_batches(chunks, batch_size=1):
		"""Embed every chunk's text with Azure OpenAI, ``batch_size`` texts per request.

		Args:
			chunks: Sequence of objects exposing a ``page_content`` string.
			batch_size: Number of chunk texts sent in each ``embed_documents``
				call. Must be at least 1.

		Returns:
			A list of embeddings in the same order as ``chunks``.

		Raises:
			ValueError: If ``batch_size`` is smaller than 1.
		"""
		# Fail fast: a zero step makes range() raise an opaque error and a
		# negative step would silently produce no embeddings at all.
		if batch_size < 1:
			raise ValueError(f"batch_size must be >= 1, got {batch_size}")

		embeddings_model = AzureOpenAIEmbeddings(
			azure_deployment="cds_text_embedding_2",
			openai_api_version=os.getenv("AZURE_API_Version"),
		)

		embeddings = []
		for start in range(0, len(chunks), batch_size):
			batch_texts = [chunk.page_content for chunk in chunks[start:start + batch_size]]
			# One request per batch. Previously each chunk was sent in its own
			# request, so batch_size had no effect on the number of API calls.
			embeddings.extend(embeddings_model.embed_documents(batch_texts))

		return embeddings

	# Step 4: Store in Chroma
	def store_in_chroma(chunks, embeddings, collection_name="pdf_chunk_collection"):
		"""Add the chunk texts to a Chroma collection persisted under ``./chroma_storage``.

		Args:
			chunks: Sequence of objects exposing ``page_content``; one entry is
				stored per chunk under the id ``chunk_<index>``.
			embeddings: Precomputed embeddings, forwarded to ``add_texts`` as the
				``embeddings`` keyword argument.
			collection_name: Name of the Chroma collection to write to.

		Returns:
			The ``Chroma`` instance the texts were added to.
		"""
		texts = [chunk.page_content for chunk in chunks]
		ids = [f"chunk_{i}" for i in range(len(chunks))]

		chroma_client = Chroma(
			collection_name=collection_name,
			persist_directory="./chroma_storage",  # Optional: specify a directory to persist the data
		)

		# NOTE(review): the LangChain Chroma wrapper's add_texts() may ignore an
		# `embeddings` keyword and embed via its own embedding function instead --
		# confirm that the precomputed vectors are what actually gets stored.
		chroma_client.add_texts(
			texts=texts,
			ids=ids,
			embeddings=embeddings,
		)

		return chroma_client

	# Step 5: Query the Chroma Vector Store
	def query_chroma(query_text, vector_store, k=5):
		"""Return the stored documents most similar to ``query_text``.

		Args:
			query_text: Natural-language query to search for.
			vector_store: Object exposing ``similarity_search(query, k=...)``.
			k: Maximum number of results to request. Defaults to 5, the count
				this function has always asked for.

		Returns:
			Whatever ``vector_store.similarity_search`` returns for the query.
		"""
		# LangChain vector stores take the result count as `k`. The previous
		# `n_results=5` keyword is Chroma's native name and is not part of the
		# wrapper's similarity_search signature.
		return vector_store.similarity_search(query_text, k=k)

	# Step 6: Main Function to Execute the Workflow
	def main(pdf_path, query):
		"""Run the full pipeline for one PDF and print the passages matching ``query``.

		Args:
			pdf_path: Path of the PDF file to index.
			query: Question to run against the indexed chunks.
		"""
		# Load the PDF
		documents = load_pdf(pdf_path)

		# Chunk the documents
		chunks = chunk_documents(documents)

		# Generate embeddings for the chunks
		embeddings = embed_chunks_in_batches(chunks, batch_size=1)

		# Store chunks and embeddings in Chroma
		vector_store = store_in_chroma(chunks, embeddings)

		# Query the Chroma Vector Store
		results = query_chroma(query, vector_store)

		# Display the results, numbered from 1
		for i, result in enumerate(results):
			print(f"Result {i + 1}:")
			print(result.page_content)
			print("\n---\n")

	# Example Usage
	if __name__ == "__main__":
		# Specify the path to your PDF file and your query
		pdf_file_path = "your_pdf_file.pdf"
		user_query = "What is the document about?"

		# Execute the main function
		main(pdf_file_path, user_query)
# No results found