@Steboss
Created January 16, 2026 10:02
Create a vector database with semantic chunker
import os
import re
import glob
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
# CONSTANTS
PDF_SOURCE_DIR = "../rag_corpus"
CHROMA_DB_DIR = "../chroma_db"
EMBED_MODEL_NAME = "BAAI/bge-base-en-v1.5"

CONVERTER = PdfConverter(artifact_dict=create_model_dict())
EMBED_MODEL = FastEmbedEmbeddings(model_name=EMBED_MODEL_NAME)
SEMANTIC_CHUNKER = SemanticChunker(
    EMBED_MODEL,
    breakpoint_threshold_type="percentile",
)
def process_pdf(pdf_file: str) -> list[Document]:
    """Parse a PDF to Markdown with marker-pdf and split it into semantic chunks."""
    try:
        print(f"Processing PDF: {pdf_file}")
        rendered = CONVERTER(pdf_file)
        text, _, _ = text_from_rendered(rendered)  # text, tables, images
        # Remove HTML spans left over from the conversion
        text = re.sub(r'<[^>]+>', '', text)
        # Drop everything after the references section
        if "## References" in text:
            text = text.split("## References")[0]
        # Collapse runs of blank lines
        text = re.sub(r'\n{3,}', '\n\n', text)
        # Semantic chunking
        chunks = SEMANTIC_CHUNKER.create_documents([text])
        for chunk in chunks:
            chunk.metadata = {"source": os.path.basename(pdf_file)}
        print(f" -> Generated {len(chunks)} clean Markdown chunks.")
        return chunks
    except Exception as e:
        print(f" !! FAILED to process {pdf_file}: {e}")
        return []
def build_corpus():
    """
    Builds the vector database from PDFs in the source directory.
    - Parses PDFs to clean Markdown using marker-pdf.
    - Chunks the Markdown semantically.
    - Stores the chunks in a local ChromaDB.
    """
    # Find all PDF files
    pdf_files = glob.glob(f"{PDF_SOURCE_DIR}/*.pdf")
    if not pdf_files:
        print(f"Error: No PDFs found in {PDF_SOURCE_DIR}. Please add your papers.")
        return

    print(f"Found {len(pdf_files)} PDF(s) to process.")
    print("Initializing Semantic Chunker...")

    all_chunks = []
    for pdf_file in pdf_files:
        try:
            file_chunks = process_pdf(pdf_file)
            all_chunks.extend(file_chunks)
        except Exception as e:
            print(f"!! FAILED to process {pdf_file}: {e}")

    if not all_chunks:
        print("Error: No chunks were generated. Halting.")
        return

    print(f"\nTotal chunks generated: {len(all_chunks)}")

    # Save in ChromaDB
    print(f"Initializing ChromaDB at {CHROMA_DB_DIR}")
    vector_store = Chroma(
        persist_directory=CHROMA_DB_DIR,
        embedding_function=EMBED_MODEL,
    )
    print("Adding all chunks to the vector store... (This may take a moment)")
    vector_store.add_documents(all_chunks)
    vector_store.persist()
    print("\n--- RAG Corpus Build Complete! ---")
if __name__ == "__main__":
    # Check if the corpus directory exists
    if not os.path.exists(PDF_SOURCE_DIR):
        os.makedirs(PDF_SOURCE_DIR)
        print(f"Created folder '{PDF_SOURCE_DIR}'.")
        print("Please add your MatMul PDF papers to this folder and run again.")
    else:
        build_corpus()
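
Once the corpus is built, the store can be queried by reopening Chroma on the same persist directory with the same embedding model. Below is a minimal retrieval sketch, not part of the original gist; the query string and k value are arbitrary placeholders.

# query_corpus.py -- hypothetical retrieval sketch, assuming the same constants as above
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma

CHROMA_DB_DIR = "../chroma_db"
EMBED_MODEL_NAME = "BAAI/bge-base-en-v1.5"

# Reload the persisted store with the embedding model used at build time
embed_model = FastEmbedEmbeddings(model_name=EMBED_MODEL_NAME)
vector_store = Chroma(
    persist_directory=CHROMA_DB_DIR,
    embedding_function=embed_model,
)

# Retrieve the chunks most similar to a query (query text is a placeholder)
results = vector_store.similarity_search("How does the MatMul-free architecture work?", k=4)
for doc in results:
    print(doc.metadata["source"], "->", doc.page_content[:200])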