@Steboss
Created January 16, 2026 10:02
Create a vector database with semantic chunker
import os
import re
import glob
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
# CONSTANTS
PDF_SOURCE_DIR = "../rag_corpus"
CHROMA_DB_DIR = "../chroma_db"
EMBED_MODEL_NAME = "BAAI/bge-base-en-v1.5"

CONVERTER = PdfConverter(artifact_dict=create_model_dict())
EMBED_MODEL = FastEmbedEmbeddings(model_name=EMBED_MODEL_NAME)
SEMANTIC_CHUNKER = SemanticChunker(
    EMBED_MODEL,
    breakpoint_threshold_type="percentile",
)
def process_pdf(pdf_file: str) -> list[Document]:
    """Parse a PDF to Markdown with marker-pdf and split it into semantic chunks."""
    try:
        print(f"Processing PDF: {pdf_file}")
        rendered = CONVERTER(pdf_file)
        text, _, _ = text_from_rendered(rendered)  # text, tables, images
        # Remove HTML spans left over from the conversion
        text = re.sub(r'<[^>]+>', '', text)
        # Drop everything after the references section
        if "## References" in text:
            text = text.split("## References")[0]
        # Collapse runs of blank lines
        text = re.sub(r'\n{3,}', '\n\n', text)
        # Semantic chunking
        chunks = SEMANTIC_CHUNKER.create_documents([text])
        for chunk in chunks:
            chunk.metadata = {"source": os.path.basename(pdf_file)}
        print(f" -> Generated {len(chunks)} clean Markdown chunks.")
        return chunks
    except Exception as e:
        print(f" !! FAILED to process {pdf_file}: {e}")
        return []
def build_corpus():
    """
    Builds the vector database from PDFs in the source directory.
    - Parses PDFs to clean Markdown using marker-pdf.
    - Chunks the Markdown semantically.
    - Stores the chunks in a local ChromaDB.
    """
    # Find all PDF files
    pdf_files = glob.glob(f"{PDF_SOURCE_DIR}/*.pdf")
    if not pdf_files:
        print(f"Error: No PDFs found in {PDF_SOURCE_DIR}. Please add your papers.")
        return

    print(f"Found {len(pdf_files)} PDF(s) to process.")
    print("Initializing Semantic Chunker...")

    all_chunks = []
    for pdf_file in pdf_files:
        try:
            file_chunks = process_pdf(pdf_file)
            all_chunks.extend(file_chunks)
        except Exception as e:
            print(f"!! FAILED to process {pdf_file}: {e}")

    if not all_chunks:
        print("Error: No chunks were generated. Halting.")
        return

    print(f"\nTotal chunks generated: {len(all_chunks)}")

    # Save in ChromaDB
    print(f"Initializing ChromaDB at {CHROMA_DB_DIR}")
    vector_store = Chroma(
        persist_directory=CHROMA_DB_DIR,
        embedding_function=EMBED_MODEL,
    )
    print("Adding all chunks to the vector store... (This may take a moment)")
    vector_store.add_documents(all_chunks)
    vector_store.persist()
    print("\n--- RAG Corpus Build Complete! ---")
if __name__ == "__main__":
    # Check if the corpus directory exists
    if not os.path.exists(PDF_SOURCE_DIR):
        os.makedirs(PDF_SOURCE_DIR)
        print(f"Created folder '{PDF_SOURCE_DIR}'.")
        print("Please add your MatMul PDF papers to this folder and run again.")
    else:
        build_corpus()
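
Once the corpus is built, the store can be queried by reopening Chroma on the same persist directory with the same embedding model. Below is a minimal retrieval sketch, not part of the original gist; the query string and k value are arbitrary placeholders.

# query_corpus.py -- hypothetical retrieval sketch, assuming the same constants as above
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma

CHROMA_DB_DIR = "../chroma_db"
EMBED_MODEL_NAME = "BAAI/bge-base-en-v1.5"

# Reload the persisted store with the embedding model used at build time
embed_model = FastEmbedEmbeddings(model_name=EMBED_MODEL_NAME)
vector_store = Chroma(
    persist_directory=CHROMA_DB_DIR,
    embedding_function=embed_model,
)

# Retrieve the chunks most similar to a query (query text is a placeholder)
results = vector_store.similarity_search("How does the MatMul-free architecture work?", k=4)
for doc in results:
    print(doc.metadata["source"], "->", doc.page_content[:200])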