Create a vector database with a semantic chunker

This script parses PDF papers to Markdown with marker-pdf, splits the text into semantically coherent chunks with LangChain's SemanticChunker, and persists the chunks in a local ChromaDB vector store.
import os
import re
import glob

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
# CONSTANTS
PDF_SOURCE_DIR = "../rag_corpus"
CHROMA_DB_DIR = "../chroma_db"
EMBED_MODEL_NAME = "BAAI/bge-base-en-v1.5"

# marker-pdf converter, plus one embedding model shared by the chunker and the vector store
CONVERTER = PdfConverter(artifact_dict=create_model_dict())
EMBED_MODEL = FastEmbedEmbeddings(model_name=EMBED_MODEL_NAME)
# split wherever the embedding distance between adjacent sentence groups
# crosses the percentile breakpoint threshold
SEMANTIC_CHUNKER = SemanticChunker(
    EMBED_MODEL,
    breakpoint_threshold_type="percentile",
)
def process_pdf(pdf_file: str) -> list[Document]:
    """Parse a PDF to Markdown with marker-pdf and split it into semantic chunks."""
    try:
        print(f"Processing PDF: {pdf_file}")
        rendered = CONVERTER(pdf_file)
        text, _, _ = text_from_rendered(rendered)  # text, tables, images
        # remove HTML spans left over from the conversion
        text = re.sub(r'<[^>]+>', '', text)
        # drop the bibliography: everything after the references heading
        if "## References" in text:
            text = text.split("## References")[0]
        # collapse runs of three or more newlines
        text = re.sub(r'\n{3,}', '\n\n', text)
        # chunking
        chunks = SEMANTIC_CHUNKER.create_documents([text])
        for chunk in chunks:
            chunk.metadata = {"source": os.path.basename(pdf_file)}
        print(f" -> Generated {len(chunks)} clean Markdown chunks.")
        return chunks
    except Exception as e:
        print(f" !! FAILED to process {pdf_file}: {e}")
        return []
def build_corpus():
    """
    Builds the vector database from PDFs in the source directory.
    - Parses PDFs to clean Markdown using marker-pdf.
    - Chunks the Markdown semantically.
    - Stores the chunks in a local ChromaDB.
    """
    # find all PDF files
    pdf_files = glob.glob(f"{PDF_SOURCE_DIR}/*.pdf")
    if not pdf_files:
        print(f"Error: No PDFs found in {PDF_SOURCE_DIR}. Please add your papers.")
        return
    print(f"Found {len(pdf_files)} PDF(s) to process.")
    print("Initializing Semantic Chunker...")
    # process_pdf catches its own failures and returns [] on error
    all_chunks = []
    for pdf_file in pdf_files:
        all_chunks.extend(process_pdf(pdf_file))
    if not all_chunks:
        print("Error: No chunks were generated. Halting.")
        return
    print(f"\nTotal chunks generated: {len(all_chunks)}")
    # save in ChromaDB
    print(f"Initializing ChromaDB at {CHROMA_DB_DIR}")
    vector_store = Chroma(
        persist_directory=CHROMA_DB_DIR,
        embedding_function=EMBED_MODEL,
    )
    print("Adding all chunks to the vector store... (This may take a moment)")
    vector_store.add_documents(all_chunks)
    vector_store.persist()
    print("\n--- RAG Corpus Build Complete! ---")


if __name__ == "__main__":
    # check that the corpus directory exists before building
    if not os.path.exists(PDF_SOURCE_DIR):
        os.makedirs(PDF_SOURCE_DIR)
        print(f"Created folder '{PDF_SOURCE_DIR}'.")
        print("Please add your MatMul PDF papers to this folder and run again.")
    else:
        build_corpus()
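
Once the corpus is built, the persisted database can be reopened for retrieval. Below is a minimal sketch, assuming the same embedding model and directory as above; the query string is illustrative and not part of the gist.

from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma

# reopen the persisted store with the same embedding model used to build it
embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")
vector_store = Chroma(
    persist_directory="../chroma_db",
    embedding_function=embed_model,
)

# retrieve the three chunks closest to the query in embedding space
results = vector_store.similarity_search("How does a MatMul-free architecture work?", k=3)
for doc in results:
    print(doc.metadata["source"], "->", doc.page_content[:200])

Because each chunk carries a "source" entry in its metadata, the results can be traced back to the originating PDF file.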