Skip to content

Instantly share code, notes, and snippets.

@chandradeepc
Last active October 1, 2025 02:03
Show Gist options
  • Select an option

  • Save chandradeepc/51b5fb752577fd63be3c2195eb30b46e to your computer and use it in GitHub Desktop.

Select an option

Save chandradeepc/51b5fb752577fd63be3c2195eb30b46e to your computer and use it in GitHub Desktop.
MinerU 2.0 Modal SGLang Deployment
import modal
from typing import Dict, Any, Optional
from pydantic import BaseModel
import json
import os
import uuid
import base64
import io
# Define the image - using sglang base image as recommended by mineru.
# The build runs in order: base image -> apt packages -> font cache / apt
# cleanup -> pip packages -> in-place patch of sglang's scheduler.py.
image = (
    modal.Image.from_registry("lmsysorg/sglang:v0.4.8.post1-cu126")
    # System libraries and fonts (libgl1, Noto fonts incl. CJK, fontconfig).
    .apt_install(["libgl1", "fonts-noto-core", "fonts-noto-cjk", "fontconfig"])
    # Rebuild the font cache, then drop apt caches and package lists.
    .run_commands(["fc-cache -fv", "apt-get clean", "rm -rf /var/lib/apt/lists/*"])
    .pip_install(
        [
            "setuptools<70.0.0",  # Fix setuptools compatibility issue with antlr4
            "transformers>=4.56.0",  # Update transformers to latest version for mineru2_qwen support
            "mineru[core]<=2.2.2",  # Install only mineru[core] to avoid dependency conflicts
            "nest-asyncio",  # Allow nested event loops to fix "This event loop is already running" error
            "Pillow",  # For image format conversion
        ]
    )
    .run_commands(
        [
            # Add MinerU import right after line 2666 (the try: line).
            # The one-liner below:
            #   1. looks for sglang/srt/managers/scheduler.py under
            #      /sgl-workspace, then (via `||`) under /usr/local;
            #   2. if a path was found, prints the lines around
            #      "scheduler = Scheduler", appends the
            #      `from mineru.backend.vlm.predictor import get_predictor`
            #      line after line 2666 with `sed -i`, and prints the same
            #      context again;
            #   3. otherwise only echoes "scheduler.py not found".
            # NOTE(review): the line number 2666 is hard-coded; presumably it
            # only matches the sglang version pinned in the base image tag
            # above - re-check it whenever that tag changes.
            'SCHEDULER_FILE=$(find /sgl-workspace -name "scheduler.py" -path "*/sglang/srt/managers/*" 2>/dev/null || find /usr/local -name "scheduler.py" -path "*/sglang/srt/managers/*" 2>/dev/null) && if [ -n "$SCHEDULER_FILE" ]; then echo "=== FOUND SCHEDULER.PY AT: $SCHEDULER_FILE ===" && echo "=== BEFORE MODIFICATION ===" && grep -n -A5 -B3 "scheduler = Scheduler" "$SCHEDULER_FILE" && sed -i "2666a\\ from mineru.backend.vlm.predictor import get_predictor" "$SCHEDULER_FILE" && echo "=== AFTER MODIFICATION ===" && grep -n -A7 -B3 "scheduler = Scheduler" "$SCHEDULER_FILE"; else echo "scheduler.py not found"; fi',
        ]
    )
)
# Create the Modal app; the decorated endpoint further down registers on it.
app = modal.App("app")
# Request body accepted by the parse_pdf endpoint.
class PDFRequest(BaseModel):
    file: str  # base64 encoded PDF
    # Read by parse_pdf but not passed on to the analysis call.
    lang: str = "en"
    # start_page / end_page are handed to
    # convert_pdf_bytes_to_bytes_by_pypdfium2 when start_page > 0 or
    # end_page is not None; with the defaults the PDF is used as-is.
    start_page: int = 0
    end_page: Optional[int] = None
# Success payload returned by parse_pdf.
class PDFResponse(BaseModel):
    # The middle_json produced by vlm_doc_analyze, after
    # add_base64_images_to_json has been applied to it.
    result: Dict[str, Any]
    status: str = "success"
# Failure payload returned (not raised) by parse_pdf for an empty file,
# undecodable base64, a payload that does not start with %PDF, or an
# exception during processing.
class PDFErrorResponse(BaseModel):
    error: str  # human-readable description of what went wrong
    status: str = "failed"
# Global variable to store initialization status: parse_pdf calls
# load_model() while this is False and then sets it to True.
initialized = False
def add_base64_images_to_json(json_data, image_dir):
    """Attach base64-encoded PNG data to every image span in ``json_data``.

    Walks the nested dict/list structure and, for each dict that has
    ``type == "image"`` and an ``image_path`` key, loads
    ``os.path.join(image_dir, image_path)``, re-encodes it as PNG and stores
    the base64 text under the ``"base64image"`` key of that same dict.

    The pass is best-effort: a span is skipped when its ``image_path`` is not
    a non-empty string or does not point at an existing file, and an image
    that cannot be read or converted is logged as a warning and left without
    a ``"base64image"`` key. Other spans are still processed.

    Args:
        json_data: Nested structure of dicts and lists; mutated in place.
            Any other value (including scalars at the top level) is ignored.
        image_dir: Directory that ``image_path`` values are joined onto.

    Returns:
        None.
    """

    def attach_png(span, image_path, full_image_path):
        """Encode one image file and store it on ``span``; log on failure."""
        # Third-party imports stay function-local (as elsewhere in this file)
        # and are only reached once there is an actual image to encode.
        from loguru import logger
        from PIL import Image

        try:
            with Image.open(full_image_path) as img:
                # Modes that can carry transparency become RGBA; any other
                # non-RGB mode becomes RGB. RGB images are saved unchanged.
                if img.mode in ("RGBA", "LA", "P"):
                    converted = img.convert("RGBA")
                elif img.mode != "RGB":
                    converted = img.convert("RGB")
                else:
                    converted = img
                png_buffer = io.BytesIO()
                converted.save(png_buffer, format="PNG")
            span["base64image"] = base64.b64encode(png_buffer.getvalue()).decode(
                "utf-8"
            )
        except Exception as exc:
            # Deliberately broad: one unreadable image must not abort the
            # whole document, but the failure should be visible in the logs.
            logger.warning("Error processing image {}: {}", image_path, exc)

    # Explicit stack instead of recursion so nesting depth is not limited by
    # the interpreter's recursion limit.
    pending = [json_data]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue

        image_path = node.get("image_path")
        if (
            node.get("type") == "image"
            and isinstance(image_path, str)
            and image_path
        ):
            full_image_path = os.path.join(image_dir, image_path)
            # isfile (not exists) so a path resolving to a directory is skipped.
            if os.path.isfile(full_image_path):
                attach_png(node, image_path, full_image_path)

        # Image spans may themselves contain nested containers.
        pending.extend(node.values())
def load_model():
    """Return a status record naming the VLM backend.

    This function itself loads nothing; it only builds and returns a new
    dictionary with the keys ``"status"`` and ``"backend"`` on every call.
    """
    status_record = dict(status="ready", backend="vlm-sglang-engine")
    return status_record
@app.function(
    image=image,
    # gpu=["T4", "L4", "A10", "L40S"],
    gpu=["L40S", "A100-80GB", "H100", "H200"],
    timeout=1800,  # 30 minutes
    min_containers=0,
    max_containers=10,
    scaledown_window=60,  # 1 minute
    enable_memory_snapshot=True,
    experimental_options={"enable_gpu_snapshot": False},
)
@modal.fastapi_endpoint(method="POST")
async def parse_pdf(request: PDFRequest):
    """
    Parse PDF using MinerU with sglang backend
    Returns the JSON result directly instead of saving to file

    The request carries a base64-encoded PDF plus an optional page range.
    On success a ``PDFResponse`` wrapping MinerU's middle JSON is returned,
    with a ``base64image`` entry added to image spans. Every failure (empty
    input, invalid base64, payload not starting with ``%PDF``, or an exception
    while processing) is reported by returning a ``PDFErrorResponse`` rather
    than by raising. ``request.lang`` is accepted but not forwarded to the
    analysis call.
    """
    # NOTE: intentionally no return annotation - FastAPI would treat it as a
    # response model, and this endpoint returns two different models.

    # Apply nest_asyncio to allow nested event loops - must be done in Modal runtime
    import nest_asyncio

    nest_asyncio.apply()

    # Initialize on first use
    global initialized
    if not initialized:
        load_model()
        initialized = True

    # Validate the payload before paying for the heavy imports below.
    encoded_pdf = request.file
    if not encoded_pdf:
        return PDFErrorResponse(error="Please provide a PDF file (base64) to parse")

    # Decode base64 to bytes (binascii.Error is a ValueError subclass)
    try:
        pdf_bytes = base64.b64decode(encoded_pdf)
    except ValueError as e:
        return PDFErrorResponse(error=f"Invalid base64 file data: {str(e)}")

    # Validate it's a PDF
    if not pdf_bytes.startswith(b"%PDF"):
        return PDFErrorResponse(error="File does not appear to be a valid PDF")

    # Import MinerU components (only available in remote environment)
    import shutil

    from loguru import logger
    from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
    from mineru.cli.common import (
        convert_pdf_bytes_to_bytes_by_pypdfium2,
        prepare_env,
    )
    from mineru.data.data_reader_writer import FileBasedDataWriter

    start_page = request.start_page
    end_page = request.end_page

    # Per-request scratch space. The path is computed before the try block so
    # the cleanup in `finally` always has a name to work with.
    temp_dir = f"/tmp/mineru_{uuid.uuid4()}"
    output_dir = f"{temp_dir}/output"
    pdf_file_name = "document"

    try:
        os.makedirs(output_dir, exist_ok=True)

        # Convert PDF pages if needed
        if start_page > 0 or end_page is not None:
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
                pdf_bytes, start_page, end_page
            )

        # Prepare environment; only the image directory is used afterwards.
        local_image_dir, _local_md_dir = prepare_env(output_dir, pdf_file_name, "vlm")
        image_writer = FileBasedDataWriter(local_image_dir)

        # Use sglang-engine backend for analysis
        backend = "sglang-engine"
        middle_json, _infer_result = vlm_doc_analyze(
            pdf_bytes, image_writer=image_writer, backend=backend, server_url=None
        )

        # Add base64 image content to image spans while the files still exist
        # (the scratch directory is removed in `finally`).
        add_base64_images_to_json(middle_json, local_image_dir)

        # Instead of saving to file, return the JSON directly
        return PDFResponse(result=middle_json)
    except Exception as e:
        # Endpoint boundary: keep the traceback in the server log and report
        # the failure to the caller as a structured error body.
        logger.exception("Error processing PDF")
        return PDFErrorResponse(error=f"Failed to process PDF: {str(e)}")
    finally:
        # Cleanup temporary files; best-effort, also fine if never created.
        shutil.rmtree(temp_dir, ignore_errors=True)
# To run the ephemeral endpoint: modal serve minerusglangapp.py
# To deploy: modal deploy minerusglangapp.py
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment