Skip to content

Instantly share code, notes, and snippets.

@0187773933
Created January 18, 2026 23:27
Show Gist options
  • Select an option

  • Save 0187773933/03044c9c00af767c1931ebd2c5978df8 to your computer and use it in GitHub Desktop.

Select an option

Save 0187773933/03044c9c00af767c1931ebd2c5978df8 to your computer and use it in GitHub Desktop.
PDF Image and Table Extractor YOLO Document Parser
#!/usr/bin/env python3
import subprocess
import tempfile
from pathlib import Path
import argparse
import cv2
from doclayout_yolo import YOLOv10
from tqdm import tqdm
from PIL import Image
from PIL.PngImagePlugin import PngInfo
# Upper bound on the number of pages processed per PDF. The page count is
# clamped to this value with min(), so longer documents are truncated to their
# first MAX_PAGES pages rather than skipped entirely.
MAX_PAGES = 30
# ---------------------------
# PDF helpers
# ---------------------------
def get_pdf_page_count(pdf_path: Path) -> int:
    """Return the number of pages that ``pdfinfo`` reports for ``pdf_path``.

    Args:
        pdf_path: Path of the PDF handed to the ``pdfinfo`` command.

    Returns:
        The integer parsed from the first output line starting with ``Pages:``.

    Raises:
        OSError: ``pdfinfo`` could not be launched (e.g. it is not installed).
        subprocess.CalledProcessError: ``pdfinfo`` exited with a non-zero status.
        RuntimeError: The output contains no ``Pages:`` line.
    """
    res = subprocess.run(
        ["pdfinfo", str(pdf_path)],
        capture_output=True,
        # pdfinfo's default output encoding is UTF-8 (see its -enc option).
        # Decode explicitly instead of relying on the locale, and replace
        # undecodable bytes: unrelated metadata such as the title must never
        # prevent the ASCII "Pages:" line from being read.
        encoding="utf-8",
        errors="replace",
        check=True,
    )
    for line in res.stdout.splitlines():
        if line.startswith("Pages:"):
            return int(line.split(":")[1].strip())
    raise RuntimeError("Could not determine page count")
# ---------------------------
# Geometry helpers
# ---------------------------
def scale_bbox(b, scale, padding, w, h):
    """Rescale a box by ``1 / scale``, pad it, and clamp it to the image.

    Each coordinate of ``b`` (``x1, y1, x2, y2``) is divided by ``scale`` and
    truncated with ``int``. ``padding`` is then subtracted from the top-left
    corner and added to the bottom-right corner. Finally the top-left corner
    is clamped to be at least 0 and the bottom-right corner to be at most
    ``(w, h)``.

    Returns:
        A 4-tuple ``(x1, y1, x2, y2)``.
    """
    left, top, right, bottom = b

    # Grow the box outwards after mapping it into the target resolution.
    padded_left = int(left / scale) - padding
    padded_top = int(top / scale) - padding
    padded_right = int(right / scale) + padding
    padded_bottom = int(bottom / scale) + padding

    clamped_left = max(0, padded_left)
    clamped_top = max(0, padded_top)
    clamped_right = min(w, padded_right)
    clamped_bottom = min(h, padded_bottom)
    return (clamped_left, clamped_top, clamped_right, clamped_bottom)
def overlaps_x(a, b, frac=0.3):
    """Tell whether two boxes share enough horizontal extent.

    ``a`` and ``b`` are ``(x1, y1, x2, y2)`` boxes; only the x-coordinates are
    used. The result is true when the length of the shared x-interval is at
    least ``frac`` times the width of the narrower box.
    """
    a_left, _, a_right, _ = a
    b_left, _, b_right, _ = b

    # A negative intersection length means the x-ranges are disjoint.
    shared = max(0, min(a_right, b_right) - max(a_left, b_left))
    narrower_width = min(a_right - a_left, b_right - b_left)
    return shared >= frac * narrower_width
def is_attached(anchor, other, max_gap=150):
    """Tell whether ``other`` sits directly above or below ``anchor``.

    Both arguments are ``(x1, y1, x2, y2)`` boxes. ``other`` counts as attached
    when ``overlaps_x`` accepts the pair and the vertical gap between the two
    boxes, measured on either side of ``anchor``, lies in ``[0, max_gap]``.
    """
    _, anchor_top, _, anchor_bottom = anchor
    _, other_top, _, other_bottom = other

    if not overlaps_x(anchor, other):
        return False

    # Distance from the anchor's bottom edge down to the other box
    # (captions / footnotes), and from the other box's bottom edge down to
    # the anchor's top edge (titles).
    gap_below = other_top - anchor_bottom
    gap_above = anchor_top - other_bottom
    return bool(0 <= gap_below <= max_gap or 0 <= gap_above <= max_gap)
# ---------------------------
# Main extraction logic
# ---------------------------
# Layout classes that each produce one output image.
_ANCHOR_CLASSES = frozenset({"figure", "table"})

# Layout classes that are merged into the crop of an adjacent anchor.
_ATTACHMENT_CLASSES = frozenset({
    "figure_caption",
    "table_caption",
    "table_footnote",
    "title",
})


def _read_pdf_title(pdf_path: Path) -> str:
    """Return the ``Title:`` value from ``pdfinfo``, or the file stem as fallback."""
    try:
        info = subprocess.run(
            ["pdfinfo", str(pdf_path)],
            capture_output=True,
            # pdfinfo's default output encoding is UTF-8 (see its -enc
            # option); decoding with the locale encoding can garble or
            # reject non-ASCII titles.
            encoding="utf-8",
            errors="replace",
        )
    except OSError:
        # The title is best-effort metadata: if pdfinfo cannot be launched,
        # fall back to the stem instead of failing here.
        return pdf_path.stem
    for line in info.stdout.splitlines():
        if line.lower().startswith("title:"):
            # A blank title carries no information; use the stem instead.
            return line.split(":", 1)[1].strip() or pdf_path.stem
    return pdf_path.stem


def _render_page(pdf_path: Path, page_idx: int, dpi: int, tmpdir: Path):
    """Rasterize one page with ``pdftoppm``; return the loaded image or ``None``."""
    out_base = tmpdir / f"page_{page_idx:04d}"
    subprocess.run(
        [
            "pdftoppm",
            "-png",
            "-r", str(dpi),
            "-f", str(page_idx),
            "-l", str(page_idx),
            str(pdf_path),
            str(out_base),
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )
    # pdftoppm appends its own page suffix to the base name, so look the
    # output file up by prefix.
    matches = list(tmpdir.glob(out_base.name + "*.png"))
    if not matches:
        return None
    image = cv2.imread(str(matches[0]))
    # The pixels are in memory now; remove the file so the temporary directory
    # holds at most one rendered page at a time.
    for rendered in matches:
        rendered.unlink(missing_ok=True)
    return image


def _detect_regions(model, high_img, scale, imgsz, conf, device, padding_px):
    """Run layout detection on a downscaled copy of ``high_img``.

    Returns a list of ``(class_name, bbox)`` pairs where ``bbox`` is the
    result of ``scale_bbox`` for the detection, i.e. expressed in
    ``high_img`` pixels.
    """
    low_img = cv2.resize(
        high_img,
        None,
        fx=scale,
        fy=scale,
        interpolation=cv2.INTER_AREA,
    )
    det = model.predict(
        source=low_img,
        imgsz=imgsz,
        conf=conf,
        device=device,
    )
    if not det or det[0].boxes is None:
        return []
    height, width = high_img.shape[:2]
    names = det[0].names
    regions = []
    for box in det[0].boxes:
        cls_name = names.get(int(box.cls[0]), "")
        x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
        # Box coordinates are treated as pixels of the downscaled image and
        # mapped back to the high-resolution image.
        bbox = scale_bbox((x1, y1, x2, y2), scale, padding_px, width, height)
        regions.append((cls_name, bbox))
    return regions


def _merge_attachments(anchor_bbox, regions):
    """Return the union of ``anchor_bbox`` with every attached caption-like box."""
    x1, y1, x2, y2 = anchor_bbox
    for cls_name, bbox in regions:
        if cls_name not in _ATTACHMENT_CLASSES:
            continue
        # Attachment is always tested against the original anchor box, not
        # the growing union, so one caption cannot chain in another.
        if is_attached(anchor_bbox, bbox):
            ox1, oy1, ox2, oy2 = bbox
            x1, y1 = min(x1, ox1), min(y1, oy1)
            x2, y2 = max(x2, ox2), max(y2, oy2)
    return x1, y1, x2, y2


def extract_images_from_pdf(
    pdf_path: Path,
    model: YOLOv10,
    output_root: Path | None,
    dpi_high=400,
    dpi_low=120,
    imgsz=1024,
    conf=0.2,
    device="cpu",
    padding_px=10,
):
    """Crop every detected figure/table of a PDF, with captions, into PNG files.

    Each of the first ``MAX_PAGES`` pages is rendered at ``dpi_high``,
    downscaled by ``dpi_low / dpi_high`` for detection, and every "figure" or
    "table" detection is merged with attached caption, footnote and title
    boxes. The merged region is cropped from the high-resolution render and
    saved as ``page_NNN_fig_NN.png`` with source information stored as PNG
    text chunks.

    Args:
        pdf_path: PDF to process; resolved to an absolute path first.
        model: Detector whose ``predict(source=, imgsz=, conf=, device=)``
            method is called once per page.
        output_root: When truthy, crops are written into this directory and
            file names are prefixed with the PDF stem. Otherwise they go into
            a ``<stem>-images`` directory next to the PDF.
        dpi_high: Resolution passed to ``pdftoppm -r``; crops are cut from
            this render.
        dpi_low: Together with ``dpi_high`` determines the downscale factor
            used for detection.
        imgsz: Forwarded to ``model.predict``.
        conf: Forwarded to ``model.predict``.
        device: Forwarded to ``model.predict``.
        padding_px: Padding handed to ``scale_bbox`` for every detection.

    Raises:
        subprocess.CalledProcessError: ``pdfinfo`` (page count) or
            ``pdftoppm`` exited with a non-zero status.
        RuntimeError: The page count could not be determined.
    """
    pdf_path = pdf_path.resolve()
    if output_root:
        out_dir = output_root
        prefix = pdf_path.stem
    else:
        out_dir = pdf_path.parent / f"{pdf_path.stem}-images"
        prefix = ""
    out_dir.mkdir(parents=True, exist_ok=True)

    pdf_title = _read_pdf_title(pdf_path)
    page_count = min(get_pdf_page_count(pdf_path), MAX_PAGES)
    scale = dpi_low / dpi_high

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        for page_idx in tqdm(
            range(1, page_count + 1),
            desc=f"Pages ({pdf_path.name})",
            unit="page",
            leave=False,
        ):
            high_img = _render_page(pdf_path, page_idx, dpi_high, tmpdir)
            if high_img is None:
                continue
            regions = _detect_regions(
                model, high_img, scale, imgsz, conf, device, padding_px
            )

            fig_count = 0
            for cls_name, bbox in regions:
                if cls_name not in _ANCHOR_CLASSES:
                    continue
                x1, y1, x2, y2 = _merge_attachments(bbox, regions)
                if x2 <= x1 or y2 <= y1:
                    # Degenerate region after clamping; nothing to crop.
                    continue
                crop = high_img[y1:y2, x1:x2]

                fig_count += 1
                name = f"page_{page_idx:03d}_fig_{fig_count:02d}.png"
                if prefix:
                    name = f"{prefix}_{name}"
                out_path = out_dir / name

                # cv2 loads images as BGR; PIL expects RGB.
                pil_img = Image.fromarray(
                    cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
                )
                meta = PngInfo()
                meta.add_text("source_pdf_title", pdf_title)
                meta.add_text("source_pdf_path", str(pdf_path))
                meta.add_text("source_page", str(page_idx))
                meta.add_text("anchor_class", cls_name)
                meta.add_text("extractor", "PDFImageExtractorYOLO_grouped")
                pil_img.save(out_path, pnginfo=meta)
# ---------------------------
# CLI
# ---------------------------
def main():
    """Command-line entry point: extract figure/table crops from PDFs.

    Positional ``path`` is either a single PDF file or a directory that is
    searched recursively. ``--output`` selects a shared output directory,
    ``--model`` the weights file handed to ``YOLOv10`` and ``--device`` the
    device forwarded to ``extract_images_from_pdf``.

    Raises:
        ValueError: ``path`` is neither an existing file nor a directory.
        SystemExit: At least one PDF could not be processed by the Poppler
            tools; the remaining PDFs are still processed first.
    """
    import sys

    parser = argparse.ArgumentParser(
        description="Extract figures/tables with captions from PDFs using DocLayout-YOLO"
    )
    parser.add_argument("path", help="PDF file or folder (recursive)")
    parser.add_argument("--output", default=None)
    parser.add_argument("--model", default="doclayout_yolo_docstructbench_imgsz1024.pt")
    parser.add_argument("--device", default="cpu")
    args = parser.parse_args()

    input_path = Path(args.path).resolve()
    output_root = Path(args.output).resolve() if args.output else None

    # Validate the input before creating directories or loading the model so
    # that a typo in the path fails immediately.
    if input_path.is_file():
        pdfs = [input_path]
    elif input_path.is_dir():
        # Match the extension case-insensitively (".PDF" is common), ignore
        # directories that happen to end in ".pdf", and sort for a
        # reproducible processing order.
        pdfs = sorted(
            p
            for p in input_path.rglob("*")
            if p.is_file() and p.suffix.lower() == ".pdf"
        )
    else:
        raise ValueError("Invalid input path")

    if output_root:
        output_root.mkdir(parents=True, exist_ok=True)
    model = YOLOv10(args.model)

    failed = []
    for pdf in tqdm(pdfs, desc="PDFs", unit="pdf"):
        try:
            extract_images_from_pdf(pdf, model, output_root, device=args.device)
        except subprocess.CalledProcessError as exc:
            # pdfinfo/pdftoppm rejected this file (e.g. corrupt or encrypted);
            # report it and keep going instead of aborting the whole batch.
            tqdm.write(f"Skipping {pdf}: {exc}", file=sys.stderr)
            failed.append(pdf)

    if failed:
        raise SystemExit(
            f"{len(failed)} of {len(pdfs)} PDF(s) could not be processed"
        )
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment