Created
January 18, 2026 23:27
-
-
Save 0187773933/03044c9c00af767c1931ebd2c5978df8 to your computer and use it in GitHub Desktop.
PDF Image and Table Extractor YOLO Document Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import subprocess | |
| import tempfile | |
| from pathlib import Path | |
| import argparse | |
| import cv2 | |
| from doclayout_yolo import YOLOv10 | |
| from tqdm import tqdm | |
| from PIL import Image | |
| from PIL.PngImagePlugin import PngInfo | |
| MAX_PAGES = 30 # per-PDF page cap: extract_images_from_pdf processes at most this many pages (longer PDFs are truncated, not skipped) | |
| # --------------------------- | |
| # PDF helpers | |
| # --------------------------- | |
def get_pdf_page_count(pdf_path: Path) -> int:
    """Return the number of pages that ``pdfinfo`` reports for *pdf_path*.

    Runs the external ``pdfinfo`` command and reads the first line of its
    output that starts with ``Pages:``.

    Raises:
        subprocess.CalledProcessError: ``pdfinfo`` exited with a non-zero
            status (``check=True``).
        RuntimeError: no ``Pages:`` line was present in the output.
    """
    completed = subprocess.run(
        ["pdfinfo", str(pdf_path)],
        capture_output=True,
        text=True,
        check=True,
    )
    pages_line = next(
        (row for row in completed.stdout.splitlines() if row.startswith("Pages:")),
        None,
    )
    if pages_line is None:
        raise RuntimeError("Could not determine page count")
    return int(pages_line.split(":")[1].strip())
| # --------------------------- | |
| # Geometry helpers | |
| # --------------------------- | |
def scale_bbox(b, scale, padding, w, h):
    """Map a box from the downscaled image back onto the full-size image.

    Each coordinate of *b* (``x1, y1, x2, y2``) is divided by *scale* and
    truncated to an int. The box is then grown by *padding* pixels on every
    side and clamped so that the top-left corner is never negative and the
    bottom-right corner never exceeds ``(w, h)``.

    Returns:
        A 4-tuple ``(x1, y1, x2, y2)`` of ints.
    """
    left, top, right, bottom = (int(coord / scale) for coord in b)
    return (
        max(0, left - padding),
        max(0, top - padding),
        min(w, right + padding),
        min(h, bottom + padding),
    )
def overlaps_x(a, b, frac=0.3):
    """Tell whether two boxes share enough horizontal extent.

    Args:
        a, b: boxes as ``(x1, y1, x2, y2)``; only the x coordinates are used.
        frac: minimum shared width, expressed as a fraction of the narrower
            box's width.

    Returns:
        True when the horizontal overlap is at least ``frac`` times the width
        of the narrower box. A box with zero or negative width has no
        horizontal extent and therefore never overlaps anything.
    """
    ax1, _, ax2, _ = a
    bx1, _, bx2, _ = b
    narrowest = min(ax2 - ax1, bx2 - bx1)
    # Without this guard the threshold becomes `frac * 0 == 0`, which every
    # (non-negative) overlap satisfies, so a degenerate box would "overlap"
    # boxes that are nowhere near it.
    if narrowest <= 0:
        return False
    overlap = max(0, min(ax2, bx2) - max(ax1, bx1))
    return overlap >= frac * narrowest
def is_attached(anchor, other, max_gap=150):
    """Decide whether *other* sits directly above or below *anchor*.

    Both boxes are ``(x1, y1, x2, y2)``. *other* is attached when it overlaps
    *anchor* horizontally (per ``overlaps_x``) and the vertical gap between
    them, on either side, lies in ``[0, max_gap]``.

    NOTE(review): a box that overlaps the anchor vertically produces a
    negative gap on both sides and is therefore reported as not attached.
    """
    _, anchor_top, _, anchor_bottom = anchor
    _, other_top, _, other_bottom = other

    if not overlaps_x(anchor, other):
        return False

    gap_below = other_top - anchor_bottom   # captions / footnotes under the anchor
    gap_above = anchor_top - other_bottom   # titles over the anchor
    return any(0 <= gap <= max_gap for gap in (gap_below, gap_above))
| # --------------------------- | |
| # Main extraction logic | |
| # --------------------------- | |
# Detection class names (as found in the model's ``names`` mapping) that each
# produce one output crop, and the classes that may be merged into the crop of
# a neighbouring anchor.
_ANCHOR_CLASSES = frozenset({"figure", "table"})
_ATTACHMENT_CLASSES = frozenset({
    "figure_caption",
    "table_caption",
    "table_footnote",
    "title",
})


def _read_pdf_title(pdf_path, fallback):
    """Return the ``Title:`` value printed by ``pdfinfo``, or *fallback*."""
    try:
        info = subprocess.run(
            ["pdfinfo", str(pdf_path)],
            capture_output=True,
            text=True,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # The title is only descriptive metadata; a missing binary or
        # undecodable output must not abort the extraction.
        return fallback
    for line in info.stdout.splitlines():
        if line.lower().startswith("title:"):
            # An empty Title field would otherwise replace the fallback
            # with an empty string.
            return line.split(":", 1)[1].strip() or fallback
    return fallback


def _render_page_bgr(pdf_path, page_idx, dpi, workdir):
    """Rasterise one page with ``pdftoppm`` and load it via OpenCV (None on failure)."""
    out_base = workdir / f"page_{page_idx:04d}"
    subprocess.run(
        [
            "pdftoppm",
            "-png",
            "-r", str(dpi),
            "-f", str(page_idx),
            "-l", str(page_idx),
            str(pdf_path),
            str(out_base),
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )
    # pdftoppm appends its own page suffix to the prefix, hence the glob.
    rendered = list(workdir.glob(out_base.name + "*.png"))
    if not rendered:
        return None
    image = cv2.imread(str(rendered[0]))
    # Once the pixels are in memory the file is dead weight; removing it now
    # keeps the temp dir from holding every high-resolution page at once.
    for png in rendered:
        png.unlink(missing_ok=True)
    return image


def _detect_regions(model, high_img, scale, imgsz, conf, device, padding_px):
    """Run layout detection on a downscaled copy; return boxes in full-size pixels."""
    low_img = cv2.resize(
        high_img,
        None,
        fx=scale,
        fy=scale,
        interpolation=cv2.INTER_AREA,
    )
    det = model.predict(
        source=low_img,
        imgsz=imgsz,
        conf=conf,
        device=device,
    )
    if not det or det[0].boxes is None:
        return []

    height, width = high_img.shape[:2]
    names = det[0].names
    regions = []
    for box in det[0].boxes:
        x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
        regions.append({
            "cls": names.get(int(box.cls[0]), ""),
            "bbox": scale_bbox((x1, y1, x2, y2), scale, padding_px, width, height),
        })
    return regions


def _merge_attachments(anchor_bbox, detections):
    """Grow *anchor_bbox* to cover every attachment-class box attached to it."""
    x1, y1, x2, y2 = anchor_bbox
    for other in detections:
        if other["cls"] not in _ATTACHMENT_CLASSES:
            continue
        # Attachment is always judged against the original anchor box, not
        # the growing union, so the result does not depend on detection order.
        if is_attached(anchor_bbox, other["bbox"]):
            ox1, oy1, ox2, oy2 = other["bbox"]
            x1, y1 = min(x1, ox1), min(y1, oy1)
            x2, y2 = max(x2, ox2), max(y2, oy2)
    return x1, y1, x2, y2


def extract_images_from_pdf(
    pdf_path: Path,
    model: YOLOv10,
    output_root: Path | None,
    dpi_high=400,
    dpi_low=120,
    imgsz=1024,
    conf=0.2,
    device="cpu",
    padding_px=10,
):
    """Crop figures and tables (with captions, footnotes, titles) out of a PDF.

    Each of the first ``MAX_PAGES`` pages is rendered at *dpi_high* with
    ``pdftoppm``, downscaled by ``dpi_low / dpi_high`` for detection, and every
    detected figure/table is cropped from the high-resolution render together
    with any attached caption/footnote/title boxes. Crops are written as PNG
    files named ``page_NNN_fig_NN.png`` carrying text metadata about their
    origin.

    Args:
        pdf_path: PDF to process.
        model: detector exposing ``predict(source=, imgsz=, conf=, device=)``.
        output_root: directory that receives the crops, file names prefixed
            with the PDF stem. When falsy, crops go to a sibling directory
            ``<pdf stem>-images`` without a prefix.
        dpi_high: resolution of the render that crops are cut from.
        dpi_low: nominal resolution of the image handed to the detector.
        imgsz, conf, device: forwarded to ``model.predict``.
        padding_px: margin added around each detected box (full-size pixels).

    Raises:
        subprocess.CalledProcessError: ``pdfinfo`` (page count) or
            ``pdftoppm`` exited with a non-zero status.
        RuntimeError: the page count could not be determined.
    """
    pdf_path = pdf_path.resolve()
    if output_root:
        out_dir = output_root
        prefix = pdf_path.stem
    else:
        out_dir = pdf_path.parent / f"{pdf_path.stem}-images"
        prefix = ""
    out_dir.mkdir(parents=True, exist_ok=True)

    pdf_title = _read_pdf_title(pdf_path, fallback=pdf_path.stem)
    page_count = min(get_pdf_page_count(pdf_path), MAX_PAGES)
    scale = dpi_low / dpi_high

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        for page_idx in tqdm(
            range(1, page_count + 1),
            desc=f"Pages ({pdf_path.name})",
            unit="page",
            leave=False,
        ):
            high_img = _render_page_bgr(pdf_path, page_idx, dpi_high, tmpdir)
            if high_img is None:
                continue

            detections = _detect_regions(
                model, high_img, scale, imgsz, conf, device, padding_px
            )

            fig_count = 0
            for d in detections:
                if d["cls"] not in _ANCHOR_CLASSES:
                    continue
                x1, y1, x2, y2 = _merge_attachments(d["bbox"], detections)
                if x2 <= x1 or y2 <= y1:
                    continue  # clamping left nothing to crop
                crop = high_img[y1:y2, x1:x2]

                fig_count += 1
                name = f"page_{page_idx:03d}_fig_{fig_count:02d}.png"
                if prefix:
                    name = f"{prefix}_{name}"
                out_path = out_dir / name

                # OpenCV delivers BGR; PIL expects RGB.
                pil_img = Image.fromarray(
                    cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
                )
                meta = PngInfo()
                meta.add_text("source_pdf_title", pdf_title)
                meta.add_text("source_pdf_path", str(pdf_path))
                meta.add_text("source_page", str(page_idx))
                meta.add_text("anchor_class", d["cls"])
                meta.add_text("extractor", "PDFImageExtractorYOLO_grouped")
                pil_img.save(out_path, pnginfo=meta)
| # --------------------------- | |
| # CLI | |
| # --------------------------- | |
def main():
    """Command-line entry point: extract crops from one PDF or a folder of PDFs.

    Positional ``path`` is a PDF file or a directory that is searched
    recursively. ``--output`` selects a shared output directory, ``--model``
    the weights file passed to ``YOLOv10`` and ``--device`` the inference
    device.

    Raises:
        ValueError: ``path`` is neither an existing file nor a directory.
    """
    parser = argparse.ArgumentParser(
        description="Extract figures/tables with captions from PDFs using DocLayout-YOLO"
    )
    parser.add_argument("path", help="PDF file or folder (recursive)")
    parser.add_argument("--output", default=None)
    parser.add_argument("--model", default="doclayout_yolo_docstructbench_imgsz1024.pt")
    parser.add_argument("--device", default="cpu")
    args = parser.parse_args()

    # Resolve the work list first so that a bad path fails before the output
    # directory is created and before the model weights are loaded.
    input_path = Path(args.path).resolve()
    if input_path.is_file():
        pdfs = [input_path]
    elif input_path.is_dir():
        # Match the extension case-insensitively (".PDF" is common), ignore
        # directories that merely end in ".pdf", and sort for a reproducible
        # processing order.
        pdfs = sorted(
            candidate
            for candidate in input_path.rglob("*")
            if candidate.suffix.lower() == ".pdf" and candidate.is_file()
        )
    else:
        raise ValueError("Invalid input path")

    output_root = Path(args.output).resolve() if args.output else None
    if output_root:
        output_root.mkdir(parents=True, exist_ok=True)

    model = YOLOv10(args.model)

    for pdf in tqdm(pdfs, desc="PDFs", unit="pdf"):
        extract_images_from_pdf(pdf, model, output_root, device=args.device)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment