0187773933/PDFImageExtractorYOLO.py

## PDFImageExtractorYOLO.py
#!/usr/bin/env python3

import subprocess
import tempfile
from pathlib import Path
import argparse

import cv2
from doclayout_yolo import YOLOv10
from tqdm import tqdm
from PIL import Image
from PIL.PngImagePlugin import PngInfo

MAX_PAGES = 30  # per-PDF page cap: only the first MAX_PAGES pages are processed (longer PDFs are truncated, not skipped)


# ---------------------------
# PDF helpers
# ---------------------------

def get_pdf_page_count(pdf_path: Path) -> int:
	"""Return the page count that ``pdfinfo`` reports for *pdf_path*.

	Raises:
		subprocess.CalledProcessError: ``pdfinfo`` exited with a non-zero status.
		RuntimeError: the output contained no ``Pages:`` line.
	"""
	completed = subprocess.run(
		["pdfinfo", str(pdf_path)],
		capture_output=True,
		text=True,
		check=True,
	)
	# Only the first "Pages:" line is considered.
	pages_line = next(
		(entry for entry in completed.stdout.splitlines() if entry.startswith("Pages:")),
		None,
	)
	if pages_line is None:
		raise RuntimeError("Could not determine page count")
	return int(pages_line.split(":")[1].strip())


# ---------------------------
# Geometry helpers
# ---------------------------

def scale_bbox(b, scale, padding, w, h):
	"""Map an ``(x1, y1, x2, y2)`` box to the full-size image, pad it and clamp it.

	Every coordinate is divided by *scale* and truncated to an int; the box is
	then grown by *padding* on each side and clipped to ``[0, w] x [0, h]``.
	"""
	left, top, right, bottom = (int(coord / scale) for coord in b)
	return (
		max(0, left - padding),
		max(0, top - padding),
		min(w, right + padding),
		min(h, bottom + padding),
	)


def overlaps_x(a, b, frac=0.3):
	"""Tell whether two ``(x1, y1, x2, y2)`` boxes overlap enough horizontally.

	The shared x-extent must cover at least *frac* of the narrower box's width.
	Only the x coordinates are inspected.

	A box with zero or negative width never overlaps anything: without this
	guard the threshold ``frac * 0`` is met by an overlap of 0, so a degenerate
	box would "overlap" boxes it does not even touch.
	"""
	ax1, _, ax2, _ = a
	bx1, _, bx2, _ = b
	narrower = min(ax2 - ax1, bx2 - bx1)
	if narrower <= 0:
		return False
	overlap = max(0, min(ax2, bx2) - max(ax1, bx1))
	return overlap >= frac * narrower


def is_attached(anchor, other, max_gap=150):
	"""Decide whether *other* sits directly above or below *anchor*.

	Both arguments are ``(x1, y1, x2, y2)`` boxes. They count as attached when
	``overlaps_x`` accepts them and the vertical gap between the facing edges
	lies in ``[0, max_gap]`` on either side of the anchor.
	"""
	_, anchor_top, _, anchor_bottom = anchor
	_, other_top, _, other_bottom = other

	if not overlaps_x(anchor, other):
		return False

	gap_below = other_top - anchor_bottom  # other under the anchor (captions / footnotes)
	gap_above = anchor_top - other_bottom  # other over the anchor (titles)
	for gap in (gap_below, gap_above):
		if 0 <= gap <= max_gap:
			return True
	return False


# ---------------------------
# Main extraction logic
# ---------------------------

def extract_images_from_pdf(
	pdf_path: Path,
	model: YOLOv10,
	output_root: Path | None,
	dpi_high=400,
	dpi_low=120,
	imgsz=1024,
	conf=0.2,
	device="cpu",
	padding_px=10,
):
	"""Save every detected figure/table of a PDF, merged with nearby captions, as PNG.

	Each page (at most ``MAX_PAGES``) is rendered with ``pdftoppm`` at
	*dpi_high*, downscaled by ``dpi_low / dpi_high`` for layout detection, and
	every "figure"/"table" detection is cropped from the high-resolution render
	together with any caption, footnote or title box that ``is_attached``
	accepts. Crops are written as ``page_XXX_fig_YY.png`` with the source title,
	path, page and anchor class stored as PNG text chunks.

	Args:
		pdf_path: PDF file to process.
		model: Layout model; ``model.predict`` is called once per page.
		output_root: Shared output directory. When given, file names are
			prefixed with the PDF stem; when falsy, crops go to a sibling
			``<stem>-images`` directory and carry no prefix.
		dpi_high: Resolution used for rendering and for the saved crops.
		dpi_low: Resolution the page is downscaled to before detection.
		imgsz: Forwarded to ``model.predict``.
		conf: Forwarded to ``model.predict``.
		device: Forwarded to ``model.predict``.
		padding_px: Margin, in high-resolution pixels, added around every box.

	Raises:
		subprocess.CalledProcessError: ``pdfinfo`` (page count) or ``pdftoppm``
			exited with a non-zero status.
		RuntimeError: propagated from ``get_pdf_page_count`` when no page count
			could be found.
	"""
	pdf_path = pdf_path.resolve()

	# Count pages first so an unreadable PDF fails before any directory is created.
	page_count = min(get_pdf_page_count(pdf_path), MAX_PAGES)

	if output_root:
		out_dir = output_root
		prefix = pdf_path.stem
	else:
		out_dir = pdf_path.with_name(f"{pdf_path.stem}-images")
		prefix = ""
	out_dir.mkdir(parents=True, exist_ok=True)

	# A missing or blank Title field falls back to the file stem.
	pdf_title = _read_pdf_title(pdf_path) or pdf_path.stem

	scale = dpi_low / dpi_high

	anchors = {"figure", "table"}
	attachments = {
		"figure_caption",
		"table_caption",
		"table_footnote",
		"title",
	}

	with tempfile.TemporaryDirectory() as tmpdir:
		tmpdir = Path(tmpdir)

		for page_idx in tqdm(
			range(1, page_count + 1),
			desc=f"Pages ({pdf_path.name})",
			unit="page",
			leave=False,
		):
			out_base = tmpdir / f"page_{page_idx:04d}"

			subprocess.run(
				[
					"pdftoppm",
					"-png",
					"-r", str(dpi_high),
					"-f", str(page_idx),
					"-l", str(page_idx),
					str(pdf_path),
					str(out_base),
				],
				stdout=subprocess.DEVNULL,
				stderr=subprocess.DEVNULL,
				check=True,
			)

			# pdftoppm appends its own page-number suffix, hence the glob.
			matches = sorted(tmpdir.glob(out_base.name + "*.png"))
			if not matches:
				continue

			high_img = cv2.imread(str(matches[0]))
			# The render is only needed in memory; delete it right away so the
			# temporary directory does not accumulate one large PNG per page.
			for rendered in matches:
				rendered.unlink()
			if high_img is None:
				continue

			low_img = cv2.resize(
				high_img,
				None,
				fx=scale,
				fy=scale,
				interpolation=cv2.INTER_AREA,
			)

			det = model.predict(
				source=low_img,
				imgsz=imgsz,
				conf=conf,
				device=device,
			)

			if not det or det[0].boxes is None:
				continue

			H, W = high_img.shape[:2]
			names = det[0].names

			detections = []
			for box in det[0].boxes:
				cls_name = names.get(int(box.cls[0]), "")
				xyxy = tuple(box.xyxy[0].cpu().numpy())
				detections.append({
					"cls": cls_name,
					# padded box: what actually gets cropped
					"bbox": scale_bbox(xyxy, scale, padding_px, W, H),
					# unpadded box: used only for the attachment test below
					"raw": scale_bbox(xyxy, scale, 0, W, H),
				})

			fig_count = 0

			for d in detections:
				if d["cls"] not in anchors:
					continue

				union = list(d["bbox"])

				for o in detections:
					if o["cls"] not in attachments:
						continue

					# Padding moves the facing edges of two boxes towards each
					# other by 2 * padding_px, so a caption closer than that
					# looks like it overlaps the anchor and fails the
					# "gap >= 0" test. Testing the unpadded boxes as well
					# catches those; the padded test is kept so nothing that
					# matched before is lost.
					if (
						is_attached(d["bbox"], o["bbox"])
						or is_attached(d["raw"], o["raw"])
					):
						ox1, oy1, ox2, oy2 = o["bbox"]
						union[0] = min(union[0], ox1)
						union[1] = min(union[1], oy1)
						union[2] = max(union[2], ox2)
						union[3] = max(union[3], oy2)

				x1, y1, x2, y2 = union
				if x2 <= x1 or y2 <= y1:
					continue

				crop = high_img[y1:y2, x1:x2]
				fig_count += 1

				name = f"page_{page_idx:03d}_fig_{fig_count:02d}.png"
				if prefix:
					name = f"{prefix}_{name}"

				out_path = out_dir / name

				# OpenCV loads BGR; PIL expects RGB.
				pil_img = Image.fromarray(
					cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
				)

				meta = PngInfo()
				meta.add_text("source_pdf_title", pdf_title)
				meta.add_text("source_pdf_path", str(pdf_path))
				meta.add_text("source_page", str(page_idx))
				meta.add_text("anchor_class", d["cls"])
				meta.add_text("extractor", "PDFImageExtractorYOLO_grouped")

				pil_img.save(out_path, pnginfo=meta)


def _read_pdf_title(pdf_path):
	"""Best-effort read of the ``Title:`` field printed by ``pdfinfo``; "" if unavailable."""
	try:
		info = subprocess.run(
			["pdfinfo", str(pdf_path)],
			capture_output=True,
			text=True,
		)
	except (OSError, ValueError, subprocess.SubprocessError):
		# Missing tool or undecodable output: the title is optional metadata.
		return ""
	for line in info.stdout.splitlines():
		if line.lower().startswith("title:"):
			return line.split(":", 1)[1].strip()
	return ""


# ---------------------------
# CLI
# ---------------------------

def main():
	"""Command-line entry point: extract figures/tables from one PDF or a folder.

	Positional ``path`` is a PDF file or a directory searched recursively for
	files with a ``.pdf`` suffix (any letter case). ``--output`` selects a
	shared output directory, ``--model`` the weights file passed to
	``YOLOv10`` and ``--device`` the inference device.

	Raises:
		ValueError: ``path`` is neither an existing file nor a directory.
		SystemExit: at least one PDF failed; raised after the whole batch ran
			so the process still ends with a non-zero status.
	"""
	parser = argparse.ArgumentParser(
		description="Extract figures/tables with captions from PDFs using DocLayout-YOLO"
	)
	parser.add_argument("path", help="PDF file or folder (recursive)")
	parser.add_argument("--output", default=None)
	parser.add_argument("--model", default="doclayout_yolo_docstructbench_imgsz1024.pt")
	parser.add_argument("--device", default="cpu")
	args = parser.parse_args()

	input_path = Path(args.path).resolve()
	output_root = Path(args.output).resolve() if args.output else None

	# Validate the input before creating directories or loading the model.
	if input_path.is_file():
		pdfs = [input_path]
	elif input_path.is_dir():
		# Compare suffixes case-insensitively so "scan.PDF" is found as well;
		# sorting gives a stable processing order.
		pdfs = sorted(
			candidate
			for candidate in input_path.rglob("*")
			if candidate.is_file() and candidate.suffix.lower() == ".pdf"
		)
	else:
		raise ValueError("Invalid input path")

	if output_root:
		output_root.mkdir(parents=True, exist_ok=True)

	model = YOLOv10(args.model)

	failed = []
	for pdf in tqdm(pdfs, desc="PDFs", unit="pdf"):
		try:
			extract_images_from_pdf(pdf, model, output_root, device=args.device)
		except subprocess.CalledProcessError as err:
			# pdfinfo/pdftoppm rejected this file (corrupt, encrypted, ...):
			# report it and keep going instead of aborting the whole batch.
			failed.append(pdf)
			tqdm.write(f"Skipping {pdf}: {err}")

	if failed:
		raise SystemExit(
			f"{len(failed)} of {len(pdfs)} PDF(s) could not be processed"
		)


# Run the CLI only when this file is executed as a script, not when imported.
# (A whitespace-mangled second copy of the whole module had been pasted into
# this guard's body; it could not be parsed and is removed.)
if __name__ == "__main__":
	main()
No results found