noaione/export_poppler_batch.py

## export_poppler_batch.py
# This should be run second

import json
import subprocess as sp
from pathlib import Path

import pydantic


class DpiPrecalculation(pydantic.BaseModel):
    """One export group: pages of a PDF that share a DPI and a colour mode."""

    # Resolution handed to pdftoppm through its -r option.
    dpi: int
    # Page numbers of the group; runs of consecutive numbers are merged into
    # a single pdftoppm -f/-l range by make_grouped_command().
    pages: list[int]
    # False adds pdftoppm's -gray flag and a "_gray" suffix to the output folder.
    color: bool


class PdfDpiPrecalculation(pydantic.BaseModel):
    """Validated contents of one ``*.dpis.json`` file."""

    # Path of the PDF to rasterise, stored as a string.
    pdf_file: str
    # Export groups to run for that PDF.
    dpis: list[DpiPrecalculation]


# Directory containing this script; every *.dpis.json file in it is processed below
ROOT_DIR = Path(__file__).parent.resolve()


def make_grouped_command(file_path: Path, calculation: DpiPrecalculation) -> list[list[str]]:
    """Build the ``pdftoppm`` argument lists for one DPI group of a PDF.

    Consecutive page numbers in ``calculation.pages`` are merged into a single
    ``-f``/``-l`` range, so one command is produced per run of adjacent pages.

    Side effect: the output folder ``<pdf folder>/source/<pdf stem>`` (with a
    ``_gray`` suffix for non-colour groups) is created if it does not exist.

    Args:
        file_path: Path of the PDF to rasterise.
        calculation: DPI, page numbers and colour flag of the group.

    Returns:
        One argument list per page run, ready for ``subprocess.run``; empty
        when the group has no pages.
    """
    # Collapse the page list into runs of consecutive numbers.
    runs: list[list[int]] = []
    for number in calculation.pages:
        if runs and number == runs[-1][-1] + 1:
            runs[-1].append(number)
        else:
            runs.append([number])

    grayscale = not calculation.color
    stem = (file_path.stem + "_gray") if grayscale else file_path.stem
    target_dir = file_path.parent / "source" / stem
    target_dir.mkdir(parents=True, exist_ok=True)

    # Options shared by every command of this group.
    leading = ["pdftoppm", "-png", "-r", str(calculation.dpi), "-progress", "-cropbox"]
    if grayscale:
        leading.append("-gray")
    # Input file followed by the output prefix ("p") inside the target folder.
    trailing = [str(file_path), str(target_dir / "p")]

    return [
        [*leading, "-f", str(run[0]), "-l", str(run[-1]), *trailing]
        for run in runs
    ]


# Run every pdftoppm command described by the *.dpis.json files in ROOT_DIR.
for dpi_files in ROOT_DIR.glob("*.dpis.json"):
    print("Processing:", dpi_files)
    raw = json.loads(dpi_files.read_text(encoding="utf-8"))
    # strict=True rejects values that would need type coercion.
    precalc = PdfDpiPrecalculation.model_validate(raw, strict=True)
    pdf_file = Path(precalc.pdf_file)

    # Flatten the per-group command lists into one queue for this PDF.
    all_commands: list[list[str]] = [
        command
        for group in precalc.dpis
        for command in make_grouped_command(pdf_file, group)
    ]

    print(f"Total commands to run: {len(all_commands)}")
    for cmd in all_commands:
        print("Running command:", " ".join(cmd))
        # check=True raises on the first failing pdftoppm call, stopping the script.
        sp.run(cmd, check=True)

## precalc_dpi.py
# This should be run first

import json
from pathlib import Path

import pymupdf

# Upper bound for the computed export DPI; also the DPI assigned to pages
# that contain no images.
MAXIMUM_DPI = 600  # Maximum DPI we would ever use
# Lower-cased colorant names that is_colorspace_color() treats as non-colour
# inks when they appear in DeviceN / Separation colourspace names.
ACHROMATIC_NAMES = {
    "black",
    "gray",
    "grey",
    "darkgray",
    "darkgrey",
    "lightgray",
    "lightgrey",
}

ROOT_DIR = Path(__file__).parent.resolve()


def dpi_from_height(page_height_pt: int, img_height_px: int) -> int:
    """Return the DPI at which ``img_height_px`` pixels span the page height.

    The page height is given in points (72 points per inch). The result is
    truncated to an integer and never drops below 72.
    """
    points_per_inch = 72.0
    minimum_dpi = 72
    raw_dpi = img_height_px / (page_height_pt / points_per_inch)
    truncated = int(raw_dpi)
    return truncated if truncated > minimum_dpi else minimum_dpi


def round_down_to_nearest_5(n: int) -> int:
    """Return the largest multiple of 5 that does not exceed ``n``."""
    return n - n % 5


def is_colorspace_color(colorspace_name: str | None) -> bool:
    """
    Parses a PyMuPDF cs-name string to determine if it's "color" or "gray".
    Returns True for color, False for gray.

    This code is written by Google Gemini since I'm lazy as fuck
    """
    if not colorspace_name:
        return False  # No colorspace info, assume monochrome

    # Normalize for comparison
    cs_name_lower = colorspace_name.lower()

    # 1. Simple, direct "Gray" cases
    if cs_name_lower in ("devicegray", "calgray", "none"):
        return False

    # 2. Simple, direct "Color" cases
    if cs_name_lower in ("devicergb", "calrgb", "devicecmyk", "lab", "iccbased"):
        return True

    # 3. Handle "Indexed(..., BaseSpace)"
    if colorspace_name.startswith("Indexed("):
        try:
            # Get content inside parens: e.g., "0,DeviceCMYK"
            content = colorspace_name[8:-1]
            # Find the base space, which is the last argument
            base_space = content.split(",")[-1].strip()
            # Recurse to find out what the base space is
            return is_colorspace_color(base_space)
        except Exception:
            return True  # Fail safe to color

    # 4. Handle "DeviceN(count, base, colorant1, ...)"
    # This is the most complex one, especially for your "Black" only example.
    if colorspace_name.startswith("DeviceN("):
        try:
            # Get content inside parens: e.g., "1,DeviceCMYK,Black"
            content = colorspace_name[8:-1]
            parts = [p.strip() for p in content.split(",")]

            # The colorants start from the 3rd item (index 2)
            colorants = parts[2:]  # e.g., ["Black"] or ["Magenta", "Yellow", "PANTONE 231 C"]

            if not colorants:
                return True  # No colorants specified? Fail safe to color.

            # Check if ALL colorants are achromatic
            for colorant in colorants:
                # Strip quotes just in case: 'Black'
                clean_colorant = colorant.strip("'\"").lower()
                if clean_colorant not in ACHROMATIC_NAMES:
                    # Found a chromatic colorant (like "Magenta", "PANTONE", "Red")
                    return True

            # If we get here, all colorants were in the achromatic list (e.g., just "Black")
            return False
        except Exception:
            return True  # Fail safe to color

    # 5. Handle "Separation(name, base, ...)"
    # A spot color. We just check the colorant's name.
    if colorspace_name.startswith("Separation("):
        try:
            content = colorspace_name[11:-1]
            parts = [p.strip() for p in content.split(",")]
            # The colorant name is the first item
            colorant_name = parts[0].strip("'\"")  # e.g., "PANTONE 231 C" or "Black"

            if colorant_name.lower() in ACHROMATIC_NAMES:
                return False
            else:
                return True  # "Magenta", "PANTONE", etc. are treated as color
        except Exception:
            return True  # Fail safe to color

    # 6. Fallback for unknowns (Pattern, etc.)
    # If we don't recognize it, assume color to be safe.
    return True


# Scan every PDF in ROOT_DIR and record, per page, the DPI and colour mode to
# export at. The result is written next to the PDF as "<pdf stem>.dpis.json".
for pdf_file in ROOT_DIR.glob("*.pdf"):
    print("Processing:", pdf_file)
    collected_dpis = {}  # dpi -> page numbers (1-based) exported in grayscale
    collected_color_dpis = {}  # dpi -> page numbers (1-based) exported in colour
    # The context manager closes the document even when page analysis raises.
    with pymupdf.open(pdf_file) as doc:
        total_pages = doc.page_count
        for page_number in range(total_pages):
            idx = page_number + 1  # load_page() is 0-based, the output is 1-based
            page = doc.load_page(page_number)
            page_height_pt = page.rect.height

            images = page.get_image_info()
            if not images:
                # No images, export at maximum DPI.
                # NOTE(review): such pages always land in the grayscale group,
                # whatever colour their non-image content has — confirm intended.
                collected_dpis.setdefault(MAXIMUM_DPI, []).append(idx)
                continue

            # Only images whose pixel height is at least 1.25x their own pixel
            # width are used as DPI candidates.
            images_candidates = []
            has_color = False
            for img in images:
                img_width = img.get("width", 0)  # width in pixels
                img_height = img.get("height", 0)  # height in pixels
                cs_name = img.get("cs-name")  # colorspace name
                # A single colour image marks the whole page as colour.
                if not has_color and is_colorspace_color(cs_name):
                    has_color = True
                if img_height >= 1.25 * img_width:
                    images_candidates.append(dpi_from_height(page_height_pt, img_height))

            # Highest candidate DPI, capped at MAXIMUM_DPI; pages without any
            # candidate fall back to MAXIMUM_DPI.
            max_dpi = min(max(images_candidates, default=MAXIMUM_DPI), MAXIMUM_DPI)
            rounded_dpi = round_down_to_nearest_5(max_dpi)
            target = collected_color_dpis if has_color else collected_dpis
            target.setdefault(rounded_dpi, []).append(idx)

    # Change the output from DPI -> list of pages to an array of {dpi, pages, color}
    formatted_dpis = [
        {"dpi": dpi, "pages": pages, "color": False}
        for dpi, pages in sorted(collected_dpis.items())
    ]
    formatted_color_dpis = [
        {"dpi": dpi, "pages": pages, "color": True}
        for dpi, pages in sorted(collected_color_dpis.items())
    ]
    collected_pages = {
        "pdf_file": str(pdf_file.resolve()),
        "dpis": formatted_dpis + formatted_color_dpis,
    }
    output_file = pdf_file.with_suffix(".dpis.json")
    # The export script reads this file as UTF-8, so write it the same way.
    output_file.write_text(json.dumps(collected_pages, indent=4), encoding="utf-8")
	# This should be run second

	import json
	import subprocess as sp
	from pathlib import Path

	import pydantic


	class DpiPrecalculation(pydantic.BaseModel):
	    """One export group: pages of a PDF that share a DPI and a colour mode."""

	    # Resolution handed to pdftoppm through its -r option.
	    dpi: int
	    # Page numbers of the group; runs of consecutive numbers are merged into
	    # a single pdftoppm -f/-l range by make_grouped_command().
	    pages: list[int]
	    # False adds pdftoppm's -gray flag and a "_gray" suffix to the output folder.
	    color: bool


	class PdfDpiPrecalculation(pydantic.BaseModel):
	    """Validated contents of one ``*.dpis.json`` file."""

	    # Path of the PDF to rasterise, stored as a string.
	    pdf_file: str
	    # Export groups to run for that PDF.
	    dpis: list[DpiPrecalculation]


	# Directory containing this script; every *.dpis.json file in it is processed below
	ROOT_DIR = Path(__file__).parent.resolve()


	def make_grouped_command(file_path: Path, calculation: DpiPrecalculation) -> list[list[str]]:
	    """Build the ``pdftoppm`` argument lists for one DPI group of a PDF.

	    Consecutive page numbers in ``calculation.pages`` are merged into a single
	    ``-f``/``-l`` range, so one command is produced per run of adjacent pages.

	    Side effect: the output folder ``<pdf folder>/source/<pdf stem>`` (with a
	    ``_gray`` suffix for non-colour groups) is created if it does not exist.

	    Args:
	        file_path: Path of the PDF to rasterise.
	        calculation: DPI, page numbers and colour flag of the group.

	    Returns:
	        One argument list per page run, ready for ``subprocess.run``; empty
	        when the group has no pages.
	    """
	    # We want to group multiple pages together for the same DPI
	    grouped_pages: list[list[int]] = []
	    for page in calculation.pages:
	        if grouped_pages and page == grouped_pages[-1][-1] + 1:
	            grouped_pages[-1].append(page)
	        else:
	            grouped_pages.append([page])

	    base_cmds = [
	        "pdftoppm",
	        "-png",
	        "-r",
	        str(calculation.dpi),
	        "-progress",
	        "-cropbox",
	    ]

	    folder_path = file_path.parent
	    new_stem = file_path.stem
	    if not calculation.color:
	        # Grayscale groups get their own folder and pdftoppm's -gray flag.
	        new_stem += "_gray"
	        base_cmds.append("-gray")

	    output_dir = folder_path / "source" / new_stem
	    output_dir.mkdir(parents=True, exist_ok=True)

	    commands_finals: list[list[str]] = []
	    for page_group in grouped_pages:
	        commands_finals.append(
	            [
	                *base_cmds,
	                "-f",
	                str(page_group[0]),
	                "-l",
	                str(page_group[-1]),
	                str(file_path),
	                # Output prefix: files are written as "p..." inside output_dir.
	                str(output_dir / "p"),
	            ]
	        )
	    return commands_finals


	# Run every pdftoppm command described by the *.dpis.json files in ROOT_DIR.
	for dpi_files in ROOT_DIR.glob("*.dpis.json"):
	    print("Processing:", dpi_files)
	    with dpi_files.open("r", encoding="utf-8") as f:
	        data = json.load(f)
	    # strict=True rejects values that would need type coercion.
	    precalc = PdfDpiPrecalculation.model_validate(data, strict=True)
	    pdf_file = Path(precalc.pdf_file)

	    # Collect the commands of every DPI group of this PDF into one queue.
	    all_commands: list[list[str]] = []
	    for dpi_calc in precalc.dpis:
	        all_commands.extend(make_grouped_command(pdf_file, dpi_calc))

	    print(f"Total commands to run: {len(all_commands)}")
	    for cmd in all_commands:
	        print("Running command:", " ".join(cmd))
	        # check=True raises on the first failing pdftoppm call, stopping the script.
	        sp.run(cmd, check=True)
	# This should be run first

	import json
	from pathlib import Path

	import pymupdf

	# Upper bound for the computed export DPI; also the DPI assigned to pages
	# that contain no images.
	MAXIMUM_DPI = 600 # Maximum DPI we would ever use
	# Lower-cased colorant names that is_colorspace_color() treats as non-colour
	# inks when they appear in DeviceN / Separation colourspace names.
	ACHROMATIC_NAMES = {
	"black",
	"gray",
	"grey",
	"darkgray",
	"darkgrey",
	"lightgray",
	"lightgrey",
	}

	ROOT_DIR = Path(__file__).parent.resolve()


	def dpi_from_height(page_height_pt: int, img_height_px: int) -> int:
	    """Return the DPI at which ``img_height_px`` pixels span the page height.

	    The page height is given in points (72 points per inch). The result is
	    truncated to an integer and never drops below 72.
	    """
	    # 1 inch = 72 points
	    page_height_inch = page_height_pt / 72.0
	    dpi = img_height_px / page_height_inch
	    return max(int(dpi), 72)  # Minimum DPI is 72


	def round_down_to_nearest_5(n: int) -> int:
	    """Return the largest multiple of 5 that does not exceed ``n``."""
	    return (n // 5) * 5


	def is_colorspace_color(colorspace_name: str \| None) -> bool:
	"""
	Parses a PyMuPDF cs-name string to determine if it's "color" or "gray".
	Returns True for color, False for gray.

	This code is written by Google Gemini since I'm lazy as fuck
	"""
	if not colorspace_name:
	return False # No colorspace info, assume monochrome

	# Normalize for comparison
	cs_name_lower = colorspace_name.lower()

	# 1. Simple, direct "Gray" cases
	if cs_name_lower in ("devicegray", "calgray", "none"):
	return False

	# 2. Simple, direct "Color" cases
	if cs_name_lower in ("devicergb", "calrgb", "devicecmyk", "lab", "iccbased"):
	return True

	# 3. Handle "Indexed(..., BaseSpace)"
	if colorspace_name.startswith("Indexed("):
	try:
	# Get content inside parens: e.g., "0,DeviceCMYK"
	content = colorspace_name[8:-1]
	# Find the base space, which is the last argument
	base_space = content.split(",")[-1].strip()
	# Recurse to find out what the base space is
	return is_colorspace_color(base_space)
	except Exception:
	return True # Fail safe to color

	# 4. Handle "DeviceN(count, base, colorant1, ...)"
	# This is the most complex one, especially for your "Black" only example.
	if colorspace_name.startswith("DeviceN("):
	try:
	# Get content inside parens: e.g., "1,DeviceCMYK,Black"
	content = colorspace_name[8:-1]
	parts = [p.strip() for p in content.split(",")]

	# The colorants start from the 3rd item (index 2)
	colorants = parts[2:] # e.g., ["Black"] or ["Magenta", "Yellow", "PANTONE 231 C"]

	if not colorants:
	return True # No colorants specified? Fail safe to color.

	# Check if ALL colorants are achromatic
	for colorant in colorants:
	# Strip quotes just in case: 'Black'
	clean_colorant = colorant.strip("'\"").lower()
	if clean_colorant not in ACHROMATIC_NAMES:
	# Found a chromatic colorant (like "Magenta", "PANTONE", "Red")
	return True

	# If we get here, all colorants were in the achromatic list (e.g., just "Black")
	return False
	except Exception:
	return True # Fail safe to color

	# 5. Handle "Separation(name, base, ...)"
	# A spot color. We just check the colorant's name.
	if colorspace_name.startswith("Separation("):
	try:
	content = colorspace_name[11:-1]
	parts = [p.strip() for p in content.split(",")]
	# The colorant name is the first item
	colorant_name = parts[0].strip("'\"") # e.g., "PANTONE 231 C" or "Black"

	if colorant_name.lower() in ACHROMATIC_NAMES:
	return False
	else:
	return True # "Magenta", "PANTONE", etc. are treated as color
	except Exception:
	return True # Fail safe to color

	# 6. Fallback for unknowns (Pattern, etc.)
	# If we don't recognize it, assume color to be safe.
	return True


	# Scan every PDF in ROOT_DIR and record, per page, the DPI and colour mode to
	# export at. The result is written next to the PDF as "<pdf stem>.dpis.json".
	for pdf_file in ROOT_DIR.glob("*.pdf"):
	    print("Processing:", pdf_file)
	    collected_dpis = {}  # dpi -> page numbers (1-based) exported in grayscale
	    collected_color_dpis = {}  # dpi -> page numbers (1-based) exported in colour
	    # The context manager closes the document even when page analysis raises.
	    with pymupdf.open(pdf_file) as doc:
	        total_pages = doc.page_count
	        for page_number in range(total_pages):
	            idx = page_number + 1  # load_page() is 0-based, the output is 1-based
	            page = doc.load_page(page_number)
	            page_height_pt = page.rect.height

	            images = page.get_image_info()
	            if not images:
	                # No images, export at maximum DPI.
	                # NOTE(review): such pages always land in the grayscale group,
	                # whatever colour their non-image content has — confirm intended.
	                collected_dpis.setdefault(MAXIMUM_DPI, []).append(idx)
	                continue

	            # Only images whose pixel height is at least 1.25x their own pixel
	            # width are used as DPI candidates.
	            images_candidates = []
	            has_color = False
	            for img in images:
	                img_width = img.get("width", 0)  # width in pixels
	                img_height = img.get("height", 0)  # height in pixels
	                cs_name = img.get("cs-name")  # colorspace name
	                # A single colour image marks the whole page as colour.
	                if not has_color and is_colorspace_color(cs_name):
	                    has_color = True
	                if img_height >= 1.25 * img_width:
	                    # Okay, calculate DPI
	                    calc_dpi = dpi_from_height(page_height_pt, img_height)
	                    images_candidates.append(calc_dpi)

	            # Highest candidate DPI, capped at MAXIMUM_DPI; pages without any
	            # candidate fall back to MAXIMUM_DPI.
	            max_dpi = min(max(images_candidates) if images_candidates else MAXIMUM_DPI, MAXIMUM_DPI)
	            rounded_dpi = round_down_to_nearest_5(max_dpi)
	            if has_color:
	                collected_color_dpis.setdefault(rounded_dpi, []).append(idx)
	            else:
	                collected_dpis.setdefault(rounded_dpi, []).append(idx)

	    # Change the output from DPI -> list of pages to an array of {dpi, pages, color}
	    formatted_dpis = []
	    for dpi, pages in sorted(collected_dpis.items()):
	        formatted_dpis.append({"dpi": dpi, "pages": pages, "color": False})
	    formatted_color_dpis = []
	    for dpi, pages in sorted(collected_color_dpis.items()):
	        formatted_color_dpis.append({"dpi": dpi, "pages": pages, "color": True})
	    collected_pages = {
	        "pdf_file": str(pdf_file.resolve()),
	        "dpis": formatted_dpis + formatted_color_dpis,
	    }
	    output_file = pdf_file.with_suffix(".dpis.json")
	    # The export script reads this file as UTF-8, so write it the same way.
	    output_file.write_text(json.dumps(collected_pages, indent=4), encoding="utf-8")