Skip to content

Instantly share code, notes, and snippets.

@noaione
Last active November 1, 2025 00:17
Show Gist options
  • Select an option

  • Save noaione/5cb9874f33c357162906eb3756eccae3 to your computer and use it in GitHub Desktop.

Select an option

Save noaione/5cb9874f33c357162906eb3756eccae3 to your computer and use it in GitHub Desktop.
batch precalc dpi + exporting (2 files)
# This should be run second
import json
import subprocess as sp
from pathlib import Path
import pydantic
class DpiPrecalculation(pydantic.BaseModel):
    """One export bucket: a set of pages rendered with the same settings."""

    # Resolution handed to ``pdftoppm -r`` for every page in this bucket.
    dpi: int
    # Page numbers that belong to this bucket.
    pages: list[int]
    # False means the bucket is rendered with ``-gray``.
    color: bool
class PdfDpiPrecalculation(pydantic.BaseModel):
    """Contents of one ``*.dpis.json`` file: a PDF plus its export buckets."""

    # Path of the PDF to render, as stored in the JSON file.
    pdf_file: str
    # Every DPI/color bucket computed for that PDF.
    dpis: list[DpiPrecalculation]
# Folder that contains this script (not the process working directory);
# the *.dpis.json files are globbed from here.
ROOT_DIR = Path(__file__).parent.resolve()
def make_grouped_command(file_path: Path, calculation: DpiPrecalculation) -> list[list[str]]:
    """Build the ``pdftoppm`` command lines that render one DPI bucket.

    Consecutive page numbers are merged into a single ``-f``/``-l`` range so
    each contiguous run needs only one process launch. The page list is
    sorted and de-duplicated first, so an unordered or repeated list still
    yields the minimal set of ranges and no page is rendered twice.

    Side effect: the output directory ``<pdf folder>/source/<pdf stem>`` is
    created (the stem gets a ``_gray`` suffix for non-color buckets).

    Args:
        file_path: The PDF to render.
        calculation: Bucket describing the DPI, the pages and whether the
            pages are rendered in color.

    Returns:
        One argument list per contiguous page range, ready to be passed to
        ``subprocess.run``. Empty when the bucket has no pages.
    """
    # Collapse the pages into inclusive (first, last) runs.
    page_runs: list[tuple[int, int]] = []
    for page in sorted(set(calculation.pages)):
        if page_runs and page == page_runs[-1][1] + 1:
            page_runs[-1] = (page_runs[-1][0], page)
        else:
            page_runs.append((page, page))

    base_cmd = [
        "pdftoppm",
        "-png",
        "-r",
        str(calculation.dpi),
        "-progress",
        "-cropbox",
    ]
    stem = file_path.stem
    if not calculation.color:
        # Grayscale buckets get their own folder and the -gray switch.
        stem += "_gray"
        base_cmd.append("-gray")

    output_dir = file_path.parent / "source" / stem
    output_dir.mkdir(parents=True, exist_ok=True)
    # Passed as the last pdftoppm argument, after the input file.
    output_prefix = str(output_dir / "p")

    return [
        [*base_cmd, "-f", str(first), "-l", str(last), str(file_path), output_prefix]
        for first, last in page_runs
    ]
# Replay every precalculated plan found in ROOT_DIR: validate the JSON,
# expand it into pdftoppm command lines, then run them one after another.
for dpi_files in ROOT_DIR.glob("*.dpis.json"):
    print("Processing:", dpi_files)
    data = json.loads(dpi_files.read_text(encoding="utf-8"))
    precalc = PdfDpiPrecalculation.model_validate(data, strict=True)
    pdf_file = Path(precalc.pdf_file)

    # Flatten the per-bucket command lists, keeping bucket order.
    all_commands: list[list[str]] = [
        cmd
        for dpi_calc in precalc.dpis
        for cmd in make_grouped_command(pdf_file, dpi_calc)
    ]
    print(f"Total commands to run: {len(all_commands)}")

    for cmd in all_commands:
        print("Running command:", " ".join(cmd))
        # check=True stops the whole script on the first failing command.
        sp.run(cmd, check=True)
# This should be run first
import json
from pathlib import Path
import pymupdf
# Upper bound for any export resolution; also used when a page offers no
# usable image to derive a DPI from.
MAXIMUM_DPI = 600

# Lower-case colorant names that count as "no color" when classifying
# DeviceN / Separation colorspaces.
ACHROMATIC_NAMES = {
    "black",
    "gray", "grey",
    "darkgray", "darkgrey",
    "lightgray", "lightgrey",
}

# Folder that contains this script; the PDFs to analyse are globbed from here.
ROOT_DIR = Path(__file__).parent.resolve()
def dpi_from_height(page_height_pt: float, img_height_px: int) -> int:
    """Return the DPI at which an image of the given height fills the page.

    Args:
        page_height_pt: Page height in PDF points (72 points per inch). The
            caller passes ``page.rect.height``, so fractional values are
            accepted.
        img_height_px: Image height in pixels.

    Returns:
        The resolution truncated to an integer, never less than 72.

    Raises:
        ZeroDivisionError: If ``page_height_pt`` is zero.
    """
    # 1 inch = 72 points
    page_height_inch = page_height_pt / 72.0
    dpi = img_height_px / page_height_inch
    # Minimum DPI is 72
    return max(int(dpi), 72)
def round_down_to_nearest_5(n: int) -> int:
    """Return the largest multiple of 5 that does not exceed *n*."""
    remainder = n % 5
    return n - remainder
def is_colorspace_color(colorspace_name: str | None) -> bool:
"""
Parses a PyMuPDF cs-name string to determine if it's "color" or "gray".
Returns True for color, False for gray.
This code is written by Google Gemini since I'm lazy as fuck
"""
if not colorspace_name:
return False # No colorspace info, assume monochrome
# Normalize for comparison
cs_name_lower = colorspace_name.lower()
# 1. Simple, direct "Gray" cases
if cs_name_lower in ("devicegray", "calgray", "none"):
return False
# 2. Simple, direct "Color" cases
if cs_name_lower in ("devicergb", "calrgb", "devicecmyk", "lab", "iccbased"):
return True
# 3. Handle "Indexed(..., BaseSpace)"
if colorspace_name.startswith("Indexed("):
try:
# Get content inside parens: e.g., "0,DeviceCMYK"
content = colorspace_name[8:-1]
# Find the base space, which is the last argument
base_space = content.split(",")[-1].strip()
# Recurse to find out what the base space is
return is_colorspace_color(base_space)
except Exception:
return True # Fail safe to color
# 4. Handle "DeviceN(count, base, colorant1, ...)"
# This is the most complex one, especially for your "Black" only example.
if colorspace_name.startswith("DeviceN("):
try:
# Get content inside parens: e.g., "1,DeviceCMYK,Black"
content = colorspace_name[8:-1]
parts = [p.strip() for p in content.split(",")]
# The colorants start from the 3rd item (index 2)
colorants = parts[2:] # e.g., ["Black"] or ["Magenta", "Yellow", "PANTONE 231 C"]
if not colorants:
return True # No colorants specified? Fail safe to color.
# Check if ALL colorants are achromatic
for colorant in colorants:
# Strip quotes just in case: 'Black'
clean_colorant = colorant.strip("'\"").lower()
if clean_colorant not in ACHROMATIC_NAMES:
# Found a chromatic colorant (like "Magenta", "PANTONE", "Red")
return True
# If we get here, all colorants were in the achromatic list (e.g., just "Black")
return False
except Exception:
return True # Fail safe to color
# 5. Handle "Separation(name, base, ...)"
# A spot color. We just check the colorant's name.
if colorspace_name.startswith("Separation("):
try:
content = colorspace_name[11:-1]
parts = [p.strip() for p in content.split(",")]
# The colorant name is the first item
colorant_name = parts[0].strip("'\"") # e.g., "PANTONE 231 C" or "Black"
if colorant_name.lower() in ACHROMATIC_NAMES:
return False
else:
return True # "Magenta", "PANTONE", etc. are treated as color
except Exception:
return True # Fail safe to color
# 6. Fallback for unknowns (Pattern, etc.)
# If we don't recognize it, assume color to be safe.
return True
# For every PDF in ROOT_DIR, decide per page which DPI and color mode to
# export with, and store the plan next to the PDF as "<name>.dpis.json".
for pdf_file in ROOT_DIR.glob("*.pdf"):
    print("Processing:", pdf_file)

    # Page numbers bucketed by (is_color, dpi). Keeping the color flag first
    # makes a plain sort emit the gray buckets before the color ones, each
    # group ordered by ascending DPI.
    page_buckets: dict[tuple[bool, int], list[int]] = {}

    # The context manager closes the document even when a page raises.
    with pymupdf.open(pdf_file) as doc:
        for page_number in range(doc.page_count):
            idx = page_number + 1  # stored page numbers are 1-based
            page = doc.load_page(page_number)
            page_height_pt = page.rect.height
            images = page.get_image_info()
            if not images:
                # No images, export at maximum DPI (in the gray bucket).
                page_buckets.setdefault((False, MAXIMUM_DPI), []).append(idx)
                continue

            # Only images whose height is at least 1.25x their own width
            # contribute a DPI candidate.
            candidate_dpis: list[int] = []
            has_color = False
            for img in images:
                img_width = img.get("width", 0)  # width in pixels
                img_height = img.get("height", 0)  # height in pixels
                # One color image is enough to mark the whole page as color.
                if not has_color and is_colorspace_color(img.get("cs-name")):
                    has_color = True
                if img_height >= 1.25 * img_width:
                    candidate_dpis.append(dpi_from_height(page_height_pt, img_height))

            # Best candidate, capped at MAXIMUM_DPI; without any candidate
            # the page falls back to MAXIMUM_DPI.
            max_dpi = min(max(candidate_dpis, default=MAXIMUM_DPI), MAXIMUM_DPI)
            rounded_dpi = round_down_to_nearest_5(max_dpi)
            page_buckets.setdefault((has_color, rounded_dpi), []).append(idx)

    # Change the layout from {(color, dpi): pages} to a list of
    # {dpi, pages, color} records.
    collected_pages = {
        "pdf_file": str(pdf_file.resolve()),
        "dpis": [
            {"dpi": dpi, "pages": pages, "color": color}
            for (color, dpi), pages in sorted(page_buckets.items())
        ],
    }
    output_file = pdf_file.with_suffix(".dpis.json")
    # Written as UTF-8 explicitly because the export script reads it as UTF-8.
    output_file.write_text(json.dumps(collected_pages, indent=4), encoding="utf-8")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment