Last active
November 1, 2025 00:17
-
-
Save noaione/5cb9874f33c357162906eb3756eccae3 to your computer and use it in GitHub Desktop.
batch precalc dpi + exporting (2 files)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # This should be run second | |
| import json | |
| import subprocess as sp | |
| from pathlib import Path | |
| import pydantic | |
# One export job: the pages of a PDF that share a DPI and a color mode.
class DpiPrecalculation(pydantic.BaseModel):
    # Resolution handed to pdftoppm through "-r".
    dpi: int
    # Page numbers to export; consecutive runs are merged into -f/-l ranges
    # by make_grouped_command().
    pages: list[int]
    # False adds "-gray" to the command and "_gray" to the output folder name.
    color: bool
# Schema of one "*.dpis.json" file: a PDF plus the export jobs computed for it.
class PdfDpiPrecalculation(pydantic.BaseModel):
    # Path of the PDF to export, exactly as stored in the JSON file.
    pdf_file: str
    # Export jobs to run for that PDF.
    dpis: list[DpiPrecalculation]
# Directory containing this script; every "*.dpis.json" file in it is processed below.
ROOT_DIR = Path(__file__).parent.resolve()
def make_grouped_command(file_path: Path, calculation: "DpiPrecalculation") -> list[list[str]]:
    """Build the pdftoppm invocations needed to export one DPI group.

    Consecutive page numbers are merged into a single ``-f``/``-l`` range so
    that each contiguous run costs one pdftoppm call. Pages are sorted and
    de-duplicated first, so an unordered or repeated page list still yields
    the minimal set of ranges and no page is rendered twice.

    Side effect: creates ``<pdf folder>/source/<pdf stem>`` (with a ``_gray``
    suffix for non-color groups) so the commands have a directory to write to.

    Args:
        file_path: Path of the PDF to export.
        calculation: DPI, page numbers and color flag of this group.

    Returns:
        One argument list per contiguous page range, ready to be passed to
        ``subprocess.run``. Empty when the group has no pages.
    """
    # Collapse the page list into inclusive (first, last) ranges.
    page_ranges: list[tuple[int, int]] = []
    for page in sorted(set(calculation.pages)):
        if page_ranges and page == page_ranges[-1][1] + 1:
            page_ranges[-1] = (page_ranges[-1][0], page)
        else:
            page_ranges.append((page, page))

    base_cmds = [
        "pdftoppm",
        "-png",
        "-r",
        str(calculation.dpi),
        "-progress",
        "-cropbox",
    ]
    new_stem = file_path.stem
    if not calculation.color:
        # Gray exports get their own folder and the grayscale flag.
        new_stem += "_gray"
        base_cmds.append("-gray")

    output_dir = file_path.parent / "source" / new_stem
    output_dir.mkdir(parents=True, exist_ok=True)
    # "p" is the file-name prefix given to pdftoppm as its output root.
    output_prefix = str(output_dir / "p")

    return [
        [*base_cmds, "-f", str(first), "-l", str(last), str(file_path), output_prefix]
        for first, last in page_ranges
    ]
# Export every PDF that has a precalculated "*.dpis.json" next to this script.
for dpi_files in ROOT_DIR.glob("*.dpis.json"):
    print("Processing:", dpi_files)
    with dpi_files.open("r", encoding="utf-8") as f:
        data = json.load(f)
    precalc = PdfDpiPrecalculation.model_validate(data, strict=True)

    pdf_file = Path(precalc.pdf_file)
    # Flatten the per-DPI command lists into one queue for this PDF.
    all_commands: list[list[str]] = [
        command
        for dpi_calc in precalc.dpis
        for command in make_grouped_command(pdf_file, dpi_calc)
    ]
    print(f"Total commands to run: {len(all_commands)}")

    # check=True aborts the whole run on the first failing pdftoppm call.
    for cmd in all_commands:
        print("Running command:", " ".join(cmd))
        sp.run(cmd, check=True)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # This should be run first | |
| import json | |
| from pathlib import Path | |
| import pymupdf | |
# Upper bound for the export DPI; pages without images or without usable
# image candidates are exported at exactly this value.
MAXIMUM_DPI = 600
# Lower-case colorant names that is_colorspace_color() treats as non-color.
ACHROMATIC_NAMES = {
    "black",
    "gray",
    "grey",
    "darkgray",
    "darkgrey",
    "lightgray",
    "lightgrey",
}
# Directory containing this script; every "*.pdf" file in it is scanned below.
ROOT_DIR = Path(__file__).parent.resolve()
def dpi_from_height(page_height_pt: float, img_height_px: int) -> int:
    """Return the DPI at which an image of the given height fills the page height.

    Args:
        page_height_pt: Page height in PDF points (1 inch = 72 points). The
            caller passes ``page.rect.height``, which may be fractional.
        img_height_px: Image height in pixels.

    Returns:
        The truncated DPI, never lower than 72. A page height of zero or less
        also yields 72 instead of raising ``ZeroDivisionError``, matching how
        negative heights were already clamped.
    """
    minimum_dpi = 72
    if page_height_pt <= 0:
        # Degenerate page box: there is no meaningful ratio to compute.
        return minimum_dpi
    page_height_inch = page_height_pt / 72.0
    dpi = img_height_px / page_height_inch
    return max(int(dpi), minimum_dpi)
def round_down_to_nearest_5(n: int) -> int:
    """Return the largest multiple of 5 that does not exceed ``n``."""
    return n - n % 5
| def is_colorspace_color(colorspace_name: str | None) -> bool: | |
| """ | |
| Parses a PyMuPDF cs-name string to determine if it's "color" or "gray". | |
| Returns True for color, False for gray. | |
| This code is written by Google Gemini since I'm lazy as fuck | |
| """ | |
| if not colorspace_name: | |
| return False # No colorspace info, assume monochrome | |
| # Normalize for comparison | |
| cs_name_lower = colorspace_name.lower() | |
| # 1. Simple, direct "Gray" cases | |
| if cs_name_lower in ("devicegray", "calgray", "none"): | |
| return False | |
| # 2. Simple, direct "Color" cases | |
| if cs_name_lower in ("devicergb", "calrgb", "devicecmyk", "lab", "iccbased"): | |
| return True | |
| # 3. Handle "Indexed(..., BaseSpace)" | |
| if colorspace_name.startswith("Indexed("): | |
| try: | |
| # Get content inside parens: e.g., "0,DeviceCMYK" | |
| content = colorspace_name[8:-1] | |
| # Find the base space, which is the last argument | |
| base_space = content.split(",")[-1].strip() | |
| # Recurse to find out what the base space is | |
| return is_colorspace_color(base_space) | |
| except Exception: | |
| return True # Fail safe to color | |
| # 4. Handle "DeviceN(count, base, colorant1, ...)" | |
| # This is the most complex one, especially for your "Black" only example. | |
| if colorspace_name.startswith("DeviceN("): | |
| try: | |
| # Get content inside parens: e.g., "1,DeviceCMYK,Black" | |
| content = colorspace_name[8:-1] | |
| parts = [p.strip() for p in content.split(",")] | |
| # The colorants start from the 3rd item (index 2) | |
| colorants = parts[2:] # e.g., ["Black"] or ["Magenta", "Yellow", "PANTONE 231 C"] | |
| if not colorants: | |
| return True # No colorants specified? Fail safe to color. | |
| # Check if ALL colorants are achromatic | |
| for colorant in colorants: | |
| # Strip quotes just in case: 'Black' | |
| clean_colorant = colorant.strip("'\"").lower() | |
| if clean_colorant not in ACHROMATIC_NAMES: | |
| # Found a chromatic colorant (like "Magenta", "PANTONE", "Red") | |
| return True | |
| # If we get here, all colorants were in the achromatic list (e.g., just "Black") | |
| return False | |
| except Exception: | |
| return True # Fail safe to color | |
| # 5. Handle "Separation(name, base, ...)" | |
| # A spot color. We just check the colorant's name. | |
| if colorspace_name.startswith("Separation("): | |
| try: | |
| content = colorspace_name[11:-1] | |
| parts = [p.strip() for p in content.split(",")] | |
| # The colorant name is the first item | |
| colorant_name = parts[0].strip("'\"") # e.g., "PANTONE 231 C" or "Black" | |
| if colorant_name.lower() in ACHROMATIC_NAMES: | |
| return False | |
| else: | |
| return True # "Magenta", "PANTONE", etc. are treated as color | |
| except Exception: | |
| return True # Fail safe to color | |
| # 6. Fallback for unknowns (Pattern, etc.) | |
| # If we don't recognize it, assume color to be safe. | |
| return True | |
# Scan every PDF next to this script and write "<name>.dpis.json" describing
# which pages to export at which DPI, split into gray and color groups.
for pdf_file in ROOT_DIR.glob("*.pdf"):
    print("Processing:", pdf_file)
    collected_gray_dpis: dict[int, list[int]] = {}
    collected_color_dpis: dict[int, list[int]] = {}

    # The context manager closes the document even when a page fails to load.
    with pymupdf.open(pdf_file) as doc:
        for page_number in range(doc.page_count):
            idx = page_number + 1  # Page numbers in the JSON are 1-based
            page = doc.load_page(page_number)
            page_height_pt = page.rect.height
            images = page.get_image_info()
            if not images:
                # No images, export at maximum DPI (in the gray group)
                collected_gray_dpis.setdefault(MAXIMUM_DPI, []).append(idx)
                continue

            # Only images that are clearly taller than wide
            # (height >= 1.25 x width) are used to estimate the DPI.
            images_candidates = []
            has_color = False
            for img in images:
                img_width = img.get("width", 0)  # width in pixels
                img_height = img.get("height", 0)  # height in pixels
                cs_name = img.get("cs-name")  # colorspace name
                # One color image is enough to mark the whole page as color.
                if not has_color and is_colorspace_color(cs_name):
                    has_color = True
                if img_height >= 1.25 * img_width:
                    images_candidates.append(dpi_from_height(page_height_pt, img_height))

            # Highest candidate DPI, capped at MAXIMUM_DPI; pages without a
            # candidate fall back to MAXIMUM_DPI.
            max_dpi = min(max(images_candidates, default=MAXIMUM_DPI), MAXIMUM_DPI)
            rounded_dpi = round_down_to_nearest_5(max_dpi)
            target = collected_color_dpis if has_color else collected_gray_dpis
            target.setdefault(rounded_dpi, []).append(idx)

    # Change the output from DPI -> list of pages to an array of {dpi, pages, color}
    formatted_dpis = [
        {"dpi": dpi, "pages": pages, "color": False}
        for dpi, pages in sorted(collected_gray_dpis.items())
    ]
    formatted_color_dpis = [
        {"dpi": dpi, "pages": pages, "color": True}
        for dpi, pages in sorted(collected_color_dpis.items())
    ]
    collected_pages = {
        "pdf_file": str(pdf_file.resolve()),
        "dpis": formatted_dpis + formatted_color_dpis,
    }
    output_file = pdf_file.with_suffix(".dpis.json")
    # The export script reads this file back as UTF-8, so write it as UTF-8.
    output_file.write_text(json.dumps(collected_pages, indent=4), encoding="utf-8")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment