Last active
January 17, 2026 12:54
-
-
Save philsquared/451d3c48a0286180775408300a93882b to your computer and use it in GitHub Desktop.
A simple script that reverse-engineers a table of contents from PDFs that use a fairly standard numbered heading system, and adds the TOC to a copy of each PDF.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import fitz # needs: pip install pymupdf | |
| import re | |
| import sys | |
| import glob | |
| import os | |
| def extract_toc_from_pdf(pdf_path, output_path=None): | |
| """ | |
| Extract table of contents from PDF based on large bold headings. | |
| Captures both section (size ~20) and subsection (size ~15) headings. | |
| Handles multi-line titles and unnumbered sections. | |
| """ | |
| doc = fitz.open(pdf_path) | |
| # Pattern to match numbered headings like "1 Title" or "1.2 Title" | |
| heading_pattern = re.compile(r'^(\d+(?:\.\d+)*)\s+(.+)$') | |
| toc_entries = [] | |
| for page_num in range(len(doc)): | |
| page = doc[page_num] | |
| blocks = page.get_text("dict")["blocks"] | |
| for block in blocks: | |
| if block.get("type") == 0: # Text block | |
| lines = block.get("lines", []) | |
| skip_next = False # Track if we should skip the next line | |
| for line_idx, line in enumerate(lines): | |
| # Skip this line if it was a continuation of the previous heading | |
| if skip_next: | |
| skip_next = False | |
| continue | |
| line_text = "" | |
| is_bold = False | |
| max_size = 0 | |
| first_line_y_pos = line['bbox'][1] # Store first line position | |
| for span in line.get("spans", []): | |
| line_text += span.get("text", "") | |
| font = span.get("font", "") | |
| size = span.get("size", 0) | |
| # Check for bold fonts (BX in Computer Modern) | |
| if "BX" in font or "Bold" in font or "bold" in font: | |
| is_bold = True | |
| max_size = max(max_size, size) | |
| line_text = line_text.strip() | |
| # Look for large bold headings (numbered or unnumbered) | |
| # Main sections ~19.9, subsections ~14.9 | |
| if is_bold and max_size > 13 and line_text: | |
| match = heading_pattern.match(line_text) | |
| if match: | |
| # Numbered heading | |
| number = match.group(1) | |
| title = match.group(2).strip() | |
| level = number.count('.') + 1 | |
| full_title = f"{number} {title}" | |
| else: | |
| # Check if it's an unnumbered section (larger size) | |
| # and not just a random bold line | |
| if max_size > 15: # Only main sections, not subsections | |
| title = line_text | |
| level = 1 # Treat as top-level | |
| full_title = title | |
| else: | |
| continue # Skip subsection-sized unnumbered text | |
| # Check if next line continues the title | |
| # (bold, same size, no number at start) | |
| if line_idx + 1 < len(lines): | |
| next_line = lines[line_idx + 1] | |
| next_text = "" | |
| next_is_bold = False | |
| next_size = 0 | |
| for span in next_line.get("spans", []): | |
| next_text += span.get("text", "") | |
| font = span.get("font", "") | |
| size = span.get("size", 0) | |
| if "BX" in font or "Bold" in font or "bold" in font: | |
| next_is_bold = True | |
| next_size = max(next_size, size) | |
| next_text = next_text.strip() | |
| # If continuation line: bold, similar size, no heading pattern | |
| if (next_is_bold and | |
| abs(next_size - max_size) < 1 and | |
| not heading_pattern.match(next_text) and | |
| next_text): | |
| full_title += " " + next_text | |
| skip_next = True # Mark next line to be skipped | |
| # Calculate position slightly above the heading (20 points up) | |
| # Use the first line's position, not the continuation | |
| adjusted_y = max(0, first_line_y_pos - 20) | |
| toc_entries.append({ | |
| 'level': level, | |
| 'title': full_title, | |
| 'page': page_num + 1, | |
| 'y_pos': adjusted_y | |
| }) | |
| # Convert to TOC format | |
| toc = [[e['level'], e['title'], e['page'], e['y_pos']] for e in toc_entries] | |
| # Print preview | |
| print(f"\nFound {len(toc)} TOC entries:") | |
| print("-" * 80) | |
| for entry in toc[:40]: | |
| indent = " " * (entry[0] - 1) | |
| print(f"{indent}{entry[1]} (page {entry[2]})") | |
| if len(toc) > 40: | |
| print(f"... and {len(toc) - 40} more entries") | |
| # Add TOC to PDF | |
| if output_path is None: | |
| output_path = pdf_path.replace('.pdf', '_with_toc.pdf') | |
| doc.set_toc(toc) | |
| doc.save(output_path) | |
| doc.close() | |
| print(f"\nSaved PDF with TOC to: {output_path}") | |
| return toc | |
if __name__ == "__main__":
    # Command line: <pdf path or glob pattern> [output file or directory].
    if len(sys.argv) < 2:
        print("Usage: python script.py <pdf_path> [output_path]")
        print(" pdf_path: Path to PDF file or wildcard pattern (e.g., 'pdfs/*.pdf')")
        print(" output_path: Output file path or directory (created if doesn't exist)")
        sys.exit(1)

    pdf_pattern = sys.argv[1]
    output_arg = sys.argv[2] if len(sys.argv) > 2 else None

    # Expand wildcard pattern
    pdf_files = glob.glob(pdf_pattern)
    if not pdf_files:
        print(f"No files found matching: {pdf_pattern}")
        sys.exit(1)

    # Decide whether output_arg names a directory or a single output file.
    output_is_dir = False
    if output_arg:
        # Several inputs, or a trailing path separator, force a directory,
        # which is created on demand.
        if len(pdf_files) > 1 or output_arg.endswith(('/', os.sep)):
            output_is_dir = True
            os.makedirs(output_arg, exist_ok=True)
        # An existing directory is used as-is.
        elif os.path.isdir(output_arg):
            output_is_dir = True

    # Process each PDF file
    for pdf_path in pdf_files:
        if output_is_dir:
            # Keep the input's file name inside the output directory.
            output_path = os.path.join(output_arg, os.path.basename(pdf_path))
        else:
            # Explicit output file, or None to let the extractor derive
            # a "_with_toc" name next to the input.
            output_path = output_arg or None

        print(f"\n{'='*80}")
        print(f"Processing: {pdf_path}")
        print(f"{'='*80}")

        # Writing the result over the file that is currently open for
        # reading either fails inside PyMuPDF or clobbers the input, so
        # skip such a file instead of aborting the whole batch.
        if output_path and (os.path.realpath(output_path)
                            == os.path.realpath(pdf_path)):
            print(f"Skipping {pdf_path}: output path is the same file as the input")
            continue

        extract_toc_from_pdf(pdf_path, output_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment