Skip to content

Instantly share code, notes, and snippets.

@philsquared
Last active January 17, 2026 12:54
Show Gist options
  • Select an option

  • Save philsquared/451d3c48a0286180775408300a93882b to your computer and use it in GitHub Desktop.

Select an option

Save philsquared/451d3c48a0286180775408300a93882b to your computer and use it in GitHub Desktop.
A simple script that reverse-engineers a table of contents from PDFs that use a fairly standard numbered-heading scheme, and adds that TOC to a copy of each PDF.
import fitz # needs: pip install pymupdf
import re
import sys
import glob
import os
# Numbered headings such as "1 Title" or "1.2 Title".
_HEADING_PATTERN = re.compile(r'^(\d+(?:\.\d+)*)\s+(.+)$')

# Font-size thresholds in points: main sections are ~19.9, subsections ~14.9.
_MIN_HEADING_SIZE = 13      # anything smaller is never treated as a heading
_MIN_UNNUMBERED_SIZE = 15   # unnumbered headings must be main-section sized


def _line_properties(line):
    """Return ``(text, is_bold, max_size)`` for one line of a text block.

    ``text`` is the stripped concatenation of the span texts, ``is_bold`` is
    True when any span's font name looks bold, and ``max_size`` is the
    largest span font size on the line (0 when the line has no spans).
    """
    parts = []
    is_bold = False
    max_size = 0
    for span in line.get("spans", []):
        parts.append(span.get("text", ""))
        font = span.get("font", "")
        # Bold is detected from the font name ("BX" in Computer Modern).
        if "BX" in font or "Bold" in font or "bold" in font:
            is_bold = True
        max_size = max(max_size, span.get("size", 0))
    return "".join(parts).strip(), is_bold, max_size


def _headings_in_block(block, page_number):
    """Return the heading entries found in one text block.

    Each entry is a dict with the keys ``level``, ``title``, ``page`` and
    ``y_pos``. ``page_number`` is the 1-based page the block belongs to.
    """
    entries = []
    lines = block.get("lines", [])
    skip_next = False  # True when the next line was merged into a title
    for line_idx, line in enumerate(lines):
        if skip_next:
            skip_next = False
            continue

        line_text, is_bold, max_size = _line_properties(line)
        if not (is_bold and max_size > _MIN_HEADING_SIZE and line_text):
            continue

        match = _HEADING_PATTERN.match(line_text)
        if match:
            number = match.group(1)
            level = number.count('.') + 1
            full_title = f"{number} {match.group(2).strip()}"
        elif max_size > _MIN_UNNUMBERED_SIZE:
            # Unnumbered text only counts when it is main-section sized;
            # it is treated as a top-level entry.
            level = 1
            full_title = line_text
        else:
            continue  # subsection-sized unnumbered bold text is not a heading

        # A title may wrap onto the following line of the same block: that
        # line is bold, of similar size, and does not start a new number.
        # Only a single continuation line is merged.
        if line_idx + 1 < len(lines):
            next_text, next_is_bold, next_size = _line_properties(
                lines[line_idx + 1])
            if (next_is_bold
                    and abs(next_size - max_size) < 1
                    and next_text
                    and not _HEADING_PATTERN.match(next_text)):
                full_title += " " + next_text
                skip_next = True

        # Aim 20 points above the top of the heading's first line (clamped
        # at 0) rather than at the heading itself.
        entries.append({
            'level': level,
            'title': full_title,
            'page': page_number,
            'y_pos': max(0, line['bbox'][1] - 20),
        })
    return entries


def _normalize_toc_levels(entries):
    """Rewrite ``entry['level']`` in place so the levels form a valid outline.

    PyMuPDF's ``set_toc`` raises ``ValueError`` unless the first entry has
    level 1 and no entry is more than one level deeper than its predecessor.
    Detected headings can break both rules (a document whose first heading
    is "1.1", or "1" followed directly by "1.1.1"). A stack of the raw
    levels is used so that siblings end up at the same depth; hierarchies
    that are already valid are left unchanged.
    """
    open_levels = []
    for entry in entries:
        raw_level = entry['level']
        while open_levels and open_levels[-1] >= raw_level:
            open_levels.pop()
        entry['level'] = len(open_levels) + 1
        open_levels.append(raw_level)


def extract_toc_from_pdf(pdf_path, output_path=None):
    """
    Extract a table of contents from a PDF based on large bold headings and
    save a copy of the PDF with that TOC attached.

    Captures both section (size ~20) and subsection (size ~15) headings.
    Handles titles wrapped onto a second line and unnumbered sections.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Where to save the copy. When None, "_with_toc" is
            inserted before the extension of ``pdf_path``.

    Returns:
        The TOC as a list of ``[level, title, page, y_pos]`` items, with
        1-based page numbers, as passed to ``set_toc``.
    """
    doc = fitz.open(pdf_path)
    try:
        toc_entries = []
        for page_index, page in enumerate(doc):
            for block in page.get_text("dict")["blocks"]:
                if block.get("type") == 0:  # Text block
                    toc_entries.extend(
                        _headings_in_block(block, page_index + 1))

        _normalize_toc_levels(toc_entries)

        # Convert to TOC format
        toc = [[e['level'], e['title'], e['page'], e['y_pos']]
               for e in toc_entries]

        # Print preview
        print(f"\nFound {len(toc)} TOC entries:")
        print("-" * 80)
        for entry in toc[:40]:
            indent = " " * (entry[0] - 1)
            print(f"{indent}{entry[1]} (page {entry[2]})")
        if len(toc) > 40:
            print(f"... and {len(toc) - 40} more entries")

        # Add TOC to PDF
        if output_path is None:
            # splitext touches only the real extension, so ".pdf" elsewhere
            # in the path is left alone and ".PDF" or extension-less inputs
            # still get a name that differs from the input file.
            stem, ext = os.path.splitext(pdf_path)
            output_path = f"{stem}_with_toc{ext or '.pdf'}"
        doc.set_toc(toc)
        doc.save(output_path)
    finally:
        # Release the file handle even when extraction or saving fails.
        doc.close()
    print(f"\nSaved PDF with TOC to: {output_path}")
    return toc
if __name__ == "__main__":
    # Command-line entry point: add a TOC to one PDF, or to every PDF
    # matching a (quoted) wildcard pattern.
    if len(sys.argv) < 2:
        print("Usage: python script.py <pdf_path> [output_path]")
        print(" pdf_path: Path to PDF file or wildcard pattern (e.g., 'pdfs/*.pdf')")
        print(" output_path: Output file path or directory (created if doesn't exist)")
        sys.exit(1)
    if len(sys.argv) > 3:
        # An unquoted wildcard is expanded by the shell, which would turn the
        # second input PDF into the output path and overwrite it.
        print("Too many arguments. Quote wildcard patterns so the shell "
              "does not expand them (e.g., 'pdfs/*.pdf')")
        sys.exit(1)

    pdf_pattern = sys.argv[1]
    output_arg = sys.argv[2] if len(sys.argv) > 2 else None

    if os.path.isfile(pdf_pattern):
        # A literal path wins over glob matching, so file names containing
        # glob metacharacters such as "[" or "]" are still found.
        pdf_files = [pdf_pattern]
    else:
        # Expand wildcard pattern; sorted for a reproducible processing order.
        pdf_files = sorted(glob.glob(pdf_pattern))
    if not pdf_files:
        print(f"No files found matching: {pdf_pattern}")
        sys.exit(1)

    # Check if output_arg is or should be a directory
    output_is_dir = False
    if output_arg:
        # Several inputs, or a trailing path separator, mean "directory":
        # create it when missing. Otherwise it is a directory only if one
        # already exists under that name.
        wants_dir = (len(pdf_files) > 1
                     or output_arg.endswith(('/', os.sep)))
        if wants_dir:
            os.makedirs(output_arg, exist_ok=True)
        output_is_dir = wants_dir or os.path.isdir(output_arg)

    # Process each PDF file
    for pdf_path in pdf_files:
        if output_is_dir:
            # Keep the input's file name inside the output directory.
            output_path = os.path.join(output_arg, os.path.basename(pdf_path))
        else:
            # Explicit output file, or None to use the default name.
            output_path = output_arg or None

        if (output_path and os.path.exists(output_path)
                and os.path.samefile(output_path, pdf_path)):
            # The result cannot be saved over the input while it is open;
            # let extract_toc_from_pdf pick its default name instead.
            print(f"Output would overwrite input, using default name for: {pdf_path}")
            output_path = None

        print(f"\n{'='*80}")
        print(f"Processing: {pdf_path}")
        print(f"{'='*80}")
        extract_toc_from_pdf(pdf_path, output_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment