# philsquared/toc_adder.py

## toc_adder.py
import fitz  # needs: pip install pymupdf
import re
import sys
import glob
import os


# Matches numbered headings such as "1 Title" or "1.2 Title".
_HEADING_PATTERN = re.compile(r'^(\d+(?:\.\d+)*)\s+(.+)$')

# Font-name fragments treated as bold ("BX" is Computer Modern bold-extended).
_BOLD_MARKERS = ("BX", "Bold", "bold")


def _line_style(line):
    """Return ``(text, is_bold, max_size)`` for one line dict of a text block."""
    spans = line.get("spans", [])
    text = "".join(span.get("text", "") for span in spans).strip()
    is_bold = any(
        marker in span.get("font", "")
        for span in spans
        for marker in _BOLD_MARKERS
    )
    max_size = max((span.get("size", 0) for span in spans), default=0)
    return text, is_bold, max_size


def _is_continuation(line, heading_size):
    """Return the line's text if it continues a heading of *heading_size*, else ''."""
    text, is_bold, size = _line_style(line)
    if (is_bold and text
            and abs(size - heading_size) < 1
            and not _HEADING_PATTERN.match(text)):
        return text
    return ""


def _clamp_levels(entries):
    """Force the first level to 1 and limit every level increase to one step."""
    previous = 0
    for entry in entries:
        entry['level'] = max(1, min(entry['level'], previous + 1))
        previous = entry['level']


def _default_output_path(pdf_path):
    """Return *pdf_path* with ``_with_toc`` inserted before its extension."""
    root, ext = os.path.splitext(pdf_path)
    return f"{root}_with_toc{ext or '.pdf'}"


def extract_toc_from_pdf(pdf_path, output_path=None):
    """
    Build a table of contents from large bold headings and save a copy of the PDF.

    A line counts as a heading when one of its spans uses a bold font and its
    largest span size exceeds 13. Numbered headings ("1 Title", "1.2 Title")
    get a level derived from the number of dots; unnumbered headings are kept
    only when larger than size 15 and are treated as top-level. Bold lines of
    similar size that directly follow a heading in the same block and do not
    start with a number are appended to its title.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Where to save the PDF with the TOC. Defaults to
            *pdf_path* with ``_with_toc`` inserted before the extension.

    Returns:
        The TOC as a list of ``[level, title, page, y_pos]`` entries, with
        1-based page numbers and ``y_pos`` 20 points above the heading.
    """
    if output_path is None:
        # splitext rather than str.replace: the latter returns the input path
        # unchanged for ".PDF"/extension-less names and also rewrites ".pdf"
        # occurring inside directory names.
        output_path = _default_output_path(pdf_path)

    toc_entries = []

    # The context manager closes the document even if extraction or save fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                if block.get("type") != 0:  # Only text blocks are scanned
                    continue

                lines = block.get("lines", [])
                line_idx = 0
                while line_idx < len(lines):
                    line = lines[line_idx]
                    line_idx += 1

                    line_text, is_bold, max_size = _line_style(line)

                    # Main sections ~19.9, subsections ~14.9
                    if not (is_bold and max_size > 13 and line_text):
                        continue

                    match = _HEADING_PATTERN.match(line_text)
                    if match:
                        number = match.group(1)
                        level = number.count('.') + 1
                        full_title = f"{number} {match.group(2).strip()}"
                    elif max_size > 15:
                        # Unnumbered main section; subsection-sized unnumbered
                        # bold text is ignored as likely not a heading.
                        level = 1
                        full_title = line_text
                    else:
                        continue

                    # Absorb every continuation line, not just the first, so a
                    # title wrapped over three or more lines stays one entry.
                    while line_idx < len(lines):
                        extra = _is_continuation(lines[line_idx], max_size)
                        if not extra:
                            break
                        full_title += " " + extra
                        line_idx += 1

                    toc_entries.append({
                        'level': level,
                        'title': full_title,
                        'page': page_num,
                        # Target slightly above the heading's first line.
                        'y_pos': max(0, line['bbox'][1] - 20),
                    })

        # set_toc rejects a list whose first level is not 1 or whose level
        # jumps by more than one, so repair the hierarchy before applying it.
        _clamp_levels(toc_entries)
        toc = [[e['level'], e['title'], e['page'], e['y_pos']] for e in toc_entries]

        # Print preview
        print(f"\nFound {len(toc)} TOC entries:")
        print("-" * 80)
        for level, title, page_no, _ in toc[:40]:
            indent = "  " * (level - 1)
            print(f"{indent}{title} (page {page_no})")
        if len(toc) > 40:
            print(f"... and {len(toc) - 40} more entries")

        doc.set_toc(toc)
        doc.save(output_path)

    print(f"\nSaved PDF with TOC to: {output_path}")
    return toc


if __name__ == "__main__":
    # Command-line entry point: add a TOC to one PDF or to every PDF matching
    # a wildcard pattern, optionally writing into an output file/directory.
    if len(sys.argv) < 2:
        print("Usage: python script.py <pdf_path> [output_path]")
        print("  pdf_path: Path to PDF file or wildcard pattern (e.g., 'pdfs/*.pdf')")
        print("  output_path: Output file path or directory (created if doesn't exist)")
        sys.exit(1)

    pdf_pattern = sys.argv[1]
    output_arg = sys.argv[2] if len(sys.argv) > 2 else None

    # Expand wildcard pattern; sorted because glob order is arbitrary.
    pdf_files = sorted(glob.glob(pdf_pattern))

    if not pdf_files:
        print(f"No files found matching: {pdf_pattern}")
        sys.exit(1)

    # Decide whether output_arg names a directory.
    output_is_dir = False
    if output_arg:
        # Several inputs, or a trailing separator, force directory mode.
        if len(pdf_files) > 1 or output_arg.endswith(('/', os.sep)):
            output_is_dir = True
            os.makedirs(output_arg, exist_ok=True)
        elif os.path.isdir(output_arg):
            output_is_dir = True

    # Process each PDF file
    for pdf_path in pdf_files:
        if output_is_dir:
            output_path = os.path.join(output_arg, os.path.basename(pdf_path))
        else:
            # None (or empty) lets extract_toc_from_pdf pick its default name.
            output_path = output_arg or None

        # PyMuPDF refuses a full save over the file it has open, so never
        # target the input itself; fall back to the default output name.
        if output_path and os.path.abspath(output_path) == os.path.abspath(pdf_path):
            print(f"Output would overwrite input, using default name for: {pdf_path}")
            output_path = None

        print(f"\n{'='*80}")
        print(f"Processing: {pdf_path}")
        print(f"{'='*80}")

        extract_toc_from_pdf(pdf_path, output_path)
	import fitz # needs: pip install pymupdf
	import re
	import sys
	import glob
	import os


	# Matches numbered headings such as "1 Title" or "1.2 Title".
	_HEADING_PATTERN = re.compile(r'^(\d+(?:\.\d+)*)\s+(.+)$')

	# Font-name fragments treated as bold ("BX" is Computer Modern bold-extended).
	_BOLD_MARKERS = ("BX", "Bold", "bold")


	def _line_style(line):
	    """Return ``(text, is_bold, max_size)`` for one line dict of a text block."""
	    spans = line.get("spans", [])
	    text = "".join(span.get("text", "") for span in spans).strip()
	    is_bold = any(
	        marker in span.get("font", "")
	        for span in spans
	        for marker in _BOLD_MARKERS
	    )
	    max_size = max((span.get("size", 0) for span in spans), default=0)
	    return text, is_bold, max_size


	def _is_continuation(line, heading_size):
	    """Return the line's text if it continues a heading of *heading_size*, else ''."""
	    text, is_bold, size = _line_style(line)
	    if (is_bold and text
	            and abs(size - heading_size) < 1
	            and not _HEADING_PATTERN.match(text)):
	        return text
	    return ""


	def _clamp_levels(entries):
	    """Force the first level to 1 and limit every level increase to one step."""
	    previous = 0
	    for entry in entries:
	        entry['level'] = max(1, min(entry['level'], previous + 1))
	        previous = entry['level']


	def _default_output_path(pdf_path):
	    """Return *pdf_path* with ``_with_toc`` inserted before its extension."""
	    root, ext = os.path.splitext(pdf_path)
	    return f"{root}_with_toc{ext or '.pdf'}"


	def extract_toc_from_pdf(pdf_path, output_path=None):
	    """
	    Build a table of contents from large bold headings and save a copy of the PDF.

	    A line counts as a heading when one of its spans uses a bold font and its
	    largest span size exceeds 13. Numbered headings ("1 Title", "1.2 Title")
	    get a level derived from the number of dots; unnumbered headings are kept
	    only when larger than size 15 and are treated as top-level. Bold lines of
	    similar size that directly follow a heading in the same block and do not
	    start with a number are appended to its title.

	    Args:
	        pdf_path: Path of the PDF to read.
	        output_path: Where to save the PDF with the TOC. Defaults to
	            *pdf_path* with ``_with_toc`` inserted before the extension.

	    Returns:
	        The TOC as a list of ``[level, title, page, y_pos]`` entries, with
	        1-based page numbers and ``y_pos`` 20 points above the heading.
	    """
	    if output_path is None:
	        # splitext rather than str.replace: the latter returns the input path
	        # unchanged for ".PDF"/extension-less names and also rewrites ".pdf"
	        # occurring inside directory names.
	        output_path = _default_output_path(pdf_path)

	    toc_entries = []

	    # The context manager closes the document even if extraction or save fails.
	    with fitz.open(pdf_path) as doc:
	        for page_num, page in enumerate(doc, start=1):
	            for block in page.get_text("dict")["blocks"]:
	                if block.get("type") != 0:  # Only text blocks are scanned
	                    continue

	                lines = block.get("lines", [])
	                line_idx = 0
	                while line_idx < len(lines):
	                    line = lines[line_idx]
	                    line_idx += 1

	                    line_text, is_bold, max_size = _line_style(line)

	                    # Main sections ~19.9, subsections ~14.9
	                    if not (is_bold and max_size > 13 and line_text):
	                        continue

	                    match = _HEADING_PATTERN.match(line_text)
	                    if match:
	                        number = match.group(1)
	                        level = number.count('.') + 1
	                        full_title = f"{number} {match.group(2).strip()}"
	                    elif max_size > 15:
	                        # Unnumbered main section; subsection-sized unnumbered
	                        # bold text is ignored as likely not a heading.
	                        level = 1
	                        full_title = line_text
	                    else:
	                        continue

	                    # Absorb every continuation line, not just the first, so a
	                    # title wrapped over three or more lines stays one entry.
	                    while line_idx < len(lines):
	                        extra = _is_continuation(lines[line_idx], max_size)
	                        if not extra:
	                            break
	                        full_title += " " + extra
	                        line_idx += 1

	                    toc_entries.append({
	                        'level': level,
	                        'title': full_title,
	                        'page': page_num,
	                        # Target slightly above the heading's first line.
	                        'y_pos': max(0, line['bbox'][1] - 20),
	                    })

	        # set_toc rejects a list whose first level is not 1 or whose level
	        # jumps by more than one, so repair the hierarchy before applying it.
	        _clamp_levels(toc_entries)
	        toc = [[e['level'], e['title'], e['page'], e['y_pos']] for e in toc_entries]

	        # Print preview
	        print(f"\nFound {len(toc)} TOC entries:")
	        print("-" * 80)
	        for level, title, page_no, _ in toc[:40]:
	            indent = "  " * (level - 1)
	            print(f"{indent}{title} (page {page_no})")
	        if len(toc) > 40:
	            print(f"... and {len(toc) - 40} more entries")

	        doc.set_toc(toc)
	        doc.save(output_path)

	    print(f"\nSaved PDF with TOC to: {output_path}")
	    return toc


	if __name__ == "__main__":
	    # Command-line entry point: add a TOC to one PDF or to every PDF matching
	    # a wildcard pattern, optionally writing into an output file/directory.
	    if len(sys.argv) < 2:
	        print("Usage: python script.py <pdf_path> [output_path]")
	        print("  pdf_path: Path to PDF file or wildcard pattern (e.g., 'pdfs/*.pdf')")
	        print("  output_path: Output file path or directory (created if doesn't exist)")
	        sys.exit(1)

	    pdf_pattern = sys.argv[1]
	    output_arg = sys.argv[2] if len(sys.argv) > 2 else None

	    # Expand wildcard pattern; sorted because glob order is arbitrary.
	    pdf_files = sorted(glob.glob(pdf_pattern))

	    if not pdf_files:
	        print(f"No files found matching: {pdf_pattern}")
	        sys.exit(1)

	    # Decide whether output_arg names a directory.
	    output_is_dir = False
	    if output_arg:
	        # Several inputs, or a trailing separator, force directory mode.
	        if len(pdf_files) > 1 or output_arg.endswith(('/', os.sep)):
	            output_is_dir = True
	            os.makedirs(output_arg, exist_ok=True)
	        elif os.path.isdir(output_arg):
	            output_is_dir = True

	    # Process each PDF file
	    for pdf_path in pdf_files:
	        if output_is_dir:
	            output_path = os.path.join(output_arg, os.path.basename(pdf_path))
	        else:
	            # None (or empty) lets extract_toc_from_pdf pick its default name.
	            output_path = output_arg or None

	        # PyMuPDF refuses a full save over the file it has open, so never
	        # target the input itself; fall back to the default output name.
	        if output_path and os.path.abspath(output_path) == os.path.abspath(pdf_path):
	            print(f"Output would overwrite input, using default name for: {pdf_path}")
	            output_path = None

	        print(f"\n{'='*80}")
	        print(f"Processing: {pdf_path}")
	        print(f"{'='*80}")

	        extract_toc_from_pdf(pdf_path, output_path)
No results found