Skip to content

Instantly share code, notes, and snippets.

@divyavanmahajan
Created February 21, 2026 11:59
Show Gist options
  • Select an option

  • Save divyavanmahajan/df917ea361dd6ae5361ac4ec449ae556 to your computer and use it in GitHub Desktop.

Select an option

Save divyavanmahajan/df917ea361dd6ae5361ac4ec449ae556 to your computer and use it in GitHub Desktop.
Combine markdown files from subdirectories in correct page order. Ignores files with duplicate page numbers and reports them. Only processes files that are exactly one subdirectory level deep.
#!/usr/bin/env python3
"""
Combine markdown files from subdirectories in correct page order.
Ignores files with duplicate page numbers and reports them.
Only processes files that are exactly one subdirectory level deep.
"""
import os
import re
from pathlib import Path
from collections import defaultdict
def find_markdown_files(root_dir):
    """Find all ``markdown.md`` files exactly one subdirectory level deep.

    Args:
        root_dir: Directory whose immediate subdirectories are inspected.

    Returns:
        A list of ``<root_dir>/<subdir>/markdown.md`` paths, one for every
        immediate subdirectory that contains such a file, ordered by
        subdirectory name. Files directly in ``root_dir`` or nested more
        deeply are not included.
    """
    # The context manager releases the directory handle even if an entry
    # check raises part-way through the scan.
    with os.scandir(root_dir) as entries:
        # os.scandir yields entries in arbitrary order; sorting keeps the
        # result (and any "first occurrence wins" logic built on top of it)
        # reproducible across runs and platforms.
        subdirs = sorted(
            (entry for entry in entries if entry.is_dir()),
            key=lambda entry: entry.name,
        )

    candidates = (os.path.join(entry.path, "markdown.md") for entry in subdirs)
    return [path for path in candidates if os.path.isfile(path)]
def extract_page_number(file_path):
    """Extract the page number from the first line of a file.

    Args:
        file_path: Path of the UTF-8 text file to inspect.

    Returns:
        The first run of digits on the file's first line as an ``int``, or
        ``None`` when that line contains no digits or the file cannot be
        read (in which case an error message is printed).
    """
    # Keep the try body limited to the I/O so that only read/decode failures
    # are treated as "no page number".
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            first_line = f.readline().strip()
    except (OSError, UnicodeError) as e:
        # Best effort: an unreadable or mis-encoded file is reported and
        # skipped instead of aborting the caller.
        print(f"Error reading {file_path}: {e}")
        return None

    # Only the first run of digits counts; later numbers on the line are
    # ignored.
    match = re.search(r'\d+', first_line)
    return int(match.group()) if match else None
def combine_markdown_files(root_dir, output_file="combined.md"):
    """Combine markdown files in page order, handling duplicates.

    Each ``markdown.md`` found by ``find_markdown_files`` is assigned the page
    number returned by ``extract_page_number``. Files are concatenated into
    ``output_file`` in ascending page order, with a ``---`` separator between
    consecutive pages. When several files share a page number, only the first
    one found is used and the clash is reported. Files for which no page
    number is obtained are reported and left out.

    Args:
        root_dir: Directory whose immediate subdirectories are scanned.
        output_file: Path of the combined file to write (overwritten if it
            already exists).

    Returns:
        None. Progress and a summary are printed to standard output.
    """
    markdown_files = find_markdown_files(root_dir)
    if not markdown_files:
        print("No markdown.md files found.")
        return

    # Group files by page number, keeping discovery order within each page so
    # that index 0 is the "first occurrence".
    page_to_files = defaultdict(list)
    unnumbered = []
    for file_path in markdown_files:
        page_num = extract_page_number(file_path)
        if page_num is None:
            unnumbered.append(file_path)
        else:
            page_to_files[page_num].append(file_path)

    # Report files that are dropped for lack of a page number instead of
    # omitting them silently.
    if unnumbered:
        print("Files without a page number on the first line (skipped):")
        for file_path in unnumbered:
            print(f" {os.path.basename(os.path.dirname(file_path))}")

    duplicates = {page: files for page, files in page_to_files.items() if len(files) > 1}
    if duplicates:
        print("Duplicate page numbers found (only first occurrence will be used):")
        for page, files in duplicates.items():
            print(f" Page {page}: {', '.join([os.path.basename(os.path.dirname(f)) for f in files])}")

    unique_pages = sorted(page_to_files)
    print(f"\nCombining {len(unique_pages)} unique pages in order...")

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for index, page_num in enumerate(unique_pages):
            # The separator goes between pages only, so the combined file
            # does not end with a dangling horizontal rule.
            if index:
                outfile.write("\n\n---\n\n")
            # Use the first file for this page number.
            with open(page_to_files[page_num][0], 'r', encoding='utf-8') as infile:
                outfile.write(infile.read())

    print(f"\nCombined markdown file created: {output_file}")
    print(f"Total pages combined: {len(unique_pages)}")
    if duplicates:
        print(f"Duplicate pages ignored: {len(duplicates)}")
if __name__ == "__main__":
    # Scan the directory that contains this script, regardless of the
    # directory the script was launched from.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    print(f"Scanning for markdown files in: {current_dir}")
    print("=" * 60)
    # The output path is left at its default, a relative path, so the
    # combined file is created in the current working directory.
    combine_markdown_files(current_dir)
    print("\nDone!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment