Created
January 21, 2026 17:16
-
-
Save ppo/0aa118cfee8c831e2b0ae6677e32465d to your computer and use it in GitHub Desktop.
Script to clean Notion export by standardizing folder/file names.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| """ | |
| Clean Notion export by standardizing folder/file names. | |
| - Keep only `_all.csv` files (remove duplicates) | |
| - Rename folders to match their corresponding `.{md,csv}` file UUID suffixes | |
| """ | |
| import argparse | |
| import os | |
| import re | |
| from glob import glob | |
| from pathlib import Path | |
# Regex fragment for the UUID suffix on exported names: one space followed by
# 32 lowercase hexadecimal characters.
UUID_SUFFIX_RE = r" [0-9a-f]{32}"
# Regex fragment (a capturing group) for the file extensions that are matched
# against folder names.
FILE_EXT_RE = r"(csv|md)"

# ANSI color codes
RESET = "\033[0m"
BOLD = "\033[1m"
RED = "\033[31m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
CYAN = "\033[36m"
DIM_GRAY = "\033[90m"
WHITE = "\033[97m"
# Combined style used by the h1()/h2() heading helpers.
HEADING = f"{BOLD}{YELLOW}"
def h1(text):
    """Print *text* as a top-level banner framed by ``===``, with a blank line before and after."""
    banner = f"{HEADING}=== {text} ==={RESET}"
    print(f"\n{banner}\n")
def h2(text):
    """Print *text* as a section heading in the heading style."""
    # sep="" concatenates the pieces exactly like the equivalent f-string.
    print(HEADING, text, RESET, sep="")
def highlight(text):
    """Print *text* (any object; it is converted with ``str``) in bright white."""
    line = "".join((WHITE, str(text), RESET))
    print(line)
def get_folders_by_depth_desc(base_path="."):
    """Return every directory below *base_path*, deepest first.

    Args:
        base_path: Directory to walk; defaults to the current directory.

    Returns:
        A list of ``Path`` objects relative to *base_path*, ordered by the
        number of path components in descending order.
    """
    base = Path(base_path)
    relative_dirs = [
        Path(parent, name).relative_to(base)
        for parent, dir_names, _ in os.walk(base)
        for name in dir_names
    ]
    # sorted() is stable, so folders of equal depth keep their os.walk order.
    return sorted(relative_dirs, key=lambda rel: len(rel.parts), reverse=True)
def process_csv():
    """Keep only the ``*_all.csv`` variant of each CSV, then drop its ``_all`` marker.

    Operates on the current working directory (recursively) in two passes:

    1. For every ``X_all.csv``, delete the sibling ``X.csv`` if it exists.
    2. Rename every ``X_all.csv`` to ``X.csv``.

    Only the trailing ``_all.csv`` is rewritten; an ``_all`` occurring earlier
    in the file name is left untouched.
    """
    all_suffix = "_all.csv"
    pattern = f"**/*{all_suffix}"

    def plain_csv_path(all_csv_path):
        # Strip only the trailing marker: str.replace() would also rewrite an
        # "_all" that happens to appear in the middle of the name.
        return all_csv_path.with_name(all_csv_path.name.removesuffix(all_suffix) + ".csv")

    h2("Process CSV: Keep Only '*_all.csv'")
    for csv_path_str in glob(pattern, recursive=True):
        csv_path = Path(csv_path_str)
        highlight(csv_path)
        other_csv_path = plain_csv_path(csv_path)
        if other_csv_path.is_file():
            print(f" Deleting {other_csv_path}")
            other_csv_path.unlink()

    h2("Process CSV: Rename '*_all.csv'")
    # glob() returns a fully built list, so renaming while looping is safe.
    for csv_path_str in glob(pattern, recursive=True):
        csv_path = Path(csv_path_str)
        print(csv_path)
        csv_path.rename(plain_csv_path(csv_path))
def process_folders():
    """Append the UUID suffix of the matching ``.md``/``.csv`` file to each folder name.

    Operates on the current working directory. For every folder whose name
    does not already end in a UUID suffix, the folder's parent directory is
    searched for entries named ``<folder name><UUID suffix>.<md|csv>``:

    - exactly one match: the folder is renamed to ``<folder name><UUID suffix>``;
    - no match, several matches, or an already existing rename target: the
      folder is left alone and an error is recorded.

    All recorded errors are printed as a summary at the end.
    """
    h2("Process Folders")
    errors = []
    has_uuid_suffix = re.compile(fr"{UUID_SUFFIX_RE}$")

    # Deepest first: the paths were collected up front, so a parent must not be
    # renamed before the folders below it have been handled.
    for folder_path in get_folders_by_depth_desc():
        # Skip if already has UUID
        if has_uuid_suffix.search(str(folder_path)):
            print(f"{DIM_GRAY}Skip {folder_path}{RESET}")
            continue

        highlight(folder_path)

        # Match sibling names with a regex instead of glob(f"{folder_path}*"):
        # folder names containing glob metacharacters ("[", "]", "*", "?")
        # would otherwise be interpreted as patterns and never be found.
        name_pattern = re.compile(
            fr"{re.escape(folder_path.name)}({UUID_SUFFIX_RE})\.{FILE_EXT_RE}"
        )
        found = []  # (sibling path, UUID suffix) pairs
        for sibling in sorted(folder_path.parent.iterdir()):
            hit = name_pattern.fullmatch(sibling.name)
            if hit:
                found.append((sibling, hit.group(1)))

        if not found:
            print(f" 🔴 {RED}Not found{RESET}")
            errors.append(f"Not Found: {CYAN}{folder_path}{RED}")
        elif len(found) == 1:
            sibling, suffix = found[0]
            new_folder_path = Path(f"{folder_path}{suffix}")
            print(f" 🟢 Found: {sibling.name}")
            if new_folder_path.exists():
                # Never rename onto an existing entry: depending on the
                # platform that either raises or replaces an empty directory.
                print(f" 🔴 {RED}Already exists: {new_folder_path}{RESET}")
                errors.append(f"Already Exists: {CYAN}{new_folder_path}{RED}")
            else:
                print(f" {GREEN}Rename: {new_folder_path}{RESET}")
                folder_path.rename(new_folder_path)
        else:
            for sibling, _ in found:
                print(f" 🔴 {RED}{sibling}{RESET}")
            errors.append(f"Multiple: {CYAN}{folder_path}{RED}")
        print()

    if errors:
        print(f"{BOLD}{RED}ERRORS:{RESET}")
        for error in errors:
            print(f"{RED}{error}{RESET}")
def parse_args():
    """Parse the command line.

    Returns:
        An ``argparse.Namespace`` whose ``folders`` attribute is a list of
        absolute ``Path`` objects, one per folder argument (at least one is
        required).
    """
    parser = argparse.ArgumentParser(description="Clean Notion export by standardizing folder/file names.")
    parser.add_argument("folders", nargs="+", help="Notion export folders to clean")
    args = parser.parse_args()

    # Resolve to absolute paths up front: main() changes the working directory
    # for each folder, which would invalidate relative paths given later.
    args.folders = [Path(folder).resolve() for folder in args.folders]
    return args
def main():
    """Clean every folder given on the command line.

    For each folder: change into it, normalize the CSV files, then rename the
    folders. A path that is not an existing directory is reported and skipped
    so that the remaining folders are still processed.
    """
    args = parse_args()
    for folder in args.folders:
        h1(f"Cleaning {CYAN}{folder}{HEADING}")
        # is_dir() rather than exists(): a regular file would pass exists()
        # and then make os.chdir() raise, aborting the whole run.
        if not folder.is_dir():
            print(f"{RED}ERROR: Folder '{CYAN}{folder}{RED}' not found{RESET}")
            continue
        # The processing steps work on paths relative to the current directory.
        os.chdir(folder)
        process_csv()
        process_folders()
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment