ppo/notion-export-clean.py

## notion-export-clean.py
#!/usr/bin/env python
"""
Clean Notion export by standardizing folder/file names.

- Keep only `_all.csv` files (delete the duplicate plain `.csv`), then rename them to drop the `_all` suffix
- Rename folders to match their corresponding `.{md,csv}` file UUID suffixes
"""

import argparse
import os
import re
from glob import glob
from pathlib import Path

# Regex fragment for the UUID suffix of an exported name: one space followed
# by 32 lowercase hex digits.
UUID_SUFFIX_RE = r" [0-9a-f]{32}"
# Regex fragment for the file extensions a folder can be paired with.
FILE_EXT_RE = r"(csv|md)"


# ANSI color codes
RESET = "\033[0m"
BOLD = "\033[1m"
RED = "\033[31m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
CYAN = "\033[36m"
DIM_GRAY = "\033[90m"
WHITE = "\033[97m"

# Combined style used by the h1() and h2() heading printers.
HEADING = f"{BOLD}{YELLOW}"


def h1(text):
    """Print *text* as a top-level heading framed by ``===`` and blank lines."""
    banner = f"\n{HEADING}=== {text} ==={RESET}\n"
    print(banner)


def h2(text):
    """Print *text* as a second-level heading in the heading style."""
    line = f"{HEADING}{text}{RESET}"
    print(line)


def highlight(text):
    """Print *text* in bright white, then reset the terminal style."""
    line = f"{WHITE}{text}{RESET}"
    print(line)


def get_folders_by_depth_desc(base_path="."):
    """Return every folder below *base_path*, deepest first.

    The returned paths are relative to *base_path*. Folders at the same depth
    keep the order in which ``os.walk`` reported them.
    """
    base = Path(base_path)
    found = [
        (Path(parent) / name).relative_to(base)
        for parent, subdirs, _files in os.walk(base)
        for name in subdirs
    ]
    # Depth is the number of path components below the base; the sort is stable.
    return sorted(found, key=lambda rel: len(rel.parts), reverse=True)

def process_csv():
    """Keep only the ``*_all.csv`` exports and rename them to plain ``*.csv``.

    Works recursively on the current working directory, in two passes:

    1. For every ``<name>_all.csv``, delete the sibling ``<name>.csv`` if it
       exists.
    2. Rename every ``<name>_all.csv`` to ``<name>.csv``.
    """
    marker = "_all.csv"

    def plain_csv_path(csv_path):
        # Strip only the trailing marker: str.replace() would also rewrite an
        # "_all" that happens to be part of the page title, and both passes
        # must agree on the target name.
        return csv_path.with_name(csv_path.name.removesuffix(marker) + ".csv")

    h2("Process CSV: Keep Only '*_all.csv'")
    for csv_path_str in glob("**/*_all.csv", recursive=True):
        csv_path = Path(csv_path_str)
        highlight(csv_path)
        other_csv_path = plain_csv_path(csv_path)
        if other_csv_path.is_file():
            print(f"  Deleting {other_csv_path}")
            other_csv_path.unlink()

    h2("Process CSV: Rename '*_all.csv'")
    for csv_path_str in glob("**/*_all.csv", recursive=True):
        csv_path = Path(csv_path_str)
        print(csv_path)
        csv_path.rename(plain_csv_path(csv_path))


def process_folders():
    h2("Process Folders")

    errors = []

    # Loop over folders (deepest first)
    for folder_path in get_folders_by_depth_desc():
        # Skip if already has UUID
        if re.search(fr"{UUID_SUFFIX_RE}$", str(folder_path)):
            print(f"{DIM_GRAY}Skip {folder_path}{RESET}")
            continue

        highlight(folder_path)

        # Find matching .md or .csv files with UUID
        pattern = re.compile(fr"{re.escape(str(folder_path))}{UUID_SUFFIX_RE}\.{FILE_EXT_RE}$")

        # Get potential files
        candidates = glob(f"{folder_path}*")
        files = [f for f in candidates if pattern.match(f)]

        match len(files):
            case 0:
                print(f"  🔴 {RED}Not found{RESET}")
                errors.append(f"Not Found: {CYAN}{folder_path}{RED}")
            case 1:
                # Extract UUID suffix
                match = re.search(fr"({UUID_SUFFIX_RE})\.{FILE_EXT_RE}$", files[0])
                if match:
                    suffix = match.group(1)
                    new_folder_path = Path(f"{folder_path}{suffix}")
                    print(f"  🟢 Found: {Path(files[0]).name}")
                    print(f"  {GREEN}Rename: {new_folder_path}{RESET}")
                    folder_path.rename(new_folder_path)
            case _:
                for f in files:
                    print(f"  🔴 {RED}{f}{RESET}")
                errors.append(f"Multiple: {CYAN}{folder_path}{RED}")

        print()

    if errors:
        print(f"{BOLD}{RED}ERRORS:{RESET}")
        for error in errors:
            print(f"{RED}{error}{RESET}")


def parse_args(argv=None):
    """Parse the command line and return the namespace with resolved folders.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace``; its ``folders`` attribute is a list of
        absolute ``Path`` objects, one per positional argument.
    """
    parser = argparse.ArgumentParser(description="Clean Notion export by standardizing folder/file names.")
    parser.add_argument("folders", nargs="+", help="Notion export folders to clean")
    args = parser.parse_args(argv)

    # Resolve up front: main() changes the working directory for each folder,
    # which would otherwise break relative paths given later on the command line.
    args.folders = [Path(folder).resolve() for folder in args.folders]

    return args


def main():
    """Clean every folder named on the command line, one after another.

    Paths that do not exist or are not directories are reported and skipped.
    """
    args = parse_args()

    for folder in args.folders:
        h1(f"Cleaning {CYAN}{folder}{HEADING}")
        if not folder.exists():
            print(f"{RED}ERROR: Folder '{CYAN}{folder}{RED}' not found{RESET}")
            continue
        if not folder.is_dir():
            # os.chdir() would raise NotADirectoryError on a regular file
            print(f"{RED}ERROR: '{CYAN}{folder}{RED}' is not a folder{RESET}")
            continue

        # Both processing steps operate on the current working directory.
        os.chdir(folder)
        process_csv()
        process_folders()


# Run the cleaner only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	"""
	Clean Notion export by standardizing folder/file names.

	- Keep only `_all.csv` files (delete the duplicate plain `.csv`), then rename them to drop the `_all` suffix
	- Rename folders to match their corresponding `.{md,csv}` file UUID suffixes
	"""

	import argparse
	import os
	import re
	from glob import glob
	from pathlib import Path

	# Regex fragment for the UUID suffix of an exported name: one space followed
	# by 32 lowercase hex digits.
	UUID_SUFFIX_RE = r" [0-9a-f]{32}"
	# Regex fragment for the file extensions a folder can be paired with.
	# The alternation bar must be unescaped: "\|" would match a literal "|".
	FILE_EXT_RE = r"(csv|md)"


	# ANSI color codes
	RESET = "\033[0m"
	BOLD = "\033[1m"
	RED = "\033[31m"
	GREEN = "\033[32m"
	YELLOW = "\033[33m"
	CYAN = "\033[36m"
	DIM_GRAY = "\033[90m"
	WHITE = "\033[97m"

	# Combined style used by the h1() and h2() heading printers.
	HEADING = f"{BOLD}{YELLOW}"


	def h1(text):
		"""Print *text* as a top-level heading framed by ``===`` and blank lines."""
		print(f"\n{HEADING}=== {text} ==={RESET}\n")


	def h2(text):
		"""Print *text* as a second-level heading in the heading style."""
		print(f"{HEADING}{text}{RESET}")


	def highlight(text):
		"""Print *text* in bright white, then reset the terminal style."""
		print(f"{WHITE}{text}{RESET}")


	def get_folders_by_depth_desc(base_path="."):
		"""Return every folder below *base_path*, deepest first.

		The returned paths are relative to *base_path*. Folders at the same
		depth keep the order in which ``os.walk`` reported them.
		"""
		base = Path(base_path)
		found = [
			(Path(parent) / name).relative_to(base)
			for parent, subdirs, _files in os.walk(base)
			for name in subdirs
		]
		# Depth is the number of path components below the base; the sort is stable.
		return sorted(found, key=lambda rel: len(rel.parts), reverse=True)

	def process_csv():
		"""Keep only the ``*_all.csv`` exports and rename them to plain ``*.csv``.

		Works recursively on the current working directory, in two passes:

		1. For every ``<name>_all.csv``, delete the sibling ``<name>.csv`` if
		   it exists.
		2. Rename every ``<name>_all.csv`` to ``<name>.csv``.
		"""
		marker = "_all.csv"

		def plain_csv_path(csv_path):
			# Strip only the trailing marker: str.replace() would also rewrite
			# an "_all" that happens to be part of the page title, and both
			# passes must agree on the target name.
			return csv_path.with_name(csv_path.name.removesuffix(marker) + ".csv")

		h2("Process CSV: Keep Only '*_all.csv'")
		# "**/" is required for recursive=True to descend into subfolders.
		for csv_path_str in glob("**/*_all.csv", recursive=True):
			csv_path = Path(csv_path_str)
			highlight(csv_path)
			other_csv_path = plain_csv_path(csv_path)
			if other_csv_path.is_file():
				print(f"  Deleting {other_csv_path}")
				other_csv_path.unlink()

		h2("Process CSV: Rename '*_all.csv'")
		for csv_path_str in glob("**/*_all.csv", recursive=True):
			csv_path = Path(csv_path_str)
			print(csv_path)
			csv_path.rename(plain_csv_path(csv_path))


	def process_folders():
	h2("Process Folders")

	errors = []

	# Loop over folders (deepest first)
	for folder_path in get_folders_by_depth_desc():
	# Skip if already has UUID
	if re.search(fr"{UUID_SUFFIX_RE}$", str(folder_path)):
	print(f"{DIM_GRAY}Skip {folder_path}{RESET}")
	continue

	highlight(folder_path)

	# Find matching .md or .csv files with UUID
	pattern = re.compile(fr"{re.escape(str(folder_path))}{UUID_SUFFIX_RE}\.{FILE_EXT_RE}$")

	# Get potential files
	candidates = glob(f"{folder_path}*")
	files = [f for f in candidates if pattern.match(f)]

	match len(files):
	case 0:
	print(f" 🔴 {RED}Not found{RESET}")
	errors.append(f"Not Found: {CYAN}{folder_path}{RED}")
	case 1:
	# Extract UUID suffix
	match = re.search(fr"({UUID_SUFFIX_RE})\.{FILE_EXT_RE}$", files[0])
	if match:
	suffix = match.group(1)
	new_folder_path = Path(f"{folder_path}{suffix}")
	print(f" 🟢 Found: {Path(files[0]).name}")
	print(f" {GREEN}Rename: {new_folder_path}{RESET}")
	folder_path.rename(new_folder_path)
	case _:
	for f in files:
	print(f" 🔴 {RED}{f}{RESET}")
	errors.append(f"Multiple: {CYAN}{folder_path}{RED}")

	print()

	if errors:
	print(f"{BOLD}{RED}ERRORS:{RESET}")
	for error in errors:
	print(f"{RED}{error}{RESET}")


	def parse_args(argv=None):
		"""Parse the command line and return the namespace with resolved folders.

		Args:
			argv: Argument list to parse. ``None`` (the default) makes argparse
				read ``sys.argv[1:]``.

		Returns:
			The ``argparse.Namespace``; its ``folders`` attribute is a list of
			absolute ``Path`` objects, one per positional argument.
		"""
		parser = argparse.ArgumentParser(description="Clean Notion export by standardizing folder/file names.")
		parser.add_argument("folders", nargs="+", help="Notion export folders to clean")
		args = parser.parse_args(argv)

		# Resolve up front so each path stays valid after the working directory changes.
		args.folders = [Path(folder).resolve() for folder in args.folders]

		return args


	def main():
		"""Clean every folder named on the command line, one after another.

		Paths that do not exist or are not directories are reported and skipped.
		"""
		args = parse_args()

		for folder in args.folders:
			h1(f"Cleaning {CYAN}{folder}{HEADING}")
			if not folder.exists():
				print(f"{RED}ERROR: Folder '{CYAN}{folder}{RED}' not found{RESET}")
				continue
			if not folder.is_dir():
				# os.chdir() would raise NotADirectoryError on a regular file
				print(f"{RED}ERROR: '{CYAN}{folder}{RED}' is not a folder{RESET}")
				continue

			# Both processing steps operate on the current working directory.
			os.chdir(folder)
			process_csv()
			process_folders()


	# Run the cleaner only when this file is executed as a script, not on import.
	if __name__ == "__main__":
		main()
No results found