victorespigares/convert_rtf_plain.py

## convert_rtf_plain.py
#!/usr/bin/env python3
"""
Convert nvALT RTF notes to Markdown using textutil.
- Uses textutil for clean, fast conversion
- Extracts tags from filename [tag], converts to camelCase
- Escapes hashtags to prevent FSNotes tag interpretation
- Preserves file timestamps and original filename
- Only adds front matter if there are tags
"""

import sys
import subprocess
import re
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
import os

def extract_tags_from_filename(filename: str) -> list:
    """
    Extract [tag] patterns from a filename and convert them to camelCase.

    Each bracketed block may hold several comma-separated tags. A tag is split
    on whitespace; its first word is lower-cased and every following word is
    capitalized and appended. Blank entries (as in "[a,,b]" or "[ ]") are
    skipped so that no empty tag is ever returned.

    [metodo arde] -> metodoArde
    [metodo arde,formacion] -> metodoArde, formacion

    Args:
        filename: File name (with or without extension) to scan for tags.

    Returns:
        List of camelCase tag strings in order of appearance; duplicates are kept.
    """
    tags = []

    for block in re.findall(r'\[([^\]]+)\]', filename):
        for part in block.split(','):
            words = part.split()
            if not words:
                # A blank entry would otherwise become an empty tag ("#").
                continue
            head, *tail = words
            tags.append(head.lower() + ''.join(w.capitalize() for w in tail))

    return tags

def escape_hashtags(content: str) -> str:
    """
    Escape hashtags so FSNotes does not interpret them as tags.

    Every '#' that is directly followed by a letter or underscore gets a space
    inserted after it (#word -> # word). Letters are matched Unicode-aware, so
    tags starting with accented characters are escaped too. '#' followed by a
    digit (e.g. "#12") is left alone.

    Lines starting with '# ' or '##' are treated as markdown headers: their
    leading run of '#' is kept untouched, and only the text after it is escaped.

    Args:
        content: Full note text, lines separated by '\\n'.

    Returns:
        The text with hashtags escaped, using the same line structure.
    """
    escaped_lines = []

    for line in content.split('\n'):
        marker = ''
        if line.startswith(('# ', '##')):
            # Header line: protect the leading '#' run, escape only what follows.
            marker = re.match(r'#+', line).group()
        rest = line[len(marker):]

        # [^\W\d] = any word character except digits, i.e. letters and '_'.
        escaped_lines.append(marker + re.sub(r'#(?=[^\W\d])', '# ', rest))

    return '\n'.join(escaped_lines)

def convert_single_rtf(rtf_file_str: str, output_dir_str: str) -> tuple:
    """
    Convert a single RTF file to Markdown using textutil.

    Steps: run `textutil -convert txt` to write "<stem>.md" into the output
    directory, read the result back, trim it, collapse runs of blank lines,
    escape hashtags, prepend a front matter block when the file name carries
    [tags], write the file again and copy the source's modification time onto it.

    Never raises: any failure is reported through the returned tuple, so the
    function is safe to run in a worker process.

    Args:
        rtf_file_str: Path of the source .rtf file.
        output_dir_str: Directory that receives the .md file.

    Returns:
        Tuple (source file name, success flag, error message or None).
    """
    # Resolved before the try block so every failure report carries the name.
    source_name = os.path.basename(str(rtf_file_str))

    try:
        rtf_file = Path(rtf_file_str)

        # Keep original filename (with tags and all), only swap the extension.
        md_file = Path(output_dir_str) / (rtf_file.stem + ".md")

        # Integer nanoseconds avoid the rounding of the float-based mtime.
        mtime_ns = os.stat(rtf_file).st_mtime_ns

        file_tags = extract_tags_from_filename(rtf_file.stem)

        # Use textutil to convert RTF to plain text (needs `textutil` on PATH).
        result = subprocess.run(
            [
                'textutil',
                '-convert', 'txt',
                '-encoding', 'UTF-8',
                '-output', str(md_file),
                str(rtf_file),
            ],
            capture_output=True,
            text=True,
            timeout=30,
        )

        if result.returncode != 0:
            return (source_name, False, f"textutil failed: {result.stderr.strip()}")

        content = md_file.read_text(encoding='utf-8').strip()

        # Collapse two or more consecutive blank lines into a single one.
        content = re.sub(r'\n{3,}', '\n\n', content)

        content = escape_hashtags(content)

        # Only add front matter if there are tags; one empty line follows it.
        if file_tags:
            tags_str = ' '.join(f'#{tag}' for tag in file_tags)
            content = f'---\ntags: {tags_str}\n---\n\n{content}'

        md_file.write_text(content, encoding='utf-8')

        # Preserve original file timestamp (access and modification time).
        os.utime(md_file, ns=(mtime_ns, mtime_ns))

        return (source_name, True, None)

    except Exception as e:
        # Worker boundary: report the failure instead of propagating it.
        return (source_name, False, str(e))

def convert_rtf_to_markdown(input_dir: str, output_dir: str, max_workers: int = 4):
    """
    Batch convert all .rtf files to Markdown with parallel processing.

    Creates the output directory, submits every .rtf file found directly in
    the input directory (extension matched case-insensitively, files in sorted
    order) to a process pool, prints one progress line per finished file and
    finally a summary with the first ten failures.

    A worker that fails outright (for example because its process died) is
    counted as a failed conversion instead of aborting the whole batch.

    Args:
        input_dir: Directory containing the .rtf files; "~" is expanded.
        output_dir: Directory for the .md files; "~" is expanded.
        max_workers: Number of worker processes.

    Returns:
        None. Progress and results are printed to stdout.
    """
    input_path = Path(input_dir).expanduser()
    output_path = Path(output_dir).expanduser()
    output_path.mkdir(parents=True, exist_ok=True)

    # Character classes make the match case-insensitive (.rtf, .RTF, ...);
    # sorting gives a reproducible submission order.
    rtf_files = sorted(input_path.glob("*.[rR][tT][fF]"))

    if not rtf_files:
        print(f"❌ No .rtf files found in {input_path}")
        return

    total = len(rtf_files)
    print(f"🔄 Found {total} .rtf files to convert")
    print(f"📁 Input:  {input_path}")
    print(f"📁 Output: {output_path}")
    print(f"⚡ Using {max_workers} parallel workers\n")

    success_count = 0
    errors = []
    output_dir_str = str(output_path)

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Paths are passed as plain strings; the mapping lets a failed future
        # be traced back to its source file.
        futures = {
            executor.submit(convert_single_rtf, str(rtf_file), output_dir_str): rtf_file
            for rtf_file in rtf_files
        }

        for idx, future in enumerate(as_completed(futures), 1):
            try:
                filename, success, error = future.result()
            except Exception as exc:
                # The worker itself failed; keep processing the other files.
                filename = futures[future].name
                success = False
                error = f"worker failed: {exc}"

            if success:
                print(f"✅ [{idx:4d}/{total}] {filename}")
                success_count += 1
            else:
                print(f"❌ [{idx:4d}/{total}] {filename} - Error: {error}")
                errors.append((filename, error))

    print(f"\n{'='*70}")
    print(f"📊 Conversion complete!")
    print(f"   ✅ Success: {success_count}")
    print(f"   ❌ Errors:  {len(errors)}")
    print(f"   📁 Output:  {output_path}")
    print(f"{'='*70}")

    if errors:
        # Messages were already printed inline; the recap lists names only.
        print(f"\n⚠️  Failed conversions (first 10):")
        for filename, _error in errors[:10]:
            print(f"   • {filename}")
        if len(errors) > 10:
            print(f"   ... and {len(errors) - 10} more")

def main():
    """
    Command-line entry point.

    Optional positional arguments: the first overrides the input directory,
    the second overrides the output directory. Missing arguments fall back to
    the built-in default locations.
    """
    defaults = [
        "~/Library/Application Support/Notational Data",
        "~/Library/Application Support/Notational Data/md-out",
    ]

    # Take at most two CLI arguments and fill the remainder from the defaults.
    cli_args = sys.argv[1:3]
    input_dir, output_dir = cli_args + defaults[len(cli_args):]

    convert_rtf_to_markdown(input_dir, output_dir)

# Entry-point guard: run the conversion only when this file is executed as a
# script, not when it is imported. ProcessPoolExecutor workers may re-import
# this module, so the guard also keeps them from re-running main().
if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Convert nvALT RTF notes to Markdown using textutil.
	- Uses textutil for clean, fast conversion
	- Extracts tags from filename [tag], converts to camelCase
	- Escapes hashtags to prevent FSNotes tag interpretation
	- Preserves file timestamps and original filename
	- Only adds front matter if there are tags
	"""

	import sys
	import subprocess
	import re
	from pathlib import Path
	from concurrent.futures import ProcessPoolExecutor, as_completed
	import os

	def extract_tags_from_filename(filename: str) -> list:
	    """
	    Extract [tag] patterns from a filename and convert them to camelCase.

	    Each bracketed block may hold several comma-separated tags. A tag is split
	    on whitespace; its first word is lower-cased and every following word is
	    capitalized and appended. Blank entries (as in "[a,,b]" or "[ ]") are
	    skipped so that no empty tag is ever returned.

	    [metodo arde] -> metodoArde
	    [metodo arde,formacion] -> metodoArde, formacion

	    Args:
	        filename: File name (with or without extension) to scan for tags.

	    Returns:
	        List of camelCase tag strings in order of appearance; duplicates are kept.
	    """
	    tags = []

	    for block in re.findall(r'\[([^\]]+)\]', filename):
	        for part in block.split(','):
	            words = part.split()
	            if not words:
	                # A blank entry would otherwise become an empty tag ("#").
	                continue
	            head, *tail = words
	            tags.append(head.lower() + ''.join(w.capitalize() for w in tail))

	    return tags

	def escape_hashtags(content: str) -> str:
	    """
	    Escape hashtags so FSNotes does not interpret them as tags.

	    Every '#' that is directly followed by a letter or underscore gets a space
	    inserted after it (#word -> # word). Letters are matched Unicode-aware, so
	    tags starting with accented characters are escaped too. '#' followed by a
	    digit (e.g. "#12") is left alone.

	    Lines starting with '# ' or '##' are treated as markdown headers: their
	    leading run of '#' is kept untouched, and only the text after it is escaped.

	    Args:
	        content: Full note text, lines separated by '\\n'.

	    Returns:
	        The text with hashtags escaped, using the same line structure.
	    """
	    escaped_lines = []

	    for line in content.split('\n'):
	        marker = ''
	        if line.startswith(('# ', '##')):
	            # Header line: protect the leading '#' run, escape only what follows.
	            marker = re.match(r'#+', line).group()
	        rest = line[len(marker):]

	        # [^\W\d] = any word character except digits, i.e. letters and '_'.
	        escaped_lines.append(marker + re.sub(r'#(?=[^\W\d])', '# ', rest))

	    return '\n'.join(escaped_lines)

	def convert_single_rtf(rtf_file_str: str, output_dir_str: str) -> tuple:
	    """
	    Convert a single RTF file to Markdown using textutil.

	    Steps: run `textutil -convert txt` to write "<stem>.md" into the output
	    directory, read the result back, trim it, collapse runs of blank lines,
	    escape hashtags, prepend a front matter block when the file name carries
	    [tags], write the file again and copy the source's modification time onto it.

	    Never raises: any failure is reported through the returned tuple, so the
	    function is safe to run in a worker process.

	    Args:
	        rtf_file_str: Path of the source .rtf file.
	        output_dir_str: Directory that receives the .md file.

	    Returns:
	        Tuple (source file name, success flag, error message or None).
	    """
	    # Resolved before the try block so every failure report carries the name.
	    source_name = os.path.basename(str(rtf_file_str))

	    try:
	        rtf_file = Path(rtf_file_str)

	        # Keep original filename (with tags and all), only swap the extension.
	        md_file = Path(output_dir_str) / (rtf_file.stem + ".md")

	        # Integer nanoseconds avoid the rounding of the float-based mtime.
	        mtime_ns = os.stat(rtf_file).st_mtime_ns

	        file_tags = extract_tags_from_filename(rtf_file.stem)

	        # Use textutil to convert RTF to plain text (needs `textutil` on PATH).
	        result = subprocess.run(
	            [
	                'textutil',
	                '-convert', 'txt',
	                '-encoding', 'UTF-8',
	                '-output', str(md_file),
	                str(rtf_file),
	            ],
	            capture_output=True,
	            text=True,
	            timeout=30,
	        )

	        if result.returncode != 0:
	            return (source_name, False, f"textutil failed: {result.stderr.strip()}")

	        content = md_file.read_text(encoding='utf-8').strip()

	        # Collapse two or more consecutive blank lines into a single one.
	        content = re.sub(r'\n{3,}', '\n\n', content)

	        content = escape_hashtags(content)

	        # Only add front matter if there are tags; one empty line follows it.
	        if file_tags:
	            tags_str = ' '.join(f'#{tag}' for tag in file_tags)
	            content = f'---\ntags: {tags_str}\n---\n\n{content}'

	        md_file.write_text(content, encoding='utf-8')

	        # Preserve original file timestamp (access and modification time).
	        os.utime(md_file, ns=(mtime_ns, mtime_ns))

	        return (source_name, True, None)

	    except Exception as e:
	        # Worker boundary: report the failure instead of propagating it.
	        return (source_name, False, str(e))

	def convert_rtf_to_markdown(input_dir: str, output_dir: str, max_workers: int = 4):
	    """
	    Batch convert all .rtf files to Markdown with parallel processing.

	    Creates the output directory, submits every .rtf file found directly in
	    the input directory (extension matched case-insensitively, files in sorted
	    order) to a process pool, prints one progress line per finished file and
	    finally a summary with the first ten failures.

	    A worker that fails outright (for example because its process died) is
	    counted as a failed conversion instead of aborting the whole batch.

	    Args:
	        input_dir: Directory containing the .rtf files; "~" is expanded.
	        output_dir: Directory for the .md files; "~" is expanded.
	        max_workers: Number of worker processes.

	    Returns:
	        None. Progress and results are printed to stdout.
	    """
	    input_path = Path(input_dir).expanduser()
	    output_path = Path(output_dir).expanduser()
	    output_path.mkdir(parents=True, exist_ok=True)

	    # Character classes make the match case-insensitive (.rtf, .RTF, ...);
	    # sorting gives a reproducible submission order.
	    rtf_files = sorted(input_path.glob("*.[rR][tT][fF]"))

	    if not rtf_files:
	        print(f"❌ No .rtf files found in {input_path}")
	        return

	    total = len(rtf_files)
	    print(f"🔄 Found {total} .rtf files to convert")
	    print(f"📁 Input:  {input_path}")
	    print(f"📁 Output: {output_path}")
	    print(f"⚡ Using {max_workers} parallel workers\n")

	    success_count = 0
	    errors = []
	    output_dir_str = str(output_path)

	    with ProcessPoolExecutor(max_workers=max_workers) as executor:
	        # Paths are passed as plain strings; the mapping lets a failed future
	        # be traced back to its source file.
	        futures = {
	            executor.submit(convert_single_rtf, str(rtf_file), output_dir_str): rtf_file
	            for rtf_file in rtf_files
	        }

	        for idx, future in enumerate(as_completed(futures), 1):
	            try:
	                filename, success, error = future.result()
	            except Exception as exc:
	                # The worker itself failed; keep processing the other files.
	                filename = futures[future].name
	                success = False
	                error = f"worker failed: {exc}"

	            if success:
	                print(f"✅ [{idx:4d}/{total}] {filename}")
	                success_count += 1
	            else:
	                print(f"❌ [{idx:4d}/{total}] {filename} - Error: {error}")
	                errors.append((filename, error))

	    print(f"\n{'='*70}")
	    print(f"📊 Conversion complete!")
	    print(f"   ✅ Success: {success_count}")
	    print(f"   ❌ Errors:  {len(errors)}")
	    print(f"   📁 Output:  {output_path}")
	    print(f"{'='*70}")

	    if errors:
	        # Messages were already printed inline; the recap lists names only.
	        print(f"\n⚠️  Failed conversions (first 10):")
	        for filename, _error in errors[:10]:
	            print(f"   • {filename}")
	        if len(errors) > 10:
	            print(f"   ... and {len(errors) - 10} more")

	def main():
	    """
	    Command-line entry point.

	    Optional positional arguments: the first overrides the input directory,
	    the second overrides the output directory. Missing arguments fall back to
	    the built-in default locations.
	    """
	    input_dir = "~/Library/Application Support/Notational Data"
	    output_dir = "~/Library/Application Support/Notational Data/md-out"

	    if len(sys.argv) > 1:
	        input_dir = sys.argv[1]
	    if len(sys.argv) > 2:
	        output_dir = sys.argv[2]

	    convert_rtf_to_markdown(input_dir, output_dir)

	# Entry-point guard: run the conversion only when this file is executed as a
	# script, not when it is imported. ProcessPoolExecutor workers may re-import
	# this module, so the guard also keeps them from re-running main().
	if __name__ == "__main__":
	    main()
No results found