heychriszappa/perplexity_markdown_cleaner.py

## perplexity_markdown_cleaner.py
#!/usr/bin/env python3
"""
Script that automatically removes the references section, square-bracket references,
and the Perplexity image from a Perplexity markdown export.

Examples:
  # Process a single file (creates backup)
  python perplexity_markdown_cleaner.py file.md

  # Process a single file without backup
  python perplexity_markdown_cleaner.py file.md --no-backup

  # Process a single file and save to new location
  python perplexity_markdown_cleaner.py input.md --output cleaned.md

  # Process all .md files in a directory
  python perplexity_markdown_cleaner.py --directory /path/to/docs

  # Process all .txt files in current directory (non-recursive)
  python perplexity_markdown_cleaner.py --directory . --pattern "*.txt" --no-recursive

"""

import re
import argparse
import os
from pathlib import Path


def remove_references(text):
    """
    Strip Perplexity export artifacts (image tags, sources, citations) from text.

    The following are removed, in this order:

    1. Every ``<img src="...">`` tag, wherever it appears in the text.
    2. Empty hidden ``<span style="display:none"></span>`` elements.
    3. Everything from the ``<div ...>⁂</div>`` separator to the end of the
       text (the references section).
    4. Remaining footnote definition lines such as ``[^1]: https://...``.
    5. Inline citation markers such as ``[^1]`` or ``[^1][^2]``.

    Runs of blank lines left behind are collapsed to a single blank line and
    the result is stripped of leading and trailing whitespace.

    Args:
        text (str): Input text containing references

    Returns:
        str: Text with references removed
    """
    # Image tags. Trailing whitespace is consumed as well so that no empty
    # line is left where the tag used to be.
    cleaned_text = re.sub(
        r'<img\s+src="[^"]*"[^>]*/?>\s*\n?', '', text, flags=re.IGNORECASE
    )

    # Hidden span elements like <span style="display:none"></span>
    cleaned_text = re.sub(
        r'<span\s+style="display:\s*none"[^>]*></span>\s*',
        '',
        cleaned_text,
        flags=re.IGNORECASE,
    )

    # References section: with DOTALL and without MULTILINE, `$` anchors at
    # the end of the text, so everything after the ⁂ divider is dropped.
    cleaned_text = re.sub(
        r'<div[^>]*>⁂</div>\s*\n*(.*?)$', '', cleaned_text, flags=re.DOTALL
    )

    # Footnote definitions must be removed BEFORE the inline markers:
    # otherwise "[^1]: https://..." is first reduced to ": https://..." and
    # can no longer be recognised. `.*` (not `\s*.*`) keeps the match on a
    # single line so an empty definition cannot swallow the next line.
    cleaned_text = re.sub(
        r'^\[\^\d+\]:.*$', '', cleaned_text, flags=re.MULTILINE
    )

    # Inline markers, including consecutive ones like [^1][^2]. The group is
    # repeated as a whole so that unrelated closing brackets are preserved.
    cleaned_text = re.sub(r'(?:\[\^\d+\])+', '', cleaned_text)

    # Collapse the extra blank lines that the removals leave behind
    cleaned_text = re.sub(r'\n\s*\n\s*\n', '\n\n', cleaned_text)

    return cleaned_text.strip()


def process_file(file_path, output_path=None, backup=True):
    """
    Clean a single file with remove_references and write the result.

    Args:
        file_path (str): Path to the input file
        output_path (str, optional): Path for output file. If None, overwrites original
        backup (bool): Whether to keep a copy of the original next to it as
            ``<name><suffix>.backup``. A backup is only made when the input
            file itself is about to be overwritten.

    Returns:
        bool: True if successful, False otherwise (the error is printed)
    """
    try:
        file_path = Path(file_path)

        # Check if file exists
        if not file_path.exists():
            print(f"Error: File '{file_path}' does not exist.")
            return False

        # Read the original file
        with open(file_path, 'r', encoding='utf-8') as f:
            original_content = f.read()

        # Remove references
        cleaned_content = remove_references(original_content)

        # Determine output path
        destination = file_path if output_path is None else Path(output_path)

        # The original is lost whenever the destination is the input file,
        # which also happens when output_path merely points back at it.
        overwrites_input = destination.resolve() == file_path.resolve()

        # Create backup if requested
        if backup and overwrites_input:
            backup_path = file_path.with_suffix(file_path.suffix + '.backup')
            # Copy the raw bytes: going through text mode would normalise
            # line endings, so the backup would not be a faithful copy.
            backup_path.write_bytes(file_path.read_bytes())
            print(f"Backup created: {backup_path}")

        # Write cleaned content
        with open(destination, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)

        print(f"References removed from: {file_path}")
        if destination != file_path:
            print(f"Output written to: {destination}")

        return True

    except Exception as e:
        # Per-file boundary: report the problem and signal failure through
        # the return value so that a directory run can carry on.
        print(f"Error processing file '{file_path}': {str(e)}")
        return False


def process_directory(directory_path, pattern="*.md", recursive=True, backup=True):
    """
    Process all regular files in a directory matching the given pattern.

    Args:
        directory_path (str): Path to the directory
        pattern (str): File pattern to match (default: "*.md")
        recursive (bool): Whether to search subdirectories
        backup (bool): Whether to create backups

    Returns:
        tuple: (success_count, total_count). (0, 0) is returned when the
        directory is missing, is not a directory, or contains no matching
        file.
    """
    directory_path = Path(directory_path)

    if not directory_path.exists():
        print(f"Error: Directory '{directory_path}' does not exist.")
        return 0, 0

    if not directory_path.is_dir():
        print(f"Error: '{directory_path}' is not a directory.")
        return 0, 0

    # Find matching files
    find = directory_path.rglob if recursive else directory_path.glob
    # A glob also matches directories (e.g. a folder named "notes.md");
    # only regular files can be cleaned. Sorting gives a reproducible order.
    files = sorted(path for path in find(pattern) if path.is_file())

    if not files:
        print(f"No files matching pattern '{pattern}' found in '{directory_path}'")
        return 0, 0

    print(f"Found {len(files)} files to process...")

    success_count = 0
    for file_path in files:
        print(f"\nProcessing: {file_path}")
        if process_file(file_path, backup=backup):
            success_count += 1

    return success_count, len(files)


def main():
    """
    Parse the command line arguments and run the cleaner.

    Exactly one of the positional ``input`` file or ``--directory`` must be
    given; ``--output`` is only valid together with ``input``. With
    ``--dry-run`` nothing is written: the matching files (directory mode) or
    the inline references found (single file mode) are only listed.

    Raises:
        SystemExit: With status 2 (raised by argparse) for invalid argument
            combinations, and with status 1 when at least one file could not
            be processed, so that calling scripts can detect the failure.
    """
    parser = argparse.ArgumentParser(
        description="Remove square bracket references from markdown files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process a single file (creates backup)
  python perplexity_markdown_cleaner.py file.md

  # Process a single file without backup
  python perplexity_markdown_cleaner.py file.md --no-backup

  # Process a single file and save to new location
  python perplexity_markdown_cleaner.py input.md --output cleaned.md

  # Process all .md files in a directory
  python perplexity_markdown_cleaner.py --directory /path/to/docs

  # Process all .txt files in current directory (non-recursive)
  python perplexity_markdown_cleaner.py --directory . --pattern "*.txt" --no-recursive
        """
    )

    # File or directory input
    parser.add_argument(
        'input',
        nargs='?',
        help='Input file path (if not using --directory)'
    )

    parser.add_argument(
        '--output', '-o',
        help='Output file path (only for single file processing)'
    )

    parser.add_argument(
        '--directory', '-d',
        help='Process all files in directory matching pattern'
    )

    parser.add_argument(
        '--pattern', '-p',
        default='*.md',
        help='File pattern to match when processing directory (default: *.md)'
    )

    # --recursive/--backup already default to True; they exist so the
    # default can be spelled out. The --no-* flags below switch them off.
    parser.add_argument(
        '--recursive', '-r',
        action='store_true',
        default=True,
        help='Search subdirectories (default: True)'
    )

    parser.add_argument(
        '--no-recursive',
        action='store_true',
        help='Do not search subdirectories'
    )

    parser.add_argument(
        '--backup', '-b',
        action='store_true',
        default=True,
        help='Create backup files (default: True)'
    )

    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Do not create backup files'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be processed without making changes'
    )

    args = parser.parse_args()

    # The negative flags win over the (always true) positive ones
    if args.no_recursive:
        args.recursive = False

    if args.no_backup:
        args.backup = False

    # Validate arguments (parser.error prints the usage and exits)
    if not args.directory and not args.input:
        parser.error("Either provide an input file or use --directory option")

    if args.directory and args.input:
        parser.error("Cannot use both input file and --directory option")

    if args.output and args.directory:
        parser.error("Cannot use --output with --directory option")

    # Dry run mode
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print("-" * 50)

    # Process directory
    if args.directory:
        print(f"Processing directory: {args.directory}")
        print(f"Pattern: {args.pattern}")
        print(f"Recursive: {args.recursive}")
        print(f"Backup: {args.backup}")
        print("-" * 50)

        if args.dry_run:
            directory_path = Path(args.directory)
            if args.recursive:
                files = list(directory_path.rglob(args.pattern))
            else:
                files = list(directory_path.glob(args.pattern))
            print(f"Would process {len(files)} files:")
            for file_path in files:
                print(f"  - {file_path}")
            return

        success_count, total_count = process_directory(
            args.directory,
            args.pattern,
            args.recursive,
            args.backup
        )
        print(f"\nCompleted: {success_count}/{total_count} files processed successfully")
        if success_count < total_count:
            # Report partial failure through the exit status
            raise SystemExit(1)
        return

    # Process single file
    print(f"Processing file: {args.input}")
    print(f"Output: {args.output or 'overwrite original'}")
    print(f"Backup: {args.backup}")
    print("-" * 50)

    if args.dry_run:
        if Path(args.input).exists():
            print(f"Would process: {args.input}")
            with open(args.input, 'r', encoding='utf-8') as f:
                content = f.read()
            # List the distinct inline reference markers found in the file
            references = re.findall(r'\[\^\d+\]+', content)
            if references:
                print(f"Found references to remove: {set(references)}")
            else:
                print("No references found in file")
        else:
            print(f"File does not exist: {args.input}")
        return

    success = process_file(args.input, args.output, args.backup)
    if success:
        print("File processed successfully!")
    else:
        print("Failed to process file.")
        # Report the failure through the exit status
        raise SystemExit(1)


# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Script that automatically removes the references section, square-bracket references,
	and the Perplexity image from a Perplexity markdown export.

	Examples:
	# Process a single file (creates backup)
	python perplexity_markdown_cleaner.py file.md

	# Process a single file without backup
	python perplexity_markdown_cleaner.py file.md --no-backup

	# Process a single file and save to new location
	python perplexity_markdown_cleaner.py input.md --output cleaned.md

	# Process all .md files in a directory
	python perplexity_markdown_cleaner.py --directory /path/to/docs

	# Process all .txt files in current directory (non-recursive)
	python perplexity_markdown_cleaner.py --directory . --pattern "*.txt" --no-recursive

	"""

	import re
	import argparse
	import os
	from pathlib import Path


	def remove_references(text):
	    """
	    Strip Perplexity export artifacts (image tags, sources, citations) from text.

	    The following are removed, in this order:

	    1. Every ``<img src="...">`` tag, wherever it appears in the text.
	    2. Empty hidden ``<span style="display:none"></span>`` elements.
	    3. Everything from the ``<div ...>⁂</div>`` separator to the end of the
	       text (the references section).
	    4. Remaining footnote definition lines such as ``[^1]: https://...``.
	    5. Inline citation markers such as ``[^1]`` or ``[^1][^2]``.

	    Runs of blank lines left behind are collapsed to a single blank line and
	    the result is stripped of leading and trailing whitespace.

	    Args:
	        text (str): Input text containing references

	    Returns:
	        str: Text with references removed
	    """
	    # NOTE: the `*` quantifiers below had been lost from this copy of the
	    # patterns, which made them match (almost) nothing; they are restored.

	    # Image tags. Trailing whitespace is consumed as well so that no empty
	    # line is left where the tag used to be.
	    cleaned_text = re.sub(
	        r'<img\s+src="[^"]*"[^>]*/?>\s*\n?', '', text, flags=re.IGNORECASE
	    )

	    # Hidden span elements like <span style="display:none"></span>
	    cleaned_text = re.sub(
	        r'<span\s+style="display:\s*none"[^>]*></span>\s*',
	        '',
	        cleaned_text,
	        flags=re.IGNORECASE,
	    )

	    # References section: with DOTALL and without MULTILINE, `$` anchors at
	    # the end of the text, so everything after the ⁂ divider is dropped.
	    cleaned_text = re.sub(
	        r'<div[^>]*>⁂</div>\s*\n*(.*?)$', '', cleaned_text, flags=re.DOTALL
	    )

	    # Footnote definitions must be removed BEFORE the inline markers:
	    # otherwise "[^1]: https://..." is first reduced to ": https://..." and
	    # can no longer be recognised. `.*` keeps the match on a single line.
	    cleaned_text = re.sub(
	        r'^\[\^\d+\]:.*$', '', cleaned_text, flags=re.MULTILINE
	    )

	    # Inline markers, including consecutive ones like [^1][^2]. The group is
	    # repeated as a whole so that unrelated closing brackets are preserved.
	    cleaned_text = re.sub(r'(?:\[\^\d+\])+', '', cleaned_text)

	    # Collapse the extra blank lines that the removals leave behind
	    cleaned_text = re.sub(r'\n\s*\n\s*\n', '\n\n', cleaned_text)

	    return cleaned_text.strip()


	def process_file(file_path, output_path=None, backup=True):
	    """
	    Clean a single file with remove_references and write the result.

	    Args:
	        file_path (str): Path to the input file
	        output_path (str, optional): Path for output file. If None, overwrites original
	        backup (bool): Whether to keep a copy of the original next to it as
	            ``<name><suffix>.backup``. A backup is only made when the input
	            file itself is about to be overwritten.

	    Returns:
	        bool: True if successful, False otherwise (the error is printed)
	    """
	    try:
	        file_path = Path(file_path)

	        # Check if file exists
	        if not file_path.exists():
	            print(f"Error: File '{file_path}' does not exist.")
	            return False

	        # Read the original file
	        with open(file_path, 'r', encoding='utf-8') as f:
	            original_content = f.read()

	        # Remove references
	        cleaned_content = remove_references(original_content)

	        # Determine output path
	        destination = file_path if output_path is None else Path(output_path)

	        # The original is lost whenever the destination is the input file,
	        # which also happens when output_path merely points back at it.
	        overwrites_input = destination.resolve() == file_path.resolve()

	        # Create backup if requested
	        if backup and overwrites_input:
	            backup_path = file_path.with_suffix(file_path.suffix + '.backup')
	            # Copy the raw bytes: going through text mode would normalise
	            # line endings, so the backup would not be a faithful copy.
	            backup_path.write_bytes(file_path.read_bytes())
	            print(f"Backup created: {backup_path}")

	        # Write cleaned content
	        with open(destination, 'w', encoding='utf-8') as f:
	            f.write(cleaned_content)

	        print(f"References removed from: {file_path}")
	        if destination != file_path:
	            print(f"Output written to: {destination}")

	        return True

	    except Exception as e:
	        # Per-file boundary: report the problem and signal failure through
	        # the return value so that a directory run can carry on.
	        print(f"Error processing file '{file_path}': {str(e)}")
	        return False


	def process_directory(directory_path, pattern="*.md", recursive=True, backup=True):
	    """
	    Process all regular files in a directory matching the given pattern.

	    Args:
	        directory_path (str): Path to the directory
	        pattern (str): File pattern to match (default: "*.md")
	        recursive (bool): Whether to search subdirectories
	        backup (bool): Whether to create backups

	    Returns:
	        tuple: (success_count, total_count). (0, 0) is returned when the
	        directory is missing, is not a directory, or contains no matching
	        file.
	    """
	    directory_path = Path(directory_path)

	    if not directory_path.exists():
	        print(f"Error: Directory '{directory_path}' does not exist.")
	        return 0, 0

	    if not directory_path.is_dir():
	        print(f"Error: '{directory_path}' is not a directory.")
	        return 0, 0

	    # Find matching files
	    find = directory_path.rglob if recursive else directory_path.glob
	    # A glob also matches directories (e.g. a folder named "notes.md");
	    # only regular files can be cleaned. Sorting gives a reproducible order.
	    files = sorted(path for path in find(pattern) if path.is_file())

	    if not files:
	        print(f"No files matching pattern '{pattern}' found in '{directory_path}'")
	        return 0, 0

	    print(f"Found {len(files)} files to process...")

	    success_count = 0
	    for file_path in files:
	        print(f"\nProcessing: {file_path}")
	        if process_file(file_path, backup=backup):
	            success_count += 1

	    return success_count, len(files)


	def main():
	    """
	    Parse the command line arguments and run the cleaner.

	    Exactly one of the positional ``input`` file or ``--directory`` must be
	    given; ``--output`` is only valid together with ``input``. With
	    ``--dry-run`` nothing is written: the matching files (directory mode) or
	    the inline references found (single file mode) are only listed.

	    Raises:
	        SystemExit: With status 2 (raised by argparse) for invalid argument
	            combinations, and with status 1 when at least one file could not
	            be processed, so that calling scripts can detect the failure.
	    """
	    parser = argparse.ArgumentParser(
	        description="Remove square bracket references from markdown files",
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	        epilog="""
	Examples:
	# Process a single file (creates backup)
	python perplexity_markdown_cleaner.py file.md

	# Process a single file without backup
	python perplexity_markdown_cleaner.py file.md --no-backup

	# Process a single file and save to new location
	python perplexity_markdown_cleaner.py input.md --output cleaned.md

	# Process all .md files in a directory
	python perplexity_markdown_cleaner.py --directory /path/to/docs

	# Process all .txt files in current directory (non-recursive)
	python perplexity_markdown_cleaner.py --directory . --pattern "*.txt" --no-recursive
	"""
	    )

	    # File or directory input
	    parser.add_argument(
	        'input',
	        nargs='?',
	        help='Input file path (if not using --directory)'
	    )

	    parser.add_argument(
	        '--output', '-o',
	        help='Output file path (only for single file processing)'
	    )

	    parser.add_argument(
	        '--directory', '-d',
	        help='Process all files in directory matching pattern'
	    )

	    parser.add_argument(
	        '--pattern', '-p',
	        default='*.md',
	        help='File pattern to match when processing directory (default: *.md)'
	    )

	    # --recursive/--backup already default to True; they exist so the
	    # default can be spelled out. The --no-* flags below switch them off.
	    parser.add_argument(
	        '--recursive', '-r',
	        action='store_true',
	        default=True,
	        help='Search subdirectories (default: True)'
	    )

	    parser.add_argument(
	        '--no-recursive',
	        action='store_true',
	        help='Do not search subdirectories'
	    )

	    parser.add_argument(
	        '--backup', '-b',
	        action='store_true',
	        default=True,
	        help='Create backup files (default: True)'
	    )

	    parser.add_argument(
	        '--no-backup',
	        action='store_true',
	        help='Do not create backup files'
	    )

	    parser.add_argument(
	        '--dry-run',
	        action='store_true',
	        help='Show what would be processed without making changes'
	    )

	    args = parser.parse_args()

	    # The negative flags win over the (always true) positive ones
	    if args.no_recursive:
	        args.recursive = False

	    if args.no_backup:
	        args.backup = False

	    # Validate arguments (parser.error prints the usage and exits)
	    if not args.directory and not args.input:
	        parser.error("Either provide an input file or use --directory option")

	    if args.directory and args.input:
	        parser.error("Cannot use both input file and --directory option")

	    if args.output and args.directory:
	        parser.error("Cannot use --output with --directory option")

	    # Dry run mode
	    if args.dry_run:
	        print("DRY RUN MODE - No files will be modified")
	        print("-" * 50)

	    # Process directory
	    if args.directory:
	        print(f"Processing directory: {args.directory}")
	        print(f"Pattern: {args.pattern}")
	        print(f"Recursive: {args.recursive}")
	        print(f"Backup: {args.backup}")
	        print("-" * 50)

	        if args.dry_run:
	            directory_path = Path(args.directory)
	            if args.recursive:
	                files = list(directory_path.rglob(args.pattern))
	            else:
	                files = list(directory_path.glob(args.pattern))
	            print(f"Would process {len(files)} files:")
	            for file_path in files:
	                print(f" - {file_path}")
	            return

	        success_count, total_count = process_directory(
	            args.directory,
	            args.pattern,
	            args.recursive,
	            args.backup
	        )
	        print(f"\nCompleted: {success_count}/{total_count} files processed successfully")
	        if success_count < total_count:
	            # Report partial failure through the exit status
	            raise SystemExit(1)
	        return

	    # Process single file
	    print(f"Processing file: {args.input}")
	    print(f"Output: {args.output or 'overwrite original'}")
	    print(f"Backup: {args.backup}")
	    print("-" * 50)

	    if args.dry_run:
	        if Path(args.input).exists():
	            print(f"Would process: {args.input}")
	            with open(args.input, 'r', encoding='utf-8') as f:
	                content = f.read()
	            # List the distinct inline reference markers found in the file
	            references = re.findall(r'\[\^\d+\]+', content)
	            if references:
	                print(f"Found references to remove: {set(references)}")
	            else:
	                print("No references found in file")
	        else:
	            print(f"File does not exist: {args.input}")
	        return

	    success = process_file(args.input, args.output, args.backup)
	    if success:
	        print("File processed successfully!")
	    else:
	        print("Failed to process file.")
	        # Report the failure through the exit status
	        raise SystemExit(1)


	# Run the command line interface only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()
No results found