Forked from codeslord/perplexity_markdown_cleaner.py
Created
November 11, 2025 08:23
-
-
Save heychriszappa/8d06185bb4d368be339b9b6f563086bf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Script to automatically remove the references section, square bracket references, | |
| and the perplexity image from perplexity markdown export. | |
| Examples: | |
| # Process a single file (creates backup) | |
| python perplexity_markdown_cleaner.py file.md | |
| # Process a single file without backup | |
| python perplexity_markdown_cleaner.py file.md --no-backup | |
| # Process a single file and save to new location | |
| python perplexity_markdown_cleaner.py input.md --output cleaned.md | |
| # Process all .md files in a directory | |
| python perplexity_markdown_cleaner.py --directory /path/to/docs | |
| # Process all .txt files in current directory (non-recursive) | |
| python perplexity_markdown_cleaner.py --directory . --pattern "*.txt" --no-recursive | |
| """ | |
| import re | |
| import argparse | |
| import os | |
| from pathlib import Path | |
# Footnote definition lines such as "[^1]: https://example.com".
# No "\s*" after the colon: it could cross a newline and swallow the next line.
_REFERENCE_DEFINITION_RE = re.compile(r'^\[\^\d+\]:.*$', re.MULTILINE)

# One or more consecutive inline markers such as "[^1]" or "[^1][^2]".
# The group is repeated as a whole so a trailing "]" that belongs to the
# surrounding text (e.g. "[note[^1]]") is left alone.
_INLINE_REFERENCE_RE = re.compile(r'(?:\[\^\d+\])+')

# <img src="..." .../> tags together with the whitespace that follows them.
_IMAGE_RE = re.compile(r'<img\s+src="[^"]*"[^>]*/?>\s*\n?', re.IGNORECASE)

# Empty hidden spans such as <span style="display:none"></span>.
_HIDDEN_SPAN_RE = re.compile(
    r'<span\s+style="display:\s*none"[^>]*></span>\s*', re.IGNORECASE
)

# Everything from the <div ...>⁂</div> separator to the end of the document.
_REFERENCES_SECTION_RE = re.compile(r'<div[^>]*>⁂</div>.*', re.DOTALL)

# Three or more newlines (optionally separated by whitespace).
_EXTRA_BLANK_LINES_RE = re.compile(r'\n\s*\n\s*\n')


def remove_references(text: str) -> str:
    """
    Remove square bracket references, image sources, and reference sections from text.

    Args:
        text (str): Input text containing references

    Returns:
        str: Text with references removed and surrounding whitespace stripped
    """
    # Definition lines must go first: once the inline markers are stripped a
    # line like "[^1]: https://..." would degrade to ": https://..." and no
    # longer be recognisable as a reference definition.
    cleaned_text = _REFERENCE_DEFINITION_RE.sub('', text)

    # Inline markers are removed before the hidden spans so that a span which
    # only wrapped markers ("<span ...>[^3]</span>") becomes empty and matches.
    cleaned_text = _INLINE_REFERENCE_RE.sub('', cleaned_text)
    cleaned_text = _IMAGE_RE.sub('', cleaned_text)
    cleaned_text = _HIDDEN_SPAN_RE.sub('', cleaned_text)

    # Drop the trailing references section introduced by the ⁂ separator.
    cleaned_text = _REFERENCES_SECTION_RE.sub('', cleaned_text)

    # Collapse the runs of blank lines the removals leave behind.
    cleaned_text = _EXTRA_BLANK_LINES_RE.sub('\n\n', cleaned_text)
    return cleaned_text.strip()
def process_file(file_path, output_path=None, backup=True):
    """
    Process a single file to remove references.

    Args:
        file_path (str): Path to the input file
        output_path (str, optional): Path for output file. If None, overwrites original
        backup (bool): Whether to create a backup of the original file. A backup
            is only made when the original file is about to be overwritten.

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        file_path = Path(file_path)

        if not file_path.exists():
            print(f"Error: File '{file_path}' does not exist.")
            return False

        destination = file_path if output_path is None else Path(output_path)
        # An explicit output path that points back at the input is still an
        # in-place rewrite and must be protected by the backup as well.
        in_place = destination.resolve() == file_path.resolve()

        with open(file_path, 'r', encoding='utf-8') as f:
            original_content = f.read()

        cleaned_content = remove_references(original_content)

        if backup and in_place:
            backup_path = file_path.with_name(file_path.name + '.backup')
            # Copy raw bytes: text-mode reading translates line endings, so
            # re-writing the decoded text would not reproduce the original.
            backup_path.write_bytes(file_path.read_bytes())
            print(f"Backup created: {backup_path}")

        with open(destination, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)

        print(f"References removed from: {file_path}")
        if not in_place:
            print(f"Output written to: {destination}")
        return True
    except Exception as e:
        # Deliberately broad: this is the per-file boundary, and callers rely
        # on a False return (not an exception) to keep a batch run going.
        print(f"Error processing file '{file_path}': {str(e)}")
        return False
def process_directory(directory_path, pattern="*.md", recursive=True, backup=True):
    """
    Process all files in a directory matching the given pattern.

    Args:
        directory_path (str): Path to the directory
        pattern (str): File pattern to match (default: "*.md")
        recursive (bool): Whether to search subdirectories
        backup (bool): Whether to create backups

    Returns:
        tuple: (success_count, total_count); (0, 0) when the directory is
        missing, is not a directory, or contains no matching files
    """
    directory_path = Path(directory_path)

    if not directory_path.exists():
        print(f"Error: Directory '{directory_path}' does not exist.")
        return 0, 0
    if not directory_path.is_dir():
        print(f"Error: '{directory_path}' is not a directory.")
        return 0, 0

    matches = directory_path.rglob(pattern) if recursive else directory_path.glob(pattern)
    # Globbing also yields directories whose name matches the pattern; keep
    # regular files only so they are not counted as failed files.
    # Sorting makes the processing order (and the log output) deterministic.
    files = sorted(path for path in matches if path.is_file())

    if not files:
        print(f"No files matching pattern '{pattern}' found in '{directory_path}'")
        return 0, 0

    print(f"Found {len(files)} files to process...")

    success_count = 0
    for file_path in files:
        print(f"\nProcessing: {file_path}")
        if process_file(file_path, backup=backup):
            success_count += 1

    return success_count, len(files)
def main(argv=None):
    """
    Handle command line arguments and execute the script.

    Args:
        argv (list[str], optional): Argument list to parse. Defaults to
            None, in which case argparse reads sys.argv[1:].

    Returns:
        int: Process exit status - 0 on success, 1 when a file or directory
        could not be processed. Invalid argument combinations exit through
        argparse's own error handling (SystemExit with status 2).
    """
    parser = argparse.ArgumentParser(
        description="Remove square bracket references from markdown files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Process a single file (creates backup)
python perplexity_markdown_cleaner.py file.md
# Process a single file without backup
python perplexity_markdown_cleaner.py file.md --no-backup
# Process a single file and save to new location
python perplexity_markdown_cleaner.py input.md --output cleaned.md
# Process all .md files in a directory
python perplexity_markdown_cleaner.py --directory /path/to/docs
# Process all .txt files in current directory (non-recursive)
python perplexity_markdown_cleaner.py --directory . --pattern "*.txt" --no-recursive
"""
    )

    # File or directory input
    parser.add_argument(
        'input',
        nargs='?',
        help='Input file path (if not using --directory)'
    )
    parser.add_argument(
        '--output', '-o',
        help='Output file path (only for single file processing)'
    )
    parser.add_argument(
        '--directory', '-d',
        help='Process all files in directory matching pattern'
    )
    parser.add_argument(
        '--pattern', '-p',
        default='*.md',
        help='File pattern to match when processing directory (default: *.md)'
    )
    # --recursive / --backup are already on by default; the flags are kept so
    # existing command lines keep working. The --no-* variants switch them off.
    parser.add_argument(
        '--recursive', '-r',
        action='store_true',
        default=True,
        help='Search subdirectories (default: True)'
    )
    parser.add_argument(
        '--no-recursive',
        action='store_true',
        help='Do not search subdirectories'
    )
    parser.add_argument(
        '--backup', '-b',
        action='store_true',
        default=True,
        help='Create backup files (default: True)'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Do not create backup files'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be processed without making changes'
    )

    args = parser.parse_args(argv)

    # The --no-* flags win over their always-true counterparts.
    if args.no_recursive:
        args.recursive = False
    if args.no_backup:
        args.backup = False

    # Validate arguments
    if not args.directory and not args.input:
        parser.error("Either provide an input file or use --directory option")
    if args.directory and args.input:
        parser.error("Cannot use both input file and --directory option")
    if args.output and args.directory:
        parser.error("Cannot use --output with --directory option")

    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print("-" * 50)

    # Process directory
    if args.directory:
        print(f"Processing directory: {args.directory}")
        print(f"Pattern: {args.pattern}")
        print(f"Recursive: {args.recursive}")
        print(f"Backup: {args.backup}")
        print("-" * 50)

        directory_path = Path(args.directory)

        if args.dry_run:
            if not directory_path.is_dir():
                print(f"Error: '{directory_path}' is not a directory.")
                return 1
            if args.recursive:
                matches = directory_path.rglob(args.pattern)
            else:
                matches = directory_path.glob(args.pattern)
            # Only regular files can be processed; skip matching directories.
            files = sorted(path for path in matches if path.is_file())
            print(f"Would process {len(files)} files:")
            for file_path in files:
                print(f" - {file_path}")
            return 0

        success_count, total_count = process_directory(
            args.directory,
            args.pattern,
            args.recursive,
            args.backup
        )
        print(f"\nCompleted: {success_count}/{total_count} files processed successfully")
        # process_directory reports (0, 0) for a bad directory as well as for
        # "nothing matched", so the directory is checked here to tell the
        # two apart for the exit status.
        if not directory_path.is_dir():
            return 1
        return 0 if success_count == total_count else 1

    # Process single file
    print(f"Processing file: {args.input}")
    print(f"Output: {args.output or 'overwrite original'}")
    print(f"Backup: {args.backup}")
    print("-" * 50)

    if args.dry_run:
        if not Path(args.input).is_file():
            print(f"File does not exist: {args.input}")
            return 1
        print(f"Would process: {args.input}")
        with open(args.input, 'r', encoding='utf-8') as f:
            content = f.read()
        # Match each marker exactly; a trailing "+" here would also swallow
        # closing brackets that belong to the surrounding text.
        references = re.findall(r'\[\^\d+\]', content)
        if references:
            print(f"Found references to remove: {set(references)}")
        else:
            print("No references found in file")
        return 0

    if process_file(args.input, args.output, args.backup):
        print("File processed successfully!")
        return 0
    print("Failed to process file.")
    return 1


if __name__ == "__main__":
    # Propagate main()'s status so shells and CI can detect failures.
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment